• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5  *******************************************************************************
6  * Copyright (C) 1996-2016, International Business Machines Corporation and    *
7  * others. All Rights Reserved.                                                *
8  *******************************************************************************
9  */
10 
11 package ohos.global.icu.text;
12 
13 /**
14 * A decompression engine implementing the Standard Compression Scheme
15 * for Unicode (SCSU) as outlined in <A
16 * HREF="http://www.unicode.org/unicode/reports/tr6">Unicode Technical
17 * Report #6</A>.
18 *
19 * <P><STRONG>USAGE</STRONG></P>
20 *
21 * <P>The static methods on <TT>UnicodeDecompressor</TT> may be used in a
22 * straightforward manner to decompress simple strings:</P>
23 *
24 * <PRE>
25 *  byte [] compressed = ... ; // get compressed bytes from somewhere
26 *  String result = UnicodeDecompressor.decompress(compressed);
27 * </PRE>
28 *
29 * <P>The static methods have a fairly large memory footprint.
30 * For finer-grained control over memory usage,
31 * <TT>UnicodeDecompressor</TT> offers more powerful APIs allowing
32 * iterative decompression:</P>
33 *
34 * <PRE>
35 *  // Decompress an array "bytes" of length "len" using a buffer of 512 chars
36 *  // to the Writer "out"
37 *
38 *  UnicodeDecompressor myDecompressor         = new UnicodeDecompressor();
39 *  final static int    BUFSIZE                = 512;
40 *  char []             charBuffer             = new char [ BUFSIZE ];
41 *  int                 charsWritten           = 0;
42 *  int []              bytesRead              = new int [1];
43 *  int                 totalBytesDecompressed = 0;
44 *  int                 totalCharsWritten      = 0;
45 *
46 *  do {
47 *    // do the decompression
48 *    charsWritten = myDecompressor.decompress(bytes, totalBytesDecompressed,
49 *                                             len, bytesRead,
50 *                                             charBuffer, 0, BUFSIZE);
51 *
52 *    // do something with the current set of chars
53 *    out.write(charBuffer, 0, charsWritten);
54 *
55 *    // update the no. of bytes decompressed
56 *    totalBytesDecompressed += bytesRead[0];
57 *
58 *    // update the no. of chars written
59 *    totalCharsWritten += charsWritten;
60 *
61 *  } while(totalBytesDecompressed &lt; len);
62 *
63 *  myDecompressor.reset(); // reuse decompressor
64 * </PRE>
65 *
66 * <P>Decompression is performed according to the standard set forth in
67 * <A HREF="http://www.unicode.org/unicode/reports/tr6">Unicode Technical
68 * Report #6</A></P>
69 *
70 * @see UnicodeCompressor
71 *
72 * @author Stephen F. Booth
73 * @hide exposed on OHOS
74 */
75 public final class UnicodeDecompressor implements SCSU
76 {
77     //==========================
78     // Instance variables
79     //==========================
80 
81     /** Alias to current dynamic window */
82     private int       fCurrentWindow   = 0;
83 
84     /** Dynamic compression window offsets */
85     private int []    fOffsets         = new int [ NUMWINDOWS ];
86 
87     /** Current compression mode */
88     private int       fMode            = SINGLEBYTEMODE;
89 
90     /** Size of our internal buffer */
91     private final static int BUFSIZE   = 3;
92 
93     /** Internal buffer for saving state */
94     private byte []   fBuffer          = new byte [BUFSIZE];
95 
96     /** Number of characters in our internal buffer */
97     private int       fBufferLength    = 0;
98 
99 
100     /**
101      * Create a UnicodeDecompressor.
102      * Sets all windows to their default values.
103      * @see #reset
104      */
UnicodeDecompressor()105     public UnicodeDecompressor(){
106         reset();              // initialize to defaults
107     }
108 
109     /**
110      * Decompress a byte array into a String.
111      * @param buffer The byte array to decompress.
112      * @return A String containing the decompressed characters.
113      * @see #decompress(byte [], int, int)
114      */
decompress(byte [] buffer)115     public static String decompress(byte [] buffer){
116         char [] buf = decompress(buffer, 0, buffer.length);
117         return new String(buf);
118     }
119 
120     /**
121      * Decompress a byte array into a Unicode character array.
122      * @param buffer The byte array to decompress.
123      * @param start The start of the byte run to decompress.
124      * @param limit The limit of the byte run to decompress.
125      * @return A character array containing the decompressed bytes.
126      * @see #decompress(byte [])
127      */
decompress(byte [] buffer, int start, int limit)128     public static char [] decompress(byte [] buffer, int start, int limit) {
129         UnicodeDecompressor comp = new UnicodeDecompressor();
130 
131         // use a buffer we know will never overflow
132         // in the worst case, each byte will decompress
133         // to a surrogate pair (buffer must be at least 2 chars)
134         int len = Math.max(2, 2 * (limit - start));
135         char [] temp = new char [len];
136 
137         int charCount = comp.decompress(buffer, start, limit, null,
138                         temp, 0, len);
139 
140         char [] result = new char [charCount];
141         System.arraycopy(temp, 0, result, 0, charCount);
142         return result;
143     }
144 
145     /**
146      * Decompress a byte array into a Unicode character array.
147      *
148      * This function will either completely fill the output buffer,
149      * or consume the entire input.
150      *
151      * @param byteBuffer The byte buffer to decompress.
152      * @param byteBufferStart The start of the byte run to decompress.
153      * @param byteBufferLimit The limit of the byte run to decompress.
154      * @param bytesRead A one-element array.  If not null, on return
155      * the number of bytes read from byteBuffer.
156      * @param charBuffer A buffer to receive the decompressed data.
157      * This buffer must be at minimum two characters in size.
158      * @param charBufferStart The starting offset to which to write
159      * decompressed data.
160      * @param charBufferLimit The limiting offset for writing
161      * decompressed data.
162      * @return The number of Unicode characters written to charBuffer.
163      */
decompress(byte [] byteBuffer, int byteBufferStart, int byteBufferLimit, int [] bytesRead, char [] charBuffer, int charBufferStart, int charBufferLimit)164     public int decompress(byte []    byteBuffer,
165               int        byteBufferStart,
166               int        byteBufferLimit,
167               int []     bytesRead,
168               char []    charBuffer,
169               int        charBufferStart,
170               int        charBufferLimit)
171     {
172     // the current position in the source byte buffer
173     int bytePos      = byteBufferStart;
174 
175     // the current position in the target char buffer
176     int ucPos        = charBufferStart;
177 
178         // the current byte from the source buffer
179     int aByte        = 0x00;
180 
181 
182     // charBuffer must be at least 2 chars in size
183     if(charBuffer.length < 2 || (charBufferLimit - charBufferStart) < 2)
184         throw new IllegalArgumentException("charBuffer.length < 2");
185 
186     // if our internal buffer isn't empty, flush its contents
187     // to the output buffer before doing any more decompression
188     if(fBufferLength > 0) {
189 
190         int newBytes = 0;
191 
192         // fill the buffer completely, to guarantee one full character
193         if(fBufferLength != BUFSIZE) {
194         newBytes = fBuffer.length - fBufferLength;
195 
196         // verify there are newBytes bytes in byteBuffer
197         if(byteBufferLimit - byteBufferStart < newBytes)
198             newBytes = byteBufferLimit - byteBufferStart;
199 
200         System.arraycopy(byteBuffer, byteBufferStart,
201                  fBuffer, fBufferLength, newBytes);
202         }
203 
204         // reset buffer length to 0 before recursive call
205         fBufferLength = 0;
206 
207         // call self recursively to decompress the buffer
208         int count = decompress(fBuffer, 0, fBuffer.length, null,
209                    charBuffer, charBufferStart,
210                    charBufferLimit);
211 
212         // update the positions into the arrays
213         ucPos += count;
214         bytePos += newBytes;
215     }
216 
217         // the main decompression loop
218     mainLoop:
219     while(bytePos < byteBufferLimit && ucPos < charBufferLimit) {
220         switch(fMode) {
221         case SINGLEBYTEMODE:
222         // single-byte mode decompression loop
223         singleByteModeLoop:
224         while(bytePos < byteBufferLimit && ucPos < charBufferLimit) {
225         aByte = byteBuffer[bytePos++] & 0xFF;
226         switch(aByte) {
227             // All bytes from 0x80 through 0xFF are remapped
228             // to chars or surrogate pairs according to the
229             // currently active window
230         case 0x80: case 0x81: case 0x82: case 0x83: case 0x84:
231         case 0x85: case 0x86: case 0x87: case 0x88: case 0x89:
232         case 0x8A: case 0x8B: case 0x8C: case 0x8D: case 0x8E:
233         case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93:
234         case 0x94: case 0x95: case 0x96: case 0x97: case 0x98:
235         case 0x99: case 0x9A: case 0x9B: case 0x9C: case 0x9D:
236         case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2:
237         case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7:
238         case 0xA8: case 0xA9: case 0xAA: case 0xAB: case 0xAC:
239         case 0xAD: case 0xAE: case 0xAF: case 0xB0: case 0xB1:
240         case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6:
241         case 0xB7: case 0xB8: case 0xB9: case 0xBA: case 0xBB:
242         case 0xBC: case 0xBD: case 0xBE: case 0xBF: case 0xC0:
243         case 0xC1: case 0xC2: case 0xC3: case 0xC4: case 0xC5:
244         case 0xC6: case 0xC7: case 0xC8: case 0xC9: case 0xCA:
245         case 0xCB: case 0xCC: case 0xCD: case 0xCE: case 0xCF:
246         case 0xD0: case 0xD1: case 0xD2: case 0xD3: case 0xD4:
247         case 0xD5: case 0xD6: case 0xD7: case 0xD8: case 0xD9:
248         case 0xDA: case 0xDB: case 0xDC: case 0xDD: case 0xDE:
249         case 0xDF: case 0xE0: case 0xE1: case 0xE2: case 0xE3:
250         case 0xE4: case 0xE5: case 0xE6: case 0xE7: case 0xE8:
251         case 0xE9: case 0xEA: case 0xEB: case 0xEC: case 0xED:
252         case 0xEE: case 0xEF: case 0xF0: case 0xF1: case 0xF2:
253         case 0xF3: case 0xF4: case 0xF5: case 0xF6: case 0xF7:
254         case 0xF8: case 0xF9: case 0xFA: case 0xFB: case 0xFC:
255         case 0xFD: case 0xFE: case 0xFF:
256             // For offsets <= 0xFFFF, convert to a single char
257             // by adding the window's offset and subtracting
258             // the generic compression offset
259             if(fOffsets[ fCurrentWindow ] <= 0xFFFF) {
260             charBuffer[ucPos++] = (char)
261                 (aByte + fOffsets[ fCurrentWindow ]
262                  - COMPRESSIONOFFSET);
263             }
264             // For offsets > 0x10000, convert to a surrogate pair by
265             // normBase = window's offset - 0x10000
266             // high surr. = 0xD800 + (normBase >> 10)
267             // low  surr. = 0xDC00 + (normBase & 0x3FF) + (byte & 0x7F)
268             else {
269             // make sure there is enough room to write
270             // both characters
271             // if not, save state and break out
272             if((ucPos + 1) >= charBufferLimit) {
273                 --bytePos;
274                 System.arraycopy(byteBuffer, bytePos,
275                          fBuffer, 0,
276                          byteBufferLimit - bytePos);
277                 fBufferLength = byteBufferLimit - bytePos;
278                 bytePos += fBufferLength;
279                 break mainLoop;
280             }
281 
282             int normalizedBase = fOffsets[ fCurrentWindow ]
283                 - 0x10000;
284             charBuffer[ucPos++] = (char)
285                 (0xD800 + (normalizedBase >> 10));
286             charBuffer[ucPos++] = (char)
287                 (0xDC00 + (normalizedBase & 0x3FF)+(aByte & 0x7F));
288             }
289             break;
290 
291             // bytes from 0x20 through 0x7F are treated as ASCII and
292             // are remapped to chars by padding the high byte
293             // (this is the same as quoting from static window 0)
294             // NUL (0x00), HT (0x09), CR (0x0A), LF (0x0D)
295             // are treated as ASCII as well
296         case 0x00: case 0x09: case 0x0A: case 0x0D:
297         case 0x20: case 0x21: case 0x22: case 0x23: case 0x24:
298         case 0x25: case 0x26: case 0x27: case 0x28: case 0x29:
299         case 0x2A: case 0x2B: case 0x2C: case 0x2D: case 0x2E:
300         case 0x2F: case 0x30: case 0x31: case 0x32: case 0x33:
301         case 0x34: case 0x35: case 0x36: case 0x37: case 0x38:
302         case 0x39: case 0x3A: case 0x3B: case 0x3C: case 0x3D:
303         case 0x3E: case 0x3F: case 0x40: case 0x41: case 0x42:
304         case 0x43: case 0x44: case 0x45: case 0x46: case 0x47:
305         case 0x48: case 0x49: case 0x4A: case 0x4B: case 0x4C:
306         case 0x4D: case 0x4E: case 0x4F: case 0x50: case 0x51:
307         case 0x52: case 0x53: case 0x54: case 0x55: case 0x56:
308         case 0x57: case 0x58: case 0x59: case 0x5A: case 0x5B:
309         case 0x5C: case 0x5D: case 0x5E: case 0x5F: case 0x60:
310         case 0x61: case 0x62: case 0x63: case 0x64: case 0x65:
311         case 0x66: case 0x67: case 0x68: case 0x69: case 0x6A:
312         case 0x6B: case 0x6C: case 0x6D: case 0x6E: case 0x6F:
313         case 0x70: case 0x71: case 0x72: case 0x73: case 0x74:
314         case 0x75: case 0x76: case 0x77: case 0x78: case 0x79:
315         case 0x7A: case 0x7B: case 0x7C: case 0x7D: case 0x7E:
316         case 0x7F:
317             charBuffer[ucPos++] = (char) aByte;
318             break;
319 
320             // quote unicode
321         case SQUOTEU:
322             // verify we have two bytes following tag
323             // if not, save state and break out
324             if( (bytePos + 1) >= byteBufferLimit ) {
325             --bytePos;
326             System.arraycopy(byteBuffer, bytePos,
327                      fBuffer, 0,
328                      byteBufferLimit - bytePos);
329             fBufferLength = byteBufferLimit - bytePos;
330             bytePos += fBufferLength;
331             break mainLoop;
332             }
333 
334             aByte = byteBuffer[bytePos++];
335             charBuffer[ucPos++] = (char)
336             (aByte << 8 | (byteBuffer[bytePos++] & 0xFF));
337             break;
338 
339             // switch to Unicode mode
340         case SCHANGEU:
341             fMode = UNICODEMODE;
342             break singleByteModeLoop;
343             //break;
344 
345             // handle all quote tags
346         case SQUOTE0: case SQUOTE1: case SQUOTE2: case SQUOTE3:
347         case SQUOTE4: case SQUOTE5: case SQUOTE6: case SQUOTE7:
348             // verify there is a byte following the tag
349             // if not, save state and break out
350             if(bytePos >= byteBufferLimit) {
351             --bytePos;
352             System.arraycopy(byteBuffer, bytePos,
353                      fBuffer, 0,
354                      byteBufferLimit - bytePos);
355             fBufferLength = byteBufferLimit - bytePos;
356             bytePos += fBufferLength;
357             break mainLoop;
358             }
359 
360             // if the byte is in the range 0x00 - 0x7F, use
361             // static window n otherwise, use dynamic window n
362             int dByte = byteBuffer[bytePos++] & 0xFF;
363             charBuffer[ucPos++] = (char)
364             (dByte+ (dByte >= 0x00 && dByte < 0x80
365                  ? sOffsets[aByte - SQUOTE0]
366                  : (fOffsets[aByte - SQUOTE0]
367                     - COMPRESSIONOFFSET)));
368             break;
369 
370             // handle all change tags
371         case SCHANGE0: case SCHANGE1: case SCHANGE2: case SCHANGE3:
372         case SCHANGE4: case SCHANGE5: case SCHANGE6: case SCHANGE7:
373             fCurrentWindow = aByte - SCHANGE0;
374             break;
375 
376             // handle all define tags
377         case SDEFINE0: case SDEFINE1: case SDEFINE2: case SDEFINE3:
378         case SDEFINE4: case SDEFINE5: case SDEFINE6: case SDEFINE7:
379             // verify there is a byte following the tag
380             // if not, save state and break out
381             if(bytePos >= byteBufferLimit) {
382             --bytePos;
383             System.arraycopy(byteBuffer, bytePos,
384                      fBuffer, 0,
385                      byteBufferLimit - bytePos);
386             fBufferLength = byteBufferLimit - bytePos;
387             bytePos += fBufferLength;
388             break mainLoop;
389             }
390 
391             fCurrentWindow = aByte - SDEFINE0;
392             fOffsets[fCurrentWindow] =
393             sOffsetTable[byteBuffer[bytePos++] & 0xFF];
394             break;
395 
396             // handle define extended tag
397         case SDEFINEX:
398             // verify we have two bytes following tag
399             // if not, save state and break out
400             if((bytePos + 1) >= byteBufferLimit ) {
401             --bytePos;
402             System.arraycopy(byteBuffer, bytePos,
403                      fBuffer, 0,
404                      byteBufferLimit - bytePos);
405             fBufferLength = byteBufferLimit - bytePos;
406             bytePos += fBufferLength;
407             break mainLoop;
408             }
409 
410             aByte = byteBuffer[bytePos++] & 0xFF;
411             fCurrentWindow = (aByte & 0xE0) >> 5;
412             fOffsets[fCurrentWindow] = 0x10000 +
413             (0x80 * (((aByte & 0x1F) << 8)
414                  | (byteBuffer[bytePos++] & 0xFF)));
415             break;
416 
417             // reserved, shouldn't happen
418         case SRESERVED:
419             break;
420 
421         } // end switch
422         } // end while
423         break;
424 
425         case UNICODEMODE:
426         // unicode mode decompression loop
427         unicodeModeLoop:
428         while(bytePos < byteBufferLimit && ucPos < charBufferLimit) {
429         aByte = byteBuffer[bytePos++] & 0xFF;
430         switch(aByte) {
431             // handle all define tags
432         case UDEFINE0: case UDEFINE1: case UDEFINE2: case UDEFINE3:
433         case UDEFINE4: case UDEFINE5: case UDEFINE6: case UDEFINE7:
434             // verify there is a byte following tag
435             // if not, save state and break out
436             if(bytePos >= byteBufferLimit ) {
437             --bytePos;
438             System.arraycopy(byteBuffer, bytePos,
439                      fBuffer, 0,
440                      byteBufferLimit - bytePos);
441             fBufferLength = byteBufferLimit - bytePos;
442             bytePos += fBufferLength;
443             break mainLoop;
444             }
445 
446             fCurrentWindow = aByte - UDEFINE0;
447             fOffsets[fCurrentWindow] =
448             sOffsetTable[byteBuffer[bytePos++] & 0xFF];
449             fMode = SINGLEBYTEMODE;
450             break unicodeModeLoop;
451             //break;
452 
453             // handle define extended tag
454         case UDEFINEX:
455             // verify we have two bytes following tag
456             // if not, save state and break out
457             if((bytePos + 1) >= byteBufferLimit ) {
458             --bytePos;
459             System.arraycopy(byteBuffer, bytePos,
460                      fBuffer, 0,
461                      byteBufferLimit - bytePos);
462             fBufferLength = byteBufferLimit - bytePos;
463             bytePos += fBufferLength;
464             break mainLoop;
465             }
466 
467             aByte = byteBuffer[bytePos++] & 0xFF;
468             fCurrentWindow = (aByte & 0xE0) >> 5;
469             fOffsets[fCurrentWindow] = 0x10000 +
470             (0x80 * (((aByte & 0x1F) << 8)
471                  | (byteBuffer[bytePos++] & 0xFF)));
472             fMode = SINGLEBYTEMODE;
473             break unicodeModeLoop;
474             //break;
475 
476             // handle all change tags
477         case UCHANGE0: case UCHANGE1: case UCHANGE2: case UCHANGE3:
478         case UCHANGE4: case UCHANGE5: case UCHANGE6: case UCHANGE7:
479             fCurrentWindow = aByte - UCHANGE0;
480             fMode = SINGLEBYTEMODE;
481             break unicodeModeLoop;
482             //break;
483 
484             // quote unicode
485         case UQUOTEU:
486             // verify we have two bytes following tag
487             // if not, save state and break out
488             if(bytePos >= byteBufferLimit  - 1) {
489             --bytePos;
490             System.arraycopy(byteBuffer, bytePos,
491                      fBuffer, 0,
492                      byteBufferLimit - bytePos);
493             fBufferLength = byteBufferLimit - bytePos;
494             bytePos += fBufferLength;
495             break mainLoop;
496             }
497 
498             aByte = byteBuffer[bytePos++];
499             charBuffer[ucPos++] = (char)
500             (aByte << 8 | (byteBuffer[bytePos++] & 0xFF));
501             break;
502 
503         default:
504             // verify there is a byte following tag
505             // if not, save state and break out
506             if(bytePos >= byteBufferLimit ) {
507             --bytePos;
508             System.arraycopy(byteBuffer, bytePos,
509                      fBuffer, 0,
510                      byteBufferLimit - bytePos);
511             fBufferLength = byteBufferLimit - bytePos;
512             bytePos += fBufferLength;
513             break mainLoop;
514             }
515 
516             charBuffer[ucPos++] = (char)
517             (aByte << 8 | (byteBuffer[bytePos++] & 0xFF));
518             break;
519 
520         } // end switch
521         } // end while
522         break;
523 
524         } // end switch( fMode )
525     } // end while
526 
527         // fill in output parameter
528     if(bytesRead != null)
529         bytesRead [0] = (bytePos - byteBufferStart);
530 
531         // return # of chars written
532     return (ucPos - charBufferStart);
533     }
534 
535     /**
536      * Reset the decompressor to its initial state.
537      */
reset()538     public void reset()
539     {
540         // reset dynamic windows
541         fOffsets[0] = 0x0080;    // Latin-1
542         fOffsets[1] = 0x00C0;    // Latin-1 Supplement + Latin Extended-A
543         fOffsets[2] = 0x0400;    // Cyrillic
544         fOffsets[3] = 0x0600;    // Arabic
545         fOffsets[4] = 0x0900;    // Devanagari
546         fOffsets[5] = 0x3040;    // Hiragana
547         fOffsets[6] = 0x30A0;    // Katakana
548         fOffsets[7] = 0xFF00;    // Fullwidth ASCII
549 
550 
551         fCurrentWindow  = 0;                // Make current window Latin-1
552         fMode           = SINGLEBYTEMODE;   // Always start in single-byte mode
553         fBufferLength   = 0;                // Empty buffer
554     }
555 }
556