• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5  *******************************************************************************
6  * Copyright (C) 1996-2013, International Business Machines Corporation and    *
7  * others. All Rights Reserved.                                                *
8  *******************************************************************************
9  *
10  */
11 
12 package ohos.global.icu.text;
13 
14 /**
15  * This class matches UTF-16 and UTF-32, both big- and little-endian. The
16  * BOM will be used if it is present.
17  */
18 abstract class CharsetRecog_Unicode extends CharsetRecognizer {
19 
20     /* (non-Javadoc)
21      * @see ohos.global.icu.text.CharsetRecognizer#getName()
22      */
23     @Override
getName()24     abstract String getName();
25 
26     /* (non-Javadoc)
27      * @see ohos.global.icu.text.CharsetRecognizer#match(ohos.global.icu.text.CharsetDetector)
28      */
29     @Override
match(CharsetDetector det)30     abstract CharsetMatch match(CharsetDetector det);
31 
codeUnit16FromBytes(byte hi, byte lo)32     static int codeUnit16FromBytes(byte hi, byte lo) {
33         return ((hi & 0xff) << 8) | (lo & 0xff);
34     }
35 
36     // UTF-16 confidence calculation. Very simple minded, but better than nothing.
37     //   Any 8 bit non-control characters bump the confidence up. These have a zero high byte,
38     //     and are very likely to be UTF-16, although they could also be part of a UTF-32 code.
39     //   NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32.
40     //   NULs should be rare in actual text.
adjustConfidence(int codeUnit, int confidence)41     static int adjustConfidence(int codeUnit, int confidence) {
42         if (codeUnit == 0) {
43             confidence -= 10;
44         } else if ((codeUnit >= 0x20 && codeUnit <= 0xff) || codeUnit == 0x0a) {
45             confidence += 10;
46         }
47         if (confidence < 0) {
48             confidence = 0;
49         } else if (confidence > 100) {
50             confidence = 100;
51         }
52         return confidence;
53     }
54 
55     static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode
56     {
57         @Override
getName()58         String getName()
59         {
60             return "UTF-16BE";
61         }
62 
63         @Override
match(CharsetDetector det)64         CharsetMatch match(CharsetDetector det)
65         {
66             byte[] input = det.fRawInput;
67             int confidence = 10;
68 
69             int bytesToCheck = Math.min(input.length, 30);
70             for (int charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {
71                 int codeUnit = codeUnit16FromBytes(input[charIndex], input[charIndex + 1]);
72                 if (charIndex == 0 && codeUnit == 0xFEFF) {
73                     confidence = 100;
74                     break;
75                 }
76                 confidence = adjustConfidence(codeUnit, confidence);
77                 if (confidence == 0 || confidence == 100) {
78                     break;
79                 }
80             }
81             if (bytesToCheck < 4 && confidence < 100) {
82                 confidence = 0;
83             }
84             if (confidence > 0) {
85                 return new CharsetMatch(det, this, confidence);
86             }
87             return null;
88         }
89     }
90 
91     static class CharsetRecog_UTF_16_LE extends CharsetRecog_Unicode
92     {
93         @Override
getName()94         String getName()
95         {
96             return "UTF-16LE";
97         }
98 
99         @Override
match(CharsetDetector det)100         CharsetMatch match(CharsetDetector det)
101         {
102             byte[] input = det.fRawInput;
103             int confidence = 10;
104 
105             int bytesToCheck = Math.min(input.length, 30);
106             for (int charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {
107                 int codeUnit = codeUnit16FromBytes(input[charIndex+1], input[charIndex]);
108                 if (charIndex == 0 && codeUnit == 0xFEFF) {
109                     confidence = 100;
110                     break;
111                 }
112                 confidence = adjustConfidence(codeUnit, confidence);
113                 if (confidence == 0 || confidence == 100) {
114                     break;
115                 }
116             }
117             if (bytesToCheck < 4 && confidence < 100) {
118                 confidence = 0;
119             }
120             if (confidence > 0) {
121                 return new CharsetMatch(det, this, confidence);
122             }
123             return null;
124         }
125     }
126 
127     static abstract class CharsetRecog_UTF_32 extends CharsetRecog_Unicode
128     {
getChar(byte[] input, int index)129         abstract int getChar(byte[] input, int index);
130 
131         @Override
getName()132         abstract String getName();
133 
134         @Override
match(CharsetDetector det)135         CharsetMatch match(CharsetDetector det)
136         {
137             byte[] input   = det.fRawInput;
138             int limit      = (det.fRawLength / 4) * 4;
139             int numValid   = 0;
140             int numInvalid = 0;
141             boolean hasBOM = false;
142             int confidence = 0;
143 
144             if (limit==0) {
145                 return null;
146             }
147             if (getChar(input, 0) == 0x0000FEFF) {
148                 hasBOM = true;
149             }
150 
151             for(int i = 0; i < limit; i += 4) {
152                 int ch = getChar(input, i);
153 
154                 if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) {
155                     numInvalid += 1;
156                 } else {
157                     numValid += 1;
158                 }
159             }
160 
161 
162             // Cook up some sort of confidence score, based on presence of a BOM
163             //    and the existence of valid and/or invalid multi-byte sequences.
164             if (hasBOM && numInvalid==0) {
165                 confidence = 100;
166             } else if (hasBOM && numValid > numInvalid*10) {
167                 confidence = 80;
168             } else if (numValid > 3 && numInvalid == 0) {
169                 confidence = 100;
170             } else if (numValid > 0 && numInvalid == 0) {
171                 confidence = 80;
172             } else if (numValid > numInvalid*10) {
173                 // Probably corrupt UTF-32BE data.  Valid sequences aren't likely by chance.
174                 confidence = 25;
175             }
176 
177             return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
178         }
179     }
180 
181     static class CharsetRecog_UTF_32_BE extends CharsetRecog_UTF_32
182     {
183         @Override
getChar(byte[] input, int index)184         int getChar(byte[] input, int index)
185         {
186             return (input[index + 0] & 0xFF) << 24 | (input[index + 1] & 0xFF) << 16 |
187                    (input[index + 2] & 0xFF) <<  8 | (input[index + 3] & 0xFF);
188         }
189 
190         @Override
getName()191         String getName()
192         {
193             return "UTF-32BE";
194         }
195     }
196 
197 
198     static class CharsetRecog_UTF_32_LE extends CharsetRecog_UTF_32
199     {
200         @Override
getChar(byte[] input, int index)201         int getChar(byte[] input, int index)
202         {
203             return (input[index + 3] & 0xFF) << 24 | (input[index + 2] & 0xFF) << 16 |
204                    (input[index + 1] & 0xFF) <<  8 | (input[index + 0] & 0xFF);
205         }
206 
207         @Override
getName()208         String getName()
209         {
210             return "UTF-32LE";
211         }
212     }
213 }
214