1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 1996-2013, International Business Machines Corporation and * 7 * others. All Rights Reserved. * 8 ******************************************************************************* 9 * 10 */ 11 12 package ohos.global.icu.text; 13 14 /** 15 * This class matches UTF-16 and UTF-32, both big- and little-endian. The 16 * BOM will be used if it is present. 17 */ 18 abstract class CharsetRecog_Unicode extends CharsetRecognizer { 19 20 /* (non-Javadoc) 21 * @see ohos.global.icu.text.CharsetRecognizer#getName() 22 */ 23 @Override getName()24 abstract String getName(); 25 26 /* (non-Javadoc) 27 * @see ohos.global.icu.text.CharsetRecognizer#match(ohos.global.icu.text.CharsetDetector) 28 */ 29 @Override match(CharsetDetector det)30 abstract CharsetMatch match(CharsetDetector det); 31 codeUnit16FromBytes(byte hi, byte lo)32 static int codeUnit16FromBytes(byte hi, byte lo) { 33 return ((hi & 0xff) << 8) | (lo & 0xff); 34 } 35 36 // UTF-16 confidence calculation. Very simple minded, but better than nothing. 37 // Any 8 bit non-control characters bump the confidence up. These have a zero high byte, 38 // and are very likely to be UTF-16, although they could also be part of a UTF-32 code. 39 // NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32. 40 // NULs should be rare in actual text. adjustConfidence(int codeUnit, int confidence)41 static int adjustConfidence(int codeUnit, int confidence) { 42 if (codeUnit == 0) { 43 confidence -= 10; 44 } else if ((codeUnit >= 0x20 && codeUnit <= 0xff) || codeUnit == 0x0a) { 45 confidence += 10; 46 } 47 if (confidence < 0) { 48 confidence = 0; 49 } else if (confidence > 100) { 50 confidence = 100; 51 } 52 return confidence; 53 } 54 55 static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode 56 { 57 @Override getName()58 String getName() 59 { 60 return "UTF-16BE"; 61 } 62 63 @Override match(CharsetDetector det)64 CharsetMatch match(CharsetDetector det) 65 { 66 byte[] input = det.fRawInput; 67 int confidence = 10; 68 69 int bytesToCheck = Math.min(input.length, 30); 70 for (int charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) { 71 int codeUnit = codeUnit16FromBytes(input[charIndex], input[charIndex + 1]); 72 if (charIndex == 0 && codeUnit == 0xFEFF) { 73 confidence = 100; 74 break; 75 } 76 confidence = adjustConfidence(codeUnit, confidence); 77 if (confidence == 0 || confidence == 100) { 78 break; 79 } 80 } 81 if (bytesToCheck < 4 && confidence < 100) { 82 confidence = 0; 83 } 84 if (confidence > 0) { 85 return new CharsetMatch(det, this, confidence); 86 } 87 return null; 88 } 89 } 90 91 static class CharsetRecog_UTF_16_LE extends CharsetRecog_Unicode 92 { 93 @Override getName()94 String getName() 95 { 96 return "UTF-16LE"; 97 } 98 99 @Override match(CharsetDetector det)100 CharsetMatch match(CharsetDetector det) 101 { 102 byte[] input = det.fRawInput; 103 int confidence = 10; 104 105 int bytesToCheck = Math.min(input.length, 30); 106 for (int charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) { 107 int codeUnit = codeUnit16FromBytes(input[charIndex+1], input[charIndex]); 108 if (charIndex == 0 && codeUnit == 0xFEFF) { 109 confidence = 100; 110 break; 111 } 112 confidence = adjustConfidence(codeUnit, confidence); 113 if (confidence == 0 || confidence == 100) { 114 break; 115 } 116 } 117 if (bytesToCheck < 4 && confidence < 100) { 118 confidence = 0; 119 } 120 if (confidence > 0) { 121 return new CharsetMatch(det, this, confidence); 122 } 123 return null; 124 } 125 } 126 127 static abstract class CharsetRecog_UTF_32 extends CharsetRecog_Unicode 128 { getChar(byte[] input, int index)129 abstract int getChar(byte[] input, int index); 130 131 @Override getName()132 abstract String getName(); 133 134 @Override match(CharsetDetector det)135 CharsetMatch match(CharsetDetector det) 136 { 137 byte[] input = det.fRawInput; 138 int limit = (det.fRawLength / 4) * 4; 139 int numValid = 0; 140 int numInvalid = 0; 141 boolean hasBOM = false; 142 int confidence = 0; 143 144 if (limit==0) { 145 return null; 146 } 147 if (getChar(input, 0) == 0x0000FEFF) { 148 hasBOM = true; 149 } 150 151 for(int i = 0; i < limit; i += 4) { 152 int ch = getChar(input, i); 153 154 if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) { 155 numInvalid += 1; 156 } else { 157 numValid += 1; 158 } 159 } 160 161 162 // Cook up some sort of confidence score, based on presence of a BOM 163 // and the existence of valid and/or invalid multi-byte sequences. 164 if (hasBOM && numInvalid==0) { 165 confidence = 100; 166 } else if (hasBOM && numValid > numInvalid*10) { 167 confidence = 80; 168 } else if (numValid > 3 && numInvalid == 0) { 169 confidence = 100; 170 } else if (numValid > 0 && numInvalid == 0) { 171 confidence = 80; 172 } else if (numValid > numInvalid*10) { 173 // Probably corrupt UTF-32BE data. Valid sequences aren't likely by chance. 174 confidence = 25; 175 } 176 177 return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 178 } 179 } 180 181 static class CharsetRecog_UTF_32_BE extends CharsetRecog_UTF_32 182 { 183 @Override getChar(byte[] input, int index)184 int getChar(byte[] input, int index) 185 { 186 return (input[index + 0] & 0xFF) << 24 | (input[index + 1] & 0xFF) << 16 | 187 (input[index + 2] & 0xFF) << 8 | (input[index + 3] & 0xFF); 188 } 189 190 @Override getName()191 String getName() 192 { 193 return "UTF-32BE"; 194 } 195 } 196 197 198 static class CharsetRecog_UTF_32_LE extends CharsetRecog_UTF_32 199 { 200 @Override getChar(byte[] input, int index)201 int getChar(byte[] input, int index) 202 { 203 return (input[index + 3] & 0xFF) << 24 | (input[index + 2] & 0xFF) << 16 | 204 (input[index + 1] & 0xFF) << 8 | (input[index + 0] & 0xFF); 205 } 206 207 @Override getName()208 String getName() 209 { 210 return "UTF-32LE"; 211 } 212 } 213 } 214