/* GENERATED SOURCE. DO NOT MODIFY. */
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 ****************************************************************************
 * Copyright (C) 2005-2012, International Business Machines Corporation and *
 * others. All Rights Reserved.                                             *
 ****************************************************************************
 *
 */
package ohos.global.icu.text;

import java.util.Arrays;

/**
 * CharsetRecognizer implementation for Asian - double or multi-byte - charsets.
 * Match is determined mostly by the input data adhering to the
 * encoding scheme for the charset, and, optionally,
 * frequency-of-occurrence of characters.
 * <p>
 * Instances of this class are singletons, one per encoding
 * being recognized. They are created in the main
 * CharsetDetector class and kept in the global list of available
 * encodings to be checked. The specific encoding being recognized
 * is determined by subclass.
 */
abstract class CharsetRecog_mbcs extends CharsetRecognizer {

    /**
     * Get the IANA name of this charset.
     * @return the charset name.
     */
    @Override
    abstract String getName();


    /**
     * Test the match of this charset with the input text data
     * which is obtained via the CharsetDetector object.
     *
     * @param det         The CharsetDetector, which contains the input text
     *                    to be checked for being in this charset.
     * @param commonChars Raw (un-decoded) values of the multi-byte characters that
     *                    occur frequently in this charset, in ascending order (the
     *                    array is binary-searched), or {@code null} when no
     *                    frequency statistics are available.
     * @return The match confidence, ranging from 0 (no match) to 100.
     */
    int match(CharsetDetector det, int[] commonChars) {
        int doubleByteCharCount = 0;
        int commonCharCount = 0;
        int badCharCount = 0;
        int totalCharCount = 0;
        iteratedChar iter = new iteratedChar();

        for (iter.reset(); nextChar(iter, det);) {
            totalCharCount++;
            if (iter.error) {
                badCharCount++;
            } else {
                // Widen without sign extension: 4-byte characters may have the top bit set.
                long cv = iter.charValue & 0xFFFFFFFFL;

                if (cv > 0xff) {
                    doubleByteCharCount++;
                    // NOTE: This assumes that there are no 4-byte common chars.
                    if (commonChars != null && Arrays.binarySearch(commonChars, (int) cv) >= 0) {
                        commonCharCount++;
                    }
                }
            }
            if (badCharCount >= 2 && badCharCount * 5 >= doubleByteCharCount) {
                // Bail out early if the byte data is not matching the encoding scheme.
                return 0;
            }
        }

        if (doubleByteCharCount <= 10 && badCharCount == 0) {
            // Not many multi-byte chars.
            if (doubleByteCharCount == 0 && totalCharCount < 10) {
                // There weren't any multibyte sequences, and there was a low density of
                // non-ASCII single bytes. We don't have enough data to have any confidence.
                // Statistical analysis of single byte non-ASCII characters would probably
                // help here.
                return 0;
            }
            // ASCII or ISO file? It's probably not our encoding,
            // but is not incompatible with our encoding, so don't give it a zero.
            return 10;
        }

        //
        // No match if there are too many characters that don't fit the encoding scheme.
        // (should we have zero tolerance for these?)
        //
        if (doubleByteCharCount < 20 * badCharCount) {
            return 0;
        }

        if (commonChars == null) {
            // We have no statistics on frequently occurring characters.
            // Assess confidence purely on having a reasonable number of
            // multi-byte characters (the more the better).
            return Math.min(30 + doubleByteCharCount - 20 * badCharCount, 100);
        }

        //
        // Frequency of occurrence statistics exist.
        // doubleByteCharCount is greater than 10 here, so maxVal is strictly positive.
        //
        double maxVal = Math.log((float) doubleByteCharCount / 4);
        double scaleFactor = 90.0 / maxVal;
        int confidence = (int) (Math.log(commonCharCount + 1) * scaleFactor + 10);
        return Math.min(confidence, 100);
    }

    // "Character" iterated character class.
    //    Recognizers for specific mbcs encodings make their "characters" available
    //    by providing a nextChar() function that fills in an instance of iteratedChar
    //    with the next char from the input.
    //    The returned characters are not converted to Unicode, but remain as the raw
    //    bytes (concatenated into an int) from the codepage data.
    //
    //  For Asian charsets, use the raw input rather than the input that has been
    //   stripped of markup.  Detection only considers multi-byte chars, effectively
    //   stripping markup anyway, and double byte chars do occur in markup too.
    //
    static class iteratedChar {
        int     charValue = 0;      // 1-4 bytes from the raw input data
        int     nextIndex = 0;      // index into the raw input of the next byte to read
        boolean error     = false;  // set by nextChar() when the bytes do not fit the encoding
        boolean done      = false;  // set once the end of the raw input has been reached

        /** Return this iterator to its initial state, positioned at the start of the input. */
        void reset() {
            charValue = 0;
            nextIndex = 0;
            error     = false;
            done      = false;
        }

        /**
         * Read the next raw input byte and advance past it.
         *
         * @param det The charset detector holding the raw input bytes.
         * @return The byte as an unsigned value (0-255), or -1 at the end of the
         *         input, in which case {@code done} is also set.
         */
        int nextByte(CharsetDetector det) {
            if (nextIndex >= det.fRawLength) {
                done = true;
                return -1;
            }
            return det.fRawInput[nextIndex++] & 0x00ff;
        }
    }

    /**
     * Get the next character (however many bytes it is) from the input data
     *    Subclasses for specific charset encodings must implement this function
     *    to get characters according to the rules of their encoding scheme.
     *
     *  This function is not a method of class iteratedChar only because
     *   that would require a lot of extra derived classes, which is awkward.
     * @param it  The iteratedChar "struct" into which the returned char is placed.
     * @param det The charset detector, which is needed to get at the input byte data
     *            being iterated over.
     * @return    True if a character was returned, false at end of input.
     */
    abstract boolean nextChar(iteratedChar it, CharsetDetector det);


    /**
     *   Shift-JIS charset recognizer.
     *
     */
    static class CharsetRecog_sjis extends CharsetRecog_mbcs {
        static int[] commonChars =
            // TODO:  This set of data comes from the character frequency-
            //        of-occurrence analysis tool.  The data needs to be moved
            //        into a resource and loaded from there.
            {0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
             0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
             0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
             0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
             0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
             0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};

        @Override
        boolean nextChar(iteratedChar it, CharsetDetector det) {
            it.error = false;
            int firstByte = it.charValue = it.nextByte(det);
            if (firstByte < 0) {
                return false;
            }

            if (firstByte <= 0x7f || (firstByte > 0xa0 && firstByte <= 0xdf)) {
                // Single byte character.
                return true;
            }

            int secondByte = it.nextByte(det);
            if (secondByte < 0) {
                return false;
            }
            it.charValue = (firstByte << 8) | secondByte;
            if (!((secondByte >= 0x40 && secondByte <= 0x7f) || (secondByte >= 0x80 && secondByte <= 0xff))) {
                // Illegal second byte value.
                it.error = true;
            }
            return true;
        }

        @Override
        CharsetMatch match(CharsetDetector det) {
            int confidence = match(det, commonChars);
            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
        }

        @Override
        String getName() {
            return "Shift_JIS";
        }

        @Override
        public String getLanguage() {
            return "ja";
        }
    }


    /**
     *   Big5 charset recognizer.
     *
     */
    static class CharsetRecog_big5 extends CharsetRecog_mbcs {
        static int[] commonChars =
            // TODO:  This set of data comes from the character frequency-
            //        of-occurrence analysis tool.  The data needs to be moved
            //        into a resource and loaded from there.
            {0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
             0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
             0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
             0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
             0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
             0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
             0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
             0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
             0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
             0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};

        @Override
        boolean nextChar(iteratedChar it, CharsetDetector det) {
            it.error = false;
            int firstByte = it.charValue = it.nextByte(det);
            if (firstByte < 0) {
                return false;
            }

            if (firstByte <= 0x7f || firstByte == 0xff) {
                // single byte character.
                return true;
            }

            int secondByte = it.nextByte(det);
            if (secondByte < 0) {
                return false;
            }
            it.charValue = (it.charValue << 8) | secondByte;

            if (secondByte < 0x40 || secondByte == 0x7f || secondByte == 0xff) {
                // Illegal second byte value.
                it.error = true;
            }
            return true;
        }

        @Override
        CharsetMatch match(CharsetDetector det) {
            int confidence = match(det, commonChars);
            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
        }

        @Override
        String getName() {
            return "Big5";
        }

        @Override
        public String getLanguage() {
            return "zh";
        }
    }


    /**
     *   EUC charset recognizers.  One abstract class that provides the common function
     *             for getting the next character according to the EUC encoding scheme,
     *             and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
     *
     */
    abstract static class CharsetRecog_euc extends CharsetRecog_mbcs {

        /*
         *  (non-Javadoc)
         *  Get the next character value for EUC based encodings.
         *  Character "value" is simply the raw bytes that make up the character
         *     packed into an int.
         */
        @Override
        boolean nextChar(iteratedChar it, CharsetDetector det) {
            it.error = false;
            int firstByte = 0;
            int secondByte = 0;
            int thirdByte = 0;

            buildChar: {
                firstByte = it.charValue = it.nextByte(det);
                if (firstByte < 0) {
                    // Ran off the end of the input data
                    it.done = true;
                    break buildChar;
                }
                if (firstByte <= 0x8d) {
                    // single byte char
                    break buildChar;
                }

                secondByte = it.nextByte(det);
                it.charValue = (it.charValue << 8) | secondByte;

                if (firstByte >= 0xA1 && firstByte <= 0xfe) {
                    // Two byte Char
                    if (secondByte < 0xa1) {
                        it.error = true;
                    }
                    break buildChar;
                }
                if (firstByte == 0x8e) {
                    // Code Set 2.
                    //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
                    //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
                    // We don't know which we've got.
                    // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
                    //   bytes will look like a well formed 2 byte char.
                    if (secondByte < 0xa1) {
                        it.error = true;
                    }
                    break buildChar;
                }

                if (firstByte == 0x8f) {
                    // Code set 3.
                    // Three byte total char size, two bytes of actual char value.
                    thirdByte = it.nextByte(det);
                    it.charValue = (it.charValue << 8) | thirdByte;
                    if (thirdByte < 0xa1) {
                        it.error = true;
                    }
                }
            }

            // nextByte() sets done when it runs past the end of the input, so a
            // character truncated by end-of-input is not reported to the caller.
            return !it.done;
        }

        /**
         * The charset recognizer for EUC-JP.  A singleton instance of this class
         *    is created and kept by the public CharsetDetector class
         */
        static class CharsetRecog_euc_jp extends CharsetRecog_euc {
            static int[] commonChars =
                // TODO:  This set of data comes from the character frequency-
                //        of-occurrence analysis tool.  The data needs to be moved
                //        into a resource and loaded from there.
                {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
                 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
                 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
                 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
                 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
                 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
                 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
                 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
                 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
                 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};

            @Override
            String getName() {
                return "EUC-JP";
            }

            @Override
            CharsetMatch match(CharsetDetector det) {
                int confidence = match(det, commonChars);
                return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
            }

            @Override
            public String getLanguage() {
                return "ja";
            }
        }

        /**
         * The charset recognizer for EUC-KR.  A singleton instance of this class
         *    is created and kept by the public CharsetDetector class
         */
        static class CharsetRecog_euc_kr extends CharsetRecog_euc {
            static int[] commonChars =
                // TODO:  This set of data comes from the character frequency-
                //        of-occurrence analysis tool.  The data needs to be moved
                //        into a resource and loaded from there.
                {0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
                 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
                 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
                 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
                 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
                 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
                 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
                 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
                 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
                 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};

            @Override
            String getName() {
                return "EUC-KR";
            }

            @Override
            CharsetMatch match(CharsetDetector det) {
                int confidence = match(det, commonChars);
                return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
            }

            @Override
            public String getLanguage() {
                return "ko";
            }
        }
    }

    /**
     *
     *   GB-18030 recognizer. Uses simplified Chinese statistics.
     *
     */
    static class CharsetRecog_gb_18030 extends CharsetRecog_mbcs {

        /*
         *  (non-Javadoc)
         *  Get the next character value for the GB18030 encoding.
         *  Character "value" is simply the raw bytes that make up the character
         *     packed into an int.
         */
        @Override
        boolean nextChar(iteratedChar it, CharsetDetector det) {
            it.error = false;
            int firstByte = 0;
            int secondByte = 0;
            int thirdByte = 0;
            int fourthByte = 0;

            buildChar: {
                firstByte = it.charValue = it.nextByte(det);

                if (firstByte < 0) {
                    // Ran off the end of the input data
                    it.done = true;
                    break buildChar;
                }

                if (firstByte <= 0x80) {
                    // single byte char
                    break buildChar;
                }

                secondByte = it.nextByte(det);
                it.charValue = (it.charValue << 8) | secondByte;

                if (firstByte >= 0x81 && firstByte <= 0xFE) {
                    // Two byte Char.
                    // Legal trail bytes are 0x40-0x7E and 0x80-0xFE; 0x7F is excluded,
                    // which is why the two ranges are tested separately.
                    if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >= 0x80 && secondByte <= 0xFE)) {
                        break buildChar;
                    }

                    // Four byte char
                    if (secondByte >= 0x30 && secondByte <= 0x39) {
                        thirdByte = it.nextByte(det);

                        if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
                            fourthByte = it.nextByte(det);

                            if (fourthByte >= 0x30 && fourthByte <= 0x39) {
                                it.charValue = (it.charValue << 16) | (thirdByte << 8) | fourthByte;
                                break buildChar;
                            }
                        }
                    }

                    // Neither a well formed two byte nor four byte sequence.
                    it.error = true;
                    break buildChar;
                }
            }

            // nextByte() sets done when it runs past the end of the input, so a
            // character truncated by end-of-input is not reported to the caller.
            return !it.done;
        }

        static int[] commonChars =
            // TODO:  This set of data comes from the character frequency-
            //        of-occurrence analysis tool.  The data needs to be moved
            //        into a resource and loaded from there.
            {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
             0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
             0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
             0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
             0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
             0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
             0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
             0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
             0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
             0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};


        @Override
        String getName() {
            return "GB18030";
        }

        @Override
        CharsetMatch match(CharsetDetector det) {
            int confidence = match(det, commonChars);
            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
        }

        @Override
        public String getLanguage() {
            return "zh";
        }
    }


}