• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/* GENERATED SOURCE. DO NOT MODIFY. */
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/**
*******************************************************************************
* Copyright (C) 2005 - 2014, International Business Machines Corporation and  *
* others. All Rights Reserved.                                                *
*******************************************************************************
*/
10 package ohos.global.icu.text;
11 
12 /**
13  * Charset recognizer for UTF-8
14  */
15 class CharsetRecog_UTF8 extends CharsetRecognizer {
16 
17     @Override
getName()18     String getName() {
19         return "UTF-8";
20     }
21 
22     /* (non-Javadoc)
23      * @see ohos.global.icu.text.CharsetRecognizer#match(ohos.global.icu.text.CharsetDetector)
24      */
25     @Override
match(CharsetDetector det)26     CharsetMatch match(CharsetDetector det) {
27         boolean     hasBOM = false;
28         int         numValid = 0;
29         int         numInvalid = 0;
30         byte        input[] = det.fRawInput;
31         int         i;
32         int         trailBytes = 0;
33         int         confidence;
34 
35         if (det.fRawLength >= 3 &&
36                 (input[0] & 0xFF) == 0xef && (input[1] & 0xFF) == 0xbb && (input[2] & 0xFF) == 0xbf) {
37             hasBOM = true;
38         }
39 
40         // Scan for multi-byte sequences
41         for (i=0; i<det.fRawLength; i++) {
42             int b = input[i];
43             if ((b & 0x80) == 0) {
44                 continue;   // ASCII
45             }
46 
47             // Hi bit on char found.  Figure out how long the sequence should be
48             if ((b & 0x0e0) == 0x0c0) {
49                 trailBytes = 1;
50             } else if ((b & 0x0f0) == 0x0e0) {
51                 trailBytes = 2;
52             } else if ((b & 0x0f8) == 0xf0) {
53                 trailBytes = 3;
54             } else {
55                 numInvalid++;
56                 continue;
57             }
58 
59             // Verify that we've got the right number of trail bytes in the sequence
60             for (;;) {
61                 i++;
62                 if (i>=det.fRawLength) {
63                     break;
64                 }
65                 b = input[i];
66                 if ((b & 0xc0) != 0x080) {
67                     numInvalid++;
68                     break;
69                 }
70                 if (--trailBytes == 0) {
71                     numValid++;
72                     break;
73                 }
74             }
75         }
76 
77         // Cook up some sort of confidence score, based on presense of a BOM
78         //    and the existence of valid and/or invalid multi-byte sequences.
79         confidence = 0;
80         if (hasBOM && numInvalid==0) {
81             confidence = 100;
82         } else if (hasBOM && numValid > numInvalid*10) {
83             confidence = 80;
84         } else if (numValid > 3 && numInvalid == 0) {
85             confidence = 100;
86         } else if (numValid > 0 && numInvalid == 0) {
87             confidence = 80;
88         } else if (numValid == 0 && numInvalid == 0) {
89             // Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which
90             //              accepts ASCII with confidence = 10.
91             // TODO: add plain ASCII as an explicitly detected type.
92             confidence = 15;
93         } else if (numValid > numInvalid*10) {
94             // Probably corruput utf-8 data.  Valid sequences aren't likely by chance.
95             confidence = 25;
96         }
97         return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
98     }
99 
100 }
101