• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5 *******************************************************************************
6 * Copyright (C) 2005 - 2012, International Business Machines Corporation and  *
7 * others. All Rights Reserved.                                                *
8 *******************************************************************************
9 */
10 package ohos.global.icu.text;
11 
12 /**
13  *  class CharsetRecog_2022  part of the ICU charset detection imlementation.
14  *                           This is a superclass for the individual detectors for
15  *                           each of the detectable members of the ISO 2022 family
16  *                           of encodings.
17  *
18  *                           The separate classes are nested within this class.
19  */
20 abstract class CharsetRecog_2022 extends CharsetRecognizer {
21 
22 
23     /**
24      * Matching function shared among the 2022 detectors JP, CN and KR
25      * Counts up the number of legal an unrecognized escape sequences in
26      * the sample of text, and computes a score based on the total number &
27      * the proportion that fit the encoding.
28      *
29      *
30      * @param text the byte buffer containing text to analyse
31      * @param textLen  the size of the text in the byte.
32      * @param escapeSequences the byte escape sequences to test for.
33      * @return match quality, in the range of 0-100.
34      */
match(byte [] text, int textLen, byte [][] escapeSequences)35     int   match(byte [] text, int textLen, byte [][] escapeSequences) {
36         int     i, j;
37         int     escN;
38         int     hits   = 0;
39         int     misses = 0;
40         int     shifts = 0;
41         int     quality;
42         scanInput:
43             for (i=0; i<textLen; i++) {
44                 if (text[i] == 0x1b) {
45                     checkEscapes:
46                         for (escN=0; escN<escapeSequences.length; escN++) {
47                             byte [] seq = escapeSequences[escN];
48 
49                             if ((textLen - i) < seq.length) {
50                                 continue checkEscapes;
51                             }
52 
53                             for (j=1; j<seq.length; j++) {
54                                 if (seq[j] != text[i+j])  {
55                                     continue checkEscapes;
56                                 }
57                             }
58 
59                             hits++;
60                             i += seq.length-1;
61                             continue scanInput;
62                         }
63 
64                         misses++;
65                 }
66 
67                 if (text[i] == 0x0e || text[i] == 0x0f) {
68                     // Shift in/out
69                     shifts++;
70                 }
71             }
72 
73         if (hits == 0) {
74             return 0;
75         }
76 
77         //
78         // Initial quality is based on relative proportion of recongized vs.
79         //   unrecognized escape sequences.
80         //   All good:  quality = 100;
81         //   half or less good: quality = 0;
82         //   linear inbetween.
83         quality = (100*hits - 100*misses) / (hits + misses);
84 
85         // Back off quality if there were too few escape sequences seen.
86         //   Include shifts in this computation, so that KR does not get penalized
87         //   for having only a single Escape sequence, but many shifts.
88         if (hits+shifts < 5) {
89             quality -= (5-(hits+shifts))*10;
90         }
91 
92         if (quality < 0) {
93             quality = 0;
94         }
95         return quality;
96     }
97 
98 
99 
100 
101     static class CharsetRecog_2022JP extends CharsetRecog_2022 {
102         private byte [] [] escapeSequences = {
103                 {0x1b, 0x24, 0x28, 0x43},   // KS X 1001:1992
104                 {0x1b, 0x24, 0x28, 0x44},   // JIS X 212-1990
105                 {0x1b, 0x24, 0x40},         // JIS C 6226-1978
106                 {0x1b, 0x24, 0x41},         // GB 2312-80
107                 {0x1b, 0x24, 0x42},         // JIS X 208-1983
108                 {0x1b, 0x26, 0x40},         // JIS X 208 1990, 1997
109                 {0x1b, 0x28, 0x42},         // ASCII
110                 {0x1b, 0x28, 0x48},         // JIS-Roman
111                 {0x1b, 0x28, 0x49},         // Half-width katakana
112                 {0x1b, 0x28, 0x4a},         // JIS-Roman
113                 {0x1b, 0x2e, 0x41},         // ISO 8859-1
114                 {0x1b, 0x2e, 0x46}          // ISO 8859-7
115                 };
116 
117         @Override
getName()118         String getName() {
119             return "ISO-2022-JP";
120         }
121 
122         @Override
match(CharsetDetector det)123         CharsetMatch   match(CharsetDetector det) {
124             int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences);
125             return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
126         }
127     }
128 
129     static class CharsetRecog_2022KR extends CharsetRecog_2022 {
130         private byte [] [] escapeSequences = {
131                 {0x1b, 0x24, 0x29, 0x43}
132                  };
133 
134         @Override
getName()135         String getName() {
136             return "ISO-2022-KR";
137         }
138 
139         @Override
match(CharsetDetector det)140         CharsetMatch   match(CharsetDetector det) {
141             int confidence =  match(det.fInputBytes, det.fInputLen, escapeSequences);
142             return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
143         }
144     }
145 
146     static class CharsetRecog_2022CN extends CharsetRecog_2022 {
147         private byte [] [] escapeSequences = {
148                 {0x1b, 0x24, 0x29, 0x41},   // GB 2312-80
149                 {0x1b, 0x24, 0x29, 0x47},   // CNS 11643-1992 Plane 1
150                 {0x1b, 0x24, 0x2A, 0x48},   // CNS 11643-1992 Plane 2
151                 {0x1b, 0x24, 0x29, 0x45},   // ISO-IR-165
152                 {0x1b, 0x24, 0x2B, 0x49},   // CNS 11643-1992 Plane 3
153                 {0x1b, 0x24, 0x2B, 0x4A},   // CNS 11643-1992 Plane 4
154                 {0x1b, 0x24, 0x2B, 0x4B},   // CNS 11643-1992 Plane 5
155                 {0x1b, 0x24, 0x2B, 0x4C},   // CNS 11643-1992 Plane 6
156                 {0x1b, 0x24, 0x2B, 0x4D},   // CNS 11643-1992 Plane 7
157                 {0x1b, 0x4e},               // SS2
158                 {0x1b, 0x4f},               // SS3
159         };
160 
161         @Override
getName()162         String getName() {
163             return "ISO-2022-CN";
164         }
165 
166         @Override
match(CharsetDetector det)167         CharsetMatch   match(CharsetDetector det) {
168             int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences);
169             return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
170         }
171     }
172 
173 }
174 
175