1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2005 - 2012, International Business Machines Corporation and * 7 * others. All Rights Reserved. * 8 ******************************************************************************* 9 */ 10 package ohos.global.icu.text; 11 12 /** 13 * class CharsetRecog_2022 part of the ICU charset detection imlementation. 14 * This is a superclass for the individual detectors for 15 * each of the detectable members of the ISO 2022 family 16 * of encodings. 17 * 18 * The separate classes are nested within this class. 19 */ 20 abstract class CharsetRecog_2022 extends CharsetRecognizer { 21 22 23 /** 24 * Matching function shared among the 2022 detectors JP, CN and KR 25 * Counts up the number of legal an unrecognized escape sequences in 26 * the sample of text, and computes a score based on the total number & 27 * the proportion that fit the encoding. 28 * 29 * 30 * @param text the byte buffer containing text to analyse 31 * @param textLen the size of the text in the byte. 32 * @param escapeSequences the byte escape sequences to test for. 33 * @return match quality, in the range of 0-100. 34 */ match(byte [] text, int textLen, byte [][] escapeSequences)35 int match(byte [] text, int textLen, byte [][] escapeSequences) { 36 int i, j; 37 int escN; 38 int hits = 0; 39 int misses = 0; 40 int shifts = 0; 41 int quality; 42 scanInput: 43 for (i=0; i<textLen; i++) { 44 if (text[i] == 0x1b) { 45 checkEscapes: 46 for (escN=0; escN<escapeSequences.length; escN++) { 47 byte [] seq = escapeSequences[escN]; 48 49 if ((textLen - i) < seq.length) { 50 continue checkEscapes; 51 } 52 53 for (j=1; j<seq.length; j++) { 54 if (seq[j] != text[i+j]) { 55 continue checkEscapes; 56 } 57 } 58 59 hits++; 60 i += seq.length-1; 61 continue scanInput; 62 } 63 64 misses++; 65 } 66 67 if (text[i] == 0x0e || text[i] == 0x0f) { 68 // Shift in/out 69 shifts++; 70 } 71 } 72 73 if (hits == 0) { 74 return 0; 75 } 76 77 // 78 // Initial quality is based on relative proportion of recongized vs. 79 // unrecognized escape sequences. 80 // All good: quality = 100; 81 // half or less good: quality = 0; 82 // linear inbetween. 83 quality = (100*hits - 100*misses) / (hits + misses); 84 85 // Back off quality if there were too few escape sequences seen. 86 // Include shifts in this computation, so that KR does not get penalized 87 // for having only a single Escape sequence, but many shifts. 88 if (hits+shifts < 5) { 89 quality -= (5-(hits+shifts))*10; 90 } 91 92 if (quality < 0) { 93 quality = 0; 94 } 95 return quality; 96 } 97 98 99 100 101 static class CharsetRecog_2022JP extends CharsetRecog_2022 { 102 private byte [] [] escapeSequences = { 103 {0x1b, 0x24, 0x28, 0x43}, // KS X 1001:1992 104 {0x1b, 0x24, 0x28, 0x44}, // JIS X 212-1990 105 {0x1b, 0x24, 0x40}, // JIS C 6226-1978 106 {0x1b, 0x24, 0x41}, // GB 2312-80 107 {0x1b, 0x24, 0x42}, // JIS X 208-1983 108 {0x1b, 0x26, 0x40}, // JIS X 208 1990, 1997 109 {0x1b, 0x28, 0x42}, // ASCII 110 {0x1b, 0x28, 0x48}, // JIS-Roman 111 {0x1b, 0x28, 0x49}, // Half-width katakana 112 {0x1b, 0x28, 0x4a}, // JIS-Roman 113 {0x1b, 0x2e, 0x41}, // ISO 8859-1 114 {0x1b, 0x2e, 0x46} // ISO 8859-7 115 }; 116 117 @Override getName()118 String getName() { 119 return "ISO-2022-JP"; 120 } 121 122 @Override match(CharsetDetector det)123 CharsetMatch match(CharsetDetector det) { 124 int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences); 125 return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 126 } 127 } 128 129 static class CharsetRecog_2022KR extends CharsetRecog_2022 { 130 private byte [] [] escapeSequences = { 131 {0x1b, 0x24, 0x29, 0x43} 132 }; 133 134 @Override getName()135 String getName() { 136 return "ISO-2022-KR"; 137 } 138 139 @Override match(CharsetDetector det)140 CharsetMatch match(CharsetDetector det) { 141 int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences); 142 return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 143 } 144 } 145 146 static class CharsetRecog_2022CN extends CharsetRecog_2022 { 147 private byte [] [] escapeSequences = { 148 {0x1b, 0x24, 0x29, 0x41}, // GB 2312-80 149 {0x1b, 0x24, 0x29, 0x47}, // CNS 11643-1992 Plane 1 150 {0x1b, 0x24, 0x2A, 0x48}, // CNS 11643-1992 Plane 2 151 {0x1b, 0x24, 0x29, 0x45}, // ISO-IR-165 152 {0x1b, 0x24, 0x2B, 0x49}, // CNS 11643-1992 Plane 3 153 {0x1b, 0x24, 0x2B, 0x4A}, // CNS 11643-1992 Plane 4 154 {0x1b, 0x24, 0x2B, 0x4B}, // CNS 11643-1992 Plane 5 155 {0x1b, 0x24, 0x2B, 0x4C}, // CNS 11643-1992 Plane 6 156 {0x1b, 0x24, 0x2B, 0x4D}, // CNS 11643-1992 Plane 7 157 {0x1b, 0x4e}, // SS2 158 {0x1b, 0x4f}, // SS3 159 }; 160 161 @Override getName()162 String getName() { 163 return "ISO-2022-CN"; 164 } 165 166 @Override match(CharsetDetector det)167 CharsetMatch match(CharsetDetector det) { 168 int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences); 169 return confidence == 0 ? null : new CharsetMatch(det, this, confidence); 170 } 171 } 172 173 } 174 175