1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2012-2016, International Business Machines Corporation and * 7 * others. All Rights Reserved. * 8 ******************************************************************************* 9 */ 10 package ohos.global.icu.text; 11 12 import static ohos.global.icu.impl.CharacterIteration.DONE32; 13 import static ohos.global.icu.impl.CharacterIteration.current32; 14 import static ohos.global.icu.impl.CharacterIteration.next32; 15 16 import java.io.IOException; 17 import java.text.CharacterIterator; 18 19 import ohos.global.icu.impl.Assert; 20 21 class CjkBreakEngine extends DictionaryBreakEngine { 22 private static final UnicodeSet fHangulWordSet = new UnicodeSet(); 23 private static final UnicodeSet fHanWordSet = new UnicodeSet(); 24 private static final UnicodeSet fKatakanaWordSet = new UnicodeSet(); 25 private static final UnicodeSet fHiraganaWordSet = new UnicodeSet(); 26 static { 27 fHangulWordSet.applyPattern("[\\uac00-\\ud7a3]"); 28 fHanWordSet.applyPattern("[:Han:]"); 29 fKatakanaWordSet.applyPattern("[[:Katakana:]\\uff9e\\uff9f]"); 30 fHiraganaWordSet.applyPattern("[:Hiragana:]"); 31 32 // freeze them all fHangulWordSet.freeze()33 fHangulWordSet.freeze(); fHanWordSet.freeze()34 fHanWordSet.freeze(); fKatakanaWordSet.freeze()35 fKatakanaWordSet.freeze(); fHiraganaWordSet.freeze()36 fHiraganaWordSet.freeze(); 37 } 38 39 private DictionaryMatcher fDictionary = null; 40 CjkBreakEngine(boolean korean)41 public CjkBreakEngine(boolean korean) throws IOException { 42 fDictionary = DictionaryData.loadDictionaryFor("Hira"); 43 if (korean) { 44 setCharacters(fHangulWordSet); 45 } else { //Chinese and Japanese 46 UnicodeSet cjSet = new UnicodeSet(); 47 cjSet.addAll(fHanWordSet); 48 cjSet.addAll(fKatakanaWordSet); 49 cjSet.addAll(fHiraganaWordSet); 50 cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK 51 cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK 52 setCharacters(cjSet); 53 } 54 } 55 56 @Override equals(Object obj)57 public boolean equals(Object obj) { 58 if (obj instanceof CjkBreakEngine) { 59 CjkBreakEngine other = (CjkBreakEngine)obj; 60 return this.fSet.equals(other.fSet); 61 } 62 return false; 63 } 64 65 @Override hashCode()66 public int hashCode() { 67 return getClass().hashCode(); 68 } 69 70 private static final int kMaxKatakanaLength = 8; 71 private static final int kMaxKatakanaGroupLength = 20; 72 private static final int maxSnlp = 255; 73 private static final int kint32max = Integer.MAX_VALUE; getKatakanaCost(int wordlength)74 private static int getKatakanaCost(int wordlength) { 75 int katakanaCost[] = new int[] { 8192, 984, 408, 240, 204, 252, 300, 372, 480 }; 76 return (wordlength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordlength]; 77 } 78 isKatakana(int value)79 private static boolean isKatakana(int value) { 80 return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) || 81 (value >= 0xFF66 && value <= 0xFF9F); 82 } 83 84 @Override divideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos, DequeI foundBreaks)85 public int divideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos, 86 DequeI foundBreaks) { 87 if (startPos >= endPos) { 88 return 0; 89 } 90 91 inText.setIndex(startPos); 92 93 int inputLength = endPos - startPos; 94 int[] charPositions = new int[inputLength + 1]; 95 StringBuffer s = new StringBuffer(""); 96 inText.setIndex(startPos); 97 while (inText.getIndex() < endPos) { 98 s.append(inText.current()); 99 inText.next(); 100 } 101 String prenormstr = s.toString(); 102 boolean isNormalized = Normalizer.quickCheck(prenormstr, Normalizer.NFKC) == Normalizer.YES || 103 Normalizer.isNormalized(prenormstr, Normalizer.NFKC, 0); 104 CharacterIterator text; 105 int numCodePts = 0; 106 if (isNormalized) { 107 text = new java.text.StringCharacterIterator(prenormstr); 108 int index = 0; 109 charPositions[0] = 0; 110 while (index < prenormstr.length()) { 111 int codepoint = prenormstr.codePointAt(index); 112 index += Character.charCount(codepoint); 113 numCodePts++; 114 charPositions[numCodePts] = index; 115 } 116 } else { 117 String normStr = Normalizer.normalize(prenormstr, Normalizer.NFKC); 118 text = new java.text.StringCharacterIterator(normStr); 119 charPositions = new int[normStr.length() + 1]; 120 Normalizer normalizer = new Normalizer(prenormstr, Normalizer.NFKC, 0); 121 int index = 0; 122 charPositions[0] = 0; 123 while (index < normalizer.endIndex()) { 124 normalizer.next(); 125 numCodePts++; 126 index = normalizer.getIndex(); 127 charPositions[numCodePts] = index; 128 } 129 } 130 131 // From here on out, do the algorithm. Note that our indices 132 // refer to indices within the normalized string. 133 int[] bestSnlp = new int[numCodePts + 1]; 134 bestSnlp[0] = 0; 135 for (int i = 1; i <= numCodePts; i++) { 136 bestSnlp[i] = kint32max; 137 } 138 139 int[] prev = new int[numCodePts + 1]; 140 for (int i = 0; i <= numCodePts; i++) { 141 prev[i] = -1; 142 } 143 144 final int maxWordSize = 20; 145 int values[] = new int[numCodePts]; 146 int lengths[] = new int[numCodePts]; 147 // dynamic programming to find the best segmentation 148 149 // In outer loop, i is the code point index, 150 // ix is the corresponding code unit index. 151 // They differ when the string contains supplementary characters. 152 int ix = 0; 153 text.setIndex(ix); 154 boolean is_prev_katakana = false; 155 for (int i = 0; i < numCodePts; i++, text.setIndex(ix), next32(text)) { 156 ix = text.getIndex(); 157 if (bestSnlp[i] == kint32max) { 158 continue; 159 } 160 161 int maxSearchLength = (i + maxWordSize < numCodePts) ? maxWordSize : (numCodePts - i); 162 int[] count_ = new int[1]; 163 fDictionary.matches(text, maxSearchLength, lengths, count_, maxSearchLength, values); 164 int count = count_[0]; 165 166 // if there are no single character matches found in the dictionary 167 // starting with this character, treat character as a 1-character word 168 // with the highest value possible (i.e. the least likely to occur). 169 // Exclude Korean characters from this treatment, as they should be 170 // left together by default. 171 text.setIndex(ix); // fDictionary.matches() advances the text position; undo that. 172 if ((count == 0 || lengths[0] != 1) && current32(text) != DONE32 && !fHangulWordSet.contains(current32(text))) { 173 values[count] = maxSnlp; 174 lengths[count] = 1; 175 count++; 176 } 177 178 for (int j = 0; j < count; j++) { 179 int newSnlp = bestSnlp[i] + values[j]; 180 if (newSnlp < bestSnlp[lengths[j] + i]) { 181 bestSnlp[lengths[j] + i] = newSnlp; 182 prev[lengths[j] + i] = i; 183 } 184 } 185 186 // In Japanese, single-character Katakana words are pretty rare. 187 // So we apply the following heuristic to Katakana: any continuous 188 // run of Katakana characters is considered a candidate word with 189 // a default cost specified in the katakanaCost table according 190 // to its length. 191 boolean is_katakana = isKatakana(current32(text)); 192 if (!is_prev_katakana && is_katakana) { 193 int j = i + 1; 194 next32(text); 195 while (j < numCodePts && (j - i) < kMaxKatakanaGroupLength && isKatakana(current32(text))) { 196 next32(text); 197 ++j; 198 } 199 200 if ((j - i) < kMaxKatakanaGroupLength) { 201 int newSnlp = bestSnlp[i] + getKatakanaCost(j - i); 202 if (newSnlp < bestSnlp[j]) { 203 bestSnlp[j] = newSnlp; 204 prev[j] = i; 205 } 206 } 207 } 208 is_prev_katakana = is_katakana; 209 } 210 211 int t_boundary[] = new int[numCodePts + 1]; 212 int numBreaks = 0; 213 if (bestSnlp[numCodePts] == kint32max) { 214 t_boundary[numBreaks] = numCodePts; 215 numBreaks++; 216 } else { 217 for (int i = numCodePts; i > 0; i = prev[i]) { 218 t_boundary[numBreaks] = i; 219 numBreaks++; 220 } 221 Assert.assrt(prev[t_boundary[numBreaks - 1]] == 0); 222 } 223 224 if (foundBreaks.size() == 0 || foundBreaks.peek() < startPos) { 225 t_boundary[numBreaks++] = 0; 226 } 227 228 int correctedNumBreaks = 0; 229 for (int i = numBreaks - 1; i >= 0; i--) { 230 int pos = charPositions[t_boundary[i]] + startPos; 231 if (!(foundBreaks.contains(pos) || pos == startPos)) { 232 foundBreaks.push(charPositions[t_boundary[i]] + startPos); 233 correctedNumBreaks++; 234 } 235 } 236 237 if (!foundBreaks.isEmpty() && foundBreaks.peek() == endPos) { 238 foundBreaks.pop(); 239 correctedNumBreaks--; 240 } 241 if (!foundBreaks.isEmpty()) 242 inText.setIndex(foundBreaks.peek()); 243 return correctedNumBreaks; 244 } 245 } 246