1 /* 2 ******************************************************************************* 3 * Copyright (C) 2012-2014, International Business Machines Corporation and * 4 * others. All Rights Reserved. * 5 ******************************************************************************* 6 */ 7 package com.ibm.icu.text; 8 9 import static com.ibm.icu.impl.CharacterIteration.DONE32; 10 import static com.ibm.icu.impl.CharacterIteration.current32; 11 import static com.ibm.icu.impl.CharacterIteration.next32; 12 13 import java.io.IOException; 14 import java.text.CharacterIterator; 15 16 import com.ibm.icu.impl.Assert; 17 18 class CjkBreakEngine extends DictionaryBreakEngine { 19 private static final UnicodeSet fHangulWordSet = new UnicodeSet(); 20 private static final UnicodeSet fHanWordSet = new UnicodeSet(); 21 private static final UnicodeSet fKatakanaWordSet = new UnicodeSet(); 22 private static final UnicodeSet fHiraganaWordSet = new UnicodeSet(); 23 static { 24 fHangulWordSet.applyPattern("[\\uac00-\\ud7a3]"); 25 fHanWordSet.applyPattern("[:Han:]"); 26 fKatakanaWordSet.applyPattern("[[:Katakana:]\\uff9e\\uff9f]"); 27 fHiraganaWordSet.applyPattern("[:Hiragana:]"); 28 29 // freeze them all fHangulWordSet.freeze()30 fHangulWordSet.freeze(); fHanWordSet.freeze()31 fHanWordSet.freeze(); fKatakanaWordSet.freeze()32 fKatakanaWordSet.freeze(); fHiraganaWordSet.freeze()33 fHiraganaWordSet.freeze(); 34 } 35 36 private DictionaryMatcher fDictionary = null; 37 CjkBreakEngine(boolean korean)38 public CjkBreakEngine(boolean korean) throws IOException { 39 super(BreakIterator.KIND_WORD); 40 fDictionary = DictionaryData.loadDictionaryFor("Hira"); 41 if (korean) { 42 setCharacters(fHangulWordSet); 43 } else { //Chinese and Japanese 44 UnicodeSet cjSet = new UnicodeSet(); 45 cjSet = new UnicodeSet(); 46 cjSet.addAll(fHanWordSet); 47 cjSet.addAll(fKatakanaWordSet); 48 cjSet.addAll(fHiraganaWordSet); 49 cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK 50 cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK 51 setCharacters(cjSet); 52 } 53 } 54 equals(Object obj)55 public boolean equals(Object obj) { 56 if (obj instanceof CjkBreakEngine) { 57 CjkBreakEngine other = (CjkBreakEngine)obj; 58 return this.fSet.equals(other.fSet); 59 } 60 return false; 61 } 62 hashCode()63 public int hashCode() { 64 return getClass().hashCode(); 65 } 66 67 private static final int kMaxKatakanaLength = 8; 68 private static final int kMaxKatakanaGroupLength = 20; 69 private static final int maxSnlp = 255; 70 private static final int kint32max = Integer.MAX_VALUE; getKatakanaCost(int wordlength)71 private static int getKatakanaCost(int wordlength) { 72 int katakanaCost[] = new int[] { 8192, 984, 408, 240, 204, 252, 300, 372, 480 }; 73 return (wordlength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordlength]; 74 } 75 isKatakana(int value)76 private static boolean isKatakana(int value) { 77 return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) || 78 (value >= 0xFF66 && value <= 0xFF9F); 79 } 80 divideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos, DequeI foundBreaks)81 public int divideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos, 82 DequeI foundBreaks) { 83 if (startPos >= endPos) { 84 return 0; 85 } 86 87 inText.setIndex(startPos); 88 89 int inputLength = endPos - startPos; 90 int[] charPositions = new int[inputLength + 1]; 91 StringBuffer s = new StringBuffer(""); 92 inText.setIndex(startPos); 93 while (inText.getIndex() < endPos) { 94 s.append(inText.current()); 95 inText.next(); 96 } 97 String prenormstr = s.toString(); 98 boolean isNormalized = Normalizer.quickCheck(prenormstr, Normalizer.NFKC) == Normalizer.YES || 99 Normalizer.isNormalized(prenormstr, Normalizer.NFKC, 0); 100 CharacterIterator text; 101 int numChars = 0; 102 if (isNormalized) { 103 text = new java.text.StringCharacterIterator(prenormstr); 104 int index = 0; 105 charPositions[0] = 0; 106 while (index < prenormstr.length()) { 107 int codepoint = prenormstr.codePointAt(index); 108 index += Character.charCount(codepoint); 109 numChars++; 110 charPositions[numChars] = index; 111 } 112 } else { 113 String normStr = Normalizer.normalize(prenormstr, Normalizer.NFKC); 114 text = new java.text.StringCharacterIterator(normStr); 115 charPositions = new int[normStr.length() + 1]; 116 Normalizer normalizer = new Normalizer(prenormstr, Normalizer.NFKC, 0); 117 int index = 0; 118 charPositions[0] = 0; 119 while (index < normalizer.endIndex()) { 120 normalizer.next(); 121 numChars++; 122 index = normalizer.getIndex(); 123 charPositions[numChars] = index; 124 } 125 } 126 127 // From here on out, do the algorithm. Note that our indices 128 // refer to indices within the normalized string. 129 int[] bestSnlp = new int[numChars + 1]; 130 bestSnlp[0] = 0; 131 for (int i = 1; i <= numChars; i++) { 132 bestSnlp[i] = kint32max; 133 } 134 135 int[] prev = new int[numChars + 1]; 136 for (int i = 0; i <= numChars; i++) { 137 prev[i] = -1; 138 } 139 140 final int maxWordSize = 20; 141 int values[] = new int[numChars]; 142 int lengths[] = new int[numChars]; 143 // dynamic programming to find the best segmentation 144 boolean is_prev_katakana = false; 145 for (int i = 0; i < numChars; i++) { 146 text.setIndex(i); 147 if (bestSnlp[i] == kint32max) { 148 continue; 149 } 150 151 int maxSearchLength = (i + maxWordSize < numChars) ? maxWordSize : (numChars - i); 152 int[] count_ = new int[1]; 153 fDictionary.matches(text, maxSearchLength, lengths, count_, maxSearchLength, values); 154 int count = count_[0]; 155 156 // if there are no single character matches found in the dictionary 157 // starting with this character, treat character as a 1-character word 158 // with the highest value possible (i.e. the least likely to occur). 159 // Exclude Korean characters from this treatment, as they should be 160 // left together by default. 161 if ((count == 0 || lengths[0] != 1) && current32(text) != DONE32 && !fHangulWordSet.contains(current32(text))) { 162 values[count] = maxSnlp; 163 lengths[count] = 1; 164 count++; 165 } 166 167 for (int j = 0; j < count; j++) { 168 int newSnlp = bestSnlp[i] + values[j]; 169 if (newSnlp < bestSnlp[lengths[j] + i]) { 170 bestSnlp[lengths[j] + i] = newSnlp; 171 prev[lengths[j] + i] = i; 172 } 173 } 174 175 // In Japanese, single-character Katakana words are pretty rare. 176 // So we apply the following heuristic to Katakana: any continuous 177 // run of Katakana characters is considered a candidate word with 178 // a default cost specified in the katakanaCost table according 179 // to its length. 180 text.setIndex(i); 181 boolean is_katakana = isKatakana(current32(text)); 182 if (!is_prev_katakana && is_katakana) { 183 int j = i + 1; 184 next32(text); 185 while (j < numChars && (j - i) < kMaxKatakanaGroupLength && isKatakana(current32(text))) { 186 next32(text); 187 ++j; 188 } 189 190 if ((j - i) < kMaxKatakanaGroupLength) { 191 int newSnlp = bestSnlp[i] + getKatakanaCost(j - i); 192 if (newSnlp < bestSnlp[j]) { 193 bestSnlp[j] = newSnlp; 194 prev[j] = i; 195 } 196 } 197 } 198 is_prev_katakana = is_katakana; 199 } 200 201 int t_boundary[] = new int[numChars + 1]; 202 int numBreaks = 0; 203 if (bestSnlp[numChars] == kint32max) { 204 t_boundary[numBreaks] = numChars; 205 numBreaks++; 206 } else { 207 for (int i = numChars; i > 0; i = prev[i]) { 208 t_boundary[numBreaks] = i; 209 numBreaks++; 210 } 211 Assert.assrt(prev[t_boundary[numBreaks - 1]] == 0); 212 } 213 214 if (foundBreaks.size() == 0 || foundBreaks.peek() < startPos) { 215 t_boundary[numBreaks++] = 0; 216 } 217 218 int correctedNumBreaks = 0; 219 for (int i = numBreaks - 1; i >= 0; i--) { 220 int pos = charPositions[t_boundary[i]] + startPos; 221 if (!(foundBreaks.contains(pos) || pos == startPos)) { 222 foundBreaks.push(charPositions[t_boundary[i]] + startPos); 223 correctedNumBreaks++; 224 } 225 } 226 227 if (!foundBreaks.isEmpty() && foundBreaks.peek() == endPos) { 228 foundBreaks.pop(); 229 correctedNumBreaks--; 230 } 231 if (!foundBreaks.isEmpty()) 232 inText.setIndex(foundBreaks.peek()); 233 return correctedNumBreaks; 234 } 235 } 236