• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *******************************************************************************
3  * Copyright (C) 2012-2014, International Business Machines Corporation and         *
4  * others. All Rights Reserved.                                                *
5  *******************************************************************************
6  */
7 package com.ibm.icu.text;
8 
9 import static com.ibm.icu.impl.CharacterIteration.DONE32;
10 import static com.ibm.icu.impl.CharacterIteration.current32;
11 import static com.ibm.icu.impl.CharacterIteration.next32;
12 
13 import java.io.IOException;
14 import java.text.CharacterIterator;
15 
16 import com.ibm.icu.impl.Assert;
17 
18 class CjkBreakEngine extends DictionaryBreakEngine {
19     private static final UnicodeSet fHangulWordSet = new UnicodeSet();
20     private static final UnicodeSet fHanWordSet = new UnicodeSet();
21     private static final UnicodeSet fKatakanaWordSet = new UnicodeSet();
22     private static final UnicodeSet fHiraganaWordSet = new UnicodeSet();
23     static {
24         fHangulWordSet.applyPattern("[\\uac00-\\ud7a3]");
25         fHanWordSet.applyPattern("[:Han:]");
26         fKatakanaWordSet.applyPattern("[[:Katakana:]\\uff9e\\uff9f]");
27         fHiraganaWordSet.applyPattern("[:Hiragana:]");
28 
29         // freeze them all
fHangulWordSet.freeze()30         fHangulWordSet.freeze();
fHanWordSet.freeze()31         fHanWordSet.freeze();
fKatakanaWordSet.freeze()32         fKatakanaWordSet.freeze();
fHiraganaWordSet.freeze()33         fHiraganaWordSet.freeze();
34     }
35 
36     private DictionaryMatcher fDictionary = null;
37 
CjkBreakEngine(boolean korean)38     public CjkBreakEngine(boolean korean) throws IOException {
39         super(BreakIterator.KIND_WORD);
40         fDictionary = DictionaryData.loadDictionaryFor("Hira");
41         if (korean) {
42             setCharacters(fHangulWordSet);
43         } else { //Chinese and Japanese
44             UnicodeSet cjSet = new UnicodeSet();
45             cjSet = new UnicodeSet();
46             cjSet.addAll(fHanWordSet);
47             cjSet.addAll(fKatakanaWordSet);
48             cjSet.addAll(fHiraganaWordSet);
49             cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
50             cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
51             setCharacters(cjSet);
52         }
53     }
54 
equals(Object obj)55     public boolean equals(Object obj) {
56         if (obj instanceof CjkBreakEngine) {
57             CjkBreakEngine other = (CjkBreakEngine)obj;
58             return this.fSet.equals(other.fSet);
59         }
60         return false;
61     }
62 
hashCode()63     public int hashCode() {
64         return getClass().hashCode();
65     }
66 
67     private static final int kMaxKatakanaLength = 8;
68     private static final int kMaxKatakanaGroupLength = 20;
69     private static final int maxSnlp = 255;
70     private static final int kint32max = Integer.MAX_VALUE;
getKatakanaCost(int wordlength)71     private static int getKatakanaCost(int wordlength) {
72         int katakanaCost[] =  new int[] { 8192, 984, 408, 240, 204, 252, 300, 372, 480 };
73         return (wordlength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordlength];
74     }
75 
isKatakana(int value)76     private static boolean isKatakana(int value) {
77         return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
78                 (value >= 0xFF66 && value <= 0xFF9F);
79     }
80 
divideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos, DequeI foundBreaks)81     public int divideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos,
82             DequeI foundBreaks) {
83         if (startPos >= endPos) {
84             return 0;
85         }
86 
87         inText.setIndex(startPos);
88 
89         int inputLength = endPos - startPos;
90         int[] charPositions = new int[inputLength + 1];
91         StringBuffer s = new StringBuffer("");
92         inText.setIndex(startPos);
93         while (inText.getIndex() < endPos) {
94             s.append(inText.current());
95             inText.next();
96         }
97         String prenormstr = s.toString();
98         boolean isNormalized = Normalizer.quickCheck(prenormstr, Normalizer.NFKC) == Normalizer.YES ||
99                                Normalizer.isNormalized(prenormstr, Normalizer.NFKC, 0);
100         CharacterIterator text;
101         int numChars = 0;
102         if (isNormalized) {
103             text = new java.text.StringCharacterIterator(prenormstr);
104             int index = 0;
105             charPositions[0] = 0;
106             while (index < prenormstr.length()) {
107                 int codepoint = prenormstr.codePointAt(index);
108                 index += Character.charCount(codepoint);
109                 numChars++;
110                 charPositions[numChars] = index;
111             }
112         } else {
113             String normStr = Normalizer.normalize(prenormstr, Normalizer.NFKC);
114             text = new java.text.StringCharacterIterator(normStr);
115             charPositions = new int[normStr.length() + 1];
116             Normalizer normalizer = new Normalizer(prenormstr, Normalizer.NFKC, 0);
117             int index = 0;
118             charPositions[0] = 0;
119             while (index < normalizer.endIndex()) {
120                 normalizer.next();
121                 numChars++;
122                 index = normalizer.getIndex();
123                 charPositions[numChars] = index;
124             }
125         }
126 
127         // From here on out, do the algorithm. Note that our indices
128         // refer to indices within the normalized string.
129         int[] bestSnlp = new int[numChars + 1];
130         bestSnlp[0] = 0;
131         for (int i = 1; i <= numChars; i++) {
132             bestSnlp[i] = kint32max;
133         }
134 
135         int[] prev = new int[numChars + 1];
136         for (int i = 0; i <= numChars; i++) {
137             prev[i] = -1;
138         }
139 
140         final int maxWordSize = 20;
141         int values[] = new int[numChars];
142         int lengths[] = new int[numChars];
143         // dynamic programming to find the best segmentation
144         boolean is_prev_katakana = false;
145         for (int i = 0; i < numChars; i++) {
146             text.setIndex(i);
147             if (bestSnlp[i] == kint32max) {
148                 continue;
149             }
150 
151             int maxSearchLength = (i + maxWordSize < numChars) ? maxWordSize : (numChars - i);
152             int[] count_ = new int[1];
153             fDictionary.matches(text, maxSearchLength, lengths, count_, maxSearchLength, values);
154             int count = count_[0];
155 
156             // if there are no single character matches found in the dictionary
157             // starting with this character, treat character as a 1-character word
158             // with the highest value possible (i.e. the least likely to occur).
159             // Exclude Korean characters from this treatment, as they should be
160             // left together by default.
161             if ((count == 0 || lengths[0] != 1) && current32(text) != DONE32 && !fHangulWordSet.contains(current32(text))) {
162                 values[count] = maxSnlp;
163                 lengths[count] = 1;
164                 count++;
165             }
166 
167             for (int j = 0; j < count; j++) {
168                 int newSnlp = bestSnlp[i] + values[j];
169                 if (newSnlp < bestSnlp[lengths[j] + i]) {
170                     bestSnlp[lengths[j] + i] = newSnlp;
171                     prev[lengths[j] + i] = i;
172                 }
173             }
174 
175             // In Japanese, single-character Katakana words are pretty rare.
176             // So we apply the following heuristic to Katakana: any continuous
177             // run of Katakana characters is considered a candidate word with
178             // a default cost specified in the katakanaCost table according
179             // to its length.
180             text.setIndex(i);
181             boolean is_katakana = isKatakana(current32(text));
182             if (!is_prev_katakana && is_katakana) {
183                 int j = i + 1;
184                 next32(text);
185                 while (j < numChars && (j - i) < kMaxKatakanaGroupLength && isKatakana(current32(text))) {
186                     next32(text);
187                     ++j;
188                 }
189 
190                 if ((j - i) < kMaxKatakanaGroupLength) {
191                     int newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
192                     if (newSnlp < bestSnlp[j]) {
193                         bestSnlp[j] = newSnlp;
194                         prev[j] = i;
195                     }
196                 }
197             }
198             is_prev_katakana = is_katakana;
199         }
200 
201         int t_boundary[] = new int[numChars + 1];
202         int numBreaks = 0;
203         if (bestSnlp[numChars] == kint32max) {
204             t_boundary[numBreaks] = numChars;
205             numBreaks++;
206         } else {
207             for (int i = numChars; i > 0; i = prev[i]) {
208                 t_boundary[numBreaks] = i;
209                 numBreaks++;
210             }
211             Assert.assrt(prev[t_boundary[numBreaks - 1]] == 0);
212         }
213 
214         if (foundBreaks.size() == 0 || foundBreaks.peek() < startPos) {
215             t_boundary[numBreaks++] = 0;
216         }
217 
218         int correctedNumBreaks = 0;
219         for (int i = numBreaks - 1; i >= 0; i--) {
220             int pos = charPositions[t_boundary[i]] + startPos;
221             if (!(foundBreaks.contains(pos) || pos == startPos)) {
222                 foundBreaks.push(charPositions[t_boundary[i]] + startPos);
223                 correctedNumBreaks++;
224             }
225         }
226 
227         if (!foundBreaks.isEmpty() && foundBreaks.peek() == endPos) {
228             foundBreaks.pop();
229             correctedNumBreaks--;
230         }
231         if (!foundBreaks.isEmpty())
232             inText.setIndex(foundBreaks.peek());
233         return correctedNumBreaks;
234     }
235 }
236