• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/* GENERATED SOURCE. DO NOT MODIFY. */
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 *******************************************************************************
 * Copyright (C) 2012-2016, International Business Machines Corporation and         *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
10 package ohos.global.icu.text;
11 
12 import static ohos.global.icu.impl.CharacterIteration.DONE32;
13 import static ohos.global.icu.impl.CharacterIteration.current32;
14 import static ohos.global.icu.impl.CharacterIteration.next32;
15 
16 import java.io.IOException;
17 import java.text.CharacterIterator;
18 
19 import ohos.global.icu.impl.Assert;
20 
21 class CjkBreakEngine extends DictionaryBreakEngine {
22     private static final UnicodeSet fHangulWordSet = new UnicodeSet();
23     private static final UnicodeSet fHanWordSet = new UnicodeSet();
24     private static final UnicodeSet fKatakanaWordSet = new UnicodeSet();
25     private static final UnicodeSet fHiraganaWordSet = new UnicodeSet();
26     static {
27         fHangulWordSet.applyPattern("[\\uac00-\\ud7a3]");
28         fHanWordSet.applyPattern("[:Han:]");
29         fKatakanaWordSet.applyPattern("[[:Katakana:]\\uff9e\\uff9f]");
30         fHiraganaWordSet.applyPattern("[:Hiragana:]");
31 
32         // freeze them all
fHangulWordSet.freeze()33         fHangulWordSet.freeze();
fHanWordSet.freeze()34         fHanWordSet.freeze();
fKatakanaWordSet.freeze()35         fKatakanaWordSet.freeze();
fHiraganaWordSet.freeze()36         fHiraganaWordSet.freeze();
37     }
38 
39     private DictionaryMatcher fDictionary = null;
40 
CjkBreakEngine(boolean korean)41     public CjkBreakEngine(boolean korean) throws IOException {
42         fDictionary = DictionaryData.loadDictionaryFor("Hira");
43         if (korean) {
44             setCharacters(fHangulWordSet);
45         } else { //Chinese and Japanese
46             UnicodeSet cjSet = new UnicodeSet();
47             cjSet.addAll(fHanWordSet);
48             cjSet.addAll(fKatakanaWordSet);
49             cjSet.addAll(fHiraganaWordSet);
50             cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
51             cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
52             setCharacters(cjSet);
53         }
54     }
55 
56     @Override
equals(Object obj)57     public boolean equals(Object obj) {
58         if (obj instanceof CjkBreakEngine) {
59             CjkBreakEngine other = (CjkBreakEngine)obj;
60             return this.fSet.equals(other.fSet);
61         }
62         return false;
63     }
64 
65     @Override
hashCode()66     public int hashCode() {
67         return getClass().hashCode();
68     }
69 
70     private static final int kMaxKatakanaLength = 8;
71     private static final int kMaxKatakanaGroupLength = 20;
72     private static final int maxSnlp = 255;
73     private static final int kint32max = Integer.MAX_VALUE;
getKatakanaCost(int wordlength)74     private static int getKatakanaCost(int wordlength) {
75         int katakanaCost[] =  new int[] { 8192, 984, 408, 240, 204, 252, 300, 372, 480 };
76         return (wordlength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordlength];
77     }
78 
isKatakana(int value)79     private static boolean isKatakana(int value) {
80         return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
81                 (value >= 0xFF66 && value <= 0xFF9F);
82     }
83 
84     @Override
divideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos, DequeI foundBreaks)85     public int divideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos,
86             DequeI foundBreaks) {
87         if (startPos >= endPos) {
88             return 0;
89         }
90 
91         inText.setIndex(startPos);
92 
93         int inputLength = endPos - startPos;
94         int[] charPositions = new int[inputLength + 1];
95         StringBuffer s = new StringBuffer("");
96         inText.setIndex(startPos);
97         while (inText.getIndex() < endPos) {
98             s.append(inText.current());
99             inText.next();
100         }
101         String prenormstr = s.toString();
102         boolean isNormalized = Normalizer.quickCheck(prenormstr, Normalizer.NFKC) == Normalizer.YES ||
103                                Normalizer.isNormalized(prenormstr, Normalizer.NFKC, 0);
104         CharacterIterator text;
105         int numCodePts = 0;
106         if (isNormalized) {
107             text = new java.text.StringCharacterIterator(prenormstr);
108             int index = 0;
109             charPositions[0] = 0;
110             while (index < prenormstr.length()) {
111                 int codepoint = prenormstr.codePointAt(index);
112                 index += Character.charCount(codepoint);
113                 numCodePts++;
114                 charPositions[numCodePts] = index;
115             }
116         } else {
117             String normStr = Normalizer.normalize(prenormstr, Normalizer.NFKC);
118             text = new java.text.StringCharacterIterator(normStr);
119             charPositions = new int[normStr.length() + 1];
120             Normalizer normalizer = new Normalizer(prenormstr, Normalizer.NFKC, 0);
121             int index = 0;
122             charPositions[0] = 0;
123             while (index < normalizer.endIndex()) {
124                 normalizer.next();
125                 numCodePts++;
126                 index = normalizer.getIndex();
127                 charPositions[numCodePts] = index;
128             }
129         }
130 
131         // From here on out, do the algorithm. Note that our indices
132         // refer to indices within the normalized string.
133         int[] bestSnlp = new int[numCodePts + 1];
134         bestSnlp[0] = 0;
135         for (int i = 1; i <= numCodePts; i++) {
136             bestSnlp[i] = kint32max;
137         }
138 
139         int[] prev = new int[numCodePts + 1];
140         for (int i = 0; i <= numCodePts; i++) {
141             prev[i] = -1;
142         }
143 
144         final int maxWordSize = 20;
145         int values[] = new int[numCodePts];
146         int lengths[] = new int[numCodePts];
147         // dynamic programming to find the best segmentation
148 
149         // In outer loop, i  is the code point index,
150         //                ix is the corresponding code unit index.
151         //    They differ when the string contains supplementary characters.
152         int ix = 0;
153         text.setIndex(ix);
154         boolean is_prev_katakana = false;
155         for (int i = 0; i < numCodePts; i++, text.setIndex(ix), next32(text)) {
156             ix = text.getIndex();
157             if (bestSnlp[i] == kint32max) {
158                 continue;
159             }
160 
161             int maxSearchLength = (i + maxWordSize < numCodePts) ? maxWordSize : (numCodePts - i);
162             int[] count_ = new int[1];
163             fDictionary.matches(text, maxSearchLength, lengths, count_, maxSearchLength, values);
164             int count = count_[0];
165 
166             // if there are no single character matches found in the dictionary
167             // starting with this character, treat character as a 1-character word
168             // with the highest value possible (i.e. the least likely to occur).
169             // Exclude Korean characters from this treatment, as they should be
170             // left together by default.
171             text.setIndex(ix);  // fDictionary.matches() advances the text position; undo that.
172             if ((count == 0 || lengths[0] != 1) && current32(text) != DONE32 && !fHangulWordSet.contains(current32(text))) {
173                 values[count] = maxSnlp;
174                 lengths[count] = 1;
175                 count++;
176             }
177 
178             for (int j = 0; j < count; j++) {
179                 int newSnlp = bestSnlp[i] + values[j];
180                 if (newSnlp < bestSnlp[lengths[j] + i]) {
181                     bestSnlp[lengths[j] + i] = newSnlp;
182                     prev[lengths[j] + i] = i;
183                 }
184             }
185 
186             // In Japanese, single-character Katakana words are pretty rare.
187             // So we apply the following heuristic to Katakana: any continuous
188             // run of Katakana characters is considered a candidate word with
189             // a default cost specified in the katakanaCost table according
190             // to its length.
191             boolean is_katakana = isKatakana(current32(text));
192             if (!is_prev_katakana && is_katakana) {
193                 int j = i + 1;
194                 next32(text);
195                 while (j < numCodePts && (j - i) < kMaxKatakanaGroupLength && isKatakana(current32(text))) {
196                     next32(text);
197                     ++j;
198                 }
199 
200                 if ((j - i) < kMaxKatakanaGroupLength) {
201                     int newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
202                     if (newSnlp < bestSnlp[j]) {
203                         bestSnlp[j] = newSnlp;
204                         prev[j] = i;
205                     }
206                 }
207             }
208             is_prev_katakana = is_katakana;
209         }
210 
211         int t_boundary[] = new int[numCodePts + 1];
212         int numBreaks = 0;
213         if (bestSnlp[numCodePts] == kint32max) {
214             t_boundary[numBreaks] = numCodePts;
215             numBreaks++;
216         } else {
217             for (int i = numCodePts; i > 0; i = prev[i]) {
218                 t_boundary[numBreaks] = i;
219                 numBreaks++;
220             }
221             Assert.assrt(prev[t_boundary[numBreaks - 1]] == 0);
222         }
223 
224         if (foundBreaks.size() == 0 || foundBreaks.peek() < startPos) {
225             t_boundary[numBreaks++] = 0;
226         }
227 
228         int correctedNumBreaks = 0;
229         for (int i = numBreaks - 1; i >= 0; i--) {
230             int pos = charPositions[t_boundary[i]] + startPos;
231             if (!(foundBreaks.contains(pos) || pos == startPos)) {
232                 foundBreaks.push(charPositions[t_boundary[i]] + startPos);
233                 correctedNumBreaks++;
234             }
235         }
236 
237         if (!foundBreaks.isEmpty() && foundBreaks.peek() == endPos) {
238             foundBreaks.pop();
239             correctedNumBreaks--;
240         }
241         if (!foundBreaks.isEmpty())
242             inText.setIndex(foundBreaks.peek());
243         return correctedNumBreaks;
244     }
245 }
246