• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/* GENERATED SOURCE. DO NOT MODIFY. */
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 *******************************************************************************
 * Copyright (C) 2014, International Business Machines Corporation and         *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
10 package android.icu.text;
11 
12 import java.io.IOException;
13 import java.text.CharacterIterator;
14 
15 import android.icu.lang.UCharacter;
16 import android.icu.lang.UProperty;
17 import android.icu.lang.UScript;
18 
19 class ThaiBreakEngine extends DictionaryBreakEngine {
20 
21     // Constants for ThaiBreakIterator
22     // How many words in a row are "good enough"?
23     private static final byte THAI_LOOKAHEAD = 3;
24     // Will not combine a non-word with a preceding dictionary word longer than this
25     private static final byte THAI_ROOT_COMBINE_THRESHOLD = 3;
26     // Will not combine a non-word that shares at least this much prefix with a
27     // dictionary word with a preceding word
28     private static final byte THAI_PREFIX_COMBINE_THRESHOLD = 3;
29     // Ellision character
30     private static final char THAI_PAIYANNOI = 0x0E2F;
31     // Repeat character
32     private static final char THAI_MAIYAMOK = 0x0E46;
33     // Minimum word size
34     private static final byte THAI_MIN_WORD = 2;
35     // Minimum number of characters for two words
36     private static final byte THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
37 
38     private DictionaryMatcher fDictionary;
39     private static UnicodeSet fThaiWordSet;
40     private static UnicodeSet fEndWordSet;
41     private static UnicodeSet fBeginWordSet;
42     private static UnicodeSet fSuffixSet;
43     private static UnicodeSet fMarkSet;
44 
45     static {
46         // Initialize UnicodeSets
47         fThaiWordSet = new UnicodeSet();
48         fMarkSet = new UnicodeSet();
49         fBeginWordSet = new UnicodeSet();
50         fSuffixSet = new UnicodeSet();
51 
52         fThaiWordSet.applyPattern("[[:Thai:]&[:LineBreak=SA:]]");
fThaiWordSet.compact()53         fThaiWordSet.compact();
54 
55         fMarkSet.applyPattern("[[:Thai:]&[:LineBreak=SA:]&[:M:]]");
56         fMarkSet.add(0x0020);
57         fEndWordSet = new UnicodeSet(fThaiWordSet);
58         fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
59         fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
60         fBeginWordSet.add(0x0E01, 0x0E2E); //KO KAI through HO NOKHUK
61         fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
62         fSuffixSet.add(THAI_PAIYANNOI);
63         fSuffixSet.add(THAI_MAIYAMOK);
64 
65         // Compact for caching
fMarkSet.compact()66         fMarkSet.compact();
fEndWordSet.compact()67         fEndWordSet.compact();
fBeginWordSet.compact()68         fBeginWordSet.compact();
fSuffixSet.compact()69         fSuffixSet.compact();
70 
71         // Freeze the static UnicodeSet
fThaiWordSet.freeze()72         fThaiWordSet.freeze();
fMarkSet.freeze()73         fMarkSet.freeze();
fEndWordSet.freeze()74         fEndWordSet.freeze();
fBeginWordSet.freeze()75         fBeginWordSet.freeze();
fSuffixSet.freeze()76         fSuffixSet.freeze();
77     }
78 
ThaiBreakEngine()79     public ThaiBreakEngine() throws IOException {
80         super(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE);
81         setCharacters(fThaiWordSet);
82         // Initialize dictionary
83         fDictionary = DictionaryData.loadDictionaryFor("Thai");
84     }
85 
equals(Object obj)86     public boolean equals(Object obj) {
87         // Normally is a singleton, but it's possible to have duplicates
88         //   during initialization. All are equivalent.
89         return obj instanceof ThaiBreakEngine;
90     }
91 
hashCode()92     public int hashCode() {
93         return getClass().hashCode();
94     }
95 
handles(int c, int breakType)96     public boolean handles(int c, int breakType) {
97         if (breakType == BreakIterator.KIND_WORD || breakType == BreakIterator.KIND_LINE) {
98             int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
99             return (script == UScript.THAI);
100         }
101         return false;
102     }
103 
divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd, DequeI foundBreaks)104     public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
105             DequeI foundBreaks) {
106 
107         if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {
108             return 0;  // Not enough characters for word
109         }
110         int wordsFound = 0;
111         int wordLength;
112         PossibleWord words[] = new PossibleWord[THAI_LOOKAHEAD];
113         for (int i = 0; i < THAI_LOOKAHEAD; i++) {
114             words[i] = new PossibleWord();
115         }
116 
117         int uc;
118         fIter.setIndex(rangeStart);
119         int current;
120         while ((current = fIter.getIndex()) < rangeEnd) {
121             wordLength = 0;
122 
123             //Look for candidate words at the current position
124             int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
125 
126             // If we found exactly one, use that
127             if (candidates == 1) {
128                 wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(fIter);
129                 wordsFound += 1;
130             }
131 
132             // If there was more than one, see which one can take us forward the most words
133             else if (candidates > 1) {
134                 // If we're already at the end of the range, we're done
135                 if (fIter.getIndex() < rangeEnd) {
136                   foundBest:
137                     do {
138                         int wordsMatched = 1;
139                         if (words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) {
140                             if (wordsMatched < 2) {
141                                 // Followed by another dictionary word; mark first word as a good candidate
142                                 words[wordsFound%THAI_LOOKAHEAD].markCurrent();
143                                 wordsMatched = 2;
144                             }
145 
146                             // If we're already at the end of the range, we're done
147                             if (fIter.getIndex() >= rangeEnd) {
148                                 break foundBest;
149                             }
150 
151                             // See if any of the possible second words is followed by a third word
152                             do {
153                                 // If we find a third word, stop right away
154                                 if (words[(wordsFound+2)%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) {
155                                     words[wordsFound%THAI_LOOKAHEAD].markCurrent();
156                                     break foundBest;
157                                 }
158                             } while (words[(wordsFound+1)%THAI_LOOKAHEAD].backUp(fIter));
159                         }
160                     }
161                     while (words[wordsFound%THAI_LOOKAHEAD].backUp(fIter));
162                     // foundBest: end of loop
163                 }
164                 wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(fIter);
165                 wordsFound += 1;
166             }
167 
168             // We come here after having either found a word or not. We look ahead to the
169             // next word. If it's not a dictionary word, we will combine it with the word we
170             // just found (if there is one), but only if the preceding word does not exceed
171             // the threshold.
172             // The text iterator should now be positioned at the end of the word we found.
173             if (fIter.getIndex() < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {
174                 // If it is a dictionary word, do nothing. If it isn't, then if there is
175                 // no preceding word, or the non-word shares less than the minimum threshold
176                 // of characters with a dictionary word, then scan to resynchronize
177                 if (words[wordsFound%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&
178                         (wordLength == 0 ||
179                                 words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
180                     // Look for a plausible word boundary
181                     int remaining = rangeEnd - (current + wordLength);
182                     int pc = fIter.current();
183                     int chars = 0;
184                     for (;;) {
185                         fIter.next();
186                         uc = fIter.current();
187                         chars += 1;
188                         if (--remaining <= 0) {
189                             break;
190                         }
191                         if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
192                             // Maybe. See if it's in the dictionary.
193                             // Note: In the original Apple code, checked that the next
194                             // two characters after uc were not 0x0E4C THANTHAKHAT before
195                             // checking the dictionary. That is just a performance filter,
196                             // but it's not clear it's faster than checking the trie
197                             int candidate = words[(wordsFound + 1) %THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
198                             fIter.setIndex(current + wordLength + chars);
199                             if (candidate > 0) {
200                                 break;
201                             }
202                         }
203                         pc = uc;
204                     }
205 
206                     // Bump the word count if there wasn't already one
207                     if (wordLength <= 0) {
208                         wordsFound += 1;
209                     }
210 
211                     // Update the length with the passed-over characters
212                     wordLength += chars;
213                 } else {
214                     // Backup to where we were for next iteration
215                     fIter.setIndex(current+wordLength);
216                 }
217             }
218 
219             // Never stop before a combining mark.
220             int currPos;
221             while ((currPos = fIter.getIndex()) < rangeEnd && fMarkSet.contains(fIter.current())) {
222                 fIter.next();
223                 wordLength += fIter.getIndex() - currPos;
224             }
225 
226             // Look ahead for possible suffixes if a dictionary word does not follow.
227             // We do this in code rather than using a rule so that the heuristic
228             // resynch continues to function. For example, one of the suffix characters
229             // could be a typo in the middle of a word.
230             if (fIter.getIndex() < rangeEnd && wordLength > 0) {
231                 if (words[wordsFound%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&
232                         fSuffixSet.contains(uc = fIter.current())) {
233                     if (uc == THAI_PAIYANNOI) {
234                         if (!fSuffixSet.contains(fIter.previous())) {
235                             // Skip over previous end and PAIYANNOI
236                             fIter.next();
237                             fIter.next();
238                             wordLength += 1;
239                             uc = fIter.current();
240                         } else {
241                             // Restore prior position
242                             fIter.next();
243                         }
244                     }
245                     if (uc == THAI_MAIYAMOK) {
246                         if (fIter.previous() != THAI_MAIYAMOK) {
247                             // Skip over previous end and MAIYAMOK
248                             fIter.next();
249                             fIter.next();
250                             wordLength += 1;
251                         } else {
252                             // restore prior position
253                             fIter.next();
254                         }
255                     }
256                 } else {
257                     fIter.setIndex(current + wordLength);
258                 }
259             }
260 
261             // Did we find a word on this iteration? If so, push it on the break stack
262             if (wordLength > 0) {
263                 foundBreaks.push(Integer.valueOf(current + wordLength));
264             }
265         }
266 
267         // Don't return a break for the end of the dictionary range if there is one there
268         if (foundBreaks.peek() >= rangeEnd) {
269             foundBreaks.pop();
270             wordsFound -= 1;
271         }
272 
273         return wordsFound;
274     }
275 
276 }
277