/* GENERATED SOURCE. DO NOT MODIFY. */
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 *******************************************************************************
 * Copyright (C) 2014, International Business Machines Corporation and         *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
10 package ohos.global.icu.text;
11 
12 import java.io.IOException;
13 import java.text.CharacterIterator;
14 
15 import ohos.global.icu.lang.UCharacter;
16 import ohos.global.icu.lang.UProperty;
17 import ohos.global.icu.lang.UScript;
18 
19 class BurmeseBreakEngine extends DictionaryBreakEngine {
20 
21     // Constants for BurmeseBreakIterator
22     // How many words in a row are "good enough"?
23     private static final byte BURMESE_LOOKAHEAD = 3;
24     // Will not combine a non-word with a preceding dictionary word longer than this
25     private static final byte BURMESE_ROOT_COMBINE_THRESHOLD = 3;
26     // Will not combine a non-word that shares at least this much prefix with a
27     // dictionary word with a preceding word
28     private static final byte BURMESE_PREFIX_COMBINE_THRESHOLD = 3;
29     // Minimum word size
30     private static final byte BURMESE_MIN_WORD = 2;
31 
32     private DictionaryMatcher fDictionary;
33     private static UnicodeSet fBurmeseWordSet;
34     private static UnicodeSet fEndWordSet;
35     private static UnicodeSet fBeginWordSet;
36     private static UnicodeSet fMarkSet;
37 
38     static {
39         // Initialize UnicodeSets
40         fBurmeseWordSet = new UnicodeSet();
41         fMarkSet = new UnicodeSet();
42         fBeginWordSet = new UnicodeSet();
43 
44         fBurmeseWordSet.applyPattern("[[:Mymr:]&[:LineBreak=SA:]]");
fBurmeseWordSet.compact()45         fBurmeseWordSet.compact();
46 
47         fMarkSet.applyPattern("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]");
48         fMarkSet.add(0x0020);
49         fEndWordSet = new UnicodeSet(fBurmeseWordSet);
50         fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels
51 
52         // Compact for caching
fMarkSet.compact()53         fMarkSet.compact();
fEndWordSet.compact()54         fEndWordSet.compact();
fBeginWordSet.compact()55         fBeginWordSet.compact();
56 
57         // Freeze the static UnicodeSet
fBurmeseWordSet.freeze()58         fBurmeseWordSet.freeze();
fMarkSet.freeze()59         fMarkSet.freeze();
fEndWordSet.freeze()60         fEndWordSet.freeze();
fBeginWordSet.freeze()61         fBeginWordSet.freeze();
62     }
63 
BurmeseBreakEngine()64     public BurmeseBreakEngine() throws IOException {
65         setCharacters(fBurmeseWordSet);
66         // Initialize dictionary
67         fDictionary = DictionaryData.loadDictionaryFor("Mymr");
68     }
69 
70     @Override
equals(Object obj)71     public boolean equals(Object obj) {
72         // Normally is a singleton, but it's possible to have duplicates
73         //   during initialization. All are equivalent.
74         return obj instanceof BurmeseBreakEngine;
75     }
76 
77     @Override
hashCode()78     public int hashCode() {
79         return getClass().hashCode();
80     }
81 
82     @Override
handles(int c)83     public boolean handles(int c) {
84         int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
85         return (script == UScript.MYANMAR);
86     }
87 
88     @Override
divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd, DequeI foundBreaks)89     public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
90             DequeI foundBreaks) {
91 
92 
93         if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD) {
94             return 0;  // Not enough characters for word
95         }
96         int wordsFound = 0;
97         int wordLength;
98         int current;
99         PossibleWord words[] = new PossibleWord[BURMESE_LOOKAHEAD];
100         for (int i = 0; i < BURMESE_LOOKAHEAD; i++) {
101             words[i] = new PossibleWord();
102         }
103         int uc;
104 
105         fIter.setIndex(rangeStart);
106         while ((current = fIter.getIndex()) < rangeEnd) {
107             wordLength = 0;
108 
109             //Look for candidate words at the current position
110             int candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
111 
112             // If we found exactly one, use that
113             if (candidates == 1) {
114                 wordLength = words[wordsFound%BURMESE_LOOKAHEAD].acceptMarked(fIter);
115                 wordsFound += 1;
116             }
117 
118             // If there was more than one, see which one can take us forward the most words
119             else if (candidates > 1) {
120                 boolean foundBest = false;
121                 // If we're already at the end of the range, we're done
122                 if (fIter.getIndex() < rangeEnd) {
123                     do {
124                         int wordsMatched = 1;
125                         if (words[(wordsFound+1)%BURMESE_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) {
126                             if (wordsMatched < 2) {
127                                 // Followed by another dictionary word; mark first word as a good candidate
128                                 words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
129                                 wordsMatched = 2;
130                             }
131 
132                             // If we're already at the end of the range, we're done
133                             if (fIter.getIndex() >= rangeEnd) {
134                                 break;
135                             }
136 
137                             // See if any of the possible second words is followed by a third word
138                             do {
139                                 // If we find a third word, stop right away
140                                 if (words[(wordsFound+2)%BURMESE_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) {
141                                     words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
142                                     foundBest = true;
143                                     break;
144                                 }
145                             } while (words[(wordsFound+1)%BURMESE_LOOKAHEAD].backUp(fIter));
146                         }
147                     } while (words[wordsFound%BURMESE_LOOKAHEAD].backUp(fIter) && !foundBest);
148                 }
149                 wordLength = words[wordsFound%BURMESE_LOOKAHEAD].acceptMarked(fIter);
150                 wordsFound += 1;
151             }
152 
153             // We come here after having either found a word or not. We look ahead to the
154             // next word. If it's not a dictionary word, we will combine it with the word we
155             // just found (if there is one), but only if the preceding word does not exceed
156             // the threshold.
157             // The text iterator should now be positioned at the end of the word we found.
158             if (fIter.getIndex() < rangeEnd && wordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
159                 // If it is a dictionary word, do nothing. If it isn't, then if there is
160                 // no preceding word, or the non-word shares less than the minimum threshold
161                 // of characters with a dictionary word, then scan to resynchronize
162                 if (words[wordsFound%BURMESE_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&
163                         (wordLength == 0 ||
164                                 words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
165                     // Look for a plausible word boundary
166                     int remaining = rangeEnd - (current + wordLength);
167                     int pc = fIter.current();
168                     int chars = 0;
169                     for (;;) {
170                         fIter.next();
171                         uc = fIter.current();
172                         chars += 1;
173                         if (--remaining <= 0) {
174                             break;
175                         }
176                         if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
177                             // Maybe. See if it's in the dictionary.
178                             int candidate = words[(wordsFound + 1) %BURMESE_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
179                             fIter.setIndex(current + wordLength + chars);
180                             if (candidate > 0) {
181                                 break;
182                             }
183                         }
184                         pc = uc;
185                     }
186 
187                     // Bump the word count if there wasn't already one
188                     if (wordLength <= 0) {
189                         wordsFound += 1;
190                     }
191 
192                     // Update the length with the passed-over characters
193                     wordLength += chars;
194                 } else {
195                     // Backup to where we were for next iteration
196                     fIter.setIndex(current+wordLength);
197                 }
198             }
199 
200             // Never stop before a combining mark.
201             int currPos;
202             while ((currPos = fIter.getIndex()) < rangeEnd && fMarkSet.contains(fIter.current())) {
203                 fIter.next();
204                 wordLength += fIter.getIndex() - currPos;
205             }
206 
207             // Look ahead for possible suffixes if a dictionary word does not follow.
208             // We do this in code rather than using a rule so that the heuristic
209             // resynch continues to function. For example, one of the suffix characters
210             // could be a typo in the middle of a word.
211             // NOT CURRENTLY APPLICABLE TO BURMESE
212 
213             // Did we find a word on this iteration? If so, push it on the break stack
214             if (wordLength > 0) {
215                 foundBreaks.push(Integer.valueOf(current + wordLength));
216             }
217         }
218 
219         // Don't return a break for the end of the dictionary range if there is one there
220         if (foundBreaks.peek() >= rangeEnd) {
221             foundBreaks.pop();
222             wordsFound -= 1;
223         }
224 
225         return wordsFound;
226     }
227 
228 }
229