• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  *******************************************************************************
3  * Copyright (C) 2006-2014, International Business Machines Corporation
4  * and others. All Rights Reserved.
5  *******************************************************************************
6  */
7 
8 #include "unicode/utypes.h"
9 
10 #if !UCONFIG_NO_BREAK_ITERATION
11 
12 #include "brkeng.h"
13 #include "dictbe.h"
14 #include "unicode/uniset.h"
15 #include "unicode/chariter.h"
16 #include "unicode/ubrk.h"
17 #include "uvectr32.h"
18 #include "uvector.h"
19 #include "uassert.h"
20 #include "unicode/normlzr.h"
21 #include "cmemory.h"
22 #include "dictionarydata.h"
23 
24 U_NAMESPACE_BEGIN
25 
26 /*
27  ******************************************************************
28  */
29 
DictionaryBreakEngine(uint32_t breakTypes)30 DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
31     fTypes = breakTypes;
32 }
33 
~DictionaryBreakEngine()34 DictionaryBreakEngine::~DictionaryBreakEngine() {
35 }
36 
37 UBool
handles(UChar32 c,int32_t breakType) const38 DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
39     return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
40             && fSet.contains(c));
41 }
42 
43 int32_t
findBreaks(UText * text,int32_t startPos,int32_t endPos,UBool reverse,int32_t breakType,UStack & foundBreaks) const44 DictionaryBreakEngine::findBreaks( UText *text,
45                                  int32_t startPos,
46                                  int32_t endPos,
47                                  UBool reverse,
48                                  int32_t breakType,
49                                  UStack &foundBreaks ) const {
50     int32_t result = 0;
51 
52     // Find the span of characters included in the set.
53     //   The span to break begins at the current position in the text, and
54     //   extends towards the start or end of the text, depending on 'reverse'.
55 
56     int32_t start = (int32_t)utext_getNativeIndex(text);
57     int32_t current;
58     int32_t rangeStart;
59     int32_t rangeEnd;
60     UChar32 c = utext_current32(text);
61     if (reverse) {
62         UBool   isDict = fSet.contains(c);
63         while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
64             c = utext_previous32(text);
65             isDict = fSet.contains(c);
66         }
67         if (current < startPos) {
68             rangeStart = startPos;
69         } else {
70             rangeStart = current;
71             if (!isDict) {
72                 utext_next32(text);
73                 rangeStart = utext_getNativeIndex(text);
74             }
75         }
76         // rangeEnd = start + 1;
77         utext_setNativeIndex(text, start);
78         utext_next32(text);
79         rangeEnd = utext_getNativeIndex(text);
80     }
81     else {
82         while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
83             utext_next32(text);         // TODO:  recast loop for postincrement
84             c = utext_current32(text);
85         }
86         rangeStart = start;
87         rangeEnd = current;
88     }
89     if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
90         result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
91         utext_setNativeIndex(text, current);
92     }
93 
94     return result;
95 }
96 
97 void
setCharacters(const UnicodeSet & set)98 DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
99     fSet = set;
100     // Compact for caching
101     fSet.compact();
102 }
103 
104 /*
105  ******************************************************************
106  * PossibleWord
107  */
108 
109 // Helper class for improving readability of the Thai/Lao/Khmer word break
110 // algorithm. The implementation is completely inline.
111 
112 // List size, limited by the maximum number of words in the dictionary
113 // that form a nested sequence.
114 static const int32_t POSSIBLE_WORD_LIST_MAX = 20;
115 
116 class PossibleWord {
117 private:
118     // list of word candidate lengths, in increasing length order
119     // TODO: bytes would be sufficient for word lengths.
120     int32_t   count;      // Count of candidates
121     int32_t   prefix;     // The longest match with a dictionary word
122     int32_t   offset;     // Offset in the text of these candidates
123     int32_t   mark;       // The preferred candidate's offset
124     int32_t   current;    // The candidate we're currently looking at
125     int32_t   cuLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code units.
126     int32_t   cpLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code points.
127 
128 public:
PossibleWord()129     PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {};
~PossibleWord()130     ~PossibleWord() {};
131 
132     // Fill the list of candidates if needed, select the longest, and return the number found
133     int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
134 
135     // Select the currently marked candidate, point after it in the text, and invalidate self
136     int32_t   acceptMarked( UText *text );
137 
138     // Back up from the current candidate to the next shorter one; return TRUE if that exists
139     // and point the text after it
140     UBool     backUp( UText *text );
141 
142     // Return the longest prefix this candidate location shares with a dictionary word
143     // Return value is in code points.
longestPrefix()144     int32_t   longestPrefix() { return prefix; };
145 
146     // Mark the current candidate as the one we like
markCurrent()147     void      markCurrent() { mark = current; };
148 
149     // Get length in code points of the marked word.
markedCPLength()150     int32_t   markedCPLength() { return cpLengths[mark]; };
151 };
152 
153 
candidates(UText * text,DictionaryMatcher * dict,int32_t rangeEnd)154 int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
155     // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
156     int32_t start = (int32_t)utext_getNativeIndex(text);
157     if (start != offset) {
158         offset = start;
159         count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
160         // Dictionary leaves text after longest prefix, not longest word. Back up.
161         if (count <= 0) {
162             utext_setNativeIndex(text, start);
163         }
164     }
165     if (count > 0) {
166         utext_setNativeIndex(text, start+cuLengths[count-1]);
167     }
168     current = count-1;
169     mark = current;
170     return count;
171 }
172 
173 int32_t
acceptMarked(UText * text)174 PossibleWord::acceptMarked( UText *text ) {
175     utext_setNativeIndex(text, offset + cuLengths[mark]);
176     return cuLengths[mark];
177 }
178 
179 
180 UBool
backUp(UText * text)181 PossibleWord::backUp( UText *text ) {
182     if (current > 0) {
183         utext_setNativeIndex(text, offset + cuLengths[--current]);
184         return TRUE;
185     }
186     return FALSE;
187 }
188 
189 /*
190  ******************************************************************
191  * ThaiBreakEngine
192  */
193 
194 // How many words in a row are "good enough"?
195 static const int32_t THAI_LOOKAHEAD = 3;
196 
197 // Will not combine a non-word with a preceding dictionary word longer than this
198 static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;
199 
200 // Will not combine a non-word that shares at least this much prefix with a
201 // dictionary word, with a preceding word
202 static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;
203 
204 // Ellision character
205 static const int32_t THAI_PAIYANNOI = 0x0E2F;
206 
207 // Repeat character
208 static const int32_t THAI_MAIYAMOK = 0x0E46;
209 
210 // Minimum word size
211 static const int32_t THAI_MIN_WORD = 2;
212 
213 // Minimum number of characters for two words
214 static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
215 
ThaiBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)216 ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
217     : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
218       fDictionary(adoptDictionary)
219 {
220     fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
221     if (U_SUCCESS(status)) {
222         setCharacters(fThaiWordSet);
223     }
224     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
225     fMarkSet.add(0x0020);
226     fEndWordSet = fThaiWordSet;
227     fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
228     fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
229     fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
230     fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
231     fSuffixSet.add(THAI_PAIYANNOI);
232     fSuffixSet.add(THAI_MAIYAMOK);
233 
234     // Compact for caching.
235     fMarkSet.compact();
236     fEndWordSet.compact();
237     fBeginWordSet.compact();
238     fSuffixSet.compact();
239 }
240 
~ThaiBreakEngine()241 ThaiBreakEngine::~ThaiBreakEngine() {
242     delete fDictionary;
243 }
244 
245 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UStack & foundBreaks) const246 ThaiBreakEngine::divideUpDictionaryRange( UText *text,
247                                                 int32_t rangeStart,
248                                                 int32_t rangeEnd,
249                                                 UStack &foundBreaks ) const {
250     utext_setNativeIndex(text, rangeStart);
251     utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
252     if (utext_getNativeIndex(text) >= rangeEnd) {
253         return 0;       // Not enough characters for two words
254     }
255     utext_setNativeIndex(text, rangeStart);
256 
257 
258     uint32_t wordsFound = 0;
259     int32_t cpWordLength = 0;    // Word Length in Code Points.
260     int32_t cuWordLength = 0;    // Word length in code units (UText native indexing)
261     int32_t current;
262     UErrorCode status = U_ZERO_ERROR;
263     PossibleWord words[THAI_LOOKAHEAD];
264 
265     utext_setNativeIndex(text, rangeStart);
266 
267     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
268         cpWordLength = 0;
269         cuWordLength = 0;
270 
271         // Look for candidate words at the current position
272         int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
273 
274         // If we found exactly one, use that
275         if (candidates == 1) {
276             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
277             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
278             wordsFound += 1;
279         }
280         // If there was more than one, see which one can take us forward the most words
281         else if (candidates > 1) {
282             // If we're already at the end of the range, we're done
283             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
284                 goto foundBest;
285             }
286             do {
287                 int32_t wordsMatched = 1;
288                 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
289                     if (wordsMatched < 2) {
290                         // Followed by another dictionary word; mark first word as a good candidate
291                         words[wordsFound%THAI_LOOKAHEAD].markCurrent();
292                         wordsMatched = 2;
293                     }
294 
295                     // If we're already at the end of the range, we're done
296                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
297                         goto foundBest;
298                     }
299 
300                     // See if any of the possible second words is followed by a third word
301                     do {
302                         // If we find a third word, stop right away
303                         if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
304                             words[wordsFound % THAI_LOOKAHEAD].markCurrent();
305                             goto foundBest;
306                         }
307                     }
308                     while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
309                 }
310             }
311             while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
312 foundBest:
313             // Set UText position to after the accepted word.
314             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
315             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
316             wordsFound += 1;
317         }
318 
319         // We come here after having either found a word or not. We look ahead to the
320         // next word. If it's not a dictionary word, we will combine it with the word we
321         // just found (if there is one), but only if the preceding word does not exceed
322         // the threshold.
323         // The text iterator should now be positioned at the end of the word we found.
324 
325         UChar32 uc = 0;
326         if ((int32_t)utext_getNativeIndex(text) < rangeEnd &&  cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
327             // if it is a dictionary word, do nothing. If it isn't, then if there is
328             // no preceding word, or the non-word shares less than the minimum threshold
329             // of characters with a dictionary word, then scan to resynchronize
330             if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
331                   && (cuWordLength == 0
332                       || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
333                 // Look for a plausible word boundary
334                 int32_t remaining = rangeEnd - (current+cuWordLength);
335                 UChar32 pc;
336                 int32_t chars = 0;
337                 for (;;) {
338                     int32_t pcIndex = utext_getNativeIndex(text);
339                     pc = utext_next32(text);
340                     int32_t pcSize = utext_getNativeIndex(text) - pcIndex;
341                     chars += pcSize;
342                     remaining -= pcSize;
343                     if (remaining <= 0) {
344                         break;
345                     }
346                     uc = utext_current32(text);
347                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
348                         // Maybe. See if it's in the dictionary.
349                         // NOTE: In the original Apple code, checked that the next
350                         // two characters after uc were not 0x0E4C THANTHAKHAT before
351                         // checking the dictionary. That is just a performance filter,
352                         // but it's not clear it's faster than checking the trie.
353                         int32_t candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
354                         utext_setNativeIndex(text, current + cuWordLength + chars);
355                         if (candidates > 0) {
356                             break;
357                         }
358                     }
359                 }
360 
361                 // Bump the word count if there wasn't already one
362                 if (cuWordLength <= 0) {
363                     wordsFound += 1;
364                 }
365 
366                 // Update the length with the passed-over characters
367                 cuWordLength += chars;
368             }
369             else {
370                 // Back up to where we were for next iteration
371                 utext_setNativeIndex(text, current+cuWordLength);
372             }
373         }
374 
375         // Never stop before a combining mark.
376         int32_t currPos;
377         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
378             utext_next32(text);
379             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
380         }
381 
382         // Look ahead for possible suffixes if a dictionary word does not follow.
383         // We do this in code rather than using a rule so that the heuristic
384         // resynch continues to function. For example, one of the suffix characters
385         // could be a typo in the middle of a word.
386         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {
387             if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
388                 && fSuffixSet.contains(uc = utext_current32(text))) {
389                 if (uc == THAI_PAIYANNOI) {
390                     if (!fSuffixSet.contains(utext_previous32(text))) {
391                         // Skip over previous end and PAIYANNOI
392                         utext_next32(text);
393                         int32_t paiyannoiIndex = utext_getNativeIndex(text);
394                         utext_next32(text);
395                         cuWordLength += utext_getNativeIndex(text) - paiyannoiIndex;    // Add PAIYANNOI to word
396                         uc = utext_current32(text);     // Fetch next character
397                     }
398                     else {
399                         // Restore prior position
400                         utext_next32(text);
401                     }
402                 }
403                 if (uc == THAI_MAIYAMOK) {
404                     if (utext_previous32(text) != THAI_MAIYAMOK) {
405                         // Skip over previous end and MAIYAMOK
406                         utext_next32(text);
407                         int32_t maiyamokIndex = utext_getNativeIndex(text);
408                         utext_next32(text);
409                         cuWordLength += utext_getNativeIndex(text) - maiyamokIndex;    // Add MAIYAMOK to word
410                     }
411                     else {
412                         // Restore prior position
413                         utext_next32(text);
414                     }
415                 }
416             }
417             else {
418                 utext_setNativeIndex(text, current+cuWordLength);
419             }
420         }
421 
422         // Did we find a word on this iteration? If so, push it on the break stack
423         if (cuWordLength > 0) {
424             foundBreaks.push((current+cuWordLength), status);
425         }
426     }
427 
428     // Don't return a break for the end of the dictionary range if there is one there.
429     if (foundBreaks.peeki() >= rangeEnd) {
430         (void) foundBreaks.popi();
431         wordsFound -= 1;
432     }
433 
434     return wordsFound;
435 }
436 
437 /*
438  ******************************************************************
439  * LaoBreakEngine
440  */
441 
442 // How many words in a row are "good enough"?
443 static const int32_t LAO_LOOKAHEAD = 3;
444 
445 // Will not combine a non-word with a preceding dictionary word longer than this
446 static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;
447 
448 // Will not combine a non-word that shares at least this much prefix with a
449 // dictionary word, with a preceding word
450 static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;
451 
452 // Minimum word size
453 static const int32_t LAO_MIN_WORD = 2;
454 
455 // Minimum number of characters for two words
456 static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
457 
LaoBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)458 LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
459     : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
460       fDictionary(adoptDictionary)
461 {
462     fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
463     if (U_SUCCESS(status)) {
464         setCharacters(fLaoWordSet);
465     }
466     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
467     fMarkSet.add(0x0020);
468     fEndWordSet = fLaoWordSet;
469     fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
470     fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
471     fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
472     fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels
473 
474     // Compact for caching.
475     fMarkSet.compact();
476     fEndWordSet.compact();
477     fBeginWordSet.compact();
478 }
479 
~LaoBreakEngine()480 LaoBreakEngine::~LaoBreakEngine() {
481     delete fDictionary;
482 }
483 
484 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UStack & foundBreaks) const485 LaoBreakEngine::divideUpDictionaryRange( UText *text,
486                                                 int32_t rangeStart,
487                                                 int32_t rangeEnd,
488                                                 UStack &foundBreaks ) const {
489     if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
490         return 0;       // Not enough characters for two words
491     }
492 
493     uint32_t wordsFound = 0;
494     int32_t cpWordLength = 0;
495     int32_t cuWordLength = 0;
496     int32_t current;
497     UErrorCode status = U_ZERO_ERROR;
498     PossibleWord words[LAO_LOOKAHEAD];
499 
500     utext_setNativeIndex(text, rangeStart);
501 
502     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
503         cuWordLength = 0;
504         cpWordLength = 0;
505 
506         // Look for candidate words at the current position
507         int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
508 
509         // If we found exactly one, use that
510         if (candidates == 1) {
511             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
512             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
513             wordsFound += 1;
514         }
515         // If there was more than one, see which one can take us forward the most words
516         else if (candidates > 1) {
517             // If we're already at the end of the range, we're done
518             if (utext_getNativeIndex(text) >= rangeEnd) {
519                 goto foundBest;
520             }
521             do {
522                 int32_t wordsMatched = 1;
523                 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
524                     if (wordsMatched < 2) {
525                         // Followed by another dictionary word; mark first word as a good candidate
526                         words[wordsFound%LAO_LOOKAHEAD].markCurrent();
527                         wordsMatched = 2;
528                     }
529 
530                     // If we're already at the end of the range, we're done
531                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
532                         goto foundBest;
533                     }
534 
535                     // See if any of the possible second words is followed by a third word
536                     do {
537                         // If we find a third word, stop right away
538                         if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
539                             words[wordsFound % LAO_LOOKAHEAD].markCurrent();
540                             goto foundBest;
541                         }
542                     }
543                     while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
544                 }
545             }
546             while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
547 foundBest:
548             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
549             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
550             wordsFound += 1;
551         }
552 
553         // We come here after having either found a word or not. We look ahead to the
554         // next word. If it's not a dictionary word, we will combine it withe the word we
555         // just found (if there is one), but only if the preceding word does not exceed
556         // the threshold.
557         // The text iterator should now be positioned at the end of the word we found.
558         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
559             // if it is a dictionary word, do nothing. If it isn't, then if there is
560             // no preceding word, or the non-word shares less than the minimum threshold
561             // of characters with a dictionary word, then scan to resynchronize
562             if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
563                   && (cuWordLength == 0
564                       || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
565                 // Look for a plausible word boundary
566                 int32_t remaining = rangeEnd - (current + cuWordLength);
567                 UChar32 pc;
568                 UChar32 uc;
569                 int32_t chars = 0;
570                 for (;;) {
571                     int32_t pcIndex = utext_getNativeIndex(text);
572                     pc = utext_next32(text);
573                     int32_t pcSize = utext_getNativeIndex(text) - pcIndex;
574                     chars += pcSize;
575                     remaining -= pcSize;
576                     if (remaining <= 0) {
577                         break;
578                     }
579                     uc = utext_current32(text);
580                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
581                         // Maybe. See if it's in the dictionary.
582                         // TODO: this looks iffy; compare with old code.
583                         int32_t candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
584                         utext_setNativeIndex(text, current + cuWordLength + chars);
585                         if (candidates > 0) {
586                             break;
587                         }
588                     }
589                 }
590 
591                 // Bump the word count if there wasn't already one
592                 if (cuWordLength <= 0) {
593                     wordsFound += 1;
594                 }
595 
596                 // Update the length with the passed-over characters
597                 cuWordLength += chars;
598             }
599             else {
600                 // Back up to where we were for next iteration
601                 utext_setNativeIndex(text, current + cuWordLength);
602             }
603         }
604 
605         // Never stop before a combining mark.
606         int32_t currPos;
607         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
608             utext_next32(text);
609             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
610         }
611 
612         // Look ahead for possible suffixes if a dictionary word does not follow.
613         // We do this in code rather than using a rule so that the heuristic
614         // resynch continues to function. For example, one of the suffix characters
615         // could be a typo in the middle of a word.
616         // NOT CURRENTLY APPLICABLE TO LAO
617 
618         // Did we find a word on this iteration? If so, push it on the break stack
619         if (cuWordLength > 0) {
620             foundBreaks.push((current+cuWordLength), status);
621         }
622     }
623 
624     // Don't return a break for the end of the dictionary range if there is one there.
625     if (foundBreaks.peeki() >= rangeEnd) {
626         (void) foundBreaks.popi();
627         wordsFound -= 1;
628     }
629 
630     return wordsFound;
631 }
632 
633 /*
634  ******************************************************************
635  * BurmeseBreakEngine
636  */
637 
638 // How many words in a row are "good enough"?
639 static const int32_t BURMESE_LOOKAHEAD = 3;
640 
641 // Will not combine a non-word with a preceding dictionary word longer than this
642 static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;
643 
644 // Will not combine a non-word that shares at least this much prefix with a
645 // dictionary word, with a preceding word
646 static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;
647 
648 // Minimum word size
649 static const int32_t BURMESE_MIN_WORD = 2;
650 
651 // Minimum number of characters for two words
652 static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
653 
BurmeseBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)654 BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
655     : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
656       fDictionary(adoptDictionary)
657 {
658     fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
659     if (U_SUCCESS(status)) {
660         setCharacters(fBurmeseWordSet);
661     }
662     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
663     fMarkSet.add(0x0020);
664     fEndWordSet = fBurmeseWordSet;
665     fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels
666 
667     // Compact for caching.
668     fMarkSet.compact();
669     fEndWordSet.compact();
670     fBeginWordSet.compact();
671 }
672 
~BurmeseBreakEngine()673 BurmeseBreakEngine::~BurmeseBreakEngine() {
674     delete fDictionary;
675 }
676 
677 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UStack & foundBreaks) const678 BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
679                                                 int32_t rangeStart,
680                                                 int32_t rangeEnd,
681                                                 UStack &foundBreaks ) const {
682     if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
683         return 0;       // Not enough characters for two words
684     }
685 
686     uint32_t wordsFound = 0;
687     int32_t cpWordLength = 0;
688     int32_t cuWordLength = 0;
689     int32_t current;
690     UErrorCode status = U_ZERO_ERROR;
691     PossibleWord words[BURMESE_LOOKAHEAD];
692 
693     utext_setNativeIndex(text, rangeStart);
694 
695     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
696         cuWordLength = 0;
697         cpWordLength = 0;
698 
699         // Look for candidate words at the current position
700         int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
701 
702         // If we found exactly one, use that
703         if (candidates == 1) {
704             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
705             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
706             wordsFound += 1;
707         }
708         // If there was more than one, see which one can take us forward the most words
709         else if (candidates > 1) {
710             // If we're already at the end of the range, we're done
711             if (utext_getNativeIndex(text) >= rangeEnd) {
712                 goto foundBest;
713             }
714             do {
715                 int32_t wordsMatched = 1;
716                 if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
717                     if (wordsMatched < 2) {
718                         // Followed by another dictionary word; mark first word as a good candidate
719                         words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
720                         wordsMatched = 2;
721                     }
722 
723                     // If we're already at the end of the range, we're done
724                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
725                         goto foundBest;
726                     }
727 
728                     // See if any of the possible second words is followed by a third word
729                     do {
730                         // If we find a third word, stop right away
731                         if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
732                             words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
733                             goto foundBest;
734                         }
735                     }
736                     while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));
737                 }
738             }
739             while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
740 foundBest:
741             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
742             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
743             wordsFound += 1;
744         }
745 
746         // We come here after having either found a word or not. We look ahead to the
747         // next word. If it's not a dictionary word, we will combine it withe the word we
748         // just found (if there is one), but only if the preceding word does not exceed
749         // the threshold.
750         // The text iterator should now be positioned at the end of the word we found.
751         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
752             // if it is a dictionary word, do nothing. If it isn't, then if there is
753             // no preceding word, or the non-word shares less than the minimum threshold
754             // of characters with a dictionary word, then scan to resynchronize
755             if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
756                   && (cuWordLength == 0
757                       || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
758                 // Look for a plausible word boundary
759                 int32_t remaining = rangeEnd - (current + cuWordLength);
760                 UChar32 pc;
761                 UChar32 uc;
762                 int32_t chars = 0;
763                 for (;;) {
764                     int32_t pcIndex = utext_getNativeIndex(text);
765                     pc = utext_next32(text);
766                     int32_t pcSize = utext_getNativeIndex(text) - pcIndex;
767                     chars += pcSize;
768                     remaining -= pcSize;
769                     if (remaining <= 0) {
770                         break;
771                     }
772                     uc = utext_current32(text);
773                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
774                         // Maybe. See if it's in the dictionary.
775                         // TODO: this looks iffy; compare with old code.
776                         int32_t candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
777                         utext_setNativeIndex(text, current + cuWordLength + chars);
778                         if (candidates > 0) {
779                             break;
780                         }
781                     }
782                 }
783 
784                 // Bump the word count if there wasn't already one
785                 if (cuWordLength <= 0) {
786                     wordsFound += 1;
787                 }
788 
789                 // Update the length with the passed-over characters
790                 cuWordLength += chars;
791             }
792             else {
793                 // Back up to where we were for next iteration
794                 utext_setNativeIndex(text, current + cuWordLength);
795             }
796         }
797 
798         // Never stop before a combining mark.
799         int32_t currPos;
800         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
801             utext_next32(text);
802             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
803         }
804 
805         // Look ahead for possible suffixes if a dictionary word does not follow.
806         // We do this in code rather than using a rule so that the heuristic
807         // resynch continues to function. For example, one of the suffix characters
808         // could be a typo in the middle of a word.
809         // NOT CURRENTLY APPLICABLE TO BURMESE
810 
811         // Did we find a word on this iteration? If so, push it on the break stack
812         if (cuWordLength > 0) {
813             foundBreaks.push((current+cuWordLength), status);
814         }
815     }
816 
817     // Don't return a break for the end of the dictionary range if there is one there.
818     if (foundBreaks.peeki() >= rangeEnd) {
819         (void) foundBreaks.popi();
820         wordsFound -= 1;
821     }
822 
823     return wordsFound;
824 }
825 
826 /*
827  ******************************************************************
828  * KhmerBreakEngine
829  */
830 
831 // How many words in a row are "good enough"?
832 static const int32_t KHMER_LOOKAHEAD = 3;
833 
834 // Will not combine a non-word with a preceding dictionary word longer than this
835 static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
836 
837 // Will not combine a non-word that shares at least this much prefix with a
838 // dictionary word, with a preceding word
839 static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
840 
841 // Minimum word size
842 static const int32_t KHMER_MIN_WORD = 2;
843 
844 // Minimum number of characters for two words
845 static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
846 
KhmerBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)847 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
848     : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
849       fDictionary(adoptDictionary)
850 {
851     fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
852     if (U_SUCCESS(status)) {
853         setCharacters(fKhmerWordSet);
854     }
855     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
856     fMarkSet.add(0x0020);
857     fEndWordSet = fKhmerWordSet;
858     fBeginWordSet.add(0x1780, 0x17B3);
859     //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
860     //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
861     //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
862     fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
863     //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
864 //    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
865 //    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
866 //    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
867 //    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
868 //    fSuffixSet.add(THAI_PAIYANNOI);
869 //    fSuffixSet.add(THAI_MAIYAMOK);
870 
871     // Compact for caching.
872     fMarkSet.compact();
873     fEndWordSet.compact();
874     fBeginWordSet.compact();
875 //    fSuffixSet.compact();
876 }
877 
~KhmerBreakEngine()878 KhmerBreakEngine::~KhmerBreakEngine() {
879     delete fDictionary;
880 }
881 
882 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UStack & foundBreaks) const883 KhmerBreakEngine::divideUpDictionaryRange( UText *text,
884                                                 int32_t rangeStart,
885                                                 int32_t rangeEnd,
886                                                 UStack &foundBreaks ) const {
887     if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
888         return 0;       // Not enough characters for two words
889     }
890 
891     uint32_t wordsFound = 0;
892     int32_t cpWordLength = 0;
893     int32_t cuWordLength = 0;
894     int32_t current;
895     UErrorCode status = U_ZERO_ERROR;
896     PossibleWord words[KHMER_LOOKAHEAD];
897 
898     utext_setNativeIndex(text, rangeStart);
899 
900     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
901         cuWordLength = 0;
902         cpWordLength = 0;
903 
904         // Look for candidate words at the current position
905         int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
906 
907         // If we found exactly one, use that
908         if (candidates == 1) {
909             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
910             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
911             wordsFound += 1;
912         }
913 
914         // If there was more than one, see which one can take us forward the most words
915         else if (candidates > 1) {
916             // If we're already at the end of the range, we're done
917             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
918                 goto foundBest;
919             }
920             do {
921                 int32_t wordsMatched = 1;
922                 if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
923                     if (wordsMatched < 2) {
924                         // Followed by another dictionary word; mark first word as a good candidate
925                         words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
926                         wordsMatched = 2;
927                     }
928 
929                     // If we're already at the end of the range, we're done
930                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
931                         goto foundBest;
932                     }
933 
934                     // See if any of the possible second words is followed by a third word
935                     do {
936                         // If we find a third word, stop right away
937                         if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
938                             words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
939                             goto foundBest;
940                         }
941                     }
942                     while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
943                 }
944             }
945             while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
946 foundBest:
947             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
948             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
949             wordsFound += 1;
950         }
951 
952         // We come here after having either found a word or not. We look ahead to the
953         // next word. If it's not a dictionary word, we will combine it with the word we
954         // just found (if there is one), but only if the preceding word does not exceed
955         // the threshold.
956         // The text iterator should now be positioned at the end of the word we found.
957         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
958             // if it is a dictionary word, do nothing. If it isn't, then if there is
959             // no preceding word, or the non-word shares less than the minimum threshold
960             // of characters with a dictionary word, then scan to resynchronize
961             if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
962                   && (cuWordLength == 0
963                       || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
964                 // Look for a plausible word boundary
965                 int32_t remaining = rangeEnd - (current+cuWordLength);
966                 UChar32 pc;
967                 UChar32 uc;
968                 int32_t chars = 0;
969                 for (;;) {
970                     int32_t pcIndex = utext_getNativeIndex(text);
971                     pc = utext_next32(text);
972                     int32_t pcSize = utext_getNativeIndex(text) - pcIndex;
973                     chars += pcSize;
974                     remaining -= pcSize;
975                     if (remaining <= 0) {
976                         break;
977                     }
978                     uc = utext_current32(text);
979                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
980                         // Maybe. See if it's in the dictionary.
981                         int32_t candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
982                         utext_setNativeIndex(text, current+cuWordLength+chars);
983                         if (candidates > 0) {
984                             break;
985                         }
986                     }
987                 }
988 
989                 // Bump the word count if there wasn't already one
990                 if (cuWordLength <= 0) {
991                     wordsFound += 1;
992                 }
993 
994                 // Update the length with the passed-over characters
995                 cuWordLength += chars;
996             }
997             else {
998                 // Back up to where we were for next iteration
999                 utext_setNativeIndex(text, current+cuWordLength);
1000             }
1001         }
1002 
1003         // Never stop before a combining mark.
1004         int32_t currPos;
1005         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
1006             utext_next32(text);
1007             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
1008         }
1009 
1010         // Look ahead for possible suffixes if a dictionary word does not follow.
1011         // We do this in code rather than using a rule so that the heuristic
1012         // resynch continues to function. For example, one of the suffix characters
1013         // could be a typo in the middle of a word.
1014 //        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
1015 //            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
1016 //                && fSuffixSet.contains(uc = utext_current32(text))) {
1017 //                if (uc == KHMER_PAIYANNOI) {
1018 //                    if (!fSuffixSet.contains(utext_previous32(text))) {
1019 //                        // Skip over previous end and PAIYANNOI
1020 //                        utext_next32(text);
1021 //                        utext_next32(text);
1022 //                        wordLength += 1;            // Add PAIYANNOI to word
1023 //                        uc = utext_current32(text);     // Fetch next character
1024 //                    }
1025 //                    else {
1026 //                        // Restore prior position
1027 //                        utext_next32(text);
1028 //                    }
1029 //                }
1030 //                if (uc == KHMER_MAIYAMOK) {
1031 //                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
1032 //                        // Skip over previous end and MAIYAMOK
1033 //                        utext_next32(text);
1034 //                        utext_next32(text);
1035 //                        wordLength += 1;            // Add MAIYAMOK to word
1036 //                    }
1037 //                    else {
1038 //                        // Restore prior position
1039 //                        utext_next32(text);
1040 //                    }
1041 //                }
1042 //            }
1043 //            else {
1044 //                utext_setNativeIndex(text, current+wordLength);
1045 //            }
1046 //        }
1047 
1048         // Did we find a word on this iteration? If so, push it on the break stack
1049         if (cuWordLength > 0) {
1050             foundBreaks.push((current+cuWordLength), status);
1051         }
1052     }
1053 
1054     // Don't return a break for the end of the dictionary range if there is one there.
1055     if (foundBreaks.peeki() >= rangeEnd) {
1056         (void) foundBreaks.popi();
1057         wordsFound -= 1;
1058     }
1059 
1060     return wordsFound;
1061 }
1062 
1063 #if !UCONFIG_NO_NORMALIZATION
1064 /*
1065  ******************************************************************
1066  * CjkBreakEngine
1067  */
1068 static const uint32_t kuint32max = 0xFFFFFFFF;
CjkBreakEngine(DictionaryMatcher * adoptDictionary,LanguageType type,UErrorCode & status)1069 CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
1070 : DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {
1071     // Korean dictionary only includes Hangul syllables
1072     fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
1073     fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
1074     fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
1075     fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
1076     nfkcNorm2 = Normalizer2::getNFKCInstance(status);
1077 
1078     if (U_SUCCESS(status)) {
1079         // handle Korean and Japanese/Chinese using different dictionaries
1080         if (type == kKorean) {
1081             setCharacters(fHangulWordSet);
1082         } else { //Chinese and Japanese
1083             UnicodeSet cjSet;
1084             cjSet.addAll(fHanWordSet);
1085             cjSet.addAll(fKatakanaWordSet);
1086             cjSet.addAll(fHiraganaWordSet);
1087             cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
1088             cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
1089             setCharacters(cjSet);
1090         }
1091     }
1092 }
1093 
~CjkBreakEngine()1094 CjkBreakEngine::~CjkBreakEngine(){
1095     delete fDictionary;
1096 }
1097 
1098 // The katakanaCost values below are based on the length frequencies of all
1099 // katakana phrases in the dictionary
1100 static const int32_t kMaxKatakanaLength = 8;
1101 static const int32_t kMaxKatakanaGroupLength = 20;
1102 static const uint32_t maxSnlp = 255;
1103 
getKatakanaCost(int32_t wordLength)1104 static inline uint32_t getKatakanaCost(int32_t wordLength){
1105     //TODO: fill array with actual values from dictionary!
1106     static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
1107                                        = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
1108     return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
1109 }
1110 
isKatakana(uint16_t value)1111 static inline bool isKatakana(uint16_t value) {
1112     return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||
1113             (value >= 0xFF66u && value <= 0xFF9fu);
1114 }
1115 
1116 
1117 // Function for accessing internal utext flags.
1118 //   Replicates an internal UText function.
1119 
utext_i32_flag(int32_t bitIndex)1120 static inline int32_t utext_i32_flag(int32_t bitIndex) {
1121     return (int32_t)1 << bitIndex;
1122 }
1123 
1124 
1125 /*
1126  * @param text A UText representing the text
1127  * @param rangeStart The start of the range of dictionary characters
1128  * @param rangeEnd The end of the range of dictionary characters
1129  * @param foundBreaks Output of C array of int32_t break positions, or 0
1130  * @return The number of breaks found
1131  */
1132 int32_t
divideUpDictionaryRange(UText * inText,int32_t rangeStart,int32_t rangeEnd,UStack & foundBreaks) const1133 CjkBreakEngine::divideUpDictionaryRange( UText *inText,
1134         int32_t rangeStart,
1135         int32_t rangeEnd,
1136         UStack &foundBreaks ) const {
1137     if (rangeStart >= rangeEnd) {
1138         return 0;
1139     }
1140 
1141     // UnicodeString version of input UText, NFKC normalized in necessary.
1142     UnicodeString *inString;
1143 
1144     // inputMap[inStringIndex] = corresponding native index from UText inText.
1145     // If NULL then mapping is 1:1
1146     UVector32     *inputMap    = NULL;
1147 
1148     UErrorCode     status      = U_ZERO_ERROR;
1149 
1150 
1151     // if UText has the input string as one contiguous UTF-16 chunk
1152     if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
1153          inText->chunkNativeStart <= rangeStart &&
1154          inText->chunkNativeLimit >= rangeEnd   &&
1155          inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
1156 
1157         // Input UTtxt is in one contiguous UTF-16 chunk.
1158         // Use Read-only aliasing UnicodeString constructor on it.
1159         inString = new UnicodeString(FALSE,
1160                               inText->chunkContents + rangeStart - inText->chunkNativeStart,
1161                               rangeEnd - rangeStart);
1162     } else {
1163         // Copy the text from the original inText (UText) to inString (UnicodeString).
1164         // Create a map from UnicodeString indices -> UText offsets.
1165         utext_setNativeIndex(inText, rangeStart);
1166         int32_t limit = rangeEnd;
1167         U_ASSERT(limit <= utext_nativeLength(inText));
1168         if (limit > utext_nativeLength(inText)) {
1169             limit = utext_nativeLength(inText);
1170         }
1171         inString = new UnicodeString;
1172         inputMap = new UVector32(status);
1173         while (utext_getNativeIndex(inText) < limit) {
1174             int32_t nativePosition = utext_getNativeIndex(inText);
1175             UChar32 c = utext_next32(inText);
1176             U_ASSERT(c != U_SENTINEL);
1177             inString->append(c);
1178             while (inputMap->size() < inString->length()) {
1179                 inputMap->addElement(nativePosition, status);
1180             }
1181         }
1182         inputMap->addElement(limit, status);
1183     }
1184 
1185 
1186     if (!nfkcNorm2->isNormalized(*inString, status)) {
1187         UnicodeString *normalizedInput = new UnicodeString();
1188         //  normalizedMap[normalizedInput position] ==  original UText position.
1189         UVector32 *normalizedMap = new UVector32(status);
1190         if (U_FAILURE(status)) {
1191             return 0;
1192         }
1193 
1194         UnicodeString fragment;
1195         UnicodeString normalizedFragment;
1196         for (int32_t srcI = 0; srcI < inString->length();) {                 // Once per normalization chunk
1197             fragment.remove();
1198             int32_t fragmentStartI = srcI;
1199             UChar32 c = inString->char32At(srcI);
1200             for (;;) {
1201                 fragment.append(c);
1202                 srcI = inString->moveIndex32(srcI, 1);
1203                 if (srcI == inString->length()) {
1204                     break;
1205                 }
1206                 c = inString->char32At(srcI);
1207                 if (nfkcNorm2->hasBoundaryBefore(c)) {
1208                     break;
1209                 }
1210             }
1211             nfkcNorm2->normalize(fragment, normalizedFragment, status);
1212             normalizedInput->append(normalizedFragment);
1213 
1214             // Map every position in the normalized chunk to the start of the chunk
1215             //   in the original input.
1216             int32_t fragmentOriginalStart = inputMap? inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
1217             while (normalizedMap->size() < normalizedInput->length()) {
1218                 normalizedMap->addElement(fragmentOriginalStart, status);
1219                 if (U_FAILURE(status)) {
1220                     break;
1221                 }
1222             }
1223         }
1224         U_ASSERT(normalizedMap->size() == normalizedInput->length());
1225         int32_t nativeEnd = inputMap? inputMap->elementAti(inString->length()) : inString->length()+rangeStart;
1226         normalizedMap->addElement(nativeEnd, status);
1227 
1228         delete inputMap;
1229         inputMap = normalizedMap;
1230         delete inString;
1231         inString = normalizedInput;
1232     }
1233 
1234     int32_t numCodePts = inString->countChar32();
1235     if (numCodePts != inString->length()) {
1236         // There are supplementary characters in the input.
1237         // The dictionary will produce boundary positions in terms of code point indexes,
1238         //   not in terms of code unit string indexes.
1239         // Use the inputMap mechanism to take care of this in addition to indexing differences
1240         //    from normalization and/or UTF-8 input.
1241         UBool hadExistingMap = (inputMap != NULL);
1242         if (!hadExistingMap) {
1243             inputMap = new UVector32(status);
1244         }
1245         int32_t cpIdx = 0;
1246         for (int32_t cuIdx = 0; ; cuIdx = inString->moveIndex32(cuIdx, 1)) {
1247             U_ASSERT(cuIdx >= cpIdx);
1248             if (hadExistingMap) {
1249                 inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
1250             } else {
1251                 inputMap->addElement(cuIdx+rangeStart, status);
1252             }
1253             cpIdx++;
1254             if (cuIdx == inString->length()) {
1255                break;
1256             }
1257         }
1258     }
1259 
1260     // bestSnlp[i] is the snlp of the best segmentation of the first i
1261     // code points in the range to be matched.
1262     UVector32 bestSnlp(numCodePts + 1, status);
1263     bestSnlp.addElement(0, status);
1264     for(int32_t i = 1; i <= numCodePts; i++) {
1265         bestSnlp.addElement(kuint32max, status);
1266     }
1267 
1268 
1269     // prev[i] is the index of the last CJK code point in the previous word in
1270     // the best segmentation of the first i characters.
1271     UVector32 prev(numCodePts + 1, status);
1272     for(int32_t i = 0; i <= numCodePts; i++){
1273         prev.addElement(-1, status);
1274     }
1275 
1276     const int32_t maxWordSize = 20;
1277     UVector32 values(numCodePts, status);
1278     values.setSize(numCodePts);
1279     UVector32 lengths(numCodePts, status);
1280     lengths.setSize(numCodePts);
1281 
1282     UText fu = UTEXT_INITIALIZER;
1283     utext_openUnicodeString(&fu, inString, &status);
1284 
1285     // Dynamic programming to find the best segmentation.
1286 
1287     // In outer loop, i  is the code point index,
1288     //                ix is the corresponding string (code unit) index.
1289     //    They differ when the string contains supplementary characters.
1290     int32_t ix = 0;
1291     for (int32_t i = 0;  i < numCodePts;  ++i, ix = inString->moveIndex32(ix, 1)) {
1292         if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
1293             continue;
1294         }
1295 
1296         int32_t count;
1297         utext_setNativeIndex(&fu, ix);
1298         count = fDictionary->matches(&fu, maxWordSize, numCodePts,
1299                              NULL, lengths.getBuffer(), values.getBuffer(), NULL);
1300                              // Note: lengths is filled with code point lengths
1301                              //       The NULL parameter is the ignored code unit lengths.
1302 
1303         // if there are no single character matches found in the dictionary
1304         // starting with this charcter, treat character as a 1-character word
1305         // with the highest value possible, i.e. the least likely to occur.
1306         // Exclude Korean characters from this treatment, as they should be left
1307         // together by default.
1308         if ((count == 0 || lengths.elementAti(0) != 1) &&
1309                 !fHangulWordSet.contains(inString->char32At(ix))) {
1310             values.setElementAt(maxSnlp, count);   // 255
1311             lengths.setElementAt(1, count++);
1312         }
1313 
1314         for (int32_t j = 0; j < count; j++) {
1315             uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
1316             int32_t ln_j_i = lengths.elementAti(j) + i;
1317             if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
1318                 bestSnlp.setElementAt(newSnlp, ln_j_i);
1319                 prev.setElementAt(i, ln_j_i);
1320             }
1321         }
1322 
1323         // In Japanese,
1324         // Katakana word in single character is pretty rare. So we apply
1325         // the following heuristic to Katakana: any continuous run of Katakana
1326         // characters is considered a candidate word with a default cost
1327         // specified in the katakanaCost table according to its length.
1328 
1329         bool is_prev_katakana = false;
1330         bool is_katakana = isKatakana(inString->char32At(ix));
1331         int32_t katakanaRunLength = 1;
1332         if (!is_prev_katakana && is_katakana) {
1333             int32_t j = inString->moveIndex32(ix, 1);
1334             // Find the end of the continuous run of Katakana characters
1335             while (j < inString->length() && katakanaRunLength < kMaxKatakanaGroupLength &&
1336                     isKatakana(inString->char32At(j))) {
1337                 j = inString->moveIndex32(j, 1);
1338                 katakanaRunLength++;
1339             }
1340             if (katakanaRunLength < kMaxKatakanaGroupLength) {
1341                 uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
1342                 if (newSnlp < (uint32_t)bestSnlp.elementAti(j)) {
1343                     bestSnlp.setElementAt(newSnlp, j);
1344                     prev.setElementAt(i, i+katakanaRunLength);  // prev[j] = i;
1345                 }
1346             }
1347         }
1348         is_prev_katakana = is_katakana;
1349     }
1350     utext_close(&fu);
1351 
1352     // Start pushing the optimal offset index into t_boundary (t for tentative).
1353     // prev[numCodePts] is guaranteed to be meaningful.
1354     // We'll first push in the reverse order, i.e.,
1355     // t_boundary[0] = numCodePts, and afterwards do a swap.
1356     UVector32 t_boundary(numCodePts+1, status);
1357 
1358     int32_t numBreaks = 0;
1359     // No segmentation found, set boundary to end of range
1360     if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
1361         t_boundary.addElement(numCodePts, status);
1362         numBreaks++;
1363     } else {
1364         for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
1365             t_boundary.addElement(i, status);
1366             numBreaks++;
1367         }
1368         U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
1369     }
1370 
1371     // Add a break for the start of the dictionary range if there is not one
1372     // there already.
1373     if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
1374         t_boundary.addElement(0, status);
1375         numBreaks++;
1376     }
1377 
1378     // Now that we're done, convert positions in t_boundary[] (indices in
1379     // the normalized input string) back to indices in the original input UText
1380     // while reversing t_boundary and pushing values to foundBreaks.
1381     for (int32_t i = numBreaks-1; i >= 0; i--) {
1382         int32_t cpPos = t_boundary.elementAti(i);
1383         int32_t utextPos =  inputMap ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
1384         // Boundaries are added to foundBreaks output in ascending order.
1385         U_ASSERT(foundBreaks.size() == 0 ||foundBreaks.peeki() < utextPos);
1386         foundBreaks.push(utextPos, status);
1387     }
1388 
1389     delete inString;
1390     delete inputMap;
1391     return numBreaks;
1392 }
1393 #endif
1394 
1395 U_NAMESPACE_END
1396 
1397 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1398 
1399