• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /**
4  *******************************************************************************
5  * Copyright (C) 2006-2016, International Business Machines Corporation
6  * and others. All Rights Reserved.
7  *******************************************************************************
8  */
9 
10 #include <utility>
11 
12 #include "unicode/utypes.h"
13 
14 #if !UCONFIG_NO_BREAK_ITERATION
15 
16 #include "brkeng.h"
17 #include "dictbe.h"
18 #include "unicode/uniset.h"
19 #include "unicode/chariter.h"
20 #include "unicode/ubrk.h"
21 #include "utracimp.h"
22 #include "uvectr32.h"
23 #include "uvector.h"
24 #include "uassert.h"
25 #include "unicode/normlzr.h"
26 #include "cmemory.h"
27 #include "dictionarydata.h"
28 
29 U_NAMESPACE_BEGIN
30 
31 /*
32  ******************************************************************
33  */
34 
DictionaryBreakEngine()35 DictionaryBreakEngine::DictionaryBreakEngine() {
36 }
37 
~DictionaryBreakEngine()38 DictionaryBreakEngine::~DictionaryBreakEngine() {
39 }
40 
41 UBool
handles(UChar32 c) const42 DictionaryBreakEngine::handles(UChar32 c) const {
43     return fSet.contains(c);
44 }
45 
46 int32_t
findBreaks(UText * text,int32_t startPos,int32_t endPos,UVector32 & foundBreaks) const47 DictionaryBreakEngine::findBreaks( UText *text,
48                                  int32_t startPos,
49                                  int32_t endPos,
50                                  UVector32 &foundBreaks ) const {
51     (void)startPos;            // TODO: remove this param?
52     int32_t result = 0;
53 
54     // Find the span of characters included in the set.
55     //   The span to break begins at the current position in the text, and
56     //   extends towards the start or end of the text, depending on 'reverse'.
57 
58     int32_t start = (int32_t)utext_getNativeIndex(text);
59     int32_t current;
60     int32_t rangeStart;
61     int32_t rangeEnd;
62     UChar32 c = utext_current32(text);
63     while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
64         utext_next32(text);         // TODO:  recast loop for postincrement
65         c = utext_current32(text);
66     }
67     rangeStart = start;
68     rangeEnd = current;
69     result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
70     utext_setNativeIndex(text, current);
71 
72     return result;
73 }
74 
75 void
setCharacters(const UnicodeSet & set)76 DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
77     fSet = set;
78     // Compact for caching
79     fSet.compact();
80 }
81 
82 /*
83  ******************************************************************
84  * PossibleWord
85  */
86 
87 // Helper class for improving readability of the Thai/Lao/Khmer word break
88 // algorithm. The implementation is completely inline.
89 
90 // List size, limited by the maximum number of words in the dictionary
91 // that form a nested sequence.
92 static const int32_t POSSIBLE_WORD_LIST_MAX = 20;
93 
94 class PossibleWord {
95 private:
96     // list of word candidate lengths, in increasing length order
97     // TODO: bytes would be sufficient for word lengths.
98     int32_t   count;      // Count of candidates
99     int32_t   prefix;     // The longest match with a dictionary word
100     int32_t   offset;     // Offset in the text of these candidates
101     int32_t   mark;       // The preferred candidate's offset
102     int32_t   current;    // The candidate we're currently looking at
103     int32_t   cuLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code units.
104     int32_t   cpLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code points.
105 
106 public:
PossibleWord()107     PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {}
~PossibleWord()108     ~PossibleWord() {}
109 
110     // Fill the list of candidates if needed, select the longest, and return the number found
111     int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
112 
113     // Select the currently marked candidate, point after it in the text, and invalidate self
114     int32_t   acceptMarked( UText *text );
115 
116     // Back up from the current candidate to the next shorter one; return TRUE if that exists
117     // and point the text after it
118     UBool     backUp( UText *text );
119 
120     // Return the longest prefix this candidate location shares with a dictionary word
121     // Return value is in code points.
longestPrefix()122     int32_t   longestPrefix() { return prefix; }
123 
124     // Mark the current candidate as the one we like
markCurrent()125     void      markCurrent() { mark = current; }
126 
127     // Get length in code points of the marked word.
markedCPLength()128     int32_t   markedCPLength() { return cpLengths[mark]; }
129 };
130 
131 
candidates(UText * text,DictionaryMatcher * dict,int32_t rangeEnd)132 int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
133     // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
134     int32_t start = (int32_t)utext_getNativeIndex(text);
135     if (start != offset) {
136         offset = start;
137         count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
138         // Dictionary leaves text after longest prefix, not longest word. Back up.
139         if (count <= 0) {
140             utext_setNativeIndex(text, start);
141         }
142     }
143     if (count > 0) {
144         utext_setNativeIndex(text, start+cuLengths[count-1]);
145     }
146     current = count-1;
147     mark = current;
148     return count;
149 }
150 
151 int32_t
acceptMarked(UText * text)152 PossibleWord::acceptMarked( UText *text ) {
153     utext_setNativeIndex(text, offset + cuLengths[mark]);
154     return cuLengths[mark];
155 }
156 
157 
158 UBool
backUp(UText * text)159 PossibleWord::backUp( UText *text ) {
160     if (current > 0) {
161         utext_setNativeIndex(text, offset + cuLengths[--current]);
162         return TRUE;
163     }
164     return FALSE;
165 }
166 
167 /*
168  ******************************************************************
169  * ThaiBreakEngine
170  */
171 
172 // How many words in a row are "good enough"?
173 static const int32_t THAI_LOOKAHEAD = 3;
174 
175 // Will not combine a non-word with a preceding dictionary word longer than this
176 static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;
177 
178 // Will not combine a non-word that shares at least this much prefix with a
179 // dictionary word, with a preceding word
180 static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;
181 
182 // Ellision character
183 static const int32_t THAI_PAIYANNOI = 0x0E2F;
184 
185 // Repeat character
186 static const int32_t THAI_MAIYAMOK = 0x0E46;
187 
188 // Minimum word size
189 static const int32_t THAI_MIN_WORD = 2;
190 
191 // Minimum number of characters for two words
192 static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
193 
ThaiBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)194 ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
195     : DictionaryBreakEngine(),
196       fDictionary(adoptDictionary)
197 {
198     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
199     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Thai");
200     fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
201     if (U_SUCCESS(status)) {
202         setCharacters(fThaiWordSet);
203     }
204     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
205     fMarkSet.add(0x0020);
206     fEndWordSet = fThaiWordSet;
207     fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
208     fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
209     fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
210     fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
211     fSuffixSet.add(THAI_PAIYANNOI);
212     fSuffixSet.add(THAI_MAIYAMOK);
213 
214     // Compact for caching.
215     fMarkSet.compact();
216     fEndWordSet.compact();
217     fBeginWordSet.compact();
218     fSuffixSet.compact();
219     UTRACE_EXIT_STATUS(status);
220 }
221 
~ThaiBreakEngine()222 ThaiBreakEngine::~ThaiBreakEngine() {
223     delete fDictionary;
224 }
225 
226 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks) const227 ThaiBreakEngine::divideUpDictionaryRange( UText *text,
228                                                 int32_t rangeStart,
229                                                 int32_t rangeEnd,
230                                                 UVector32 &foundBreaks ) const {
231     utext_setNativeIndex(text, rangeStart);
232     utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
233     if (utext_getNativeIndex(text) >= rangeEnd) {
234         return 0;       // Not enough characters for two words
235     }
236     utext_setNativeIndex(text, rangeStart);
237 
238 
239     uint32_t wordsFound = 0;
240     int32_t cpWordLength = 0;    // Word Length in Code Points.
241     int32_t cuWordLength = 0;    // Word length in code units (UText native indexing)
242     int32_t current;
243     UErrorCode status = U_ZERO_ERROR;
244     PossibleWord words[THAI_LOOKAHEAD];
245 
246     utext_setNativeIndex(text, rangeStart);
247 
248     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
249         cpWordLength = 0;
250         cuWordLength = 0;
251 
252         // Look for candidate words at the current position
253         int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
254 
255         // If we found exactly one, use that
256         if (candidates == 1) {
257             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
258             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
259             wordsFound += 1;
260         }
261         // If there was more than one, see which one can take us forward the most words
262         else if (candidates > 1) {
263             // If we're already at the end of the range, we're done
264             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
265                 goto foundBest;
266             }
267             do {
268                 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
269                     // Followed by another dictionary word; mark first word as a good candidate
270                     words[wordsFound%THAI_LOOKAHEAD].markCurrent();
271 
272                     // If we're already at the end of the range, we're done
273                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
274                         goto foundBest;
275                     }
276 
277                     // See if any of the possible second words is followed by a third word
278                     do {
279                         // If we find a third word, stop right away
280                         if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
281                             words[wordsFound % THAI_LOOKAHEAD].markCurrent();
282                             goto foundBest;
283                         }
284                     }
285                     while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
286                 }
287             }
288             while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
289 foundBest:
290             // Set UText position to after the accepted word.
291             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
292             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
293             wordsFound += 1;
294         }
295 
296         // We come here after having either found a word or not. We look ahead to the
297         // next word. If it's not a dictionary word, we will combine it with the word we
298         // just found (if there is one), but only if the preceding word does not exceed
299         // the threshold.
300         // The text iterator should now be positioned at the end of the word we found.
301 
302         UChar32 uc = 0;
303         if ((int32_t)utext_getNativeIndex(text) < rangeEnd &&  cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
304             // if it is a dictionary word, do nothing. If it isn't, then if there is
305             // no preceding word, or the non-word shares less than the minimum threshold
306             // of characters with a dictionary word, then scan to resynchronize
307             if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
308                   && (cuWordLength == 0
309                       || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
310                 // Look for a plausible word boundary
311                 int32_t remaining = rangeEnd - (current+cuWordLength);
312                 UChar32 pc;
313                 int32_t chars = 0;
314                 for (;;) {
315                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
316                     pc = utext_next32(text);
317                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
318                     chars += pcSize;
319                     remaining -= pcSize;
320                     if (remaining <= 0) {
321                         break;
322                     }
323                     uc = utext_current32(text);
324                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
325                         // Maybe. See if it's in the dictionary.
326                         // NOTE: In the original Apple code, checked that the next
327                         // two characters after uc were not 0x0E4C THANTHAKHAT before
328                         // checking the dictionary. That is just a performance filter,
329                         // but it's not clear it's faster than checking the trie.
330                         int32_t num_candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
331                         utext_setNativeIndex(text, current + cuWordLength + chars);
332                         if (num_candidates > 0) {
333                             break;
334                         }
335                     }
336                 }
337 
338                 // Bump the word count if there wasn't already one
339                 if (cuWordLength <= 0) {
340                     wordsFound += 1;
341                 }
342 
343                 // Update the length with the passed-over characters
344                 cuWordLength += chars;
345             }
346             else {
347                 // Back up to where we were for next iteration
348                 utext_setNativeIndex(text, current+cuWordLength);
349             }
350         }
351 
352         // Never stop before a combining mark.
353         int32_t currPos;
354         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
355             utext_next32(text);
356             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
357         }
358 
359         // Look ahead for possible suffixes if a dictionary word does not follow.
360         // We do this in code rather than using a rule so that the heuristic
361         // resynch continues to function. For example, one of the suffix characters
362         // could be a typo in the middle of a word.
363         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {
364             if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
365                 && fSuffixSet.contains(uc = utext_current32(text))) {
366                 if (uc == THAI_PAIYANNOI) {
367                     if (!fSuffixSet.contains(utext_previous32(text))) {
368                         // Skip over previous end and PAIYANNOI
369                         utext_next32(text);
370                         int32_t paiyannoiIndex = (int32_t)utext_getNativeIndex(text);
371                         utext_next32(text);
372                         cuWordLength += (int32_t)utext_getNativeIndex(text) - paiyannoiIndex;    // Add PAIYANNOI to word
373                         uc = utext_current32(text);     // Fetch next character
374                     }
375                     else {
376                         // Restore prior position
377                         utext_next32(text);
378                     }
379                 }
380                 if (uc == THAI_MAIYAMOK) {
381                     if (utext_previous32(text) != THAI_MAIYAMOK) {
382                         // Skip over previous end and MAIYAMOK
383                         utext_next32(text);
384                         int32_t maiyamokIndex = (int32_t)utext_getNativeIndex(text);
385                         utext_next32(text);
386                         cuWordLength += (int32_t)utext_getNativeIndex(text) - maiyamokIndex;    // Add MAIYAMOK to word
387                     }
388                     else {
389                         // Restore prior position
390                         utext_next32(text);
391                     }
392                 }
393             }
394             else {
395                 utext_setNativeIndex(text, current+cuWordLength);
396             }
397         }
398 
399         // Did we find a word on this iteration? If so, push it on the break stack
400         if (cuWordLength > 0) {
401             foundBreaks.push((current+cuWordLength), status);
402         }
403     }
404 
405     // Don't return a break for the end of the dictionary range if there is one there.
406     if (foundBreaks.peeki() >= rangeEnd) {
407         (void) foundBreaks.popi();
408         wordsFound -= 1;
409     }
410 
411     return wordsFound;
412 }
413 
414 /*
415  ******************************************************************
416  * LaoBreakEngine
417  */
418 
419 // How many words in a row are "good enough"?
420 static const int32_t LAO_LOOKAHEAD = 3;
421 
422 // Will not combine a non-word with a preceding dictionary word longer than this
423 static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;
424 
425 // Will not combine a non-word that shares at least this much prefix with a
426 // dictionary word, with a preceding word
427 static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;
428 
429 // Minimum word size
430 static const int32_t LAO_MIN_WORD = 2;
431 
432 // Minimum number of characters for two words
433 static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
434 
LaoBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)435 LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
436     : DictionaryBreakEngine(),
437       fDictionary(adoptDictionary)
438 {
439     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
440     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Laoo");
441     fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
442     if (U_SUCCESS(status)) {
443         setCharacters(fLaoWordSet);
444     }
445     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
446     fMarkSet.add(0x0020);
447     fEndWordSet = fLaoWordSet;
448     fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
449     fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
450     fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
451     fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels
452 
453     // Compact for caching.
454     fMarkSet.compact();
455     fEndWordSet.compact();
456     fBeginWordSet.compact();
457     UTRACE_EXIT_STATUS(status);
458 }
459 
~LaoBreakEngine()460 LaoBreakEngine::~LaoBreakEngine() {
461     delete fDictionary;
462 }
463 
464 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks) const465 LaoBreakEngine::divideUpDictionaryRange( UText *text,
466                                                 int32_t rangeStart,
467                                                 int32_t rangeEnd,
468                                                 UVector32 &foundBreaks ) const {
469     if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
470         return 0;       // Not enough characters for two words
471     }
472 
473     uint32_t wordsFound = 0;
474     int32_t cpWordLength = 0;
475     int32_t cuWordLength = 0;
476     int32_t current;
477     UErrorCode status = U_ZERO_ERROR;
478     PossibleWord words[LAO_LOOKAHEAD];
479 
480     utext_setNativeIndex(text, rangeStart);
481 
482     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
483         cuWordLength = 0;
484         cpWordLength = 0;
485 
486         // Look for candidate words at the current position
487         int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
488 
489         // If we found exactly one, use that
490         if (candidates == 1) {
491             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
492             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
493             wordsFound += 1;
494         }
495         // If there was more than one, see which one can take us forward the most words
496         else if (candidates > 1) {
497             // If we're already at the end of the range, we're done
498             if (utext_getNativeIndex(text) >= rangeEnd) {
499                 goto foundBest;
500             }
501             do {
502                 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
503                     // Followed by another dictionary word; mark first word as a good candidate
504                     words[wordsFound%LAO_LOOKAHEAD].markCurrent();
505 
506                     // If we're already at the end of the range, we're done
507                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
508                         goto foundBest;
509                     }
510 
511                     // See if any of the possible second words is followed by a third word
512                     do {
513                         // If we find a third word, stop right away
514                         if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
515                             words[wordsFound % LAO_LOOKAHEAD].markCurrent();
516                             goto foundBest;
517                         }
518                     }
519                     while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
520                 }
521             }
522             while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
523 foundBest:
524             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
525             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
526             wordsFound += 1;
527         }
528 
529         // We come here after having either found a word or not. We look ahead to the
530         // next word. If it's not a dictionary word, we will combine it withe the word we
531         // just found (if there is one), but only if the preceding word does not exceed
532         // the threshold.
533         // The text iterator should now be positioned at the end of the word we found.
534         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
535             // if it is a dictionary word, do nothing. If it isn't, then if there is
536             // no preceding word, or the non-word shares less than the minimum threshold
537             // of characters with a dictionary word, then scan to resynchronize
538             if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
539                   && (cuWordLength == 0
540                       || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
541                 // Look for a plausible word boundary
542                 int32_t remaining = rangeEnd - (current + cuWordLength);
543                 UChar32 pc;
544                 UChar32 uc;
545                 int32_t chars = 0;
546                 for (;;) {
547                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
548                     pc = utext_next32(text);
549                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
550                     chars += pcSize;
551                     remaining -= pcSize;
552                     if (remaining <= 0) {
553                         break;
554                     }
555                     uc = utext_current32(text);
556                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
557                         // Maybe. See if it's in the dictionary.
558                         // TODO: this looks iffy; compare with old code.
559                         int32_t num_candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
560                         utext_setNativeIndex(text, current + cuWordLength + chars);
561                         if (num_candidates > 0) {
562                             break;
563                         }
564                     }
565                 }
566 
567                 // Bump the word count if there wasn't already one
568                 if (cuWordLength <= 0) {
569                     wordsFound += 1;
570                 }
571 
572                 // Update the length with the passed-over characters
573                 cuWordLength += chars;
574             }
575             else {
576                 // Back up to where we were for next iteration
577                 utext_setNativeIndex(text, current + cuWordLength);
578             }
579         }
580 
581         // Never stop before a combining mark.
582         int32_t currPos;
583         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
584             utext_next32(text);
585             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
586         }
587 
588         // Look ahead for possible suffixes if a dictionary word does not follow.
589         // We do this in code rather than using a rule so that the heuristic
590         // resynch continues to function. For example, one of the suffix characters
591         // could be a typo in the middle of a word.
592         // NOT CURRENTLY APPLICABLE TO LAO
593 
594         // Did we find a word on this iteration? If so, push it on the break stack
595         if (cuWordLength > 0) {
596             foundBreaks.push((current+cuWordLength), status);
597         }
598     }
599 
600     // Don't return a break for the end of the dictionary range if there is one there.
601     if (foundBreaks.peeki() >= rangeEnd) {
602         (void) foundBreaks.popi();
603         wordsFound -= 1;
604     }
605 
606     return wordsFound;
607 }
608 
609 /*
610  ******************************************************************
611  * BurmeseBreakEngine
612  */
613 
614 // How many words in a row are "good enough"?
615 static const int32_t BURMESE_LOOKAHEAD = 3;
616 
617 // Will not combine a non-word with a preceding dictionary word longer than this
618 static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;
619 
620 // Will not combine a non-word that shares at least this much prefix with a
621 // dictionary word, with a preceding word
622 static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;
623 
624 // Minimum word size
625 static const int32_t BURMESE_MIN_WORD = 2;
626 
627 // Minimum number of characters for two words
628 static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
629 
BurmeseBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)630 BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
631     : DictionaryBreakEngine(),
632       fDictionary(adoptDictionary)
633 {
634     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
635     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Mymr");
636     fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
637     if (U_SUCCESS(status)) {
638         setCharacters(fBurmeseWordSet);
639     }
640     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
641     fMarkSet.add(0x0020);
642     fEndWordSet = fBurmeseWordSet;
643     fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels
644 
645     // Compact for caching.
646     fMarkSet.compact();
647     fEndWordSet.compact();
648     fBeginWordSet.compact();
649     UTRACE_EXIT_STATUS(status);
650 }
651 
~BurmeseBreakEngine()652 BurmeseBreakEngine::~BurmeseBreakEngine() {
653     delete fDictionary;
654 }
655 
656 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks) const657 BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
658                                                 int32_t rangeStart,
659                                                 int32_t rangeEnd,
660                                                 UVector32 &foundBreaks ) const {
661     if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
662         return 0;       // Not enough characters for two words
663     }
664 
665     uint32_t wordsFound = 0;
666     int32_t cpWordLength = 0;
667     int32_t cuWordLength = 0;
668     int32_t current;
669     UErrorCode status = U_ZERO_ERROR;
670     PossibleWord words[BURMESE_LOOKAHEAD];
671 
672     utext_setNativeIndex(text, rangeStart);
673 
674     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
675         cuWordLength = 0;
676         cpWordLength = 0;
677 
678         // Look for candidate words at the current position
679         int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
680 
681         // If we found exactly one, use that
682         if (candidates == 1) {
683             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
684             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
685             wordsFound += 1;
686         }
687         // If there was more than one, see which one can take us forward the most words
688         else if (candidates > 1) {
689             // If we're already at the end of the range, we're done
690             if (utext_getNativeIndex(text) >= rangeEnd) {
691                 goto foundBest;
692             }
693             do {
694                 if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
695                     // Followed by another dictionary word; mark first word as a good candidate
696                     words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
697 
698                     // If we're already at the end of the range, we're done
699                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
700                         goto foundBest;
701                     }
702 
703                     // See if any of the possible second words is followed by a third word
704                     do {
705                         // If we find a third word, stop right away
706                         if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
707                             words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
708                             goto foundBest;
709                         }
710                     }
711                     while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));
712                 }
713             }
714             while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
715 foundBest:
716             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
717             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
718             wordsFound += 1;
719         }
720 
721         // We come here after having either found a word or not. We look ahead to the
722         // next word. If it's not a dictionary word, we will combine it withe the word we
723         // just found (if there is one), but only if the preceding word does not exceed
724         // the threshold.
725         // The text iterator should now be positioned at the end of the word we found.
726         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
727             // if it is a dictionary word, do nothing. If it isn't, then if there is
728             // no preceding word, or the non-word shares less than the minimum threshold
729             // of characters with a dictionary word, then scan to resynchronize
730             if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
731                   && (cuWordLength == 0
732                       || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
733                 // Look for a plausible word boundary
734                 int32_t remaining = rangeEnd - (current + cuWordLength);
735                 UChar32 pc;
736                 UChar32 uc;
737                 int32_t chars = 0;
738                 for (;;) {
739                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
740                     pc = utext_next32(text);
741                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
742                     chars += pcSize;
743                     remaining -= pcSize;
744                     if (remaining <= 0) {
745                         break;
746                     }
747                     uc = utext_current32(text);
748                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
749                         // Maybe. See if it's in the dictionary.
750                         // TODO: this looks iffy; compare with old code.
751                         int32_t num_candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
752                         utext_setNativeIndex(text, current + cuWordLength + chars);
753                         if (num_candidates > 0) {
754                             break;
755                         }
756                     }
757                 }
758 
759                 // Bump the word count if there wasn't already one
760                 if (cuWordLength <= 0) {
761                     wordsFound += 1;
762                 }
763 
764                 // Update the length with the passed-over characters
765                 cuWordLength += chars;
766             }
767             else {
768                 // Back up to where we were for next iteration
769                 utext_setNativeIndex(text, current + cuWordLength);
770             }
771         }
772 
773         // Never stop before a combining mark.
774         int32_t currPos;
775         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
776             utext_next32(text);
777             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
778         }
779 
780         // Look ahead for possible suffixes if a dictionary word does not follow.
781         // We do this in code rather than using a rule so that the heuristic
782         // resynch continues to function. For example, one of the suffix characters
783         // could be a typo in the middle of a word.
784         // NOT CURRENTLY APPLICABLE TO BURMESE
785 
786         // Did we find a word on this iteration? If so, push it on the break stack
787         if (cuWordLength > 0) {
788             foundBreaks.push((current+cuWordLength), status);
789         }
790     }
791 
792     // Don't return a break for the end of the dictionary range if there is one there.
793     if (foundBreaks.peeki() >= rangeEnd) {
794         (void) foundBreaks.popi();
795         wordsFound -= 1;
796     }
797 
798     return wordsFound;
799 }
800 
801 /*
802  ******************************************************************
803  * KhmerBreakEngine
804  */
805 
806 // How many words in a row are "good enough"?
807 static const int32_t KHMER_LOOKAHEAD = 3;
808 
809 // Will not combine a non-word with a preceding dictionary word longer than this
810 static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
811 
812 // Will not combine a non-word that shares at least this much prefix with a
813 // dictionary word, with a preceding word
814 static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
815 
816 // Minimum word size
817 static const int32_t KHMER_MIN_WORD = 2;
818 
819 // Minimum number of characters for two words
820 static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
821 
KhmerBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)822 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
823     : DictionaryBreakEngine(),
824       fDictionary(adoptDictionary)
825 {
826     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
827     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr");
828     fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
829     if (U_SUCCESS(status)) {
830         setCharacters(fKhmerWordSet);
831     }
832     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
833     fMarkSet.add(0x0020);
834     fEndWordSet = fKhmerWordSet;
835     fBeginWordSet.add(0x1780, 0x17B3);
836     //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
837     //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
838     //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
839     fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
840     //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
841 //    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
842 //    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
843 //    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
844 //    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
845 //    fSuffixSet.add(THAI_PAIYANNOI);
846 //    fSuffixSet.add(THAI_MAIYAMOK);
847 
848     // Compact for caching.
849     fMarkSet.compact();
850     fEndWordSet.compact();
851     fBeginWordSet.compact();
852 //    fSuffixSet.compact();
853     UTRACE_EXIT_STATUS(status);
854 }
855 
~KhmerBreakEngine()856 KhmerBreakEngine::~KhmerBreakEngine() {
857     delete fDictionary;
858 }
859 
860 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks) const861 KhmerBreakEngine::divideUpDictionaryRange( UText *text,
862                                                 int32_t rangeStart,
863                                                 int32_t rangeEnd,
864                                                 UVector32 &foundBreaks ) const {
865     if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
866         return 0;       // Not enough characters for two words
867     }
868 
869     uint32_t wordsFound = 0;
870     int32_t cpWordLength = 0;
871     int32_t cuWordLength = 0;
872     int32_t current;
873     UErrorCode status = U_ZERO_ERROR;
874     PossibleWord words[KHMER_LOOKAHEAD];
875 
876     utext_setNativeIndex(text, rangeStart);
877 
878     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
879         cuWordLength = 0;
880         cpWordLength = 0;
881 
882         // Look for candidate words at the current position
883         int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
884 
885         // If we found exactly one, use that
886         if (candidates == 1) {
887             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
888             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
889             wordsFound += 1;
890         }
891 
892         // If there was more than one, see which one can take us forward the most words
893         else if (candidates > 1) {
894             // If we're already at the end of the range, we're done
895             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
896                 goto foundBest;
897             }
898             do {
899                 if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
900                     // Followed by another dictionary word; mark first word as a good candidate
901                     words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
902 
903                     // If we're already at the end of the range, we're done
904                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
905                         goto foundBest;
906                     }
907 
908                     // See if any of the possible second words is followed by a third word
909                     do {
910                         // If we find a third word, stop right away
911                         if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
912                             words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
913                             goto foundBest;
914                         }
915                     }
916                     while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
917                 }
918             }
919             while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
920 foundBest:
921             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
922             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
923             wordsFound += 1;
924         }
925 
926         // We come here after having either found a word or not. We look ahead to the
927         // next word. If it's not a dictionary word, we will combine it with the word we
928         // just found (if there is one), but only if the preceding word does not exceed
929         // the threshold.
930         // The text iterator should now be positioned at the end of the word we found.
931         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
932             // if it is a dictionary word, do nothing. If it isn't, then if there is
933             // no preceding word, or the non-word shares less than the minimum threshold
934             // of characters with a dictionary word, then scan to resynchronize
935             if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
936                   && (cuWordLength == 0
937                       || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
938                 // Look for a plausible word boundary
939                 int32_t remaining = rangeEnd - (current+cuWordLength);
940                 UChar32 pc;
941                 UChar32 uc;
942                 int32_t chars = 0;
943                 for (;;) {
944                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
945                     pc = utext_next32(text);
946                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
947                     chars += pcSize;
948                     remaining -= pcSize;
949                     if (remaining <= 0) {
950                         break;
951                     }
952                     uc = utext_current32(text);
953                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
954                         // Maybe. See if it's in the dictionary.
955                         int32_t num_candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
956                         utext_setNativeIndex(text, current+cuWordLength+chars);
957                         if (num_candidates > 0) {
958                             break;
959                         }
960                     }
961                 }
962 
963                 // Bump the word count if there wasn't already one
964                 if (cuWordLength <= 0) {
965                     wordsFound += 1;
966                 }
967 
968                 // Update the length with the passed-over characters
969                 cuWordLength += chars;
970             }
971             else {
972                 // Back up to where we were for next iteration
973                 utext_setNativeIndex(text, current+cuWordLength);
974             }
975         }
976 
977         // Never stop before a combining mark.
978         int32_t currPos;
979         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
980             utext_next32(text);
981             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
982         }
983 
984         // Look ahead for possible suffixes if a dictionary word does not follow.
985         // We do this in code rather than using a rule so that the heuristic
986         // resynch continues to function. For example, one of the suffix characters
987         // could be a typo in the middle of a word.
988 //        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
989 //            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
990 //                && fSuffixSet.contains(uc = utext_current32(text))) {
991 //                if (uc == KHMER_PAIYANNOI) {
992 //                    if (!fSuffixSet.contains(utext_previous32(text))) {
993 //                        // Skip over previous end and PAIYANNOI
994 //                        utext_next32(text);
995 //                        utext_next32(text);
996 //                        wordLength += 1;            // Add PAIYANNOI to word
997 //                        uc = utext_current32(text);     // Fetch next character
998 //                    }
999 //                    else {
1000 //                        // Restore prior position
1001 //                        utext_next32(text);
1002 //                    }
1003 //                }
1004 //                if (uc == KHMER_MAIYAMOK) {
1005 //                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
1006 //                        // Skip over previous end and MAIYAMOK
1007 //                        utext_next32(text);
1008 //                        utext_next32(text);
1009 //                        wordLength += 1;            // Add MAIYAMOK to word
1010 //                    }
1011 //                    else {
1012 //                        // Restore prior position
1013 //                        utext_next32(text);
1014 //                    }
1015 //                }
1016 //            }
1017 //            else {
1018 //                utext_setNativeIndex(text, current+wordLength);
1019 //            }
1020 //        }
1021 
1022         // Did we find a word on this iteration? If so, push it on the break stack
1023         if (cuWordLength > 0) {
1024             foundBreaks.push((current+cuWordLength), status);
1025         }
1026     }
1027 
1028     // Don't return a break for the end of the dictionary range if there is one there.
1029     if (foundBreaks.peeki() >= rangeEnd) {
1030         (void) foundBreaks.popi();
1031         wordsFound -= 1;
1032     }
1033 
1034     return wordsFound;
1035 }
1036 
1037 #if !UCONFIG_NO_NORMALIZATION
1038 /*
1039  ******************************************************************
1040  * CjkBreakEngine
1041  */
1042 static const uint32_t kuint32max = 0xFFFFFFFF;
CjkBreakEngine(DictionaryMatcher * adoptDictionary,LanguageType type,UErrorCode & status)1043 CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
1044 : DictionaryBreakEngine(), fDictionary(adoptDictionary) {
1045     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
1046     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani");
1047     // Korean dictionary only includes Hangul syllables
1048     fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
1049     fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
1050     fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
1051     fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
1052     nfkcNorm2 = Normalizer2::getNFKCInstance(status);
1053 
1054     if (U_SUCCESS(status)) {
1055         // handle Korean and Japanese/Chinese using different dictionaries
1056         if (type == kKorean) {
1057             setCharacters(fHangulWordSet);
1058         } else { //Chinese and Japanese
1059             UnicodeSet cjSet;
1060             cjSet.addAll(fHanWordSet);
1061             cjSet.addAll(fKatakanaWordSet);
1062             cjSet.addAll(fHiraganaWordSet);
1063             cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
1064             cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
1065             setCharacters(cjSet);
1066         }
1067     }
1068     UTRACE_EXIT_STATUS(status);
1069 }
1070 
~CjkBreakEngine()1071 CjkBreakEngine::~CjkBreakEngine(){
1072     delete fDictionary;
1073 }
1074 
1075 // The katakanaCost values below are based on the length frequencies of all
1076 // katakana phrases in the dictionary
1077 static const int32_t kMaxKatakanaLength = 8;
1078 static const int32_t kMaxKatakanaGroupLength = 20;
1079 static const uint32_t maxSnlp = 255;
1080 
getKatakanaCost(int32_t wordLength)1081 static inline uint32_t getKatakanaCost(int32_t wordLength){
1082     //TODO: fill array with actual values from dictionary!
1083     static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
1084                                        = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
1085     return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
1086 }
1087 
isKatakana(UChar32 value)1088 static inline bool isKatakana(UChar32 value) {
1089     return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
1090             (value >= 0xFF66 && value <= 0xFF9f);
1091 }
1092 
1093 
1094 // Function for accessing internal utext flags.
1095 //   Replicates an internal UText function.
1096 
utext_i32_flag(int32_t bitIndex)1097 static inline int32_t utext_i32_flag(int32_t bitIndex) {
1098     return (int32_t)1 << bitIndex;
1099 }
1100 
1101 
1102 /*
1103  * @param text A UText representing the text
1104  * @param rangeStart The start of the range of dictionary characters
1105  * @param rangeEnd The end of the range of dictionary characters
1106  * @param foundBreaks vector<int32> to receive the break positions
1107  * @return The number of breaks found
1108  */
1109 int32_t
divideUpDictionaryRange(UText * inText,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks) const1110 CjkBreakEngine::divideUpDictionaryRange( UText *inText,
1111         int32_t rangeStart,
1112         int32_t rangeEnd,
1113         UVector32 &foundBreaks ) const {
1114     if (rangeStart >= rangeEnd) {
1115         return 0;
1116     }
1117 
1118     // UnicodeString version of input UText, NFKC normalized if necessary.
1119     UnicodeString inString;
1120 
1121     // inputMap[inStringIndex] = corresponding native index from UText inText.
1122     // If NULL then mapping is 1:1
1123     LocalPointer<UVector32>     inputMap;
1124 
1125     UErrorCode     status      = U_ZERO_ERROR;
1126 
1127 
1128     // if UText has the input string as one contiguous UTF-16 chunk
1129     if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
1130          inText->chunkNativeStart <= rangeStart &&
1131          inText->chunkNativeLimit >= rangeEnd   &&
1132          inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
1133 
1134         // Input UText is in one contiguous UTF-16 chunk.
1135         // Use Read-only aliasing UnicodeString.
1136         inString.setTo(FALSE,
1137                        inText->chunkContents + rangeStart - inText->chunkNativeStart,
1138                        rangeEnd - rangeStart);
1139     } else {
1140         // Copy the text from the original inText (UText) to inString (UnicodeString).
1141         // Create a map from UnicodeString indices -> UText offsets.
1142         utext_setNativeIndex(inText, rangeStart);
1143         int32_t limit = rangeEnd;
1144         U_ASSERT(limit <= utext_nativeLength(inText));
1145         if (limit > utext_nativeLength(inText)) {
1146             limit = (int32_t)utext_nativeLength(inText);
1147         }
1148         inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1149         if (U_FAILURE(status)) {
1150             return 0;
1151         }
1152         while (utext_getNativeIndex(inText) < limit) {
1153             int32_t nativePosition = (int32_t)utext_getNativeIndex(inText);
1154             UChar32 c = utext_next32(inText);
1155             U_ASSERT(c != U_SENTINEL);
1156             inString.append(c);
1157             while (inputMap->size() < inString.length()) {
1158                 inputMap->addElement(nativePosition, status);
1159             }
1160         }
1161         inputMap->addElement(limit, status);
1162     }
1163 
1164 
1165     if (!nfkcNorm2->isNormalized(inString, status)) {
1166         UnicodeString normalizedInput;
1167         //  normalizedMap[normalizedInput position] ==  original UText position.
1168         LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
1169         if (U_FAILURE(status)) {
1170             return 0;
1171         }
1172 
1173         UnicodeString fragment;
1174         UnicodeString normalizedFragment;
1175         for (int32_t srcI = 0; srcI < inString.length();) {  // Once per normalization chunk
1176             fragment.remove();
1177             int32_t fragmentStartI = srcI;
1178             UChar32 c = inString.char32At(srcI);
1179             for (;;) {
1180                 fragment.append(c);
1181                 srcI = inString.moveIndex32(srcI, 1);
1182                 if (srcI == inString.length()) {
1183                     break;
1184                 }
1185                 c = inString.char32At(srcI);
1186                 if (nfkcNorm2->hasBoundaryBefore(c)) {
1187                     break;
1188                 }
1189             }
1190             nfkcNorm2->normalize(fragment, normalizedFragment, status);
1191             normalizedInput.append(normalizedFragment);
1192 
1193             // Map every position in the normalized chunk to the start of the chunk
1194             //   in the original input.
1195             int32_t fragmentOriginalStart = inputMap.isValid() ?
1196                     inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
1197             while (normalizedMap->size() < normalizedInput.length()) {
1198                 normalizedMap->addElement(fragmentOriginalStart, status);
1199                 if (U_FAILURE(status)) {
1200                     break;
1201                 }
1202             }
1203         }
1204         U_ASSERT(normalizedMap->size() == normalizedInput.length());
1205         int32_t nativeEnd = inputMap.isValid() ?
1206                 inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
1207         normalizedMap->addElement(nativeEnd, status);
1208 
1209         inputMap = std::move(normalizedMap);
1210         inString = std::move(normalizedInput);
1211     }
1212 
1213     int32_t numCodePts = inString.countChar32();
1214     if (numCodePts != inString.length()) {
1215         // There are supplementary characters in the input.
1216         // The dictionary will produce boundary positions in terms of code point indexes,
1217         //   not in terms of code unit string indexes.
1218         // Use the inputMap mechanism to take care of this in addition to indexing differences
1219         //    from normalization and/or UTF-8 input.
1220         UBool hadExistingMap = inputMap.isValid();
1221         if (!hadExistingMap) {
1222             inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1223             if (U_FAILURE(status)) {
1224                 return 0;
1225             }
1226         }
1227         int32_t cpIdx = 0;
1228         for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
1229             U_ASSERT(cuIdx >= cpIdx);
1230             if (hadExistingMap) {
1231                 inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
1232             } else {
1233                 inputMap->addElement(cuIdx+rangeStart, status);
1234             }
1235             cpIdx++;
1236             if (cuIdx == inString.length()) {
1237                break;
1238             }
1239         }
1240     }
1241 
1242     // bestSnlp[i] is the snlp of the best segmentation of the first i
1243     // code points in the range to be matched.
1244     UVector32 bestSnlp(numCodePts + 1, status);
1245     bestSnlp.addElement(0, status);
1246     for(int32_t i = 1; i <= numCodePts; i++) {
1247         bestSnlp.addElement(kuint32max, status);
1248     }
1249 
1250 
1251     // prev[i] is the index of the last CJK code point in the previous word in
1252     // the best segmentation of the first i characters.
1253     UVector32 prev(numCodePts + 1, status);
1254     for(int32_t i = 0; i <= numCodePts; i++){
1255         prev.addElement(-1, status);
1256     }
1257 
1258     const int32_t maxWordSize = 20;
1259     UVector32 values(numCodePts, status);
1260     values.setSize(numCodePts);
1261     UVector32 lengths(numCodePts, status);
1262     lengths.setSize(numCodePts);
1263 
1264     UText fu = UTEXT_INITIALIZER;
1265     utext_openUnicodeString(&fu, &inString, &status);
1266 
1267     // Dynamic programming to find the best segmentation.
1268 
1269     // In outer loop, i  is the code point index,
1270     //                ix is the corresponding string (code unit) index.
1271     //    They differ when the string contains supplementary characters.
1272     int32_t ix = 0;
1273     bool is_prev_katakana = false;
1274     for (int32_t i = 0;  i < numCodePts;  ++i, ix = inString.moveIndex32(ix, 1)) {
1275         if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
1276             continue;
1277         }
1278 
1279         int32_t count;
1280         utext_setNativeIndex(&fu, ix);
1281         count = fDictionary->matches(&fu, maxWordSize, numCodePts,
1282                              NULL, lengths.getBuffer(), values.getBuffer(), NULL);
1283                              // Note: lengths is filled with code point lengths
1284                              //       The NULL parameter is the ignored code unit lengths.
1285 
1286         // if there are no single character matches found in the dictionary
1287         // starting with this character, treat character as a 1-character word
1288         // with the highest value possible, i.e. the least likely to occur.
1289         // Exclude Korean characters from this treatment, as they should be left
1290         // together by default.
1291         if ((count == 0 || lengths.elementAti(0) != 1) &&
1292                 !fHangulWordSet.contains(inString.char32At(ix))) {
1293             values.setElementAt(maxSnlp, count);   // 255
1294             lengths.setElementAt(1, count++);
1295         }
1296 
1297         for (int32_t j = 0; j < count; j++) {
1298             uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
1299             int32_t ln_j_i = lengths.elementAti(j) + i;
1300             if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
1301                 bestSnlp.setElementAt(newSnlp, ln_j_i);
1302                 prev.setElementAt(i, ln_j_i);
1303             }
1304         }
1305 
1306         // In Japanese,
1307         // Katakana word in single character is pretty rare. So we apply
1308         // the following heuristic to Katakana: any continuous run of Katakana
1309         // characters is considered a candidate word with a default cost
1310         // specified in the katakanaCost table according to its length.
1311 
1312         bool is_katakana = isKatakana(inString.char32At(ix));
1313         int32_t katakanaRunLength = 1;
1314         if (!is_prev_katakana && is_katakana) {
1315             int32_t j = inString.moveIndex32(ix, 1);
1316             // Find the end of the continuous run of Katakana characters
1317             while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
1318                     isKatakana(inString.char32At(j))) {
1319                 j = inString.moveIndex32(j, 1);
1320                 katakanaRunLength++;
1321             }
1322             if (katakanaRunLength < kMaxKatakanaGroupLength) {
1323                 uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
1324                 if (newSnlp < (uint32_t)bestSnlp.elementAti(i+katakanaRunLength)) {
1325                     bestSnlp.setElementAt(newSnlp, i+katakanaRunLength);
1326                     prev.setElementAt(i, i+katakanaRunLength);  // prev[j] = i;
1327                 }
1328             }
1329         }
1330         is_prev_katakana = is_katakana;
1331     }
1332     utext_close(&fu);
1333 
1334     // Start pushing the optimal offset index into t_boundary (t for tentative).
1335     // prev[numCodePts] is guaranteed to be meaningful.
1336     // We'll first push in the reverse order, i.e.,
1337     // t_boundary[0] = numCodePts, and afterwards do a swap.
1338     UVector32 t_boundary(numCodePts+1, status);
1339 
1340     int32_t numBreaks = 0;
1341     // No segmentation found, set boundary to end of range
1342     if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
1343         t_boundary.addElement(numCodePts, status);
1344         numBreaks++;
1345     } else {
1346         for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
1347             t_boundary.addElement(i, status);
1348             numBreaks++;
1349         }
1350         U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
1351     }
1352 
1353     // Add a break for the start of the dictionary range if there is not one
1354     // there already.
1355     if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
1356         t_boundary.addElement(0, status);
1357         numBreaks++;
1358     }
1359 
1360     // Now that we're done, convert positions in t_boundary[] (indices in
1361     // the normalized input string) back to indices in the original input UText
1362     // while reversing t_boundary and pushing values to foundBreaks.
1363     int32_t prevCPPos = -1;
1364     int32_t prevUTextPos = -1;
1365     for (int32_t i = numBreaks-1; i >= 0; i--) {
1366         int32_t cpPos = t_boundary.elementAti(i);
1367         U_ASSERT(cpPos > prevCPPos);
1368         int32_t utextPos =  inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
1369         U_ASSERT(utextPos >= prevUTextPos);
1370         if (utextPos > prevUTextPos) {
1371             // Boundaries are added to foundBreaks output in ascending order.
1372             U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
1373             foundBreaks.push(utextPos, status);
1374         } else {
1375             // Normalization expanded the input text, the dictionary found a boundary
1376             // within the expansion, giving two boundaries with the same index in the
1377             // original text. Ignore the second. See ticket #12918.
1378             --numBreaks;
1379         }
1380         prevCPPos = cpPos;
1381         prevUTextPos = utextPos;
1382     }
1383     (void)prevCPPos; // suppress compiler warnings about unused variable
1384 
1385     // inString goes out of scope
1386     // inputMap goes out of scope
1387     return numBreaks;
1388 }
1389 #endif
1390 
1391 U_NAMESPACE_END
1392 
1393 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1394 
1395