• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /**
4  *******************************************************************************
5  * Copyright (C) 2006-2016, International Business Machines Corporation
6  * and others. All Rights Reserved.
7  *******************************************************************************
8  */
9 
10 #include <utility>
11 
12 #include "unicode/utypes.h"
13 
14 #if !UCONFIG_NO_BREAK_ITERATION
15 
16 #include "brkeng.h"
17 #include "dictbe.h"
18 #include "unicode/uniset.h"
19 #include "unicode/chariter.h"
20 #include "unicode/ubrk.h"
21 #include "utracimp.h"
22 #include "uvectr32.h"
23 #include "uvector.h"
24 #include "uassert.h"
25 #include "unicode/normlzr.h"
26 #include "cmemory.h"
27 #include "dictionarydata.h"
28 
29 U_NAMESPACE_BEGIN
30 
31 /*
32  ******************************************************************
33  */
34 
DictionaryBreakEngine()35 DictionaryBreakEngine::DictionaryBreakEngine() {
36 }
37 
~DictionaryBreakEngine()38 DictionaryBreakEngine::~DictionaryBreakEngine() {
39 }
40 
41 UBool
handles(UChar32 c) const42 DictionaryBreakEngine::handles(UChar32 c) const {
43     return fSet.contains(c);
44 }
45 
46 int32_t
findBreaks(UText * text,int32_t startPos,int32_t endPos,UVector32 & foundBreaks) const47 DictionaryBreakEngine::findBreaks( UText *text,
48                                  int32_t startPos,
49                                  int32_t endPos,
50                                  UVector32 &foundBreaks ) const {
51     (void)startPos;            // TODO: remove this param?
52     int32_t result = 0;
53 
54     // Find the span of characters included in the set.
55     //   The span to break begins at the current position in the text, and
56     //   extends towards the start or end of the text, depending on 'reverse'.
57 
58     int32_t start = (int32_t)utext_getNativeIndex(text);
59     int32_t current;
60     int32_t rangeStart;
61     int32_t rangeEnd;
62     UChar32 c = utext_current32(text);
63     while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
64         utext_next32(text);         // TODO:  recast loop for postincrement
65         c = utext_current32(text);
66     }
67     rangeStart = start;
68     rangeEnd = current;
69     result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
70     utext_setNativeIndex(text, current);
71 
72     return result;
73 }
74 
75 void
setCharacters(const UnicodeSet & set)76 DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
77     fSet = set;
78     // Compact for caching
79     fSet.compact();
80 }
81 
82 /*
83  ******************************************************************
84  * PossibleWord
85  */
86 
87 // Helper class for improving readability of the Thai/Lao/Khmer word break
88 // algorithm. The implementation is completely inline.
89 
90 // List size, limited by the maximum number of words in the dictionary
91 // that form a nested sequence.
92 static const int32_t POSSIBLE_WORD_LIST_MAX = 20;
93 
94 class PossibleWord {
95 private:
96     // list of word candidate lengths, in increasing length order
97     // TODO: bytes would be sufficient for word lengths.
98     int32_t   count;      // Count of candidates
99     int32_t   prefix;     // The longest match with a dictionary word
100     int32_t   offset;     // Offset in the text of these candidates
101     int32_t   mark;       // The preferred candidate's offset
102     int32_t   current;    // The candidate we're currently looking at
103     int32_t   cuLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code units.
104     int32_t   cpLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code points.
105 
106 public:
PossibleWord()107     PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {}
~PossibleWord()108     ~PossibleWord() {}
109 
110     // Fill the list of candidates if needed, select the longest, and return the number found
111     int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
112 
113     // Select the currently marked candidate, point after it in the text, and invalidate self
114     int32_t   acceptMarked( UText *text );
115 
116     // Back up from the current candidate to the next shorter one; return TRUE if that exists
117     // and point the text after it
118     UBool     backUp( UText *text );
119 
120     // Return the longest prefix this candidate location shares with a dictionary word
121     // Return value is in code points.
longestPrefix()122     int32_t   longestPrefix() { return prefix; }
123 
124     // Mark the current candidate as the one we like
markCurrent()125     void      markCurrent() { mark = current; }
126 
127     // Get length in code points of the marked word.
markedCPLength()128     int32_t   markedCPLength() { return cpLengths[mark]; }
129 };
130 
131 
candidates(UText * text,DictionaryMatcher * dict,int32_t rangeEnd)132 int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
133     // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
134     int32_t start = (int32_t)utext_getNativeIndex(text);
135     if (start != offset) {
136         offset = start;
137         count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
138         // Dictionary leaves text after longest prefix, not longest word. Back up.
139         if (count <= 0) {
140             utext_setNativeIndex(text, start);
141         }
142     }
143     if (count > 0) {
144         utext_setNativeIndex(text, start+cuLengths[count-1]);
145     }
146     current = count-1;
147     mark = current;
148     return count;
149 }
150 
151 int32_t
acceptMarked(UText * text)152 PossibleWord::acceptMarked( UText *text ) {
153     utext_setNativeIndex(text, offset + cuLengths[mark]);
154     return cuLengths[mark];
155 }
156 
157 
158 UBool
backUp(UText * text)159 PossibleWord::backUp( UText *text ) {
160     if (current > 0) {
161         utext_setNativeIndex(text, offset + cuLengths[--current]);
162         return TRUE;
163     }
164     return FALSE;
165 }
166 
167 /*
168  ******************************************************************
169  * ThaiBreakEngine
170  */
171 
172 // How many words in a row are "good enough"?
173 static const int32_t THAI_LOOKAHEAD = 3;
174 
175 // Will not combine a non-word with a preceding dictionary word longer than this
176 static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;
177 
178 // Will not combine a non-word that shares at least this much prefix with a
179 // dictionary word, with a preceding word
180 static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;
181 
182 // Ellision character
183 static const int32_t THAI_PAIYANNOI = 0x0E2F;
184 
185 // Repeat character
186 static const int32_t THAI_MAIYAMOK = 0x0E46;
187 
188 // Minimum word size
189 static const int32_t THAI_MIN_WORD = 2;
190 
191 // Minimum number of characters for two words
192 static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
193 
ThaiBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)194 ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
195     : DictionaryBreakEngine(),
196       fDictionary(adoptDictionary)
197 {
198     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
199     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Thai");
200     fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
201     if (U_SUCCESS(status)) {
202         setCharacters(fThaiWordSet);
203     }
204     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
205     fMarkSet.add(0x0020);
206     fEndWordSet = fThaiWordSet;
207     fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
208     fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
209     fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
210     fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
211     fSuffixSet.add(THAI_PAIYANNOI);
212     fSuffixSet.add(THAI_MAIYAMOK);
213 
214     // Compact for caching.
215     fMarkSet.compact();
216     fEndWordSet.compact();
217     fBeginWordSet.compact();
218     fSuffixSet.compact();
219     UTRACE_EXIT_STATUS(status);
220 }
221 
~ThaiBreakEngine()222 ThaiBreakEngine::~ThaiBreakEngine() {
223     delete fDictionary;
224 }
225 
226 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks) const227 ThaiBreakEngine::divideUpDictionaryRange( UText *text,
228                                                 int32_t rangeStart,
229                                                 int32_t rangeEnd,
230                                                 UVector32 &foundBreaks ) const {
231     utext_setNativeIndex(text, rangeStart);
232     utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
233     if (utext_getNativeIndex(text) >= rangeEnd) {
234         return 0;       // Not enough characters for two words
235     }
236     utext_setNativeIndex(text, rangeStart);
237 
238 
239     uint32_t wordsFound = 0;
240     int32_t cpWordLength = 0;    // Word Length in Code Points.
241     int32_t cuWordLength = 0;    // Word length in code units (UText native indexing)
242     int32_t current;
243     UErrorCode status = U_ZERO_ERROR;
244     PossibleWord words[THAI_LOOKAHEAD];
245 
246     utext_setNativeIndex(text, rangeStart);
247 
248     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
249         cpWordLength = 0;
250         cuWordLength = 0;
251 
252         // Look for candidate words at the current position
253         int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
254 
255         // If we found exactly one, use that
256         if (candidates == 1) {
257             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
258             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
259             wordsFound += 1;
260         }
261         // If there was more than one, see which one can take us forward the most words
262         else if (candidates > 1) {
263             // If we're already at the end of the range, we're done
264             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
265                 goto foundBest;
266             }
267             do {
268                 int32_t wordsMatched = 1;
269                 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
270                     if (wordsMatched < 2) {
271                         // Followed by another dictionary word; mark first word as a good candidate
272                         words[wordsFound%THAI_LOOKAHEAD].markCurrent();
273                         wordsMatched = 2;
274                     }
275 
276                     // If we're already at the end of the range, we're done
277                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
278                         goto foundBest;
279                     }
280 
281                     // See if any of the possible second words is followed by a third word
282                     do {
283                         // If we find a third word, stop right away
284                         if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
285                             words[wordsFound % THAI_LOOKAHEAD].markCurrent();
286                             goto foundBest;
287                         }
288                     }
289                     while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
290                 }
291             }
292             while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
293 foundBest:
294             // Set UText position to after the accepted word.
295             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
296             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
297             wordsFound += 1;
298         }
299 
300         // We come here after having either found a word or not. We look ahead to the
301         // next word. If it's not a dictionary word, we will combine it with the word we
302         // just found (if there is one), but only if the preceding word does not exceed
303         // the threshold.
304         // The text iterator should now be positioned at the end of the word we found.
305 
306         UChar32 uc = 0;
307         if ((int32_t)utext_getNativeIndex(text) < rangeEnd &&  cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
308             // if it is a dictionary word, do nothing. If it isn't, then if there is
309             // no preceding word, or the non-word shares less than the minimum threshold
310             // of characters with a dictionary word, then scan to resynchronize
311             if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
312                   && (cuWordLength == 0
313                       || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
314                 // Look for a plausible word boundary
315                 int32_t remaining = rangeEnd - (current+cuWordLength);
316                 UChar32 pc;
317                 int32_t chars = 0;
318                 for (;;) {
319                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
320                     pc = utext_next32(text);
321                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
322                     chars += pcSize;
323                     remaining -= pcSize;
324                     if (remaining <= 0) {
325                         break;
326                     }
327                     uc = utext_current32(text);
328                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
329                         // Maybe. See if it's in the dictionary.
330                         // NOTE: In the original Apple code, checked that the next
331                         // two characters after uc were not 0x0E4C THANTHAKHAT before
332                         // checking the dictionary. That is just a performance filter,
333                         // but it's not clear it's faster than checking the trie.
334                         int32_t num_candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
335                         utext_setNativeIndex(text, current + cuWordLength + chars);
336                         if (num_candidates > 0) {
337                             break;
338                         }
339                     }
340                 }
341 
342                 // Bump the word count if there wasn't already one
343                 if (cuWordLength <= 0) {
344                     wordsFound += 1;
345                 }
346 
347                 // Update the length with the passed-over characters
348                 cuWordLength += chars;
349             }
350             else {
351                 // Back up to where we were for next iteration
352                 utext_setNativeIndex(text, current+cuWordLength);
353             }
354         }
355 
356         // Never stop before a combining mark.
357         int32_t currPos;
358         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
359             utext_next32(text);
360             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
361         }
362 
363         // Look ahead for possible suffixes if a dictionary word does not follow.
364         // We do this in code rather than using a rule so that the heuristic
365         // resynch continues to function. For example, one of the suffix characters
366         // could be a typo in the middle of a word.
367         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {
368             if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
369                 && fSuffixSet.contains(uc = utext_current32(text))) {
370                 if (uc == THAI_PAIYANNOI) {
371                     if (!fSuffixSet.contains(utext_previous32(text))) {
372                         // Skip over previous end and PAIYANNOI
373                         utext_next32(text);
374                         int32_t paiyannoiIndex = (int32_t)utext_getNativeIndex(text);
375                         utext_next32(text);
376                         cuWordLength += (int32_t)utext_getNativeIndex(text) - paiyannoiIndex;    // Add PAIYANNOI to word
377                         uc = utext_current32(text);     // Fetch next character
378                     }
379                     else {
380                         // Restore prior position
381                         utext_next32(text);
382                     }
383                 }
384                 if (uc == THAI_MAIYAMOK) {
385                     if (utext_previous32(text) != THAI_MAIYAMOK) {
386                         // Skip over previous end and MAIYAMOK
387                         utext_next32(text);
388                         int32_t maiyamokIndex = (int32_t)utext_getNativeIndex(text);
389                         utext_next32(text);
390                         cuWordLength += (int32_t)utext_getNativeIndex(text) - maiyamokIndex;    // Add MAIYAMOK to word
391                     }
392                     else {
393                         // Restore prior position
394                         utext_next32(text);
395                     }
396                 }
397             }
398             else {
399                 utext_setNativeIndex(text, current+cuWordLength);
400             }
401         }
402 
403         // Did we find a word on this iteration? If so, push it on the break stack
404         if (cuWordLength > 0) {
405             foundBreaks.push((current+cuWordLength), status);
406         }
407     }
408 
409     // Don't return a break for the end of the dictionary range if there is one there.
410     if (foundBreaks.peeki() >= rangeEnd) {
411         (void) foundBreaks.popi();
412         wordsFound -= 1;
413     }
414 
415     return wordsFound;
416 }
417 
418 /*
419  ******************************************************************
420  * LaoBreakEngine
421  */
422 
423 // How many words in a row are "good enough"?
424 static const int32_t LAO_LOOKAHEAD = 3;
425 
426 // Will not combine a non-word with a preceding dictionary word longer than this
427 static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;
428 
429 // Will not combine a non-word that shares at least this much prefix with a
430 // dictionary word, with a preceding word
431 static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;
432 
433 // Minimum word size
434 static const int32_t LAO_MIN_WORD = 2;
435 
436 // Minimum number of characters for two words
437 static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
438 
LaoBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)439 LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
440     : DictionaryBreakEngine(),
441       fDictionary(adoptDictionary)
442 {
443     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
444     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Laoo");
445     fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
446     if (U_SUCCESS(status)) {
447         setCharacters(fLaoWordSet);
448     }
449     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
450     fMarkSet.add(0x0020);
451     fEndWordSet = fLaoWordSet;
452     fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
453     fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
454     fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
455     fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels
456 
457     // Compact for caching.
458     fMarkSet.compact();
459     fEndWordSet.compact();
460     fBeginWordSet.compact();
461     UTRACE_EXIT_STATUS(status);
462 }
463 
~LaoBreakEngine()464 LaoBreakEngine::~LaoBreakEngine() {
465     delete fDictionary;
466 }
467 
468 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks) const469 LaoBreakEngine::divideUpDictionaryRange( UText *text,
470                                                 int32_t rangeStart,
471                                                 int32_t rangeEnd,
472                                                 UVector32 &foundBreaks ) const {
473     if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
474         return 0;       // Not enough characters for two words
475     }
476 
477     uint32_t wordsFound = 0;
478     int32_t cpWordLength = 0;
479     int32_t cuWordLength = 0;
480     int32_t current;
481     UErrorCode status = U_ZERO_ERROR;
482     PossibleWord words[LAO_LOOKAHEAD];
483 
484     utext_setNativeIndex(text, rangeStart);
485 
486     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
487         cuWordLength = 0;
488         cpWordLength = 0;
489 
490         // Look for candidate words at the current position
491         int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
492 
493         // If we found exactly one, use that
494         if (candidates == 1) {
495             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
496             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
497             wordsFound += 1;
498         }
499         // If there was more than one, see which one can take us forward the most words
500         else if (candidates > 1) {
501             // If we're already at the end of the range, we're done
502             if (utext_getNativeIndex(text) >= rangeEnd) {
503                 goto foundBest;
504             }
505             do {
506                 int32_t wordsMatched = 1;
507                 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
508                     if (wordsMatched < 2) {
509                         // Followed by another dictionary word; mark first word as a good candidate
510                         words[wordsFound%LAO_LOOKAHEAD].markCurrent();
511                         wordsMatched = 2;
512                     }
513 
514                     // If we're already at the end of the range, we're done
515                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
516                         goto foundBest;
517                     }
518 
519                     // See if any of the possible second words is followed by a third word
520                     do {
521                         // If we find a third word, stop right away
522                         if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
523                             words[wordsFound % LAO_LOOKAHEAD].markCurrent();
524                             goto foundBest;
525                         }
526                     }
527                     while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
528                 }
529             }
530             while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
531 foundBest:
532             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
533             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
534             wordsFound += 1;
535         }
536 
537         // We come here after having either found a word or not. We look ahead to the
538         // next word. If it's not a dictionary word, we will combine it withe the word we
539         // just found (if there is one), but only if the preceding word does not exceed
540         // the threshold.
541         // The text iterator should now be positioned at the end of the word we found.
542         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
543             // if it is a dictionary word, do nothing. If it isn't, then if there is
544             // no preceding word, or the non-word shares less than the minimum threshold
545             // of characters with a dictionary word, then scan to resynchronize
546             if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
547                   && (cuWordLength == 0
548                       || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
549                 // Look for a plausible word boundary
550                 int32_t remaining = rangeEnd - (current + cuWordLength);
551                 UChar32 pc;
552                 UChar32 uc;
553                 int32_t chars = 0;
554                 for (;;) {
555                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
556                     pc = utext_next32(text);
557                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
558                     chars += pcSize;
559                     remaining -= pcSize;
560                     if (remaining <= 0) {
561                         break;
562                     }
563                     uc = utext_current32(text);
564                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
565                         // Maybe. See if it's in the dictionary.
566                         // TODO: this looks iffy; compare with old code.
567                         int32_t num_candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
568                         utext_setNativeIndex(text, current + cuWordLength + chars);
569                         if (num_candidates > 0) {
570                             break;
571                         }
572                     }
573                 }
574 
575                 // Bump the word count if there wasn't already one
576                 if (cuWordLength <= 0) {
577                     wordsFound += 1;
578                 }
579 
580                 // Update the length with the passed-over characters
581                 cuWordLength += chars;
582             }
583             else {
584                 // Back up to where we were for next iteration
585                 utext_setNativeIndex(text, current + cuWordLength);
586             }
587         }
588 
589         // Never stop before a combining mark.
590         int32_t currPos;
591         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
592             utext_next32(text);
593             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
594         }
595 
596         // Look ahead for possible suffixes if a dictionary word does not follow.
597         // We do this in code rather than using a rule so that the heuristic
598         // resynch continues to function. For example, one of the suffix characters
599         // could be a typo in the middle of a word.
600         // NOT CURRENTLY APPLICABLE TO LAO
601 
602         // Did we find a word on this iteration? If so, push it on the break stack
603         if (cuWordLength > 0) {
604             foundBreaks.push((current+cuWordLength), status);
605         }
606     }
607 
608     // Don't return a break for the end of the dictionary range if there is one there.
609     if (foundBreaks.peeki() >= rangeEnd) {
610         (void) foundBreaks.popi();
611         wordsFound -= 1;
612     }
613 
614     return wordsFound;
615 }
616 
617 /*
618  ******************************************************************
619  * BurmeseBreakEngine
620  */
621 
622 // How many words in a row are "good enough"?
623 static const int32_t BURMESE_LOOKAHEAD = 3;
624 
625 // Will not combine a non-word with a preceding dictionary word longer than this
626 static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;
627 
628 // Will not combine a non-word that shares at least this much prefix with a
629 // dictionary word, with a preceding word
630 static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;
631 
632 // Minimum word size
633 static const int32_t BURMESE_MIN_WORD = 2;
634 
635 // Minimum number of characters for two words
636 static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
637 
BurmeseBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)638 BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
639     : DictionaryBreakEngine(),
640       fDictionary(adoptDictionary)
641 {
642     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
643     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Mymr");
644     fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
645     if (U_SUCCESS(status)) {
646         setCharacters(fBurmeseWordSet);
647     }
648     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
649     fMarkSet.add(0x0020);
650     fEndWordSet = fBurmeseWordSet;
651     fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels
652 
653     // Compact for caching.
654     fMarkSet.compact();
655     fEndWordSet.compact();
656     fBeginWordSet.compact();
657     UTRACE_EXIT_STATUS(status);
658 }
659 
~BurmeseBreakEngine()660 BurmeseBreakEngine::~BurmeseBreakEngine() {
661     delete fDictionary;
662 }
663 
664 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks) const665 BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
666                                                 int32_t rangeStart,
667                                                 int32_t rangeEnd,
668                                                 UVector32 &foundBreaks ) const {
669     if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
670         return 0;       // Not enough characters for two words
671     }
672 
673     uint32_t wordsFound = 0;
674     int32_t cpWordLength = 0;
675     int32_t cuWordLength = 0;
676     int32_t current;
677     UErrorCode status = U_ZERO_ERROR;
678     PossibleWord words[BURMESE_LOOKAHEAD];
679 
680     utext_setNativeIndex(text, rangeStart);
681 
682     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
683         cuWordLength = 0;
684         cpWordLength = 0;
685 
686         // Look for candidate words at the current position
687         int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
688 
689         // If we found exactly one, use that
690         if (candidates == 1) {
691             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
692             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
693             wordsFound += 1;
694         }
695         // If there was more than one, see which one can take us forward the most words
696         else if (candidates > 1) {
697             // If we're already at the end of the range, we're done
698             if (utext_getNativeIndex(text) >= rangeEnd) {
699                 goto foundBest;
700             }
701             do {
702                 int32_t wordsMatched = 1;
703                 if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
704                     if (wordsMatched < 2) {
705                         // Followed by another dictionary word; mark first word as a good candidate
706                         words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
707                         wordsMatched = 2;
708                     }
709 
710                     // If we're already at the end of the range, we're done
711                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
712                         goto foundBest;
713                     }
714 
715                     // See if any of the possible second words is followed by a third word
716                     do {
717                         // If we find a third word, stop right away
718                         if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
719                             words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
720                             goto foundBest;
721                         }
722                     }
723                     while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));
724                 }
725             }
726             while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
727 foundBest:
728             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
729             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
730             wordsFound += 1;
731         }
732 
733         // We come here after having either found a word or not. We look ahead to the
734         // next word. If it's not a dictionary word, we will combine it withe the word we
735         // just found (if there is one), but only if the preceding word does not exceed
736         // the threshold.
737         // The text iterator should now be positioned at the end of the word we found.
738         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
739             // if it is a dictionary word, do nothing. If it isn't, then if there is
740             // no preceding word, or the non-word shares less than the minimum threshold
741             // of characters with a dictionary word, then scan to resynchronize
742             if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
743                   && (cuWordLength == 0
744                       || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
745                 // Look for a plausible word boundary
746                 int32_t remaining = rangeEnd - (current + cuWordLength);
747                 UChar32 pc;
748                 UChar32 uc;
749                 int32_t chars = 0;
750                 for (;;) {
751                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
752                     pc = utext_next32(text);
753                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
754                     chars += pcSize;
755                     remaining -= pcSize;
756                     if (remaining <= 0) {
757                         break;
758                     }
759                     uc = utext_current32(text);
760                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
761                         // Maybe. See if it's in the dictionary.
762                         // TODO: this looks iffy; compare with old code.
763                         int32_t num_candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
764                         utext_setNativeIndex(text, current + cuWordLength + chars);
765                         if (num_candidates > 0) {
766                             break;
767                         }
768                     }
769                 }
770 
771                 // Bump the word count if there wasn't already one
772                 if (cuWordLength <= 0) {
773                     wordsFound += 1;
774                 }
775 
776                 // Update the length with the passed-over characters
777                 cuWordLength += chars;
778             }
779             else {
780                 // Back up to where we were for next iteration
781                 utext_setNativeIndex(text, current + cuWordLength);
782             }
783         }
784 
785         // Never stop before a combining mark.
786         int32_t currPos;
787         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
788             utext_next32(text);
789             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
790         }
791 
792         // Look ahead for possible suffixes if a dictionary word does not follow.
793         // We do this in code rather than using a rule so that the heuristic
794         // resynch continues to function. For example, one of the suffix characters
795         // could be a typo in the middle of a word.
796         // NOT CURRENTLY APPLICABLE TO BURMESE
797 
798         // Did we find a word on this iteration? If so, push it on the break stack
799         if (cuWordLength > 0) {
800             foundBreaks.push((current+cuWordLength), status);
801         }
802     }
803 
804     // Don't return a break for the end of the dictionary range if there is one there.
805     if (foundBreaks.peeki() >= rangeEnd) {
806         (void) foundBreaks.popi();
807         wordsFound -= 1;
808     }
809 
810     return wordsFound;
811 }
812 
813 /*
814  ******************************************************************
815  * KhmerBreakEngine
816  */
817 
818 // How many words in a row are "good enough"?
819 static const int32_t KHMER_LOOKAHEAD = 3;
820 
821 // Will not combine a non-word with a preceding dictionary word longer than this
822 static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
823 
824 // Will not combine a non-word that shares at least this much prefix with a
825 // dictionary word, with a preceding word
826 static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
827 
828 // Minimum word size
829 static const int32_t KHMER_MIN_WORD = 2;
830 
831 // Minimum number of characters for two words
832 static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
833 
KhmerBreakEngine(DictionaryMatcher * adoptDictionary,UErrorCode & status)834 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
835     : DictionaryBreakEngine(),
836       fDictionary(adoptDictionary)
837 {
838     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
839     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr");
840     fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
841     if (U_SUCCESS(status)) {
842         setCharacters(fKhmerWordSet);
843     }
844     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
845     fMarkSet.add(0x0020);
846     fEndWordSet = fKhmerWordSet;
847     fBeginWordSet.add(0x1780, 0x17B3);
848     //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
849     //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
850     //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
851     fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
852     //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
853 //    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
854 //    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
855 //    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
856 //    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
857 //    fSuffixSet.add(THAI_PAIYANNOI);
858 //    fSuffixSet.add(THAI_MAIYAMOK);
859 
860     // Compact for caching.
861     fMarkSet.compact();
862     fEndWordSet.compact();
863     fBeginWordSet.compact();
864 //    fSuffixSet.compact();
865     UTRACE_EXIT_STATUS(status);
866 }
867 
~KhmerBreakEngine()868 KhmerBreakEngine::~KhmerBreakEngine() {
869     delete fDictionary;
870 }
871 
872 int32_t
divideUpDictionaryRange(UText * text,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks) const873 KhmerBreakEngine::divideUpDictionaryRange( UText *text,
874                                                 int32_t rangeStart,
875                                                 int32_t rangeEnd,
876                                                 UVector32 &foundBreaks ) const {
877     if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
878         return 0;       // Not enough characters for two words
879     }
880 
881     uint32_t wordsFound = 0;
882     int32_t cpWordLength = 0;
883     int32_t cuWordLength = 0;
884     int32_t current;
885     UErrorCode status = U_ZERO_ERROR;
886     PossibleWord words[KHMER_LOOKAHEAD];
887 
888     utext_setNativeIndex(text, rangeStart);
889 
890     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
891         cuWordLength = 0;
892         cpWordLength = 0;
893 
894         // Look for candidate words at the current position
895         int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
896 
897         // If we found exactly one, use that
898         if (candidates == 1) {
899             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
900             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
901             wordsFound += 1;
902         }
903 
904         // If there was more than one, see which one can take us forward the most words
905         else if (candidates > 1) {
906             // If we're already at the end of the range, we're done
907             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
908                 goto foundBest;
909             }
910             do {
911                 int32_t wordsMatched = 1;
912                 if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
913                     if (wordsMatched < 2) {
914                         // Followed by another dictionary word; mark first word as a good candidate
915                         words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
916                         wordsMatched = 2;
917                     }
918 
919                     // If we're already at the end of the range, we're done
920                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
921                         goto foundBest;
922                     }
923 
924                     // See if any of the possible second words is followed by a third word
925                     do {
926                         // If we find a third word, stop right away
927                         if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
928                             words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
929                             goto foundBest;
930                         }
931                     }
932                     while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
933                 }
934             }
935             while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
936 foundBest:
937             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
938             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
939             wordsFound += 1;
940         }
941 
942         // We come here after having either found a word or not. We look ahead to the
943         // next word. If it's not a dictionary word, we will combine it with the word we
944         // just found (if there is one), but only if the preceding word does not exceed
945         // the threshold.
946         // The text iterator should now be positioned at the end of the word we found.
947         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
948             // if it is a dictionary word, do nothing. If it isn't, then if there is
949             // no preceding word, or the non-word shares less than the minimum threshold
950             // of characters with a dictionary word, then scan to resynchronize
951             if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
952                   && (cuWordLength == 0
953                       || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
954                 // Look for a plausible word boundary
955                 int32_t remaining = rangeEnd - (current+cuWordLength);
956                 UChar32 pc;
957                 UChar32 uc;
958                 int32_t chars = 0;
959                 for (;;) {
960                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
961                     pc = utext_next32(text);
962                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
963                     chars += pcSize;
964                     remaining -= pcSize;
965                     if (remaining <= 0) {
966                         break;
967                     }
968                     uc = utext_current32(text);
969                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
970                         // Maybe. See if it's in the dictionary.
971                         int32_t num_candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
972                         utext_setNativeIndex(text, current+cuWordLength+chars);
973                         if (num_candidates > 0) {
974                             break;
975                         }
976                     }
977                 }
978 
979                 // Bump the word count if there wasn't already one
980                 if (cuWordLength <= 0) {
981                     wordsFound += 1;
982                 }
983 
984                 // Update the length with the passed-over characters
985                 cuWordLength += chars;
986             }
987             else {
988                 // Back up to where we were for next iteration
989                 utext_setNativeIndex(text, current+cuWordLength);
990             }
991         }
992 
993         // Never stop before a combining mark.
994         int32_t currPos;
995         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
996             utext_next32(text);
997             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
998         }
999 
1000         // Look ahead for possible suffixes if a dictionary word does not follow.
1001         // We do this in code rather than using a rule so that the heuristic
1002         // resynch continues to function. For example, one of the suffix characters
1003         // could be a typo in the middle of a word.
1004 //        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
1005 //            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
1006 //                && fSuffixSet.contains(uc = utext_current32(text))) {
1007 //                if (uc == KHMER_PAIYANNOI) {
1008 //                    if (!fSuffixSet.contains(utext_previous32(text))) {
1009 //                        // Skip over previous end and PAIYANNOI
1010 //                        utext_next32(text);
1011 //                        utext_next32(text);
1012 //                        wordLength += 1;            // Add PAIYANNOI to word
1013 //                        uc = utext_current32(text);     // Fetch next character
1014 //                    }
1015 //                    else {
1016 //                        // Restore prior position
1017 //                        utext_next32(text);
1018 //                    }
1019 //                }
1020 //                if (uc == KHMER_MAIYAMOK) {
1021 //                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
1022 //                        // Skip over previous end and MAIYAMOK
1023 //                        utext_next32(text);
1024 //                        utext_next32(text);
1025 //                        wordLength += 1;            // Add MAIYAMOK to word
1026 //                    }
1027 //                    else {
1028 //                        // Restore prior position
1029 //                        utext_next32(text);
1030 //                    }
1031 //                }
1032 //            }
1033 //            else {
1034 //                utext_setNativeIndex(text, current+wordLength);
1035 //            }
1036 //        }
1037 
1038         // Did we find a word on this iteration? If so, push it on the break stack
1039         if (cuWordLength > 0) {
1040             foundBreaks.push((current+cuWordLength), status);
1041         }
1042     }
1043 
1044     // Don't return a break for the end of the dictionary range if there is one there.
1045     if (foundBreaks.peeki() >= rangeEnd) {
1046         (void) foundBreaks.popi();
1047         wordsFound -= 1;
1048     }
1049 
1050     return wordsFound;
1051 }
1052 
1053 #if !UCONFIG_NO_NORMALIZATION
1054 /*
1055  ******************************************************************
1056  * CjkBreakEngine
1057  */
1058 static const uint32_t kuint32max = 0xFFFFFFFF;
CjkBreakEngine(DictionaryMatcher * adoptDictionary,LanguageType type,UErrorCode & status)1059 CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
1060 : DictionaryBreakEngine(), fDictionary(adoptDictionary) {
1061     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
1062     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani");
1063     // Korean dictionary only includes Hangul syllables
1064     fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
1065     fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
1066     fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
1067     fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
1068     nfkcNorm2 = Normalizer2::getNFKCInstance(status);
1069 
1070     if (U_SUCCESS(status)) {
1071         // handle Korean and Japanese/Chinese using different dictionaries
1072         if (type == kKorean) {
1073             setCharacters(fHangulWordSet);
1074         } else { //Chinese and Japanese
1075             UnicodeSet cjSet;
1076             cjSet.addAll(fHanWordSet);
1077             cjSet.addAll(fKatakanaWordSet);
1078             cjSet.addAll(fHiraganaWordSet);
1079             cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
1080             cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
1081             setCharacters(cjSet);
1082         }
1083     }
1084     UTRACE_EXIT_STATUS(status);
1085 }
1086 
~CjkBreakEngine()1087 CjkBreakEngine::~CjkBreakEngine(){
1088     delete fDictionary;
1089 }
1090 
1091 // The katakanaCost values below are based on the length frequencies of all
1092 // katakana phrases in the dictionary
1093 static const int32_t kMaxKatakanaLength = 8;
1094 static const int32_t kMaxKatakanaGroupLength = 20;
1095 static const uint32_t maxSnlp = 255;
1096 
getKatakanaCost(int32_t wordLength)1097 static inline uint32_t getKatakanaCost(int32_t wordLength){
1098     //TODO: fill array with actual values from dictionary!
1099     static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
1100                                        = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
1101     return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
1102 }
1103 
isKatakana(UChar32 value)1104 static inline bool isKatakana(UChar32 value) {
1105     return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
1106             (value >= 0xFF66 && value <= 0xFF9f);
1107 }
1108 
1109 
1110 // Function for accessing internal utext flags.
1111 //   Replicates an internal UText function.
1112 
utext_i32_flag(int32_t bitIndex)1113 static inline int32_t utext_i32_flag(int32_t bitIndex) {
1114     return (int32_t)1 << bitIndex;
1115 }
1116 
1117 
1118 /*
1119  * @param text A UText representing the text
1120  * @param rangeStart The start of the range of dictionary characters
1121  * @param rangeEnd The end of the range of dictionary characters
1122  * @param foundBreaks vector<int32> to receive the break positions
1123  * @return The number of breaks found
1124  */
1125 int32_t
divideUpDictionaryRange(UText * inText,int32_t rangeStart,int32_t rangeEnd,UVector32 & foundBreaks) const1126 CjkBreakEngine::divideUpDictionaryRange( UText *inText,
1127         int32_t rangeStart,
1128         int32_t rangeEnd,
1129         UVector32 &foundBreaks ) const {
1130     if (rangeStart >= rangeEnd) {
1131         return 0;
1132     }
1133 
1134     // UnicodeString version of input UText, NFKC normalized if necessary.
1135     UnicodeString inString;
1136 
1137     // inputMap[inStringIndex] = corresponding native index from UText inText.
1138     // If NULL then mapping is 1:1
1139     LocalPointer<UVector32>     inputMap;
1140 
1141     UErrorCode     status      = U_ZERO_ERROR;
1142 
1143 
1144     // if UText has the input string as one contiguous UTF-16 chunk
1145     if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
1146          inText->chunkNativeStart <= rangeStart &&
1147          inText->chunkNativeLimit >= rangeEnd   &&
1148          inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
1149 
1150         // Input UText is in one contiguous UTF-16 chunk.
1151         // Use Read-only aliasing UnicodeString.
1152         inString.setTo(FALSE,
1153                        inText->chunkContents + rangeStart - inText->chunkNativeStart,
1154                        rangeEnd - rangeStart);
1155     } else {
1156         // Copy the text from the original inText (UText) to inString (UnicodeString).
1157         // Create a map from UnicodeString indices -> UText offsets.
1158         utext_setNativeIndex(inText, rangeStart);
1159         int32_t limit = rangeEnd;
1160         U_ASSERT(limit <= utext_nativeLength(inText));
1161         if (limit > utext_nativeLength(inText)) {
1162             limit = (int32_t)utext_nativeLength(inText);
1163         }
1164         inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1165         if (U_FAILURE(status)) {
1166             return 0;
1167         }
1168         while (utext_getNativeIndex(inText) < limit) {
1169             int32_t nativePosition = (int32_t)utext_getNativeIndex(inText);
1170             UChar32 c = utext_next32(inText);
1171             U_ASSERT(c != U_SENTINEL);
1172             inString.append(c);
1173             while (inputMap->size() < inString.length()) {
1174                 inputMap->addElement(nativePosition, status);
1175             }
1176         }
1177         inputMap->addElement(limit, status);
1178     }
1179 
1180 
1181     if (!nfkcNorm2->isNormalized(inString, status)) {
1182         UnicodeString normalizedInput;
1183         //  normalizedMap[normalizedInput position] ==  original UText position.
1184         LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
1185         if (U_FAILURE(status)) {
1186             return 0;
1187         }
1188 
1189         UnicodeString fragment;
1190         UnicodeString normalizedFragment;
1191         for (int32_t srcI = 0; srcI < inString.length();) {  // Once per normalization chunk
1192             fragment.remove();
1193             int32_t fragmentStartI = srcI;
1194             UChar32 c = inString.char32At(srcI);
1195             for (;;) {
1196                 fragment.append(c);
1197                 srcI = inString.moveIndex32(srcI, 1);
1198                 if (srcI == inString.length()) {
1199                     break;
1200                 }
1201                 c = inString.char32At(srcI);
1202                 if (nfkcNorm2->hasBoundaryBefore(c)) {
1203                     break;
1204                 }
1205             }
1206             nfkcNorm2->normalize(fragment, normalizedFragment, status);
1207             normalizedInput.append(normalizedFragment);
1208 
1209             // Map every position in the normalized chunk to the start of the chunk
1210             //   in the original input.
1211             int32_t fragmentOriginalStart = inputMap.isValid() ?
1212                     inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
1213             while (normalizedMap->size() < normalizedInput.length()) {
1214                 normalizedMap->addElement(fragmentOriginalStart, status);
1215                 if (U_FAILURE(status)) {
1216                     break;
1217                 }
1218             }
1219         }
1220         U_ASSERT(normalizedMap->size() == normalizedInput.length());
1221         int32_t nativeEnd = inputMap.isValid() ?
1222                 inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
1223         normalizedMap->addElement(nativeEnd, status);
1224 
1225         inputMap = std::move(normalizedMap);
1226         inString = std::move(normalizedInput);
1227     }
1228 
1229     int32_t numCodePts = inString.countChar32();
1230     if (numCodePts != inString.length()) {
1231         // There are supplementary characters in the input.
1232         // The dictionary will produce boundary positions in terms of code point indexes,
1233         //   not in terms of code unit string indexes.
1234         // Use the inputMap mechanism to take care of this in addition to indexing differences
1235         //    from normalization and/or UTF-8 input.
1236         UBool hadExistingMap = inputMap.isValid();
1237         if (!hadExistingMap) {
1238             inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1239             if (U_FAILURE(status)) {
1240                 return 0;
1241             }
1242         }
1243         int32_t cpIdx = 0;
1244         for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
1245             U_ASSERT(cuIdx >= cpIdx);
1246             if (hadExistingMap) {
1247                 inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
1248             } else {
1249                 inputMap->addElement(cuIdx+rangeStart, status);
1250             }
1251             cpIdx++;
1252             if (cuIdx == inString.length()) {
1253                break;
1254             }
1255         }
1256     }
1257 
1258     // bestSnlp[i] is the snlp of the best segmentation of the first i
1259     // code points in the range to be matched.
1260     UVector32 bestSnlp(numCodePts + 1, status);
1261     bestSnlp.addElement(0, status);
1262     for(int32_t i = 1; i <= numCodePts; i++) {
1263         bestSnlp.addElement(kuint32max, status);
1264     }
1265 
1266 
1267     // prev[i] is the index of the last CJK code point in the previous word in
1268     // the best segmentation of the first i characters.
1269     UVector32 prev(numCodePts + 1, status);
1270     for(int32_t i = 0; i <= numCodePts; i++){
1271         prev.addElement(-1, status);
1272     }
1273 
1274     const int32_t maxWordSize = 20;
1275     UVector32 values(numCodePts, status);
1276     values.setSize(numCodePts);
1277     UVector32 lengths(numCodePts, status);
1278     lengths.setSize(numCodePts);
1279 
1280     UText fu = UTEXT_INITIALIZER;
1281     utext_openUnicodeString(&fu, &inString, &status);
1282 
1283     // Dynamic programming to find the best segmentation.
1284 
1285     // In outer loop, i  is the code point index,
1286     //                ix is the corresponding string (code unit) index.
1287     //    They differ when the string contains supplementary characters.
1288     int32_t ix = 0;
1289     bool is_prev_katakana = false;
1290     for (int32_t i = 0;  i < numCodePts;  ++i, ix = inString.moveIndex32(ix, 1)) {
1291         if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
1292             continue;
1293         }
1294 
1295         int32_t count;
1296         utext_setNativeIndex(&fu, ix);
1297         count = fDictionary->matches(&fu, maxWordSize, numCodePts,
1298                              NULL, lengths.getBuffer(), values.getBuffer(), NULL);
1299                              // Note: lengths is filled with code point lengths
1300                              //       The NULL parameter is the ignored code unit lengths.
1301 
1302         // if there are no single character matches found in the dictionary
1303         // starting with this character, treat character as a 1-character word
1304         // with the highest value possible, i.e. the least likely to occur.
1305         // Exclude Korean characters from this treatment, as they should be left
1306         // together by default.
1307         if ((count == 0 || lengths.elementAti(0) != 1) &&
1308                 !fHangulWordSet.contains(inString.char32At(ix))) {
1309             values.setElementAt(maxSnlp, count);   // 255
1310             lengths.setElementAt(1, count++);
1311         }
1312 
1313         for (int32_t j = 0; j < count; j++) {
1314             uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
1315             int32_t ln_j_i = lengths.elementAti(j) + i;
1316             if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
1317                 bestSnlp.setElementAt(newSnlp, ln_j_i);
1318                 prev.setElementAt(i, ln_j_i);
1319             }
1320         }
1321 
1322         // In Japanese,
1323         // Katakana word in single character is pretty rare. So we apply
1324         // the following heuristic to Katakana: any continuous run of Katakana
1325         // characters is considered a candidate word with a default cost
1326         // specified in the katakanaCost table according to its length.
1327 
1328         bool is_katakana = isKatakana(inString.char32At(ix));
1329         int32_t katakanaRunLength = 1;
1330         if (!is_prev_katakana && is_katakana) {
1331             int32_t j = inString.moveIndex32(ix, 1);
1332             // Find the end of the continuous run of Katakana characters
1333             while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
1334                     isKatakana(inString.char32At(j))) {
1335                 j = inString.moveIndex32(j, 1);
1336                 katakanaRunLength++;
1337             }
1338             if (katakanaRunLength < kMaxKatakanaGroupLength) {
1339                 uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
1340                 if (newSnlp < (uint32_t)bestSnlp.elementAti(i+katakanaRunLength)) {
1341                     bestSnlp.setElementAt(newSnlp, i+katakanaRunLength);
1342                     prev.setElementAt(i, i+katakanaRunLength);  // prev[j] = i;
1343                 }
1344             }
1345         }
1346         is_prev_katakana = is_katakana;
1347     }
1348     utext_close(&fu);
1349 
1350     // Start pushing the optimal offset index into t_boundary (t for tentative).
1351     // prev[numCodePts] is guaranteed to be meaningful.
1352     // We'll first push in the reverse order, i.e.,
1353     // t_boundary[0] = numCodePts, and afterwards do a swap.
1354     UVector32 t_boundary(numCodePts+1, status);
1355 
1356     int32_t numBreaks = 0;
1357     // No segmentation found, set boundary to end of range
1358     if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
1359         t_boundary.addElement(numCodePts, status);
1360         numBreaks++;
1361     } else {
1362         for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
1363             t_boundary.addElement(i, status);
1364             numBreaks++;
1365         }
1366         U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
1367     }
1368 
1369     // Add a break for the start of the dictionary range if there is not one
1370     // there already.
1371     if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
1372         t_boundary.addElement(0, status);
1373         numBreaks++;
1374     }
1375 
1376     // Now that we're done, convert positions in t_boundary[] (indices in
1377     // the normalized input string) back to indices in the original input UText
1378     // while reversing t_boundary and pushing values to foundBreaks.
1379     int32_t prevCPPos = -1;
1380     int32_t prevUTextPos = -1;
1381     for (int32_t i = numBreaks-1; i >= 0; i--) {
1382         int32_t cpPos = t_boundary.elementAti(i);
1383         U_ASSERT(cpPos > prevCPPos);
1384         int32_t utextPos =  inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
1385         U_ASSERT(utextPos >= prevUTextPos);
1386         if (utextPos > prevUTextPos) {
1387             // Boundaries are added to foundBreaks output in ascending order.
1388             U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
1389             foundBreaks.push(utextPos, status);
1390         } else {
1391             // Normalization expanded the input text, the dictionary found a boundary
1392             // within the expansion, giving two boundaries with the same index in the
1393             // original text. Ignore the second. See ticket #12918.
1394             --numBreaks;
1395         }
1396         prevCPPos = cpPos;
1397         prevUTextPos = utextPos;
1398     }
1399     (void)prevCPPos; // suppress compiler warnings about unused variable
1400 
1401     // inString goes out of scope
1402     // inputMap goes out of scope
1403     return numBreaks;
1404 }
1405 #endif
1406 
1407 U_NAMESPACE_END
1408 
1409 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1410 
1411