• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4  * COPYRIGHT:
5  * Copyright (c) 1999-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  ********************************************************************/
8 /************************************************************************
9 *   Date        Name        Description
10 *   12/15/99    Madhu        Creation.
11 *   01/12/2000  Madhu        Updated for changed API and added new tests
12 ************************************************************************/
13 
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_BREAK_ITERATION
16 
17 #include <algorithm>
18 #include <sstream>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <utility>
23 #include <vector>
24 
25 #include "unicode/brkiter.h"
26 #include "unicode/localpointer.h"
27 #include "unicode/numfmt.h"
28 #include "unicode/rbbi.h"
29 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
30 #include "unicode/regex.h"
31 #endif
32 #include "unicode/schriter.h"
33 #include "unicode/uchar.h"
34 #include "unicode/utf16.h"
35 #include "unicode/ucnv.h"
36 #include "unicode/uniset.h"
37 #include "unicode/uscript.h"
38 #include "unicode/ustring.h"
39 #include "unicode/utext.h"
40 #include "unicode/utrace.h"
41 
42 #include "charstr.h"
43 #include "cmemory.h"
44 #include "cstr.h"
45 #include "intltest.h"
46 #include "lstmbe.h"
47 #include "rbbitst.h"
48 #include "rbbidata.h"
49 #include "utypeinfo.h"  // for 'typeid' to work
50 #include "uvector.h"
51 #include "uvectr32.h"
52 
53 
54 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
55 #include "unicode/filteredbrk.h"
56 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
57 
// TEST_ASSERT(x): if the expression x evaluates to false, report a failure through
// errln() together with the file name and line number of the use site.
// The macro only reports; it contains no return, so the enclosing test keeps running.
// errln() is called unqualified, so the macro must be expanded where errln() is in scope.
#define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
} UPRV_BLOCK_MACRO_END
63 
// TEST_ASSERT_SUCCESS(errcode): if errcode is an ICU failure code (U_FAILURE), report it
// through errcheckln() with the file name, line number and the error's symbolic name.
// The macro only reports; callers that must stop on failure test U_FAILURE() themselves.
// errcheckln() is called unqualified, so the macro must be expanded where it is in scope.
#define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
    } \
} UPRV_BLOCK_MACRO_END
69 
// MONKEY_ERROR(msg, fRuleFileName, index, seed): report a failure through
// IntlTest::gTest->errln(). The message contains the use site's file and line, msg,
// the failing index, and a "type=... seed=... loop=1" parameter string built from
// fRuleFileName and seed so that the failing run can be reproduced.
// NOTE(review): unlike the two macros above, this one expands to a bare { } block
// rather than UPRV_BLOCK_MACRO_BEGIN/END.
#define MONKEY_ERROR(msg, fRuleFileName, index, seed) { \
    IntlTest::gTest->errln("\n%s:%d %s at index %d. Parameters to reproduce: @\"type=%s seed=%u loop=1\"", \
                    __FILE__, __LINE__, msg, index, fRuleFileName, seed); \
}
74 
75 //---------------------------------------------
76 // runIndexedTest
77 //---------------------------------------------
78 
79 
//  Note:  Before adding new tests to this file, check whether the desired test data can
//         simply be added to the file testdata/rbbitest.txt.  In most cases it can.
//         That is much less work than writing a new test, the diagnostic output in the
//         event of failures is good, and the test data file is shared with ICU4J, so
//         eventually the test will run there as well, without additional effort.
85 
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)86 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
87 {
88     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
89     fTestParams = params;
90 
91     TESTCASE_AUTO_BEGIN;
92 #if !UCONFIG_NO_FILE_IO
93     TESTCASE_AUTO(TestBug4153072);
94 #endif
95 #if !UCONFIG_NO_FILE_IO
96     TESTCASE_AUTO(TestUnicodeFiles);
97 #endif
98     TESTCASE_AUTO(TestGetAvailableLocales);
99     TESTCASE_AUTO(TestGetDisplayName);
100 #if !UCONFIG_NO_FILE_IO
101     TESTCASE_AUTO(TestEndBehaviour);
102     TESTCASE_AUTO(TestWordBreaks);
103     TESTCASE_AUTO(TestWordBoundary);
104     TESTCASE_AUTO(TestLineBreaks);
105     TESTCASE_AUTO(TestSentBreaks);
106     TESTCASE_AUTO(TestExtended);
107 #endif
108 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
109     TESTCASE_AUTO(TestMonkey);
110 #endif
111 #if !UCONFIG_NO_FILE_IO
112     TESTCASE_AUTO(TestBug3818);
113 #endif
114     TESTCASE_AUTO(TestDebug);
115 #if !UCONFIG_NO_FILE_IO
116     TESTCASE_AUTO(TestBug5775);
117 #endif
118     TESTCASE_AUTO(TestBug9983);
119     TESTCASE_AUTO(TestDictRules);
120     TESTCASE_AUTO(TestBug5532);
121     TESTCASE_AUTO(TestBug7547);
122     TESTCASE_AUTO(TestBug12797);
123     TESTCASE_AUTO(TestBug12918);
124     TESTCASE_AUTO(TestBug12932);
125     TESTCASE_AUTO(TestEmoji);
126     TESTCASE_AUTO(TestBug12519);
127     TESTCASE_AUTO(TestBug12677);
128     TESTCASE_AUTO(TestTableRedundancies);
129     TESTCASE_AUTO(TestBug13447);
130     TESTCASE_AUTO(TestReverse);
131     TESTCASE_AUTO(TestBug13692);
132     TESTCASE_AUTO(TestDebugRules);
133     TESTCASE_AUTO(Test8BitsTrieWith8BitStateTable);
134     TESTCASE_AUTO(Test8BitsTrieWith16BitStateTable);
135     TESTCASE_AUTO(Test16BitsTrieWith8BitStateTable);
136     TESTCASE_AUTO(Test16BitsTrieWith16BitStateTable);
137     TESTCASE_AUTO(TestTable_8_16_Bits);
138     TESTCASE_AUTO(TestBug13590);
139     TESTCASE_AUTO(TestUnpairedSurrogate);
140     TESTCASE_AUTO(TestLSTMThai);
141     TESTCASE_AUTO(TestLSTMBurmese);
142     TESTCASE_AUTO(TestRandomAccess);
143 
144 #if U_ENABLE_TRACING
145     TESTCASE_AUTO(TestTraceCreateCharacter);
146     TESTCASE_AUTO(TestTraceCreateWord);
147     TESTCASE_AUTO(TestTraceCreateSentence);
148     TESTCASE_AUTO(TestTraceCreateTitle);
149     TESTCASE_AUTO(TestTraceCreateLine);
150     TESTCASE_AUTO(TestTraceCreateLineNormal);
151     TESTCASE_AUTO(TestTraceCreateLineLoose);
152     TESTCASE_AUTO(TestTraceCreateLineStrict);
153     TESTCASE_AUTO(TestTraceCreateLineNormalPhrase);
154     TESTCASE_AUTO(TestTraceCreateLineLoosePhrase);
155     TESTCASE_AUTO(TestTraceCreateLineStrictPhrase);
156     TESTCASE_AUTO(TestTraceCreateLinePhrase);
157     TESTCASE_AUTO(TestTraceCreateBreakEngine);
158 #endif
159 
160     TESTCASE_AUTO_END;
161 }
162 
163 
164 //--------------------------------------------------------------------------------------
165 //
166 //    RBBITest    constructor and destructor
167 //
168 //--------------------------------------------------------------------------------------
169 
RBBITest()170 RBBITest::RBBITest() {
171     fTestParams = NULL;
172 }
173 
174 
~RBBITest()175 RBBITest::~RBBITest() {
176 }
177 
178 
printStringBreaks(UText * tstr,int expected[],int expectedCount)179 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
180     UErrorCode status = U_ZERO_ERROR;
181     char name[100];
182     printf("code    alpha extend alphanum type word sent line name\n");
183     int nextExpectedIndex = 0;
184     utext_setNativeIndex(tstr, 0);
185     for (int j = 0; j < static_cast<int>(utext_nativeLength(tstr)); j=static_cast<int>(utext_getNativeIndex(tstr))) {
186         if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
187             printf("------------------------------------------------ %d\n", j);
188             ++nextExpectedIndex;
189         }
190 
191         UChar32 c = utext_next32(tstr);
192         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
193         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
194                            u_isUAlphabetic(c),
195                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
196                            u_isalnum(c),
197                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
198                                                   u_charType(c),
199                                                   U_SHORT_PROPERTY_NAME),
200                            u_getPropertyValueName(UCHAR_WORD_BREAK,
201                                                   u_getIntPropertyValue(c,
202                                                           UCHAR_WORD_BREAK),
203                                                   U_SHORT_PROPERTY_NAME),
204                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
205                                    u_getIntPropertyValue(c,
206                                            UCHAR_SENTENCE_BREAK),
207                                    U_SHORT_PROPERTY_NAME),
208                            u_getPropertyValueName(UCHAR_LINE_BREAK,
209                                    u_getIntPropertyValue(c,
210                                            UCHAR_LINE_BREAK),
211                                    U_SHORT_PROPERTY_NAME),
212                            name);
213     }
214 }
215 
216 
printStringBreaks(const UnicodeString & ustr,int expected[],int expectedCount)217 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
218    UErrorCode status = U_ZERO_ERROR;
219    UText *tstr = NULL;
220    tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
221    if (U_FAILURE(status)) {
222        printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
223        return;
224     }
225    printStringBreaks(tstr, expected, expectedCount);
226    utext_close(tstr);
227 }
228 
229 
TestBug3818()230 void RBBITest::TestBug3818() {
231     UErrorCode  status = U_ZERO_ERROR;
232 
233     // Four Thai words...
234     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
235                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
236     UnicodeString  thaiStr(thaiWordData);
237 
238     BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
239     if (U_FAILURE(status) || bi == NULL) {
240         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
241         return;
242     }
243     bi->setText(thaiStr);
244 
245     int32_t  startOfSecondWord = bi->following(1);
246     if (startOfSecondWord != 4) {
247         errln("Fail at file %s, line %d expected start of word at 4, got %d",
248             __FILE__, __LINE__, startOfSecondWord);
249     }
250     startOfSecondWord = bi->following(0);
251     if (startOfSecondWord != 4) {
252         errln("Fail at file %s, line %d expected start of word at 4, got %d",
253             __FILE__, __LINE__, startOfSecondWord);
254     }
255     delete bi;
256 }
257 
258 
259 //---------------------------------------------
260 //
261 //     other tests
262 //
263 //---------------------------------------------
264 
TestGetAvailableLocales()265 void RBBITest::TestGetAvailableLocales()
266 {
267     int32_t locCount = 0;
268     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
269 
270     if (locCount == 0)
271         dataerrln("getAvailableLocales() returned an empty list!");
272     // Just make sure that it's returning good memory.
273     int32_t i;
274     for (i = 0; i < locCount; ++i) {
275         logln(locList[i].getName());
276     }
277 }
278 
279 //Testing the BreakIterator::getDisplayName() function
TestGetDisplayName()280 void RBBITest::TestGetDisplayName()
281 {
282     UnicodeString   result;
283 
284     BreakIterator::getDisplayName(Locale::getUS(), result);
285     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
286         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
287                 + result);
288 
289     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
290     if (result != "French (France)")
291         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
292                 + result);
293 }
294 /**
295  * Test End Behaviour
296  * @bug 4068137
297  */
TestEndBehaviour()298 void RBBITest::TestEndBehaviour()
299 {
300     UErrorCode status = U_ZERO_ERROR;
301     UnicodeString testString("boo.");
302     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
303     if (U_FAILURE(status))
304     {
305         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
306         return;
307     }
308     wb->setText(testString);
309 
310     if (wb->first() != 0)
311         errln("Didn't get break at beginning of string.");
312     if (wb->next() != 3)
313         errln("Didn't get break before period in \"boo.\"");
314     if (wb->current() != 4 && wb->next() != 4)
315         errln("Didn't get break at end of string.");
316     delete wb;
317 }
318 /*
319  * @bug 4153072
320  */
TestBug4153072()321 void RBBITest::TestBug4153072() {
322     UErrorCode status = U_ZERO_ERROR;
323     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
324     if (U_FAILURE(status))
325     {
326         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
327         return;
328     }
329     UnicodeString str("...Hello, World!...");
330     int32_t begin = 3;
331     int32_t end = str.length() - 3;
332     UBool onBoundary;
333 
334     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
335     iter->adoptText(textIterator);
336     int index;
337     // Note: with the switch to UText, there is no way to restrict the
338     //       iteration range to begin at an index other than zero.
339     //       String character iterators created with a non-zero bound are
340     //         treated by RBBI as being empty.
341     for (index = -1; index < begin + 1; ++index) {
342         onBoundary = iter->isBoundary(index);
343         if (index == 0?  !onBoundary : onBoundary) {
344             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
345                             " and begin index = " + begin);
346         }
347     }
348     delete iter;
349 }
350 
351 
352 //
353 // Test for problem reported by Ashok Matoria on 9 July 2007
354 //    One.<kSoftHyphen><kSpace>Two.
355 //
356 //    Sentence break at start (0) and then on calling next() it breaks at
357 //   'T' of "Two". Now, at this point if I do next() and
358 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
359 //
TestBug5775()360 void RBBITest::TestBug5775() {
361     UErrorCode status = U_ZERO_ERROR;
362     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
363     TEST_ASSERT_SUCCESS(status);
364     if (U_FAILURE(status)) {
365         return;
366     }
367 // Check for status first for better handling of no data errors.
368     TEST_ASSERT(bi != NULL);
369     if (bi == NULL) {
370         return;
371     }
372 
373     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
374     //               01234      56789
375     s = s.unescape();
376     bi->setText(s);
377     int pos = bi->next();
378     TEST_ASSERT(pos == 6);
379     pos = bi->next();
380     TEST_ASSERT(pos == 10);
381     pos = bi->previous();
382     TEST_ASSERT(pos == 6);
383     delete bi;
384 }
385 
386 
387 
388 //------------------------------------------------------------------------------
389 //
390 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
391 //
392 //------------------------------------------------------------------------------
393 
394 struct TestParams {
395     BreakIterator   *bi;                   // Break iterator is set while parsing test source.
396                                            //   Changed out whenever test data changes break type.
397 
398     UnicodeString    dataToBreak;          // Data that is built up while parsing the test.
399     UVector32       *expectedBreaks;       // Expected break positions, matches dataToBreak UnicodeString.
400     UVector32       *srcLine;              // Positions in source file, indexed same as dataToBreak.
401     UVector32       *srcCol;
402 
403     UText           *textToBreak;          // UText, could be UTF8 or UTF16.
404     UVector32       *textMap;              // Map from UTF-16 dataToBreak offsets to UText offsets.
405     CharString       utf8String;           // UTF-8 form of text to break.
406 
TestParamsTestParams407     TestParams(UErrorCode &status) : dataToBreak() {
408         bi               = NULL;
409         expectedBreaks   = new UVector32(status);
410         srcLine          = new UVector32(status);
411         srcCol           = new UVector32(status);
412         textToBreak      = NULL;
413         textMap          = new UVector32(status);
414     }
415 
~TestParamsTestParams416     ~TestParams() {
417         delete bi;
418         delete expectedBreaks;
419         delete srcLine;
420         delete srcCol;
421         utext_close(textToBreak);
422         delete textMap;
423     }
424 
425     int32_t getSrcLine(int32_t bp);
426     int32_t getExpectedBreak(int32_t bp);
427     int32_t getSrcCol(int32_t bp);
428 
429     void setUTF16(UErrorCode &status);
430     void setUTF8(UErrorCode &status);
431 };
432 
433 // Append a UnicodeString to a CharString with UTF-8 encoding.
434 // Substitute any invalid chars.
435 //   Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
CharStringAppend(CharString & dest,const UnicodeString & src,UErrorCode & status)436 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
437     if (U_FAILURE(status)) {
438         return;
439     }
440     int32_t utf8Length;
441     u_strToUTF8WithSub(NULL, 0, &utf8Length,            // Output Buffer, NULL for preflight.
442                        src.getBuffer(), src.length(),   // UTF-16 data
443                        0xfffd, NULL,                    // Substitution char, number of subs.
444                        &status);
445     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
446         return;
447     }
448     status = U_ZERO_ERROR;
449     int32_t capacity;
450     char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
451     u_strToUTF8WithSub(buffer, utf8Length, NULL,
452                        src.getBuffer(), src.length(),
453                        0xfffd, NULL, &status);
454     dest.append(buffer, utf8Length, status);
455 }
456 
457 
setUTF16(UErrorCode & status)458 void TestParams::setUTF16(UErrorCode &status) {
459     textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
460     textMap->removeAllElements();
461     for (int32_t i=0; i<dataToBreak.length(); i++) {
462         if (i == dataToBreak.getChar32Start(i)) {
463             textMap->addElement(i, status);
464         } else {
465             textMap->addElement(-1, status);
466         }
467     }
468     textMap->addElement(dataToBreak.length(), status);
469     U_ASSERT(dataToBreak.length() + 1 == textMap->size());
470 }
471 
472 
setUTF8(UErrorCode & status)473 void TestParams::setUTF8(UErrorCode &status) {
474     if (U_FAILURE(status)) {
475         return;
476     }
477     utf8String.clear();
478     CharStringAppend(utf8String, dataToBreak, status);
479     textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
480     if (U_FAILURE(status)) {
481         return;
482     }
483 
484     textMap->removeAllElements();
485     int32_t utf16Index = 0;
486     for (;;) {
487         textMap->addElement(utf16Index, status);
488         UChar32 c32 = utext_current32(textToBreak);
489         if (c32 < 0) {
490             break;
491         }
492         utf16Index += U16_LENGTH(c32);
493         utext_next32(textToBreak);
494         while (textMap->size() < utext_getNativeIndex(textToBreak)) {
495             textMap->addElement(-1, status);
496         }
497     }
498     U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
499 }
500 
501 
getSrcLine(int32_t bp)502 int32_t TestParams::getSrcLine(int32_t bp) {
503     if (bp >= textMap->size()) {
504         bp = textMap->size() - 1;
505     }
506     int32_t i = 0;
507     for(; bp >= 0 ; --bp) {
508         // Move to a character boundary if we are not on one already.
509         i = textMap->elementAti(bp);
510         if (i >= 0) {
511             break;
512         }
513     }
514     return srcLine->elementAti(i);
515 }
516 
517 
getExpectedBreak(int32_t bp)518 int32_t TestParams::getExpectedBreak(int32_t bp) {
519     if (bp >= textMap->size()) {
520         return 0;
521     }
522     int32_t i = textMap->elementAti(bp);
523     int32_t retVal = 0;
524     if (i >= 0) {
525         retVal = expectedBreaks->elementAti(i);
526     }
527     return retVal;
528 }
529 
530 
getSrcCol(int32_t bp)531 int32_t TestParams::getSrcCol(int32_t bp) {
532     if (bp >= textMap->size()) {
533         bp = textMap->size() - 1;
534     }
535     int32_t i = 0;
536     for(; bp >= 0; --bp) {
537         // Move bp to a character boundary if we are not on one already.
538         i = textMap->elementAti(bp);
539         if (i >= 0) {
540             break;
541         }
542     }
543     return srcCol->elementAti(i);
544 }
545 
546 
executeTest(TestParams * t,UErrorCode & status)547 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
548     int32_t    bp;
549     int32_t    prevBP;
550     int32_t    i;
551 
552     TEST_ASSERT_SUCCESS(status);
553     if (U_FAILURE(status)) {
554         return;
555     }
556 
557     if (t->bi == NULL) {
558         return;
559     }
560 
561     t->bi->setText(t->textToBreak, status);
562     //
563     //  Run the iterator forward
564     //
565     prevBP = -1;
566     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
567         if (prevBP ==  bp) {
568             // Fail for lack of forward progress.
569             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
570                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
571             break;
572         }
573 
574         // Check that there we didn't miss an expected break between the last one
575         //  and this one.
576         for (i=prevBP+1; i<bp; i++) {
577             if (t->getExpectedBreak(i) != 0) {
578                 int expected[] = {0, i};
579                 printStringBreaks(t->dataToBreak, expected, 2);
580                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
581                       i, t->getSrcLine(i), t->getSrcCol(i));
582             }
583         }
584 
585         // Check that the break we did find was expected
586         if (t->getExpectedBreak(bp) == 0) {
587             int expected[] = {0, bp};
588             printStringBreaks(t->textToBreak, expected, 2);
589             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
590                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
591         } else {
592             // The break was expected.
593             //   Check that the {nnn} tag value is correct.
594             int32_t expectedTagVal = t->getExpectedBreak(bp);
595             if (expectedTagVal == -1) {
596                 expectedTagVal = 0;
597             }
598             int32_t line = t->getSrcLine(bp);
599             int32_t rs = t->bi->getRuleStatus();
600             if (rs != expectedTagVal) {
601                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
602                       "          Actual, Expected status = %4d, %4d",
603                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
604             }
605         }
606 
607         prevBP = bp;
608     }
609 
610     // Verify that there were no missed expected breaks after the last one found
611     for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
612         if (t->getExpectedBreak(i) != 0) {
613             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
614                       i, t->getSrcLine(i), t->getSrcCol(i));
615         }
616     }
617 
618     //
619     //  Run the iterator backwards, verify that the same breaks are found.
620     //
621     prevBP = static_cast<int32_t>(utext_nativeLength(t->textToBreak) + 2); // start with a phony value for the last break pos seen.
622     bp = t->bi->last();
623     while (bp != BreakIterator::DONE) {
624         if (prevBP ==  bp) {
625             // Fail for lack of progress.
626             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
627                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
628             break;
629         }
630 
631         // Check that we didn't miss an expected break between the last one
632         //  and this one.  (UVector returns zeros for index out of bounds.)
633         for (i=prevBP-1; i>bp; i--) {
634             if (t->getExpectedBreak(i) != 0) {
635                 errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
636                       i, t->getSrcLine(i), t->getSrcCol(i));
637             }
638         }
639 
640         // Check that the break we did find was expected
641         if (t->getExpectedBreak(bp) == 0) {
642             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
643                    bp, t->getSrcLine(bp), t->getSrcCol(bp));
644         } else {
645             // The break was expected.
646             //   Check that the {nnn} tag value is correct.
647             int32_t expectedTagVal = t->getExpectedBreak(bp);
648             if (expectedTagVal == -1) {
649                 expectedTagVal = 0;
650             }
651             int line = t->getSrcLine(bp);
652             int32_t rs = t->bi->getRuleStatus();
653             if (rs != expectedTagVal) {
654                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
655                       "          Actual, Expected status = %4d, %4d",
656                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
657             }
658         }
659 
660         prevBP = bp;
661         bp = t->bi->previous();
662     }
663 
664     // Verify that there were no missed breaks prior to the last one found
665     for (i=prevBP-1; i>=0; i--) {
666         if (t->getExpectedBreak(i) != 0) {
667             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
668                       i, t->getSrcLine(i), t->getSrcCol(i));
669         }
670     }
671 
672     // Check isBoundary()
673     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
674         UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
675         UBool boundaryFound    = t->bi->isBoundary(i);
676         if (boundaryExpected != boundaryFound) {
677             errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
678                   "        Expected, Actual= %s, %s",
679                   i, t->getSrcLine(i), t->getSrcCol(i),
680                   boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
681         }
682     }
683 
684     // Check following()
685     for (i=0; i < static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i++) {
686         int32_t actualBreak = t->bi->following(i);
687         int32_t expectedBreak = BreakIterator::DONE;
688         for (int32_t j=i+1; j <= static_cast<int32_t>(utext_nativeLength(t->textToBreak)); j++) {
689             if (t->getExpectedBreak(j) != 0) {
690                 expectedBreak = j;
691                 break;
692             }
693         }
694         if (expectedBreak != actualBreak) {
695             errln("following(%d) incorrect. File line,col= %4d,%4d\n"
696                   "        Expected, Actual= %d, %d",
697                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
698         }
699     }
700 
701     // Check preceding()
702     for (i=static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i>=0; i--) {
703         int32_t actualBreak = t->bi->preceding(i);
704         int32_t expectedBreak = BreakIterator::DONE;
705 
706         // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
707         // preceding(trailing byte) will return the index of some preceding code point,
708         // not the lead byte of the current code point, even though that has a smaller index.
709         // Therefore, start looking at the expected break data not at i-1, but at
710         // the start of code point index - 1.
711         utext_setNativeIndex(t->textToBreak, i);
712         int32_t j = static_cast<int32_t>(utext_getNativeIndex(t->textToBreak) - 1);
713         for (; j >= 0; j--) {
714             if (t->getExpectedBreak(j) != 0) {
715                 expectedBreak = j;
716                 break;
717             }
718         }
719         if (expectedBreak != actualBreak) {
720             errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
721                   "        Expected, Actual= %d, %d",
722                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
723         }
724     }
725 }
726 
TestExtended()727 void RBBITest::TestExtended() {
728      // The expectations in this test heavily depends on the Thai dictionary.
729      // Therefore, we skip this test under the LSTM configuration.
730      if (skipDictionaryTest()) {
731          return;
732      }
733   // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
734   // data driven test closely entangles filtered and regular data.
735 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
736     UErrorCode      status  = U_ZERO_ERROR;
737     Locale          locale("");
738 
739     TestParams          tp(status);
740 
741     RegexMatcher      localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
742     if (U_FAILURE(status)) {
743         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
744     }
745 
746     //
747     //  Open and read the test data file.
748     //
749     const char *testDataDirectory = IntlTest::getSourceTestData(status);
750     CharString testFileName(testDataDirectory, -1, status);
751     testFileName.append("rbbitst.txt", -1, status);
752 
753     int    len;
754     UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
755     if (U_FAILURE(status)) {
756         errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
757         return;
758     }
759 
760     bool skipTest = false; // Skip this test?
761 
762     //
763     //  Put the test data into a UnicodeString
764     //
765     UnicodeString testString(false, testFile, len);
766 
767     enum EParseState{
768         PARSE_COMMENT,
769         PARSE_TAG,
770         PARSE_DATA,
771         PARSE_NUM,
772         PARSE_RULES
773     }
774     parseState = PARSE_TAG;
775 
776     EParseState savedState = PARSE_TAG;
777 
778     int32_t    lineNum  = 1;
779     int32_t    colStart = 0;
780     int32_t    column   = 0;
781     int32_t    charIdx  = 0;
782 
783     int32_t    tagValue = 0;             // The numeric value of a <nnn> tag.
784 
785     UnicodeString       rules;           // Holds rules from a <rules> ... </rules> block
786     int32_t             rulesFirstLine = 0;  // Line number of the start of current <rules> block
787 
788     for (charIdx = 0; charIdx < len; ) {
789         status = U_ZERO_ERROR;
790         UChar  c = testString.charAt(charIdx);
791         charIdx++;
792         if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
793             // treat CRLF as a unit
794             c = u'\n';
795             charIdx++;
796         }
797         if (c == u'\n' || c == u'\r') {
798             lineNum++;
799             colStart = charIdx;
800         }
801         column = charIdx - colStart + 1;
802 
803         switch (parseState) {
804         case PARSE_COMMENT:
805             if (c == u'\n' || c == u'\r') {
806                 parseState = savedState;
807             }
808             break;
809 
810         case PARSE_TAG:
811             {
812             if (c == u'#') {
813                 parseState = PARSE_COMMENT;
814                 savedState = PARSE_TAG;
815                 break;
816             }
817             if (u_isUWhiteSpace(c)) {
818                 break;
819             }
820             if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
821                 delete tp.bi;
822                 tp.bi = BreakIterator::createWordInstance(locale,  status);
823                 skipTest = false;
824                 charIdx += 5;
825                 break;
826             }
827             if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
828                 delete tp.bi;
829                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
830                 skipTest = false;
831                 charIdx += 5;
832                 break;
833             }
834             if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
835                 delete tp.bi;
836                 tp.bi = BreakIterator::createLineInstance(locale,  status);
837                 skipTest = false;
838                 charIdx += 5;
839                 break;
840             }
841             if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
842                 delete tp.bi;
843                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
844                 skipTest = false;
845                 charIdx += 5;
846                 break;
847             }
848             if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
849                 delete tp.bi;
850                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
851                 charIdx += 6;
852                 break;
853             }
854 
855             if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
856                 testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
857                 charIdx = testString.indexOf(u'>', charIdx) + 1;
858                 parseState = PARSE_RULES;
859                 rules.remove();
860                 rulesFirstLine = lineNum;
861                 break;
862             }
863 
864             // <locale  loc_name>
865             localeMatcher.reset(testString);
866             if (localeMatcher.lookingAt(charIdx-1, status)) {
867                 UnicodeString localeName = localeMatcher.group(1, status);
868                 char localeName8[100];
869                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
870                 locale = Locale::createFromName(localeName8);
871                 charIdx += localeMatcher.group(0, status).length() - 1;
872                 TEST_ASSERT_SUCCESS(status);
873                 break;
874             }
875             if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
876                 parseState = PARSE_DATA;
877                 charIdx += 5;
878                 tp.dataToBreak = "";
879                 tp.expectedBreaks->removeAllElements();
880                 tp.srcCol ->removeAllElements();
881                 tp.srcLine->removeAllElements();
882                 break;
883             }
884 
885             errln("line %d: Tag expected in test file.", lineNum);
886             parseState = PARSE_COMMENT;
887             savedState = PARSE_DATA;
888             goto end_test; // Stop the test.
889             }
890             break;
891 
892         case PARSE_RULES:
893             if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
894                 charIdx += 7;
895                 parseState = PARSE_TAG;
896                 delete tp.bi;
897                 UParseError pe;
898                 tp.bi = new RuleBasedBreakIterator(rules, pe, status);
899                 skipTest = U_FAILURE(status);
900                 if (U_FAILURE(status)) {
901                     errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
902                         rulesFirstLine + pe.line - 1, u_errorName(status));
903                 }
904             } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
905                 charIdx += 10;
906                 parseState = PARSE_TAG;
907                 UErrorCode ec = U_ZERO_ERROR;
908                 UParseError pe;
909                 RuleBasedBreakIterator bi(rules, pe, ec);
910                 if (U_SUCCESS(ec)) {
911                     errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
912                         rulesFirstLine + pe.line - 1);
913                 }
914             } else {
915                 rules.append(c);
916             }
917             break;
918 
919         case PARSE_DATA:
920             if (c == u'•') {
921                 int32_t  breakIdx = tp.dataToBreak.length();
922                 if (tp.expectedBreaks->size() > breakIdx) {
923                     errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
924                           lineNum, column);
925                 }
926                 tp.expectedBreaks->setSize(breakIdx+1);
927                 tp.expectedBreaks->setElementAt(-1, breakIdx);
928                 tp.srcLine->setSize(breakIdx+1);
929                 tp.srcLine->setElementAt(lineNum, breakIdx);
930                 tp.srcCol ->setSize(breakIdx+1);
931                 tp.srcCol ->setElementAt(column, breakIdx);
932                 break;
933             }
934 
935             if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
936                 // Add final entry to mappings from break location to source file position.
937                 //  Need one extra because last break position returned is after the
938                 //    last char in the data, not at the last char.
939                 tp.srcLine->addElement(lineNum, status);
940                 tp.srcCol ->addElement(column, status);
941 
942                 parseState = PARSE_TAG;
943                 charIdx += 6;
944 
945                 if (!skipTest) {
946                     // RUN THE TEST!
947                     status = U_ZERO_ERROR;
948                     tp.setUTF16(status);
949                     executeTest(&tp, status);
950                     TEST_ASSERT_SUCCESS(status);
951 
952                     // Run again, this time with UTF-8 text wrapped in a UText.
953                     status = U_ZERO_ERROR;
954                     tp.setUTF8(status);
955                     TEST_ASSERT_SUCCESS(status);
956                     executeTest(&tp, status);
957                 }
958                 break;
959             }
960 
961             if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
962                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
963                 // Get the code point from the name and insert it into the test data.
964                 //   (Damn, no API takes names in Unicode  !!!
965                 //    we've got to take it back to char *)
966                 int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
967                 int32_t nameLength = nameEndIdx - (charIdx+2);
968                 char charNameBuf[200];
969                 UChar32 theChar = -1;
970                 if (nameEndIdx != -1) {
971                     UErrorCode status = U_ZERO_ERROR;
972                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
973                     charNameBuf[sizeof(charNameBuf)-1] = 0;
974                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
975                     if (U_FAILURE(status)) {
976                         theChar = -1;
977                     }
978                 }
979                 if (theChar == -1) {
980                     errln("Error in named character in test file at line %d, col %d",
981                         lineNum, column);
982                 } else {
983                     // Named code point was recognized.  Insert it
984                     //   into the test data.
985                     tp.dataToBreak.append(theChar);
986                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
987                         tp.srcLine->addElement(lineNum, status);
988                         tp.srcCol ->addElement(column, status);
989                     }
990                 }
991                 if (nameEndIdx > charIdx) {
992                     charIdx = nameEndIdx+1;
993 
994                 }
995                 break;
996             }
997 
998 
999 
1000             if (testString.compare(charIdx-1, 2, u"<>") == 0) {
1001                 charIdx++;
1002                 int32_t  breakIdx = tp.dataToBreak.length();
1003                 tp.expectedBreaks->setSize(breakIdx+1);
1004                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1005                 tp.srcLine->setSize(breakIdx+1);
1006                 tp.srcLine->setElementAt(lineNum, breakIdx);
1007                 tp.srcCol ->setSize(breakIdx+1);
1008                 tp.srcCol ->setElementAt(column, breakIdx);
1009                 break;
1010             }
1011 
1012             if (c == u'<') {
1013                 tagValue   = 0;
1014                 parseState = PARSE_NUM;
1015                 break;
1016             }
1017 
1018             if (c == u'#' && column==3) {   // TODO:  why is column off so far?
1019                 parseState = PARSE_COMMENT;
1020                 savedState = PARSE_DATA;
1021                 break;
1022             }
1023 
1024             if (c == u'\\') {
1025                 // Check for \ at end of line, a line continuation.
1026                 //     Advance over (discard) the newline
1027                 UChar32 cp = testString.char32At(charIdx);
1028                 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
1029                     // We have a CR LF
1030                     //  Need an extra increment of the input ptr to move over both of them
1031                     charIdx++;
1032                 }
1033                 if (cp == u'\n' || cp == u'\r') {
1034                     lineNum++;
1035                     colStart = charIdx;
1036                     charIdx++;
1037                     break;
1038                 }
1039 
1040                 // Let unescape handle the back slash.
1041                 cp = testString.unescapeAt(charIdx);
1042                 if (cp != -1) {
1043                     // Escape sequence was recognized.  Insert the char
1044                     //   into the test data.
1045                     tp.dataToBreak.append(cp);
1046                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1047                         tp.srcLine->addElement(lineNum, status);
1048                         tp.srcCol ->addElement(column, status);
1049                     }
1050                     break;
1051                 }
1052 
1053 
1054                 // Not a recognized backslash escape sequence.
1055                 // Take the next char as a literal.
1056                 //  TODO:  Should this be an error?
1057                 c = testString.charAt(charIdx);
1058                 charIdx = testString.moveIndex32(charIdx, 1);
1059             }
1060 
1061             // Normal, non-escaped data char.
1062             tp.dataToBreak.append(c);
1063 
1064             // Save the mapping from offset in the data to line/column numbers in
1065             //   the original input file.  Will be used for better error messages only.
1066             //   If there's an expected break before this char, the slot in the mapping
1067             //     vector will already be set for this char; don't overwrite it.
1068             if (tp.dataToBreak.length() > tp.srcLine->size()) {
1069                 tp.srcLine->addElement(lineNum, status);
1070                 tp.srcCol ->addElement(column, status);
1071             }
1072             break;
1073 
1074 
1075         case PARSE_NUM:
1076             // We are parsing an expected numeric tag value, like <1234>,
1077             //   within a chunk of data.
1078             if (u_isUWhiteSpace(c)) {
1079                 break;
1080             }
1081 
1082             if (c == u'>') {
1083                 // Finished the number.  Add the info to the expected break data,
1084                 //   and switch parse state back to doing plain data.
1085                 parseState = PARSE_DATA;
1086                 if (tagValue == 0) {
1087                     tagValue = -1;
1088                 }
1089                 int32_t  breakIdx = tp.dataToBreak.length();
1090                 if (tp.expectedBreaks->size() > breakIdx) {
1091                     errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
1092                           lineNum, column);
1093                 }
1094                 tp.expectedBreaks->setSize(breakIdx+1);
1095                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1096                 tp.srcLine->setSize(breakIdx+1);
1097                 tp.srcLine->setElementAt(lineNum, breakIdx);
1098                 tp.srcCol ->setSize(breakIdx+1);
1099                 tp.srcCol ->setElementAt(column, breakIdx);
1100                 break;
1101             }
1102 
1103             if (u_isdigit(c)) {
1104                 tagValue = tagValue*10 + u_charDigitValue(c);
1105                 break;
1106             }
1107 
1108             errln("Syntax Error in test file at line %d, col %d",
1109                 lineNum, column);
1110             parseState = PARSE_COMMENT;
1111             goto end_test; // Stop the test
1112             break;
1113         }
1114 
1115 
1116         if (U_FAILURE(status)) {
1117             dataerrln("ICU Error %s while parsing test file at line %d.",
1118                 u_errorName(status), lineNum);
1119             status = U_ZERO_ERROR;
1120             goto end_test; // Stop the test
1121         }
1122 
1123     }
1124 
1125     // Reached end of test file. Raise an error if parseState indicates that we are
1126     //   within a block that should have been terminated.
1127 
1128     if (parseState == PARSE_RULES) {
1129         errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
1130             lineNum, rulesFirstLine);
1131     }
1132     if (parseState == PARSE_DATA) {
1133         errln("rbbitst.txt:%d <data> block not closed.", lineNum);
1134     }
1135 
1136 
1137 end_test:
1138     delete [] testFile;
1139 #endif
1140 }
1141 
1142 //-------------------------------------------------------------------------------
1143 //
1144 //  TestDictRules   create a break iterator from source rules that includes a
1145 //                  dictionary range.   Regression for bug #7130.  Source rules
1146 //                  do not declare a break iterator type (word, line, sentence, etc.
1147 //                  but the dictionary code, without a type, would loop.
1148 //
1149 //-------------------------------------------------------------------------------
TestDictRules()1150 void RBBITest::TestDictRules() {
1151     const char *rules =  "$dictionary = [a-z]; \n"
1152                          "!!forward; \n"
1153                          "$dictionary $dictionary; \n"
1154                          "!!reverse; \n"
1155                          "$dictionary $dictionary; \n";
1156     const char *text = "aa";
1157     UErrorCode status = U_ZERO_ERROR;
1158     UParseError parseError;
1159 
1160     RuleBasedBreakIterator bi(rules, parseError, status);
1161     if (U_SUCCESS(status)) {
1162         UnicodeString utext = text;
1163         bi.setText(utext);
1164         int32_t position;
1165         int32_t loops;
1166         for (loops = 0; loops<10; loops++) {
1167             position = bi.next();
1168             if (position == RuleBasedBreakIterator::DONE) {
1169                 break;
1170             }
1171         }
1172         TEST_ASSERT(loops == 1);
1173     } else {
1174         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1175     }
1176 }
1177 
1178 
1179 
1180 //--------------------------------------------------------------------------------------------
1181 //
1182 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1183 //
1184 //-------------------------------------------------------------------------------------------
TestUnicodeFiles()1185 void RBBITest::TestUnicodeFiles() {
1186     RuleBasedBreakIterator  *bi;
1187     UErrorCode               status = U_ZERO_ERROR;
1188 
1189     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1190     TEST_ASSERT_SUCCESS(status);
1191     if (U_SUCCESS(status)) {
1192         runUnicodeTestData("GraphemeBreakTest.txt", bi);
1193     }
1194     delete bi;
1195 
1196     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1197     TEST_ASSERT_SUCCESS(status);
1198     if (U_SUCCESS(status)) {
1199         runUnicodeTestData("WordBreakTest.txt", bi);
1200     }
1201     delete bi;
1202 
1203     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1204     TEST_ASSERT_SUCCESS(status);
1205     if (U_SUCCESS(status)) {
1206         runUnicodeTestData("SentenceBreakTest.txt", bi);
1207     }
1208     delete bi;
1209 
1210     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1211     TEST_ASSERT_SUCCESS(status);
1212     if (U_SUCCESS(status)) {
1213         runUnicodeTestData("LineBreakTest.txt", bi);
1214     }
1215     delete bi;
1216 }
1217 
1218 
1219 // Check for test cases from the Unicode test data files that are known to fail
1220 // and should be skipped as known issues because ICU does not fully implement
1221 // the Unicode specifications, or because ICU includes tailorings that differ from
1222 // the Unicode standard.
1223 //
1224 // Test cases are identified by the test data sequence, which tends to be more stable
1225 // across Unicode versions than the test file line numbers.
1226 //
1227 // The test case with ticket "10666" is a dummy, included as an example.
1228 
testCaseIsKnownIssue(const UnicodeString & testCase,const char * fileName)1229 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1230     static struct TestCase {
1231         const char *fTicketNum;
1232         const char *fFileName;
1233         const UChar *fString;
1234     } badTestCases[] = {
1235         {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"},    // Fake example, for illustration.
1236         // The following tests were originally for
1237         // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
1238         // However, that ticket has been closed as fixed but these tests still fail, so
1239         // ICU-21097 has been created to investigate and address these remaining issues.
1240         {"21097",  "LineBreakTest.txt", u"-#"},
1241         {"21097",  "LineBreakTest.txt", u"\u002d\u0308\u0023"},
1242         {"21097",  "LineBreakTest.txt", u"\u002d\u00a7"},
1243         {"21097",  "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
1244         {"21097",  "LineBreakTest.txt", u"\u002d\U00050005"},
1245         {"21097",  "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
1246         {"21097",  "LineBreakTest.txt", u"\u002d\u0e01"},
1247         {"21097",  "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
1248 
1249         // The following tests were originally for
1250         // Issue ICU-12017 Improve line break around numbers.
1251         // However, that ticket has been closed as fixed but these tests still fail, so
1252         // ICU-21097 has been created to investigate and address these remaining issues.
1253         {"21097", "LineBreakTest.txt", u"\u002C\u0030"},   // ",0"
1254         {"21097", "LineBreakTest.txt", u"\u002C\u0308\u0030"},
1255         {"21097", "LineBreakTest.txt", u"equals .35 cents"},
1256         {"21097", "LineBreakTest.txt", u"a.2 "},
1257         {"21097", "LineBreakTest.txt", u"a.2 \u0915"},
1258         {"21097", "LineBreakTest.txt", u"a.2 \u672C"},
1259         {"21097", "LineBreakTest.txt", u"a.2\u3000\u672C"},
1260         {"21097", "LineBreakTest.txt", u"a.2\u3000\u307E"},
1261         {"21097", "LineBreakTest.txt", u"a.2\u3000\u0033"},
1262         {"21097", "LineBreakTest.txt", u"A.1 \uBABB"},
1263         {"21097", "LineBreakTest.txt", u"\uBD24\uC5B4\u002E\u0020\u0041\u002E\u0032\u0020\uBCFC"},
1264         {"21097", "LineBreakTest.txt", u"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"},
1265         {"21097", "LineBreakTest.txt", u"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"},
1266         {"21097", "LineBreakTest.txt", u"a.2\u3000\u300C"},
1267 
1268         // ICU-22127 until UAX #29 wordbreak is update for the colon changes in ICU-22112,
1269         // need to skip some tests in WordBreakTest.txt
1270         {"22127", "WordBreakTest.txt", u"a:"},
1271         {"22127", "WordBreakTest.txt", u"A:"},
1272     };
1273 
1274     for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
1275         const TestCase &badCase = badTestCases[n];
1276         if (!strcmp(fileName, badCase.fFileName) &&
1277                 testCase.startsWith(UnicodeString(badCase.fString))) {
1278             return logKnownIssue(badCase.fTicketNum);
1279         }
1280     }
1281     return false;
1282 }
1283 
1284 
1285 //--------------------------------------------------------------------------------------------
1286 //
1287 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1288 //
1289 //-------------------------------------------------------------------------------------------
runUnicodeTestData(const char * fileName,RuleBasedBreakIterator * bi)1290 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1291 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1292     UErrorCode  status = U_ZERO_ERROR;
1293 
1294     //
1295     //  Open and read the test data file, put it into a UnicodeString.
1296     //
1297     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1298     char testFileName[1000];
1299     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1300         dataerrln("Can't open test data.  Path too long.");
1301         return;
1302     }
1303     strcpy(testFileName, testDataDirectory);
1304     strcat(testFileName, fileName);
1305 
1306     logln("Opening data file %s\n", fileName);
1307 
1308     int    len;
1309     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1310     if (status != U_FILE_ACCESS_ERROR) {
1311         TEST_ASSERT_SUCCESS(status);
1312         TEST_ASSERT(testFile != NULL);
1313     }
1314     if (U_FAILURE(status) || testFile == NULL) {
1315         return; /* something went wrong, error already output */
1316     }
1317     UnicodeString testFileAsString(true, testFile, len);
1318 
1319     //
1320     //  Parse the test data file using a regular expression.
1321     //  Each kind of token is recognized in its own capture group; what type of item was scanned
1322     //     is identified by which group had a match.
1323     //
1324     //    Capture Group  #                  1          2            3            4           5
1325     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1326     //
1327     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1328     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1329     UnicodeString   testString;
1330     UVector32       breakPositions(status);
1331     int             lineNumber = 1;
1332     TEST_ASSERT_SUCCESS(status);
1333     if (U_FAILURE(status)) {
1334         return;
1335     }
1336 
1337     //
1338     //  Scan through each test case, building up the string to be broken in testString,
1339     //   and the positions that should be boundaries in the breakPositions vector.
1340     //
1341     int spin = 0;
1342     while (tokenMatcher.find()) {
1343         if(tokenMatcher.hitEnd()) {
1344           /* Shouldn't Happen(TM).  This means we didn't find the symbols we were looking for.
1345              This occurred when the text file was corrupt (wasn't marked as UTF-8)
1346              and caused an infinite loop here on EBCDIC systems!
1347           */
1348           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1349           //       return;
1350         }
1351         if (tokenMatcher.start(1, status) >= 0) {
1352             // Scanned a divide sign, indicating a break position in the test data.
1353             if (testString.length()>0) {
1354                 breakPositions.addElement(testString.length(), status);
1355             }
1356         }
1357         else if (tokenMatcher.start(2, status) >= 0) {
1358             // Scanned an 'x', meaning no break at this position in the test data
1359             //   Nothing to be done here.
1360             }
1361         else if (tokenMatcher.start(3, status) >= 0) {
1362             // Scanned Hex digits.  Convert them to binary, append to the character data string.
1363             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1364             int length = hexNumber.length();
1365             if (length<=8) {
1366                 char buf[10];
1367                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1368                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1369                 if (c<=0x10ffff) {
1370                     testString.append(c);
1371                 } else {
1372                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1373                        fileName, lineNumber);
1374                 }
1375             } else {
1376                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1377                        fileName, lineNumber);
1378              }
1379         }
1380         else if (tokenMatcher.start(4, status) >= 0) {
1381             // Scanned to end of a line, possibly skipping over a comment in the process.
1382             //   If the line from the file contained test data, run the test now.
1383             if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1384                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1385             }
1386 
1387             // Clear out this test case.
1388             //    The string and breakPositions vector will be refilled as the next
1389             //       test case is parsed.
1390             testString.remove();
1391             breakPositions.removeAllElements();
1392             lineNumber++;
1393         } else {
1394             // Scanner catchall.  Something unrecognized appeared on the line.
1395             char token[16];
1396             UnicodeString uToken = tokenMatcher.group(0, status);
1397             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1398             token[sizeof(token)-1] = 0;
1399             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1400 
1401             // Clean up, in preparation for continuing with the next line.
1402             testString.remove();
1403             breakPositions.removeAllElements();
1404             lineNumber++;
1405         }
1406         TEST_ASSERT_SUCCESS(status);
1407         if (U_FAILURE(status)) {
1408             break;
1409         }
1410     }
1411 
1412     delete [] testFile;
1413  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1414 }
1415 
1416 //--------------------------------------------------------------------------------------------
1417 //
1418 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1419 //                            test data files.  Do only a simple, forward-only check -
1420 //                            this test is mostly to check that ICU and the Unicode
1421 //                            data agree with each other.
1422 //
1423 //--------------------------------------------------------------------------------------------
checkUnicodeTestCase(const char * testFileName,int lineNumber,const UnicodeString & testString,UVector32 * breakPositions,RuleBasedBreakIterator * bi)1424 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1425                          const UnicodeString &testString,   // Text data to be broken
1426                          UVector32 *breakPositions,         // Positions where breaks should be found.
1427                          RuleBasedBreakIterator *bi) {
1428     int32_t pos;                 // Break Position in the test string
1429     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1430     int32_t expectedPos;         // Expected break position (index into test string)
1431 
1432     bi->setText(testString);
1433     pos = bi->first();
1434     pos = bi->next();
1435 
1436     while (pos != BreakIterator::DONE) {
1437         if (expectedI >= breakPositions->size()) {
1438             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1439                 testFileName, lineNumber, pos);
1440             break;
1441         }
1442         expectedPos = breakPositions->elementAti(expectedI);
1443         if (pos < expectedPos) {
1444             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1445                 testFileName, lineNumber, pos);
1446             break;
1447         }
1448         if (pos > expectedPos) {
1449             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1450                 testFileName, lineNumber, expectedPos);
1451             break;
1452         }
1453         pos = bi->next();
1454         expectedI++;
1455     }
1456 
1457     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1458         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1459             testFileName, lineNumber, breakPositions->elementAti(expectedI));
1460     }
1461 }
1462 
1463 
1464 
1465 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1466 //---------------------------------------------------------------------------------------
1467 //
1468 //   class RBBIMonkeyKind
1469 //
1470 //      Monkey Test for Break Iteration
1471 //      Abstract interface class.   Concrete derived classes independently
1472 //      implement the break rules for different iterator types.
1473 //
1474 //      The Monkey Test itself uses doesn't know which type of break iterator it is
1475 //      testing, but works purely in terms of the interface defined here.
1476 //
1477 //---------------------------------------------------------------------------------------
1478 class RBBIMonkeyKind {
1479 public:
1480     // Return a UVector of UnicodeSets, representing the character classes used
1481     //   for this type of iterator.
1482     virtual  UVector  *charClasses() = 0;
1483 
1484     // Set the test text on which subsequent calls to next() will operate
1485     virtual  void      setText(const UnicodeString &s) = 0;
1486 
1487     // Find the next break position, starting from the prev break position, or from zero.
1488     // Return -1 after reaching end of string.
1489     virtual  int32_t   next(int32_t i) = 0;
1490 
1491     // Name of each character class, parallel with charClasses. Used for debugging output
1492     // of characters.
1493     virtual  std::vector<std::string>&     characterClassNames();
1494 
1495     void setAppliedRule(int32_t position, const char* value);
1496 
1497     std::string getAppliedRule(int32_t position);
1498 
1499     virtual ~RBBIMonkeyKind();
1500     UErrorCode deferredStatus;
1501 
1502     std::string classNameFromCodepoint(const UChar32 c);
1503     unsigned int maxClassNameSize();
1504 
1505  protected:
1506      RBBIMonkeyKind();
1507      std::vector<std::string> classNames;
1508      std::vector<std::string> appliedRules;
1509 
1510     // Clear `appliedRules` and fill it with empty strings in the size of test text.
1511     void prepareAppliedRules(int32_t size );
1512 
1513  private:
1514 
1515 };
1516 
RBBIMonkeyKind()1517 RBBIMonkeyKind::RBBIMonkeyKind() {
1518     deferredStatus = U_ZERO_ERROR;
1519 }
1520 
~RBBIMonkeyKind()1521 RBBIMonkeyKind::~RBBIMonkeyKind() {
1522 }
1523 
characterClassNames()1524 std::vector<std::string>& RBBIMonkeyKind::characterClassNames() {
1525     return classNames;
1526 }
1527 
prepareAppliedRules(int32_t size)1528 void RBBIMonkeyKind::prepareAppliedRules(int32_t size) {
1529     // Remove all the information in the `appliedRules`.
1530     appliedRules.clear();
1531     appliedRules.resize(size + 1);
1532 }
1533 
setAppliedRule(int32_t position,const char * value)1534 void RBBIMonkeyKind::setAppliedRule(int32_t position, const char* value) {
1535     appliedRules[position] = value;
1536 }
1537 
getAppliedRule(int32_t position)1538 std::string RBBIMonkeyKind::getAppliedRule(int32_t position){
1539     return appliedRules[position];
1540 }
1541 
classNameFromCodepoint(const UChar32 c)1542 std::string RBBIMonkeyKind::classNameFromCodepoint(const UChar32 c) {
1543     // Simply iterate through charClasses to find character's class
1544     for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1545         UnicodeSet *classSet = (UnicodeSet *)charClasses()->elementAt(aClassNum);
1546         if (classSet->contains(c)) {
1547             return classNames[aClassNum];
1548         }
1549     }
1550     U_ASSERT(false);  // This should not happen.
1551     return "bad class name";
1552 }
1553 
maxClassNameSize()1554 unsigned int RBBIMonkeyKind::maxClassNameSize() {
1555     unsigned int maxSize = 0;
1556     for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1557         auto aClassNumSize = static_cast<unsigned int>(classNames[aClassNum].size());
1558         if (aClassNumSize > maxSize) {
1559             maxSize = aClassNumSize;
1560         }
1561     }
1562     return maxSize;
1563 }
1564 
1565 //----------------------------------------------------------------------------------------
1566 //
1567 //   Random Numbers.  Similar to standard lib rand() and srand()
1568 //                    Not using library to
1569 //                      1.  Get same results on all platforms.
1570 //                      2.  Get access to current seed, to more easily reproduce failures.
1571 //
1572 //---------------------------------------------------------------------------------------
// Current generator state; assign to it to reseed.
static uint32_t m_seed = 1;

// Step the linear congruential generator once (arithmetic wraps modulo 2^32)
// and return bits 16..30 of the new state, i.e. a value in [0, 32767].
static uint32_t m_rand()
{
    const uint32_t advanced = m_seed * 1103515245u + 12345u;
    m_seed = advanced;
    return (advanced >> 16) & 0x7FFFu;
}
1580 
1581 
1582 //------------------------------------------------------------------------------------------
1583 //
1584 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
1585 //                             of RBBIMonkeyKind.
1586 //
1587 //------------------------------------------------------------------------------------------
1588 class RBBICharMonkey: public RBBIMonkeyKind {
1589 public:
1590     RBBICharMonkey();
1591     virtual          ~RBBICharMonkey();
1592     virtual  UVector *charClasses() override;
1593     virtual  void     setText(const UnicodeString &s) override;
1594     virtual  int32_t  next(int32_t i) override;
1595 private:
1596     UVector   *fSets;
1597 
1598     UnicodeSet  *fCRLFSet;
1599     UnicodeSet  *fControlSet;
1600     UnicodeSet  *fExtendSet;
1601     UnicodeSet  *fZWJSet;
1602     UnicodeSet  *fRegionalIndicatorSet;
1603     UnicodeSet  *fPrependSet;
1604     UnicodeSet  *fSpacingSet;
1605     UnicodeSet  *fLSet;
1606     UnicodeSet  *fVSet;
1607     UnicodeSet  *fTSet;
1608     UnicodeSet  *fLVSet;
1609     UnicodeSet  *fLVTSet;
1610     UnicodeSet  *fHangulSet;
1611     UnicodeSet  *fExtendedPictSet;
1612     UnicodeSet  *fViramaSet;
1613     UnicodeSet  *fLinkingConsonantSet;
1614     UnicodeSet  *fExtCccZwjSet;
1615     UnicodeSet  *fAnySet;
1616 
1617     const UnicodeString *fText;
1618 };
1619 
1620 
RBBICharMonkey()1621 RBBICharMonkey::RBBICharMonkey() {
1622     UErrorCode  status = U_ZERO_ERROR;
1623 
1624     fText = NULL;
1625 
1626     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1627     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
1628     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
1629     fZWJSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
1630     fRegionalIndicatorSet =
1631                   new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1632     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1633     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1634     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1635     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1636     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1637     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1638     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1639     fHangulSet  = new UnicodeSet();
1640     fHangulSet->addAll(*fLSet);
1641     fHangulSet->addAll(*fVSet);
1642     fHangulSet->addAll(*fTSet);
1643     fHangulSet->addAll(*fLVSet);
1644     fHangulSet->addAll(*fLVTSet);
1645 
1646     fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1647     fViramaSet        = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1648                                         "\\p{Indic_Syllabic_Category=Virama}]", status);
1649     fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1650                                         "\\p{Indic_Syllabic_Category=Consonant}]", status);
1651     fExtCccZwjSet     = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status);
1652     fAnySet           = new UnicodeSet(0, 0x10ffff);
1653 
1654     // Create sets of characters, and add the names of the above character sets.
1655     // In each new ICU release, add new names corresponding to the sets above.
1656     fSets             = new UVector(status);
1657 
1658     // Important: Keep class names the same as the class contents.
1659     fSets->addElement(fCRLFSet, status); classNames.push_back("CRLF");
1660     fSets->addElement(fControlSet, status); classNames.push_back("Control");
1661     fSets->addElement(fExtendSet, status); classNames.push_back("Extended");
1662     fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
1663     if (!fPrependSet->isEmpty()) {
1664         fSets->addElement(fPrependSet, status); classNames.push_back("Prepend");
1665     }
1666     fSets->addElement(fSpacingSet, status); classNames.push_back("Spacing");
1667     fSets->addElement(fHangulSet, status); classNames.push_back("Hangul");
1668     fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
1669     fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
1670     fSets->addElement(fViramaSet, status); classNames.push_back("Virama");
1671     fSets->addElement(fLinkingConsonantSet, status); classNames.push_back("LinkingConsonant");
1672     fSets->addElement(fExtCccZwjSet, status); classNames.push_back("ExtCcccZwj");
1673     fSets->addElement(fAnySet, status); classNames.push_back("Any");
1674 
1675     if (U_FAILURE(status)) {
1676         deferredStatus = status;
1677     }
1678 }
1679 
1680 
setText(const UnicodeString & s)1681 void RBBICharMonkey::setText(const UnicodeString &s) {
1682     fText = &s;
1683     prepareAppliedRules(s.length());
1684 }
1685 
1686 
1687 
next(int32_t prevPos)1688 int32_t RBBICharMonkey::next(int32_t prevPos) {
1689     int    p0, p1, p2, p3;    // Indices of the significant code points around the
1690                               //   break position being tested.  The candidate break
1691                               //   location is before p2.
1692 
1693     int     breakPos = -1;
1694 
1695     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
1696     UChar32 cBase;            // for (X Extend*) patterns, the X character.
1697 
1698     if (U_FAILURE(deferredStatus)) {
1699         return -1;
1700     }
1701 
1702     // Previous break at end of string.  return DONE.
1703     if (prevPos >= fText->length()) {
1704         return -1;
1705     }
1706 
1707     p0 = p1 = p2 = p3 = prevPos;
1708     c3 =  fText->char32At(prevPos);
1709     c0 = c1 = c2 = cBase = 0;
1710     (void)p0;   // suppress set but not used warning.
1711     (void)c0;
1712 
1713     // Loop runs once per "significant" character position in the input text.
1714     for (;;) {
1715         // Move all of the positions forward in the input string.
1716         p0 = p1;  c0 = c1;
1717         p1 = p2;  c1 = c2;
1718         p2 = p3;  c2 = c3;
1719 
1720         // Advance p3 by one codepoint
1721         p3 = fText->moveIndex32(p3, 1);
1722         c3 = fText->char32At(p3);
1723 
1724         if (p1 == p2) {
1725             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
1726             continue;
1727         }
1728 
1729         if (p2 == fText->length()) {
1730             setAppliedRule(p2, "End of String");
1731             break;
1732         }
1733 
1734         //     No Extend or Format characters may appear between the CR and LF,
1735         //     which requires the additional check for p2 immediately following p1.
1736         //
1737         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
1738           setAppliedRule(p2, "GB3   CR x LF");
1739           continue;
1740         }
1741 
1742         if (fControlSet->contains(c1) ||
1743             c1 == 0x0D ||
1744             c1 == 0x0A)  {
1745           setAppliedRule(p2, "GB4   ( Control | CR | LF ) <break>");
1746           break;
1747         }
1748 
1749         if (fControlSet->contains(c2) ||
1750             c2 == 0x0D ||
1751             c2 == 0x0A)  {
1752             setAppliedRule(p2, "GB5   <break>  ( Control | CR | LF )");
1753             break;
1754         }
1755 
1756         if (fLSet->contains(c1) &&
1757                (fLSet->contains(c2)  ||
1758                 fVSet->contains(c2)  ||
1759                 fLVSet->contains(c2) ||
1760                 fLVTSet->contains(c2))) {
1761             setAppliedRule(p2, "GB6   L x ( L | V | LV | LVT )");
1762             continue;
1763         }
1764 
1765         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1766             (fVSet->contains(c2) || fTSet->contains(c2)))  {
1767             setAppliedRule(p2, "GB7    ( LV | V )  x  ( V | T )");
1768             continue;
1769         }
1770 
1771         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1772             fTSet->contains(c2))  {
1773             setAppliedRule(p2, "GB8   ( LVT | T)  x T");
1774             continue;
1775         }
1776 
1777         if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
1778             if (!fExtendSet->contains(c1)) {
1779                 cBase = c1;
1780             }
1781             setAppliedRule(p2, "GB9   x (Extend | ZWJ)");
1782             continue;
1783         }
1784 
1785         if (fSpacingSet->contains(c2)) {
1786             setAppliedRule(p2, "GB9a  x  SpacingMark");
1787             continue;
1788         }
1789 
1790         if (fPrependSet->contains(c1)) {
1791             setAppliedRule(p2, "GB9b  Prepend x");
1792             continue;
1793         }
1794 
1795         //   Note: Viramas are also included in the ExtCccZwj class.
1796         if (fLinkingConsonantSet->contains(c2)) {
1797             int pi = p1;
1798             bool sawVirama = false;
1799             while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) {
1800                 if (fViramaSet->contains(fText->char32At(pi))) {
1801                     sawVirama = true;
1802                 }
1803                 pi = fText->moveIndex32(pi, -1);
1804             }
1805             if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) {
1806               setAppliedRule(p2, "GB9.3  LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant");
1807               continue;
1808             }
1809         }
1810 
1811         if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
1812           setAppliedRule(p2, "GB11  Extended_Pictographic Extend * ZWJ x Extended_Pictographic");
1813           continue;
1814         }
1815 
1816         //                   Note: The first if condition is a little tricky. We only need to force
1817         //                      a break if there are three or more contiguous RIs. If there are
1818         //                      only two, a break following will occur via other rules, and will include
1819         //                      any trailing extend characters, which is needed behavior.
1820         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
1821                 && fRegionalIndicatorSet->contains(c2)) {
1822           setAppliedRule(p2, "GB12-13  Regional_Indicator x Regional_Indicator");
1823           break;
1824         }
1825         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
1826           setAppliedRule(p2, "GB12-13  Regional_Indicator x Regional_Indicator");
1827           continue;
1828         }
1829 
1830         setAppliedRule(p2, "GB999 Any <break> Any");
1831         break;
1832     }
1833 
1834     breakPos = p2;
1835     return breakPos;
1836 }
1837 
1838 
1839 
charClasses()1840 UVector  *RBBICharMonkey::charClasses() {
1841     return fSets;
1842 }
1843 
~RBBICharMonkey()1844 RBBICharMonkey::~RBBICharMonkey() {
1845     delete fSets;
1846     delete fCRLFSet;
1847     delete fControlSet;
1848     delete fExtendSet;
1849     delete fRegionalIndicatorSet;
1850     delete fPrependSet;
1851     delete fSpacingSet;
1852     delete fLSet;
1853     delete fVSet;
1854     delete fTSet;
1855     delete fLVSet;
1856     delete fLVTSet;
1857     delete fHangulSet;
1858     delete fAnySet;
1859     delete fZWJSet;
1860     delete fExtendedPictSet;
1861     delete fViramaSet;
1862     delete fLinkingConsonantSet;
1863     delete fExtCccZwjSet;
1864 }
1865 
1866 //------------------------------------------------------------------------------------------
1867 //
1868 //   class RBBIWordMonkey      Word Break specific implementation
1869 //                             of RBBIMonkeyKind.
1870 //
1871 //------------------------------------------------------------------------------------------
1872 class RBBIWordMonkey: public RBBIMonkeyKind {
1873 public:
1874     RBBIWordMonkey();
1875     virtual          ~RBBIWordMonkey();
1876     virtual  UVector *charClasses() override;
1877     virtual  void     setText(const UnicodeString &s) override;
1878     virtual int32_t   next(int32_t i) override;
1879 private:
1880     UVector      *fSets;
1881 
1882     UnicodeSet  *fCRSet;
1883     UnicodeSet  *fLFSet;
1884     UnicodeSet  *fNewlineSet;
1885     UnicodeSet  *fRegionalIndicatorSet;
1886     UnicodeSet  *fKatakanaSet;
1887     UnicodeSet  *fHebrew_LetterSet;
1888     UnicodeSet  *fALetterSet;
1889     UnicodeSet  *fSingle_QuoteSet;
1890     UnicodeSet  *fDouble_QuoteSet;
1891     UnicodeSet  *fMidNumLetSet;
1892     UnicodeSet  *fMidLetterSet;
1893     UnicodeSet  *fMidNumSet;
1894     UnicodeSet  *fNumericSet;
1895     UnicodeSet  *fFormatSet;
1896     UnicodeSet  *fOtherSet = nullptr;
1897     UnicodeSet  *fExtendSet;
1898     UnicodeSet  *fExtendNumLetSet;
1899     UnicodeSet  *fWSegSpaceSet;
1900     UnicodeSet  *fDictionarySet = nullptr;
1901     UnicodeSet  *fZWJSet;
1902     UnicodeSet  *fExtendedPictSet;
1903 
1904     const UnicodeString  *fText;
1905 };
1906 
1907 
RBBIWordMonkey()1908 RBBIWordMonkey::RBBIWordMonkey()
1909 {
1910     UErrorCode  status = U_ZERO_ERROR;
1911 
1912     fSets            = new UVector(status);
1913 
1914     fCRSet            = new UnicodeSet(u"[\\p{Word_Break = CR}]",           status);
1915     fLFSet            = new UnicodeSet(u"[\\p{Word_Break = LF}]",           status);
1916     fNewlineSet       = new UnicodeSet(u"[\\p{Word_Break = Newline}]",      status);
1917     fKatakanaSet      = new UnicodeSet(u"[\\p{Word_Break = Katakana}]",     status);
1918     fRegionalIndicatorSet =  new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
1919     fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
1920     fALetterSet       = new UnicodeSet(u"[\\p{Word_Break = ALetter} @]", status);
1921     fSingle_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]",    status);
1922     fDouble_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]",    status);
1923     fMidNumLetSet     = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]",    status);
1924     fMidLetterSet     = new UnicodeSet(u"[\\p{Word_Break = MidLetter} - [\\: \\uFE55 \\uFF1A]]",    status);
1925     fMidNumSet        = new UnicodeSet(u"[\\p{Word_Break = MidNum}]",       status);
1926     fNumericSet       = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
1927     fFormatSet        = new UnicodeSet(u"[\\p{Word_Break = Format}]",       status);
1928     fExtendNumLetSet  = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
1929     // There are some sc=Hani characters with WB=Extend.
1930     // The break rules need to pick one or the other because
1931     // Extend overlapping with something else is messy.
1932     // For Unicode 13, we chose to keep U+16FF0 & U+16FF1
1933     // in $Han (for $dictionary) and out of $Extend.
1934     fExtendSet        = new UnicodeSet(u"[\\p{Word_Break = Extend}-[:Hani:]]", status);
1935     fWSegSpaceSet     = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]",    status);
1936 
1937     fZWJSet           = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]",          status);
1938     fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1939     if(U_FAILURE(status)) {
1940         IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
1941         deferredStatus = status;
1942         return;
1943     }
1944 
1945     fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
1946     fDictionarySet->addAll(*fKatakanaSet);
1947     fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
1948 
1949     fALetterSet->removeAll(*fDictionarySet);
1950 
1951     fOtherSet        = new UnicodeSet();
1952     if(U_FAILURE(status)) {
1953         IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
1954         deferredStatus = status;
1955         return;
1956     }
1957 
1958     fOtherSet->complement();
1959     fOtherSet->removeAll(*fCRSet);
1960     fOtherSet->removeAll(*fLFSet);
1961     fOtherSet->removeAll(*fNewlineSet);
1962     fOtherSet->removeAll(*fKatakanaSet);
1963     fOtherSet->removeAll(*fHebrew_LetterSet);
1964     fOtherSet->removeAll(*fALetterSet);
1965     fOtherSet->removeAll(*fSingle_QuoteSet);
1966     fOtherSet->removeAll(*fDouble_QuoteSet);
1967     fOtherSet->removeAll(*fMidLetterSet);
1968     fOtherSet->removeAll(*fMidNumSet);
1969     fOtherSet->removeAll(*fNumericSet);
1970     fOtherSet->removeAll(*fExtendNumLetSet);
1971     fOtherSet->removeAll(*fWSegSpaceSet);
1972     fOtherSet->removeAll(*fFormatSet);
1973     fOtherSet->removeAll(*fExtendSet);
1974     fOtherSet->removeAll(*fRegionalIndicatorSet);
1975     fOtherSet->removeAll(*fZWJSet);
1976     fOtherSet->removeAll(*fExtendedPictSet);
1977 
1978     // Inhibit dictionary characters from being tested at all.
1979     fOtherSet->removeAll(*fDictionarySet);
1980 
1981     // Add classes and their names
1982     fSets->addElement(fCRSet, status); classNames.push_back("CR");
1983     fSets->addElement(fLFSet, status); classNames.push_back("LF");
1984     fSets->addElement(fNewlineSet, status); classNames.push_back("Newline");
1985     fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
1986     fSets->addElement(fHebrew_LetterSet, status); classNames.push_back("Hebrew");
1987     fSets->addElement(fALetterSet, status); classNames.push_back("ALetter");
1988     fSets->addElement(fSingle_QuoteSet, status); classNames.push_back("Single Quote");
1989     fSets->addElement(fDouble_QuoteSet, status); classNames.push_back("Double Quote");
1990     // Omit Katakana from fSets, which omits Katakana characters
1991     // from the test data. They are all in the dictionary set,
1992     // which this (old, to be retired) monkey test cannot handle.
1993     //fSets->addElement(fKatakanaSet, status);
1994 
1995     fSets->addElement(fMidLetterSet, status); classNames.push_back("MidLetter");
1996     fSets->addElement(fMidNumLetSet, status); classNames.push_back("MidNumLet");
1997     fSets->addElement(fMidNumSet, status); classNames.push_back("MidNum");
1998     fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
1999     fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2000     fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2001     fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2002     fSets->addElement(fExtendNumLetSet, status); classNames.push_back("ExtendNumLet");
2003     fSets->addElement(fWSegSpaceSet, status); classNames.push_back("WSegSpace");
2004 
2005     fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
2006     fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
2007 
2008     if (U_FAILURE(status)) {
2009         deferredStatus = status;
2010     }
2011 }
2012 
setText(const UnicodeString & s)2013 void RBBIWordMonkey::setText(const UnicodeString &s) {
2014     fText       = &s;
2015     prepareAppliedRules(s.length());
2016 }
2017 
2018 
next(int32_t prevPos)2019 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2020     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2021                               //   break position being tested.  The candidate break
2022                               //   location is before p2.
2023 
2024     int     breakPos = -1;
2025 
2026     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2027 
2028     if (U_FAILURE(deferredStatus)) {
2029         return -1;
2030     }
2031 
2032     // Prev break at end of string.  return DONE.
2033     if (prevPos >= fText->length()) {
2034         return -1;
2035     }
2036     p0 = p1 = p2 = p3 = prevPos;
2037     c3 =  fText->char32At(prevPos);
2038     c0 = c1 = c2 = 0;
2039     (void)p0;       // Suppress set but not used warning.
2040 
2041     // Loop runs once per "significant" character position in the input text.
2042     for (;;) {
2043         // Move all of the positions forward in the input string.
2044         p0 = p1;  c0 = c1;
2045         p1 = p2;  c1 = c2;
2046         p2 = p3;  c2 = c3;
2047 
2048         // Advance p3 by    X(Extend | Format)*   Rule 4
2049         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2050         do {
2051             p3 = fText->moveIndex32(p3, 1);
2052             c3 = fText->char32At(p3);
2053             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2054                break;
2055             }
2056         }
2057         while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
2058 
2059 
2060         if (p1 == p2) {
2061             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2062             continue;
2063         }
2064 
2065         if (p2 == fText->length()) {
2066             // Reached end of string.  Always a break position.
2067             break;
2068         }
2069 
2070         //     No Extend or Format characters may appear between the CR and LF,
2071         //     which requires the additional check for p2 immediately following p1.
2072         //
2073         if (c1==0x0D && c2==0x0A) {
2074           setAppliedRule(p2, "WB3   CR x LF");
2075           continue;
2076         }
2077 
2078         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2079             setAppliedRule(p2, "WB3a  Break before and after newlines (including CR and LF)");
2080             break;
2081         }
2082         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2083             setAppliedRule(p2, "WB3a  Break before and after newlines (including CR and LF)");
2084             break;
2085         }
2086 
2087         //              Not ignoring extend chars, so peek into input text to
2088         //              get the potential ZWJ, the character immediately preceding c2.
2089         //              Sloppy UChar32 indexing: p2-1 may reference trail half
2090         //              but char32At will get the full code point.
2091         if (fZWJSet->contains(fText->char32At(p2 - 1)) && fExtendedPictSet->contains(c2)){
2092             setAppliedRule(p2, "WB3c  ZWJ x Extended_Pictographic");
2093             continue;
2094         }
2095 
2096         if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
2097             setAppliedRule(p2, "WB3d  Keep horizontal whitespace together.");
2098             continue;
2099         }
2100 
2101         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2102             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2103             setAppliedRule(p2, "WB4   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)");
2104             continue;
2105         }
2106 
2107         if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
2108              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2109              (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2110             setAppliedRule(p2,
2111                            "WB6   (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter _Letter)");
2112             continue;
2113         }
2114 
2115         if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2116             (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2117             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2118             setAppliedRule(p2,
2119                            "WB7   (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)");
2120             continue;
2121         }
2122 
2123         if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2124             setAppliedRule(p2, "WB7a  Hebrew_Letter x Single_Quote");
2125             continue;
2126         }
2127 
2128           if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2129             setAppliedRule(p2, "WB7b  Hebrew_Letter x Double_Quote Hebrew_Letter");
2130             continue;
2131         }
2132 
2133         if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2134             setAppliedRule(p2, "WB7c  Hebrew_Letter Double_Quote x Hebrew_Letter");
2135             continue;
2136         }
2137 
2138         if (fNumericSet->contains(c1) &&
2139             fNumericSet->contains(c2)) {
2140             setAppliedRule(p2, "WB8   Numeric x Numeric");
2141             continue;
2142         }
2143 
2144         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2145             fNumericSet->contains(c2)) {
2146             setAppliedRule(p2, "WB9   (ALetter | Hebrew_Letter) x Numeric");
2147             continue;
2148         }
2149 
2150         if (fNumericSet->contains(c1) &&
2151             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2152             setAppliedRule(p2, "WB10   Numeric x (ALetter | Hebrew_Letter)");
2153             continue;
2154         }
2155 
2156           if (fNumericSet->contains(c0) &&
2157             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
2158             fNumericSet->contains(c2)) {
2159             setAppliedRule(p2, "WB11  Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric");
2160             continue;
2161         }
2162 
2163         if (fNumericSet->contains(c1) &&
2164             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
2165             fNumericSet->contains(c3)) {
2166             setAppliedRule(p2, "WB12  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric");
2167             continue;
2168         }
2169 
2170         //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
2171         //                  all Katakana are handled by the dictionary breaker.
2172         if (fKatakanaSet->contains(c1) &&
2173             fKatakanaSet->contains(c2))  {
2174             setAppliedRule(p2, "WB13  Katakana x Katakana");
2175             continue;
2176         }
2177 
2178         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2179              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2180              fExtendNumLetSet->contains(c2)) {
2181             setAppliedRule(p2,
2182                            "WB13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet");
2183             continue;
2184         }
2185 
2186         if (fExtendNumLetSet->contains(c1) &&
2187                 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2188                  fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
2189             setAppliedRule(p2, "WB13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)");
2190             continue;
2191         }
2192 
2193         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2194             setAppliedRule(p2, "WB15 - WB17   Group pairs of Regional Indicators.");
2195             break;
2196         }
2197         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2198             setAppliedRule(p2, "WB15 - WB17   Group pairs of Regional Indicators.");
2199             continue;
2200         }
2201 
2202         setAppliedRule(p2, "WB999");
2203         break;
2204     }
2205 
2206     breakPos = p2;
2207     return breakPos;
2208 }
2209 
2210 
charClasses()2211 UVector  *RBBIWordMonkey::charClasses() {
2212     return fSets;
2213 }
2214 
~RBBIWordMonkey()2215 RBBIWordMonkey::~RBBIWordMonkey() {
2216     delete fSets;
2217     delete fCRSet;
2218     delete fLFSet;
2219     delete fNewlineSet;
2220     delete fKatakanaSet;
2221     delete fHebrew_LetterSet;
2222     delete fALetterSet;
2223     delete fSingle_QuoteSet;
2224     delete fDouble_QuoteSet;
2225     delete fMidNumLetSet;
2226     delete fMidLetterSet;
2227     delete fMidNumSet;
2228     delete fNumericSet;
2229     delete fFormatSet;
2230     delete fExtendSet;
2231     delete fExtendNumLetSet;
2232     delete fWSegSpaceSet;
2233     delete fRegionalIndicatorSet;
2234     delete fDictionarySet;
2235     delete fOtherSet;
2236     delete fZWJSet;
2237     delete fExtendedPictSet;
2238 }
2239 
2240 
2241 
2242 
2243 //------------------------------------------------------------------------------------------
2244 //
2245 //   class RBBISentMonkey      Sentence Break specific implementation
2246 //                             of RBBIMonkeyKind.
2247 //
2248 //------------------------------------------------------------------------------------------
2249 class RBBISentMonkey: public RBBIMonkeyKind {
2250 public:
2251     RBBISentMonkey();
2252     virtual          ~RBBISentMonkey();
2253     virtual  UVector *charClasses() override;
2254     virtual  void     setText(const UnicodeString &s) override;
2255     virtual int32_t   next(int32_t i) override;
2256 private:
2257     int               moveBack(int posFrom);
2258     int               moveForward(int posFrom);
2259     UChar32           cAt(int pos);
2260 
2261     UVector      *fSets;
2262 
2263     UnicodeSet  *fSepSet;
2264     UnicodeSet  *fFormatSet;
2265     UnicodeSet  *fSpSet;
2266     UnicodeSet  *fLowerSet;
2267     UnicodeSet  *fUpperSet;
2268     UnicodeSet  *fOLetterSet;
2269     UnicodeSet  *fNumericSet;
2270     UnicodeSet  *fATermSet;
2271     UnicodeSet  *fSContinueSet;
2272     UnicodeSet  *fSTermSet;
2273     UnicodeSet  *fCloseSet;
2274     UnicodeSet  *fOtherSet;
2275     UnicodeSet  *fExtendSet;
2276 
2277     const UnicodeString  *fText;
2278 };
2279 
RBBISentMonkey()2280 RBBISentMonkey::RBBISentMonkey()
2281 {
2282     UErrorCode  status = U_ZERO_ERROR;
2283 
2284     fSets            = new UVector(status);
2285 
2286     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2287     //                       set and made into character classes of their own.  For the monkey impl,
2288     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2289     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2290     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2291     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2292     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2293     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2294     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2295     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2296     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2297     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2298     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2299     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2300     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2301     fOtherSet        = new UnicodeSet();
2302 
2303     if(U_FAILURE(status)) {
2304       deferredStatus = status;
2305       return;
2306     }
2307 
2308     fOtherSet->complement();
2309     fOtherSet->removeAll(*fSepSet);
2310     fOtherSet->removeAll(*fFormatSet);
2311     fOtherSet->removeAll(*fSpSet);
2312     fOtherSet->removeAll(*fLowerSet);
2313     fOtherSet->removeAll(*fUpperSet);
2314     fOtherSet->removeAll(*fOLetterSet);
2315     fOtherSet->removeAll(*fNumericSet);
2316     fOtherSet->removeAll(*fATermSet);
2317     fOtherSet->removeAll(*fSContinueSet);
2318     fOtherSet->removeAll(*fSTermSet);
2319     fOtherSet->removeAll(*fCloseSet);
2320     fOtherSet->removeAll(*fExtendSet);
2321 
2322     fSets->addElement(fSepSet, status); classNames.push_back("Sep");
2323     fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2324     fSets->addElement(fSpSet, status); classNames.push_back("Sp");
2325     fSets->addElement(fLowerSet, status); classNames.push_back("Lower");
2326     fSets->addElement(fUpperSet, status); classNames.push_back("Upper");
2327     fSets->addElement(fOLetterSet, status); classNames.push_back("OLetter");
2328     fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
2329     fSets->addElement(fATermSet, status); classNames.push_back("ATerm");
2330     fSets->addElement(fSContinueSet, status); classNames.push_back("SContinue");
2331     fSets->addElement(fSTermSet, status); classNames.push_back("STerm");
2332     fSets->addElement(fCloseSet, status); classNames.push_back("Close");
2333     fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2334     fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2335 
2336     if (U_FAILURE(status)) {
2337         deferredStatus = status;
2338     }
2339 }
2340 
2341 
2342 
setText(const UnicodeString & s)2343 void RBBISentMonkey::setText(const UnicodeString &s) {
2344     fText       = &s;
2345     prepareAppliedRules(s.length());
2346 }
2347 
charClasses()2348 UVector  *RBBISentMonkey::charClasses() {
2349     return fSets;
2350 }
2351 
2352 //  moveBack()   Find the "significant" code point preceding the index i.
2353 //               Skips over ($Extend | $Format)* .
2354 //
moveBack(int i)2355 int RBBISentMonkey::moveBack(int i) {
2356     if (i <= 0) {
2357         return -1;
2358     }
2359     UChar32   c;
2360     int32_t   j = i;
2361     do {
2362         j = fText->moveIndex32(j, -1);
2363         c = fText->char32At(j);
2364     }
2365     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2366     return j;
2367 
2368  }
2369 
2370 
moveForward(int i)2371 int RBBISentMonkey::moveForward(int i) {
2372     if (i>=fText->length()) {
2373         return fText->length();
2374     }
2375     UChar32   c;
2376     int32_t   j = i;
2377     do {
2378         j = fText->moveIndex32(j, 1);
2379         c = cAt(j);
2380     }
2381     while (fFormatSet->contains(c) || fExtendSet->contains(c));
2382     return j;
2383 }
2384 
cAt(int pos)2385 UChar32 RBBISentMonkey::cAt(int pos) {
2386     if (pos<0 || pos>=fText->length()) {
2387         return -1;
2388     } else {
2389         return fText->char32At(pos);
2390     }
2391 }
2392 
next(int32_t prevPos)2393 int32_t RBBISentMonkey::next(int32_t prevPos) {
2394     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2395                               //   break position being tested.  The candidate break
2396                               //   location is before p2.
2397 
2398     int     breakPos = -1;
2399 
2400     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2401     UChar32 c;
2402 
2403     if (U_FAILURE(deferredStatus)) {
2404         return -1;
2405     }
2406 
2407     // Prev break at end of string.  return DONE.
2408     if (prevPos >= fText->length()) {
2409         return -1;
2410     }
2411     p0 = p1 = p2 = p3 = prevPos;
2412     c3 =  fText->char32At(prevPos);
2413     c0 = c1 = c2 = 0;
2414     (void)p0;     // Suppress set but not used warning.
2415 
2416     // Loop runs once per "significant" character position in the input text.
2417     for (;;) {
2418         // Move all of the positions forward in the input string.
2419         p0 = p1;  c0 = c1;
2420         p1 = p2;  c1 = c2;
2421         p2 = p3;  c2 = c3;
2422 
2423         // Advance p3 by    X(Extend | Format)*   Rule 4
2424         p3 = moveForward(p3);
2425         c3 = cAt(p3);
2426 
2427         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2428             setAppliedRule(p2, "SB3   CR x LF");
2429             continue;
2430         }
2431 
2432         if (fSepSet->contains(c1)) {
2433             p2 = p1+1;   // Separators don't combine with Extend or Format.
2434 
2435             setAppliedRule(p2, "SB4   Sep  <break>");
2436             break;
2437         }
2438 
2439         if (p2 >= fText->length()) {
2440             // Reached end of string.  Always a break position.
2441             setAppliedRule(p2, "SB4   Sep  <break>");
2442             break;
2443         }
2444 
2445         if (p2 == prevPos) {
2446             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2447             setAppliedRule(p2, "SB4   Sep  <break>");
2448             continue;
2449         }
2450 
2451         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
2452             setAppliedRule(p2, "SB6   ATerm x Numeric");
2453             continue;
2454         }
2455 
2456           if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2457                 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2458             setAppliedRule(p2, "SB7   (Upper | Lower) ATerm  x  Uppper");
2459             continue;
2460         }
2461 
2462         //           Note:  STerm | ATerm are added to the negated part of the expression by a
2463         //                  note to the Unicode 5.0 documents.
2464         int p8 = p1;
2465         while (fSpSet->contains(cAt(p8))) {
2466             p8 = moveBack(p8);
2467         }
2468         while (fCloseSet->contains(cAt(p8))) {
2469             p8 = moveBack(p8);
2470         }
2471         if (fATermSet->contains(cAt(p8))) {
2472             p8=p2;
2473             for (;;) {
2474                 c = cAt(p8);
2475                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2476                     fLowerSet->contains(c) || fSepSet->contains(c) ||
2477                     fATermSet->contains(c) || fSTermSet->contains(c))  {
2478 
2479                     setAppliedRule(p2,
2480                                    "SB8   ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2481                     break;
2482                 }
2483                 p8 = moveForward(p8);
2484             }
2485             if (fLowerSet->contains(cAt(p8))) {
2486 
2487                 setAppliedRule(p2,
2488                                "SB8   ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2489                 continue;
2490             }
2491         }
2492 
2493         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2494             p8 = p1;
2495             while (fSpSet->contains(cAt(p8))) {
2496                 p8 = moveBack(p8);
2497             }
2498             while (fCloseSet->contains(cAt(p8))) {
2499                 p8 = moveBack(p8);
2500             }
2501             c = cAt(p8);
2502             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2503                 setAppliedRule(p2, "SB8a  (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm)");
2504                 continue;
2505             }
2506         }
2507 
2508         int p9 = p1;
2509         while (fCloseSet->contains(cAt(p9))) {
2510             p9 = moveBack(p9);
2511         }
2512         c = cAt(p9);
2513         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2514             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2515 
2516                 setAppliedRule(p2, "SB9  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)");
2517                 continue;
2518             }
2519         }
2520 
2521         int p10 = p1;
2522         while (fSpSet->contains(cAt(p10))) {
2523             p10 = moveBack(p10);
2524         }
2525         while (fCloseSet->contains(cAt(p10))) {
2526             p10 = moveBack(p10);
2527         }
2528         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2529             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2530                 setAppliedRule(p2, "SB10  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)");
2531                 continue;
2532             }
2533         }
2534 
2535         int p11 = p1;
2536         if (fSepSet->contains(cAt(p11))) {
2537             p11 = moveBack(p11);
2538         }
2539         while (fSpSet->contains(cAt(p11))) {
2540             p11 = moveBack(p11);
2541         }
2542         while (fCloseSet->contains(cAt(p11))) {
2543             p11 = moveBack(p11);
2544         }
2545         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2546           setAppliedRule(p2, "SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>");
2547             break;
2548         }
2549 
2550         setAppliedRule(p2, "SB12  Any x Any");
2551         continue;
2552     }
2553 
2554     breakPos = p2;
2555     return breakPos;
2556 }
2557 
~RBBISentMonkey()2558 RBBISentMonkey::~RBBISentMonkey() {
2559     delete fSets;
2560     delete fSepSet;
2561     delete fFormatSet;
2562     delete fSpSet;
2563     delete fLowerSet;
2564     delete fUpperSet;
2565     delete fOLetterSet;
2566     delete fNumericSet;
2567     delete fATermSet;
2568     delete fSContinueSet;
2569     delete fSTermSet;
2570     delete fCloseSet;
2571     delete fOtherSet;
2572     delete fExtendSet;
2573 }
2574 
2575 
2576 
2577 //-------------------------------------------------------------------------------------------
2578 //
2579 //  RBBILineMonkey
2580 //
2581 //-------------------------------------------------------------------------------------------
2582 
2583 class RBBILineMonkey: public RBBIMonkeyKind {
2584 public:
2585     RBBILineMonkey();
2586     virtual          ~RBBILineMonkey();
2587     virtual  UVector *charClasses() override;
2588     virtual  void     setText(const UnicodeString &s) override;
2589     virtual  int32_t  next(int32_t i) override;
2590     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2591 private:
2592     UVector      *fSets;
2593 
2594     UnicodeSet  *fBK;
2595     UnicodeSet  *fCR;
2596     UnicodeSet  *fLF;
2597     UnicodeSet  *fCM;
2598     UnicodeSet  *fNL;
2599     UnicodeSet  *fSG;
2600     UnicodeSet  *fWJ;
2601     UnicodeSet  *fZW;
2602     UnicodeSet  *fGL;
2603     UnicodeSet  *fCB;
2604     UnicodeSet  *fSP;
2605     UnicodeSet  *fB2;
2606     UnicodeSet  *fBA;
2607     UnicodeSet  *fBB;
2608     UnicodeSet  *fHH;
2609     UnicodeSet  *fHY;
2610     UnicodeSet  *fH2;
2611     UnicodeSet  *fH3;
2612     UnicodeSet  *fCL;
2613     UnicodeSet  *fCP;
2614     UnicodeSet  *fEX;
2615     UnicodeSet  *fIN;
2616     UnicodeSet  *fJL;
2617     UnicodeSet  *fJV;
2618     UnicodeSet  *fJT;
2619     UnicodeSet  *fNS;
2620     UnicodeSet  *fOP;
2621     UnicodeSet  *fQU;
2622     UnicodeSet  *fIS;
2623     UnicodeSet  *fNU;
2624     UnicodeSet  *fPO;
2625     UnicodeSet  *fPR;
2626     UnicodeSet  *fSY;
2627     UnicodeSet  *fAI;
2628     UnicodeSet  *fAL;
2629     UnicodeSet  *fCJ;
2630     UnicodeSet  *fHL;
2631     UnicodeSet  *fID;
2632     UnicodeSet  *fRI;
2633     UnicodeSet  *fXX;
2634     UnicodeSet  *fEB;
2635     UnicodeSet  *fEM;
2636     UnicodeSet  *fZWJ;
2637     UnicodeSet  *fOP30;
2638     UnicodeSet  *fCP30;
2639     UnicodeSet  *fExtPictUnassigned;
2640 
2641     BreakIterator        *fCharBI;
2642     const UnicodeString  *fText;
2643     RegexMatcher         *fNumberMatcher;
2644 };
2645 
RBBILineMonkey()2646 RBBILineMonkey::RBBILineMonkey() :
2647     RBBIMonkeyKind(),
2648     fSets(NULL),
2649 
2650     fCharBI(NULL),
2651     fText(NULL),
2652     fNumberMatcher(NULL)
2653 
2654 {
2655     if (U_FAILURE(deferredStatus)) {
2656         return;
2657     }
2658 
2659     UErrorCode  status = U_ZERO_ERROR;
2660 
2661     fSets  = new UVector(status);
2662 
2663     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2664     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2665     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2666     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2667     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2668     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2669     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2670     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2671     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2672     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2673     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2674     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2675     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2676     fHH    = new UnicodeSet();
2677     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2678     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2679     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2680     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2681     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2682     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2683     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2684     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2685     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2686     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2687     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2688     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2689     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2690     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2691     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2692     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2693     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2694     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2695     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2696     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2697     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2698     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2699     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2700     fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2701     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2702     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2703     fEB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status);
2704     fEM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
2705     fZWJ   = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
2706     fOP30  = new UnicodeSet(u"[\\p{Line_break=OP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2707     fCP30  = new UnicodeSet(u"[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2708     fExtPictUnassigned = new UnicodeSet(u"[\\p{Extended_Pictographic}&\\p{Cn}]", status);
2709 
2710     if (U_FAILURE(status)) {
2711         deferredStatus = status;
2712         return;
2713     }
2714 
2715     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
2716     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
2717     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
2718 
2719     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
2720     fCM->addAll(*fZWJ);    // ZWJ behaves as a CM.
2721 
2722     fHH->add(u'\u2010');   // Hyphen, '‐'
2723 
2724     // Sets and names.
2725     fSets->addElement(fBK, status); classNames.push_back("fBK");
2726     fSets->addElement(fCR, status); classNames.push_back("fCR");
2727     fSets->addElement(fLF, status); classNames.push_back("fLF");
2728     fSets->addElement(fCM, status); classNames.push_back("fCM");
2729     fSets->addElement(fNL, status); classNames.push_back("fNL");
2730     fSets->addElement(fWJ, status); classNames.push_back("fWJ");
2731     fSets->addElement(fZW, status); classNames.push_back("fZW");
2732     fSets->addElement(fGL, status); classNames.push_back("fGL");
2733     fSets->addElement(fCB, status); classNames.push_back("fCB");
2734     fSets->addElement(fSP, status); classNames.push_back("fSP");
2735     fSets->addElement(fB2, status); classNames.push_back("fB2");
2736     fSets->addElement(fBA, status); classNames.push_back("fBA");
2737     fSets->addElement(fBB, status); classNames.push_back("fBB");
2738     fSets->addElement(fHY, status); classNames.push_back("fHY");
2739     fSets->addElement(fH2, status); classNames.push_back("fH2");
2740     fSets->addElement(fH3, status); classNames.push_back("fH3");
2741     fSets->addElement(fCL, status); classNames.push_back("fCL");
2742     fSets->addElement(fCP, status); classNames.push_back("fCP");
2743     fSets->addElement(fEX, status); classNames.push_back("fEX");
2744     fSets->addElement(fIN, status); classNames.push_back("fIN");
2745     fSets->addElement(fJL, status); classNames.push_back("fJL");
2746     fSets->addElement(fJT, status); classNames.push_back("fJT");
2747     fSets->addElement(fJV, status); classNames.push_back("fJV");
2748     fSets->addElement(fNS, status); classNames.push_back("fNS");
2749     fSets->addElement(fOP, status); classNames.push_back("fOP");
2750     fSets->addElement(fQU, status); classNames.push_back("fQU");
2751     fSets->addElement(fIS, status); classNames.push_back("fIS");
2752     fSets->addElement(fNU, status); classNames.push_back("fNU");
2753     fSets->addElement(fPO, status); classNames.push_back("fPO");
2754     fSets->addElement(fPR, status); classNames.push_back("fPR");
2755     fSets->addElement(fSY, status); classNames.push_back("fSY");
2756     fSets->addElement(fAI, status); classNames.push_back("fAI");
2757     fSets->addElement(fAL, status); classNames.push_back("fAL");
2758     fSets->addElement(fHL, status); classNames.push_back("fHL");
2759     fSets->addElement(fID, status); classNames.push_back("fID");
2760     fSets->addElement(fRI, status); classNames.push_back("fRI");
2761     fSets->addElement(fSG, status); classNames.push_back("fSG");
2762     fSets->addElement(fEB, status); classNames.push_back("fEB");
2763     fSets->addElement(fEM, status); classNames.push_back("fEM");
2764     fSets->addElement(fZWJ, status); classNames.push_back("fZWJ");
2765     // TODO: fOP30 & fCP30 overlap with plain fOP. Probably OK, but fOP/CP chars will be over-represented.
2766     fSets->addElement(fOP30, status); classNames.push_back("fOP30");
2767     fSets->addElement(fCP30, status); classNames.push_back("fCP30");
2768     fSets->addElement(fExtPictUnassigned, status); classNames.push_back("fExtPictUnassigned");
2769 
2770     const char *rules =
2771             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
2772             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
2773             "((\\p{Line_Break=IS})(\\p{Line_Break=CM}|\\u200d)*)?"
2774             "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
2775             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
2776             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
2777             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
2778 
2779     fNumberMatcher = new RegexMatcher(
2780         UnicodeString(rules, -1, US_INV), 0, status);
2781 
2782     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2783 
2784     if (U_FAILURE(status)) {
2785         deferredStatus = status;
2786     }
2787 
2788 }
2789 
2790 
setText(const UnicodeString & s)2791 void RBBILineMonkey::setText(const UnicodeString &s) {
2792     fText       = &s;
2793     fCharBI->setText(s);
2794     prepareAppliedRules(s.length());
2795     fNumberMatcher->reset(s);
2796 }
2797 
2798 //
2799 //  rule9Adjust
2800 //     Line Break TR rules 9 and 10 implementation.
2801 //     This deals with combining marks and other sequences that
2802 //     that must be treated as if they were something other than what they actually are.
2803 //
2804 //     This is factored out into a separate function because it must be applied twice for
2805 //     each potential break, once to the chars before the position being checked, then
2806 //     again to the text following the possible break.
2807 //
rule9Adjust(int32_t pos,UChar32 * posChar,int32_t * nextPos,UChar32 * nextChar)2808 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2809     if (pos == -1) {
2810         // Invalid initial position.  Happens during the warmup iteration of the
2811         //   main loop in next().
2812         return;
2813     }
2814 
2815     int32_t  nPos = *nextPos;
2816 
2817     // LB 9  Keep combining sequences together.
2818     // advance over any CM class chars.  Note that Line Break CM is different
2819     // from the normal Grapheme Extend property.
2820     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2821           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2822         for (;;) {
2823             *nextChar = fText->char32At(nPos);
2824             if (!fCM->contains(*nextChar)) {
2825                 break;
2826             }
2827             nPos = fText->moveIndex32(nPos, 1);
2828         }
2829     }
2830 
2831 
2832     // LB 9 Treat X CM* as if it were x.
2833     //       No explicit action required.
2834 
2835     // LB 10  Treat any remaining combining mark as AL
2836     if (fCM->contains(*posChar)) {
2837         *posChar = u'A';
2838     }
2839 
2840     // Push the updated nextPos and nextChar back to our caller.
2841     // This only makes a difference if posChar got bigger by consuming a
2842     // combining sequence.
2843     *nextPos  = nPos;
2844     *nextChar = fText->char32At(nPos);
2845 }
2846 
2847 
2848 
next(int32_t startPos)2849 int32_t RBBILineMonkey::next(int32_t startPos) {
2850     UErrorCode status = U_ZERO_ERROR;
2851     int32_t    pos;       //  Index of the char following a potential break position
2852     UChar32    thisChar;  //  Character at above position "pos"
2853 
2854     int32_t    prevPos;   //  Index of the char preceding a potential break position
2855     UChar32    prevChar;  //  Character at above position.  Note that prevChar
2856                           //   and thisChar may not be adjacent because combining
2857                           //   characters between them will be ignored.
2858 
2859     int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
2860     UChar32    prevCharX2;
2861 
2862     int32_t    nextPos;   //  Index of the next character following pos.
2863                           //     Usually skips over combining marks.
2864     int32_t    nextCPPos; //  Index of the code point following "pos."
2865                           //     May point to a combining mark.
2866     int32_t    tPos;      //  temp value.
2867     UChar32    c;
2868 
2869     if (U_FAILURE(deferredStatus)) {
2870         return -1;
2871     }
2872 
2873     if (startPos >= fText->length()) {
2874         return -1;
2875     }
2876 
2877 
2878     // Initial values for loop.  Loop will run the first time without finding breaks,
2879     //                           while the invalid values shift out and the "this" and
2880     //                           "prev" positions are filled in with good values.
2881     pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
2882     thisChar = prevChar  = prevCharX2 = 0;
2883     nextPos  = nextCPPos = startPos;
2884 
2885 
2886     // Loop runs once per position in the test text, until a break position
2887     //  is found.
2888     for (;;) {
2889         prevPosX2 = prevPos;
2890         prevCharX2 = prevChar;
2891 
2892         prevPos   = pos;
2893         prevChar  = thisChar;
2894 
2895         pos       = nextPos;
2896         thisChar  = fText->char32At(pos);
2897 
2898         nextCPPos = fText->moveIndex32(pos, 1);
2899         nextPos   = nextCPPos;
2900 
2901 
2902         if (pos >= fText->length()) {
2903             setAppliedRule(pos, "LB2 - Break at end of text.");
2904             break;
2905         }
2906 
2907 
2908         //             We do this one out-of-order because the adjustment does not change anything
2909         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
2910         //             be applied.
2911         rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
2912         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
2913         c = fText->char32At(nextPos);
2914         rule9Adjust(pos, &thisChar, &nextPos, &c);
2915 
2916         // If the loop is still warming up - if we haven't shifted the initial
2917         //   -1 positions out of prevPos yet - loop back to advance the
2918         //    position in the input without any further looking for breaks.
2919         if (prevPos == -1) {
2920           setAppliedRule(pos, "LB 9 - adjust for combining sequences.");
2921             continue;
2922         }
2923 
2924 
2925         if (fBK->contains(prevChar)) {
2926             setAppliedRule(pos, "LB 4  Always break after hard line breaks");
2927             break;
2928         }
2929 
2930 
2931         if (prevChar == 0x0d && thisChar == 0x0a) {
2932             setAppliedRule(pos, "LB 5  Break after CR, LF, NL, but not inside CR LF");
2933             continue;
2934         }
2935         if (prevChar == 0x0d ||
2936             prevChar == 0x0a ||
2937             prevChar == 0x85)  {
2938             setAppliedRule(pos, "LB 5  Break after CR, LF, NL, but not inside CR LF");
2939             break;
2940         }
2941 
2942 
2943         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
2944             fBK->contains(thisChar)) {
2945             setAppliedRule(pos, "LB 6  Don't break before hard line breaks");
2946             continue;
2947         }
2948 
2949 
2950         if (fSP->contains(thisChar)) {
2951             setAppliedRule(pos, "LB 7  Don't break before spaces or zero-width space.");
2952             continue;
2953         }
2954 
2955         // !!! ??? Is this the right text for the applied rule?
2956         if (fZW->contains(thisChar)) {
2957             setAppliedRule(pos, "LB 7  Don't break before spaces or zero-width space.");
2958             continue;
2959         }
2960 
2961 
2962         //       ZW SP* ÷
2963         //       Scan backwards from prevChar for SP* ZW
2964         tPos = prevPos;
2965         while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
2966             tPos = fText->moveIndex32(tPos, -1);
2967         }
2968         if (fZW->contains(fText->char32At(tPos))) {
2969             setAppliedRule(pos, "LB 8  Break after zero width space");
2970             break;
2971         }
2972 
2973 
2974         //          Move this test up, before LB8a, because numbers can match a longer sequence that would
2975         //          also match 8a.  e.g. NU ZWJ IS PO     (ZWJ acts like CM)
2976         if (fNumberMatcher->lookingAt(prevPos, status)) {
2977             if (U_FAILURE(status)) {
2978                 setAppliedRule(pos, "LB 25 Numbers");
2979                 break;
2980             }
2981             // Matched a number.  But could have been just a single digit, which would
2982             //    not represent a "no break here" between prevChar and thisChar
2983             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
2984             if (numEndIdx > pos) {
2985                 // Number match includes at least our two chars being checked
2986                 if (numEndIdx > nextPos) {
2987                     // Number match includes additional chars.  Update pos and nextPos
2988                     //   so that next loop iteration will continue at the end of the number,
2989                     //   checking for breaks between last char in number & whatever follows.
2990                     pos = nextPos = numEndIdx;
2991                     do {
2992                         pos = fText->moveIndex32(pos, -1);
2993                         thisChar = fText->char32At(pos);
2994                     } while (fCM->contains(thisChar));
2995                 }
2996                 setAppliedRule(pos, "LB 25 Numbers");
2997                 continue;
2998             }
2999         }
3000 
3001 
3002         //       The monkey test's way of ignoring combining characters doesn't work
3003         //       for this rule. ZJ is also a CM. Need to get the actual character
3004         //       preceding "thisChar", not ignoring combining marks, possibly ZJ.
3005         {
3006             int32_t prevIdx = fText->moveIndex32(pos, -1);
3007             UChar32 prevC = fText->char32At(prevIdx);
3008             if (fZWJ->contains(prevC)) {
3009                 setAppliedRule(pos, "LB 8a ZWJ x");
3010                 continue;
3011             }
3012         }
3013 
3014 
3015         // appliedRule: "LB 9, 10"; //  Already done, at top of loop.";
3016         //
3017 
3018 
3019         //    x  WJ
3020         //    WJ  x
3021         //
3022         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3023             setAppliedRule(pos, "LB 11  Do not break before or after WORD JOINER and related characters.");
3024             continue;
3025         }
3026 
3027 
3028         if (fGL->contains(prevChar)) {
3029             setAppliedRule(pos, "LB 12  GL  x");
3030             continue;
3031         }
3032 
3033 
3034           if (!(fSP->contains(prevChar) ||
3035               fBA->contains(prevChar) ||
3036               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
3037               setAppliedRule(pos, "LB 12a  [^SP BA HY] x GL");
3038               continue;
3039         }
3040 
3041 
3042         if (fCL->contains(thisChar) ||
3043                 fCP->contains(thisChar) ||
3044                 fEX->contains(thisChar) ||
3045                 fSY->contains(thisChar)) {
3046             setAppliedRule(pos, "LB 13  Don't break before closings.");
3047             continue;
3048         }
3049 
3050 
3051         //       Scan backwards, checking for this sequence.
3052         //       The OP char could include combining marks, so we actually check for
3053         //           OP CM* SP*
3054         //       Another Twist: The Rule 9 fixes may have changed a SP CM
3055         //       sequence into a ID char, so before scanning back through spaces,
3056         //       verify that prevChar is indeed a space.  The prevChar variable
3057         //       may differ from fText[prevPos]
3058         tPos = prevPos;
3059         if (fSP->contains(prevChar)) {
3060             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3061                 tPos=fText->moveIndex32(tPos, -1);
3062             }
3063         }
3064         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3065             tPos=fText->moveIndex32(tPos, -1);
3066         }
3067         if (fOP->contains(fText->char32At(tPos))) {
3068             setAppliedRule(pos, "LB 14 Don't break after OP SP*");
3069             continue;
3070         }
3071 
3072 
3073         if (nextPos < fText->length()) {
3074             // note: UnicodeString::char32At(length) returns ffff, not distinguishable
3075             //       from a legit ffff character. So test length separately.
3076             UChar32 nextChar = fText->char32At(nextPos);
3077             if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) {
3078                 setAppliedRule(pos, "LB 14a Break before an IS that begins a number and follows a space");
3079                 break;
3080             }
3081         }
3082 
3083 
3084           if (fIS->contains(thisChar)) {
3085               setAppliedRule(pos, "LB 14b  Do not break before numeric separators, even after spaces.");
3086               continue;
3087         }
3088 
3089 
3090         if (fOP->contains(thisChar)) {
3091             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3092             int tPos = prevPos;
3093             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3094                 tPos = fText->moveIndex32(tPos, -1);
3095             }
3096             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3097                 tPos = fText->moveIndex32(tPos, -1);
3098             }
3099             if (fQU->contains(fText->char32At(tPos))) {
3100                 setAppliedRule(pos, "LB 15    QU SP* x OP");
3101                 continue;
3102             }
3103         }
3104 
3105 
3106         //    Scan backwards for SP* CM* (CL | CP)
3107         if (fNS->contains(thisChar)) {
3108             int tPos = prevPos;
3109             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3110                 tPos = fText->moveIndex32(tPos, -1);
3111             }
3112             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3113                 tPos = fText->moveIndex32(tPos, -1);
3114             }
3115             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3116                 setAppliedRule(pos, "LB 16   (CL | CP) SP* x NS");
3117                 continue;
3118             }
3119         }
3120 
3121 
3122         if (fB2->contains(thisChar)) {
3123             //  Scan backwards, checking for the B2 CM* SP* sequence.
3124             tPos = prevPos;
3125             if (fSP->contains(prevChar)) {
3126                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3127                     tPos=fText->moveIndex32(tPos, -1);
3128                 }
3129             }
3130             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3131                 tPos=fText->moveIndex32(tPos, -1);
3132             }
3133             if (fB2->contains(fText->char32At(tPos))) {
3134                 setAppliedRule(pos, "LB 17   B2 SP* x B2");
3135                 continue;
3136             }
3137         }
3138 
3139 
3140         if (fSP->contains(prevChar)) {
3141             setAppliedRule(pos, "LB 18    break after space");
3142             break;
3143         }
3144 
3145         //    x   QU
3146         //    QU  x
3147         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3148             setAppliedRule(pos, "LB 19");
3149             continue;
3150         }
3151 
3152         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3153             setAppliedRule(pos, "LB 20  Break around a CB");
3154             break;
3155         }
3156 
3157         //           Don't break between Hyphens and letters if a break precedes the hyphen.
3158         //           Formerly this was a Finnish tailoring.
3159         //           Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
3160         //           ^($HY | $HH) $AL;
3161         if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
3162                 prevPosX2 == -1) {
3163             setAppliedRule(pos, "LB 20.09");
3164             continue;
3165         }
3166 
3167         if (fBA->contains(thisChar) ||
3168             fHY->contains(thisChar) ||
3169             fNS->contains(thisChar) ||
3170             fBB->contains(prevChar) )   {
3171             setAppliedRule(pos, "LB 21");
3172             continue;
3173         }
3174 
3175         if (fHL->contains(prevCharX2) &&
3176                 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3177             setAppliedRule(pos, "LB 21a   HL (HY | BA) x");
3178             continue;
3179         }
3180 
3181         if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3182             setAppliedRule(pos, "LB 21b SY x HL");
3183             continue;
3184         }
3185 
3186         if (fIN->contains(thisChar))   {
3187             setAppliedRule(pos, "LB 22");
3188             continue;
3189         }
3190 
3191 
3192         //          (AL | HL) x NU
3193         //          NU x (AL | HL)
3194         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
3195             setAppliedRule(pos, "LB 23");
3196             continue;
3197         }
3198         if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3199             setAppliedRule(pos, "LB 23");
3200             continue;
3201         }
3202 
3203         // Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3204         //      PR x (ID | EB | EM)
3205         //     (ID | EB | EM) x PO
3206         if (fPR->contains(prevChar) &&
3207                 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar)))  {
3208             setAppliedRule(pos, "LB 23a");
3209             continue;
3210         }
3211         if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
3212                 fPO->contains(thisChar)) {
3213             setAppliedRule(pos, "LB 23a");
3214             continue;
3215         }
3216 
3217         //   Do not break between prefix and letters or ideographs.
3218         //         (PR | PO) x (AL | HL)
3219         //         (AL | HL) x (PR | PO)
3220         if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
3221                 (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3222             setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3223             continue;
3224         }
3225         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
3226                 (fPR->contains(thisChar) || fPO->contains(thisChar))) {
3227             setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3228             continue;
3229         }
3230 
3231         // appliedRule: "LB 25 numbers match"; // moved up, before LB 8a,
3232 
3233         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3234                                         fJV->contains(thisChar) ||
3235                                         fH2->contains(thisChar) ||
3236                                         fH3->contains(thisChar))) {
3237             setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3238             continue;
3239                                         }
3240 
3241         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3242             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3243             setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3244             continue;
3245         }
3246 
3247         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3248             fJT->contains(thisChar)) {
3249             setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3250             continue;
3251         }
3252 
3253         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3254             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3255             fPO->contains(thisChar)) {
3256             setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3257             continue;
3258         }
3259         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3260             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3261             setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3262             continue;
3263         }
3264 
3265 
3266 
3267         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3268             setAppliedRule(pos, "LB 28  Do not break between alphabetics (\"at\").");
3269             continue;
3270         }
3271 
3272           if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3273               setAppliedRule(pos, "LB 29  Do not break between numeric punctuation and alphabetics (\"e.g.\").");
3274               continue;
3275         }
3276 
3277         //          (AL | NU) x OP
3278         //          CP x (AL | NU)
3279         if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP30->contains(thisChar)) {
3280             setAppliedRule(pos,  "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3281             continue;
3282         }
3283         if (fCP30->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3284             setAppliedRule(pos,  "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3285             continue;
3286         }
3287 
3288         //             RI  x  RI
3289         if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3290             setAppliedRule(pos, "LB30a    RI RI  ÷  RI");
3291             break;
3292         }
3293         if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3294             // Two Regional Indicators have been paired.
3295             // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
3296             // following RI. This is a hack.
3297             thisChar = -1;
3298             setAppliedRule(pos, "LB30a    RI RI  ÷  RI");
3299             continue;
3300         }
3301 
3302         // LB30b Do not break between an emoji base (or potential emoji) and an emoji modifier.
3303         if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3304             setAppliedRule(pos, "LB30b    Emoji Base x Emoji Modifier");
3305             continue;
3306         }
3307 
3308         if (fExtPictUnassigned->contains(prevChar) && fEM->contains(thisChar)) {
3309             setAppliedRule(pos, "LB30b    [\\p{Extended_Pictographic}&\\p{Cn}] × EM");
3310             continue;
3311         }
3312 
3313         setAppliedRule(pos, "LB 31    Break everywhere else");
3314         break;
3315     }
3316 
3317     return pos;
3318 }
3319 
3320 
charClasses()3321 UVector  *RBBILineMonkey::charClasses() {
3322     return fSets;
3323 }
3324 
3325 
~RBBILineMonkey()3326 RBBILineMonkey::~RBBILineMonkey() {
3327     delete fSets;
3328 
3329     delete fBK;
3330     delete fCR;
3331     delete fLF;
3332     delete fCM;
3333     delete fNL;
3334     delete fWJ;
3335     delete fZW;
3336     delete fGL;
3337     delete fCB;
3338     delete fSP;
3339     delete fB2;
3340     delete fBA;
3341     delete fBB;
3342     delete fHH;
3343     delete fHY;
3344     delete fH2;
3345     delete fH3;
3346     delete fCL;
3347     delete fCP;
3348     delete fEX;
3349     delete fIN;
3350     delete fJL;
3351     delete fJV;
3352     delete fJT;
3353     delete fNS;
3354     delete fOP;
3355     delete fQU;
3356     delete fIS;
3357     delete fNU;
3358     delete fPO;
3359     delete fPR;
3360     delete fSY;
3361     delete fAI;
3362     delete fAL;
3363     delete fCJ;
3364     delete fHL;
3365     delete fID;
3366     delete fRI;
3367     delete fSG;
3368     delete fXX;
3369     delete fEB;
3370     delete fEM;
3371     delete fZWJ;
3372     delete fOP30;
3373     delete fCP30;
3374     delete fExtPictUnassigned;
3375 
3376     delete fCharBI;
3377     delete fNumberMatcher;
3378 }
3379 
3380 
3381 //-------------------------------------------------------------------------------------------
3382 //
3383 //   TestMonkey
3384 //
3385 //     params
3386 //       seed=nnnnn        Random number starting seed.
3387 //                         Setting the seed allows errors to be reproduced.
3388 //       loop=nnn          Looping count.  Controls running time.
3389 //                         -1:  run forever.
3390 //                          0 or greater:  run length.
3391 //
3392 //       type = char | word | line | sent | title
3393 //
3394 //  Example:
3395 //     intltest  rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3396 //
3397 //-------------------------------------------------------------------------------------------
3398 
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)3399 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3400     int32_t val = defaultVal;
3401     name.append(" *= *(-?\\d+)");
3402     UErrorCode status = U_ZERO_ERROR;
3403     RegexMatcher m(name, params, 0, status);
3404     if (m.find()) {
3405         // The param exists.  Convert the string to an int.
3406         char valString[100];
3407         int32_t paramLength = m.end(1, status) - m.start(1, status);
3408         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3409             paramLength = (int32_t)(sizeof(valString)-2);
3410         }
3411         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3412         val = strtol(valString, NULL, 10);
3413 
3414         // Delete this parameter from the params string.
3415         m.reset();
3416         params = m.replaceFirst("", status);
3417     }
3418     U_ASSERT(U_SUCCESS(status));
3419     return val;
3420 }
3421 #endif
3422 
3423 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
testBreakBoundPreceding(RBBITest * test,UnicodeString ustr,BreakIterator * bi,int expected[],int expectedcount)3424 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3425                                     BreakIterator *bi,
3426                                     int expected[],
3427                                     int expectedcount)
3428 {
3429     int count = 0;
3430     int i = 0;
3431     int forward[50];
3432     bi->setText(ustr);
3433     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3434         forward[count] = i;
3435         if (count < expectedcount && expected[count] != i) {
3436             test->errln("%s:%d break forward test failed: expected %d but got %d",
3437                         __FILE__, __LINE__, expected[count], i);
3438             break;
3439         }
3440         count ++;
3441     }
3442     if (count != expectedcount) {
3443         printStringBreaks(ustr, expected, expectedcount);
3444         test->errln("%s:%d break forward test failed: missed %d match",
3445                     __FILE__, __LINE__, expectedcount - count);
3446         return;
3447     }
3448     // testing boundaries
3449     for (i = 1; i < expectedcount; i ++) {
3450         int j = expected[i - 1];
3451         if (!bi->isBoundary(j)) {
3452             printStringBreaks(ustr, expected, expectedcount);
3453             test->errln("%s:%d isBoundary() failed.  Expected boundary at position %d",
3454                     __FILE__, __LINE__, j);
3455             return;
3456         }
3457         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3458             if (bi->isBoundary(j)) {
3459                 printStringBreaks(ustr, expected, expectedcount);
3460                 test->errln("%s:%d isBoundary() failed.  Not expecting boundary at position %d",
3461                     __FILE__, __LINE__, j);
3462                 return;
3463             }
3464         }
3465     }
3466 
3467     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3468         count --;
3469         if (forward[count] != i) {
3470             printStringBreaks(ustr, expected, expectedcount);
3471             test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
3472                         __FILE__, __LINE__, forward[count], i);
3473             break;
3474         }
3475     }
3476     if (count != 0) {
3477         printStringBreaks(ustr, expected, expectedcount);
3478         test->errln("break test previous() failed: missed a match");
3479         return;
3480     }
3481 
3482     // testing preceding
3483     for (i = 0; i < expectedcount - 1; i ++) {
3484         // int j = expected[i] + 1;
3485         int j = ustr.moveIndex32(expected[i], 1);
3486         for (; j <= expected[i + 1]; j ++) {
3487             int32_t expectedPreceding = expected[i];
3488             int32_t actualPreceding = bi->preceding(j);
3489             if (actualPreceding != expectedPreceding) {
3490                 printStringBreaks(ustr, expected, expectedcount);
3491                 test->errln("%s:%d preceding(%d): expected %d, got %d",
3492                         __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
3493                 return;
3494             }
3495         }
3496     }
3497 }
3498 #endif
3499 
TestWordBreaks(void)3500 void RBBITest::TestWordBreaks(void)
3501 {
3502 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3503 
3504     Locale        locale("en");
3505     UErrorCode    status = U_ZERO_ERROR;
3506     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3507     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3508     // Replaced any C+J characters in a row with a random sequence of characters
3509     // of the same length to make our C+J segmentation not get in the way.
3510     static const char *strlist[] =
3511     {
3512     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3513     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3514     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3515     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3516     "\\uac00\\u3588\\u009c\\u0953\\u194b",
3517     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3518     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3519     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3520     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3521     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3522     "\\u2027\\U000e0067\\u0a47\\u00b7",
3523     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3524     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3525     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3526     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3527     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3528     "\\u0027\\u11af\\U000e0057\\u0602",
3529     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3530     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3531     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3532     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3533     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3534     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3535     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3536     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3537     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3538     "\\u18f4\\U000e0049\\u20e7\\u2027",
3539     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3540     "\\ua183\\u102d\\u0bec\\u003a",
3541     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3542     "\\u003a\\u0e57\\u0fad\\u002e",
3543     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3544     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3545     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3546     "\\u003a\\u0664\\u00b7\\u1fba",
3547     "\\u003b\\u0027\\u00b7\\u47a3",
3548     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3549     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3550     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3551     };
3552     int loop;
3553     if (U_FAILURE(status)) {
3554         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3555         return;
3556     }
3557     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3558         // printf("looping %d\n", loop);
3559         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3560         // RBBICharMonkey monkey;
3561         RBBIWordMonkey monkey;
3562 
3563         int expected[50];
3564         int expectedcount = 0;
3565 
3566         monkey.setText(ustr);
3567         int i;
3568         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3569             expected[expectedcount ++] = i;
3570         }
3571 
3572         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3573     }
3574     delete bi;
3575 #endif
3576 }
3577 
TestWordBoundary(void)3578 void RBBITest::TestWordBoundary(void)
3579 {
3580     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3581     Locale        locale("en");
3582     UErrorCode    status = U_ZERO_ERROR;
3583     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3584     LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
3585     if (U_FAILURE(status)) {
3586         errcheckln(status, "%s:%d Creation of break iterator failed %s",
3587                 __FILE__, __LINE__, u_errorName(status));
3588         return;
3589     }
3590     UChar         str[50];
3591     static const char *strlist[] =
3592     {
3593     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3594     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3595     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3596     "\\u2027\\U000e0067\\u0a47\\u00b7",
3597     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3598     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3599     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3600     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3601     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3602     "\\u0027\\u11af\\U000e0057\\u0602",
3603     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3604     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3605     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3606     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3607     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3608     "\\U000e0065\\u302c\\u09ee\\U000e0068",
3609     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3610     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3611     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3612     "\\u58f4\\U000e0049\\u20e7\\u2027",
3613     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3614     "\\ua183\\u102d\\u0bec\\u003a",
3615     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3616     "\\u003a\\u0e57\\u0fad\\u002e",
3617     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3618     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3619     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3620     "\\u003a\\u0664\\u00b7\\u1fba",
3621     "\\u003b\\u0027\\u00b7\\u47a3",
3622     };
3623     int loop;
3624     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3625         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3626         UnicodeString ustr(str);
3627         int forward[50];
3628         int count = 0;
3629 
3630         bi->setText(ustr);
3631         int prev = -1;
3632         for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
3633             ++count;
3634             if (count >= UPRV_LENGTHOF(forward)) {
3635                 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
3636                         __FILE__, __LINE__, loop, count, boundary);
3637                 return;
3638             }
3639             forward[count] = boundary;
3640             if (boundary <= prev) {
3641                 errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
3642                         __FILE__, __LINE__, loop, prev, boundary);
3643                 break;
3644             }
3645             for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
3646                 if (bi->isBoundary(nonBoundary)) {
3647                     printStringBreaks(ustr, forward, count);
3648                     errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
3649                            __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
3650                     return;
3651                 }
3652             }
3653             if (!bi->isBoundary(boundary)) {
3654                 printStringBreaks(ustr, forward, count);
3655                 errln("%s:%d happy boundary test failed: expected %d a boundary",
3656                        __FILE__, __LINE__, boundary);
3657                 return;
3658             }
3659             prev = boundary;
3660         }
3661     }
3662 }
3663 
TestLineBreaks(void)3664 void RBBITest::TestLineBreaks(void)
3665 {
3666 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3667     Locale        locale("en");
3668     UErrorCode    status = U_ZERO_ERROR;
3669     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3670     const int32_t  STRSIZE = 50;
3671     UChar         str[STRSIZE];
3672     static const char *strlist[] =
3673     {
3674      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3675      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3676              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3677      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3678              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3679      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3680      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3681      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3682      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3683      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3684      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3685      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3686      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3687      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3688      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3689      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3690      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3691      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3692      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3693      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3694      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3695      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3696      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3697      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3698      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3699      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3700      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3701      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3702      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3703      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3704      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3705      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3706      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3707      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3708      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3709      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3710      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3711      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3712      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3713          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3714     };
3715     int loop;
3716     TEST_ASSERT_SUCCESS(status);
3717     if (U_FAILURE(status)) {
3718         return;
3719     }
3720     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3721         // printf("looping %d\n", loop);
3722         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3723         if (t >= STRSIZE) {
3724             TEST_ASSERT(false);
3725             continue;
3726         }
3727 
3728 
3729         UnicodeString ustr(str);
3730         RBBILineMonkey monkey;
3731         if (U_FAILURE(monkey.deferredStatus)) {
3732             continue;
3733         }
3734 
3735         const int EXPECTEDSIZE = 50;
3736         int expected[EXPECTEDSIZE];
3737         int expectedcount = 0;
3738 
3739         monkey.setText(ustr);
3740 
3741         int i;
3742         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3743             if (expectedcount >= EXPECTEDSIZE) {
3744                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3745                 return;
3746             }
3747             expected[expectedcount ++] = i;
3748         }
3749 
3750         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3751     }
3752     delete bi;
3753 #endif
3754 }
3755 
TestSentBreaks(void)3756 void RBBITest::TestSentBreaks(void)
3757 {
3758 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3759     Locale        locale("en");
3760     UErrorCode    status = U_ZERO_ERROR;
3761     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3762     UChar         str[200];
3763     static const char *strlist[] =
3764     {
3765      "Now\ris\nthe\r\ntime\n\rfor\r\r",
3766      "This\n",
3767      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3768      "\"Sentence ending with a quote.\" Bye.",
3769      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
3770      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3771      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3772      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3773      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3774      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3775      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3776              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3777              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3778              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3779      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3780              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3781              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3782              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3783              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3784              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3785     };
3786     int loop;
3787     if (U_FAILURE(status)) {
3788         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3789         return;
3790     }
3791     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3792         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3793         UnicodeString ustr(str);
3794 
3795         RBBISentMonkey monkey;
3796         if (U_FAILURE(monkey.deferredStatus)) {
3797             continue;
3798         }
3799 
3800         const int EXPECTEDSIZE = 50;
3801         int expected[EXPECTEDSIZE];
3802         int expectedcount = 0;
3803 
3804         monkey.setText(ustr);
3805 
3806         int i;
3807         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3808             if (expectedcount >= EXPECTEDSIZE) {
3809                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3810                 return;
3811             }
3812             expected[expectedcount ++] = i;
3813         }
3814 
3815         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3816     }
3817     delete bi;
3818 #endif
3819 }
3820 
TestMonkey()3821 void RBBITest::TestMonkey() {
3822 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3823 
3824     UErrorCode     status    = U_ZERO_ERROR;
3825     int32_t        loopCount = 500;
3826     int32_t        seed      = 1;
3827     UnicodeString  breakType = "all";
3828     Locale         locale("en");
3829     UBool          useUText  = false;
3830 
3831     if (quick == false) {
3832         loopCount = 10000;
3833     }
3834 
3835     if (fTestParams) {
3836         UnicodeString p(fTestParams);
3837         loopCount = getIntParam("loop", p, loopCount);
3838         seed      = getIntParam("seed", p, seed);
3839 
3840         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3841         if (m.find()) {
3842             breakType = m.group(1, status);
3843             m.reset();
3844             p = m.replaceFirst("", status);
3845         }
3846 
3847         RegexMatcher u(" *utext", p, 0, status);
3848         if (u.find()) {
3849             useUText = true;
3850             u.reset();
3851             p = u.replaceFirst("", status);
3852         }
3853 
3854 
3855         // m.reset(p);
3856         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3857             // Each option is stripped out of the option string as it is processed.
3858             // All options have been checked.  The option string should have been completely emptied..
3859             char buf[100];
3860             p.extract(buf, sizeof(buf), NULL, status);
3861             buf[sizeof(buf)-1] = 0;
3862             errln("Unrecognized or extra parameter:  %s\n", buf);
3863             return;
3864         }
3865 
3866     }
3867 
3868     if (breakType == "char" || breakType == "all") {
3869         RBBICharMonkey  m;
3870         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3871         if (U_SUCCESS(status)) {
3872             RunMonkey(bi, m, "char", seed, loopCount, useUText);
3873             if (breakType == "all" && useUText==false) {
3874                 // Also run a quick test with UText when "all" is specified
3875                 RunMonkey(bi, m, "char", seed, loopCount, true);
3876             }
3877         }
3878         else {
3879             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3880         }
3881         delete bi;
3882     }
3883 
3884     if (breakType == "word" || breakType == "all") {
3885         logln("Word Break Monkey Test");
3886         RBBIWordMonkey  m;
3887         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
3888         if (U_SUCCESS(status)) {
3889             RunMonkey(bi, m, "word", seed, loopCount, useUText);
3890         }
3891         else {
3892             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3893         }
3894         delete bi;
3895     }
3896 
3897     if (breakType == "line" || breakType == "all") {
3898         logln("Line Break Monkey Test");
3899         RBBILineMonkey  m;
3900         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
3901         if (loopCount >= 10) {
3902             loopCount = loopCount / 5;   // Line break runs slower than the others.
3903         }
3904         if (U_SUCCESS(status)) {
3905             RunMonkey(bi, m, "line", seed, loopCount, useUText);
3906         }
3907         else {
3908             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3909         }
3910         delete bi;
3911     }
3912 
3913     if (breakType == "sent" || breakType == "all"  ) {
3914         logln("Sentence Break Monkey Test");
3915         RBBISentMonkey  m;
3916         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
3917         if (loopCount >= 10) {
3918             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
3919         }
3920         if (U_SUCCESS(status)) {
3921             RunMonkey(bi, m, "sent", seed, loopCount, useUText);
3922         }
3923         else {
3924             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3925         }
3926         delete bi;
3927     }
3928 
3929 #endif
3930 }
3931 
3932 //
3933 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
3934 //    Parameters:
3935 //       bi      - the break iterator to use
3936 //       mk      - MonkeyKind, abstraction for obtaining expected results
3937 //       name    - Name of test (char, word, etc.) for use in error messages
3938 //       seed    - Seed for starting random number generator (parameter from user)
3939 //       numIterations
3940 //
RunMonkey(BreakIterator * bi,RBBIMonkeyKind & mk,const char * name,uint32_t seed,int32_t numIterations,UBool useUText)3941 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
3942                          int32_t numIterations, UBool useUText) {
3943 
3944 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3945 
3946     const int32_t    TESTSTRINGLEN = 500;
3947     UnicodeString    testText;
3948     int32_t          numCharClasses;
3949     UVector          *chClasses;
3950     int              expectedCount = 0;
3951     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
3952     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
3953     char             reverseBreaks[TESTSTRINGLEN*2+1];
3954     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
3955     char             followingBreaks[TESTSTRINGLEN*2+1];
3956     char             precedingBreaks[TESTSTRINGLEN*2+1];
3957     int              i;
3958     int              loopCount = 0;
3959 
3960 
3961     m_seed = seed;
3962 
3963     numCharClasses = mk.charClasses()->size();
3964     chClasses      = mk.charClasses();
3965 
3966     // Check for errors that occurred during the construction of the MonkeyKind object.
3967     //  Can't report them where they occurred because errln() is a method coming from intlTest,
3968     //  and is not visible outside of RBBITest :-(
3969     if (U_FAILURE(mk.deferredStatus)) {
3970         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
3971         return;
3972     }
3973 
3974     // Verify that the character classes all have at least one member.
3975     for (i=0; i<numCharClasses; i++) {
3976         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
3977         if (s == NULL || s->size() == 0) {
3978             errln("Character Class #%d is null or of zero size.", i);
3979             return;
3980         }
3981     }
3982 
3983     // For minimizing width of class name output.
3984     int classNameSize = mk.maxClassNameSize();
3985 
3986     while (loopCount < numIterations || numIterations == -1) {
3987         if (numIterations == -1 && loopCount % 10 == 0) {
3988             // If test is running in an infinite loop, display a periodic tic so
3989             //   we can tell that it is making progress.
3990             fprintf(stderr, ".");
3991         }
3992         // Save current random number seed, so that we can recreate the random numbers
3993         //   for this loop iteration in event of an error.
3994         seed = m_seed;
3995 
3996         // Populate a test string with data.
3997         testText.truncate(0);
3998         for (i=0; i<TESTSTRINGLEN; i++) {
3999             int32_t  aClassNum = m_rand() % numCharClasses;
4000             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4001             int32_t   charIdx = m_rand() % classSet->size();
4002             UChar32   c = classSet->charAt(charIdx);
4003             if (c < 0) {   // TODO:  deal with sets containing strings.
4004                 errln("%s:%d c < 0", __FILE__, __LINE__);
4005                 break;
4006             }
4007             // Do not assemble a supplementary character from randomly generated separate surrogates.
4008             //   (It could be a dictionary character)
4009             if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4010                 continue;
4011             }
4012 
4013             testText.append(c);
4014         }
4015 
4016         // Calculate the expected results for this test string and reset applied rules.
4017         mk.setText(testText);
4018 
4019         memset(expectedBreaks, 0, sizeof(expectedBreaks));
4020         expectedBreaks[0] = 1;
4021         int32_t breakPos = 0;
4022         expectedCount = 0;
4023         for (;;) {
4024             breakPos = mk.next(breakPos);
4025             if (breakPos == -1) {
4026                 break;
4027             }
4028             if (breakPos > testText.length()) {
4029                 errln("breakPos > testText.length()");
4030             }
4031             expectedBreaks[breakPos] = 1;
4032             expectedCount++;
4033             U_ASSERT(expectedCount<testText.length());
4034         }
4035 
4036         // Find the break positions using forward iteration
4037         memset(forwardBreaks, 0, sizeof(forwardBreaks));
4038         if (useUText) {
4039             UErrorCode status = U_ZERO_ERROR;
4040             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4041             // testUText = utext_openUnicodeString(testUText, &testText, &status);
4042             bi->setText(testUText, status);
4043             TEST_ASSERT_SUCCESS(status);
4044             utext_close(testUText);   // The break iterator does a shallow clone of the UText
4045                                       //  This UText can be closed immediately, so long as the
4046                                       //  testText string continues to exist.
4047         } else {
4048             bi->setText(testText);
4049         }
4050 
4051         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4052             if (i < 0 || i > testText.length()) {
4053                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4054                 break;
4055             }
4056             forwardBreaks[i] = 1;
4057         }
4058 
4059         // Find the break positions using reverse iteration
4060         memset(reverseBreaks, 0, sizeof(reverseBreaks));
4061         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4062             if (i < 0 || i > testText.length()) {
4063                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4064                 break;
4065             }
4066             reverseBreaks[i] = 1;
4067         }
4068 
4069         // Find the break positions using isBoundary() tests.
4070         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4071         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4072         for (i=0; i<=testText.length(); i++) {
4073             isBoundaryBreaks[i] = bi->isBoundary(i);
4074         }
4075 
4076 
4077         // Find the break positions using the following() function.
4078         // printf(".");
4079         memset(followingBreaks, 0, sizeof(followingBreaks));
4080         int32_t   lastBreakPos = 0;
4081         followingBreaks[0] = 1;
4082         for (i=0; i<testText.length(); i++) {
4083             breakPos = bi->following(i);
4084             if (breakPos <= i ||
4085                 breakPos < lastBreakPos ||
4086                 breakPos > testText.length() ||
4087                 (breakPos > lastBreakPos && lastBreakPos > i)) {
4088                 errln("%s break monkey test: "
4089                     "Out of range value returned by BreakIterator::following().\n"
4090                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4091                          name, seed, i, breakPos, lastBreakPos);
4092                 break;
4093             }
4094             followingBreaks[breakPos] = 1;
4095             lastBreakPos = breakPos;
4096         }
4097 
4098         // Find the break positions using the preceding() function.
4099         memset(precedingBreaks, 0, sizeof(precedingBreaks));
4100         lastBreakPos = testText.length();
4101         precedingBreaks[testText.length()] = 1;
4102         for (i=testText.length(); i>0; i--) {
4103             breakPos = bi->preceding(i);
4104             if (breakPos >= i ||
4105                 breakPos > lastBreakPos ||
4106                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4107                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4108                 errln("%s break monkey test: "
4109                     "Out of range value returned by BreakIterator::preceding().\n"
4110                     "index=%d;  prev returned %d; lastBreak=%d" ,
4111                     name,  i, breakPos, lastBreakPos);
4112                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4113                     precedingBreaks[i] = 2;   // Forces an error.
4114                 }
4115             } else {
4116                 if (breakPos >= 0) {
4117                     precedingBreaks[breakPos] = 1;
4118                 }
4119                 lastBreakPos = breakPos;
4120             }
4121         }
4122 
4123         // Compare the expected and actual results.
4124         for (i=0; i<=testText.length(); i++) {
4125             const char *errorType = NULL;
4126             const char* currentBreakData = NULL;
4127             if  (forwardBreaks[i] != expectedBreaks[i]) {
4128                 errorType = "next()";
4129                 currentBreakData = forwardBreaks;
4130             } else if (reverseBreaks[i] != forwardBreaks[i]) {
4131                 errorType = "previous()";
4132                 currentBreakData = reverseBreaks;
4133            } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4134                 errorType = "isBoundary()";
4135                 currentBreakData = isBoundaryBreaks;
4136             } else if (followingBreaks[i] != expectedBreaks[i]) {
4137                 errorType = "following()";
4138                 currentBreakData = followingBreaks;
4139             } else if (precedingBreaks[i] != expectedBreaks[i]) {
4140                 errorType = "preceding()";
4141                 currentBreakData = precedingBreaks;
4142             }
4143 
4144             if (errorType != NULL) {
4145                 // Format a range of the test text that includes the failure as
4146                 //  a data item that can be included in the rbbi test data file.
4147 
4148                 // Start of the range is the last point where expected and actual results
4149                 //  both agreed that there was a break position.
4150 
4151                 int startContext = i;
4152                 int32_t count = 0;
4153                 for (;;) {
4154                     if (startContext==0) { break; }
4155                     startContext --;
4156                     if (expectedBreaks[startContext] != 0) {
4157                         if (count == 2) break;
4158                         count ++;
4159                     }
4160                 }
4161 
4162                 // End of range is two expected breaks past the start position.
4163                 int endContext = i + 1;
4164                 int ci;
4165                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4166                     for (;;) {
4167                         if (endContext >= testText.length()) {break;}
4168                         if (expectedBreaks[endContext-1] != 0) {
4169                             if (count == 0) break;
4170                             count --;
4171                         }
4172                         endContext ++;
4173                     }
4174                 }
4175 
4176                 // Formatting of each line includes:
4177                 //   character code
4178                 //   reference break: '|' -> a break, '.' -> no break
4179                 //   actual break:    '|' -> a break, '.' -> no break
4180                 //   (name of character clase)
4181                 //   Unicode name of character
4182                 //   '-->' indicates location of the difference.
4183 
4184                 MONKEY_ERROR(
4185                     (expectedBreaks[i] ? "Break expected but not found" :
4186                        "Break found but not expected"),
4187                     name, i, seed);
4188 
4189                 for (ci=startContext; (ci = testText.moveIndex32(ci, 1));) {
4190                     UChar32  c;
4191                     c = testText.char32At(ci);
4192 
4193                     std::string currentLineFlag = "   ";
4194                     if (ci == i) {
4195                         currentLineFlag = "-->";  // Error position
4196                     }
4197 
4198                     // BMP or SMP character in hex
4199                     char hexCodePoint[12];
4200                     std::string format = "    \\u%04x";
4201                     if (c >= 0x10000) {
4202                         format = "\\U%08x";
4203                     }
4204                     sprintf(hexCodePoint, format.c_str(), c);
4205 
4206                     // Get the class name and character name for the character.
4207                     char cName[200];
4208                     UErrorCode status = U_ZERO_ERROR;
4209                     u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
4210 
4211                     char buffer[200];
4212                     auto ret = snprintf(buffer, UPRV_LENGTHOF(buffer),
4213                              "%4s %3i :  %1s  %1s  %10s  %-*s  %-40s  %-40s",
4214                              currentLineFlag.c_str(),
4215                              ci,
4216                              expectedBreaks[ci] == 0 ? "." : "|",  // Reference break
4217                              currentBreakData[ci] == 0 ? "." : "|",  // Actual break
4218                              hexCodePoint,
4219                              classNameSize,
4220                              mk.classNameFromCodepoint(c).c_str(),
4221                              mk.getAppliedRule(ci).c_str(), cName);
4222                     (void)ret;
4223                     U_ASSERT(0 <= ret && ret < UPRV_LENGTHOF(buffer));
4224 
4225                     // Output the error
4226                     if (ci == i) {
4227                         errln(buffer);
4228                     } else {
4229                         infoln(buffer);
4230                     }
4231 
4232                     if (ci >= endContext) { break; }
4233                 }
4234                 break;
4235             }
4236         }
4237 
4238         loopCount++;
4239     }
4240 #endif
4241 }
4242 
4243 
4244 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
4245 //             This test checks the initial patch,
4246 //             which is to just keep it from crashing.  Correct word boundaries
4247 //             await a proper fix to the dictionary code.
4248 //
TestBug5532(void)4249 void RBBITest::TestBug5532(void)  {
4250    // Text includes a mixture of Thai and Latin.
4251    const unsigned char utf8Data[] = {
4252            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4253            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4254            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4255            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4256            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4257            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4258            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4259            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4260            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4261            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4262            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4263 
4264     UErrorCode status = U_ZERO_ERROR;
4265     UText utext=UTEXT_INITIALIZER;
4266     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4267     TEST_ASSERT_SUCCESS(status);
4268 
4269     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4270     TEST_ASSERT_SUCCESS(status);
4271     if (U_SUCCESS(status)) {
4272         bi->setText(&utext, status);
4273         TEST_ASSERT_SUCCESS(status);
4274 
4275         int32_t breakCount = 0;
4276         int32_t previousBreak = -1;
4277         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4278             // For now, just make sure that the break iterator doesn't hang.
4279             TEST_ASSERT(previousBreak < bi->current());
4280             previousBreak = bi->current();
4281         }
4282         TEST_ASSERT(breakCount > 0);
4283     }
4284     delete bi;
4285     utext_close(&utext);
4286 }
4287 
4288 
TestBug9983(void)4289 void RBBITest::TestBug9983(void)  {
4290     UnicodeString text = UnicodeString("\\u002A"  // * Other
4291                                        "\\uFF65"  //   Other
4292                                        "\\u309C"  //   Katakana
4293                                        "\\uFF9F"  //   Extend
4294                                        "\\uFF65"  //   Other
4295                                        "\\u0020"  //   Other
4296                                        "\\u0000").unescape();
4297 
4298     UErrorCode status = U_ZERO_ERROR;
4299     LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4300         BreakIterator::createWordInstance(Locale::getRoot(), status)));
4301     TEST_ASSERT_SUCCESS(status);
4302     LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4303         BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4304     TEST_ASSERT_SUCCESS(status);
4305     if (U_FAILURE(status)) {
4306         return;
4307     }
4308     int32_t offset, rstatus, iterationCount;
4309 
4310     brkiter->setText(text);
4311     brkiter->last();
4312     iterationCount = 0;
4313     while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4314         iterationCount++;
4315         rstatus = brkiter->getRuleStatus();
4316         (void)rstatus;     // Suppress set but not used warning.
4317         if (iterationCount >= 10) {
4318            break;
4319         }
4320     }
4321     TEST_ASSERT(iterationCount == 6);
4322 
4323     brkiterPOSIX->setText(text);
4324     brkiterPOSIX->last();
4325     iterationCount = 0;
4326     while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4327         iterationCount++;
4328         rstatus = brkiterPOSIX->getRuleStatus();
4329         (void)rstatus;     // Suppress set but not used warning.
4330         if (iterationCount >= 10) {
4331            break;
4332         }
4333     }
4334     TEST_ASSERT(iterationCount == 6);
4335 }
4336 
4337 // Bug 7547 - verify that building a break itereator from empty rules produces an error.
4338 //
TestBug7547()4339 void RBBITest::TestBug7547() {
4340     UnicodeString rules;
4341     UErrorCode status = U_ZERO_ERROR;
4342     UParseError parseError;
4343     RuleBasedBreakIterator breakIterator(rules, parseError, status);
4344     if (status != U_BRK_RULE_SYNTAX) {
4345         errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4346     }
4347     if (parseError.line != 1 || parseError.offset != 0) {
4348         errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4349     }
4350 }
4351 
4352 
TestBug12797()4353 void RBBITest::TestBug12797() {
4354     UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4355     UErrorCode status = U_ZERO_ERROR;
4356     UParseError parseError;
4357     RuleBasedBreakIterator bi(rules, parseError, status);
4358     if (U_FAILURE(status)) {
4359         errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4360         return;
4361     }
4362     UnicodeString text = "abc";
4363     bi.setText(text);
4364     bi.first();
4365     int32_t boundary = bi.next();
4366     if (boundary != 3) {
4367         errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4368     }
4369 }
4370 
TestBug12918()4371 void RBBITest::TestBug12918() {
4372     // This test triggers an assertion failure in dictbe.cpp
4373     const UChar *crasherString = u"\u3325\u4a16";
4374     UErrorCode status = U_ZERO_ERROR;
4375     UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
4376     if (U_FAILURE(status)) {
4377         dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4378         return;
4379     }
4380     ubrk_first(iter);
4381     int32_t pos = 0;
4382     int32_t lastPos = -1;
4383     while((pos = ubrk_next(iter)) != UBRK_DONE) {
4384         if (pos <= lastPos) {
4385             errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4386             break;
4387         }
4388     }
4389     ubrk_close(iter);
4390 }
4391 
TestBug12932()4392 void RBBITest::TestBug12932() {
4393     // Node Stack overflow in the RBBI rule parser caused a seg fault.
4394     UnicodeString ruleStr(
4395             "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4396             "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4397             "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4398             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4399             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4400             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4401 
4402     UErrorCode status = U_ZERO_ERROR;
4403     UParseError parseError;
4404     RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
4405     if (status != U_BRK_RULE_SYNTAX) {
4406         errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4407                 __FILE__, __LINE__, u_errorName(status));
4408     }
4409 }
4410 
4411 
4412 // Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
//             remain undivided by ICU char, word and line break.
TestEmoji()4414 void RBBITest::TestEmoji() {
4415 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4416     UErrorCode  status = U_ZERO_ERROR;
4417 
4418     CharString testFileName;
4419     testFileName.append(IntlTest::getSourceTestData(status), status);
4420     testFileName.appendPathPart("emoji-test.txt", status);
4421     if (U_FAILURE(status)) {
4422         errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4423         return;
4424     }
4425     logln("Opening data file %s\n", testFileName.data());
4426 
4427     int    len;
4428     UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
4429     if (U_FAILURE(status) || testFile == NULL) {
4430         errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4431         return;
4432     }
4433     UnicodeString testFileAsString(testFile, len);
4434     delete [] testFile;
4435 
4436     RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
4437     RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
4438     //           hexMatcher group(1) is a hex number, or empty string if no hex number present.
4439     int32_t lineNumber = 0;
4440 
4441     LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4442     LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4443     LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
4444     if (U_FAILURE(status)) {
4445         dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
4446         return;
4447     }
4448 
4449     while (lineMatcher.find()) {
4450         ++lineNumber;
4451         UnicodeString line = lineMatcher.group(status);
4452         hexMatcher.reset(line);
4453         UnicodeString testString;   // accumulates the emoji sequence.
4454         while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
4455             UnicodeString hex = hexMatcher.group(1, status);
4456             if (hex.length() > 8) {
4457                 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
4458                 break;
4459             }
4460             CharString hex8;
4461             hex8.appendInvariantChars(hex, status);
4462             UChar32 c = (UChar32)strtol(hex8.data(), NULL, 16);
4463             if (c<=0x10ffff) {
4464                 testString.append(c);
4465             } else {
4466                 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4467                         __FILE__, __LINE__, lineNumber, hex8.data());
4468                 break;
4469             }
4470         }
4471 
4472         if (testString.length() > 1) {
4473             charBreaks->setText(testString);
4474             charBreaks->first();
4475             int32_t firstBreak = charBreaks->next();
4476             if (testString.length() != firstBreak) {
4477                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4478                         __FILE__, __LINE__, lineNumber, firstBreak);
4479             }
4480             wordBreaks->setText(testString);
4481             wordBreaks->first();
4482             firstBreak = wordBreaks->next();
4483             if (testString.length() != firstBreak) {
4484                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4485                         __FILE__, __LINE__, lineNumber, firstBreak);
4486             }
4487             lineBreaks->setText(testString);
4488             lineBreaks->first();
4489             firstBreak = lineBreaks->next();
4490             if (testString.length() != firstBreak) {
4491                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4492                         __FILE__, __LINE__, lineNumber, firstBreak);
4493             }
4494         }
4495     }
4496 #endif
4497 }
4498 
4499 
4500 // TestBug12519  -  Correct handling of Locales by assignment / copy / clone
4501 
TestBug12519()4502 void RBBITest::TestBug12519() {
4503     UErrorCode status = U_ZERO_ERROR;
4504     LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4505     LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status));
4506     if (!assertSuccess(WHERE, status)) {
4507         dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
4508         return;
4509     }
4510     assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
4511 
4512     assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
4513     assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
4514 
4515     LocalPointer<RuleBasedBreakIterator>cloneEn(biEn->clone());
4516     assertTrue(WHERE, *biEn == *cloneEn);
4517     assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
4518 
4519     LocalPointer<RuleBasedBreakIterator>cloneFr(biFr->clone());
4520     assertTrue(WHERE, *biFr == *cloneFr);
4521     assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
4522 
4523     LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status));
4524     UnicodeString text("Hallo Welt");
4525     biDe->setText(text);
4526     assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
4527     *biDe = *biFr;
4528     assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
4529 }
4530 
TestBug12677()4531 void RBBITest::TestBug12677() {
4532     // Check that stripping of comments from rules for getRules() is not confused by
4533     // the presence of '#' characters in the rules that do not introduce comments.
4534     UnicodeString rules(u"!!forward; \n"
4535                          "$x = [ab#];  # a set with a # literal. \n"
4536                          " # .;        # a comment that looks sort of like a rule.   \n"
4537                          " '#' '?';    # a rule with a quoted #   \n"
4538                        );
4539 
4540     UErrorCode status = U_ZERO_ERROR;
4541     UParseError pe;
4542     RuleBasedBreakIterator bi(rules, pe, status);
4543     assertSuccess(WHERE, status);
4544     UnicodeString rtRules = bi.getRules();
4545     assertEquals(WHERE, UnicodeString(u"!!forward;$x=[ab#];'#''?';"),  rtRules);
4546 }
4547 
4548 
TestTableRedundancies()4549 void RBBITest::TestTableRedundancies() {
4550     UErrorCode status = U_ZERO_ERROR;
4551 
4552     LocalPointer<RuleBasedBreakIterator> bi (
4553         (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
4554     assertSuccess(WHERE, status);
4555     if (U_FAILURE(status)) return;
4556 
4557     RBBIDataWrapper *dw = bi->fData;
4558     const RBBIStateTable *fwtbl = dw->fForwardTable;
4559     UBool in8Bits = fwtbl->fFlags & RBBI_8BITS_ROWS;
4560     int32_t numCharClasses = dw->fHeader->fCatCount;
4561     // printf("Char Classes: %d     states: %d\n", numCharClasses, fwtbl->fNumStates);
4562 
4563     // Check for duplicate columns (character categories)
4564 
4565     std::vector<UnicodeString> columns;
4566     for (int32_t column = 0; column < numCharClasses; column++) {
4567         UnicodeString s;
4568         for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
4569             RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4570             s.append(in8Bits ? row->r8.fNextState[column] : row->r16.fNextState[column]);
4571         }
4572         columns.push_back(s);
4573     }
4574     // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
4575     for (int c1=1; c1<numCharClasses; c1++) {
4576         int limit = c1 < (int)fwtbl->fDictCategoriesStart ? fwtbl->fDictCategoriesStart : numCharClasses;
4577         for (int c2 = c1+1; c2 < limit; c2++) {
4578             if (columns.at(c1) == columns.at(c2)) {
4579                 errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
4580                 goto out;
4581             }
4582         }
4583     }
4584   out:
4585 
4586     // Check for duplicate states
4587     std::vector<UnicodeString> rows;
4588     for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
4589         UnicodeString s;
4590         RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4591         if (in8Bits) {
4592             s.append(row->r8.fAccepting);
4593             s.append(row->r8.fLookAhead);
4594             s.append(row->r8.fTagsIdx);
4595             for (int32_t column = 0; column < numCharClasses; column++) {
4596                 s.append(row->r8.fNextState[column]);
4597             }
4598         } else {
4599             s.append(row->r16.fAccepting);
4600             s.append(row->r16.fLookAhead);
4601             s.append(row->r16.fTagsIdx);
4602             for (int32_t column = 0; column < numCharClasses; column++) {
4603                 s.append(row->r16.fNextState[column]);
4604             }
4605         }
4606         rows.push_back(s);
4607     }
4608     for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
4609         for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
4610             if (rows.at(r1) == rows.at(r2)) {
4611                 errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
4612                 return;
4613             }
4614         }
4615     }
4616 }
4617 
4618 // Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
4619 //            even after next() has returned DONE.
4620 
TestBug13447()4621 void RBBITest::TestBug13447() {
4622     UErrorCode status = U_ZERO_ERROR;
4623     LocalPointer<RuleBasedBreakIterator> bi(
4624         (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4625     assertSuccess(WHERE, status);
4626     if (U_FAILURE(status)) return;
4627     UnicodeString data(u"1234");
4628     bi->setText(data);
4629     assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus());
4630     assertEquals(WHERE, 4, bi->next());
4631     assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4632     assertEquals(WHERE, UBRK_DONE, bi->next());
4633     assertEquals(WHERE, 4, bi->current());
4634     assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4635 }
4636 
4637 //  TestReverse exercises both the synthesized safe reverse rules and the logic
4638 //  for filling the break iterator cache when starting from random positions
4639 //  in the text.
4640 //
4641 //  It's a monkey test, working on random data, with the expected data obtained
4642 //  from forward iteration (no safe rules involved), comparing with results
4643 //  when indexing into the interior of the string (safe rules needed).
4644 
TestReverse()4645 void RBBITest::TestReverse() {
4646     UErrorCode status = U_ZERO_ERROR;
4647 
4648     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4649             BreakIterator::createCharacterInstance(Locale::getEnglish(), status)));
4650     assertSuccess(WHERE, status, true);
4651     status = U_ZERO_ERROR;
4652     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4653             BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4654     assertSuccess(WHERE, status, true);
4655     status = U_ZERO_ERROR;
4656     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4657             BreakIterator::createLineInstance(Locale::getEnglish(), status)));
4658     assertSuccess(WHERE, status, true);
4659     status = U_ZERO_ERROR;
4660     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4661             BreakIterator::createSentenceInstance(Locale::getEnglish(), status)));
4662     assertSuccess(WHERE, status, true);
4663 }
4664 
TestReverse(std::unique_ptr<RuleBasedBreakIterator> bi)4665 void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
4666     if (!bi) {
4667         return;
4668     }
4669 
4670     // From the mapping trie in the break iterator's internal data, create a
4671     // vector of UnicodeStrings, one for each character category, containing
4672     // all of the code points that map to that category. Unicode planes 0 and 1 only,
4673     // to avoid an execess of unassigned code points.
4674 
4675     RBBIDataWrapper *data = bi->fData;
4676     int32_t categoryCount = data->fHeader->fCatCount;
4677     UCPTrie *trie = data->fTrie;
4678     bool use8BitsTrie = ucptrie_getValueWidth(trie) == UCPTRIE_VALUE_BITS_8;
4679     uint32_t dictBit = use8BitsTrie ? 0x0080 : 0x4000;
4680 
4681     std::vector<UnicodeString> strings(categoryCount, UnicodeString());
4682     for (int cp=0; cp<0x1fff0; ++cp) {
4683         int cat = ucptrie_get(trie, cp);
4684         cat &= ~dictBit;    // And off the dictionary bit from the category.
4685         assertTrue(WHERE, cat < categoryCount && cat >= 0);
4686         if (cat < 0 || cat >= categoryCount) return;
4687         strings[cat].append(cp);
4688     }
4689 
4690     icu_rand randomGen;
4691     const int testStringLength = 10000;
4692     UnicodeString testString;
4693 
4694     for (int i=0; i<testStringLength; ++i) {
4695         int charClass = randomGen() % categoryCount;
4696         if (strings[charClass].length() > 0) {
4697             int cp = strings[charClass].char32At(randomGen() % strings[charClass].length());
4698             testString.append(cp);
4699         }
4700     }
4701 
4702     typedef std::pair<UBool, int32_t> Result;
4703     std::vector<Result> expectedResults;
4704     bi->setText(testString);
4705     for (int i=0; i<testString.length(); ++i) {
4706         bool isboundary = bi->isBoundary(i);
4707         int  ruleStatus = bi->getRuleStatus();
4708         expectedResults.push_back(std::make_pair(isboundary, ruleStatus));
4709     }
4710 
4711     for (int i=testString.length()-1; i>=0; --i) {
4712         bi->setText(testString);   // clears the internal break cache
4713         Result expected = expectedResults[i];
4714         assertEquals(WHERE, expected.first, bi->isBoundary(i));
4715         assertEquals(WHERE, expected.second, bi->getRuleStatus());
4716     }
4717 }
4718 
4719 
4720 // Ticket 13692 - finding word boundaries in very large numbers or words could
//                be very time consuming. When the problem was present, this test
//                would run for more than fifteen minutes, which is to say, the failure was noticeable.
4723 
TestBug13692()4724 void RBBITest::TestBug13692() {
4725     UErrorCode status = U_ZERO_ERROR;
4726     LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4727             BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4728     if (!assertSuccess(WHERE, status, true)) {
4729         return;
4730     }
4731     constexpr int32_t LENGTH = 1000000;
4732     UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH);
4733     for (int i=0; i<20; i+=2) {
4734         longNumber.setCharAt(i, u' ');
4735     }
4736     bi->setText(longNumber);
4737     assertFalse(WHERE, bi->isBoundary(LENGTH-5));
4738     assertSuccess(WHERE, status);
4739 }
4740 
4741 
TestProperties()4742 void RBBITest::TestProperties() {
4743     UErrorCode errorCode = U_ZERO_ERROR;
4744     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4745     if (!prependSet.isEmpty()) {
4746         errln(
4747             "[:GCB=Prepend:] is not empty any more. "
4748             "Uncomment relevant lines in source/data/brkitr/char.txt and "
4749             "change this test to the opposite condition.");
4750     }
4751 }
4752 
4753 
4754 //
4755 //  TestDebug    -  A place-holder test for debugging purposes.
4756 //                  For putting in fragments of other tests that can be invoked
4757 //                  for tracing  without a lot of unwanted extra stuff happening.
4758 //
TestDebug(void)4759 void RBBITest::TestDebug(void) {
4760     UErrorCode status = U_ZERO_ERROR;
4761     LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4762             BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4763     if (!assertSuccess(WHERE, status, true)) {
4764         return;
4765     }
4766     const UnicodeString &rules = bi->getRules();
4767     UParseError pe;
4768     LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
4769     assertSuccess(WHERE, status);
4770 }
4771 
4772 
4773 //
4774 //  TestDebugRules   A stub test for use in debugging rule compilation problems.
4775 //                   Can be freely altered as needed or convenient.
//                   Leave disabled - #ifdef'ed out - when not actively debugging. The rule source
4777 //                   data files may not be available in all environments.
4778 //                   Any permanent test cases should be moved to rbbitst.txt
4779 //                   (see Bug 20303 in that file, for example), or to another test function in this file.
4780 //
TestDebugRules()4781 void RBBITest::TestDebugRules() {
4782 #if 0
4783     const char16_t *rules = u""
4784         "!!quoted_literals_only; \n"
4785         "!!chain; \n"
4786         "!!lookAheadHardBreak; \n"
4787         " \n"
4788         // "[a] / ; \n"
4789         "[a] [b] / [c] [d]; \n"
4790         "[a] [b] / [c] [d] {100}; \n"
4791         "[x] [a] [b] / [c] [d] {100}; \n"
4792         "[a] [b] [c] / [d] {100}; \n"
4793         //" [c] [d] / [e] [f]; \n"
4794         //"[a] [b] / [c]; \n"
4795         ;
4796 
4797     UErrorCode status = U_ZERO_ERROR;
4798     CharString path(pathToDataDirectory(), status);
4799     path.appendPathPart("brkitr", status);
4800     path.appendPathPart("rules", status);
4801     path.appendPathPart("line.txt", status);
4802     int    len;
4803     std::unique_ptr<UChar []> testFile(ReadAndConvertFile(path.data(), len, "UTF-8", status));
4804     if (!assertSuccess(WHERE, status)) {
4805         return;
4806     }
4807 
4808     UParseError pe;
4809     // rules = testFile.get();
4810     RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rules, pe, status);
4811 
4812     if (!assertSuccess(WHERE, status)) {
4813         delete bi;
4814         return;
4815     }
4816     // bi->dumpTables();
4817 
4818     delete bi;
4819 #endif
4820 }
4821 
testTrieStateTable(int32_t numChar,bool expectedTrieWidthIn8Bits,bool expectedStateRowIn8Bits)4822 void RBBITest::testTrieStateTable(int32_t numChar, bool expectedTrieWidthIn8Bits, bool expectedStateRowIn8Bits) {
4823     UCPTrieValueWidth expectedTrieWidth = expectedTrieWidthIn8Bits ? UCPTRIE_VALUE_BITS_8 : UCPTRIE_VALUE_BITS_16;
4824     int32_t expectedStateRowBits = expectedStateRowIn8Bits ? RBBI_8BITS_ROWS : 0;
4825     // Text are duplicate characters from U+4E00 to U+4FFF
4826     UnicodeString text;
4827     for (UChar c = 0x4e00; c < 0x5000; c++) {
4828         text.append(c).append(c);
4829     }
4830     // Generate rule which will caused length+4 character classes and
4831     // length+3 states
4832     UnicodeString rules(u"!!quoted_literals_only;");
4833     for (UChar c = 0x4e00; c < 0x4e00 + numChar; c++) {
4834         rules.append(u'\'').append(c).append(c).append(u"';");
4835     }
4836     rules.append(u".;");
4837     UErrorCode status = U_ZERO_ERROR;
4838     UParseError parseError;
4839     RuleBasedBreakIterator bi(rules, parseError, status);
4840 
4841     assertEquals(WHERE, numChar + 4, bi.fData->fHeader->fCatCount);
4842     assertEquals(WHERE, numChar + 3, bi.fData->fForwardTable->fNumStates);
4843     assertEquals(WHERE, expectedTrieWidth, ucptrie_getValueWidth(bi.fData->fTrie));
4844     assertEquals(WHERE, expectedStateRowBits, bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS);
4845     assertEquals(WHERE, expectedStateRowBits, bi.fData->fReverseTable->fFlags & RBBI_8BITS_ROWS);
4846 
4847     bi.setText(text);
4848 
4849     int32_t pos;
4850     int32_t i = 0;
4851     while ((pos = bi.next()) > 0) {
4852         // The first numChar should not break between the pair
4853         if (i++ < numChar) {
4854             assertEquals(WHERE, i * 2, pos);
4855         } else {
4856             // After the first numChar next(), break on each character.
4857             assertEquals(WHERE, i + numChar, pos);
4858         }
4859     }
4860     while ((pos = bi.previous()) > 0) {
4861         // The first numChar should not break between the pair
4862         if (--i < numChar) {
4863             assertEquals(WHERE, i * 2, pos);
4864         } else {
4865             // After the first numChar next(), break on each character.
4866             assertEquals(WHERE, i + numChar, pos);
4867         }
4868     }
4869 }
4870 
Test8BitsTrieWith8BitStateTable()4871 void RBBITest::Test8BitsTrieWith8BitStateTable() {
4872     testTrieStateTable(251, true /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
4873 }
4874 
Test16BitsTrieWith8BitStateTable()4875 void RBBITest::Test16BitsTrieWith8BitStateTable() {
4876     testTrieStateTable(252, false /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
4877 }
4878 
Test16BitsTrieWith16BitStateTable()4879 void RBBITest::Test16BitsTrieWith16BitStateTable() {
4880     testTrieStateTable(253, false /* expectedTrieWidthIn8Bits */, false /* expectedStateRowIn8Bits */);
4881 }
4882 
Test8BitsTrieWith16BitStateTable()4883 void RBBITest::Test8BitsTrieWith16BitStateTable() {
4884     // Test UCPTRIE_VALUE_BITS_8 with 16 bits rows. Use a different approach to
4885     // create state table in 16 bits.
4886 
4887     // Generate 510 'a' as text
4888     UnicodeString text;
4889     for (int32_t i = 0; i < 510; i++) {
4890         text.append(u'a');
4891     }
4892 
4893     UnicodeString rules(u"!!quoted_literals_only;'");
4894     // 254 'a' in the rule will cause 256 states
4895     for (int32_t i = 0; i < 254; i++) {
4896         rules.append(u'a');
4897     }
4898     rules.append(u"';.;");
4899 
4900     UErrorCode status = U_ZERO_ERROR;
4901     UParseError parseError;
4902     LocalPointer<RuleBasedBreakIterator> bi(new RuleBasedBreakIterator(rules, parseError, status));
4903 
4904     assertEquals(WHERE, 256, bi->fData->fForwardTable->fNumStates);
4905     assertEquals(WHERE, UCPTRIE_VALUE_BITS_8, ucptrie_getValueWidth(bi->fData->fTrie));
4906     assertEquals(WHERE,
4907                  false, RBBI_8BITS_ROWS == (bi->fData->fForwardTable->fFlags & RBBI_8BITS_ROWS));
4908     bi->setText(text);
4909 
4910     // break positions:
4911     // 254, 508, 509, ... 510
4912     assertEquals("next()", 254, bi->next());
4913     int32_t i = 0;
4914     int32_t pos;
4915     while ((pos = bi->next()) > 0) {
4916         assertEquals(WHERE, 508 + i , pos);
4917         i++;
4918     }
4919     i = 0;
4920     while ((pos = bi->previous()) > 0) {
4921         i++;
4922         if (pos >= 508) {
4923             assertEquals(WHERE, 510 - i , pos);
4924         } else {
4925             assertEquals(WHERE, 254 , pos);
4926         }
4927     }
4928 }
4929 
4930 // Test that both compact (8 bit) and full sized (16 bit) rbbi tables work, and
4931 // that there are no problems with rules at the size that transitions between the two.
4932 //
4933 // A rule that matches a literal string, like 'abcdefghij', will require one state and
4934 // one character class per character in the string. So we can make a rule to tickle the
4935 // boundaries by using literal strings of various lengths.
4936 //
4937 // For both the number of states and the number of character classes, the eight bit format
4938 // only has 7 bits available, allowing for 128 values. For both, a few values are reserved,
4939 // leaving 120 something available. This test runs the string over the range of 120 - 130,
4940 // which allows some margin for changes to the number of values reserved by the rule builder
4941 // without breaking the test.
4942 
TestTable_8_16_Bits()4943 void RBBITest::TestTable_8_16_Bits() {
4944 
4945     // testStr serves as both the source of the rule string (truncated to the desired length)
4946     // and as test data to check matching behavior. A break rule consisting of the first 120
4947     // characters of testStr will match the first 120 chars of the full-length testStr.
4948     UnicodeString testStr;
4949     for (UChar c=0x3000; c<0x3200; ++c) {
4950         testStr.append(c);
4951     }
4952 
4953     const int32_t startLength = 120;   // The shortest rule string to test.
4954     const int32_t endLength = 260;     // The longest rule string to test
4955     const int32_t increment = this->quick ? endLength - startLength : 1;
4956 
4957     for (int32_t ruleLen=startLength; ruleLen <= endLength; ruleLen += increment) {
4958         UParseError parseError;
4959         UErrorCode status = U_ZERO_ERROR;
4960 
4961         UnicodeString ruleString{u"!!quoted_literals_only; '#';"};
4962         ruleString.findAndReplace(UnicodeString(u"#"), UnicodeString(testStr, 0, ruleLen));
4963         RuleBasedBreakIterator bi(ruleString, parseError, status);
4964         if (!assertSuccess(WHERE, status)) {
4965             errln(ruleString);
4966             break;
4967         }
4968         // bi.dumpTables();
4969 
4970         // Verify that the break iterator is functioning - that the first boundary found
4971         // in testStr is at the length of the rule string.
4972         bi.setText(testStr);
4973         assertEquals(WHERE, ruleLen, bi.next());
4974 
4975         // Reverse iteration. Do a setText() first, to flush the break iterator's internal cache
4976         // of previously detected boundaries, thus forcing the engine to run the safe reverse rules.
4977         bi.setText(testStr);
4978         int32_t result = bi.preceding(ruleLen);
4979         assertEquals(WHERE, 0, result);
4980 
4981         // Verify that the range of rule lengths being tested cover the translations
4982         // from 8 to 16 bit data.
4983         bool has8BitRowData = bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS;
4984         bool has8BitsTrie = ucptrie_getValueWidth(bi.fData->fTrie) == UCPTRIE_VALUE_BITS_8;
4985 
4986         if (ruleLen == startLength) {
4987             assertEquals(WHERE, true, has8BitRowData);
4988             assertEquals(WHERE, true, has8BitsTrie);
4989         }
4990         if (ruleLen == endLength) {
4991             assertEquals(WHERE, false, has8BitRowData);
4992             assertEquals(WHERE, false, has8BitsTrie);
4993         }
4994     }
4995 }
4996 
4997 /* Test handling of a large number of look-ahead rules.
4998  * The number of rules in the test exceeds the implementation limits prior to the
4999  * improvements introduced with #13590.
5000  *
5001  * The test look-ahead rules have the form "AB / CE"; "CD / EG"; ...
5002  * The text being matched is sequential, "ABCDEFGHI..."
5003  *
5004  * The upshot is that the look-ahead rules all match on their preceding context,
5005  * and consequently must save a potential result, but then fail to match on their
5006  * trailing context, so that they don't actually cause a boundary.
5007  *
5008  * Additionally, add a ".*" rule, so there are no boundaries unless a
5009  * look-ahead hard-break rule forces one.
5010  */
TestBug13590()5011 void RBBITest::TestBug13590() {
5012     UnicodeString rules {u"!!quoted_literals_only; !!chain; .*;\n"};
5013 
5014     const int NUM_LOOKAHEAD_RULES = 50;
5015     const char16_t STARTING_CHAR = u'\u5000';
5016     char16_t firstChar;
5017     for (int ruleNum = 0; ruleNum < NUM_LOOKAHEAD_RULES; ++ruleNum) {
5018         firstChar = STARTING_CHAR + ruleNum*2;
5019         rules.append(u'\'') .append(firstChar) .append(firstChar+1) .append(u'\'')
5020              .append(u' ') .append(u'/') .append(u' ')
5021              .append(u'\'') .append(firstChar+2) .append(firstChar+4) .append(u'\'')
5022              .append(u';') .append(u'\n');
5023     }
5024 
5025     // Change the last rule added from the form "UV / WY" to "UV / WX".
5026     // Changes the rule so that it will match - all 4 chars are in ascending sequence.
5027     rules.findAndReplace(UnicodeString(firstChar+4), UnicodeString(firstChar+3));
5028 
5029     UErrorCode status = U_ZERO_ERROR;
5030     UParseError parseError;
5031     RuleBasedBreakIterator bi(rules, parseError, status);
5032     if (!assertSuccess(WHERE, status)) {
5033         errln(rules);
5034         return;
5035     }
5036     // bi.dumpTables();
5037 
5038     UnicodeString testString;
5039     for (char16_t c = STARTING_CHAR-200; c < STARTING_CHAR + NUM_LOOKAHEAD_RULES*4; ++c) {
5040         testString.append(c);
5041     }
5042     bi.setText(testString);
5043 
5044     int breaksFound = 0;
5045     while (bi.next() != UBRK_DONE) {
5046         ++breaksFound;
5047     }
5048 
5049     // Two matches are expected, one from the last rule that was explicitly modified,
5050     // and one at the end of the text.
5051     assertEquals(WHERE, 2, breaksFound);
5052 }
5053 
5054 
5055 #if U_ENABLE_TRACING
// Records filled in by the trace callbacks below and inspected by the tracing tests.
namespace {
// String argument captured from each recorded trace-data callback.
std::vector<std::string> gData;
// Function numbers seen by the trace-entry callback, in call order.
std::vector<int32_t> gEntryFn;
// Function numbers seen by the trace-exit callback, in call order.
std::vector<int32_t> gExitFn;
// Function numbers seen by the trace-data callback, in call order.
std::vector<int32_t> gDataFn;
}  // namespace
5060 
traceData(const void *,int32_t fnNumber,int32_t,const char *,va_list args)5061 static void U_CALLCONV traceData(
5062         const void*,
5063         int32_t fnNumber,
5064         int32_t,
5065         const char *,
5066         va_list args) {
5067     if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5068         const char* data = va_arg(args, const char*);
5069         gDataFn.push_back(fnNumber);
5070         gData.push_back(data);
5071     }
5072 }
5073 
traceEntry(const void *,int32_t fnNumber)5074 static void traceEntry(const void *, int32_t fnNumber) {
5075     if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5076         gEntryFn.push_back(fnNumber);
5077     }
5078 }
5079 
traceExit(const void *,int32_t fnNumber,const char *,va_list)5080 static void traceExit(const void *, int32_t fnNumber, const char *, va_list) {
5081     if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5082         gExitFn.push_back(fnNumber);
5083     }
5084 }
5085 
5086 
assertTestTraceResult(int32_t fnNumber,const char * expectedData)5087 void RBBITest::assertTestTraceResult(int32_t fnNumber, const char* expectedData) {
5088     assertEquals("utrace_entry should be called ", 1, gEntryFn.size());
5089     assertEquals("utrace_entry should be called with ", fnNumber, gEntryFn[0]);
5090     assertEquals("utrace_exit should be called ", 1, gExitFn.size());
5091     assertEquals("utrace_exit should be called with ", fnNumber, gExitFn[0]);
5092 
5093     if (expectedData == nullptr) {
5094       assertEquals("utrace_data should not be called ", 0, gDataFn.size());
5095       assertEquals("utrace_data should not be called ", 0, gData.size());
5096     } else {
5097       assertEquals("utrace_data should be called ", 1, gDataFn.size());
5098       assertEquals("utrace_data should be called with ", fnNumber, gDataFn[0]);
5099       assertEquals("utrace_data should be called ", 1, gData.size());
5100       assertEquals("utrace_data should pass in ", expectedData, gData[0].c_str());
5101     }
5102 }
5103 
SetupTestTrace()5104 void SetupTestTrace() {
5105     gEntryFn.clear();
5106     gExitFn.clear();
5107     gDataFn.clear();
5108     gData.clear();
5109 
5110     const void* context = nullptr;
5111     utrace_setFunctions(context, traceEntry, traceExit, traceData);
5112     utrace_setLevel(UTRACE_INFO);
5113 }
5114 
TestTraceCreateCharacter(void)5115 void RBBITest::TestTraceCreateCharacter(void) {
5116     SetupTestTrace();
5117     IcuTestErrorCode status(*this, "TestTraceCreateCharacter");
5118     LocalPointer<BreakIterator> brkitr(
5119         BreakIterator::createCharacterInstance("zh-CN", status));
5120     status.errIfFailureAndReset();
5121     assertTestTraceResult(UTRACE_UBRK_CREATE_CHARACTER, nullptr);
5122 }
5123 
TestTraceCreateTitle(void)5124 void RBBITest::TestTraceCreateTitle(void) {
5125     SetupTestTrace();
5126     IcuTestErrorCode status(*this, "TestTraceCreateTitle");
5127     LocalPointer<BreakIterator> brkitr(
5128         BreakIterator::createTitleInstance("zh-CN", status));
5129     status.errIfFailureAndReset();
5130     assertTestTraceResult(UTRACE_UBRK_CREATE_TITLE, nullptr);
5131 }
5132 
TestTraceCreateSentence(void)5133 void RBBITest::TestTraceCreateSentence(void) {
5134     SetupTestTrace();
5135     IcuTestErrorCode status(*this, "TestTraceCreateSentence");
5136     LocalPointer<BreakIterator> brkitr(
5137         BreakIterator::createSentenceInstance("zh-CN", status));
5138     status.errIfFailureAndReset();
5139     assertTestTraceResult(UTRACE_UBRK_CREATE_SENTENCE, nullptr);
5140 }
5141 
TestTraceCreateWord(void)5142 void RBBITest::TestTraceCreateWord(void) {
5143     SetupTestTrace();
5144     IcuTestErrorCode status(*this, "TestTraceCreateWord");
5145     LocalPointer<BreakIterator> brkitr(
5146         BreakIterator::createWordInstance("zh-CN", status));
5147     status.errIfFailureAndReset();
5148     assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5149 }
5150 
TestTraceCreateLine(void)5151 void RBBITest::TestTraceCreateLine(void) {
5152     SetupTestTrace();
5153     IcuTestErrorCode status(*this, "TestTraceCreateLine");
5154     LocalPointer<BreakIterator> brkitr(
5155         BreakIterator::createLineInstance("zh-CN", status));
5156     status.errIfFailureAndReset();
5157     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line");
5158 }
5159 
TestTraceCreateLineStrict(void)5160 void RBBITest::TestTraceCreateLineStrict(void) {
5161     SetupTestTrace();
5162     IcuTestErrorCode status(*this, "TestTraceCreateLineStrict");
5163     LocalPointer<BreakIterator> brkitr(
5164         BreakIterator::createLineInstance("zh-CN-u-lb-strict", status));
5165     status.errIfFailureAndReset();
5166     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_strict");
5167 }
5168 
TestTraceCreateLineNormal(void)5169 void RBBITest::TestTraceCreateLineNormal(void) {
5170     SetupTestTrace();
5171     IcuTestErrorCode status(*this, "TestTraceCreateLineNormal");
5172     LocalPointer<BreakIterator> brkitr(
5173         BreakIterator::createLineInstance("zh-CN-u-lb-normal", status));
5174     status.errIfFailureAndReset();
5175     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_normal");
5176 }
5177 
TestTraceCreateLineLoose(void)5178 void RBBITest::TestTraceCreateLineLoose(void) {
5179     SetupTestTrace();
5180     IcuTestErrorCode status(*this, "TestTraceCreateLineLoose");
5181     LocalPointer<BreakIterator> brkitr(
5182         BreakIterator::createLineInstance("zh-CN-u-lb-loose", status));
5183     status.errIfFailureAndReset();
5184     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_loose");
5185 }
5186 
TestTraceCreateLineLoosePhrase(void)5187 void RBBITest::TestTraceCreateLineLoosePhrase(void) {
5188     SetupTestTrace();
5189     IcuTestErrorCode status(*this, "TestTraceCreateLineLoosePhrase");
5190     LocalPointer<BreakIterator> brkitr(
5191         BreakIterator::createLineInstance("ja-u-lb-loose-lw-phrase", status));
5192     status.errIfFailureAndReset();
5193     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_loose_phrase");
5194 }
5195 
TestTraceCreateLineNormalPhrase(void)5196 void RBBITest::TestTraceCreateLineNormalPhrase(void) {
5197     SetupTestTrace();
5198     IcuTestErrorCode status(*this, "TestTraceCreateLineNormalPhrase");
5199     LocalPointer<BreakIterator> brkitr(
5200         BreakIterator::createLineInstance("ja-u-lb-normal-lw-phrase", status));
5201     status.errIfFailureAndReset();
5202     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_normal_phrase");
5203 }
5204 
TestTraceCreateLineStrictPhrase(void)5205 void RBBITest::TestTraceCreateLineStrictPhrase(void) {
5206     SetupTestTrace();
5207     IcuTestErrorCode status(*this, "TestTraceCreateLineStrictPhrase");
5208     LocalPointer<BreakIterator> brkitr(
5209         BreakIterator::createLineInstance("ja-u-lb-strict-lw-phrase", status));
5210     status.errIfFailureAndReset();
5211     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_strict_phrase");
5212 }
5213 
TestTraceCreateLinePhrase(void)5214 void RBBITest::TestTraceCreateLinePhrase(void) {
5215     SetupTestTrace();
5216     IcuTestErrorCode status(*this, "TestTraceCreateLinePhrase");
5217     LocalPointer<BreakIterator> brkitr(
5218         BreakIterator::createLineInstance("ja-u-lw-phrase", status));
5219     status.errIfFailureAndReset();
5220     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_phrase");
5221 }
5222 
TestTraceCreateBreakEngine(void)5223 void RBBITest::TestTraceCreateBreakEngine(void) {
5224     rbbi_cleanup();
5225     SetupTestTrace();
5226     IcuTestErrorCode status(*this, "TestTraceCreateBreakEngine");
5227     LocalPointer<BreakIterator> brkitr(
5228         BreakIterator::createWordInstance("zh-CN", status));
5229     status.errIfFailureAndReset();
5230     assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5231 
5232     // To word break the following text, BreakIterator will create 5 dictionary
5233     // break engine internally.
5234     brkitr->setText(
5235         u"test "
5236         u"測試 " // Hani
5237         u"សាកល្បង " // Khmr
5238         u"ທົດສອບ " // Laoo
5239         u"စမ်းသပ်မှု " // Mymr
5240         u"ทดสอบ " // Thai
5241         u"test "
5242     );
5243 
5244     // Loop through all the text.
5245     while (brkitr->next() > 0) ;
5246 
5247     assertEquals("utrace_entry should be called ", 6, gEntryFn.size());
5248     assertEquals("utrace_exit should be called ", 6, gExitFn.size());
5249     assertEquals("utrace_data should be called ", 5, gDataFn.size());
5250 
5251     for (std::vector<int>::size_type i = 0; i < gDataFn.size(); i++) {
5252         assertEquals("utrace_entry should be called ",
5253                      UTRACE_UBRK_CREATE_BREAK_ENGINE, gEntryFn[i+1]);
5254         assertEquals("utrace_exit should be called ",
5255                      UTRACE_UBRK_CREATE_BREAK_ENGINE, gExitFn[i+1]);
5256         assertEquals("utrace_data should be called ",
5257                      UTRACE_UBRK_CREATE_BREAK_ENGINE, gDataFn[i]);
5258     }
5259 
5260     assertEquals("utrace_data should pass ", "Hani", gData[0].c_str());
5261     assertEquals("utrace_data should pass ", "Khmr", gData[1].c_str());
5262     assertEquals("utrace_data should pass ", "Laoo", gData[2].c_str());
5263     assertEquals("utrace_data should pass ", "Mymr", gData[3].c_str());
5264     assertEquals("utrace_data should pass ", "Thai", gData[4].c_str());
5265 
5266 }
5267 #endif
5268 
TestUnpairedSurrogate()5269 void RBBITest::TestUnpairedSurrogate() {
5270     UnicodeString rules(u"ab;");
5271 
5272     UErrorCode status = U_ZERO_ERROR;
5273     UParseError pe;
5274     RuleBasedBreakIterator bi1(rules, pe, status);
5275     assertSuccess(WHERE, status);
5276     UnicodeString rtRules = bi1.getRules();
5277     // make sure the simple one work first.
5278     assertEquals(WHERE, rules,  rtRules);
5279 
5280 
5281     rules = UnicodeString(u"a\\ud800b;").unescape();
5282     pe.line = 0;
5283     pe.offset = 0;
5284     RuleBasedBreakIterator bi2(rules, pe, status);
5285     assertEquals(WHERE "unpaired lead surrogate", U_ILLEGAL_CHAR_FOUND , status);
5286     if (pe.line != 1 || pe.offset != 1) {
5287         errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
5288     }
5289 
5290     status = U_ZERO_ERROR;
5291     rules = UnicodeString(u"a\\ude00b;").unescape();
5292     pe.line = 0;
5293     pe.offset = 0;
5294     RuleBasedBreakIterator bi3(rules, pe, status);
5295     assertEquals(WHERE "unpaired tail surrogate", U_ILLEGAL_CHAR_FOUND , status);
5296     if (pe.line != 1 || pe.offset != 1) {
5297         errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
5298     }
5299 
5300     // make sure the surrogate one work too.
5301     status = U_ZERO_ERROR;
5302     rules = UnicodeString(u"a��b;");
5303     RuleBasedBreakIterator bi4(rules, pe, status);
5304     rtRules = bi4.getRules();
5305     assertEquals(WHERE, rules, rtRules);
5306 }
5307 
5308 // Read file generated by
5309 // https://github.com/unicode-org/lstm_word_segmentation/blob/master/segment_text.py
5310 // as test cases and compare the Output.
5311 // Format of the file
5312 //   Model:\t[Model Name (such as 'Thai_graphclust_model4_heavy')]
5313 //   Embedding:\t[Embedding type (such as 'grapheme_clusters_tf')]
5314 //   Input:\t[source text]
5315 //   Output:\t[expected output separated by | ]
5316 //   Input: ...
5317 //   Output: ...
5318 
runLSTMTestFromFile(const char * filename,UScriptCode script)5319 void RBBITest::runLSTMTestFromFile(const char* filename, UScriptCode script) {
5320     // The expectation in this test depends on LSTM, skip the test if the
5321     // configuration is not build with LSTM data.
5322     if (skipLSTMTest()) {
5323         return;
5324     }
5325     UErrorCode   status = U_ZERO_ERROR;
5326     LocalPointer<BreakIterator> iterator(BreakIterator::createWordInstance(Locale(), status));
5327     if (U_FAILURE(status)) {
5328         errln("%s:%d Error %s Cannot create Word BreakIterator", __FILE__, __LINE__, u_errorName(status));
5329         return;
5330     }
5331     //  Open and read the test data file.
5332     const char *testDataDirectory = IntlTest::getSourceTestData(status);
5333     CharString testFileName(testDataDirectory, -1, status);
5334     testFileName.append(filename, -1, status);
5335 
5336     int len;
5337     UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
5338     if (U_FAILURE(status)) {
5339         errln("%s:%d Error %s opening test file %s", __FILE__, __LINE__, u_errorName(status), filename);
5340         return;
5341     }
5342 
5343     //  Put the test data into a UnicodeString
5344     UnicodeString testString(false, testFile, len);
5345 
5346     int32_t start = 0;
5347 
5348     UnicodeString line;
5349     int32_t end;
5350     std::string actual_sep_str;
5351     int32_t caseNum = 0;
5352     // Iterate through all the lines in the test file.
5353     do {
5354         int32_t cr = testString.indexOf(u'\r', start);
5355         int32_t lf = testString.indexOf(u'\n', start);
5356         end = cr >= 0 ? (lf >= 0 ? std::min(cr, lf) : cr) : lf;
5357         line = testString.tempSubString(start, end < 0 ? INT32_MAX : end - start);
5358         if (line.length() > 0) {
5359             // Separate each line to key and value by TAB.
5360             int32_t tab = line.indexOf(u'\t');
5361             UnicodeString key = line.tempSubString(0, tab);
5362             const UnicodeString value = line.tempSubString(tab+1);
5363 
5364             if (key == "Model:") {
5365                 // Verify the expectation in the test file match the LSTM model
5366                 // we are using now.
5367                 const LSTMData* data = CreateLSTMDataForScript(script, status);
5368                 if (U_FAILURE(status)) {
5369                     dataerrln("%s:%d Error %s Cannot create LSTM data for script %s",
5370                               __FILE__, __LINE__, u_errorName(status), uscript_getName(script));
5371                     return;
5372                 }
5373                 UnicodeString name(LSTMDataName(data));
5374                 DeleteLSTMData(data);
5375                 if (value != name) {
5376                     std::string utf8Name, utf8Value;
5377                     dataerrln("%s:%d Error %s The LSTM data for script %s is %s instead of %s",
5378                               __FILE__, __LINE__, u_errorName(status), uscript_getName(script),
5379                               name.toUTF8String<std::string>(utf8Name).c_str(),
5380                               value.toUTF8String<std::string>(utf8Value).c_str());
5381                     return;
5382                 }
5383             } else if (key == "Input:") {
5384                 UnicodeString input("prefix ");
5385                 input += value + " suffix";
5386                 std::stringstream ss;
5387 
5388                 // Construct the UText which is expected by the the engine as
5389                 // input from the UnicodeString.
5390                 UText ut = UTEXT_INITIALIZER;
5391                 utext_openConstUnicodeString(&ut, &input, &status);
5392                 if (U_FAILURE(status)) {
5393                     dataerrln("Could not utext_openConstUnicodeString for " + value + UnicodeString(u_errorName(status)));
5394                     return;
5395                 }
5396 
5397                 iterator->setText(&ut, status);
5398                 if (U_FAILURE(status)) {
5399                     errln("%s:%d Error %s Could not setText to BreakIterator", __FILE__, __LINE__, u_errorName(status));
5400                     return;
5401                 }
5402 
5403                 int32_t bp;
5404                 for (bp = iterator->first(); bp != BreakIterator::DONE; bp = iterator->next()) {
5405                     ss << bp;
5406                     if (bp != input.length()) {
5407                         ss << ", ";
5408                     }
5409                 }
5410 
5411                 utext_close(&ut);
5412                 // Turn the break points into a string for easy comparison
5413                 // output.
5414                 actual_sep_str = "{" + ss.str() + "}";
5415             } else if (key == "Output:" && !actual_sep_str.empty()) {
5416                 UnicodeString input("prefix| |");
5417                 input += value + "| |suffix";
5418                 std::string d;
5419                 int32_t sep;
5420                 int32_t start = 0;
5421                 int32_t curr = 0;
5422                 std::stringstream ss;
5423                 // Include 0 as the break point.
5424                 ss << "0, ";
5425                 while ((sep = input.indexOf(u'|', start)) >= 0) {
5426                     int32_t len = sep - start;
5427                     if (len > 0) {
5428                         if (curr > 0) {
5429                             ss << ", ";
5430                         }
5431                         curr += len;
5432                         ss << curr;
5433                     }
5434                     start = sep + 1;
5435                 }
5436                 // Include end of the string as break point.
5437                 ss << ", " << curr + input.length() - start;
5438                 // Turn the break points into a string for easy comparison
5439                 // output.
5440                 std::string expected = "{" + ss.str() + "}";
5441                 std::string utf8;
5442 
5443                 assertEquals((input + " Test Case#" + caseNum).toUTF8String<std::string>(utf8).c_str(),
5444                              expected.c_str(), actual_sep_str.c_str());
5445                 actual_sep_str.clear();
5446             }
5447         }
5448         start = std::max(cr, lf) + 1;
5449     } while (end >= 0);
5450 
5451     delete [] testFile;
5452 }
5453 
TestLSTMThai()5454 void RBBITest::TestLSTMThai() {
5455     runLSTMTestFromFile("Thai_graphclust_model4_heavy_Test.txt", USCRIPT_THAI);
5456 }
5457 
TestLSTMBurmese()5458 void RBBITest::TestLSTMBurmese() {
5459     runLSTMTestFromFile("Burmese_graphclust_model5_heavy_Test.txt", USCRIPT_MYANMAR);
5460 }
5461 
5462 
5463 // Test preceding(index) and following(index), with semi-random indexes.
5464 // The random indexes are produced in clusters that are relatively closely spaced,
5465 // to increase the occurrences of hits to the internal break cache.
5466 
TestRandomAccess()5467 void RBBITest::TestRandomAccess() {
5468     static constexpr int32_t CACHE_SIZE = 128;
5469 
5470     UnicodeString testData;
5471     for (int i=0; i<CACHE_SIZE*2; ++i) {
5472         testData.append(u"aaaa\n");
5473     }
5474 
5475     UErrorCode status = U_ZERO_ERROR;
5476     LocalPointer<RuleBasedBreakIterator> bi(
5477             (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status),
5478             status);
5479     if (!assertSuccess(WHERE, status)) { return; };
5480 
5481     bi->setText(testData);
5482 
5483     auto expectedPreceding = [](int from) {
5484         if (from == 0) {return UBRK_DONE;}
5485         if (from % 5 == 0) {return from - 5;}
5486         return from - (from % 5);
5487     };
5488 
5489     auto expectedFollow = [testData](int from) {
5490         if (from >= testData.length()) {return UBRK_DONE;}
5491         if (from % 5 == 0) {return from + 5;}
5492         return from + (5 - (from % 5));
5493     };
5494 
5495     auto randomStringIndex = [testData]() {
5496         static icu_rand randomGenerator;  // produces random uint32_t values.
5497         static int lastNum;
5498         static int clusterCount;
5499         static constexpr int CLUSTER_SIZE = 100;
5500         static constexpr int CLUSTER_LENGTH = 10;
5501 
5502         if (clusterCount < CLUSTER_LENGTH) {
5503             ++clusterCount;
5504             lastNum += (randomGenerator() % CLUSTER_SIZE);
5505             lastNum -= CLUSTER_SIZE / 2;
5506             lastNum = std::max(0, lastNum);
5507             // Deliberately test indexes > testData.length.
5508             lastNum = std::min(testData.length() + 5, lastNum);
5509         } else {
5510             clusterCount = 0;
5511             lastNum = randomGenerator() % testData.length();
5512         }
5513         return lastNum;
5514     };
5515 
5516     for (int i=0; i<5000; ++i) {
5517         int idx = randomStringIndex();
5518         assertEquals(WHERE, expectedFollow(idx), bi->following(idx));
5519         idx = randomStringIndex();
5520         assertEquals(WHERE, expectedPreceding(idx), bi->preceding(idx));
5521     }
5522 }
5523 
5524 #endif // #if !UCONFIG_NO_BREAK_ITERATION
5525