• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4  * COPYRIGHT:
5  * Copyright (c) 1999-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  ********************************************************************/
8 /************************************************************************
9 *   Date        Name        Description
10 *   12/15/99    Madhu        Creation.
11 *   01/12/2000  Madhu        Updated for changed API and added new tests
12 ************************************************************************/
13 
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_BREAK_ITERATION
16 
17 #include <algorithm>
18 #include <sstream>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <utility>
23 #include <vector>
24 
25 #include "unicode/brkiter.h"
26 #include "unicode/localpointer.h"
27 #include "unicode/numfmt.h"
28 #include "unicode/rbbi.h"
29 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
30 #include "unicode/regex.h"
31 #endif
32 #include "unicode/schriter.h"
33 #include "unicode/uchar.h"
34 #include "unicode/utf16.h"
35 #include "unicode/ucnv.h"
36 #include "unicode/uniset.h"
37 #include "unicode/uscript.h"
38 #include "unicode/ustring.h"
39 #include "unicode/utext.h"
40 #include "unicode/utrace.h"
41 
42 #include "charstr.h"
43 #include "cmemory.h"
44 #include "cstr.h"
45 #include "cstring.h"
46 #include "intltest.h"
47 #include "lstmbe.h"
48 #include "rbbitst.h"
49 #include "rbbidata.h"
50 #include "utypeinfo.h"  // for 'typeid' to work
51 #include "uvector.h"
52 #include "uvectr32.h"
53 
54 
55 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
56 #include "unicode/filteredbrk.h"
57 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
58 
// TEST_ASSERT(x): if the condition x evaluates to false, report a failure
// through errln(), tagged with the file and line of the macro invocation.
// Nothing is returned or thrown, so execution continues after the report.
// errln() is called unqualified, so the macro must be expanded where that
// name resolves.
#define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
} UPRV_BLOCK_MACRO_END
64 
// TEST_ASSERT_SUCCESS(errcode): if errcode is a failure code (U_FAILURE),
// report it through errcheckln() together with the file, line and the
// symbolic name of the code. The macro only reports; callers that must stop
// on failure still have to test the code themselves.
#define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
    } \
} UPRV_BLOCK_MACRO_END
70 
// MONKEY_ERROR(msg, fRuleFileName, index, seed): report a failure through
// IntlTest::gTest->errln(). The message carries the file and line of the
// invocation, msg, the failing index, and a "type=... seed=... loop=1"
// parameter string built from fRuleFileName and seed.
// Note: unlike the two macros above, this one expands to a bare brace block
// rather than a UPRV_BLOCK_MACRO_BEGIN/END pair.
#define MONKEY_ERROR(msg, fRuleFileName, index, seed) { \
    IntlTest::gTest->errln("\n%s:%d %s at index %d. Parameters to reproduce: @\"type=%s seed=%u loop=1\"", \
                    __FILE__, __LINE__, msg, index, fRuleFileName, seed); \
}
75 
76 //---------------------------------------------
77 // runIndexedTest
78 //---------------------------------------------
79 
80 
//  Note:  Before adding new tests to this file, check whether the desired test data can
//         simply be added to the file testdata/rbbitest.txt.  In most cases it can;
//         it's much less work than writing a new test, diagnostic output in the event of failures
//         is good, and the test data file is shared with ICU4J, so eventually the test
//         will run there as well, without additional effort.
86 
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)87 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
88 {
89     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
90     fTestParams = params;
91 
92     TESTCASE_AUTO_BEGIN;
93 #if !UCONFIG_NO_FILE_IO
94     TESTCASE_AUTO(TestBug4153072);
95 #endif
96 #if !UCONFIG_NO_FILE_IO
97     TESTCASE_AUTO(TestUnicodeFiles);
98 #endif
99     TESTCASE_AUTO(TestGetAvailableLocales);
100     TESTCASE_AUTO(TestGetDisplayName);
101 #if !UCONFIG_NO_FILE_IO
102     TESTCASE_AUTO(TestEndBehaviour);
103     TESTCASE_AUTO(TestWordBreaks);
104     TESTCASE_AUTO(TestWordBoundary);
105     TESTCASE_AUTO(TestLineBreaks);
106     TESTCASE_AUTO(TestSentBreaks);
107     TESTCASE_AUTO(TestExtended);
108 #endif
109 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
110     TESTCASE_AUTO(TestMonkey);
111 #endif
112 #if !UCONFIG_NO_FILE_IO
113     TESTCASE_AUTO(TestBug3818);
114 #endif
115     TESTCASE_AUTO(TestDebug);
116 #if !UCONFIG_NO_FILE_IO
117     TESTCASE_AUTO(TestBug5775);
118 #endif
119     TESTCASE_AUTO(TestBug9983);
120     TESTCASE_AUTO(TestDictRules);
121     TESTCASE_AUTO(TestBug5532);
122     TESTCASE_AUTO(TestBug7547);
123     TESTCASE_AUTO(TestBug12797);
124     TESTCASE_AUTO(TestBug12918);
125     TESTCASE_AUTO(TestBug12932);
126     TESTCASE_AUTO(TestEmoji);
127     TESTCASE_AUTO(TestBug12519);
128     TESTCASE_AUTO(TestBug12677);
129     TESTCASE_AUTO(TestTableRedundancies);
130     TESTCASE_AUTO(TestBug13447);
131     TESTCASE_AUTO(TestReverse);
132     TESTCASE_AUTO(TestBug13692);
133     TESTCASE_AUTO(TestDebugRules);
134     TESTCASE_AUTO(Test8BitsTrieWith8BitStateTable);
135     TESTCASE_AUTO(Test8BitsTrieWith16BitStateTable);
136     TESTCASE_AUTO(Test16BitsTrieWith8BitStateTable);
137     TESTCASE_AUTO(Test16BitsTrieWith16BitStateTable);
138     TESTCASE_AUTO(TestTable_8_16_Bits);
139     TESTCASE_AUTO(TestBug13590);
140     TESTCASE_AUTO(TestUnpairedSurrogate);
141     TESTCASE_AUTO(TestLSTMThai);
142     TESTCASE_AUTO(TestLSTMBurmese);
143     TESTCASE_AUTO(TestRandomAccess);
144 
145 #if U_ENABLE_TRACING
146     TESTCASE_AUTO(TestTraceCreateCharacter);
147     TESTCASE_AUTO(TestTraceCreateWord);
148     TESTCASE_AUTO(TestTraceCreateSentence);
149     TESTCASE_AUTO(TestTraceCreateTitle);
150     TESTCASE_AUTO(TestTraceCreateLine);
151     TESTCASE_AUTO(TestTraceCreateLineNormal);
152     TESTCASE_AUTO(TestTraceCreateLineLoose);
153     TESTCASE_AUTO(TestTraceCreateLineStrict);
154     TESTCASE_AUTO(TestTraceCreateLineNormalPhrase);
155     TESTCASE_AUTO(TestTraceCreateLineLoosePhrase);
156     TESTCASE_AUTO(TestTraceCreateLineStrictPhrase);
157     TESTCASE_AUTO(TestTraceCreateLinePhrase);
158     TESTCASE_AUTO(TestTraceCreateBreakEngine);
159 #endif
160 
161     TESTCASE_AUTO_END;
162 }
163 
164 
165 //--------------------------------------------------------------------------------------
166 //
167 //    RBBITest    constructor and destructor
168 //
169 //--------------------------------------------------------------------------------------
170 
RBBITest()171 RBBITest::RBBITest() {
172     fTestParams = nullptr;
173 }
174 
175 
~RBBITest()176 RBBITest::~RBBITest() {
177 }
178 
179 
printStringBreaks(UText * tstr,int expected[],int expectedCount)180 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
181     UErrorCode status = U_ZERO_ERROR;
182     char name[100];
183     printf("code    alpha extend alphanum type word sent line name\n");
184     int nextExpectedIndex = 0;
185     utext_setNativeIndex(tstr, 0);
186     for (int j = 0; j < static_cast<int>(utext_nativeLength(tstr)); j=static_cast<int>(utext_getNativeIndex(tstr))) {
187         if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
188             printf("------------------------------------------------ %d\n", j);
189             ++nextExpectedIndex;
190         }
191 
192         UChar32 c = utext_next32(tstr);
193         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
194         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
195                            u_isUAlphabetic(c),
196                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
197                            u_isalnum(c),
198                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
199                                                   u_charType(c),
200                                                   U_SHORT_PROPERTY_NAME),
201                            u_getPropertyValueName(UCHAR_WORD_BREAK,
202                                                   u_getIntPropertyValue(c,
203                                                           UCHAR_WORD_BREAK),
204                                                   U_SHORT_PROPERTY_NAME),
205                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
206                                    u_getIntPropertyValue(c,
207                                            UCHAR_SENTENCE_BREAK),
208                                    U_SHORT_PROPERTY_NAME),
209                            u_getPropertyValueName(UCHAR_LINE_BREAK,
210                                    u_getIntPropertyValue(c,
211                                            UCHAR_LINE_BREAK),
212                                    U_SHORT_PROPERTY_NAME),
213                            name);
214     }
215 }
216 
217 
printStringBreaks(const UnicodeString & ustr,int expected[],int expectedCount)218 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
219    UErrorCode status = U_ZERO_ERROR;
220    UText *tstr = nullptr;
221    tstr = utext_openConstUnicodeString(nullptr, &ustr, &status);
222    if (U_FAILURE(status)) {
223        printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
224        return;
225     }
226    printStringBreaks(tstr, expected, expectedCount);
227    utext_close(tstr);
228 }
229 
230 
TestBug3818()231 void RBBITest::TestBug3818() {
232     UErrorCode  status = U_ZERO_ERROR;
233 
234     // Four Thai words...
235     static const char16_t thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
236                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
237     UnicodeString  thaiStr(thaiWordData);
238 
239     BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
240     if (U_FAILURE(status) || bi == nullptr) {
241         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
242         return;
243     }
244     bi->setText(thaiStr);
245 
246     int32_t  startOfSecondWord = bi->following(1);
247     if (startOfSecondWord != 4) {
248         errln("Fail at file %s, line %d expected start of word at 4, got %d",
249             __FILE__, __LINE__, startOfSecondWord);
250     }
251     startOfSecondWord = bi->following(0);
252     if (startOfSecondWord != 4) {
253         errln("Fail at file %s, line %d expected start of word at 4, got %d",
254             __FILE__, __LINE__, startOfSecondWord);
255     }
256     delete bi;
257 }
258 
259 
260 //---------------------------------------------
261 //
262 //     other tests
263 //
264 //---------------------------------------------
265 
TestGetAvailableLocales()266 void RBBITest::TestGetAvailableLocales()
267 {
268     int32_t locCount = 0;
269     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
270 
271     if (locCount == 0)
272         dataerrln("getAvailableLocales() returned an empty list!");
273     // Just make sure that it's returning good memory.
274     int32_t i;
275     for (i = 0; i < locCount; ++i) {
276         logln(locList[i].getName());
277     }
278 }
279 
280 //Testing the BreakIterator::getDisplayName() function
TestGetDisplayName()281 void RBBITest::TestGetDisplayName()
282 {
283     UnicodeString   result;
284 
285     BreakIterator::getDisplayName(Locale::getUS(), result);
286     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
287         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
288                 + result);
289 
290     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
291     if (result != "French (France)")
292         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
293                 + result);
294 }
295 /**
296  * Test End Behaviour
297  * @bug 4068137
298  */
TestEndBehaviour()299 void RBBITest::TestEndBehaviour()
300 {
301     UErrorCode status = U_ZERO_ERROR;
302     UnicodeString testString("boo.");
303     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
304     if (U_FAILURE(status))
305     {
306         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
307         return;
308     }
309     wb->setText(testString);
310 
311     if (wb->first() != 0)
312         errln("Didn't get break at beginning of string.");
313     if (wb->next() != 3)
314         errln("Didn't get break before period in \"boo.\"");
315     if (wb->current() != 4 && wb->next() != 4)
316         errln("Didn't get break at end of string.");
317     delete wb;
318 }
319 /*
320  * @bug 4153072
321  */
TestBug4153072()322 void RBBITest::TestBug4153072() {
323     UErrorCode status = U_ZERO_ERROR;
324     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
325     if (U_FAILURE(status))
326     {
327         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
328         return;
329     }
330     UnicodeString str("...Hello, World!...");
331     int32_t begin = 3;
332     int32_t end = str.length() - 3;
333     UBool onBoundary;
334 
335     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
336     iter->adoptText(textIterator);
337     int index;
338     // Note: with the switch to UText, there is no way to restrict the
339     //       iteration range to begin at an index other than zero.
340     //       String character iterators created with a non-zero bound are
341     //         treated by RBBI as being empty.
342     for (index = -1; index < begin + 1; ++index) {
343         onBoundary = iter->isBoundary(index);
344         if (index == 0?  !onBoundary : onBoundary) {
345             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
346                             " and begin index = " + begin);
347         }
348     }
349     delete iter;
350 }
351 
352 
353 //
354 // Test for problem reported by Ashok Matoria on 9 July 2007
355 //    One.<kSoftHyphen><kSpace>Two.
356 //
357 //    Sentence break at start (0) and then on calling next() it breaks at
358 //   'T' of "Two". Now, at this point if I do next() and
359 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
360 //
TestBug5775()361 void RBBITest::TestBug5775() {
362     UErrorCode status = U_ZERO_ERROR;
363     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
364     TEST_ASSERT_SUCCESS(status);
365     if (U_FAILURE(status)) {
366         return;
367     }
368 // Check for status first for better handling of no data errors.
369     TEST_ASSERT(bi != nullptr);
370     if (bi == nullptr) {
371         return;
372     }
373 
374     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
375     //               01234      56789
376     s = s.unescape();
377     bi->setText(s);
378     int pos = bi->next();
379     TEST_ASSERT(pos == 6);
380     pos = bi->next();
381     TEST_ASSERT(pos == 10);
382     pos = bi->previous();
383     TEST_ASSERT(pos == 6);
384     delete bi;
385 }
386 
387 
388 
389 //------------------------------------------------------------------------------
390 //
391 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
392 //
393 //------------------------------------------------------------------------------
394 
395 struct TestParams {
396     BreakIterator   *bi;                   // Break iterator is set while parsing test source.
397                                            //   Changed out whenever test data changes break type.
398 
399     UnicodeString    dataToBreak;          // Data that is built up while parsing the test.
400     UVector32       *expectedBreaks;       // Expected break positions, matches dataToBreak UnicodeString.
401     UVector32       *srcLine;              // Positions in source file, indexed same as dataToBreak.
402     UVector32       *srcCol;
403 
404     UText           *textToBreak;          // UText, could be UTF8 or UTF16.
405     UVector32       *textMap;              // Map from UTF-16 dataToBreak offsets to UText offsets.
406     CharString       utf8String;           // UTF-8 form of text to break.
407 
TestParamsTestParams408     TestParams(UErrorCode &status) : dataToBreak() {
409         bi               = nullptr;
410         expectedBreaks   = new UVector32(status);
411         srcLine          = new UVector32(status);
412         srcCol           = new UVector32(status);
413         textToBreak      = nullptr;
414         textMap          = new UVector32(status);
415     }
416 
~TestParamsTestParams417     ~TestParams() {
418         delete bi;
419         delete expectedBreaks;
420         delete srcLine;
421         delete srcCol;
422         utext_close(textToBreak);
423         delete textMap;
424     }
425 
426     int32_t getSrcLine(int32_t bp);
427     int32_t getExpectedBreak(int32_t bp);
428     int32_t getSrcCol(int32_t bp);
429 
430     void setUTF16(UErrorCode &status);
431     void setUTF8(UErrorCode &status);
432 };
433 
434 // Append a UnicodeString to a CharString with UTF-8 encoding.
435 // Substitute any invalid chars.
436 //   Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
CharStringAppend(CharString & dest,const UnicodeString & src,UErrorCode & status)437 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
438     if (U_FAILURE(status)) {
439         return;
440     }
441     int32_t utf8Length;
442     u_strToUTF8WithSub(nullptr, 0, &utf8Length,         // Output Buffer, nullptr for preflight.
443                        src.getBuffer(), src.length(),   // UTF-16 data
444                        0xfffd, nullptr,                 // Substitution char, number of subs.
445                        &status);
446     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
447         return;
448     }
449     status = U_ZERO_ERROR;
450     int32_t capacity;
451     char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
452     u_strToUTF8WithSub(buffer, utf8Length, nullptr,
453                        src.getBuffer(), src.length(),
454                        0xfffd, nullptr, &status);
455     dest.append(buffer, utf8Length, status);
456 }
457 
458 
setUTF16(UErrorCode & status)459 void TestParams::setUTF16(UErrorCode &status) {
460     textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
461     textMap->removeAllElements();
462     for (int32_t i=0; i<dataToBreak.length(); i++) {
463         if (i == dataToBreak.getChar32Start(i)) {
464             textMap->addElement(i, status);
465         } else {
466             textMap->addElement(-1, status);
467         }
468     }
469     textMap->addElement(dataToBreak.length(), status);
470     U_ASSERT(dataToBreak.length() + 1 == textMap->size());
471 }
472 
473 
setUTF8(UErrorCode & status)474 void TestParams::setUTF8(UErrorCode &status) {
475     if (U_FAILURE(status)) {
476         return;
477     }
478     utf8String.clear();
479     CharStringAppend(utf8String, dataToBreak, status);
480     textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
481     if (U_FAILURE(status)) {
482         return;
483     }
484 
485     textMap->removeAllElements();
486     int32_t utf16Index = 0;
487     for (;;) {
488         textMap->addElement(utf16Index, status);
489         UChar32 c32 = utext_current32(textToBreak);
490         if (c32 < 0) {
491             break;
492         }
493         utf16Index += U16_LENGTH(c32);
494         utext_next32(textToBreak);
495         while (textMap->size() < utext_getNativeIndex(textToBreak)) {
496             textMap->addElement(-1, status);
497         }
498     }
499     U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
500 }
501 
502 
getSrcLine(int32_t bp)503 int32_t TestParams::getSrcLine(int32_t bp) {
504     if (bp >= textMap->size()) {
505         bp = textMap->size() - 1;
506     }
507     int32_t i = 0;
508     for(; bp >= 0 ; --bp) {
509         // Move to a character boundary if we are not on one already.
510         i = textMap->elementAti(bp);
511         if (i >= 0) {
512             break;
513         }
514     }
515     return srcLine->elementAti(i);
516 }
517 
518 
getExpectedBreak(int32_t bp)519 int32_t TestParams::getExpectedBreak(int32_t bp) {
520     if (bp >= textMap->size()) {
521         return 0;
522     }
523     int32_t i = textMap->elementAti(bp);
524     int32_t retVal = 0;
525     if (i >= 0) {
526         retVal = expectedBreaks->elementAti(i);
527     }
528     return retVal;
529 }
530 
531 
getSrcCol(int32_t bp)532 int32_t TestParams::getSrcCol(int32_t bp) {
533     if (bp >= textMap->size()) {
534         bp = textMap->size() - 1;
535     }
536     int32_t i = 0;
537     for(; bp >= 0; --bp) {
538         // Move bp to a character boundary if we are not on one already.
539         i = textMap->elementAti(bp);
540         if (i >= 0) {
541             break;
542         }
543     }
544     return srcCol->elementAti(i);
545 }
546 
547 
executeTest(TestParams * t,UErrorCode & status)548 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
549     int32_t    bp;
550     int32_t    prevBP;
551     int32_t    i;
552 
553     TEST_ASSERT_SUCCESS(status);
554     if (U_FAILURE(status)) {
555         return;
556     }
557 
558     if (t->bi == nullptr) {
559         return;
560     }
561 
562     t->bi->setText(t->textToBreak, status);
563     //
564     //  Run the iterator forward
565     //
566     prevBP = -1;
567     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
568         if (prevBP ==  bp) {
569             // Fail for lack of forward progress.
570             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
571                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
572             break;
573         }
574 
575         // Check that there we didn't miss an expected break between the last one
576         //  and this one.
577         for (i=prevBP+1; i<bp; i++) {
578             if (t->getExpectedBreak(i) != 0) {
579                 int expected[] = {0, i};
580                 printStringBreaks(t->dataToBreak, expected, 2);
581                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
582                       i, t->getSrcLine(i), t->getSrcCol(i));
583             }
584         }
585 
586         // Check that the break we did find was expected
587         if (t->getExpectedBreak(bp) == 0) {
588             int expected[] = {0, bp};
589             printStringBreaks(t->textToBreak, expected, 2);
590             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
591                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
592         } else {
593             // The break was expected.
594             //   Check that the {nnn} tag value is correct.
595             int32_t expectedTagVal = t->getExpectedBreak(bp);
596             if (expectedTagVal == -1) {
597                 expectedTagVal = 0;
598             }
599             int32_t line = t->getSrcLine(bp);
600             int32_t rs = t->bi->getRuleStatus();
601             if (rs != expectedTagVal) {
602                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
603                       "          Actual, Expected status = %4d, %4d",
604                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
605             }
606         }
607 
608         prevBP = bp;
609     }
610 
611     // Verify that there were no missed expected breaks after the last one found
612     for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
613         if (t->getExpectedBreak(i) != 0) {
614             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
615                       i, t->getSrcLine(i), t->getSrcCol(i));
616         }
617     }
618 
619     //
620     //  Run the iterator backwards, verify that the same breaks are found.
621     //
622     prevBP = static_cast<int32_t>(utext_nativeLength(t->textToBreak) + 2); // start with a phony value for the last break pos seen.
623     bp = t->bi->last();
624     while (bp != BreakIterator::DONE) {
625         if (prevBP ==  bp) {
626             // Fail for lack of progress.
627             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
628                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
629             break;
630         }
631 
632         // Check that we didn't miss an expected break between the last one
633         //  and this one.  (UVector returns zeros for index out of bounds.)
634         for (i=prevBP-1; i>bp; i--) {
635             if (t->getExpectedBreak(i) != 0) {
636                 errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
637                       i, t->getSrcLine(i), t->getSrcCol(i));
638             }
639         }
640 
641         // Check that the break we did find was expected
642         if (t->getExpectedBreak(bp) == 0) {
643             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
644                    bp, t->getSrcLine(bp), t->getSrcCol(bp));
645         } else {
646             // The break was expected.
647             //   Check that the {nnn} tag value is correct.
648             int32_t expectedTagVal = t->getExpectedBreak(bp);
649             if (expectedTagVal == -1) {
650                 expectedTagVal = 0;
651             }
652             int line = t->getSrcLine(bp);
653             int32_t rs = t->bi->getRuleStatus();
654             if (rs != expectedTagVal) {
655                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
656                       "          Actual, Expected status = %4d, %4d",
657                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
658             }
659         }
660 
661         prevBP = bp;
662         bp = t->bi->previous();
663     }
664 
665     // Verify that there were no missed breaks prior to the last one found
666     for (i=prevBP-1; i>=0; i--) {
667         if (t->getExpectedBreak(i) != 0) {
668             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
669                       i, t->getSrcLine(i), t->getSrcCol(i));
670         }
671     }
672 
673     // Check isBoundary()
674     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
675         UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
676         UBool boundaryFound    = t->bi->isBoundary(i);
677         if (boundaryExpected != boundaryFound) {
678             errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
679                   "        Expected, Actual= %s, %s",
680                   i, t->getSrcLine(i), t->getSrcCol(i),
681                   boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
682         }
683     }
684 
685     // Check following()
686     for (i=0; i < static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i++) {
687         int32_t actualBreak = t->bi->following(i);
688         int32_t expectedBreak = BreakIterator::DONE;
689         for (int32_t j=i+1; j <= static_cast<int32_t>(utext_nativeLength(t->textToBreak)); j++) {
690             if (t->getExpectedBreak(j) != 0) {
691                 expectedBreak = j;
692                 break;
693             }
694         }
695         if (expectedBreak != actualBreak) {
696             errln("following(%d) incorrect. File line,col= %4d,%4d\n"
697                   "        Expected, Actual= %d, %d",
698                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
699         }
700     }
701 
702     // Check preceding()
703     for (i=static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i>=0; i--) {
704         int32_t actualBreak = t->bi->preceding(i);
705         int32_t expectedBreak = BreakIterator::DONE;
706 
707         // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
708         // preceding(trailing byte) will return the index of some preceding code point,
709         // not the lead byte of the current code point, even though that has a smaller index.
710         // Therefore, start looking at the expected break data not at i-1, but at
711         // the start of code point index - 1.
712         utext_setNativeIndex(t->textToBreak, i);
713         int32_t j = static_cast<int32_t>(utext_getNativeIndex(t->textToBreak) - 1);
714         for (; j >= 0; j--) {
715             if (t->getExpectedBreak(j) != 0) {
716                 expectedBreak = j;
717                 break;
718             }
719         }
720         if (expectedBreak != actualBreak) {
721             errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
722                   "        Expected, Actual= %d, %d",
723                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
724         }
725     }
726 }
727 
TestExtended()728 void RBBITest::TestExtended() {
729      // The expectations in this test heavily depends on the Thai dictionary.
730      // Therefore, we skip this test under the LSTM configuration.
731      if (skipDictionaryTest()) {
732          return;
733      }
734   // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
735   // data driven test closely entangles filtered and regular data.
736 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
737     UErrorCode      status  = U_ZERO_ERROR;
738     Locale          locale("");
739 
740     TestParams          tp(status);
741 
742     RegexMatcher      localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
743     if (U_FAILURE(status)) {
744         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
745     }
746 
747     //
748     //  Open and read the test data file.
749     //
750     const char *testDataDirectory = IntlTest::getSourceTestData(status);
751     CharString testFileName(testDataDirectory, -1, status);
752     testFileName.append("rbbitst.txt", -1, status);
753 
754     int    len;
755     char16_t *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
756     if (U_FAILURE(status)) {
757         errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
758         return;
759     }
760 
761     bool skipTest = false; // Skip this test?
762 
763     //
764     //  Put the test data into a UnicodeString
765     //
766     UnicodeString testString(false, testFile, len);
767 
768     enum EParseState{
769         PARSE_COMMENT,
770         PARSE_TAG,
771         PARSE_DATA,
772         PARSE_NUM,
773         PARSE_RULES
774     }
775     parseState = PARSE_TAG;
776 
777     EParseState savedState = PARSE_TAG;
778 
779     int32_t    lineNum  = 1;
780     int32_t    colStart = 0;
781     int32_t    column   = 0;
782     int32_t    charIdx  = 0;
783 
784     int32_t    tagValue = 0;             // The numeric value of a <nnn> tag.
785 
786     UnicodeString       rules;           // Holds rules from a <rules> ... </rules> block
787     int32_t             rulesFirstLine = 0;  // Line number of the start of current <rules> block
788 
789     for (charIdx = 0; charIdx < len; ) {
790         status = U_ZERO_ERROR;
791         char16_t  c = testString.charAt(charIdx);
792         charIdx++;
793         if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
794             // treat CRLF as a unit
795             c = u'\n';
796             charIdx++;
797         }
798         if (c == u'\n' || c == u'\r') {
799             lineNum++;
800             colStart = charIdx;
801         }
802         column = charIdx - colStart + 1;
803 
804         switch (parseState) {
805         case PARSE_COMMENT:
806             if (c == u'\n' || c == u'\r') {
807                 parseState = savedState;
808             }
809             break;
810 
811         case PARSE_TAG:
812             {
813             if (c == u'#') {
814                 parseState = PARSE_COMMENT;
815                 savedState = PARSE_TAG;
816                 break;
817             }
818             if (u_isUWhiteSpace(c)) {
819                 break;
820             }
821             if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
822                 delete tp.bi;
823                 tp.bi = BreakIterator::createWordInstance(locale,  status);
824                 skipTest = false;
825                 charIdx += 5;
826                 break;
827             }
828             if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
829                 delete tp.bi;
830                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
831                 skipTest = false;
832                 charIdx += 5;
833                 break;
834             }
835             if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
836                 delete tp.bi;
837                 tp.bi = BreakIterator::createLineInstance(locale,  status);
838                 skipTest = false;
839 #if UCONFIG_USE_ML_PHRASE_BREAKING
840                 if(uprv_strcmp(locale.getName(), "ja@lw=phrase") == 0) {
841                     // skip <line> test cases of JP's phrase breaking when ML is enabled.
842                     skipTest = true;
843                 }
844 #endif
845                 charIdx += 5;
846                 break;
847             }
848             if (testString.compare(charIdx-1, 8, u"<lineML>") == 0) {
849                 delete tp.bi;
850                 tp.bi = BreakIterator::createLineInstance(locale,  status);
851                 skipTest = false;
852 #if !UCONFIG_USE_ML_PHRASE_BREAKING
853                 if(uprv_strcmp(locale.getName(), "ja@lw=phrase") == 0) {
854                     // skip <lineML> test cases of JP's phrase breaking when ML is disabled.
855                     skipTest = true;
856                 }
857 #endif
858                 charIdx += 7;
859                 break;
860             }
861             if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
862                 delete tp.bi;
863                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
864                 skipTest = false;
865                 charIdx += 5;
866                 break;
867             }
868             if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
869                 delete tp.bi;
870                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
871                 charIdx += 6;
872                 break;
873             }
874 
875             if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
876                 testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
877                 charIdx = testString.indexOf(u'>', charIdx) + 1;
878                 parseState = PARSE_RULES;
879                 rules.remove();
880                 rulesFirstLine = lineNum;
881                 break;
882             }
883 
884             // <locale  loc_name>
885             localeMatcher.reset(testString);
886             if (localeMatcher.lookingAt(charIdx-1, status)) {
887                 UnicodeString localeName = localeMatcher.group(1, status);
888                 char localeName8[100];
889                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
890                 locale = Locale::createFromName(localeName8);
891                 charIdx += localeMatcher.group(0, status).length() - 1;
892                 TEST_ASSERT_SUCCESS(status);
893                 break;
894             }
895             if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
896                 parseState = PARSE_DATA;
897                 charIdx += 5;
898                 tp.dataToBreak = "";
899                 tp.expectedBreaks->removeAllElements();
900                 tp.srcCol ->removeAllElements();
901                 tp.srcLine->removeAllElements();
902                 break;
903             }
904 
905             errln("line %d: Tag expected in test file.", lineNum);
906             parseState = PARSE_COMMENT;
907             savedState = PARSE_DATA;
908             goto end_test; // Stop the test.
909             }
910             break;
911 
912         case PARSE_RULES:
913             if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
914                 charIdx += 7;
915                 parseState = PARSE_TAG;
916                 delete tp.bi;
917                 UParseError pe;
918                 tp.bi = new RuleBasedBreakIterator(rules, pe, status);
919                 skipTest = U_FAILURE(status);
920                 if (U_FAILURE(status)) {
921                     errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
922                         rulesFirstLine + pe.line - 1, u_errorName(status));
923                 }
924             } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
925                 charIdx += 10;
926                 parseState = PARSE_TAG;
927                 UErrorCode ec = U_ZERO_ERROR;
928                 UParseError pe;
929                 RuleBasedBreakIterator bi(rules, pe, ec);
930                 if (U_SUCCESS(ec)) {
931                     errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
932                         rulesFirstLine + pe.line - 1);
933                 }
934             } else {
935                 rules.append(c);
936             }
937             break;
938 
939         case PARSE_DATA:
940             if (c == u'•') {
941                 int32_t  breakIdx = tp.dataToBreak.length();
942                 if (tp.expectedBreaks->size() > breakIdx) {
943                     errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
944                           lineNum, column);
945                 }
946                 tp.expectedBreaks->setSize(breakIdx+1);
947                 tp.expectedBreaks->setElementAt(-1, breakIdx);
948                 tp.srcLine->setSize(breakIdx+1);
949                 tp.srcLine->setElementAt(lineNum, breakIdx);
950                 tp.srcCol ->setSize(breakIdx+1);
951                 tp.srcCol ->setElementAt(column, breakIdx);
952                 break;
953             }
954 
955             if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
956                 // Add final entry to mappings from break location to source file position.
957                 //  Need one extra because last break position returned is after the
958                 //    last char in the data, not at the last char.
959                 tp.srcLine->addElement(lineNum, status);
960                 tp.srcCol ->addElement(column, status);
961 
962                 parseState = PARSE_TAG;
963                 charIdx += 6;
964 
965                 if (!skipTest) {
966                     // RUN THE TEST!
967                     status = U_ZERO_ERROR;
968                     tp.setUTF16(status);
969                     executeTest(&tp, status);
970                     TEST_ASSERT_SUCCESS(status);
971 
972                     // Run again, this time with UTF-8 text wrapped in a UText.
973                     status = U_ZERO_ERROR;
974                     tp.setUTF8(status);
975                     TEST_ASSERT_SUCCESS(status);
976                     executeTest(&tp, status);
977                 }
978                 break;
979             }
980 
981             if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
982                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
983                 // Get the code point from the name and insert it into the test data.
984                 //   (Damn, no API takes names in Unicode  !!!
985                 //    we've got to take it back to char *)
986                 int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
987                 int32_t nameLength = nameEndIdx - (charIdx+2);
988                 char charNameBuf[200];
989                 UChar32 theChar = -1;
990                 if (nameEndIdx != -1) {
991                     UErrorCode status = U_ZERO_ERROR;
992                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
993                     charNameBuf[sizeof(charNameBuf)-1] = 0;
994                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
995                     if (U_FAILURE(status)) {
996                         theChar = -1;
997                     }
998                 }
999                 if (theChar == -1) {
1000                     errln("Error in named character in test file at line %d, col %d",
1001                         lineNum, column);
1002                 } else {
1003                     // Named code point was recognized.  Insert it
1004                     //   into the test data.
1005                     tp.dataToBreak.append(theChar);
1006                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1007                         tp.srcLine->addElement(lineNum, status);
1008                         tp.srcCol ->addElement(column, status);
1009                     }
1010                 }
1011                 if (nameEndIdx > charIdx) {
1012                     charIdx = nameEndIdx+1;
1013 
1014                 }
1015                 break;
1016             }
1017 
1018 
1019 
1020             if (testString.compare(charIdx-1, 2, u"<>") == 0) {
1021                 charIdx++;
1022                 int32_t  breakIdx = tp.dataToBreak.length();
1023                 tp.expectedBreaks->setSize(breakIdx+1);
1024                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1025                 tp.srcLine->setSize(breakIdx+1);
1026                 tp.srcLine->setElementAt(lineNum, breakIdx);
1027                 tp.srcCol ->setSize(breakIdx+1);
1028                 tp.srcCol ->setElementAt(column, breakIdx);
1029                 break;
1030             }
1031 
1032             if (c == u'<') {
1033                 tagValue   = 0;
1034                 parseState = PARSE_NUM;
1035                 break;
1036             }
1037 
1038             if (c == u'#' && column==3) {   // TODO:  why is column off so far?
1039                 parseState = PARSE_COMMENT;
1040                 savedState = PARSE_DATA;
1041                 break;
1042             }
1043 
1044             if (c == u'\\') {
1045                 // Check for \ at end of line, a line continuation.
1046                 //     Advance over (discard) the newline
1047                 UChar32 cp = testString.char32At(charIdx);
1048                 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
1049                     // We have a CR LF
1050                     //  Need an extra increment of the input ptr to move over both of them
1051                     charIdx++;
1052                 }
1053                 if (cp == u'\n' || cp == u'\r') {
1054                     lineNum++;
1055                     colStart = charIdx;
1056                     charIdx++;
1057                     break;
1058                 }
1059 
1060                 // Let unescape handle the back slash.
1061                 cp = testString.unescapeAt(charIdx);
1062                 if (cp != -1) {
1063                     // Escape sequence was recognized.  Insert the char
1064                     //   into the test data.
1065                     tp.dataToBreak.append(cp);
1066                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1067                         tp.srcLine->addElement(lineNum, status);
1068                         tp.srcCol ->addElement(column, status);
1069                     }
1070                     break;
1071                 }
1072 
1073 
1074                 // Not a recognized backslash escape sequence.
1075                 // Take the next char as a literal.
1076                 //  TODO:  Should this be an error?
1077                 c = testString.charAt(charIdx);
1078                 charIdx = testString.moveIndex32(charIdx, 1);
1079             }
1080 
1081             // Normal, non-escaped data char.
1082             tp.dataToBreak.append(c);
1083 
1084             // Save the mapping from offset in the data to line/column numbers in
1085             //   the original input file.  Will be used for better error messages only.
1086             //   If there's an expected break before this char, the slot in the mapping
1087             //     vector will already be set for this char; don't overwrite it.
1088             if (tp.dataToBreak.length() > tp.srcLine->size()) {
1089                 tp.srcLine->addElement(lineNum, status);
1090                 tp.srcCol ->addElement(column, status);
1091             }
1092             break;
1093 
1094 
1095         case PARSE_NUM:
1096             // We are parsing an expected numeric tag value, like <1234>,
1097             //   within a chunk of data.
1098             if (u_isUWhiteSpace(c)) {
1099                 break;
1100             }
1101 
1102             if (c == u'>') {
1103                 // Finished the number.  Add the info to the expected break data,
1104                 //   and switch parse state back to doing plain data.
1105                 parseState = PARSE_DATA;
1106                 if (tagValue == 0) {
1107                     tagValue = -1;
1108                 }
1109                 int32_t  breakIdx = tp.dataToBreak.length();
1110                 if (tp.expectedBreaks->size() > breakIdx) {
1111                     errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
1112                           lineNum, column);
1113                 }
1114                 tp.expectedBreaks->setSize(breakIdx+1);
1115                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1116                 tp.srcLine->setSize(breakIdx+1);
1117                 tp.srcLine->setElementAt(lineNum, breakIdx);
1118                 tp.srcCol ->setSize(breakIdx+1);
1119                 tp.srcCol ->setElementAt(column, breakIdx);
1120                 break;
1121             }
1122 
1123             if (u_isdigit(c)) {
1124                 tagValue = tagValue*10 + u_charDigitValue(c);
1125                 break;
1126             }
1127 
1128             errln("Syntax Error in test file at line %d, col %d",
1129                 lineNum, column);
1130             parseState = PARSE_COMMENT;
1131             goto end_test; // Stop the test
1132             break;
1133         }
1134 
1135 
1136         if (U_FAILURE(status)) {
1137             dataerrln("ICU Error %s while parsing test file at line %d.",
1138                 u_errorName(status), lineNum);
1139             status = U_ZERO_ERROR;
1140             goto end_test; // Stop the test
1141         }
1142 
1143     }
1144 
1145     // Reached end of test file. Raise an error if parseState indicates that we are
1146     //   within a block that should have been terminated.
1147 
1148     if (parseState == PARSE_RULES) {
1149         errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
1150             lineNum, rulesFirstLine);
1151     }
1152     if (parseState == PARSE_DATA) {
1153         errln("rbbitst.txt:%d <data> block not closed.", lineNum);
1154     }
1155 
1156 
1157 end_test:
1158     delete [] testFile;
1159 #endif
1160 }
1161 
//-------------------------------------------------------------------------------
//
//  TestDictRules   Create a break iterator from source rules that include a
//                  dictionary range.   Regression for bug #7130.  The source rules
//                  do not declare a break iterator type (word, line, sentence, etc.),
//                  and the dictionary code, without a type, would loop.
//
//-------------------------------------------------------------------------------
TestDictRules()1170 void RBBITest::TestDictRules() {
1171     const char *rules =  "$dictionary = [a-z]; \n"
1172                          "!!forward; \n"
1173                          "$dictionary $dictionary; \n"
1174                          "!!reverse; \n"
1175                          "$dictionary $dictionary; \n";
1176     const char *text = "aa";
1177     UErrorCode status = U_ZERO_ERROR;
1178     UParseError parseError;
1179 
1180     RuleBasedBreakIterator bi(rules, parseError, status);
1181     if (U_SUCCESS(status)) {
1182         UnicodeString utext = text;
1183         bi.setText(utext);
1184         int32_t position;
1185         int32_t loops;
1186         for (loops = 0; loops<10; loops++) {
1187             position = bi.next();
1188             if (position == RuleBasedBreakIterator::DONE) {
1189                 break;
1190             }
1191         }
1192         TEST_ASSERT(loops == 1);
1193     } else {
1194         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1195     }
1196 }
1197 
1198 
1199 
1200 //--------------------------------------------------------------------------------------------
1201 //
1202 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1203 //
1204 //-------------------------------------------------------------------------------------------
TestUnicodeFiles()1205 void RBBITest::TestUnicodeFiles() {
1206     RuleBasedBreakIterator  *bi;
1207     UErrorCode               status = U_ZERO_ERROR;
1208 
1209     bi =  dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createCharacterInstance(Locale::getEnglish(), status));
1210     TEST_ASSERT_SUCCESS(status);
1211     if (U_SUCCESS(status)) {
1212         runUnicodeTestData("GraphemeBreakTest.txt", bi);
1213     }
1214     delete bi;
1215 
1216     bi =  dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createWordInstance(Locale::getEnglish(), status));
1217     TEST_ASSERT_SUCCESS(status);
1218     if (U_SUCCESS(status)) {
1219         runUnicodeTestData("WordBreakTest.txt", bi);
1220     }
1221     delete bi;
1222 
1223     bi =  dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createSentenceInstance(Locale::getEnglish(), status));
1224     TEST_ASSERT_SUCCESS(status);
1225     if (U_SUCCESS(status)) {
1226         runUnicodeTestData("SentenceBreakTest.txt", bi);
1227     }
1228     delete bi;
1229 
1230     bi =  dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createLineInstance(Locale::getEnglish(), status));
1231     TEST_ASSERT_SUCCESS(status);
1232     if (U_SUCCESS(status)) {
1233         runUnicodeTestData("LineBreakTest.txt", bi);
1234     }
1235     delete bi;
1236 }
1237 
1238 
1239 // Check for test cases from the Unicode test data files that are known to fail
1240 // and should be skipped as known issues because ICU does not fully implement
1241 // the Unicode specifications, or because ICU includes tailorings that differ from
1242 // the Unicode standard.
1243 //
1244 // Test cases are identified by the test data sequence, which tends to be more stable
1245 // across Unicode versions than the test file line numbers.
1246 //
1247 // The test case with ticket "10666" is a dummy, included as an example.
1248 
testCaseIsKnownIssue(const UnicodeString & testCase,const char * fileName)1249 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1250     static struct TestCase {
1251         const char *fTicketNum;
1252         const char *fFileName;
1253         const char16_t *fString;
1254     } badTestCases[] = {
1255         {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"},    // Fake example, for illustration.
1256         // The following tests were originally for
1257         // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
1258         // However, that ticket has been closed as fixed but these tests still fail, so
1259         // ICU-21097 has been created to investigate and address these remaining issues.
1260         {"21097",  "LineBreakTest.txt", u"-#"},
1261         {"21097",  "LineBreakTest.txt", u"\u002d\u0308\u0023"},
1262         {"21097",  "LineBreakTest.txt", u"\u002d\u00a7"},
1263         {"21097",  "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
1264         {"21097",  "LineBreakTest.txt", u"\u002d\U00050005"},
1265         {"21097",  "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
1266         {"21097",  "LineBreakTest.txt", u"\u002d\u0e01"},
1267         {"21097",  "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
1268 
1269         // The following tests were originally for
1270         // Issue ICU-12017 Improve line break around numbers.
1271         // However, that ticket has been closed as fixed but these tests still fail, so
1272         // ICU-21097 has been created to investigate and address these remaining issues.
1273         {"21097", "LineBreakTest.txt", u"\u002C\u0030"},   // ",0"
1274         {"21097", "LineBreakTest.txt", u"\u002C\u0308\u0030"},
1275         {"21097", "LineBreakTest.txt", u"equals .35 cents"},
1276         {"21097", "LineBreakTest.txt", u"a.2 "},
1277         {"21097", "LineBreakTest.txt", u"a.2 \u0915"},
1278         {"21097", "LineBreakTest.txt", u"a.2 \u672C"},
1279         {"21097", "LineBreakTest.txt", u"a.2\u3000\u672C"},
1280         {"21097", "LineBreakTest.txt", u"a.2\u3000\u307E"},
1281         {"21097", "LineBreakTest.txt", u"a.2\u3000\u0033"},
1282         {"21097", "LineBreakTest.txt", u"A.1 \uBABB"},
1283         {"21097", "LineBreakTest.txt", u"\uBD24\uC5B4\u002E\u0020\u0041\u002E\u0032\u0020\uBCFC"},
1284         {"21097", "LineBreakTest.txt", u"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"},
1285         {"21097", "LineBreakTest.txt", u"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"},
1286         {"21097", "LineBreakTest.txt", u"a.2\u3000\u300C"},
1287 
1288         // ICU-22127 until UAX #29 wordbreak is update for the colon changes in ICU-22112,
1289         // need to skip some tests in WordBreakTest.txt
1290         {"22127", "WordBreakTest.txt", u"a:"},
1291         {"22127", "WordBreakTest.txt", u"A:"},
1292     };
1293 
1294     for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
1295         const TestCase &badCase = badTestCases[n];
1296         if (!strcmp(fileName, badCase.fFileName) &&
1297                 testCase.startsWith(UnicodeString(badCase.fString))) {
1298             return logKnownIssue(badCase.fTicketNum);
1299         }
1300     }
1301     return false;
1302 }
1303 
1304 
1305 //--------------------------------------------------------------------------------------------
1306 //
1307 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1308 //
1309 //-------------------------------------------------------------------------------------------
runUnicodeTestData(const char * fileName,RuleBasedBreakIterator * bi)1310 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1311 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1312     UErrorCode  status = U_ZERO_ERROR;
1313 
1314     //
1315     //  Open and read the test data file, put it into a UnicodeString.
1316     //
1317     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1318     char testFileName[1000];
1319     if (testDataDirectory == nullptr || strlen(testDataDirectory) >= sizeof(testFileName)) {
1320         dataerrln("Can't open test data.  Path too long.");
1321         return;
1322     }
1323     strcpy(testFileName, testDataDirectory);
1324     strcat(testFileName, fileName);
1325 
1326     logln("Opening data file %s\n", fileName);
1327 
1328     int    len;
1329     char16_t *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1330     if (status != U_FILE_ACCESS_ERROR) {
1331         TEST_ASSERT_SUCCESS(status);
1332         TEST_ASSERT(testFile != nullptr);
1333     }
1334     if (U_FAILURE(status) || testFile == nullptr) {
1335         return; /* something went wrong, error already output */
1336     }
1337     UnicodeString testFileAsString(true, testFile, len);
1338 
1339     //
1340     //  Parse the test data file using a regular expression.
1341     //  Each kind of token is recognized in its own capture group; what type of item was scanned
1342     //     is identified by which group had a match.
1343     //
1344     //    Capture Group  #                  1          2            3            4           5
1345     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1346     //
1347     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1348     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1349     UnicodeString   testString;
1350     UVector32       breakPositions(status);
1351     int             lineNumber = 1;
1352     TEST_ASSERT_SUCCESS(status);
1353     if (U_FAILURE(status)) {
1354         return;
1355     }
1356 
1357     //
1358     //  Scan through each test case, building up the string to be broken in testString,
1359     //   and the positions that should be boundaries in the breakPositions vector.
1360     //
1361     int spin = 0;
1362     while (tokenMatcher.find()) {
1363         if(tokenMatcher.hitEnd()) {
1364           /* Shouldn't Happen(TM).  This means we didn't find the symbols we were looking for.
1365              This occurred when the text file was corrupt (wasn't marked as UTF-8)
1366              and caused an infinite loop here on EBCDIC systems!
1367           */
1368           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1369           //       return;
1370         }
1371         if (tokenMatcher.start(1, status) >= 0) {
1372             // Scanned a divide sign, indicating a break position in the test data.
1373             if (testString.length()>0) {
1374                 breakPositions.addElement(testString.length(), status);
1375             }
1376         }
1377         else if (tokenMatcher.start(2, status) >= 0) {
1378             // Scanned an 'x', meaning no break at this position in the test data
1379             //   Nothing to be done here.
1380             }
1381         else if (tokenMatcher.start(3, status) >= 0) {
1382             // Scanned Hex digits.  Convert them to binary, append to the character data string.
1383             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1384             int length = hexNumber.length();
1385             if (length<=8) {
1386                 char buf[10];
1387                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1388                 UChar32 c = (UChar32)strtol(buf, nullptr, 16);
1389                 if (c<=0x10ffff) {
1390                     testString.append(c);
1391                 } else {
1392                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1393                        fileName, lineNumber);
1394                 }
1395             } else {
1396                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1397                        fileName, lineNumber);
1398              }
1399         }
1400         else if (tokenMatcher.start(4, status) >= 0) {
1401             // Scanned to end of a line, possibly skipping over a comment in the process.
1402             //   If the line from the file contained test data, run the test now.
1403             if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1404                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1405             }
1406 
1407             // Clear out this test case.
1408             //    The string and breakPositions vector will be refilled as the next
1409             //       test case is parsed.
1410             testString.remove();
1411             breakPositions.removeAllElements();
1412             lineNumber++;
1413         } else {
1414             // Scanner catchall.  Something unrecognized appeared on the line.
1415             char token[16];
1416             UnicodeString uToken = tokenMatcher.group(0, status);
1417             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1418             token[sizeof(token)-1] = 0;
1419             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1420 
1421             // Clean up, in preparation for continuing with the next line.
1422             testString.remove();
1423             breakPositions.removeAllElements();
1424             lineNumber++;
1425         }
1426         TEST_ASSERT_SUCCESS(status);
1427         if (U_FAILURE(status)) {
1428             break;
1429         }
1430     }
1431 
1432     delete [] testFile;
1433  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1434 }
1435 
1436 //--------------------------------------------------------------------------------------------
1437 //
1438 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1439 //                            test data files.  Do only a simple, forward-only check -
1440 //                            this test is mostly to check that ICU and the Unicode
1441 //                            data agree with each other.
1442 //
1443 //--------------------------------------------------------------------------------------------
checkUnicodeTestCase(const char * testFileName,int lineNumber,const UnicodeString & testString,UVector32 * breakPositions,RuleBasedBreakIterator * bi)1444 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1445                          const UnicodeString &testString,   // Text data to be broken
1446                          UVector32 *breakPositions,         // Positions where breaks should be found.
1447                          RuleBasedBreakIterator *bi) {
1448     int32_t pos;                 // Break Position in the test string
1449     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1450     int32_t expectedPos;         // Expected break position (index into test string)
1451 
1452     bi->setText(testString);
1453     pos = bi->first();
1454     pos = bi->next();
1455 
1456     while (pos != BreakIterator::DONE) {
1457         if (expectedI >= breakPositions->size()) {
1458             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1459                 testFileName, lineNumber, pos);
1460             break;
1461         }
1462         expectedPos = breakPositions->elementAti(expectedI);
1463         if (pos < expectedPos) {
1464             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1465                 testFileName, lineNumber, pos);
1466             break;
1467         }
1468         if (pos > expectedPos) {
1469             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1470                 testFileName, lineNumber, expectedPos);
1471             break;
1472         }
1473         pos = bi->next();
1474         expectedI++;
1475     }
1476 
1477     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1478         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1479             testFileName, lineNumber, breakPositions->elementAti(expectedI));
1480     }
1481 }
1482 
1483 
1484 
1485 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1486 //---------------------------------------------------------------------------------------
1487 //
1488 //   class RBBIMonkeyKind
1489 //
1490 //      Monkey Test for Break Iteration
1491 //      Abstract interface class.   Concrete derived classes independently
1492 //      implement the break rules for different iterator types.
1493 //
1494 //      The Monkey Test itself doesn't know which type of break iterator it is
1495 //      testing, but works purely in terms of the interface defined here.
1496 //
1497 //---------------------------------------------------------------------------------------
1498 class RBBIMonkeyKind {
1499 public:
1500     // Return a UVector of UnicodeSets, representing the character classes used
1501     //   for this type of iterator.
1502     virtual  UVector  *charClasses() = 0;
1503 
1504     // Set the test text on which subsequent calls to next() will operate
1505     virtual  void      setText(const UnicodeString &s) = 0;
1506 
1507     // Find the next break position, starting from the prev break position, or from zero.
1508     // Return -1 after reaching end of string.
1509     virtual  int32_t   next(int32_t i) = 0;
1510 
1511     // Name of each character class, parallel with charClasses. Used for debugging output
1512     // of characters.
1513     virtual  std::vector<std::string>&     characterClassNames();
1514 
1515     void setAppliedRule(int32_t position, const char* value);
1516 
1517     std::string getAppliedRule(int32_t position);
1518 
1519     virtual ~RBBIMonkeyKind();
1520     UErrorCode deferredStatus;
1521 
1522     std::string classNameFromCodepoint(const UChar32 c);
1523     unsigned int maxClassNameSize();
1524 
1525  protected:
1526      RBBIMonkeyKind();
1527      std::vector<std::string> classNames;
1528      std::vector<std::string> appliedRules;
1529 
1530     // Clear `appliedRules` and fill it with empty strings in the size of test text.
1531     void prepareAppliedRules(int32_t size );
1532 
1533  private:
1534 
1535 };
1536 
RBBIMonkeyKind()1537 RBBIMonkeyKind::RBBIMonkeyKind() {
1538     deferredStatus = U_ZERO_ERROR;
1539 }
1540 
~RBBIMonkeyKind()1541 RBBIMonkeyKind::~RBBIMonkeyKind() {
1542 }
1543 
characterClassNames()1544 std::vector<std::string>& RBBIMonkeyKind::characterClassNames() {
1545     return classNames;
1546 }
1547 
prepareAppliedRules(int32_t size)1548 void RBBIMonkeyKind::prepareAppliedRules(int32_t size) {
1549     // Remove all the information in the `appliedRules`.
1550     appliedRules.clear();
1551     appliedRules.resize(size + 1);
1552 }
1553 
setAppliedRule(int32_t position,const char * value)1554 void RBBIMonkeyKind::setAppliedRule(int32_t position, const char* value) {
1555     appliedRules[position] = value;
1556 }
1557 
getAppliedRule(int32_t position)1558 std::string RBBIMonkeyKind::getAppliedRule(int32_t position){
1559     return appliedRules[position];
1560 }
1561 
classNameFromCodepoint(const UChar32 c)1562 std::string RBBIMonkeyKind::classNameFromCodepoint(const UChar32 c) {
1563     // Simply iterate through charClasses to find character's class
1564     for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1565         UnicodeSet *classSet = static_cast<UnicodeSet *>(charClasses()->elementAt(aClassNum));
1566         if (classSet->contains(c)) {
1567             return classNames[aClassNum];
1568         }
1569     }
1570     U_ASSERT(false);  // This should not happen.
1571     return "bad class name";
1572 }
1573 
maxClassNameSize()1574 unsigned int RBBIMonkeyKind::maxClassNameSize() {
1575     unsigned int maxSize = 0;
1576     for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1577         auto aClassNumSize = static_cast<unsigned int>(classNames[aClassNum].size());
1578         if (aClassNumSize > maxSize) {
1579             maxSize = aClassNumSize;
1580         }
1581     }
1582     return maxSize;
1583 }
1584 
1585 //----------------------------------------------------------------------------------------
1586 //
1587 //   Random Numbers.  Similar to standard lib rand() and srand()
1588 //                    Not using library to
1589 //                      1.  Get same results on all platforms.
1590 //                      2.  Get access to current seed, to more easily reproduce failures.
1591 //
1592 //---------------------------------------------------------------------------------------
// Current state of the generator.
static uint32_t m_seed = 1;

// Advance the generator one step and return a value in the range 0..32767.
// The state update is seed * 1103515245 + 12345 in 32-bit unsigned arithmetic;
// the result is taken from bits 16..30 of the new state.
static uint32_t m_rand()
{
    m_seed = m_seed * 1103515245u + 12345u;
    return (m_seed >> 16) & 0x7FFFu;
}
1600 
1601 
1602 //------------------------------------------------------------------------------------------
1603 //
1604 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
1605 //                             of RBBIMonkeyKind.
1606 //
1607 //------------------------------------------------------------------------------------------
1608 class RBBICharMonkey: public RBBIMonkeyKind {
1609 public:
1610     RBBICharMonkey();
1611     virtual          ~RBBICharMonkey();
1612     virtual  UVector *charClasses() override;
1613     virtual  void     setText(const UnicodeString &s) override;
1614     virtual  int32_t  next(int32_t i) override;
1615 private:
1616     UVector   *fSets;
1617 
1618     UnicodeSet  *fCRLFSet;
1619     UnicodeSet  *fControlSet;
1620     UnicodeSet  *fExtendSet;
1621     UnicodeSet  *fZWJSet;
1622     UnicodeSet  *fRegionalIndicatorSet;
1623     UnicodeSet  *fPrependSet;
1624     UnicodeSet  *fSpacingSet;
1625     UnicodeSet  *fLSet;
1626     UnicodeSet  *fVSet;
1627     UnicodeSet  *fTSet;
1628     UnicodeSet  *fLVSet;
1629     UnicodeSet  *fLVTSet;
1630     UnicodeSet  *fHangulSet;
1631     UnicodeSet  *fExtendedPictSet;
1632     UnicodeSet  *fViramaSet;
1633     UnicodeSet  *fLinkingConsonantSet;
1634     UnicodeSet  *fExtCccZwjSet;
1635     UnicodeSet  *fAnySet;
1636 
1637     const UnicodeString *fText;
1638 };
1639 
1640 
RBBICharMonkey()1641 RBBICharMonkey::RBBICharMonkey() {
1642     UErrorCode  status = U_ZERO_ERROR;
1643 
1644     fText = nullptr;
1645 
1646     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1647     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
1648     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
1649     fZWJSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
1650     fRegionalIndicatorSet =
1651                   new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1652     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1653     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1654     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1655     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1656     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1657     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1658     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1659     fHangulSet  = new UnicodeSet();
1660     fHangulSet->addAll(*fLSet);
1661     fHangulSet->addAll(*fVSet);
1662     fHangulSet->addAll(*fTSet);
1663     fHangulSet->addAll(*fLVSet);
1664     fHangulSet->addAll(*fLVTSet);
1665 
1666     fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1667     fViramaSet        = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1668                                         "\\p{Indic_Syllabic_Category=Virama}]", status);
1669     fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1670                                         "\\p{Indic_Syllabic_Category=Consonant}]", status);
1671     fExtCccZwjSet     = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status);
1672     fAnySet           = new UnicodeSet(0, 0x10ffff);
1673 
1674     // Create sets of characters, and add the names of the above character sets.
1675     // In each new ICU release, add new names corresponding to the sets above.
1676     fSets             = new UVector(status);
1677 
1678     // Important: Keep class names the same as the class contents.
1679     fSets->addElement(fCRLFSet, status); classNames.push_back("CRLF");
1680     fSets->addElement(fControlSet, status); classNames.push_back("Control");
1681     fSets->addElement(fExtendSet, status); classNames.push_back("Extended");
1682     fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
1683     if (!fPrependSet->isEmpty()) {
1684         fSets->addElement(fPrependSet, status); classNames.push_back("Prepend");
1685     }
1686     fSets->addElement(fSpacingSet, status); classNames.push_back("Spacing");
1687     fSets->addElement(fHangulSet, status); classNames.push_back("Hangul");
1688     fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
1689     fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
1690     fSets->addElement(fViramaSet, status); classNames.push_back("Virama");
1691     fSets->addElement(fLinkingConsonantSet, status); classNames.push_back("LinkingConsonant");
1692     fSets->addElement(fExtCccZwjSet, status); classNames.push_back("ExtCcccZwj");
1693     fSets->addElement(fAnySet, status); classNames.push_back("Any");
1694 
1695     if (U_FAILURE(status)) {
1696         deferredStatus = status;
1697     }
1698 }
1699 
1700 
setText(const UnicodeString & s)1701 void RBBICharMonkey::setText(const UnicodeString &s) {
1702     fText = &s;
1703     prepareAppliedRules(s.length());
1704 }
1705 
1706 
1707 
next(int32_t prevPos)1708 int32_t RBBICharMonkey::next(int32_t prevPos) {
1709     int    p0, p1, p2, p3;    // Indices of the significant code points around the
1710                               //   break position being tested.  The candidate break
1711                               //   location is before p2.
1712 
1713     int     breakPos = -1;
1714 
1715     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
1716     UChar32 cBase;            // for (X Extend*) patterns, the X character.
1717 
1718     if (U_FAILURE(deferredStatus)) {
1719         return -1;
1720     }
1721 
1722     // Previous break at end of string.  return DONE.
1723     if (prevPos >= fText->length()) {
1724         return -1;
1725     }
1726 
1727     p0 = p1 = p2 = p3 = prevPos;
1728     c3 =  fText->char32At(prevPos);
1729     c0 = c1 = c2 = cBase = 0;
1730     (void)p0;   // suppress set but not used warning.
1731     (void)c0;
1732 
1733     // Loop runs once per "significant" character position in the input text.
1734     for (;;) {
1735         // Move all of the positions forward in the input string.
1736         p0 = p1;  c0 = c1;
1737         p1 = p2;  c1 = c2;
1738         p2 = p3;  c2 = c3;
1739 
1740         // Advance p3 by one codepoint
1741         p3 = fText->moveIndex32(p3, 1);
1742         c3 = fText->char32At(p3);
1743 
1744         if (p1 == p2) {
1745             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
1746             continue;
1747         }
1748 
1749         if (p2 == fText->length()) {
1750             setAppliedRule(p2, "End of String");
1751             break;
1752         }
1753 
1754         //     No Extend or Format characters may appear between the CR and LF,
1755         //     which requires the additional check for p2 immediately following p1.
1756         //
1757         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
1758           setAppliedRule(p2, "GB3   CR x LF");
1759           continue;
1760         }
1761 
1762         if (fControlSet->contains(c1) ||
1763             c1 == 0x0D ||
1764             c1 == 0x0A)  {
1765           setAppliedRule(p2, "GB4   ( Control | CR | LF ) <break>");
1766           break;
1767         }
1768 
1769         if (fControlSet->contains(c2) ||
1770             c2 == 0x0D ||
1771             c2 == 0x0A)  {
1772             setAppliedRule(p2, "GB5   <break>  ( Control | CR | LF )");
1773             break;
1774         }
1775 
1776         if (fLSet->contains(c1) &&
1777                (fLSet->contains(c2)  ||
1778                 fVSet->contains(c2)  ||
1779                 fLVSet->contains(c2) ||
1780                 fLVTSet->contains(c2))) {
1781             setAppliedRule(p2, "GB6   L x ( L | V | LV | LVT )");
1782             continue;
1783         }
1784 
1785         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1786             (fVSet->contains(c2) || fTSet->contains(c2)))  {
1787             setAppliedRule(p2, "GB7    ( LV | V )  x  ( V | T )");
1788             continue;
1789         }
1790 
1791         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1792             fTSet->contains(c2))  {
1793             setAppliedRule(p2, "GB8   ( LVT | T)  x T");
1794             continue;
1795         }
1796 
1797         if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
1798             if (!fExtendSet->contains(c1)) {
1799                 cBase = c1;
1800             }
1801             setAppliedRule(p2, "GB9   x (Extend | ZWJ)");
1802             continue;
1803         }
1804 
1805         if (fSpacingSet->contains(c2)) {
1806             setAppliedRule(p2, "GB9a  x  SpacingMark");
1807             continue;
1808         }
1809 
1810         if (fPrependSet->contains(c1)) {
1811             setAppliedRule(p2, "GB9b  Prepend x");
1812             continue;
1813         }
1814 
1815         //   Note: Viramas are also included in the ExtCccZwj class.
1816         if (fLinkingConsonantSet->contains(c2)) {
1817             int pi = p1;
1818             bool sawVirama = false;
1819             while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) {
1820                 if (fViramaSet->contains(fText->char32At(pi))) {
1821                     sawVirama = true;
1822                 }
1823                 pi = fText->moveIndex32(pi, -1);
1824             }
1825             if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) {
1826               setAppliedRule(p2, "GB9.3  LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant");
1827               continue;
1828             }
1829         }
1830 
1831         if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
1832           setAppliedRule(p2, "GB11  Extended_Pictographic Extend * ZWJ x Extended_Pictographic");
1833           continue;
1834         }
1835 
1836         //                   Note: The first if condition is a little tricky. We only need to force
1837         //                      a break if there are three or more contiguous RIs. If there are
1838         //                      only two, a break following will occur via other rules, and will include
1839         //                      any trailing extend characters, which is needed behavior.
1840         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
1841                 && fRegionalIndicatorSet->contains(c2)) {
1842           setAppliedRule(p2, "GB12-13  Regional_Indicator x Regional_Indicator");
1843           break;
1844         }
1845         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
1846           setAppliedRule(p2, "GB12-13  Regional_Indicator x Regional_Indicator");
1847           continue;
1848         }
1849 
1850         setAppliedRule(p2, "GB999 Any <break> Any");
1851         break;
1852     }
1853 
1854     breakPos = p2;
1855     return breakPos;
1856 }
1857 
1858 
1859 
charClasses()1860 UVector  *RBBICharMonkey::charClasses() {
1861     return fSets;
1862 }
1863 
~RBBICharMonkey()1864 RBBICharMonkey::~RBBICharMonkey() {
1865     delete fSets;
1866     delete fCRLFSet;
1867     delete fControlSet;
1868     delete fExtendSet;
1869     delete fRegionalIndicatorSet;
1870     delete fPrependSet;
1871     delete fSpacingSet;
1872     delete fLSet;
1873     delete fVSet;
1874     delete fTSet;
1875     delete fLVSet;
1876     delete fLVTSet;
1877     delete fHangulSet;
1878     delete fAnySet;
1879     delete fZWJSet;
1880     delete fExtendedPictSet;
1881     delete fViramaSet;
1882     delete fLinkingConsonantSet;
1883     delete fExtCccZwjSet;
1884 }
1885 
1886 //------------------------------------------------------------------------------------------
1887 //
1888 //   class RBBIWordMonkey      Word Break specific implementation
1889 //                             of RBBIMonkeyKind.
1890 //
1891 //------------------------------------------------------------------------------------------
1892 class RBBIWordMonkey: public RBBIMonkeyKind {
1893 public:
1894     RBBIWordMonkey();
1895     virtual          ~RBBIWordMonkey();
1896     virtual  UVector *charClasses() override;
1897     virtual  void     setText(const UnicodeString &s) override;
1898     virtual int32_t   next(int32_t i) override;
1899 private:
1900     UVector      *fSets;
1901 
1902     UnicodeSet  *fCRSet;
1903     UnicodeSet  *fLFSet;
1904     UnicodeSet  *fNewlineSet;
1905     UnicodeSet  *fRegionalIndicatorSet;
1906     UnicodeSet  *fKatakanaSet;
1907     UnicodeSet  *fHebrew_LetterSet;
1908     UnicodeSet  *fALetterSet;
1909     UnicodeSet  *fSingle_QuoteSet;
1910     UnicodeSet  *fDouble_QuoteSet;
1911     UnicodeSet  *fMidNumLetSet;
1912     UnicodeSet  *fMidLetterSet;
1913     UnicodeSet  *fMidNumSet;
1914     UnicodeSet  *fNumericSet;
1915     UnicodeSet  *fFormatSet;
1916     UnicodeSet  *fOtherSet = nullptr;
1917     UnicodeSet  *fExtendSet;
1918     UnicodeSet  *fExtendNumLetSet;
1919     UnicodeSet  *fWSegSpaceSet;
1920     UnicodeSet  *fDictionarySet = nullptr;
1921     UnicodeSet  *fZWJSet;
1922     UnicodeSet  *fExtendedPictSet;
1923 
1924     const UnicodeString  *fText;
1925 };
1926 
1927 
RBBIWordMonkey()1928 RBBIWordMonkey::RBBIWordMonkey()
1929 {
1930     UErrorCode  status = U_ZERO_ERROR;
1931 
1932     fSets            = new UVector(status);
1933 
1934     fCRSet            = new UnicodeSet(u"[\\p{Word_Break = CR}]",           status);
1935     fLFSet            = new UnicodeSet(u"[\\p{Word_Break = LF}]",           status);
1936     fNewlineSet       = new UnicodeSet(u"[\\p{Word_Break = Newline}]",      status);
1937     fKatakanaSet      = new UnicodeSet(u"[\\p{Word_Break = Katakana}]",     status);
1938     fRegionalIndicatorSet =  new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
1939     fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
1940     fALetterSet       = new UnicodeSet(u"[\\p{Word_Break = ALetter} @]", status);
1941     fSingle_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]",    status);
1942     fDouble_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]",    status);
1943     fMidNumLetSet     = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]",    status);
1944     fMidLetterSet     = new UnicodeSet(u"[\\p{Word_Break = MidLetter} - [\\: \\uFE55 \\uFF1A]]",    status);
1945     fMidNumSet        = new UnicodeSet(u"[\\p{Word_Break = MidNum}]",       status);
1946     fNumericSet       = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
1947     fFormatSet        = new UnicodeSet(u"[\\p{Word_Break = Format}]",       status);
1948     fExtendNumLetSet  = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
1949     // There are some sc=Hani characters with WB=Extend.
1950     // The break rules need to pick one or the other because
1951     // Extend overlapping with something else is messy.
1952     // For Unicode 13, we chose to keep U+16FF0 & U+16FF1
1953     // in $Han (for $dictionary) and out of $Extend.
1954     fExtendSet        = new UnicodeSet(u"[\\p{Word_Break = Extend}-[:Hani:]]", status);
1955     fWSegSpaceSet     = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]",    status);
1956 
1957     fZWJSet           = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]",          status);
1958     fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1959     if(U_FAILURE(status)) {
1960         IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
1961         deferredStatus = status;
1962         return;
1963     }
1964 
1965     fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
1966     fDictionarySet->addAll(*fKatakanaSet);
1967     fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
1968 
1969     fALetterSet->removeAll(*fDictionarySet);
1970 
1971     fOtherSet        = new UnicodeSet();
1972     if(U_FAILURE(status)) {
1973         IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
1974         deferredStatus = status;
1975         return;
1976     }
1977 
1978     fOtherSet->complement();
1979     fOtherSet->removeAll(*fCRSet);
1980     fOtherSet->removeAll(*fLFSet);
1981     fOtherSet->removeAll(*fNewlineSet);
1982     fOtherSet->removeAll(*fKatakanaSet);
1983     fOtherSet->removeAll(*fHebrew_LetterSet);
1984     fOtherSet->removeAll(*fALetterSet);
1985     fOtherSet->removeAll(*fSingle_QuoteSet);
1986     fOtherSet->removeAll(*fDouble_QuoteSet);
1987     fOtherSet->removeAll(*fMidLetterSet);
1988     fOtherSet->removeAll(*fMidNumSet);
1989     fOtherSet->removeAll(*fNumericSet);
1990     fOtherSet->removeAll(*fExtendNumLetSet);
1991     fOtherSet->removeAll(*fWSegSpaceSet);
1992     fOtherSet->removeAll(*fFormatSet);
1993     fOtherSet->removeAll(*fExtendSet);
1994     fOtherSet->removeAll(*fRegionalIndicatorSet);
1995     fOtherSet->removeAll(*fZWJSet);
1996     fOtherSet->removeAll(*fExtendedPictSet);
1997 
1998     // Inhibit dictionary characters from being tested at all.
1999     fOtherSet->removeAll(*fDictionarySet);
2000 
2001     // Add classes and their names
2002     fSets->addElement(fCRSet, status); classNames.push_back("CR");
2003     fSets->addElement(fLFSet, status); classNames.push_back("LF");
2004     fSets->addElement(fNewlineSet, status); classNames.push_back("Newline");
2005     fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
2006     fSets->addElement(fHebrew_LetterSet, status); classNames.push_back("Hebrew");
2007     fSets->addElement(fALetterSet, status); classNames.push_back("ALetter");
2008     fSets->addElement(fSingle_QuoteSet, status); classNames.push_back("Single Quote");
2009     fSets->addElement(fDouble_QuoteSet, status); classNames.push_back("Double Quote");
2010     // Omit Katakana from fSets, which omits Katakana characters
2011     // from the test data. They are all in the dictionary set,
2012     // which this (old, to be retired) monkey test cannot handle.
2013     //fSets->addElement(fKatakanaSet, status);
2014 
2015     fSets->addElement(fMidLetterSet, status); classNames.push_back("MidLetter");
2016     fSets->addElement(fMidNumLetSet, status); classNames.push_back("MidNumLet");
2017     fSets->addElement(fMidNumSet, status); classNames.push_back("MidNum");
2018     fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
2019     fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2020     fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2021     fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2022     fSets->addElement(fExtendNumLetSet, status); classNames.push_back("ExtendNumLet");
2023     fSets->addElement(fWSegSpaceSet, status); classNames.push_back("WSegSpace");
2024 
2025     fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
2026     fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
2027 
2028     if (U_FAILURE(status)) {
2029         deferredStatus = status;
2030     }
2031 }
2032 
setText(const UnicodeString & s)2033 void RBBIWordMonkey::setText(const UnicodeString &s) {
2034     fText       = &s;
2035     prepareAppliedRules(s.length());
2036 }
2037 
2038 
next(int32_t prevPos)2039 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2040     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2041                               //   break position being tested.  The candidate break
2042                               //   location is before p2.
2043 
2044     int     breakPos = -1;
2045 
2046     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2047 
2048     if (U_FAILURE(deferredStatus)) {
2049         return -1;
2050     }
2051 
2052     // Prev break at end of string.  return DONE.
2053     if (prevPos >= fText->length()) {
2054         return -1;
2055     }
2056     p0 = p1 = p2 = p3 = prevPos;
2057     c3 =  fText->char32At(prevPos);
2058     c0 = c1 = c2 = 0;
2059     (void)p0;       // Suppress set but not used warning.
2060 
2061     // Loop runs once per "significant" character position in the input text.
2062     for (;;) {
2063         // Move all of the positions forward in the input string.
2064         p0 = p1;  c0 = c1;
2065         p1 = p2;  c1 = c2;
2066         p2 = p3;  c2 = c3;
2067 
2068         // Advance p3 by    X(Extend | Format)*   Rule 4
2069         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2070         do {
2071             p3 = fText->moveIndex32(p3, 1);
2072             c3 = fText->char32At(p3);
2073             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2074                break;
2075             }
2076         }
2077         while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
2078 
2079 
2080         if (p1 == p2) {
2081             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2082             continue;
2083         }
2084 
2085         if (p2 == fText->length()) {
2086             // Reached end of string.  Always a break position.
2087             break;
2088         }
2089 
2090         //     No Extend or Format characters may appear between the CR and LF,
2091         //     which requires the additional check for p2 immediately following p1.
2092         //
2093         if (c1==0x0D && c2==0x0A) {
2094           setAppliedRule(p2, "WB3   CR x LF");
2095           continue;
2096         }
2097 
2098         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2099             setAppliedRule(p2, "WB3a  Break before and after newlines (including CR and LF)");
2100             break;
2101         }
2102         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2103             setAppliedRule(p2, "WB3a  Break before and after newlines (including CR and LF)");
2104             break;
2105         }
2106 
2107         //              Not ignoring extend chars, so peek into input text to
2108         //              get the potential ZWJ, the character immediately preceding c2.
2109         //              Sloppy UChar32 indexing: p2-1 may reference trail half
2110         //              but char32At will get the full code point.
2111         if (fZWJSet->contains(fText->char32At(p2 - 1)) && fExtendedPictSet->contains(c2)){
2112             setAppliedRule(p2, "WB3c  ZWJ x Extended_Pictographic");
2113             continue;
2114         }
2115 
2116         if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
2117             setAppliedRule(p2, "WB3d  Keep horizontal whitespace together.");
2118             continue;
2119         }
2120 
2121         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2122             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2123             setAppliedRule(p2, "WB4   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)");
2124             continue;
2125         }
2126 
2127         if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
2128              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2129              (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2130             setAppliedRule(p2,
2131                            "WB6   (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter _Letter)");
2132             continue;
2133         }
2134 
2135         if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2136             (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2137             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2138             setAppliedRule(p2,
2139                            "WB7   (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)");
2140             continue;
2141         }
2142 
2143         if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2144             setAppliedRule(p2, "WB7a  Hebrew_Letter x Single_Quote");
2145             continue;
2146         }
2147 
2148           if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2149             setAppliedRule(p2, "WB7b  Hebrew_Letter x Double_Quote Hebrew_Letter");
2150             continue;
2151         }
2152 
2153         if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2154             setAppliedRule(p2, "WB7c  Hebrew_Letter Double_Quote x Hebrew_Letter");
2155             continue;
2156         }
2157 
2158         if (fNumericSet->contains(c1) &&
2159             fNumericSet->contains(c2)) {
2160             setAppliedRule(p2, "WB8   Numeric x Numeric");
2161             continue;
2162         }
2163 
2164         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2165             fNumericSet->contains(c2)) {
2166             setAppliedRule(p2, "WB9   (ALetter | Hebrew_Letter) x Numeric");
2167             continue;
2168         }
2169 
2170         if (fNumericSet->contains(c1) &&
2171             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2172             setAppliedRule(p2, "WB10   Numeric x (ALetter | Hebrew_Letter)");
2173             continue;
2174         }
2175 
2176           if (fNumericSet->contains(c0) &&
2177             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
2178             fNumericSet->contains(c2)) {
2179             setAppliedRule(p2, "WB11  Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric");
2180             continue;
2181         }
2182 
2183         if (fNumericSet->contains(c1) &&
2184             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
2185             fNumericSet->contains(c3)) {
2186             setAppliedRule(p2, "WB12  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric");
2187             continue;
2188         }
2189 
2190         //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
2191         //                  all Katakana are handled by the dictionary breaker.
2192         if (fKatakanaSet->contains(c1) &&
2193             fKatakanaSet->contains(c2))  {
2194             setAppliedRule(p2, "WB13  Katakana x Katakana");
2195             continue;
2196         }
2197 
2198         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2199              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2200              fExtendNumLetSet->contains(c2)) {
2201             setAppliedRule(p2,
2202                            "WB13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet");
2203             continue;
2204         }
2205 
2206         if (fExtendNumLetSet->contains(c1) &&
2207                 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2208                  fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
2209             setAppliedRule(p2, "WB13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)");
2210             continue;
2211         }
2212 
2213         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2214             setAppliedRule(p2, "WB15 - WB17   Group pairs of Regional Indicators.");
2215             break;
2216         }
2217         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2218             setAppliedRule(p2, "WB15 - WB17   Group pairs of Regional Indicators.");
2219             continue;
2220         }
2221 
2222         setAppliedRule(p2, "WB999");
2223         break;
2224     }
2225 
2226     breakPos = p2;
2227     return breakPos;
2228 }
2229 
2230 
charClasses()2231 UVector  *RBBIWordMonkey::charClasses() {
2232     return fSets;
2233 }
2234 
~RBBIWordMonkey()2235 RBBIWordMonkey::~RBBIWordMonkey() {
2236     delete fSets;
2237     delete fCRSet;
2238     delete fLFSet;
2239     delete fNewlineSet;
2240     delete fKatakanaSet;
2241     delete fHebrew_LetterSet;
2242     delete fALetterSet;
2243     delete fSingle_QuoteSet;
2244     delete fDouble_QuoteSet;
2245     delete fMidNumLetSet;
2246     delete fMidLetterSet;
2247     delete fMidNumSet;
2248     delete fNumericSet;
2249     delete fFormatSet;
2250     delete fExtendSet;
2251     delete fExtendNumLetSet;
2252     delete fWSegSpaceSet;
2253     delete fRegionalIndicatorSet;
2254     delete fDictionarySet;
2255     delete fOtherSet;
2256     delete fZWJSet;
2257     delete fExtendedPictSet;
2258 }
2259 
2260 
2261 
2262 
2263 //------------------------------------------------------------------------------------------
2264 //
2265 //   class RBBISentMonkey      Sentence Break specific implementation
2266 //                             of RBBIMonkeyKind.
2267 //
2268 //------------------------------------------------------------------------------------------
2269 class RBBISentMonkey: public RBBIMonkeyKind {
2270 public:
2271     RBBISentMonkey();
2272     virtual          ~RBBISentMonkey();
2273     virtual  UVector *charClasses() override;
2274     virtual  void     setText(const UnicodeString &s) override;
2275     virtual int32_t   next(int32_t i) override;
2276 private:
2277     int               moveBack(int posFrom);
2278     int               moveForward(int posFrom);
2279     UChar32           cAt(int pos);
2280 
2281     UVector      *fSets;
2282 
2283     UnicodeSet  *fSepSet;
2284     UnicodeSet  *fFormatSet;
2285     UnicodeSet  *fSpSet;
2286     UnicodeSet  *fLowerSet;
2287     UnicodeSet  *fUpperSet;
2288     UnicodeSet  *fOLetterSet;
2289     UnicodeSet  *fNumericSet;
2290     UnicodeSet  *fATermSet;
2291     UnicodeSet  *fSContinueSet;
2292     UnicodeSet  *fSTermSet;
2293     UnicodeSet  *fCloseSet;
2294     UnicodeSet  *fOtherSet;
2295     UnicodeSet  *fExtendSet;
2296 
2297     const UnicodeString  *fText;
2298 };
2299 
RBBISentMonkey()2300 RBBISentMonkey::RBBISentMonkey()
2301 {
2302     UErrorCode  status = U_ZERO_ERROR;
2303 
2304     fSets            = new UVector(status);
2305 
2306     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2307     //                       set and made into character classes of their own.  For the monkey impl,
2308     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2309     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2310     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2311     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2312     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2313     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2314     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2315     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2316     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2317     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2318     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2319     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2320     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2321     fOtherSet        = new UnicodeSet();
2322 
2323     if(U_FAILURE(status)) {
2324       deferredStatus = status;
2325       return;
2326     }
2327 
2328     fOtherSet->complement();
2329     fOtherSet->removeAll(*fSepSet);
2330     fOtherSet->removeAll(*fFormatSet);
2331     fOtherSet->removeAll(*fSpSet);
2332     fOtherSet->removeAll(*fLowerSet);
2333     fOtherSet->removeAll(*fUpperSet);
2334     fOtherSet->removeAll(*fOLetterSet);
2335     fOtherSet->removeAll(*fNumericSet);
2336     fOtherSet->removeAll(*fATermSet);
2337     fOtherSet->removeAll(*fSContinueSet);
2338     fOtherSet->removeAll(*fSTermSet);
2339     fOtherSet->removeAll(*fCloseSet);
2340     fOtherSet->removeAll(*fExtendSet);
2341 
2342     fSets->addElement(fSepSet, status); classNames.push_back("Sep");
2343     fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2344     fSets->addElement(fSpSet, status); classNames.push_back("Sp");
2345     fSets->addElement(fLowerSet, status); classNames.push_back("Lower");
2346     fSets->addElement(fUpperSet, status); classNames.push_back("Upper");
2347     fSets->addElement(fOLetterSet, status); classNames.push_back("OLetter");
2348     fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
2349     fSets->addElement(fATermSet, status); classNames.push_back("ATerm");
2350     fSets->addElement(fSContinueSet, status); classNames.push_back("SContinue");
2351     fSets->addElement(fSTermSet, status); classNames.push_back("STerm");
2352     fSets->addElement(fCloseSet, status); classNames.push_back("Close");
2353     fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2354     fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2355 
2356     if (U_FAILURE(status)) {
2357         deferredStatus = status;
2358     }
2359 }
2360 
2361 
2362 
setText(const UnicodeString & s)2363 void RBBISentMonkey::setText(const UnicodeString &s) {
2364     fText       = &s;
2365     prepareAppliedRules(s.length());
2366 }
2367 
charClasses()2368 UVector  *RBBISentMonkey::charClasses() {
2369     return fSets;
2370 }
2371 
2372 //  moveBack()   Find the "significant" code point preceding the index i.
2373 //               Skips over ($Extend | $Format)* .
2374 //
moveBack(int i)2375 int RBBISentMonkey::moveBack(int i) {
2376     if (i <= 0) {
2377         return -1;
2378     }
2379     UChar32   c;
2380     int32_t   j = i;
2381     do {
2382         j = fText->moveIndex32(j, -1);
2383         c = fText->char32At(j);
2384     }
2385     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2386     return j;
2387 
2388  }
2389 
2390 
moveForward(int i)2391 int RBBISentMonkey::moveForward(int i) {
2392     if (i>=fText->length()) {
2393         return fText->length();
2394     }
2395     UChar32   c;
2396     int32_t   j = i;
2397     do {
2398         j = fText->moveIndex32(j, 1);
2399         c = cAt(j);
2400     }
2401     while (fFormatSet->contains(c) || fExtendSet->contains(c));
2402     return j;
2403 }
2404 
cAt(int pos)2405 UChar32 RBBISentMonkey::cAt(int pos) {
2406     if (pos<0 || pos>=fText->length()) {
2407         return -1;
2408     } else {
2409         return fText->char32At(pos);
2410     }
2411 }
2412 
next(int32_t prevPos)2413 int32_t RBBISentMonkey::next(int32_t prevPos) {
2414     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2415                               //   break position being tested.  The candidate break
2416                               //   location is before p2.
2417 
2418     int     breakPos = -1;
2419 
2420     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2421     UChar32 c;
2422 
2423     if (U_FAILURE(deferredStatus)) {
2424         return -1;
2425     }
2426 
2427     // Prev break at end of string.  return DONE.
2428     if (prevPos >= fText->length()) {
2429         return -1;
2430     }
2431     p0 = p1 = p2 = p3 = prevPos;
2432     c3 =  fText->char32At(prevPos);
2433     c0 = c1 = c2 = 0;
2434     (void)p0;     // Suppress set but not used warning.
2435 
2436     // Loop runs once per "significant" character position in the input text.
2437     for (;;) {
2438         // Move all of the positions forward in the input string.
2439         p0 = p1;  c0 = c1;
2440         p1 = p2;  c1 = c2;
2441         p2 = p3;  c2 = c3;
2442 
2443         // Advance p3 by    X(Extend | Format)*   Rule 4
2444         p3 = moveForward(p3);
2445         c3 = cAt(p3);
2446 
2447         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2448             setAppliedRule(p2, "SB3   CR x LF");
2449             continue;
2450         }
2451 
2452         if (fSepSet->contains(c1)) {
2453             p2 = p1+1;   // Separators don't combine with Extend or Format.
2454 
2455             setAppliedRule(p2, "SB4   Sep  <break>");
2456             break;
2457         }
2458 
2459         if (p2 >= fText->length()) {
2460             // Reached end of string.  Always a break position.
2461             setAppliedRule(p2, "SB4   Sep  <break>");
2462             break;
2463         }
2464 
2465         if (p2 == prevPos) {
2466             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2467             setAppliedRule(p2, "SB4   Sep  <break>");
2468             continue;
2469         }
2470 
2471         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
2472             setAppliedRule(p2, "SB6   ATerm x Numeric");
2473             continue;
2474         }
2475 
2476           if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2477                 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2478             setAppliedRule(p2, "SB7   (Upper | Lower) ATerm  x  Uppper");
2479             continue;
2480         }
2481 
2482         //           Note:  STerm | ATerm are added to the negated part of the expression by a
2483         //                  note to the Unicode 5.0 documents.
2484         int p8 = p1;
2485         while (fSpSet->contains(cAt(p8))) {
2486             p8 = moveBack(p8);
2487         }
2488         while (fCloseSet->contains(cAt(p8))) {
2489             p8 = moveBack(p8);
2490         }
2491         if (fATermSet->contains(cAt(p8))) {
2492             p8=p2;
2493             for (;;) {
2494                 c = cAt(p8);
2495                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2496                     fLowerSet->contains(c) || fSepSet->contains(c) ||
2497                     fATermSet->contains(c) || fSTermSet->contains(c))  {
2498 
2499                     setAppliedRule(p2,
2500                                    "SB8   ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2501                     break;
2502                 }
2503                 p8 = moveForward(p8);
2504             }
2505             if (fLowerSet->contains(cAt(p8))) {
2506 
2507                 setAppliedRule(p2,
2508                                "SB8   ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2509                 continue;
2510             }
2511         }
2512 
2513         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2514             p8 = p1;
2515             while (fSpSet->contains(cAt(p8))) {
2516                 p8 = moveBack(p8);
2517             }
2518             while (fCloseSet->contains(cAt(p8))) {
2519                 p8 = moveBack(p8);
2520             }
2521             c = cAt(p8);
2522             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2523                 setAppliedRule(p2, "SB8a  (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm)");
2524                 continue;
2525             }
2526         }
2527 
2528         int p9 = p1;
2529         while (fCloseSet->contains(cAt(p9))) {
2530             p9 = moveBack(p9);
2531         }
2532         c = cAt(p9);
2533         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2534             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2535 
2536                 setAppliedRule(p2, "SB9  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)");
2537                 continue;
2538             }
2539         }
2540 
2541         int p10 = p1;
2542         while (fSpSet->contains(cAt(p10))) {
2543             p10 = moveBack(p10);
2544         }
2545         while (fCloseSet->contains(cAt(p10))) {
2546             p10 = moveBack(p10);
2547         }
2548         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2549             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2550                 setAppliedRule(p2, "SB10  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)");
2551                 continue;
2552             }
2553         }
2554 
2555         int p11 = p1;
2556         if (fSepSet->contains(cAt(p11))) {
2557             p11 = moveBack(p11);
2558         }
2559         while (fSpSet->contains(cAt(p11))) {
2560             p11 = moveBack(p11);
2561         }
2562         while (fCloseSet->contains(cAt(p11))) {
2563             p11 = moveBack(p11);
2564         }
2565         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2566           setAppliedRule(p2, "SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>");
2567             break;
2568         }
2569 
2570         setAppliedRule(p2, "SB12  Any x Any");
2571         continue;
2572     }
2573 
2574     breakPos = p2;
2575     return breakPos;
2576 }
2577 
~RBBISentMonkey()2578 RBBISentMonkey::~RBBISentMonkey() {
2579     delete fSets;
2580     delete fSepSet;
2581     delete fFormatSet;
2582     delete fSpSet;
2583     delete fLowerSet;
2584     delete fUpperSet;
2585     delete fOLetterSet;
2586     delete fNumericSet;
2587     delete fATermSet;
2588     delete fSContinueSet;
2589     delete fSTermSet;
2590     delete fCloseSet;
2591     delete fOtherSet;
2592     delete fExtendSet;
2593 }
2594 
2595 
2596 
2597 //-------------------------------------------------------------------------------------------
2598 //
2599 //  RBBILineMonkey
2600 //
2601 //-------------------------------------------------------------------------------------------
2602 
2603 class RBBILineMonkey: public RBBIMonkeyKind {
2604 public:
2605     RBBILineMonkey();
2606     virtual          ~RBBILineMonkey();
2607     virtual  UVector *charClasses() override;
2608     virtual  void     setText(const UnicodeString &s) override;
2609     virtual  int32_t  next(int32_t i) override;
2610     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2611 private:
2612     UVector      *fSets;
2613 
2614     UnicodeSet  *fBK;
2615     UnicodeSet  *fCR;
2616     UnicodeSet  *fLF;
2617     UnicodeSet  *fCM;
2618     UnicodeSet  *fNL;
2619     UnicodeSet  *fSG;
2620     UnicodeSet  *fWJ;
2621     UnicodeSet  *fZW;
2622     UnicodeSet  *fGL;
2623     UnicodeSet  *fCB;
2624     UnicodeSet  *fSP;
2625     UnicodeSet  *fB2;
2626     UnicodeSet  *fBA;
2627     UnicodeSet  *fBB;
2628     UnicodeSet  *fHH;
2629     UnicodeSet  *fHY;
2630     UnicodeSet  *fH2;
2631     UnicodeSet  *fH3;
2632     UnicodeSet  *fCL;
2633     UnicodeSet  *fCP;
2634     UnicodeSet  *fEX;
2635     UnicodeSet  *fIN;
2636     UnicodeSet  *fJL;
2637     UnicodeSet  *fJV;
2638     UnicodeSet  *fJT;
2639     UnicodeSet  *fNS;
2640     UnicodeSet  *fOP;
2641     UnicodeSet  *fQU;
2642     UnicodeSet  *fIS;
2643     UnicodeSet  *fNU;
2644     UnicodeSet  *fPO;
2645     UnicodeSet  *fPR;
2646     UnicodeSet  *fSY;
2647     UnicodeSet  *fAI;
2648     UnicodeSet  *fAL;
2649     UnicodeSet  *fCJ;
2650     UnicodeSet  *fHL;
2651     UnicodeSet  *fID;
2652     UnicodeSet  *fRI;
2653     UnicodeSet  *fXX;
2654     UnicodeSet  *fEB;
2655     UnicodeSet  *fEM;
2656     UnicodeSet  *fZWJ;
2657     UnicodeSet  *fOP30;
2658     UnicodeSet  *fCP30;
2659     UnicodeSet  *fExtPictUnassigned;
2660 
2661     BreakIterator        *fCharBI;
2662     const UnicodeString  *fText;
2663     RegexMatcher         *fNumberMatcher;
2664 };
2665 
RBBILineMonkey()2666 RBBILineMonkey::RBBILineMonkey() :
2667     RBBIMonkeyKind(),
2668     fSets(nullptr),
2669 
2670     fCharBI(nullptr),
2671     fText(nullptr),
2672     fNumberMatcher(nullptr)
2673 
2674 {
2675     if (U_FAILURE(deferredStatus)) {
2676         return;
2677     }
2678 
2679     UErrorCode  status = U_ZERO_ERROR;
2680 
2681     fSets  = new UVector(status);
2682 
2683     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2684     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2685     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2686     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2687     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2688     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2689     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2690     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2691     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2692     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2693     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2694     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2695     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2696     fHH    = new UnicodeSet();
2697     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2698     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2699     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2700     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2701     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2702     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2703     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2704     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2705     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2706     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2707     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2708     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2709     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2710     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2711     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2712     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2713     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2714     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2715     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2716     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2717     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2718     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2719     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2720     fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2721     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2722     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2723     fEB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status);
2724     fEM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
2725     fZWJ   = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
2726     fOP30  = new UnicodeSet(u"[\\p{Line_break=OP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2727     fCP30  = new UnicodeSet(u"[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2728     fExtPictUnassigned = new UnicodeSet(u"[\\p{Extended_Pictographic}&\\p{Cn}]", status);
2729 
2730     if (U_FAILURE(status)) {
2731         deferredStatus = status;
2732         return;
2733     }
2734 
2735     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
2736     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
2737     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
2738 
2739     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
2740     fCM->addAll(*fZWJ);    // ZWJ behaves as a CM.
2741 
2742     fHH->add(u'\u2010');   // Hyphen, '‐'
2743 
2744     // Sets and names.
2745     fSets->addElement(fBK, status); classNames.push_back("fBK");
2746     fSets->addElement(fCR, status); classNames.push_back("fCR");
2747     fSets->addElement(fLF, status); classNames.push_back("fLF");
2748     fSets->addElement(fCM, status); classNames.push_back("fCM");
2749     fSets->addElement(fNL, status); classNames.push_back("fNL");
2750     fSets->addElement(fWJ, status); classNames.push_back("fWJ");
2751     fSets->addElement(fZW, status); classNames.push_back("fZW");
2752     fSets->addElement(fGL, status); classNames.push_back("fGL");
2753     fSets->addElement(fCB, status); classNames.push_back("fCB");
2754     fSets->addElement(fSP, status); classNames.push_back("fSP");
2755     fSets->addElement(fB2, status); classNames.push_back("fB2");
2756     fSets->addElement(fBA, status); classNames.push_back("fBA");
2757     fSets->addElement(fBB, status); classNames.push_back("fBB");
2758     fSets->addElement(fHY, status); classNames.push_back("fHY");
2759     fSets->addElement(fH2, status); classNames.push_back("fH2");
2760     fSets->addElement(fH3, status); classNames.push_back("fH3");
2761     fSets->addElement(fCL, status); classNames.push_back("fCL");
2762     fSets->addElement(fCP, status); classNames.push_back("fCP");
2763     fSets->addElement(fEX, status); classNames.push_back("fEX");
2764     fSets->addElement(fIN, status); classNames.push_back("fIN");
2765     fSets->addElement(fJL, status); classNames.push_back("fJL");
2766     fSets->addElement(fJT, status); classNames.push_back("fJT");
2767     fSets->addElement(fJV, status); classNames.push_back("fJV");
2768     fSets->addElement(fNS, status); classNames.push_back("fNS");
2769     fSets->addElement(fOP, status); classNames.push_back("fOP");
2770     fSets->addElement(fQU, status); classNames.push_back("fQU");
2771     fSets->addElement(fIS, status); classNames.push_back("fIS");
2772     fSets->addElement(fNU, status); classNames.push_back("fNU");
2773     fSets->addElement(fPO, status); classNames.push_back("fPO");
2774     fSets->addElement(fPR, status); classNames.push_back("fPR");
2775     fSets->addElement(fSY, status); classNames.push_back("fSY");
2776     fSets->addElement(fAI, status); classNames.push_back("fAI");
2777     fSets->addElement(fAL, status); classNames.push_back("fAL");
2778     fSets->addElement(fHL, status); classNames.push_back("fHL");
2779     fSets->addElement(fID, status); classNames.push_back("fID");
2780     fSets->addElement(fRI, status); classNames.push_back("fRI");
2781     fSets->addElement(fSG, status); classNames.push_back("fSG");
2782     fSets->addElement(fEB, status); classNames.push_back("fEB");
2783     fSets->addElement(fEM, status); classNames.push_back("fEM");
2784     fSets->addElement(fZWJ, status); classNames.push_back("fZWJ");
2785     // TODO: fOP30 & fCP30 overlap with plain fOP. Probably OK, but fOP/CP chars will be over-represented.
2786     fSets->addElement(fOP30, status); classNames.push_back("fOP30");
2787     fSets->addElement(fCP30, status); classNames.push_back("fCP30");
2788     fSets->addElement(fExtPictUnassigned, status); classNames.push_back("fExtPictUnassigned");
2789 
2790     const char *rules =
2791             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
2792             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
2793             "((\\p{Line_Break=IS})(\\p{Line_Break=CM}|\\u200d)*)?"
2794             "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
2795             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
2796             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
2797             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
2798 
2799     fNumberMatcher = new RegexMatcher(
2800         UnicodeString(rules, -1, US_INV), 0, status);
2801 
2802     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2803 
2804     if (U_FAILURE(status)) {
2805         deferredStatus = status;
2806     }
2807 
2808 }
2809 
2810 
setText(const UnicodeString & s)2811 void RBBILineMonkey::setText(const UnicodeString &s) {
2812     fText       = &s;
2813     fCharBI->setText(s);
2814     prepareAppliedRules(s.length());
2815     fNumberMatcher->reset(s);
2816 }
2817 
2818 //
2819 //  rule9Adjust
2820 //     Line Break TR rules 9 and 10 implementation.
2821 //     This deals with combining marks and other sequences that
2822 //     that must be treated as if they were something other than what they actually are.
2823 //
2824 //     This is factored out into a separate function because it must be applied twice for
2825 //     each potential break, once to the chars before the position being checked, then
2826 //     again to the text following the possible break.
2827 //
rule9Adjust(int32_t pos,UChar32 * posChar,int32_t * nextPos,UChar32 * nextChar)2828 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2829     if (pos == -1) {
2830         // Invalid initial position.  Happens during the warmup iteration of the
2831         //   main loop in next().
2832         return;
2833     }
2834 
2835     int32_t  nPos = *nextPos;
2836 
2837     // LB 9  Keep combining sequences together.
2838     // advance over any CM class chars.  Note that Line Break CM is different
2839     // from the normal Grapheme Extend property.
2840     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2841           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2842         for (;;) {
2843             *nextChar = fText->char32At(nPos);
2844             if (!fCM->contains(*nextChar)) {
2845                 break;
2846             }
2847             nPos = fText->moveIndex32(nPos, 1);
2848         }
2849     }
2850 
2851 
2852     // LB 9 Treat X CM* as if it were x.
2853     //       No explicit action required.
2854 
2855     // LB 10  Treat any remaining combining mark as AL
2856     if (fCM->contains(*posChar)) {
2857         *posChar = u'A';
2858     }
2859 
2860     // Push the updated nextPos and nextChar back to our caller.
2861     // This only makes a difference if posChar got bigger by consuming a
2862     // combining sequence.
2863     *nextPos  = nPos;
2864     *nextChar = fText->char32At(nPos);
2865 }
2866 
2867 
2868 
next(int32_t startPos)2869 int32_t RBBILineMonkey::next(int32_t startPos) {
2870     UErrorCode status = U_ZERO_ERROR;
2871     int32_t    pos;       //  Index of the char following a potential break position
2872     UChar32    thisChar;  //  Character at above position "pos"
2873 
2874     int32_t    prevPos;   //  Index of the char preceding a potential break position
2875     UChar32    prevChar;  //  Character at above position.  Note that prevChar
2876                           //   and thisChar may not be adjacent because combining
2877                           //   characters between them will be ignored.
2878 
2879     int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
2880     UChar32    prevCharX2;
2881 
2882     int32_t    nextPos;   //  Index of the next character following pos.
2883                           //     Usually skips over combining marks.
2884     int32_t    nextCPPos; //  Index of the code point following "pos."
2885                           //     May point to a combining mark.
2886     int32_t    tPos;      //  temp value.
2887     UChar32    c;
2888 
2889     if (U_FAILURE(deferredStatus)) {
2890         return -1;
2891     }
2892 
2893     if (startPos >= fText->length()) {
2894         return -1;
2895     }
2896 
2897 
2898     // Initial values for loop.  Loop will run the first time without finding breaks,
2899     //                           while the invalid values shift out and the "this" and
2900     //                           "prev" positions are filled in with good values.
2901     pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
2902     thisChar = prevChar  = prevCharX2 = 0;
2903     nextPos  = nextCPPos = startPos;
2904 
2905 
2906     // Loop runs once per position in the test text, until a break position
2907     //  is found.
2908     for (;;) {
2909         prevPosX2 = prevPos;
2910         prevCharX2 = prevChar;
2911 
2912         prevPos   = pos;
2913         prevChar  = thisChar;
2914 
2915         pos       = nextPos;
2916         thisChar  = fText->char32At(pos);
2917 
2918         nextCPPos = fText->moveIndex32(pos, 1);
2919         nextPos   = nextCPPos;
2920 
2921 
2922         if (pos >= fText->length()) {
2923             setAppliedRule(pos, "LB2 - Break at end of text.");
2924             break;
2925         }
2926 
2927 
2928         //             We do this one out-of-order because the adjustment does not change anything
2929         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
2930         //             be applied.
2931         rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
2932         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
2933         c = fText->char32At(nextPos);
2934         rule9Adjust(pos, &thisChar, &nextPos, &c);
2935 
2936         // If the loop is still warming up - if we haven't shifted the initial
2937         //   -1 positions out of prevPos yet - loop back to advance the
2938         //    position in the input without any further looking for breaks.
2939         if (prevPos == -1) {
2940           setAppliedRule(pos, "LB 9 - adjust for combining sequences.");
2941             continue;
2942         }
2943 
2944 
2945         if (fBK->contains(prevChar)) {
2946             setAppliedRule(pos, "LB 4  Always break after hard line breaks");
2947             break;
2948         }
2949 
2950 
2951         if (prevChar == 0x0d && thisChar == 0x0a) {
2952             setAppliedRule(pos, "LB 5  Break after CR, LF, NL, but not inside CR LF");
2953             continue;
2954         }
2955         if (prevChar == 0x0d ||
2956             prevChar == 0x0a ||
2957             prevChar == 0x85)  {
2958             setAppliedRule(pos, "LB 5  Break after CR, LF, NL, but not inside CR LF");
2959             break;
2960         }
2961 
2962 
2963         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
2964             fBK->contains(thisChar)) {
2965             setAppliedRule(pos, "LB 6  Don't break before hard line breaks");
2966             continue;
2967         }
2968 
2969 
2970         if (fSP->contains(thisChar)) {
2971             setAppliedRule(pos, "LB 7  Don't break before spaces or zero-width space.");
2972             continue;
2973         }
2974 
2975         // !!! ??? Is this the right text for the applied rule?
2976         if (fZW->contains(thisChar)) {
2977             setAppliedRule(pos, "LB 7  Don't break before spaces or zero-width space.");
2978             continue;
2979         }
2980 
2981 
2982         //       ZW SP* ÷
2983         //       Scan backwards from prevChar for SP* ZW
2984         tPos = prevPos;
2985         while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
2986             tPos = fText->moveIndex32(tPos, -1);
2987         }
2988         if (fZW->contains(fText->char32At(tPos))) {
2989             setAppliedRule(pos, "LB 8  Break after zero width space");
2990             break;
2991         }
2992 
2993 
2994         //          Move this test up, before LB8a, because numbers can match a longer sequence that would
2995         //          also match 8a.  e.g. NU ZWJ IS PO     (ZWJ acts like CM)
2996         if (fNumberMatcher->lookingAt(prevPos, status)) {
2997             if (U_FAILURE(status)) {
2998                 setAppliedRule(pos, "LB 25 Numbers");
2999                 break;
3000             }
3001             // Matched a number.  But could have been just a single digit, which would
3002             //    not represent a "no break here" between prevChar and thisChar
3003             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
3004             if (numEndIdx > pos) {
3005                 // Number match includes at least our two chars being checked
3006                 if (numEndIdx > nextPos) {
3007                     // Number match includes additional chars.  Update pos and nextPos
3008                     //   so that next loop iteration will continue at the end of the number,
3009                     //   checking for breaks between last char in number & whatever follows.
3010                     pos = nextPos = numEndIdx;
3011                     do {
3012                         pos = fText->moveIndex32(pos, -1);
3013                         thisChar = fText->char32At(pos);
3014                     } while (fCM->contains(thisChar));
3015                 }
3016                 setAppliedRule(pos, "LB 25 Numbers");
3017                 continue;
3018             }
3019         }
3020 
3021 
3022         //       The monkey test's way of ignoring combining characters doesn't work
3023         //       for this rule. ZJ is also a CM. Need to get the actual character
3024         //       preceding "thisChar", not ignoring combining marks, possibly ZJ.
3025         {
3026             int32_t prevIdx = fText->moveIndex32(pos, -1);
3027             UChar32 prevC = fText->char32At(prevIdx);
3028             if (fZWJ->contains(prevC)) {
3029                 setAppliedRule(pos, "LB 8a ZWJ x");
3030                 continue;
3031             }
3032         }
3033 
3034 
3035         // appliedRule: "LB 9, 10"; //  Already done, at top of loop.";
3036         //
3037 
3038 
3039         //    x  WJ
3040         //    WJ  x
3041         //
3042         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3043             setAppliedRule(pos, "LB 11  Do not break before or after WORD JOINER and related characters.");
3044             continue;
3045         }
3046 
3047 
3048         if (fGL->contains(prevChar)) {
3049             setAppliedRule(pos, "LB 12  GL  x");
3050             continue;
3051         }
3052 
3053 
3054           if (!(fSP->contains(prevChar) ||
3055               fBA->contains(prevChar) ||
3056               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
3057               setAppliedRule(pos, "LB 12a  [^SP BA HY] x GL");
3058               continue;
3059         }
3060 
3061 
3062         if (fCL->contains(thisChar) ||
3063                 fCP->contains(thisChar) ||
3064                 fEX->contains(thisChar) ||
3065                 fSY->contains(thisChar)) {
3066             setAppliedRule(pos, "LB 13  Don't break before closings.");
3067             continue;
3068         }
3069 
3070 
3071         //       Scan backwards, checking for this sequence.
3072         //       The OP char could include combining marks, so we actually check for
3073         //           OP CM* SP*
3074         //       Another Twist: The Rule 9 fixes may have changed a SP CM
3075         //       sequence into a ID char, so before scanning back through spaces,
3076         //       verify that prevChar is indeed a space.  The prevChar variable
3077         //       may differ from fText[prevPos]
3078         tPos = prevPos;
3079         if (fSP->contains(prevChar)) {
3080             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3081                 tPos=fText->moveIndex32(tPos, -1);
3082             }
3083         }
3084         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3085             tPos=fText->moveIndex32(tPos, -1);
3086         }
3087         if (fOP->contains(fText->char32At(tPos))) {
3088             setAppliedRule(pos, "LB 14 Don't break after OP SP*");
3089             continue;
3090         }
3091 
3092 
3093         if (nextPos < fText->length()) {
3094             // note: UnicodeString::char32At(length) returns ffff, not distinguishable
3095             //       from a legit ffff character. So test length separately.
3096             UChar32 nextChar = fText->char32At(nextPos);
3097             if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) {
3098                 setAppliedRule(pos, "LB 14a Break before an IS that begins a number and follows a space");
3099                 break;
3100             }
3101         }
3102 
3103 
3104           if (fIS->contains(thisChar)) {
3105               setAppliedRule(pos, "LB 14b  Do not break before numeric separators, even after spaces.");
3106               continue;
3107         }
3108 
3109 
3110         if (fOP->contains(thisChar)) {
3111             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3112             int tPos = prevPos;
3113             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3114                 tPos = fText->moveIndex32(tPos, -1);
3115             }
3116             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3117                 tPos = fText->moveIndex32(tPos, -1);
3118             }
3119             if (fQU->contains(fText->char32At(tPos))) {
3120                 setAppliedRule(pos, "LB 15    QU SP* x OP");
3121                 continue;
3122             }
3123         }
3124 
3125 
3126         //    Scan backwards for SP* CM* (CL | CP)
3127         if (fNS->contains(thisChar)) {
3128             int tPos = prevPos;
3129             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3130                 tPos = fText->moveIndex32(tPos, -1);
3131             }
3132             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3133                 tPos = fText->moveIndex32(tPos, -1);
3134             }
3135             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3136                 setAppliedRule(pos, "LB 16   (CL | CP) SP* x NS");
3137                 continue;
3138             }
3139         }
3140 
3141 
3142         if (fB2->contains(thisChar)) {
3143             //  Scan backwards, checking for the B2 CM* SP* sequence.
3144             tPos = prevPos;
3145             if (fSP->contains(prevChar)) {
3146                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3147                     tPos=fText->moveIndex32(tPos, -1);
3148                 }
3149             }
3150             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3151                 tPos=fText->moveIndex32(tPos, -1);
3152             }
3153             if (fB2->contains(fText->char32At(tPos))) {
3154                 setAppliedRule(pos, "LB 17   B2 SP* x B2");
3155                 continue;
3156             }
3157         }
3158 
3159 
3160         if (fSP->contains(prevChar)) {
3161             setAppliedRule(pos, "LB 18    break after space");
3162             break;
3163         }
3164 
3165         //    x   QU
3166         //    QU  x
3167         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3168             setAppliedRule(pos, "LB 19");
3169             continue;
3170         }
3171 
3172         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3173             setAppliedRule(pos, "LB 20  Break around a CB");
3174             break;
3175         }
3176 
3177         //           Don't break between Hyphens and letters if a break precedes the hyphen.
3178         //           Formerly this was a Finnish tailoring.
3179         //           Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
3180         //           ^($HY | $HH) $AL;
3181         if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
3182                 prevPosX2 == -1) {
3183             setAppliedRule(pos, "LB 20.09");
3184             continue;
3185         }
3186 
3187         if (fBA->contains(thisChar) ||
3188             fHY->contains(thisChar) ||
3189             fNS->contains(thisChar) ||
3190             fBB->contains(prevChar) )   {
3191             setAppliedRule(pos, "LB 21");
3192             continue;
3193         }
3194 
3195         if (fHL->contains(prevCharX2) &&
3196                 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3197             setAppliedRule(pos, "LB 21a   HL (HY | BA) x");
3198             continue;
3199         }
3200 
3201         if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3202             setAppliedRule(pos, "LB 21b SY x HL");
3203             continue;
3204         }
3205 
3206         if (fIN->contains(thisChar))   {
3207             setAppliedRule(pos, "LB 22");
3208             continue;
3209         }
3210 
3211 
3212         //          (AL | HL) x NU
3213         //          NU x (AL | HL)
3214         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
3215             setAppliedRule(pos, "LB 23");
3216             continue;
3217         }
3218         if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3219             setAppliedRule(pos, "LB 23");
3220             continue;
3221         }
3222 
3223         // Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3224         //      PR x (ID | EB | EM)
3225         //     (ID | EB | EM) x PO
3226         if (fPR->contains(prevChar) &&
3227                 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar)))  {
3228             setAppliedRule(pos, "LB 23a");
3229             continue;
3230         }
3231         if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
3232                 fPO->contains(thisChar)) {
3233             setAppliedRule(pos, "LB 23a");
3234             continue;
3235         }
3236 
3237         //   Do not break between prefix and letters or ideographs.
3238         //         (PR | PO) x (AL | HL)
3239         //         (AL | HL) x (PR | PO)
3240         if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
3241                 (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3242             setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3243             continue;
3244         }
3245         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
3246                 (fPR->contains(thisChar) || fPO->contains(thisChar))) {
3247             setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3248             continue;
3249         }
3250 
3251         // appliedRule: "LB 25 numbers match"; // moved up, before LB 8a,
3252 
3253         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3254                                         fJV->contains(thisChar) ||
3255                                         fH2->contains(thisChar) ||
3256                                         fH3->contains(thisChar))) {
3257             setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3258             continue;
3259                                         }
3260 
3261         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3262             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3263             setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3264             continue;
3265         }
3266 
3267         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3268             fJT->contains(thisChar)) {
3269             setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3270             continue;
3271         }
3272 
3273         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3274             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3275             fPO->contains(thisChar)) {
3276             setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3277             continue;
3278         }
3279         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3280             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3281             setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3282             continue;
3283         }
3284 
3285 
3286 
3287         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3288             setAppliedRule(pos, "LB 28  Do not break between alphabetics (\"at\").");
3289             continue;
3290         }
3291 
3292           if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3293               setAppliedRule(pos, "LB 29  Do not break between numeric punctuation and alphabetics (\"e.g.\").");
3294               continue;
3295         }
3296 
3297         //          (AL | NU) x OP
3298         //          CP x (AL | NU)
3299         if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP30->contains(thisChar)) {
3300             setAppliedRule(pos,  "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3301             continue;
3302         }
3303         if (fCP30->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3304             setAppliedRule(pos,  "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3305             continue;
3306         }
3307 
3308         //             RI  x  RI
3309         if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3310             setAppliedRule(pos, "LB30a    RI RI  ÷  RI");
3311             break;
3312         }
3313         if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3314             // Two Regional Indicators have been paired.
3315             // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
3316             // following RI. This is a hack.
3317             thisChar = -1;
3318             setAppliedRule(pos, "LB30a    RI RI  ÷  RI");
3319             continue;
3320         }
3321 
3322         // LB30b Do not break between an emoji base (or potential emoji) and an emoji modifier.
3323         if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3324             setAppliedRule(pos, "LB30b    Emoji Base x Emoji Modifier");
3325             continue;
3326         }
3327 
3328         if (fExtPictUnassigned->contains(prevChar) && fEM->contains(thisChar)) {
3329             setAppliedRule(pos, "LB30b    [\\p{Extended_Pictographic}&\\p{Cn}] × EM");
3330             continue;
3331         }
3332 
3333         setAppliedRule(pos, "LB 31    Break everywhere else");
3334         break;
3335     }
3336 
3337     return pos;
3338 }
3339 
3340 
charClasses()3341 UVector  *RBBILineMonkey::charClasses() {
3342     return fSets;
3343 }
3344 
3345 
~RBBILineMonkey()3346 RBBILineMonkey::~RBBILineMonkey() {
3347     delete fSets;
3348 
3349     delete fBK;
3350     delete fCR;
3351     delete fLF;
3352     delete fCM;
3353     delete fNL;
3354     delete fWJ;
3355     delete fZW;
3356     delete fGL;
3357     delete fCB;
3358     delete fSP;
3359     delete fB2;
3360     delete fBA;
3361     delete fBB;
3362     delete fHH;
3363     delete fHY;
3364     delete fH2;
3365     delete fH3;
3366     delete fCL;
3367     delete fCP;
3368     delete fEX;
3369     delete fIN;
3370     delete fJL;
3371     delete fJV;
3372     delete fJT;
3373     delete fNS;
3374     delete fOP;
3375     delete fQU;
3376     delete fIS;
3377     delete fNU;
3378     delete fPO;
3379     delete fPR;
3380     delete fSY;
3381     delete fAI;
3382     delete fAL;
3383     delete fCJ;
3384     delete fHL;
3385     delete fID;
3386     delete fRI;
3387     delete fSG;
3388     delete fXX;
3389     delete fEB;
3390     delete fEM;
3391     delete fZWJ;
3392     delete fOP30;
3393     delete fCP30;
3394     delete fExtPictUnassigned;
3395 
3396     delete fCharBI;
3397     delete fNumberMatcher;
3398 }
3399 
3400 
3401 //-------------------------------------------------------------------------------------------
3402 //
3403 //   TestMonkey
3404 //
3405 //     params
3406 //       seed=nnnnn        Random number starting seed.
3407 //                         Setting the seed allows errors to be reproduced.
3408 //       loop=nnn          Looping count.  Controls running time.
3409 //                         -1:  run forever.
3410 //                          0 or greater:  run length.
3411 //
3412 //       type = char | word | line | sent | title
3413 //
3414 //  Example:
3415 //     intltest  rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3416 //
3417 //-------------------------------------------------------------------------------------------
3418 
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)3419 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3420     int32_t val = defaultVal;
3421     name.append(" *= *(-?\\d+)");
3422     UErrorCode status = U_ZERO_ERROR;
3423     RegexMatcher m(name, params, 0, status);
3424     if (m.find()) {
3425         // The param exists.  Convert the string to an int.
3426         char valString[100];
3427         int32_t paramLength = m.end(1, status) - m.start(1, status);
3428         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3429             paramLength = (int32_t)(sizeof(valString)-2);
3430         }
3431         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3432         val = strtol(valString, nullptr, 10);
3433 
3434         // Delete this parameter from the params string.
3435         m.reset();
3436         params = m.replaceFirst("", status);
3437     }
3438     U_ASSERT(U_SUCCESS(status));
3439     return val;
3440 }
3441 #endif
3442 
3443 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
testBreakBoundPreceding(RBBITest * test,UnicodeString ustr,BreakIterator * bi,int expected[],int expectedcount)3444 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3445                                     BreakIterator *bi,
3446                                     int expected[],
3447                                     int expectedcount)
3448 {
3449     int count = 0;
3450     int i = 0;
3451     int forward[50];
3452     bi->setText(ustr);
3453     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3454         forward[count] = i;
3455         if (count < expectedcount && expected[count] != i) {
3456             test->errln("%s:%d break forward test failed: expected %d but got %d",
3457                         __FILE__, __LINE__, expected[count], i);
3458             break;
3459         }
3460         count ++;
3461     }
3462     if (count != expectedcount) {
3463         printStringBreaks(ustr, expected, expectedcount);
3464         test->errln("%s:%d break forward test failed: missed %d match",
3465                     __FILE__, __LINE__, expectedcount - count);
3466         return;
3467     }
3468     // testing boundaries
3469     for (i = 1; i < expectedcount; i ++) {
3470         int j = expected[i - 1];
3471         if (!bi->isBoundary(j)) {
3472             printStringBreaks(ustr, expected, expectedcount);
3473             test->errln("%s:%d isBoundary() failed.  Expected boundary at position %d",
3474                     __FILE__, __LINE__, j);
3475             return;
3476         }
3477         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3478             if (bi->isBoundary(j)) {
3479                 printStringBreaks(ustr, expected, expectedcount);
3480                 test->errln("%s:%d isBoundary() failed.  Not expecting boundary at position %d",
3481                     __FILE__, __LINE__, j);
3482                 return;
3483             }
3484         }
3485     }
3486 
3487     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3488         count --;
3489         if (forward[count] != i) {
3490             printStringBreaks(ustr, expected, expectedcount);
3491             test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
3492                         __FILE__, __LINE__, forward[count], i);
3493             break;
3494         }
3495     }
3496     if (count != 0) {
3497         printStringBreaks(ustr, expected, expectedcount);
3498         test->errln("break test previous() failed: missed a match");
3499         return;
3500     }
3501 
3502     // testing preceding
3503     for (i = 0; i < expectedcount - 1; i ++) {
3504         // int j = expected[i] + 1;
3505         int j = ustr.moveIndex32(expected[i], 1);
3506         for (; j <= expected[i + 1]; j ++) {
3507             int32_t expectedPreceding = expected[i];
3508             int32_t actualPreceding = bi->preceding(j);
3509             if (actualPreceding != expectedPreceding) {
3510                 printStringBreaks(ustr, expected, expectedcount);
3511                 test->errln("%s:%d preceding(%d): expected %d, got %d",
3512                         __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
3513                 return;
3514             }
3515         }
3516     }
3517 }
3518 #endif
3519 
TestWordBreaks()3520 void RBBITest::TestWordBreaks()
3521 {
3522 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3523 
3524     Locale        locale("en");
3525     UErrorCode    status = U_ZERO_ERROR;
3526     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3527     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3528     // Replaced any C+J characters in a row with a random sequence of characters
3529     // of the same length to make our C+J segmentation not get in the way.
3530     static const char *strlist[] =
3531     {
3532     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3533     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3534     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3535     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3536     "\\uac00\\u3588\\u009c\\u0953\\u194b",
3537     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3538     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3539     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3540     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3541     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3542     "\\u2027\\U000e0067\\u0a47\\u00b7",
3543     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3544     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3545     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3546     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3547     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3548     "\\u0027\\u11af\\U000e0057\\u0602",
3549     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3550     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3551     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3552     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3553     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3554     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3555     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3556     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3557     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3558     "\\u18f4\\U000e0049\\u20e7\\u2027",
3559     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3560     "\\ua183\\u102d\\u0bec\\u003a",
3561     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3562     "\\u003a\\u0e57\\u0fad\\u002e",
3563     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3564     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3565     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3566     "\\u003a\\u0664\\u00b7\\u1fba",
3567     "\\u003b\\u0027\\u00b7\\u47a3",
3568     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3569     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3570     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3571     };
3572     int loop;
3573     if (U_FAILURE(status)) {
3574         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3575         return;
3576     }
3577     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3578         // printf("looping %d\n", loop);
3579         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3580         // RBBICharMonkey monkey;
3581         RBBIWordMonkey monkey;
3582 
3583         int expected[50];
3584         int expectedcount = 0;
3585 
3586         monkey.setText(ustr);
3587         int i;
3588         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3589             expected[expectedcount ++] = i;
3590         }
3591 
3592         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3593     }
3594     delete bi;
3595 #endif
3596 }
3597 
TestWordBoundary()3598 void RBBITest::TestWordBoundary()
3599 {
3600     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3601     Locale        locale("en");
3602     UErrorCode    status = U_ZERO_ERROR;
3603     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3604     LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
3605     if (U_FAILURE(status)) {
3606         errcheckln(status, "%s:%d Creation of break iterator failed %s",
3607                 __FILE__, __LINE__, u_errorName(status));
3608         return;
3609     }
3610     char16_t      str[50];
3611     static const char *strlist[] =
3612     {
3613     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3614     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3615     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3616     "\\u2027\\U000e0067\\u0a47\\u00b7",
3617     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3618     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3619     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3620     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3621     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3622     "\\u0027\\u11af\\U000e0057\\u0602",
3623     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3624     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3625     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3626     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3627     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3628     "\\U000e0065\\u302c\\u09ee\\U000e0068",
3629     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3630     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3631     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3632     "\\u58f4\\U000e0049\\u20e7\\u2027",
3633     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3634     "\\ua183\\u102d\\u0bec\\u003a",
3635     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3636     "\\u003a\\u0e57\\u0fad\\u002e",
3637     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3638     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3639     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3640     "\\u003a\\u0664\\u00b7\\u1fba",
3641     "\\u003b\\u0027\\u00b7\\u47a3",
3642     };
3643     int loop;
3644     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3645         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3646         UnicodeString ustr(str);
3647         int forward[50];
3648         int count = 0;
3649 
3650         bi->setText(ustr);
3651         int prev = -1;
3652         for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
3653             ++count;
3654             if (count >= UPRV_LENGTHOF(forward)) {
3655                 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
3656                         __FILE__, __LINE__, loop, count, boundary);
3657                 return;
3658             }
3659             forward[count] = boundary;
3660             if (boundary <= prev) {
3661                 errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
3662                         __FILE__, __LINE__, loop, prev, boundary);
3663                 break;
3664             }
3665             for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
3666                 if (bi->isBoundary(nonBoundary)) {
3667                     printStringBreaks(ustr, forward, count);
3668                     errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
3669                            __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
3670                     return;
3671                 }
3672             }
3673             if (!bi->isBoundary(boundary)) {
3674                 printStringBreaks(ustr, forward, count);
3675                 errln("%s:%d happy boundary test failed: expected %d a boundary",
3676                        __FILE__, __LINE__, boundary);
3677                 return;
3678             }
3679             prev = boundary;
3680         }
3681     }
3682 }
3683 
TestLineBreaks()3684 void RBBITest::TestLineBreaks()
3685 {
3686 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3687     Locale        locale("en");
3688     UErrorCode    status = U_ZERO_ERROR;
3689     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3690     const int32_t  STRSIZE = 50;
3691     char16_t      str[STRSIZE];
3692     static const char *strlist[] =
3693     {
3694      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3695      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3696              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3697      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3698              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3699      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3700      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3701      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3702      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3703      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3704      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3705      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3706      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3707      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3708      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3709      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3710      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3711      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3712      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3713      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3714      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3715      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3716      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3717      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3718      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3719      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3720      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3721      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3722      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3723      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3724      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3725      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3726      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3727      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3728      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3729      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3730      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3731      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3732      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3733          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3734     };
3735     int loop;
3736     TEST_ASSERT_SUCCESS(status);
3737     if (U_FAILURE(status)) {
3738         return;
3739     }
3740     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3741         // printf("looping %d\n", loop);
3742         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3743         if (t >= STRSIZE) {
3744             TEST_ASSERT(false);
3745             continue;
3746         }
3747 
3748 
3749         UnicodeString ustr(str);
3750         RBBILineMonkey monkey;
3751         if (U_FAILURE(monkey.deferredStatus)) {
3752             continue;
3753         }
3754 
3755         const int EXPECTEDSIZE = 50;
3756         int expected[EXPECTEDSIZE];
3757         int expectedcount = 0;
3758 
3759         monkey.setText(ustr);
3760 
3761         int i;
3762         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3763             if (expectedcount >= EXPECTEDSIZE) {
3764                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3765                 return;
3766             }
3767             expected[expectedcount ++] = i;
3768         }
3769 
3770         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3771     }
3772     delete bi;
3773 #endif
3774 }
3775 
TestSentBreaks()3776 void RBBITest::TestSentBreaks()
3777 {
3778 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3779     Locale        locale("en");
3780     UErrorCode    status = U_ZERO_ERROR;
3781     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3782     char16_t      str[200];
3783     static const char *strlist[] =
3784     {
3785      "Now\ris\nthe\r\ntime\n\rfor\r\r",
3786      "This\n",
3787      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3788      "\"Sentence ending with a quote.\" Bye.",
3789      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
3790      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3791      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3792      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3793      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3794      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3795      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3796              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3797              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3798              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3799      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3800              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3801              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3802              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3803              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3804              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3805     };
3806     int loop;
3807     if (U_FAILURE(status)) {
3808         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3809         return;
3810     }
3811     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3812         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3813         UnicodeString ustr(str);
3814 
3815         RBBISentMonkey monkey;
3816         if (U_FAILURE(monkey.deferredStatus)) {
3817             continue;
3818         }
3819 
3820         const int EXPECTEDSIZE = 50;
3821         int expected[EXPECTEDSIZE];
3822         int expectedcount = 0;
3823 
3824         monkey.setText(ustr);
3825 
3826         int i;
3827         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3828             if (expectedcount >= EXPECTEDSIZE) {
3829                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3830                 return;
3831             }
3832             expected[expectedcount ++] = i;
3833         }
3834 
3835         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3836     }
3837     delete bi;
3838 #endif
3839 }
3840 
TestMonkey()3841 void RBBITest::TestMonkey() {
3842 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3843 
3844     UErrorCode     status    = U_ZERO_ERROR;
3845     int32_t        loopCount = 500;
3846     int32_t        seed      = 1;
3847     UnicodeString  breakType = "all";
3848     Locale         locale("en");
3849     UBool          useUText  = false;
3850 
3851     if (quick == false) {
3852         loopCount = 10000;
3853     }
3854 
3855     if (fTestParams) {
3856         UnicodeString p(fTestParams);
3857         loopCount = getIntParam("loop", p, loopCount);
3858         seed      = getIntParam("seed", p, seed);
3859 
3860         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3861         if (m.find()) {
3862             breakType = m.group(1, status);
3863             m.reset();
3864             p = m.replaceFirst("", status);
3865         }
3866 
3867         RegexMatcher u(" *utext", p, 0, status);
3868         if (u.find()) {
3869             useUText = true;
3870             u.reset();
3871             p = u.replaceFirst("", status);
3872         }
3873 
3874 
3875         // m.reset(p);
3876         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3877             // Each option is stripped out of the option string as it is processed.
3878             // All options have been checked.  The option string should have been completely emptied..
3879             char buf[100];
3880             p.extract(buf, sizeof(buf), nullptr, status);
3881             buf[sizeof(buf)-1] = 0;
3882             errln("Unrecognized or extra parameter:  %s\n", buf);
3883             return;
3884         }
3885 
3886     }
3887 
3888     if (breakType == "char" || breakType == "all") {
3889         RBBICharMonkey  m;
3890         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3891         if (U_SUCCESS(status)) {
3892             RunMonkey(bi, m, "char", seed, loopCount, useUText);
3893             if (breakType == "all" && useUText==false) {
3894                 // Also run a quick test with UText when "all" is specified
3895                 RunMonkey(bi, m, "char", seed, loopCount, true);
3896             }
3897         }
3898         else {
3899             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3900         }
3901         delete bi;
3902     }
3903 
3904     if (breakType == "word" || breakType == "all") {
3905         logln("Word Break Monkey Test");
3906         RBBIWordMonkey  m;
3907         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
3908         if (U_SUCCESS(status)) {
3909             RunMonkey(bi, m, "word", seed, loopCount, useUText);
3910         }
3911         else {
3912             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3913         }
3914         delete bi;
3915     }
3916 
3917     if (breakType == "line" || breakType == "all") {
3918         logln("Line Break Monkey Test");
3919         RBBILineMonkey  m;
3920         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
3921         if (loopCount >= 10) {
3922             loopCount = loopCount / 5;   // Line break runs slower than the others.
3923         }
3924         if (U_SUCCESS(status)) {
3925             RunMonkey(bi, m, "line", seed, loopCount, useUText);
3926         }
3927         else {
3928             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3929         }
3930         delete bi;
3931     }
3932 
3933     if (breakType == "sent" || breakType == "all"  ) {
3934         logln("Sentence Break Monkey Test");
3935         RBBISentMonkey  m;
3936         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
3937         if (loopCount >= 10) {
3938             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
3939         }
3940         if (U_SUCCESS(status)) {
3941             RunMonkey(bi, m, "sent", seed, loopCount, useUText);
3942         }
3943         else {
3944             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3945         }
3946         delete bi;
3947     }
3948 
3949 #endif
3950 }
3951 
3952 //
3953 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
3954 //    Parameters:
3955 //       bi      - the break iterator to use
3956 //       mk      - MonkeyKind, abstraction for obtaining expected results
3957 //       name    - Name of test (char, word, etc.) for use in error messages
3958 //       seed    - Seed for starting random number generator (parameter from user)
3959 //       numIterations
3960 //
RunMonkey(BreakIterator * bi,RBBIMonkeyKind & mk,const char * name,uint32_t seed,int32_t numIterations,UBool useUText)3961 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
3962                          int32_t numIterations, UBool useUText) {
3963 
3964 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3965 
3966     const int32_t    TESTSTRINGLEN = 500;
3967     UnicodeString    testText;
3968     int32_t          numCharClasses;
3969     UVector          *chClasses;
3970     int              expectedCount = 0;
3971     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
3972     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
3973     char             reverseBreaks[TESTSTRINGLEN*2+1];
3974     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
3975     char             followingBreaks[TESTSTRINGLEN*2+1];
3976     char             precedingBreaks[TESTSTRINGLEN*2+1];
3977     int              i;
3978     int              loopCount = 0;
3979 
3980 
3981     m_seed = seed;
3982 
3983     numCharClasses = mk.charClasses()->size();
3984     chClasses      = mk.charClasses();
3985 
3986     // Check for errors that occurred during the construction of the MonkeyKind object.
3987     //  Can't report them where they occurred because errln() is a method coming from intlTest,
3988     //  and is not visible outside of RBBITest :-(
3989     if (U_FAILURE(mk.deferredStatus)) {
3990         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
3991         return;
3992     }
3993 
3994     // Verify that the character classes all have at least one member.
3995     for (i=0; i<numCharClasses; i++) {
3996         UnicodeSet *s = static_cast<UnicodeSet *>(chClasses->elementAt(i));
3997         if (s == nullptr || s->size() == 0) {
3998             errln("Character Class #%d is null or of zero size.", i);
3999             return;
4000         }
4001     }
4002 
4003     // For minimizing width of class name output.
4004     int classNameSize = mk.maxClassNameSize();
4005 
4006     while (loopCount < numIterations || numIterations == -1) {
4007         if (numIterations == -1 && loopCount % 10 == 0) {
4008             // If test is running in an infinite loop, display a periodic tic so
4009             //   we can tell that it is making progress.
4010             fprintf(stderr, ".");
4011         }
4012         // Save current random number seed, so that we can recreate the random numbers
4013         //   for this loop iteration in event of an error.
4014         seed = m_seed;
4015 
4016         // Populate a test string with data.
4017         testText.truncate(0);
4018         for (i=0; i<TESTSTRINGLEN; i++) {
4019             int32_t  aClassNum = m_rand() % numCharClasses;
4020             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4021             int32_t   charIdx = m_rand() % classSet->size();
4022             UChar32   c = classSet->charAt(charIdx);
4023             if (c < 0) {   // TODO:  deal with sets containing strings.
4024                 errln("%s:%d c < 0", __FILE__, __LINE__);
4025                 break;
4026             }
4027             // Do not assemble a supplementary character from randomly generated separate surrogates.
4028             //   (It could be a dictionary character)
4029             if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4030                 continue;
4031             }
4032 
4033             testText.append(c);
4034         }
4035 
4036         // Calculate the expected results for this test string and reset applied rules.
4037         mk.setText(testText);
4038 
4039         memset(expectedBreaks, 0, sizeof(expectedBreaks));
4040         expectedBreaks[0] = 1;
4041         int32_t breakPos = 0;
4042         expectedCount = 0;
4043         for (;;) {
4044             breakPos = mk.next(breakPos);
4045             if (breakPos == -1) {
4046                 break;
4047             }
4048             if (breakPos > testText.length()) {
4049                 errln("breakPos > testText.length()");
4050             }
4051             expectedBreaks[breakPos] = 1;
4052             expectedCount++;
4053             U_ASSERT(expectedCount<testText.length());
4054         }
4055 
4056         // Find the break positions using forward iteration
4057         memset(forwardBreaks, 0, sizeof(forwardBreaks));
4058         if (useUText) {
4059             UErrorCode status = U_ZERO_ERROR;
4060             UText *testUText = utext_openReplaceable(nullptr, &testText, &status);
4061             // testUText = utext_openUnicodeString(testUText, &testText, &status);
4062             bi->setText(testUText, status);
4063             TEST_ASSERT_SUCCESS(status);
4064             utext_close(testUText);   // The break iterator does a shallow clone of the UText
4065                                       //  This UText can be closed immediately, so long as the
4066                                       //  testText string continues to exist.
4067         } else {
4068             bi->setText(testText);
4069         }
4070 
4071         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4072             if (i < 0 || i > testText.length()) {
4073                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4074                 break;
4075             }
4076             forwardBreaks[i] = 1;
4077         }
4078 
4079         // Find the break positions using reverse iteration
4080         memset(reverseBreaks, 0, sizeof(reverseBreaks));
4081         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4082             if (i < 0 || i > testText.length()) {
4083                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4084                 break;
4085             }
4086             reverseBreaks[i] = 1;
4087         }
4088 
4089         // Find the break positions using isBoundary() tests.
4090         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4091         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4092         for (i=0; i<=testText.length(); i++) {
4093             isBoundaryBreaks[i] = bi->isBoundary(i);
4094         }
4095 
4096 
4097         // Find the break positions using the following() function.
4098         // printf(".");
4099         memset(followingBreaks, 0, sizeof(followingBreaks));
4100         int32_t   lastBreakPos = 0;
4101         followingBreaks[0] = 1;
4102         for (i=0; i<testText.length(); i++) {
4103             breakPos = bi->following(i);
4104             if (breakPos <= i ||
4105                 breakPos < lastBreakPos ||
4106                 breakPos > testText.length() ||
4107                 (breakPos > lastBreakPos && lastBreakPos > i)) {
4108                 errln("%s break monkey test: "
4109                     "Out of range value returned by BreakIterator::following().\n"
4110                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4111                          name, seed, i, breakPos, lastBreakPos);
4112                 break;
4113             }
4114             followingBreaks[breakPos] = 1;
4115             lastBreakPos = breakPos;
4116         }
4117 
4118         // Find the break positions using the preceding() function.
4119         memset(precedingBreaks, 0, sizeof(precedingBreaks));
4120         lastBreakPos = testText.length();
4121         precedingBreaks[testText.length()] = 1;
4122         for (i=testText.length(); i>0; i--) {
4123             breakPos = bi->preceding(i);
4124             if (breakPos >= i ||
4125                 breakPos > lastBreakPos ||
4126                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4127                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4128                 errln("%s break monkey test: "
4129                     "Out of range value returned by BreakIterator::preceding().\n"
4130                     "index=%d;  prev returned %d; lastBreak=%d" ,
4131                     name,  i, breakPos, lastBreakPos);
4132                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4133                     precedingBreaks[i] = 2;   // Forces an error.
4134                 }
4135             } else {
4136                 if (breakPos >= 0) {
4137                     precedingBreaks[breakPos] = 1;
4138                 }
4139                 lastBreakPos = breakPos;
4140             }
4141         }
4142 
4143         // Compare the expected and actual results.
4144         for (i=0; i<=testText.length(); i++) {
4145             const char *errorType = nullptr;
4146             const char* currentBreakData = nullptr;
4147             if  (forwardBreaks[i] != expectedBreaks[i]) {
4148                 errorType = "next()";
4149                 currentBreakData = forwardBreaks;
4150             } else if (reverseBreaks[i] != forwardBreaks[i]) {
4151                 errorType = "previous()";
4152                 currentBreakData = reverseBreaks;
4153            } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4154                 errorType = "isBoundary()";
4155                 currentBreakData = isBoundaryBreaks;
4156             } else if (followingBreaks[i] != expectedBreaks[i]) {
4157                 errorType = "following()";
4158                 currentBreakData = followingBreaks;
4159             } else if (precedingBreaks[i] != expectedBreaks[i]) {
4160                 errorType = "preceding()";
4161                 currentBreakData = precedingBreaks;
4162             }
4163 
4164             if (errorType != nullptr) {
4165                 // Format a range of the test text that includes the failure as
4166                 //  a data item that can be included in the rbbi test data file.
4167 
4168                 // Start of the range is the last point where expected and actual results
4169                 //  both agreed that there was a break position.
4170 
4171                 int startContext = i;
4172                 int32_t count = 0;
4173                 for (;;) {
4174                     if (startContext==0) { break; }
4175                     startContext --;
4176                     if (expectedBreaks[startContext] != 0) {
4177                         if (count == 2) break;
4178                         count ++;
4179                     }
4180                 }
4181 
4182                 // End of range is two expected breaks past the start position.
4183                 int endContext = i + 1;
4184                 int ci;
4185                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4186                     for (;;) {
4187                         if (endContext >= testText.length()) {break;}
4188                         if (expectedBreaks[endContext-1] != 0) {
4189                             if (count == 0) break;
4190                             count --;
4191                         }
4192                         endContext ++;
4193                     }
4194                 }
4195 
4196                 // Formatting of each line includes:
4197                 //   character code
4198                 //   reference break: '|' -> a break, '.' -> no break
4199                 //   actual break:    '|' -> a break, '.' -> no break
4200                 //   (name of character clase)
4201                 //   Unicode name of character
4202                 //   '-->' indicates location of the difference.
4203 
4204                 MONKEY_ERROR(
4205                     (expectedBreaks[i] ? "Break expected but not found" :
4206                        "Break found but not expected"),
4207                     name, i, seed);
4208 
4209                 for (ci=startContext; (ci = testText.moveIndex32(ci, 1));) {
4210                     UChar32  c;
4211                     c = testText.char32At(ci);
4212 
4213                     std::string currentLineFlag = "   ";
4214                     if (ci == i) {
4215                         currentLineFlag = "-->";  // Error position
4216                     }
4217 
4218                     // BMP or SMP character in hex
4219                     char hexCodePoint[12];
4220                     std::string format = "    \\u%04x";
4221                     if (c >= 0x10000) {
4222                         format = "\\U%08x";
4223                     }
4224                     snprintf(hexCodePoint, sizeof(hexCodePoint), format.c_str(), c);
4225 
4226                     // Get the class name and character name for the character.
4227                     char cName[200];
4228                     UErrorCode status = U_ZERO_ERROR;
4229                     u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
4230 
4231                     char buffer[200];
4232                     auto ret = snprintf(buffer, sizeof(buffer),
4233                              "%4s %3i :  %1s  %1s  %10s  %-*s  %-40s  %-40s",
4234                              currentLineFlag.c_str(),
4235                              ci,
4236                              expectedBreaks[ci] == 0 ? "." : "|",  // Reference break
4237                              currentBreakData[ci] == 0 ? "." : "|",  // Actual break
4238                              hexCodePoint,
4239                              classNameSize,
4240                              mk.classNameFromCodepoint(c).c_str(),
4241                              mk.getAppliedRule(ci).c_str(), cName);
4242                     (void)ret;
4243                     U_ASSERT(0 <= ret && ret < UPRV_LENGTHOF(buffer));
4244 
4245                     // Output the error
4246                     if (ci == i) {
4247                         errln(buffer);
4248                     } else {
4249                         infoln(buffer);
4250                     }
4251 
4252                     if (ci >= endContext) { break; }
4253                 }
4254                 break;
4255             }
4256         }
4257 
4258         loopCount++;
4259     }
4260 #endif
4261 }
4262 
4263 
4264 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
4265 //             This test checks the initial patch,
4266 //             which is to just keep it from crashing.  Correct word boundaries
4267 //             await a proper fix to the dictionary code.
4268 //
TestBug5532()4269 void RBBITest::TestBug5532()  {
4270    // Text includes a mixture of Thai and Latin.
4271    const unsigned char utf8Data[] = {
4272            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4273            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4274            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4275            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4276            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4277            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4278            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4279            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4280            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4281            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4282            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4283 
4284     UErrorCode status = U_ZERO_ERROR;
4285     UText utext=UTEXT_INITIALIZER;
4286     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4287     TEST_ASSERT_SUCCESS(status);
4288 
4289     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4290     TEST_ASSERT_SUCCESS(status);
4291     if (U_SUCCESS(status)) {
4292         bi->setText(&utext, status);
4293         TEST_ASSERT_SUCCESS(status);
4294 
4295         int32_t breakCount = 0;
4296         int32_t previousBreak = -1;
4297         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4298             // For now, just make sure that the break iterator doesn't hang.
4299             TEST_ASSERT(previousBreak < bi->current());
4300             previousBreak = bi->current();
4301         }
4302         TEST_ASSERT(breakCount > 0);
4303     }
4304     delete bi;
4305     utext_close(&utext);
4306 }
4307 
4308 
TestBug9983()4309 void RBBITest::TestBug9983()  {
4310     UnicodeString text = UnicodeString("\\u002A"  // * Other
4311                                        "\\uFF65"  //   Other
4312                                        "\\u309C"  //   Katakana
4313                                        "\\uFF9F"  //   Extend
4314                                        "\\uFF65"  //   Other
4315                                        "\\u0020"  //   Other
4316                                        "\\u0000").unescape();
4317 
4318     UErrorCode status = U_ZERO_ERROR;
4319     LocalPointer<RuleBasedBreakIterator> brkiter(dynamic_cast<RuleBasedBreakIterator *>(
4320         BreakIterator::createWordInstance(Locale::getRoot(), status)));
4321     TEST_ASSERT_SUCCESS(status);
4322     LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(dynamic_cast<RuleBasedBreakIterator *>(
4323         BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4324     TEST_ASSERT_SUCCESS(status);
4325     if (U_FAILURE(status)) {
4326         return;
4327     }
4328     int32_t offset, rstatus, iterationCount;
4329 
4330     brkiter->setText(text);
4331     brkiter->last();
4332     iterationCount = 0;
4333     while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4334         iterationCount++;
4335         rstatus = brkiter->getRuleStatus();
4336         (void)rstatus;     // Suppress set but not used warning.
4337         if (iterationCount >= 10) {
4338            break;
4339         }
4340     }
4341     TEST_ASSERT(iterationCount == 6);
4342 
4343     brkiterPOSIX->setText(text);
4344     brkiterPOSIX->last();
4345     iterationCount = 0;
4346     while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4347         iterationCount++;
4348         rstatus = brkiterPOSIX->getRuleStatus();
4349         (void)rstatus;     // Suppress set but not used warning.
4350         if (iterationCount >= 10) {
4351            break;
4352         }
4353     }
4354     TEST_ASSERT(iterationCount == 6);
4355 }
4356 
4357 // Bug 7547 - verify that building a break itereator from empty rules produces an error.
4358 //
TestBug7547()4359 void RBBITest::TestBug7547() {
4360     UnicodeString rules;
4361     UErrorCode status = U_ZERO_ERROR;
4362     UParseError parseError;
4363     RuleBasedBreakIterator breakIterator(rules, parseError, status);
4364     if (status != U_BRK_RULE_SYNTAX) {
4365         errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4366     }
4367     if (parseError.line != 1 || parseError.offset != 0) {
4368         errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4369     }
4370 }
4371 
4372 
TestBug12797()4373 void RBBITest::TestBug12797() {
4374     UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4375     UErrorCode status = U_ZERO_ERROR;
4376     UParseError parseError;
4377     RuleBasedBreakIterator bi(rules, parseError, status);
4378     if (U_FAILURE(status)) {
4379         errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4380         return;
4381     }
4382     UnicodeString text = "abc";
4383     bi.setText(text);
4384     bi.first();
4385     int32_t boundary = bi.next();
4386     if (boundary != 3) {
4387         errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4388     }
4389 }
4390 
TestBug12918()4391 void RBBITest::TestBug12918() {
4392     // This test triggers an assertion failure in dictbe.cpp
4393     const char16_t *crasherString = u"\u3325\u4a16";
4394     UErrorCode status = U_ZERO_ERROR;
4395     UBreakIterator* iter = ubrk_open(UBRK_WORD, nullptr, crasherString, -1, &status);
4396     if (U_FAILURE(status)) {
4397         dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4398         return;
4399     }
4400     ubrk_first(iter);
4401     int32_t pos = 0;
4402     int32_t lastPos = -1;
4403     while((pos = ubrk_next(iter)) != UBRK_DONE) {
4404         if (pos <= lastPos) {
4405             errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4406             break;
4407         }
4408     }
4409     ubrk_close(iter);
4410 }
4411 
TestBug12932()4412 void RBBITest::TestBug12932() {
4413     // Node Stack overflow in the RBBI rule parser caused a seg fault.
4414     UnicodeString ruleStr(
4415             "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4416             "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4417             "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4418             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4419             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4420             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4421 
4422     UErrorCode status = U_ZERO_ERROR;
4423     UParseError parseError;
4424     RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
4425     if (status != U_BRK_RULE_SYNTAX) {
4426         errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4427                 __FILE__, __LINE__, u_errorName(status));
4428     }
4429 }
4430 
4431 
// Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
//             remain undivided by ICU char, word and line break.
TestEmoji()4434 void RBBITest::TestEmoji() {
4435 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4436     UErrorCode  status = U_ZERO_ERROR;
4437 
4438     CharString testFileName;
4439     testFileName.append(IntlTest::getSourceTestData(status), status);
4440     testFileName.appendPathPart("emoji-test.txt", status);
4441     if (U_FAILURE(status)) {
4442         errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4443         return;
4444     }
4445     logln("Opening data file %s\n", testFileName.data());
4446 
4447     int    len;
4448     char16_t *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
4449     if (U_FAILURE(status) || testFile == nullptr) {
4450         errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4451         return;
4452     }
4453     UnicodeString testFileAsString(testFile, len);
4454     delete [] testFile;
4455 
4456     RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
4457     RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
4458     //           hexMatcher group(1) is a hex number, or empty string if no hex number present.
4459     int32_t lineNumber = 0;
4460 
4461     LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4462     LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4463     LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
4464     if (U_FAILURE(status)) {
4465         dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
4466         return;
4467     }
4468 
4469     while (lineMatcher.find()) {
4470         ++lineNumber;
4471         UnicodeString line = lineMatcher.group(status);
4472         hexMatcher.reset(line);
4473         UnicodeString testString;   // accumulates the emoji sequence.
4474         while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
4475             UnicodeString hex = hexMatcher.group(1, status);
4476             if (hex.length() > 8) {
4477                 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
4478                 break;
4479             }
4480             CharString hex8;
4481             hex8.appendInvariantChars(hex, status);
4482             UChar32 c = (UChar32)strtol(hex8.data(), nullptr, 16);
4483             if (c<=0x10ffff) {
4484                 testString.append(c);
4485             } else {
4486                 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4487                         __FILE__, __LINE__, lineNumber, hex8.data());
4488                 break;
4489             }
4490         }
4491 
4492         if (testString.length() > 1) {
4493             charBreaks->setText(testString);
4494             charBreaks->first();
4495             int32_t firstBreak = charBreaks->next();
4496             if (testString.length() != firstBreak) {
4497                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4498                         __FILE__, __LINE__, lineNumber, firstBreak);
4499             }
4500             wordBreaks->setText(testString);
4501             wordBreaks->first();
4502             firstBreak = wordBreaks->next();
4503             if (testString.length() != firstBreak) {
4504                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4505                         __FILE__, __LINE__, lineNumber, firstBreak);
4506             }
4507             lineBreaks->setText(testString);
4508             lineBreaks->first();
4509             firstBreak = lineBreaks->next();
4510             if (testString.length() != firstBreak) {
4511                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4512                         __FILE__, __LINE__, lineNumber, firstBreak);
4513             }
4514         }
4515     }
4516 #endif
4517 }
4518 
4519 
4520 // TestBug12519  -  Correct handling of Locales by assignment / copy / clone
4521 
TestBug12519()4522 void RBBITest::TestBug12519() {
4523     UErrorCode status = U_ZERO_ERROR;
4524     LocalPointer<RuleBasedBreakIterator> biEn(dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4525     LocalPointer<RuleBasedBreakIterator> biFr(dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createWordInstance(Locale::getFrance(), status)));
4526     if (!assertSuccess(WHERE, status)) {
4527         dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
4528         return;
4529     }
4530     assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
4531 
4532     assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
4533     assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
4534 
4535     LocalPointer<RuleBasedBreakIterator>cloneEn(biEn->clone());
4536     assertTrue(WHERE, *biEn == *cloneEn);
4537     assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
4538 
4539     LocalPointer<RuleBasedBreakIterator>cloneFr(biFr->clone());
4540     assertTrue(WHERE, *biFr == *cloneFr);
4541     assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
4542 
4543     LocalPointer<RuleBasedBreakIterator>biDe(dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createLineInstance(Locale::getGerman(), status)));
4544     UnicodeString text("Hallo Welt");
4545     biDe->setText(text);
4546     assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
4547     *biDe = *biFr;
4548     assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
4549 }
4550 
TestBug12677()4551 void RBBITest::TestBug12677() {
4552     // Check that stripping of comments from rules for getRules() is not confused by
4553     // the presence of '#' characters in the rules that do not introduce comments.
4554     UnicodeString rules(u"!!forward; \n"
4555                          "$x = [ab#];  # a set with a # literal. \n"
4556                          " # .;        # a comment that looks sort of like a rule.   \n"
4557                          " '#' '?';    # a rule with a quoted #   \n"
4558                        );
4559 
4560     UErrorCode status = U_ZERO_ERROR;
4561     UParseError pe;
4562     RuleBasedBreakIterator bi(rules, pe, status);
4563     assertSuccess(WHERE, status);
4564     UnicodeString rtRules = bi.getRules();
4565     assertEquals(WHERE, UnicodeString(u"!!forward;$x=[ab#];'#''?';"),  rtRules);
4566 }
4567 
4568 
TestTableRedundancies()4569 void RBBITest::TestTableRedundancies() {
4570     UErrorCode status = U_ZERO_ERROR;
4571 
4572     LocalPointer<RuleBasedBreakIterator> bi (
4573         dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createLineInstance(Locale::getEnglish(), status)));
4574     assertSuccess(WHERE, status);
4575     if (U_FAILURE(status)) return;
4576 
4577     RBBIDataWrapper *dw = bi->fData;
4578     const RBBIStateTable *fwtbl = dw->fForwardTable;
4579     UBool in8Bits = fwtbl->fFlags & RBBI_8BITS_ROWS;
4580     int32_t numCharClasses = dw->fHeader->fCatCount;
4581     // printf("Char Classes: %d     states: %d\n", numCharClasses, fwtbl->fNumStates);
4582 
4583     // Check for duplicate columns (character categories)
4584 
4585     std::vector<UnicodeString> columns;
4586     for (int32_t column = 0; column < numCharClasses; column++) {
4587         UnicodeString s;
4588         for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
4589             RBBIStateTableRow  *row = reinterpret_cast<RBBIStateTableRow *>(const_cast<char*>(fwtbl->fTableData + (fwtbl->fRowLen * r)));
4590             s.append(in8Bits ? row->r8.fNextState[column] : row->r16.fNextState[column]);
4591         }
4592         columns.push_back(s);
4593     }
4594     // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
4595     for (int c1=1; c1<numCharClasses; c1++) {
4596         int limit = c1 < (int)fwtbl->fDictCategoriesStart ? fwtbl->fDictCategoriesStart : numCharClasses;
4597         for (int c2 = c1+1; c2 < limit; c2++) {
4598             if (columns.at(c1) == columns.at(c2)) {
4599                 errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
4600                 goto out;
4601             }
4602         }
4603     }
4604   out:
4605 
4606     // Check for duplicate states
4607     std::vector<UnicodeString> rows;
4608     for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
4609         UnicodeString s;
4610         RBBIStateTableRow  *row = reinterpret_cast<RBBIStateTableRow *>(const_cast<char*>((fwtbl->fTableData + (fwtbl->fRowLen * r))));
4611         if (in8Bits) {
4612             s.append(row->r8.fAccepting);
4613             s.append(row->r8.fLookAhead);
4614             s.append(row->r8.fTagsIdx);
4615             for (int32_t column = 0; column < numCharClasses; column++) {
4616                 s.append(row->r8.fNextState[column]);
4617             }
4618         } else {
4619             s.append(row->r16.fAccepting);
4620             s.append(row->r16.fLookAhead);
4621             s.append(row->r16.fTagsIdx);
4622             for (int32_t column = 0; column < numCharClasses; column++) {
4623                 s.append(row->r16.fNextState[column]);
4624             }
4625         }
4626         rows.push_back(s);
4627     }
4628     for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
4629         for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
4630             if (rows.at(r1) == rows.at(r2)) {
4631                 errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
4632                 return;
4633             }
4634         }
4635     }
4636 }
4637 
4638 // Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
4639 //            even after next() has returned DONE.
4640 
TestBug13447()4641 void RBBITest::TestBug13447() {
4642     UErrorCode status = U_ZERO_ERROR;
4643     LocalPointer<RuleBasedBreakIterator> bi(
4644         dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4645     assertSuccess(WHERE, status);
4646     if (U_FAILURE(status)) return;
4647     UnicodeString data(u"1234");
4648     bi->setText(data);
4649     assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus());
4650     assertEquals(WHERE, 4, bi->next());
4651     assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4652     assertEquals(WHERE, UBRK_DONE, bi->next());
4653     assertEquals(WHERE, 4, bi->current());
4654     assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4655 }
4656 
4657 //  TestReverse exercises both the synthesized safe reverse rules and the logic
4658 //  for filling the break iterator cache when starting from random positions
4659 //  in the text.
4660 //
4661 //  It's a monkey test, working on random data, with the expected data obtained
4662 //  from forward iteration (no safe rules involved), comparing with results
4663 //  when indexing into the interior of the string (safe rules needed).
4664 
TestReverse()4665 void RBBITest::TestReverse() {
4666     UErrorCode status = U_ZERO_ERROR;
4667 
4668     TestReverse(std::unique_ptr<RuleBasedBreakIterator>(dynamic_cast<RuleBasedBreakIterator*>(
4669             BreakIterator::createCharacterInstance(Locale::getEnglish(), status))));
4670     assertSuccess(WHERE, status, true);
4671     status = U_ZERO_ERROR;
4672     TestReverse(std::unique_ptr<RuleBasedBreakIterator>(dynamic_cast<RuleBasedBreakIterator*>(
4673             BreakIterator::createWordInstance(Locale::getEnglish(), status))));
4674     assertSuccess(WHERE, status, true);
4675     status = U_ZERO_ERROR;
4676     TestReverse(std::unique_ptr<RuleBasedBreakIterator>(dynamic_cast<RuleBasedBreakIterator*>(
4677             BreakIterator::createLineInstance(Locale::getEnglish(), status))));
4678     assertSuccess(WHERE, status, true);
4679     status = U_ZERO_ERROR;
4680     TestReverse(std::unique_ptr<RuleBasedBreakIterator>(dynamic_cast<RuleBasedBreakIterator*>(
4681             BreakIterator::createSentenceInstance(Locale::getEnglish(), status))));
4682     assertSuccess(WHERE, status, true);
4683 }
4684 
TestReverse(std::unique_ptr<RuleBasedBreakIterator> bi)4685 void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
4686     if (!bi) {
4687         return;
4688     }
4689 
4690     // From the mapping trie in the break iterator's internal data, create a
4691     // vector of UnicodeStrings, one for each character category, containing
4692     // all of the code points that map to that category. Unicode planes 0 and 1 only,
4693     // to avoid an execess of unassigned code points.
4694 
4695     RBBIDataWrapper *data = bi->fData;
4696     int32_t categoryCount = data->fHeader->fCatCount;
4697     UCPTrie *trie = data->fTrie;
4698     bool use8BitsTrie = ucptrie_getValueWidth(trie) == UCPTRIE_VALUE_BITS_8;
4699     uint32_t dictBit = use8BitsTrie ? 0x0080 : 0x4000;
4700 
4701     std::vector<UnicodeString> strings(categoryCount, UnicodeString());
4702     for (int cp=0; cp<0x1fff0; ++cp) {
4703         int cat = ucptrie_get(trie, cp);
4704         cat &= ~dictBit;    // And off the dictionary bit from the category.
4705         assertTrue(WHERE, cat < categoryCount && cat >= 0);
4706         if (cat < 0 || cat >= categoryCount) return;
4707         strings[cat].append(cp);
4708     }
4709 
4710     icu_rand randomGen;
4711     const int testStringLength = 10000;
4712     UnicodeString testString;
4713 
4714     for (int i=0; i<testStringLength; ++i) {
4715         int charClass = randomGen() % categoryCount;
4716         if (strings[charClass].length() > 0) {
4717             int cp = strings[charClass].char32At(randomGen() % strings[charClass].length());
4718             testString.append(cp);
4719         }
4720     }
4721 
4722     typedef std::pair<UBool, int32_t> Result;
4723     std::vector<Result> expectedResults;
4724     bi->setText(testString);
4725     for (int i=0; i<testString.length(); ++i) {
4726         bool isboundary = bi->isBoundary(i);
4727         int  ruleStatus = bi->getRuleStatus();
4728         expectedResults.push_back(std::make_pair(isboundary, ruleStatus));
4729     }
4730 
4731     for (int i=testString.length()-1; i>=0; --i) {
4732         bi->setText(testString);   // clears the internal break cache
4733         Result expected = expectedResults[i];
4734         assertEquals(WHERE, expected.first, bi->isBoundary(i));
4735         assertEquals(WHERE, expected.second, bi->getRuleStatus());
4736     }
4737 }
4738 
4739 
// Ticket 13692 - finding word boundaries in very large numbers or words could
//                be very time consuming. When the problem was present, this test
//                would run more than fifteen minutes, which is to say, the failure was noticeable.
4743 
TestBug13692()4744 void RBBITest::TestBug13692() {
4745     UErrorCode status = U_ZERO_ERROR;
4746     LocalPointer<RuleBasedBreakIterator> bi (dynamic_cast<RuleBasedBreakIterator*>(
4747             BreakIterator::createWordInstance(Locale::getEnglish(), status)), status);
4748     if (!assertSuccess(WHERE, status, true)) {
4749         return;
4750     }
4751     constexpr int32_t LENGTH = 1000000;
4752     UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH);
4753     for (int i=0; i<20; i+=2) {
4754         longNumber.setCharAt(i, u' ');
4755     }
4756     bi->setText(longNumber);
4757     assertFalse(WHERE, bi->isBoundary(LENGTH-5));
4758     assertSuccess(WHERE, status);
4759 }
4760 
4761 
TestProperties()4762 void RBBITest::TestProperties() {
4763     UErrorCode errorCode = U_ZERO_ERROR;
4764     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4765     if (!prependSet.isEmpty()) {
4766         errln(
4767             "[:GCB=Prepend:] is not empty any more. "
4768             "Uncomment relevant lines in source/data/brkitr/char.txt and "
4769             "change this test to the opposite condition.");
4770     }
4771 }
4772 
4773 
4774 //
4775 //  TestDebug    -  A place-holder test for debugging purposes.
4776 //                  For putting in fragments of other tests that can be invoked
4777 //                  for tracing  without a lot of unwanted extra stuff happening.
4778 //
TestDebug()4779 void RBBITest::TestDebug() {
4780     UErrorCode status = U_ZERO_ERROR;
4781     LocalPointer<RuleBasedBreakIterator> bi (dynamic_cast<RuleBasedBreakIterator*>(
4782             BreakIterator::createCharacterInstance(Locale::getEnglish(), status)), status);
4783     if (!assertSuccess(WHERE, status, true)) {
4784         return;
4785     }
4786     const UnicodeString &rules = bi->getRules();
4787     UParseError pe;
4788     LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
4789     assertSuccess(WHERE, status);
4790 }
4791 
4792 
4793 //
4794 //  TestDebugRules   A stub test for use in debugging rule compilation problems.
4795 //                   Can be freely altered as needed or convenient.
//                   Leave disabled - #ifdef'ed out - when not actively debugging. The rule source
4797 //                   data files may not be available in all environments.
4798 //                   Any permanent test cases should be moved to rbbitst.txt
4799 //                   (see Bug 20303 in that file, for example), or to another test function in this file.
4800 //
TestDebugRules()4801 void RBBITest::TestDebugRules() {
4802 #if 0
4803     const char16_t *rules = u""
4804         "!!quoted_literals_only; \n"
4805         "!!chain; \n"
4806         "!!lookAheadHardBreak; \n"
4807         " \n"
4808         // "[a] / ; \n"
4809         "[a] [b] / [c] [d]; \n"
4810         "[a] [b] / [c] [d] {100}; \n"
4811         "[x] [a] [b] / [c] [d] {100}; \n"
4812         "[a] [b] [c] / [d] {100}; \n"
4813         //" [c] [d] / [e] [f]; \n"
4814         //"[a] [b] / [c]; \n"
4815         ;
4816 
4817     UErrorCode status = U_ZERO_ERROR;
4818     CharString path(pathToDataDirectory(), status);
4819     path.appendPathPart("brkitr", status);
4820     path.appendPathPart("rules", status);
4821     path.appendPathPart("line.txt", status);
4822     int    len;
4823     std::unique_ptr<char16_t []> testFile(ReadAndConvertFile(path.data(), len, "UTF-8", status));
4824     if (!assertSuccess(WHERE, status)) {
4825         return;
4826     }
4827 
4828     UParseError pe;
4829     // rules = testFile.get();
4830     RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rules, pe, status);
4831 
4832     if (!assertSuccess(WHERE, status)) {
4833         delete bi;
4834         return;
4835     }
4836     // bi->dumpTables();
4837 
4838     delete bi;
4839 #endif
4840 }
4841 
testTrieStateTable(int32_t numChar,bool expectedTrieWidthIn8Bits,bool expectedStateRowIn8Bits)4842 void RBBITest::testTrieStateTable(int32_t numChar, bool expectedTrieWidthIn8Bits, bool expectedStateRowIn8Bits) {
4843     UCPTrieValueWidth expectedTrieWidth = expectedTrieWidthIn8Bits ? UCPTRIE_VALUE_BITS_8 : UCPTRIE_VALUE_BITS_16;
4844     int32_t expectedStateRowBits = expectedStateRowIn8Bits ? RBBI_8BITS_ROWS : 0;
4845     // Text are duplicate characters from U+4E00 to U+4FFF
4846     UnicodeString text;
4847     for (char16_t c = 0x4e00; c < 0x5000; c++) {
4848         text.append(c).append(c);
4849     }
4850     // Generate rule which will caused length+4 character classes and
4851     // length+3 states
4852     UnicodeString rules(u"!!quoted_literals_only;");
4853     for (char16_t c = 0x4e00; c < 0x4e00 + numChar; c++) {
4854         rules.append(u'\'').append(c).append(c).append(u"';");
4855     }
4856     rules.append(u".;");
4857     UErrorCode status = U_ZERO_ERROR;
4858     UParseError parseError;
4859     RuleBasedBreakIterator bi(rules, parseError, status);
4860 
4861     assertEquals(WHERE, numChar + 4, bi.fData->fHeader->fCatCount);
4862     assertEquals(WHERE, numChar + 3, bi.fData->fForwardTable->fNumStates);
4863     assertEquals(WHERE, expectedTrieWidth, ucptrie_getValueWidth(bi.fData->fTrie));
4864     assertEquals(WHERE, expectedStateRowBits, bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS);
4865     assertEquals(WHERE, expectedStateRowBits, bi.fData->fReverseTable->fFlags & RBBI_8BITS_ROWS);
4866 
4867     bi.setText(text);
4868 
4869     int32_t pos;
4870     int32_t i = 0;
4871     while ((pos = bi.next()) > 0) {
4872         // The first numChar should not break between the pair
4873         if (i++ < numChar) {
4874             assertEquals(WHERE, i * 2, pos);
4875         } else {
4876             // After the first numChar next(), break on each character.
4877             assertEquals(WHERE, i + numChar, pos);
4878         }
4879     }
4880     while ((pos = bi.previous()) > 0) {
4881         // The first numChar should not break between the pair
4882         if (--i < numChar) {
4883             assertEquals(WHERE, i * 2, pos);
4884         } else {
4885             // After the first numChar next(), break on each character.
4886             assertEquals(WHERE, i + numChar, pos);
4887         }
4888     }
4889 }
4890 
Test8BitsTrieWith8BitStateTable()4891 void RBBITest::Test8BitsTrieWith8BitStateTable() {
4892     testTrieStateTable(251, true /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
4893 }
4894 
Test16BitsTrieWith8BitStateTable()4895 void RBBITest::Test16BitsTrieWith8BitStateTable() {
4896     testTrieStateTable(252, false /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
4897 }
4898 
Test16BitsTrieWith16BitStateTable()4899 void RBBITest::Test16BitsTrieWith16BitStateTable() {
4900     testTrieStateTable(253, false /* expectedTrieWidthIn8Bits */, false /* expectedStateRowIn8Bits */);
4901 }
4902 
Test8BitsTrieWith16BitStateTable()4903 void RBBITest::Test8BitsTrieWith16BitStateTable() {
4904     // Test UCPTRIE_VALUE_BITS_8 with 16 bits rows. Use a different approach to
4905     // create state table in 16 bits.
4906 
4907     // Generate 510 'a' as text
4908     UnicodeString text;
4909     for (int32_t i = 0; i < 510; i++) {
4910         text.append(u'a');
4911     }
4912 
4913     UnicodeString rules(u"!!quoted_literals_only;'");
4914     // 254 'a' in the rule will cause 256 states
4915     for (int32_t i = 0; i < 254; i++) {
4916         rules.append(u'a');
4917     }
4918     rules.append(u"';.;");
4919 
4920     UErrorCode status = U_ZERO_ERROR;
4921     UParseError parseError;
4922     LocalPointer<RuleBasedBreakIterator> bi(new RuleBasedBreakIterator(rules, parseError, status));
4923 
4924     assertEquals(WHERE, 256, bi->fData->fForwardTable->fNumStates);
4925     assertEquals(WHERE, UCPTRIE_VALUE_BITS_8, ucptrie_getValueWidth(bi->fData->fTrie));
4926     assertEquals(WHERE,
4927                  false, RBBI_8BITS_ROWS == (bi->fData->fForwardTable->fFlags & RBBI_8BITS_ROWS));
4928     bi->setText(text);
4929 
4930     // break positions:
4931     // 254, 508, 509, ... 510
4932     assertEquals("next()", 254, bi->next());
4933     int32_t i = 0;
4934     int32_t pos;
4935     while ((pos = bi->next()) > 0) {
4936         assertEquals(WHERE, 508 + i , pos);
4937         i++;
4938     }
4939     i = 0;
4940     while ((pos = bi->previous()) > 0) {
4941         i++;
4942         if (pos >= 508) {
4943             assertEquals(WHERE, 510 - i , pos);
4944         } else {
4945             assertEquals(WHERE, 254 , pos);
4946         }
4947     }
4948 }
4949 
4950 // Test that both compact (8 bit) and full sized (16 bit) rbbi tables work, and
4951 // that there are no problems with rules at the size that transitions between the two.
4952 //
4953 // A rule that matches a literal string, like 'abcdefghij', will require one state and
4954 // one character class per character in the string. So we can make a rule to tickle the
4955 // boundaries by using literal strings of various lengths.
4956 //
4957 // For both the number of states and the number of character classes, the eight bit format
4958 // only has 7 bits available, allowing for 128 values. For both, a few values are reserved,
// leaving 120 something available. This test runs the string over the range of 120 - 260
// (startLength to endLength below), which allows some margin for changes to the number of
// values reserved by the rule builder without breaking the test.
4962 
TestTable_8_16_Bits()4963 void RBBITest::TestTable_8_16_Bits() {
4964 
4965     // testStr serves as both the source of the rule string (truncated to the desired length)
4966     // and as test data to check matching behavior. A break rule consisting of the first 120
4967     // characters of testStr will match the first 120 chars of the full-length testStr.
4968     UnicodeString testStr;
4969     for (char16_t c=0x3000; c<0x3200; ++c) {
4970         testStr.append(c);
4971     }
4972 
4973     const int32_t startLength = 120;   // The shortest rule string to test.
4974     const int32_t endLength = 260;     // The longest rule string to test
4975     const int32_t increment = this->quick ? endLength - startLength : 1;
4976 
4977     for (int32_t ruleLen=startLength; ruleLen <= endLength; ruleLen += increment) {
4978         UParseError parseError;
4979         UErrorCode status = U_ZERO_ERROR;
4980 
4981         UnicodeString ruleString{u"!!quoted_literals_only; '#';"};
4982         ruleString.findAndReplace(UnicodeString(u"#"), UnicodeString(testStr, 0, ruleLen));
4983         RuleBasedBreakIterator bi(ruleString, parseError, status);
4984         if (!assertSuccess(WHERE, status)) {
4985             errln(ruleString);
4986             break;
4987         }
4988         // bi.dumpTables();
4989 
4990         // Verify that the break iterator is functioning - that the first boundary found
4991         // in testStr is at the length of the rule string.
4992         bi.setText(testStr);
4993         assertEquals(WHERE, ruleLen, bi.next());
4994 
4995         // Reverse iteration. Do a setText() first, to flush the break iterator's internal cache
4996         // of previously detected boundaries, thus forcing the engine to run the safe reverse rules.
4997         bi.setText(testStr);
4998         int32_t result = bi.preceding(ruleLen);
4999         assertEquals(WHERE, 0, result);
5000 
5001         // Verify that the range of rule lengths being tested cover the translations
5002         // from 8 to 16 bit data.
5003         bool has8BitRowData = bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS;
5004         bool has8BitsTrie = ucptrie_getValueWidth(bi.fData->fTrie) == UCPTRIE_VALUE_BITS_8;
5005 
5006         if (ruleLen == startLength) {
5007             assertEquals(WHERE, true, has8BitRowData);
5008             assertEquals(WHERE, true, has8BitsTrie);
5009         }
5010         if (ruleLen == endLength) {
5011             assertEquals(WHERE, false, has8BitRowData);
5012             assertEquals(WHERE, false, has8BitsTrie);
5013         }
5014     }
5015 }
5016 
5017 /* Test handling of a large number of look-ahead rules.
5018  * The number of rules in the test exceeds the implementation limits prior to the
5019  * improvements introduced with #13590.
5020  *
5021  * The test look-ahead rules have the form "AB / CE"; "CD / EG"; ...
5022  * The text being matched is sequential, "ABCDEFGHI..."
5023  *
5024  * The upshot is that the look-ahead rules all match on their preceding context,
5025  * and consequently must save a potential result, but then fail to match on their
5026  * trailing context, so that they don't actually cause a boundary.
5027  *
5028  * Additionally, add a ".*" rule, so there are no boundaries unless a
5029  * look-ahead hard-break rule forces one.
5030  */
TestBug13590()5031 void RBBITest::TestBug13590() {
5032     UnicodeString rules {u"!!quoted_literals_only; !!chain; .*;\n"};
5033 
5034     const int NUM_LOOKAHEAD_RULES = 50;
5035     const char16_t STARTING_CHAR = u'\u5000';
5036     char16_t firstChar;
5037     for (int ruleNum = 0; ruleNum < NUM_LOOKAHEAD_RULES; ++ruleNum) {
5038         firstChar = STARTING_CHAR + ruleNum*2;
5039         rules.append(u'\'') .append(firstChar) .append(firstChar+1) .append(u'\'')
5040              .append(u' ') .append(u'/') .append(u' ')
5041              .append(u'\'') .append(firstChar+2) .append(firstChar+4) .append(u'\'')
5042              .append(u';') .append(u'\n');
5043     }
5044 
5045     // Change the last rule added from the form "UV / WY" to "UV / WX".
5046     // Changes the rule so that it will match - all 4 chars are in ascending sequence.
5047     rules.findAndReplace(UnicodeString(firstChar+4), UnicodeString(firstChar+3));
5048 
5049     UErrorCode status = U_ZERO_ERROR;
5050     UParseError parseError;
5051     RuleBasedBreakIterator bi(rules, parseError, status);
5052     if (!assertSuccess(WHERE, status)) {
5053         errln(rules);
5054         return;
5055     }
5056     // bi.dumpTables();
5057 
5058     UnicodeString testString;
5059     for (char16_t c = STARTING_CHAR-200; c < STARTING_CHAR + NUM_LOOKAHEAD_RULES*4; ++c) {
5060         testString.append(c);
5061     }
5062     bi.setText(testString);
5063 
5064     int breaksFound = 0;
5065     while (bi.next() != UBRK_DONE) {
5066         ++breaksFound;
5067     }
5068 
5069     // Two matches are expected, one from the last rule that was explicitly modified,
5070     // and one at the end of the text.
5071     assertEquals(WHERE, 2, breaksFound);
5072 }
5073 
5074 
5075 #if U_ENABLE_TRACING
// Records filled in by the trace callbacks below; inspected by the trace tests.
// Function numbers seen by the entry, exit and data callbacks respectively.
static std::vector<int32_t> gEntryFn;
static std::vector<int32_t> gExitFn;
static std::vector<int32_t> gDataFn;
// String payloads seen by the data callback, parallel to gDataFn.
static std::vector<std::string> gData;
5080 
traceData(const void *,int32_t fnNumber,int32_t,const char *,va_list args)5081 static void U_CALLCONV traceData(
5082         const void*,
5083         int32_t fnNumber,
5084         int32_t,
5085         const char *,
5086         va_list args) {
5087     if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5088         const char* data = va_arg(args, const char*);
5089         gDataFn.push_back(fnNumber);
5090         gData.push_back(data);
5091     }
5092 }
5093 
traceEntry(const void *,int32_t fnNumber)5094 static void traceEntry(const void *, int32_t fnNumber) {
5095     if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5096         gEntryFn.push_back(fnNumber);
5097     }
5098 }
5099 
traceExit(const void *,int32_t fnNumber,const char *,va_list)5100 static void traceExit(const void *, int32_t fnNumber, const char *, va_list) {
5101     if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5102         gExitFn.push_back(fnNumber);
5103     }
5104 }
5105 
5106 
assertTestTraceResult(int32_t fnNumber,const char * expectedData)5107 void RBBITest::assertTestTraceResult(int32_t fnNumber, const char* expectedData) {
5108     assertEquals("utrace_entry should be called ", 1, gEntryFn.size());
5109     assertEquals("utrace_entry should be called with ", fnNumber, gEntryFn[0]);
5110     assertEquals("utrace_exit should be called ", 1, gExitFn.size());
5111     assertEquals("utrace_exit should be called with ", fnNumber, gExitFn[0]);
5112 
5113     if (expectedData == nullptr) {
5114       assertEquals("utrace_data should not be called ", 0, gDataFn.size());
5115       assertEquals("utrace_data should not be called ", 0, gData.size());
5116     } else {
5117       assertEquals("utrace_data should be called ", 1, gDataFn.size());
5118       assertEquals("utrace_data should be called with ", fnNumber, gDataFn[0]);
5119       assertEquals("utrace_data should be called ", 1, gData.size());
5120       assertEquals("utrace_data should pass in ", expectedData, gData[0].c_str());
5121     }
5122 }
5123 
SetupTestTrace()5124 void SetupTestTrace() {
5125     gEntryFn.clear();
5126     gExitFn.clear();
5127     gDataFn.clear();
5128     gData.clear();
5129 
5130     const void* context = nullptr;
5131     utrace_setFunctions(context, traceEntry, traceExit, traceData);
5132     utrace_setLevel(UTRACE_INFO);
5133 }
5134 
TestTraceCreateCharacter()5135 void RBBITest::TestTraceCreateCharacter() {
5136     SetupTestTrace();
5137     IcuTestErrorCode status(*this, "TestTraceCreateCharacter");
5138     LocalPointer<BreakIterator> brkitr(
5139         BreakIterator::createCharacterInstance("zh-CN", status));
5140     status.errIfFailureAndReset();
5141     assertTestTraceResult(UTRACE_UBRK_CREATE_CHARACTER, nullptr);
5142 }
5143 
TestTraceCreateTitle()5144 void RBBITest::TestTraceCreateTitle() {
5145     SetupTestTrace();
5146     IcuTestErrorCode status(*this, "TestTraceCreateTitle");
5147     LocalPointer<BreakIterator> brkitr(
5148         BreakIterator::createTitleInstance("zh-CN", status));
5149     status.errIfFailureAndReset();
5150     assertTestTraceResult(UTRACE_UBRK_CREATE_TITLE, nullptr);
5151 }
5152 
TestTraceCreateSentence()5153 void RBBITest::TestTraceCreateSentence() {
5154     SetupTestTrace();
5155     IcuTestErrorCode status(*this, "TestTraceCreateSentence");
5156     LocalPointer<BreakIterator> brkitr(
5157         BreakIterator::createSentenceInstance("zh-CN", status));
5158     status.errIfFailureAndReset();
5159     assertTestTraceResult(UTRACE_UBRK_CREATE_SENTENCE, nullptr);
5160 }
5161 
TestTraceCreateWord()5162 void RBBITest::TestTraceCreateWord() {
5163     SetupTestTrace();
5164     IcuTestErrorCode status(*this, "TestTraceCreateWord");
5165     LocalPointer<BreakIterator> brkitr(
5166         BreakIterator::createWordInstance("zh-CN", status));
5167     status.errIfFailureAndReset();
5168     assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5169 }
5170 
TestTraceCreateLine()5171 void RBBITest::TestTraceCreateLine() {
5172     SetupTestTrace();
5173     IcuTestErrorCode status(*this, "TestTraceCreateLine");
5174     LocalPointer<BreakIterator> brkitr(
5175         BreakIterator::createLineInstance("zh-CN", status));
5176     status.errIfFailureAndReset();
5177     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line");
5178 }
5179 
TestTraceCreateLineStrict()5180 void RBBITest::TestTraceCreateLineStrict() {
5181     SetupTestTrace();
5182     IcuTestErrorCode status(*this, "TestTraceCreateLineStrict");
5183     LocalPointer<BreakIterator> brkitr(
5184         BreakIterator::createLineInstance("zh-CN-u-lb-strict", status));
5185     status.errIfFailureAndReset();
5186     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_strict");
5187 }
5188 
TestTraceCreateLineNormal()5189 void RBBITest::TestTraceCreateLineNormal() {
5190     SetupTestTrace();
5191     IcuTestErrorCode status(*this, "TestTraceCreateLineNormal");
5192     LocalPointer<BreakIterator> brkitr(
5193         BreakIterator::createLineInstance("zh-CN-u-lb-normal", status));
5194     status.errIfFailureAndReset();
5195     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_normal");
5196 }
5197 
TestTraceCreateLineLoose()5198 void RBBITest::TestTraceCreateLineLoose() {
5199     SetupTestTrace();
5200     IcuTestErrorCode status(*this, "TestTraceCreateLineLoose");
5201     LocalPointer<BreakIterator> brkitr(
5202         BreakIterator::createLineInstance("zh-CN-u-lb-loose", status));
5203     status.errIfFailureAndReset();
5204     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_loose");
5205 }
5206 
TestTraceCreateLineLoosePhrase()5207 void RBBITest::TestTraceCreateLineLoosePhrase() {
5208     SetupTestTrace();
5209     IcuTestErrorCode status(*this, "TestTraceCreateLineLoosePhrase");
5210     LocalPointer<BreakIterator> brkitr(
5211         BreakIterator::createLineInstance("ja-u-lb-loose-lw-phrase", status));
5212     status.errIfFailureAndReset();
5213     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_loose_phrase");
5214 }
5215 
TestTraceCreateLineNormalPhrase()5216 void RBBITest::TestTraceCreateLineNormalPhrase() {
5217     SetupTestTrace();
5218     IcuTestErrorCode status(*this, "TestTraceCreateLineNormalPhrase");
5219     LocalPointer<BreakIterator> brkitr(
5220         BreakIterator::createLineInstance("ja-u-lb-normal-lw-phrase", status));
5221     status.errIfFailureAndReset();
5222     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_normal_phrase");
5223 }
5224 
TestTraceCreateLineStrictPhrase()5225 void RBBITest::TestTraceCreateLineStrictPhrase() {
5226     SetupTestTrace();
5227     IcuTestErrorCode status(*this, "TestTraceCreateLineStrictPhrase");
5228     LocalPointer<BreakIterator> brkitr(
5229         BreakIterator::createLineInstance("ja-u-lb-strict-lw-phrase", status));
5230     status.errIfFailureAndReset();
5231     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_strict_phrase");
5232 }
5233 
TestTraceCreateLinePhrase()5234 void RBBITest::TestTraceCreateLinePhrase() {
5235     SetupTestTrace();
5236     IcuTestErrorCode status(*this, "TestTraceCreateLinePhrase");
5237     LocalPointer<BreakIterator> brkitr(
5238         BreakIterator::createLineInstance("ja-u-lw-phrase", status));
5239     status.errIfFailureAndReset();
5240     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_phrase");
5241 }
5242 
TestTraceCreateBreakEngine()5243 void RBBITest::TestTraceCreateBreakEngine() {
5244     rbbi_cleanup();
5245     SetupTestTrace();
5246     IcuTestErrorCode status(*this, "TestTraceCreateBreakEngine");
5247     LocalPointer<BreakIterator> brkitr(
5248         BreakIterator::createWordInstance("zh-CN", status));
5249     status.errIfFailureAndReset();
5250     assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5251 
5252     // To word break the following text, BreakIterator will create 5 dictionary
5253     // break engine internally.
5254     UnicodeString text(
5255         u"test "
5256         u"測試 " // Hani
5257         u"សាកល្បង " // Khmr
5258         u"ທົດສອບ " // Laoo
5259         u"စမ်းသပ်မှု " // Mymr
5260         u"ทดสอบ " // Thai
5261         u"test "
5262     );
5263     brkitr->setText(text);
5264 
5265     // Loop through all the text.
5266     while (brkitr->next() > 0) ;
5267 
5268     assertEquals("utrace_entry should be called ", 6, gEntryFn.size());
5269     assertEquals("utrace_exit should be called ", 6, gExitFn.size());
5270     assertEquals("utrace_data should be called ", 5, gDataFn.size());
5271 
5272     for (std::vector<int>::size_type i = 0; i < gDataFn.size(); i++) {
5273         assertEquals("utrace_entry should be called ",
5274                      UTRACE_UBRK_CREATE_BREAK_ENGINE, gEntryFn[i+1]);
5275         assertEquals("utrace_exit should be called ",
5276                      UTRACE_UBRK_CREATE_BREAK_ENGINE, gExitFn[i+1]);
5277         assertEquals("utrace_data should be called ",
5278                      UTRACE_UBRK_CREATE_BREAK_ENGINE, gDataFn[i]);
5279     }
5280 
5281     assertEquals("utrace_data should pass ", "Hani", gData[0].c_str());
5282     assertEquals("utrace_data should pass ", "Khmr", gData[1].c_str());
5283     assertEquals("utrace_data should pass ", "Laoo", gData[2].c_str());
5284     assertEquals("utrace_data should pass ", "Mymr", gData[3].c_str());
5285     assertEquals("utrace_data should pass ", "Thai", gData[4].c_str());
5286 
5287 }
5288 #endif
5289 
TestUnpairedSurrogate()5290 void RBBITest::TestUnpairedSurrogate() {
5291     UnicodeString rules(u"ab;");
5292 
5293     UErrorCode status = U_ZERO_ERROR;
5294     UParseError pe;
5295     RuleBasedBreakIterator bi1(rules, pe, status);
5296     assertSuccess(WHERE, status);
5297     UnicodeString rtRules = bi1.getRules();
5298     // make sure the simple one work first.
5299     assertEquals(WHERE, rules,  rtRules);
5300 
5301 
5302     rules = UnicodeString(u"a\\ud800b;").unescape();
5303     pe.line = 0;
5304     pe.offset = 0;
5305     RuleBasedBreakIterator bi2(rules, pe, status);
5306     assertEquals(WHERE "unpaired lead surrogate", U_ILLEGAL_CHAR_FOUND , status);
5307     if (pe.line != 1 || pe.offset != 1) {
5308         errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
5309     }
5310 
5311     status = U_ZERO_ERROR;
5312     rules = UnicodeString(u"a\\ude00b;").unescape();
5313     pe.line = 0;
5314     pe.offset = 0;
5315     RuleBasedBreakIterator bi3(rules, pe, status);
5316     assertEquals(WHERE "unpaired tail surrogate", U_ILLEGAL_CHAR_FOUND , status);
5317     if (pe.line != 1 || pe.offset != 1) {
5318         errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
5319     }
5320 
5321     // make sure the surrogate one work too.
5322     status = U_ZERO_ERROR;
5323     rules = UnicodeString(u"a��b;");
5324     RuleBasedBreakIterator bi4(rules, pe, status);
5325     rtRules = bi4.getRules();
5326     assertEquals(WHERE, rules, rtRules);
5327 }
5328 
5329 // Read file generated by
5330 // https://github.com/unicode-org/lstm_word_segmentation/blob/master/segment_text.py
5331 // as test cases and compare the Output.
5332 // Format of the file
5333 //   Model:\t[Model Name (such as 'Thai_graphclust_model4_heavy')]
5334 //   Embedding:\t[Embedding type (such as 'grapheme_clusters_tf')]
5335 //   Input:\t[source text]
5336 //   Output:\t[expected output separated by | ]
5337 //   Input: ...
5338 //   Output: ...
5339 
runLSTMTestFromFile(const char * filename,UScriptCode script)5340 void RBBITest::runLSTMTestFromFile(const char* filename, UScriptCode script) {
5341     // The expectation in this test depends on LSTM, skip the test if the
5342     // configuration is not build with LSTM data.
5343     if (skipLSTMTest()) {
5344         return;
5345     }
5346     UErrorCode   status = U_ZERO_ERROR;
5347     LocalPointer<BreakIterator> iterator(BreakIterator::createWordInstance(Locale(), status));
5348     if (U_FAILURE(status)) {
5349         errln("%s:%d Error %s Cannot create Word BreakIterator", __FILE__, __LINE__, u_errorName(status));
5350         return;
5351     }
5352     //  Open and read the test data file.
5353     const char *testDataDirectory = IntlTest::getSourceTestData(status);
5354     CharString testFileName(testDataDirectory, -1, status);
5355     testFileName.append(filename, -1, status);
5356 
5357     int len;
5358     char16_t *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
5359     if (U_FAILURE(status)) {
5360         errln("%s:%d Error %s opening test file %s", __FILE__, __LINE__, u_errorName(status), filename);
5361         return;
5362     }
5363 
5364     //  Put the test data into a UnicodeString
5365     UnicodeString testString(false, testFile, len);
5366 
5367     int32_t start = 0;
5368 
5369     UnicodeString line;
5370     int32_t end;
5371     std::string actual_sep_str;
5372     int32_t caseNum = 0;
5373     // Iterate through all the lines in the test file.
5374     do {
5375         int32_t cr = testString.indexOf(u'\r', start);
5376         int32_t lf = testString.indexOf(u'\n', start);
5377         end = cr >= 0 ? (lf >= 0 ? std::min(cr, lf) : cr) : lf;
5378         line = testString.tempSubString(start, end < 0 ? INT32_MAX : end - start);
5379         if (line.length() > 0) {
5380             // Separate each line to key and value by TAB.
5381             int32_t tab = line.indexOf(u'\t');
5382             UnicodeString key = line.tempSubString(0, tab);
5383             const UnicodeString value = line.tempSubString(tab+1);
5384 
5385             if (key == "Model:") {
5386                 // Verify the expectation in the test file match the LSTM model
5387                 // we are using now.
5388                 const LSTMData* data = CreateLSTMDataForScript(script, status);
5389                 if (U_FAILURE(status)) {
5390                     dataerrln("%s:%d Error %s Cannot create LSTM data for script %s",
5391                               __FILE__, __LINE__, u_errorName(status), uscript_getName(script));
5392                     return;
5393                 }
5394                 UnicodeString name(LSTMDataName(data));
5395                 DeleteLSTMData(data);
5396                 if (value != name) {
5397                     std::string utf8Name, utf8Value;
5398                     dataerrln("%s:%d Error %s The LSTM data for script %s is %s instead of %s",
5399                               __FILE__, __LINE__, u_errorName(status), uscript_getName(script),
5400                               name.toUTF8String<std::string>(utf8Name).c_str(),
5401                               value.toUTF8String<std::string>(utf8Value).c_str());
5402                     return;
5403                 }
5404             } else if (key == "Input:") {
5405                 UnicodeString input("prefix ");
5406                 input += value + " suffix";
5407                 std::stringstream ss;
5408 
5409                 // Construct the UText which is expected by the the engine as
5410                 // input from the UnicodeString.
5411                 UText ut = UTEXT_INITIALIZER;
5412                 utext_openConstUnicodeString(&ut, &input, &status);
5413                 if (U_FAILURE(status)) {
5414                     dataerrln("Could not utext_openConstUnicodeString for " + value + UnicodeString(u_errorName(status)));
5415                     return;
5416                 }
5417 
5418                 iterator->setText(&ut, status);
5419                 if (U_FAILURE(status)) {
5420                     errln("%s:%d Error %s Could not setText to BreakIterator", __FILE__, __LINE__, u_errorName(status));
5421                     return;
5422                 }
5423 
5424                 int32_t bp;
5425                 for (bp = iterator->first(); bp != BreakIterator::DONE; bp = iterator->next()) {
5426                     ss << bp;
5427                     if (bp != input.length()) {
5428                         ss << ", ";
5429                     }
5430                 }
5431 
5432                 utext_close(&ut);
5433                 // Turn the break points into a string for easy comparison
5434                 // output.
5435                 actual_sep_str = "{" + ss.str() + "}";
5436             } else if (key == "Output:" && !actual_sep_str.empty()) {
5437                 UnicodeString input("prefix| |");
5438                 input += value + "| |suffix";
5439                 std::string d;
5440                 int32_t sep;
5441                 int32_t start = 0;
5442                 int32_t curr = 0;
5443                 std::stringstream ss;
5444                 // Include 0 as the break point.
5445                 ss << "0, ";
5446                 while ((sep = input.indexOf(u'|', start)) >= 0) {
5447                     int32_t len = sep - start;
5448                     if (len > 0) {
5449                         if (curr > 0) {
5450                             ss << ", ";
5451                         }
5452                         curr += len;
5453                         ss << curr;
5454                     }
5455                     start = sep + 1;
5456                 }
5457                 // Include end of the string as break point.
5458                 ss << ", " << curr + input.length() - start;
5459                 // Turn the break points into a string for easy comparison
5460                 // output.
5461                 std::string expected = "{" + ss.str() + "}";
5462                 std::string utf8;
5463 
5464                 assertEquals((input + " Test Case#" + caseNum).toUTF8String<std::string>(utf8).c_str(),
5465                              expected.c_str(), actual_sep_str.c_str());
5466                 actual_sep_str.clear();
5467             }
5468         }
5469         start = std::max(cr, lf) + 1;
5470     } while (end >= 0);
5471 
5472     delete [] testFile;
5473 }
5474 
TestLSTMThai()5475 void RBBITest::TestLSTMThai() {
5476     runLSTMTestFromFile("Thai_graphclust_model4_heavy_Test.txt", USCRIPT_THAI);
5477 }
5478 
TestLSTMBurmese()5479 void RBBITest::TestLSTMBurmese() {
5480     runLSTMTestFromFile("Burmese_graphclust_model5_heavy_Test.txt", USCRIPT_MYANMAR);
5481 }
5482 
5483 
5484 // Test preceding(index) and following(index), with semi-random indexes.
5485 // The random indexes are produced in clusters that are relatively closely spaced,
5486 // to increase the occurrences of hits to the internal break cache.
5487 
TestRandomAccess()5488 void RBBITest::TestRandomAccess() {
5489     static constexpr int32_t CACHE_SIZE = 128;
5490 
5491     UnicodeString testData;
5492     for (int i=0; i<CACHE_SIZE*2; ++i) {
5493         testData.append(u"aaaa\n");
5494     }
5495 
5496     UErrorCode status = U_ZERO_ERROR;
5497     LocalPointer<RuleBasedBreakIterator> bi(
5498           dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createLineInstance(Locale::getEnglish(), status)),
5499             status);
5500     if (!assertSuccess(WHERE, status)) { return; };
5501 
5502     bi->setText(testData);
5503 
5504     auto expectedPreceding = [](int from) {
5505         if (from == 0) {return UBRK_DONE;}
5506         if (from % 5 == 0) {return from - 5;}
5507         return from - (from % 5);
5508     };
5509 
5510     auto expectedFollow = [testData](int from) {
5511         if (from >= testData.length()) {return UBRK_DONE;}
5512         if (from % 5 == 0) {return from + 5;}
5513         return from + (5 - (from % 5));
5514     };
5515 
5516     auto randomStringIndex = [testData]() {
5517         static icu_rand randomGenerator;  // produces random uint32_t values.
5518         static int lastNum;
5519         static int clusterCount;
5520         static constexpr int CLUSTER_SIZE = 100;
5521         static constexpr int CLUSTER_LENGTH = 10;
5522 
5523         if (clusterCount < CLUSTER_LENGTH) {
5524             ++clusterCount;
5525             lastNum += (randomGenerator() % CLUSTER_SIZE);
5526             lastNum -= CLUSTER_SIZE / 2;
5527             lastNum = std::max(0, lastNum);
5528             // Deliberately test indexes > testData.length.
5529             lastNum = std::min(testData.length() + 5, lastNum);
5530         } else {
5531             clusterCount = 0;
5532             lastNum = randomGenerator() % testData.length();
5533         }
5534         return lastNum;
5535     };
5536 
5537     for (int i=0; i<5000; ++i) {
5538         int idx = randomStringIndex();
5539         assertEquals(WHERE, expectedFollow(idx), bi->following(idx));
5540         idx = randomStringIndex();
5541         assertEquals(WHERE, expectedPreceding(idx), bi->preceding(idx));
5542     }
5543 }
5544 
5545 #endif // #if !UCONFIG_NO_BREAK_ITERATION
5546