• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4  * COPYRIGHT:
5  * Copyright (c) 1999-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  ********************************************************************/
8 /************************************************************************
9 *   Date        Name        Description
10 *   12/15/99    Madhu        Creation.
11 *   01/12/2000  Madhu        Updated for changed API and added new tests
12 ************************************************************************/
13 
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_BREAK_ITERATION
16 
17 #include <algorithm>
18 #include <sstream>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <utility>
23 #include <vector>
24 
25 #include "unicode/brkiter.h"
26 #include "unicode/localpointer.h"
27 #include "unicode/numfmt.h"
28 #include "unicode/rbbi.h"
29 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
30 #include "unicode/regex.h"
31 #endif
32 #include "unicode/schriter.h"
33 #include "unicode/uchar.h"
34 #include "unicode/utf16.h"
35 #include "unicode/ucnv.h"
36 #include "unicode/uniset.h"
37 #include "unicode/uscript.h"
38 #include "unicode/ustring.h"
39 #include "unicode/utext.h"
40 #include "unicode/utrace.h"
41 
42 #include "charstr.h"
43 #include "cmemory.h"
44 #include "cstr.h"
45 #include "intltest.h"
46 #include "lstmbe.h"
47 #include "rbbitst.h"
48 #include "rbbidata.h"
49 #include "utypeinfo.h"  // for 'typeid' to work
50 #include "uvector.h"
51 #include "uvectr32.h"
52 
53 
54 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
55 #include "unicode/filteredbrk.h"
56 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
57 
// TEST_ASSERT(x): log a failure naming the current file and line when x evaluates to false.
// Expands to an unqualified errln() call, so one must be visible at the point of use.
#define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(x)) { errln("Failure in file %s, line %d", __FILE__, __LINE__); } \
} UPRV_BLOCK_MACRO_END
63 
// TEST_ASSERT_SUCCESS(errcode): when errcode is a failing UErrorCode, report it through
// errcheckln() together with the current file, line and the symbolic name of the error.
#define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, \
                   "Failure in file %s, line %d, status = \"%s\"", \
                   __FILE__, __LINE__, u_errorName(errcode)); \
    } \
} UPRV_BLOCK_MACRO_END
69 
// MONKEY_ERROR(msg, fRuleFileName, index, seed): report a monkey-test failure through the
// global IntlTest instance. The message includes the "type=... seed=... loop=1" parameter
// string needed to reproduce the failing run.
#define MONKEY_ERROR(msg, fRuleFileName, index, seed) { \
    IntlTest::gTest->errln( \
        "\n%s:%d %s at index %d. Parameters to reproduce: @\"type=%s seed=%u loop=1\"", \
        __FILE__, __LINE__, msg, index, fRuleFileName, seed); \
}
74 
75 //---------------------------------------------
76 // runIndexedTest
77 //---------------------------------------------
78 
79 
//  Note:  Before adding new tests to this file, check whether the desired test data can
//         simply be added to the file testdata/rbbitst.txt.  In most cases it can;
//         it's much less work than writing a new test, diagnostic output in the event of failures
//         is good, and the test data file is shared with ICU4J, so eventually the test
//         will run there as well, without additional effort.
85 
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)86 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
87 {
88     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
89     fTestParams = params;
90 
91     TESTCASE_AUTO_BEGIN;
92 #if !UCONFIG_NO_FILE_IO
93     TESTCASE_AUTO(TestBug4153072);
94 #endif
95 #if !UCONFIG_NO_FILE_IO
96     TESTCASE_AUTO(TestUnicodeFiles);
97 #endif
98     TESTCASE_AUTO(TestGetAvailableLocales);
99     TESTCASE_AUTO(TestGetDisplayName);
100 #if !UCONFIG_NO_FILE_IO
101     TESTCASE_AUTO(TestEndBehaviour);
102     TESTCASE_AUTO(TestWordBreaks);
103     TESTCASE_AUTO(TestWordBoundary);
104     TESTCASE_AUTO(TestLineBreaks);
105     TESTCASE_AUTO(TestSentBreaks);
106     TESTCASE_AUTO(TestExtended);
107 #endif
108 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
109     TESTCASE_AUTO(TestMonkey);
110 #endif
111 #if !UCONFIG_NO_FILE_IO
112     TESTCASE_AUTO(TestBug3818);
113 #endif
114     TESTCASE_AUTO(TestDebug);
115 #if !UCONFIG_NO_FILE_IO
116     TESTCASE_AUTO(TestBug5775);
117 #endif
118     TESTCASE_AUTO(TestBug9983);
119     TESTCASE_AUTO(TestDictRules);
120     TESTCASE_AUTO(TestBug5532);
121     TESTCASE_AUTO(TestBug7547);
122     TESTCASE_AUTO(TestBug12797);
123     TESTCASE_AUTO(TestBug12918);
124     TESTCASE_AUTO(TestBug12932);
125     TESTCASE_AUTO(TestEmoji);
126     TESTCASE_AUTO(TestBug12519);
127     TESTCASE_AUTO(TestBug12677);
128     TESTCASE_AUTO(TestTableRedundancies);
129     TESTCASE_AUTO(TestBug13447);
130     TESTCASE_AUTO(TestReverse);
131     TESTCASE_AUTO(TestBug13692);
132     TESTCASE_AUTO(TestDebugRules);
133     TESTCASE_AUTO(Test8BitsTrieWith8BitStateTable);
134     TESTCASE_AUTO(Test8BitsTrieWith16BitStateTable);
135     TESTCASE_AUTO(Test16BitsTrieWith8BitStateTable);
136     TESTCASE_AUTO(Test16BitsTrieWith16BitStateTable);
137     TESTCASE_AUTO(TestTable_8_16_Bits);
138     TESTCASE_AUTO(TestBug13590);
139     TESTCASE_AUTO(TestUnpairedSurrogate);
140     TESTCASE_AUTO(TestLSTMThai);
141     TESTCASE_AUTO(TestLSTMBurmese);
142 
143 #if U_ENABLE_TRACING
144     TESTCASE_AUTO(TestTraceCreateCharacter);
145     TESTCASE_AUTO(TestTraceCreateWord);
146     TESTCASE_AUTO(TestTraceCreateSentence);
147     TESTCASE_AUTO(TestTraceCreateTitle);
148     TESTCASE_AUTO(TestTraceCreateLine);
149     TESTCASE_AUTO(TestTraceCreateLineNormal);
150     TESTCASE_AUTO(TestTraceCreateLineLoose);
151     TESTCASE_AUTO(TestTraceCreateLineStrict);
152     TESTCASE_AUTO(TestTraceCreateLineNormalPhrase);
153     TESTCASE_AUTO(TestTraceCreateLineLoosePhrase);
154     TESTCASE_AUTO(TestTraceCreateLineStrictPhrase);
155     TESTCASE_AUTO(TestTraceCreateLinePhrase);
156     TESTCASE_AUTO(TestTraceCreateBreakEngine);
157 #endif
158 
159     TESTCASE_AUTO_END;
160 }
161 
162 
163 //--------------------------------------------------------------------------------------
164 //
165 //    RBBITest    constructor and destructor
166 //
167 //--------------------------------------------------------------------------------------
168 
RBBITest()169 RBBITest::RBBITest() {
170     fTestParams = NULL;
171 }
172 
173 
~RBBITest()174 RBBITest::~RBBITest() {
175 }
176 
177 
printStringBreaks(UText * tstr,int expected[],int expectedCount)178 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
179     UErrorCode status = U_ZERO_ERROR;
180     char name[100];
181     printf("code    alpha extend alphanum type word sent line name\n");
182     int nextExpectedIndex = 0;
183     utext_setNativeIndex(tstr, 0);
184     for (int j = 0; j < static_cast<int>(utext_nativeLength(tstr)); j=static_cast<int>(utext_getNativeIndex(tstr))) {
185         if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
186             printf("------------------------------------------------ %d\n", j);
187             ++nextExpectedIndex;
188         }
189 
190         UChar32 c = utext_next32(tstr);
191         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
192         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
193                            u_isUAlphabetic(c),
194                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
195                            u_isalnum(c),
196                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
197                                                   u_charType(c),
198                                                   U_SHORT_PROPERTY_NAME),
199                            u_getPropertyValueName(UCHAR_WORD_BREAK,
200                                                   u_getIntPropertyValue(c,
201                                                           UCHAR_WORD_BREAK),
202                                                   U_SHORT_PROPERTY_NAME),
203                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
204                                    u_getIntPropertyValue(c,
205                                            UCHAR_SENTENCE_BREAK),
206                                    U_SHORT_PROPERTY_NAME),
207                            u_getPropertyValueName(UCHAR_LINE_BREAK,
208                                    u_getIntPropertyValue(c,
209                                            UCHAR_LINE_BREAK),
210                                    U_SHORT_PROPERTY_NAME),
211                            name);
212     }
213 }
214 
215 
printStringBreaks(const UnicodeString & ustr,int expected[],int expectedCount)216 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
217    UErrorCode status = U_ZERO_ERROR;
218    UText *tstr = NULL;
219    tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
220    if (U_FAILURE(status)) {
221        printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
222        return;
223     }
224    printStringBreaks(tstr, expected, expectedCount);
225    utext_close(tstr);
226 }
227 
228 
TestBug3818()229 void RBBITest::TestBug3818() {
230     UErrorCode  status = U_ZERO_ERROR;
231 
232     // Four Thai words...
233     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
234                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
235     UnicodeString  thaiStr(thaiWordData);
236 
237     BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
238     if (U_FAILURE(status) || bi == NULL) {
239         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
240         return;
241     }
242     bi->setText(thaiStr);
243 
244     int32_t  startOfSecondWord = bi->following(1);
245     if (startOfSecondWord != 4) {
246         errln("Fail at file %s, line %d expected start of word at 4, got %d",
247             __FILE__, __LINE__, startOfSecondWord);
248     }
249     startOfSecondWord = bi->following(0);
250     if (startOfSecondWord != 4) {
251         errln("Fail at file %s, line %d expected start of word at 4, got %d",
252             __FILE__, __LINE__, startOfSecondWord);
253     }
254     delete bi;
255 }
256 
257 
258 //---------------------------------------------
259 //
260 //     other tests
261 //
262 //---------------------------------------------
263 
TestGetAvailableLocales()264 void RBBITest::TestGetAvailableLocales()
265 {
266     int32_t locCount = 0;
267     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
268 
269     if (locCount == 0)
270         dataerrln("getAvailableLocales() returned an empty list!");
271     // Just make sure that it's returning good memory.
272     int32_t i;
273     for (i = 0; i < locCount; ++i) {
274         logln(locList[i].getName());
275     }
276 }
277 
278 //Testing the BreakIterator::getDisplayName() function
TestGetDisplayName()279 void RBBITest::TestGetDisplayName()
280 {
281     UnicodeString   result;
282 
283     BreakIterator::getDisplayName(Locale::getUS(), result);
284     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
285         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
286                 + result);
287 
288     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
289     if (result != "French (France)")
290         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
291                 + result);
292 }
293 /**
294  * Test End Behaviour
295  * @bug 4068137
296  */
TestEndBehaviour()297 void RBBITest::TestEndBehaviour()
298 {
299     UErrorCode status = U_ZERO_ERROR;
300     UnicodeString testString("boo.");
301     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
302     if (U_FAILURE(status))
303     {
304         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
305         return;
306     }
307     wb->setText(testString);
308 
309     if (wb->first() != 0)
310         errln("Didn't get break at beginning of string.");
311     if (wb->next() != 3)
312         errln("Didn't get break before period in \"boo.\"");
313     if (wb->current() != 4 && wb->next() != 4)
314         errln("Didn't get break at end of string.");
315     delete wb;
316 }
317 /*
318  * @bug 4153072
319  */
TestBug4153072()320 void RBBITest::TestBug4153072() {
321     UErrorCode status = U_ZERO_ERROR;
322     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
323     if (U_FAILURE(status))
324     {
325         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
326         return;
327     }
328     UnicodeString str("...Hello, World!...");
329     int32_t begin = 3;
330     int32_t end = str.length() - 3;
331     UBool onBoundary;
332 
333     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
334     iter->adoptText(textIterator);
335     int index;
336     // Note: with the switch to UText, there is no way to restrict the
337     //       iteration range to begin at an index other than zero.
338     //       String character iterators created with a non-zero bound are
339     //         treated by RBBI as being empty.
340     for (index = -1; index < begin + 1; ++index) {
341         onBoundary = iter->isBoundary(index);
342         if (index == 0?  !onBoundary : onBoundary) {
343             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
344                             " and begin index = " + begin);
345         }
346     }
347     delete iter;
348 }
349 
350 
351 //
352 // Test for problem reported by Ashok Matoria on 9 July 2007
353 //    One.<kSoftHyphen><kSpace>Two.
354 //
355 //    Sentence break at start (0) and then on calling next() it breaks at
356 //   'T' of "Two". Now, at this point if I do next() and
357 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
358 //
TestBug5775()359 void RBBITest::TestBug5775() {
360     UErrorCode status = U_ZERO_ERROR;
361     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
362     TEST_ASSERT_SUCCESS(status);
363     if (U_FAILURE(status)) {
364         return;
365     }
366 // Check for status first for better handling of no data errors.
367     TEST_ASSERT(bi != NULL);
368     if (bi == NULL) {
369         return;
370     }
371 
372     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
373     //               01234      56789
374     s = s.unescape();
375     bi->setText(s);
376     int pos = bi->next();
377     TEST_ASSERT(pos == 6);
378     pos = bi->next();
379     TEST_ASSERT(pos == 10);
380     pos = bi->previous();
381     TEST_ASSERT(pos == 6);
382     delete bi;
383 }
384 
385 
386 
387 //------------------------------------------------------------------------------
388 //
389 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
390 //
391 //------------------------------------------------------------------------------
392 
393 struct TestParams {
394     BreakIterator   *bi;                   // Break iterator is set while parsing test source.
395                                            //   Changed out whenever test data changes break type.
396 
397     UnicodeString    dataToBreak;          // Data that is built up while parsing the test.
398     UVector32       *expectedBreaks;       // Expected break positions, matches dataToBreak UnicodeString.
399     UVector32       *srcLine;              // Positions in source file, indexed same as dataToBreak.
400     UVector32       *srcCol;
401 
402     UText           *textToBreak;          // UText, could be UTF8 or UTF16.
403     UVector32       *textMap;              // Map from UTF-16 dataToBreak offsets to UText offsets.
404     CharString       utf8String;           // UTF-8 form of text to break.
405 
TestParamsTestParams406     TestParams(UErrorCode &status) : dataToBreak() {
407         bi               = NULL;
408         expectedBreaks   = new UVector32(status);
409         srcLine          = new UVector32(status);
410         srcCol           = new UVector32(status);
411         textToBreak      = NULL;
412         textMap          = new UVector32(status);
413     }
414 
~TestParamsTestParams415     ~TestParams() {
416         delete bi;
417         delete expectedBreaks;
418         delete srcLine;
419         delete srcCol;
420         utext_close(textToBreak);
421         delete textMap;
422     }
423 
424     int32_t getSrcLine(int32_t bp);
425     int32_t getExpectedBreak(int32_t bp);
426     int32_t getSrcCol(int32_t bp);
427 
428     void setUTF16(UErrorCode &status);
429     void setUTF8(UErrorCode &status);
430 };
431 
432 // Append a UnicodeString to a CharString with UTF-8 encoding.
433 // Substitute any invalid chars.
434 //   Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
CharStringAppend(CharString & dest,const UnicodeString & src,UErrorCode & status)435 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
436     if (U_FAILURE(status)) {
437         return;
438     }
439     int32_t utf8Length;
440     u_strToUTF8WithSub(NULL, 0, &utf8Length,            // Output Buffer, NULL for preflight.
441                        src.getBuffer(), src.length(),   // UTF-16 data
442                        0xfffd, NULL,                    // Substitution char, number of subs.
443                        &status);
444     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
445         return;
446     }
447     status = U_ZERO_ERROR;
448     int32_t capacity;
449     char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
450     u_strToUTF8WithSub(buffer, utf8Length, NULL,
451                        src.getBuffer(), src.length(),
452                        0xfffd, NULL, &status);
453     dest.append(buffer, utf8Length, status);
454 }
455 
456 
setUTF16(UErrorCode & status)457 void TestParams::setUTF16(UErrorCode &status) {
458     textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
459     textMap->removeAllElements();
460     for (int32_t i=0; i<dataToBreak.length(); i++) {
461         if (i == dataToBreak.getChar32Start(i)) {
462             textMap->addElement(i, status);
463         } else {
464             textMap->addElement(-1, status);
465         }
466     }
467     textMap->addElement(dataToBreak.length(), status);
468     U_ASSERT(dataToBreak.length() + 1 == textMap->size());
469 }
470 
471 
setUTF8(UErrorCode & status)472 void TestParams::setUTF8(UErrorCode &status) {
473     if (U_FAILURE(status)) {
474         return;
475     }
476     utf8String.clear();
477     CharStringAppend(utf8String, dataToBreak, status);
478     textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
479     if (U_FAILURE(status)) {
480         return;
481     }
482 
483     textMap->removeAllElements();
484     int32_t utf16Index = 0;
485     for (;;) {
486         textMap->addElement(utf16Index, status);
487         UChar32 c32 = utext_current32(textToBreak);
488         if (c32 < 0) {
489             break;
490         }
491         utf16Index += U16_LENGTH(c32);
492         utext_next32(textToBreak);
493         while (textMap->size() < utext_getNativeIndex(textToBreak)) {
494             textMap->addElement(-1, status);
495         }
496     }
497     U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
498 }
499 
500 
getSrcLine(int32_t bp)501 int32_t TestParams::getSrcLine(int32_t bp) {
502     if (bp >= textMap->size()) {
503         bp = textMap->size() - 1;
504     }
505     int32_t i = 0;
506     for(; bp >= 0 ; --bp) {
507         // Move to a character boundary if we are not on one already.
508         i = textMap->elementAti(bp);
509         if (i >= 0) {
510             break;
511         }
512     }
513     return srcLine->elementAti(i);
514 }
515 
516 
getExpectedBreak(int32_t bp)517 int32_t TestParams::getExpectedBreak(int32_t bp) {
518     if (bp >= textMap->size()) {
519         return 0;
520     }
521     int32_t i = textMap->elementAti(bp);
522     int32_t retVal = 0;
523     if (i >= 0) {
524         retVal = expectedBreaks->elementAti(i);
525     }
526     return retVal;
527 }
528 
529 
getSrcCol(int32_t bp)530 int32_t TestParams::getSrcCol(int32_t bp) {
531     if (bp >= textMap->size()) {
532         bp = textMap->size() - 1;
533     }
534     int32_t i = 0;
535     for(; bp >= 0; --bp) {
536         // Move bp to a character boundary if we are not on one already.
537         i = textMap->elementAti(bp);
538         if (i >= 0) {
539             break;
540         }
541     }
542     return srcCol->elementAti(i);
543 }
544 
545 
executeTest(TestParams * t,UErrorCode & status)546 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
547     int32_t    bp;
548     int32_t    prevBP;
549     int32_t    i;
550 
551     TEST_ASSERT_SUCCESS(status);
552     if (U_FAILURE(status)) {
553         return;
554     }
555 
556     if (t->bi == NULL) {
557         return;
558     }
559 
560     t->bi->setText(t->textToBreak, status);
561     //
562     //  Run the iterator forward
563     //
564     prevBP = -1;
565     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
566         if (prevBP ==  bp) {
567             // Fail for lack of forward progress.
568             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
569                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
570             break;
571         }
572 
573         // Check that there we didn't miss an expected break between the last one
574         //  and this one.
575         for (i=prevBP+1; i<bp; i++) {
576             if (t->getExpectedBreak(i) != 0) {
577                 int expected[] = {0, i};
578                 printStringBreaks(t->dataToBreak, expected, 2);
579                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
580                       i, t->getSrcLine(i), t->getSrcCol(i));
581             }
582         }
583 
584         // Check that the break we did find was expected
585         if (t->getExpectedBreak(bp) == 0) {
586             int expected[] = {0, bp};
587             printStringBreaks(t->textToBreak, expected, 2);
588             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
589                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
590         } else {
591             // The break was expected.
592             //   Check that the {nnn} tag value is correct.
593             int32_t expectedTagVal = t->getExpectedBreak(bp);
594             if (expectedTagVal == -1) {
595                 expectedTagVal = 0;
596             }
597             int32_t line = t->getSrcLine(bp);
598             int32_t rs = t->bi->getRuleStatus();
599             if (rs != expectedTagVal) {
600                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
601                       "          Actual, Expected status = %4d, %4d",
602                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
603             }
604         }
605 
606         prevBP = bp;
607     }
608 
609     // Verify that there were no missed expected breaks after the last one found
610     for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
611         if (t->getExpectedBreak(i) != 0) {
612             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
613                       i, t->getSrcLine(i), t->getSrcCol(i));
614         }
615     }
616 
617     //
618     //  Run the iterator backwards, verify that the same breaks are found.
619     //
620     prevBP = static_cast<int32_t>(utext_nativeLength(t->textToBreak) + 2); // start with a phony value for the last break pos seen.
621     bp = t->bi->last();
622     while (bp != BreakIterator::DONE) {
623         if (prevBP ==  bp) {
624             // Fail for lack of progress.
625             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
626                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
627             break;
628         }
629 
630         // Check that we didn't miss an expected break between the last one
631         //  and this one.  (UVector returns zeros for index out of bounds.)
632         for (i=prevBP-1; i>bp; i--) {
633             if (t->getExpectedBreak(i) != 0) {
634                 errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
635                       i, t->getSrcLine(i), t->getSrcCol(i));
636             }
637         }
638 
639         // Check that the break we did find was expected
640         if (t->getExpectedBreak(bp) == 0) {
641             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
642                    bp, t->getSrcLine(bp), t->getSrcCol(bp));
643         } else {
644             // The break was expected.
645             //   Check that the {nnn} tag value is correct.
646             int32_t expectedTagVal = t->getExpectedBreak(bp);
647             if (expectedTagVal == -1) {
648                 expectedTagVal = 0;
649             }
650             int line = t->getSrcLine(bp);
651             int32_t rs = t->bi->getRuleStatus();
652             if (rs != expectedTagVal) {
653                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
654                       "          Actual, Expected status = %4d, %4d",
655                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
656             }
657         }
658 
659         prevBP = bp;
660         bp = t->bi->previous();
661     }
662 
663     // Verify that there were no missed breaks prior to the last one found
664     for (i=prevBP-1; i>=0; i--) {
665         if (t->getExpectedBreak(i) != 0) {
666             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
667                       i, t->getSrcLine(i), t->getSrcCol(i));
668         }
669     }
670 
671     // Check isBoundary()
672     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
673         UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
674         UBool boundaryFound    = t->bi->isBoundary(i);
675         if (boundaryExpected != boundaryFound) {
676             errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
677                   "        Expected, Actual= %s, %s",
678                   i, t->getSrcLine(i), t->getSrcCol(i),
679                   boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
680         }
681     }
682 
683     // Check following()
684     for (i=0; i < static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i++) {
685         int32_t actualBreak = t->bi->following(i);
686         int32_t expectedBreak = BreakIterator::DONE;
687         for (int32_t j=i+1; j <= static_cast<int32_t>(utext_nativeLength(t->textToBreak)); j++) {
688             if (t->getExpectedBreak(j) != 0) {
689                 expectedBreak = j;
690                 break;
691             }
692         }
693         if (expectedBreak != actualBreak) {
694             errln("following(%d) incorrect. File line,col= %4d,%4d\n"
695                   "        Expected, Actual= %d, %d",
696                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
697         }
698     }
699 
700     // Check preceding()
701     for (i=static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i>=0; i--) {
702         int32_t actualBreak = t->bi->preceding(i);
703         int32_t expectedBreak = BreakIterator::DONE;
704 
705         // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
706         // preceding(trailing byte) will return the index of some preceding code point,
707         // not the lead byte of the current code point, even though that has a smaller index.
708         // Therefore, start looking at the expected break data not at i-1, but at
709         // the start of code point index - 1.
710         utext_setNativeIndex(t->textToBreak, i);
711         int32_t j = static_cast<int32_t>(utext_getNativeIndex(t->textToBreak) - 1);
712         for (; j >= 0; j--) {
713             if (t->getExpectedBreak(j) != 0) {
714                 expectedBreak = j;
715                 break;
716             }
717         }
718         if (expectedBreak != actualBreak) {
719             errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
720                   "        Expected, Actual= %d, %d",
721                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
722         }
723     }
724 }
725 
TestExtended()726 void RBBITest::TestExtended() {
727      // The expectations in this test heavily depends on the Thai dictionary.
728      // Therefore, we skip this test under the LSTM configuration.
729      if (skipDictionaryTest()) {
730          return;
731      }
732   // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
733   // data driven test closely entangles filtered and regular data.
734 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
735     UErrorCode      status  = U_ZERO_ERROR;
736     Locale          locale("");
737 
738     TestParams          tp(status);
739 
740     RegexMatcher      localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
741     if (U_FAILURE(status)) {
742         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
743     }
744 
745     //
746     //  Open and read the test data file.
747     //
748     const char *testDataDirectory = IntlTest::getSourceTestData(status);
749     CharString testFileName(testDataDirectory, -1, status);
750     testFileName.append("rbbitst.txt", -1, status);
751 
752     int    len;
753     UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
754     if (U_FAILURE(status)) {
755         errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
756         return;
757     }
758 
759     bool skipTest = false; // Skip this test?
760 
761     //
762     //  Put the test data into a UnicodeString
763     //
764     UnicodeString testString(FALSE, testFile, len);
765 
766     enum EParseState{
767         PARSE_COMMENT,
768         PARSE_TAG,
769         PARSE_DATA,
770         PARSE_NUM,
771         PARSE_RULES
772     }
773     parseState = PARSE_TAG;
774 
775     EParseState savedState = PARSE_TAG;
776 
777     int32_t    lineNum  = 1;
778     int32_t    colStart = 0;
779     int32_t    column   = 0;
780     int32_t    charIdx  = 0;
781 
782     int32_t    tagValue = 0;             // The numeric value of a <nnn> tag.
783 
784     UnicodeString       rules;           // Holds rules from a <rules> ... </rules> block
785     int32_t             rulesFirstLine = 0;  // Line number of the start of current <rules> block
786 
787     for (charIdx = 0; charIdx < len; ) {
788         status = U_ZERO_ERROR;
789         UChar  c = testString.charAt(charIdx);
790         charIdx++;
791         if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
792             // treat CRLF as a unit
793             c = u'\n';
794             charIdx++;
795         }
796         if (c == u'\n' || c == u'\r') {
797             lineNum++;
798             colStart = charIdx;
799         }
800         column = charIdx - colStart + 1;
801 
802         switch (parseState) {
803         case PARSE_COMMENT:
804             if (c == u'\n' || c == u'\r') {
805                 parseState = savedState;
806             }
807             break;
808 
809         case PARSE_TAG:
810             {
811             if (c == u'#') {
812                 parseState = PARSE_COMMENT;
813                 savedState = PARSE_TAG;
814                 break;
815             }
816             if (u_isUWhiteSpace(c)) {
817                 break;
818             }
819             if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
820                 delete tp.bi;
821                 tp.bi = BreakIterator::createWordInstance(locale,  status);
822                 skipTest = false;
823                 charIdx += 5;
824                 break;
825             }
826             if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
827                 delete tp.bi;
828                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
829                 skipTest = false;
830                 charIdx += 5;
831                 break;
832             }
833             if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
834                 delete tp.bi;
835                 tp.bi = BreakIterator::createLineInstance(locale,  status);
836                 skipTest = false;
837                 charIdx += 5;
838                 break;
839             }
840             if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
841                 delete tp.bi;
842                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
843                 skipTest = false;
844                 charIdx += 5;
845                 break;
846             }
847             if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
848                 delete tp.bi;
849                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
850                 charIdx += 6;
851                 break;
852             }
853 
854             if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
855                 testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
856                 charIdx = testString.indexOf(u'>', charIdx) + 1;
857                 parseState = PARSE_RULES;
858                 rules.remove();
859                 rulesFirstLine = lineNum;
860                 break;
861             }
862 
863             // <locale  loc_name>
864             localeMatcher.reset(testString);
865             if (localeMatcher.lookingAt(charIdx-1, status)) {
866                 UnicodeString localeName = localeMatcher.group(1, status);
867                 char localeName8[100];
868                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
869                 locale = Locale::createFromName(localeName8);
870                 charIdx += localeMatcher.group(0, status).length() - 1;
871                 TEST_ASSERT_SUCCESS(status);
872                 break;
873             }
874             if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
875                 parseState = PARSE_DATA;
876                 charIdx += 5;
877                 tp.dataToBreak = "";
878                 tp.expectedBreaks->removeAllElements();
879                 tp.srcCol ->removeAllElements();
880                 tp.srcLine->removeAllElements();
881                 break;
882             }
883 
884             errln("line %d: Tag expected in test file.", lineNum);
885             parseState = PARSE_COMMENT;
886             savedState = PARSE_DATA;
887             goto end_test; // Stop the test.
888             }
889             break;
890 
891         case PARSE_RULES:
892             if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
893                 charIdx += 7;
894                 parseState = PARSE_TAG;
895                 delete tp.bi;
896                 UParseError pe;
897                 tp.bi = new RuleBasedBreakIterator(rules, pe, status);
898                 skipTest = U_FAILURE(status);
899                 if (U_FAILURE(status)) {
900                     errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
901                         rulesFirstLine + pe.line - 1, u_errorName(status));
902                 }
903             } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
904                 charIdx += 10;
905                 parseState = PARSE_TAG;
906                 UErrorCode ec = U_ZERO_ERROR;
907                 UParseError pe;
908                 RuleBasedBreakIterator bi(rules, pe, ec);
909                 if (U_SUCCESS(ec)) {
910                     errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
911                         rulesFirstLine + pe.line - 1);
912                 }
913             } else {
914                 rules.append(c);
915             }
916             break;
917 
918         case PARSE_DATA:
919             if (c == u'•') {
920                 int32_t  breakIdx = tp.dataToBreak.length();
921                 if (tp.expectedBreaks->size() > breakIdx) {
922                     errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
923                           lineNum, column);
924                 }
925                 tp.expectedBreaks->setSize(breakIdx+1);
926                 tp.expectedBreaks->setElementAt(-1, breakIdx);
927                 tp.srcLine->setSize(breakIdx+1);
928                 tp.srcLine->setElementAt(lineNum, breakIdx);
929                 tp.srcCol ->setSize(breakIdx+1);
930                 tp.srcCol ->setElementAt(column, breakIdx);
931                 break;
932             }
933 
934             if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
935                 // Add final entry to mappings from break location to source file position.
936                 //  Need one extra because last break position returned is after the
937                 //    last char in the data, not at the last char.
938                 tp.srcLine->addElement(lineNum, status);
939                 tp.srcCol ->addElement(column, status);
940 
941                 parseState = PARSE_TAG;
942                 charIdx += 6;
943 
944                 if (!skipTest) {
945                     // RUN THE TEST!
946                     status = U_ZERO_ERROR;
947                     tp.setUTF16(status);
948                     executeTest(&tp, status);
949                     TEST_ASSERT_SUCCESS(status);
950 
951                     // Run again, this time with UTF-8 text wrapped in a UText.
952                     status = U_ZERO_ERROR;
953                     tp.setUTF8(status);
954                     TEST_ASSERT_SUCCESS(status);
955                     executeTest(&tp, status);
956                 }
957                 break;
958             }
959 
960             if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
961                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
962                 // Get the code point from the name and insert it into the test data.
963                 //   (Damn, no API takes names in Unicode  !!!
964                 //    we've got to take it back to char *)
965                 int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
966                 int32_t nameLength = nameEndIdx - (charIdx+2);
967                 char charNameBuf[200];
968                 UChar32 theChar = -1;
969                 if (nameEndIdx != -1) {
970                     UErrorCode status = U_ZERO_ERROR;
971                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
972                     charNameBuf[sizeof(charNameBuf)-1] = 0;
973                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
974                     if (U_FAILURE(status)) {
975                         theChar = -1;
976                     }
977                 }
978                 if (theChar == -1) {
979                     errln("Error in named character in test file at line %d, col %d",
980                         lineNum, column);
981                 } else {
982                     // Named code point was recognized.  Insert it
983                     //   into the test data.
984                     tp.dataToBreak.append(theChar);
985                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
986                         tp.srcLine->addElement(lineNum, status);
987                         tp.srcCol ->addElement(column, status);
988                     }
989                 }
990                 if (nameEndIdx > charIdx) {
991                     charIdx = nameEndIdx+1;
992 
993                 }
994                 break;
995             }
996 
997 
998 
999             if (testString.compare(charIdx-1, 2, u"<>") == 0) {
1000                 charIdx++;
1001                 int32_t  breakIdx = tp.dataToBreak.length();
1002                 tp.expectedBreaks->setSize(breakIdx+1);
1003                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1004                 tp.srcLine->setSize(breakIdx+1);
1005                 tp.srcLine->setElementAt(lineNum, breakIdx);
1006                 tp.srcCol ->setSize(breakIdx+1);
1007                 tp.srcCol ->setElementAt(column, breakIdx);
1008                 break;
1009             }
1010 
1011             if (c == u'<') {
1012                 tagValue   = 0;
1013                 parseState = PARSE_NUM;
1014                 break;
1015             }
1016 
1017             if (c == u'#' && column==3) {   // TODO:  why is column off so far?
1018                 parseState = PARSE_COMMENT;
1019                 savedState = PARSE_DATA;
1020                 break;
1021             }
1022 
1023             if (c == u'\\') {
1024                 // Check for \ at end of line, a line continuation.
1025                 //     Advance over (discard) the newline
1026                 UChar32 cp = testString.char32At(charIdx);
1027                 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
1028                     // We have a CR LF
1029                     //  Need an extra increment of the input ptr to move over both of them
1030                     charIdx++;
1031                 }
1032                 if (cp == u'\n' || cp == u'\r') {
1033                     lineNum++;
1034                     colStart = charIdx;
1035                     charIdx++;
1036                     break;
1037                 }
1038 
1039                 // Let unescape handle the back slash.
1040                 cp = testString.unescapeAt(charIdx);
1041                 if (cp != -1) {
1042                     // Escape sequence was recognized.  Insert the char
1043                     //   into the test data.
1044                     tp.dataToBreak.append(cp);
1045                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1046                         tp.srcLine->addElement(lineNum, status);
1047                         tp.srcCol ->addElement(column, status);
1048                     }
1049                     break;
1050                 }
1051 
1052 
1053                 // Not a recognized backslash escape sequence.
1054                 // Take the next char as a literal.
1055                 //  TODO:  Should this be an error?
1056                 c = testString.charAt(charIdx);
1057                 charIdx = testString.moveIndex32(charIdx, 1);
1058             }
1059 
1060             // Normal, non-escaped data char.
1061             tp.dataToBreak.append(c);
1062 
1063             // Save the mapping from offset in the data to line/column numbers in
1064             //   the original input file.  Will be used for better error messages only.
1065             //   If there's an expected break before this char, the slot in the mapping
1066             //     vector will already be set for this char; don't overwrite it.
1067             if (tp.dataToBreak.length() > tp.srcLine->size()) {
1068                 tp.srcLine->addElement(lineNum, status);
1069                 tp.srcCol ->addElement(column, status);
1070             }
1071             break;
1072 
1073 
1074         case PARSE_NUM:
1075             // We are parsing an expected numeric tag value, like <1234>,
1076             //   within a chunk of data.
1077             if (u_isUWhiteSpace(c)) {
1078                 break;
1079             }
1080 
1081             if (c == u'>') {
1082                 // Finished the number.  Add the info to the expected break data,
1083                 //   and switch parse state back to doing plain data.
1084                 parseState = PARSE_DATA;
1085                 if (tagValue == 0) {
1086                     tagValue = -1;
1087                 }
1088                 int32_t  breakIdx = tp.dataToBreak.length();
1089                 if (tp.expectedBreaks->size() > breakIdx) {
1090                     errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
1091                           lineNum, column);
1092                 }
1093                 tp.expectedBreaks->setSize(breakIdx+1);
1094                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1095                 tp.srcLine->setSize(breakIdx+1);
1096                 tp.srcLine->setElementAt(lineNum, breakIdx);
1097                 tp.srcCol ->setSize(breakIdx+1);
1098                 tp.srcCol ->setElementAt(column, breakIdx);
1099                 break;
1100             }
1101 
1102             if (u_isdigit(c)) {
1103                 tagValue = tagValue*10 + u_charDigitValue(c);
1104                 break;
1105             }
1106 
1107             errln("Syntax Error in test file at line %d, col %d",
1108                 lineNum, column);
1109             parseState = PARSE_COMMENT;
1110             goto end_test; // Stop the test
1111             break;
1112         }
1113 
1114 
1115         if (U_FAILURE(status)) {
1116             dataerrln("ICU Error %s while parsing test file at line %d.",
1117                 u_errorName(status), lineNum);
1118             status = U_ZERO_ERROR;
1119             goto end_test; // Stop the test
1120         }
1121 
1122     }
1123 
1124     // Reached end of test file. Raise an error if parseState indicates that we are
1125     //   within a block that should have been terminated.
1126 
1127     if (parseState == PARSE_RULES) {
1128         errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
1129             lineNum, rulesFirstLine);
1130     }
1131     if (parseState == PARSE_DATA) {
1132         errln("rbbitst.txt:%d <data> block not closed.", lineNum);
1133     }
1134 
1135 
1136 end_test:
1137     delete [] testFile;
1138 #endif
1139 }
1140 
1141 //-------------------------------------------------------------------------------
1142 //
1143 //  TestDictRules   create a break iterator from source rules that includes a
1144 //                  dictionary range.   Regression for bug #7130.  Source rules
1145 //                  do not declare a break iterator type (word, line, sentence, etc.),
1146 //                  but the dictionary code, without a type, would loop.
1147 //
1148 //-------------------------------------------------------------------------------
TestDictRules()1149 void RBBITest::TestDictRules() {
1150     const char *rules =  "$dictionary = [a-z]; \n"
1151                          "!!forward; \n"
1152                          "$dictionary $dictionary; \n"
1153                          "!!reverse; \n"
1154                          "$dictionary $dictionary; \n";
1155     const char *text = "aa";
1156     UErrorCode status = U_ZERO_ERROR;
1157     UParseError parseError;
1158 
1159     RuleBasedBreakIterator bi(rules, parseError, status);
1160     if (U_SUCCESS(status)) {
1161         UnicodeString utext = text;
1162         bi.setText(utext);
1163         int32_t position;
1164         int32_t loops;
1165         for (loops = 0; loops<10; loops++) {
1166             position = bi.next();
1167             if (position == RuleBasedBreakIterator::DONE) {
1168                 break;
1169             }
1170         }
1171         TEST_ASSERT(loops == 1);
1172     } else {
1173         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1174     }
1175 }
1176 
1177 
1178 
1179 //--------------------------------------------------------------------------------------------
1180 //
1181 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1182 //
1183 //-------------------------------------------------------------------------------------------
TestUnicodeFiles()1184 void RBBITest::TestUnicodeFiles() {
1185     RuleBasedBreakIterator  *bi;
1186     UErrorCode               status = U_ZERO_ERROR;
1187 
1188     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1189     TEST_ASSERT_SUCCESS(status);
1190     if (U_SUCCESS(status)) {
1191         runUnicodeTestData("GraphemeBreakTest.txt", bi);
1192     }
1193     delete bi;
1194 
1195     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1196     TEST_ASSERT_SUCCESS(status);
1197     if (U_SUCCESS(status)) {
1198         runUnicodeTestData("WordBreakTest.txt", bi);
1199     }
1200     delete bi;
1201 
1202     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1203     TEST_ASSERT_SUCCESS(status);
1204     if (U_SUCCESS(status)) {
1205         runUnicodeTestData("SentenceBreakTest.txt", bi);
1206     }
1207     delete bi;
1208 
1209     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1210     TEST_ASSERT_SUCCESS(status);
1211     if (U_SUCCESS(status)) {
1212         runUnicodeTestData("LineBreakTest.txt", bi);
1213     }
1214     delete bi;
1215 }
1216 
1217 
1218 // Check for test cases from the Unicode test data files that are known to fail
1219 // and should be skipped as known issues because ICU does not fully implement
1220 // the Unicode specifications, or because ICU includes tailorings that differ from
1221 // the Unicode standard.
1222 //
1223 // Test cases are identified by the test data sequence, which tends to be more stable
1224 // across Unicode versions than the test file line numbers.
1225 //
1226 // The test case with ticket "10666" is a dummy, included as an example.
1227 
testCaseIsKnownIssue(const UnicodeString & testCase,const char * fileName)1228 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1229     static struct TestCase {
1230         const char *fTicketNum;
1231         const char *fFileName;
1232         const UChar *fString;
1233     } badTestCases[] = {
1234         {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"},    // Fake example, for illustration.
1235         // The following tests were originally for
1236         // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
1237         // However, that ticket has been closed as fixed but these tests still fail, so
1238         // ICU-21097 has been created to investigate and address these remaining issues.
1239         {"21097",  "LineBreakTest.txt", u"-#"},
1240         {"21097",  "LineBreakTest.txt", u"\u002d\u0308\u0023"},
1241         {"21097",  "LineBreakTest.txt", u"\u002d\u00a7"},
1242         {"21097",  "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
1243         {"21097",  "LineBreakTest.txt", u"\u002d\U00050005"},
1244         {"21097",  "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
1245         {"21097",  "LineBreakTest.txt", u"\u002d\u0e01"},
1246         {"21097",  "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
1247 
1248         // The following tests were originally for
1249         // Issue ICU-12017 Improve line break around numbers.
1250         // However, that ticket has been closed as fixed but these tests still fail, so
1251         // ICU-21097 has been created to investigate and address these remaining issues.
1252         {"21097", "LineBreakTest.txt", u"\u002C\u0030"},   // ",0"
1253         {"21097", "LineBreakTest.txt", u"\u002C\u0308\u0030"},
1254         {"21097", "LineBreakTest.txt", u"equals .35 cents"},
1255         {"21097", "LineBreakTest.txt", u"a.2 "},
1256         {"21097", "LineBreakTest.txt", u"a.2 \u0915"},
1257         {"21097", "LineBreakTest.txt", u"a.2 \u672C"},
1258         {"21097", "LineBreakTest.txt", u"a.2\u3000\u672C"},
1259         {"21097", "LineBreakTest.txt", u"a.2\u3000\u307E"},
1260         {"21097", "LineBreakTest.txt", u"a.2\u3000\u0033"},
1261         {"21097", "LineBreakTest.txt", u"A.1 \uBABB"},
1262         {"21097", "LineBreakTest.txt", u"\uBD24\uC5B4\u002E\u0020\u0041\u002E\u0032\u0020\uBCFC"},
1263         {"21097", "LineBreakTest.txt", u"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"},
1264         {"21097", "LineBreakTest.txt", u"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"},
1265         {"21097", "LineBreakTest.txt", u"a.2\u3000\u300C"},
1266     };
1267 
1268     for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
1269         const TestCase &badCase = badTestCases[n];
1270         if (!strcmp(fileName, badCase.fFileName) &&
1271                 testCase == UnicodeString(badCase.fString)) {
1272             return logKnownIssue(badCase.fTicketNum);
1273         }
1274     }
1275     return FALSE;
1276 }
1277 
1278 
1279 //--------------------------------------------------------------------------------------------
1280 //
1281 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1282 //
1283 //-------------------------------------------------------------------------------------------
runUnicodeTestData(const char * fileName,RuleBasedBreakIterator * bi)1284 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1285 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1286     UErrorCode  status = U_ZERO_ERROR;
1287 
1288     //
1289     //  Open and read the test data file, put it into a UnicodeString.
1290     //
1291     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1292     char testFileName[1000];
1293     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1294         dataerrln("Can't open test data.  Path too long.");
1295         return;
1296     }
1297     strcpy(testFileName, testDataDirectory);
1298     strcat(testFileName, fileName);
1299 
1300     logln("Opening data file %s\n", fileName);
1301 
1302     int    len;
1303     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1304     if (status != U_FILE_ACCESS_ERROR) {
1305         TEST_ASSERT_SUCCESS(status);
1306         TEST_ASSERT(testFile != NULL);
1307     }
1308     if (U_FAILURE(status) || testFile == NULL) {
1309         return; /* something went wrong, error already output */
1310     }
1311     UnicodeString testFileAsString(TRUE, testFile, len);
1312 
1313     //
1314     //  Parse the test data file using a regular expression.
1315     //  Each kind of token is recognized in its own capture group; what type of item was scanned
1316     //     is identified by which group had a match.
1317     //
1318     //    Capture Group  #                  1          2            3            4           5
1319     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1320     //
1321     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1322     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1323     UnicodeString   testString;
1324     UVector32       breakPositions(status);
1325     int             lineNumber = 1;
1326     TEST_ASSERT_SUCCESS(status);
1327     if (U_FAILURE(status)) {
1328         return;
1329     }
1330 
1331     //
1332     //  Scan through each test case, building up the string to be broken in testString,
1333     //   and the positions that should be boundaries in the breakPositions vector.
1334     //
1335     int spin = 0;
1336     while (tokenMatcher.find()) {
1337         if(tokenMatcher.hitEnd()) {
1338           /* Shouldn't Happen(TM).  This means we didn't find the symbols we were looking for.
1339              This occurred when the text file was corrupt (wasn't marked as UTF-8)
1340              and caused an infinite loop here on EBCDIC systems!
1341           */
1342           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1343           //       return;
1344         }
1345         if (tokenMatcher.start(1, status) >= 0) {
1346             // Scanned a divide sign, indicating a break position in the test data.
1347             if (testString.length()>0) {
1348                 breakPositions.addElement(testString.length(), status);
1349             }
1350         }
1351         else if (tokenMatcher.start(2, status) >= 0) {
1352             // Scanned an 'x', meaning no break at this position in the test data
1353             //   Nothing to be done here.
1354             }
1355         else if (tokenMatcher.start(3, status) >= 0) {
1356             // Scanned Hex digits.  Convert them to binary, append to the character data string.
1357             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1358             int length = hexNumber.length();
1359             if (length<=8) {
1360                 char buf[10];
1361                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1362                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1363                 if (c<=0x10ffff) {
1364                     testString.append(c);
1365                 } else {
1366                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1367                        fileName, lineNumber);
1368                 }
1369             } else {
1370                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1371                        fileName, lineNumber);
1372              }
1373         }
1374         else if (tokenMatcher.start(4, status) >= 0) {
1375             // Scanned to end of a line, possibly skipping over a comment in the process.
1376             //   If the line from the file contained test data, run the test now.
1377             if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1378                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1379             }
1380 
1381             // Clear out this test case.
1382             //    The string and breakPositions vector will be refilled as the next
1383             //       test case is parsed.
1384             testString.remove();
1385             breakPositions.removeAllElements();
1386             lineNumber++;
1387         } else {
1388             // Scanner catchall.  Something unrecognized appeared on the line.
1389             char token[16];
1390             UnicodeString uToken = tokenMatcher.group(0, status);
1391             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1392             token[sizeof(token)-1] = 0;
1393             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1394 
1395             // Clean up, in preparation for continuing with the next line.
1396             testString.remove();
1397             breakPositions.removeAllElements();
1398             lineNumber++;
1399         }
1400         TEST_ASSERT_SUCCESS(status);
1401         if (U_FAILURE(status)) {
1402             break;
1403         }
1404     }
1405 
1406     delete [] testFile;
1407  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1408 }
1409 
1410 //--------------------------------------------------------------------------------------------
1411 //
1412 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1413 //                            test data files.  Do only a simple, forward-only check -
1414 //                            this test is mostly to check that ICU and the Unicode
1415 //                            data agree with each other.
1416 //
1417 //--------------------------------------------------------------------------------------------
checkUnicodeTestCase(const char * testFileName,int lineNumber,const UnicodeString & testString,UVector32 * breakPositions,RuleBasedBreakIterator * bi)1418 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1419                          const UnicodeString &testString,   // Text data to be broken
1420                          UVector32 *breakPositions,         // Positions where breaks should be found.
1421                          RuleBasedBreakIterator *bi) {
1422     int32_t pos;                 // Break Position in the test string
1423     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1424     int32_t expectedPos;         // Expected break position (index into test string)
1425 
1426     bi->setText(testString);
1427     pos = bi->first();
1428     pos = bi->next();
1429 
1430     while (pos != BreakIterator::DONE) {
1431         if (expectedI >= breakPositions->size()) {
1432             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1433                 testFileName, lineNumber, pos);
1434             break;
1435         }
1436         expectedPos = breakPositions->elementAti(expectedI);
1437         if (pos < expectedPos) {
1438             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1439                 testFileName, lineNumber, pos);
1440             break;
1441         }
1442         if (pos > expectedPos) {
1443             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1444                 testFileName, lineNumber, expectedPos);
1445             break;
1446         }
1447         pos = bi->next();
1448         expectedI++;
1449     }
1450 
1451     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1452         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1453             testFileName, lineNumber, breakPositions->elementAti(expectedI));
1454     }
1455 }
1456 
1457 
1458 
1459 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1460 //---------------------------------------------------------------------------------------
1461 //
1462 //   class RBBIMonkeyKind
1463 //
1464 //      Monkey Test for Break Iteration
1465 //      Abstract interface class.   Concrete derived classes independently
1466 //      implement the break rules for different iterator types.
1467 //
1468 //      The Monkey Test itself doesn't know which type of break iterator it is
1469 //      testing, but works purely in terms of the interface defined here.
1470 //
1471 //---------------------------------------------------------------------------------------
1472 class RBBIMonkeyKind {
1473 public:
1474     // Return a UVector of UnicodeSets, representing the character classes used
1475     //   for this type of iterator.
1476     virtual  UVector  *charClasses() = 0;
1477 
1478     // Set the test text on which subsequent calls to next() will operate
1479     virtual  void      setText(const UnicodeString &s) = 0;
1480 
1481     // Find the next break position, starting from the prev break position, or from zero.
1482     // Return -1 after reaching end of string.
1483     virtual  int32_t   next(int32_t i) = 0;
1484 
1485     // Name of each character class, parallel with charClasses. Used for debugging output
1486     // of characters.
1487     virtual  std::vector<std::string>&     characterClassNames();
1488 
1489     void setAppliedRule(int32_t position, const char* value);
1490 
1491     std::string getAppliedRule(int32_t position);
1492 
1493     virtual ~RBBIMonkeyKind();
1494     UErrorCode deferredStatus;
1495 
1496     std::string classNameFromCodepoint(const UChar32 c);
1497     unsigned int maxClassNameSize();
1498 
1499  protected:
1500      RBBIMonkeyKind();
1501      std::vector<std::string> classNames;
1502      std::vector<std::string> appliedRules;
1503 
1504     // Clear `appliedRules` and fill it with empty strings in the size of test text.
1505     void prepareAppliedRules(int32_t size );
1506 
1507  private:
1508 
1509 };
1510 
RBBIMonkeyKind()1511 RBBIMonkeyKind::RBBIMonkeyKind() {
1512     deferredStatus = U_ZERO_ERROR;
1513 }
1514 
~RBBIMonkeyKind()1515 RBBIMonkeyKind::~RBBIMonkeyKind() {
1516 }
1517 
characterClassNames()1518 std::vector<std::string>& RBBIMonkeyKind::characterClassNames() {
1519     return classNames;
1520 }
1521 
prepareAppliedRules(int32_t size)1522 void RBBIMonkeyKind::prepareAppliedRules(int32_t size) {
1523     // Remove all the information in the `appliedRules`.
1524     appliedRules.clear();
1525     appliedRules.resize(size + 1);
1526 }
1527 
setAppliedRule(int32_t position,const char * value)1528 void RBBIMonkeyKind::setAppliedRule(int32_t position, const char* value) {
1529     appliedRules[position] = value;
1530 }
1531 
getAppliedRule(int32_t position)1532 std::string RBBIMonkeyKind::getAppliedRule(int32_t position){
1533     return appliedRules[position];
1534 }
1535 
classNameFromCodepoint(const UChar32 c)1536 std::string RBBIMonkeyKind::classNameFromCodepoint(const UChar32 c) {
1537     // Simply iterate through charClasses to find character's class
1538     for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1539         UnicodeSet *classSet = (UnicodeSet *)charClasses()->elementAt(aClassNum);
1540         if (classSet->contains(c)) {
1541             return classNames[aClassNum];
1542         }
1543     }
1544     U_ASSERT(FALSE);  // This should not happen.
1545     return "bad class name";
1546 }
1547 
maxClassNameSize()1548 unsigned int RBBIMonkeyKind::maxClassNameSize() {
1549     unsigned int maxSize = 0;
1550     for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1551         auto aClassNumSize = static_cast<unsigned int>(classNames[aClassNum].size());
1552         if (aClassNumSize > maxSize) {
1553             maxSize = aClassNumSize;
1554         }
1555     }
1556     return maxSize;
1557 }
1558 
1559 //----------------------------------------------------------------------------------------
1560 //
1561 //   Random Numbers.  Similar to standard lib rand() and srand()
1562 //                    Not using library to
1563 //                      1.  Get same results on all platforms.
1564 //                      2.  Get access to current seed, to more easily reproduce failures.
1565 //
1566 //---------------------------------------------------------------------------------------
// Current generator state.
static uint32_t m_seed = 1;

// Advance the generator one step and return a value in the range 0..32767.
// The state update is a linear congruential step with 32-bit unsigned wraparound;
// the result is bits 16..30 of the new state.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;
    m_seed = kMultiplier * m_seed + kIncrement;
    return (m_seed >> 16) & 0x7fff;
}
1574 
1575 
1576 //------------------------------------------------------------------------------------------
1577 //
1578 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
1579 //                             of RBBIMonkeyKind.
1580 //
1581 //------------------------------------------------------------------------------------------
1582 class RBBICharMonkey: public RBBIMonkeyKind {
1583 public:
1584     RBBICharMonkey();
1585     virtual          ~RBBICharMonkey();
1586     virtual  UVector *charClasses() override;
1587     virtual  void     setText(const UnicodeString &s) override;
1588     virtual  int32_t  next(int32_t i) override;
1589 private:
1590     UVector   *fSets;
1591 
1592     UnicodeSet  *fCRLFSet;
1593     UnicodeSet  *fControlSet;
1594     UnicodeSet  *fExtendSet;
1595     UnicodeSet  *fZWJSet;
1596     UnicodeSet  *fRegionalIndicatorSet;
1597     UnicodeSet  *fPrependSet;
1598     UnicodeSet  *fSpacingSet;
1599     UnicodeSet  *fLSet;
1600     UnicodeSet  *fVSet;
1601     UnicodeSet  *fTSet;
1602     UnicodeSet  *fLVSet;
1603     UnicodeSet  *fLVTSet;
1604     UnicodeSet  *fHangulSet;
1605     UnicodeSet  *fExtendedPictSet;
1606     UnicodeSet  *fViramaSet;
1607     UnicodeSet  *fLinkingConsonantSet;
1608     UnicodeSet  *fExtCccZwjSet;
1609     UnicodeSet  *fAnySet;
1610 
1611     const UnicodeString *fText;
1612 };
1613 
1614 
RBBICharMonkey()1615 RBBICharMonkey::RBBICharMonkey() {
1616     UErrorCode  status = U_ZERO_ERROR;
1617 
1618     fText = NULL;
1619 
1620     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1621     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
1622     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
1623     fZWJSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
1624     fRegionalIndicatorSet =
1625                   new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1626     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1627     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1628     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1629     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1630     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1631     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1632     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1633     fHangulSet  = new UnicodeSet();
1634     fHangulSet->addAll(*fLSet);
1635     fHangulSet->addAll(*fVSet);
1636     fHangulSet->addAll(*fTSet);
1637     fHangulSet->addAll(*fLVSet);
1638     fHangulSet->addAll(*fLVTSet);
1639 
1640     fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1641     fViramaSet        = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1642                                         "\\p{Indic_Syllabic_Category=Virama}]", status);
1643     fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1644                                         "\\p{Indic_Syllabic_Category=Consonant}]", status);
1645     fExtCccZwjSet     = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status);
1646     fAnySet           = new UnicodeSet(0, 0x10ffff);
1647 
1648     // Create sets of characters, and add the names of the above character sets.
1649     // In each new ICU release, add new names corresponding to the sets above.
1650     fSets             = new UVector(status);
1651 
1652     // Important: Keep class names the same as the class contents.
1653     fSets->addElement(fCRLFSet, status); classNames.push_back("CRLF");
1654     fSets->addElement(fControlSet, status); classNames.push_back("Control");
1655     fSets->addElement(fExtendSet, status); classNames.push_back("Extended");
1656     fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
1657     if (!fPrependSet->isEmpty()) {
1658         fSets->addElement(fPrependSet, status); classNames.push_back("Prepend");
1659     }
1660     fSets->addElement(fSpacingSet, status); classNames.push_back("Spacing");
1661     fSets->addElement(fHangulSet, status); classNames.push_back("Hangul");
1662     fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
1663     fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
1664     fSets->addElement(fViramaSet, status); classNames.push_back("Virama");
1665     fSets->addElement(fLinkingConsonantSet, status); classNames.push_back("LinkingConsonant");
1666     fSets->addElement(fExtCccZwjSet, status); classNames.push_back("ExtCcccZwj");
1667     fSets->addElement(fAnySet, status); classNames.push_back("Any");
1668 
1669     if (U_FAILURE(status)) {
1670         deferredStatus = status;
1671     }
1672 }
1673 
1674 
setText(const UnicodeString & s)1675 void RBBICharMonkey::setText(const UnicodeString &s) {
1676     fText = &s;
1677     prepareAppliedRules(s.length());
1678 }
1679 
1680 
1681 
next(int32_t prevPos)1682 int32_t RBBICharMonkey::next(int32_t prevPos) {
1683     int    p0, p1, p2, p3;    // Indices of the significant code points around the
1684                               //   break position being tested.  The candidate break
1685                               //   location is before p2.
1686 
1687     int     breakPos = -1;
1688 
1689     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
1690     UChar32 cBase;            // for (X Extend*) patterns, the X character.
1691 
1692     if (U_FAILURE(deferredStatus)) {
1693         return -1;
1694     }
1695 
1696     // Previous break at end of string.  return DONE.
1697     if (prevPos >= fText->length()) {
1698         return -1;
1699     }
1700 
1701     p0 = p1 = p2 = p3 = prevPos;
1702     c3 =  fText->char32At(prevPos);
1703     c0 = c1 = c2 = cBase = 0;
1704     (void)p0;   // suppress set but not used warning.
1705     (void)c0;
1706 
1707     // Loop runs once per "significant" character position in the input text.
1708     for (;;) {
1709         // Move all of the positions forward in the input string.
1710         p0 = p1;  c0 = c1;
1711         p1 = p2;  c1 = c2;
1712         p2 = p3;  c2 = c3;
1713 
1714         // Advance p3 by one codepoint
1715         p3 = fText->moveIndex32(p3, 1);
1716         c3 = fText->char32At(p3);
1717 
1718         if (p1 == p2) {
1719             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
1720             continue;
1721         }
1722 
1723         if (p2 == fText->length()) {
1724             setAppliedRule(p2, "End of String");
1725             break;
1726         }
1727 
1728         //     No Extend or Format characters may appear between the CR and LF,
1729         //     which requires the additional check for p2 immediately following p1.
1730         //
1731         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
1732           setAppliedRule(p2, "GB3   CR x LF");
1733           continue;
1734         }
1735 
1736         if (fControlSet->contains(c1) ||
1737             c1 == 0x0D ||
1738             c1 == 0x0A)  {
1739           setAppliedRule(p2, "GB4   ( Control | CR | LF ) <break>");
1740           break;
1741         }
1742 
1743         if (fControlSet->contains(c2) ||
1744             c2 == 0x0D ||
1745             c2 == 0x0A)  {
1746             setAppliedRule(p2, "GB5   <break>  ( Control | CR | LF )");
1747             break;
1748         }
1749 
1750         if (fLSet->contains(c1) &&
1751                (fLSet->contains(c2)  ||
1752                 fVSet->contains(c2)  ||
1753                 fLVSet->contains(c2) ||
1754                 fLVTSet->contains(c2))) {
1755             setAppliedRule(p2, "GB6   L x ( L | V | LV | LVT )");
1756             continue;
1757         }
1758 
1759         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1760             (fVSet->contains(c2) || fTSet->contains(c2)))  {
1761             setAppliedRule(p2, "GB7    ( LV | V )  x  ( V | T )");
1762             continue;
1763         }
1764 
1765         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1766             fTSet->contains(c2))  {
1767             setAppliedRule(p2, "GB8   ( LVT | T)  x T");
1768             continue;
1769         }
1770 
1771         if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
1772             if (!fExtendSet->contains(c1)) {
1773                 cBase = c1;
1774             }
1775             setAppliedRule(p2, "GB9   x (Extend | ZWJ)");
1776             continue;
1777         }
1778 
1779         if (fSpacingSet->contains(c2)) {
1780             setAppliedRule(p2, "GB9a  x  SpacingMark");
1781             continue;
1782         }
1783 
1784         if (fPrependSet->contains(c1)) {
1785             setAppliedRule(p2, "GB9b  Prepend x");
1786             continue;
1787         }
1788 
1789         //   Note: Viramas are also included in the ExtCccZwj class.
1790         if (fLinkingConsonantSet->contains(c2)) {
1791             int pi = p1;
1792             bool sawVirama = false;
1793             while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) {
1794                 if (fViramaSet->contains(fText->char32At(pi))) {
1795                     sawVirama = true;
1796                 }
1797                 pi = fText->moveIndex32(pi, -1);
1798             }
1799             if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) {
1800               setAppliedRule(p2, "GB9.3  LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant");
1801               continue;
1802             }
1803         }
1804 
1805         if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
1806           setAppliedRule(p2, "GB11  Extended_Pictographic Extend * ZWJ x Extended_Pictographic");
1807           continue;
1808         }
1809 
1810         //                   Note: The first if condition is a little tricky. We only need to force
1811         //                      a break if there are three or more contiguous RIs. If there are
1812         //                      only two, a break following will occur via other rules, and will include
1813         //                      any trailing extend characters, which is needed behavior.
1814         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
1815                 && fRegionalIndicatorSet->contains(c2)) {
1816           setAppliedRule(p2, "GB12-13  Regional_Indicator x Regional_Indicator");
1817           break;
1818         }
1819         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
1820           setAppliedRule(p2, "GB12-13  Regional_Indicator x Regional_Indicator");
1821           continue;
1822         }
1823 
1824         setAppliedRule(p2, "GB999 Any <break> Any");
1825         break;
1826     }
1827 
1828     breakPos = p2;
1829     return breakPos;
1830 }
1831 
1832 
1833 
charClasses()1834 UVector  *RBBICharMonkey::charClasses() {
1835     return fSets;
1836 }
1837 
~RBBICharMonkey()1838 RBBICharMonkey::~RBBICharMonkey() {
1839     delete fSets;
1840     delete fCRLFSet;
1841     delete fControlSet;
1842     delete fExtendSet;
1843     delete fRegionalIndicatorSet;
1844     delete fPrependSet;
1845     delete fSpacingSet;
1846     delete fLSet;
1847     delete fVSet;
1848     delete fTSet;
1849     delete fLVSet;
1850     delete fLVTSet;
1851     delete fHangulSet;
1852     delete fAnySet;
1853     delete fZWJSet;
1854     delete fExtendedPictSet;
1855     delete fViramaSet;
1856     delete fLinkingConsonantSet;
1857     delete fExtCccZwjSet;
1858 }
1859 
1860 //------------------------------------------------------------------------------------------
1861 //
1862 //   class RBBIWordMonkey      Word Break specific implementation
1863 //                             of RBBIMonkeyKind.
1864 //
1865 //------------------------------------------------------------------------------------------
1866 class RBBIWordMonkey: public RBBIMonkeyKind {
1867 public:
1868     RBBIWordMonkey();
1869     virtual          ~RBBIWordMonkey();
1870     virtual  UVector *charClasses() override;
1871     virtual  void     setText(const UnicodeString &s) override;
1872     virtual int32_t   next(int32_t i) override;
1873 private:
1874     UVector      *fSets;
1875 
1876     UnicodeSet  *fCRSet;
1877     UnicodeSet  *fLFSet;
1878     UnicodeSet  *fNewlineSet;
1879     UnicodeSet  *fRegionalIndicatorSet;
1880     UnicodeSet  *fKatakanaSet;
1881     UnicodeSet  *fHebrew_LetterSet;
1882     UnicodeSet  *fALetterSet;
1883     UnicodeSet  *fSingle_QuoteSet;
1884     UnicodeSet  *fDouble_QuoteSet;
1885     UnicodeSet  *fMidNumLetSet;
1886     UnicodeSet  *fMidLetterSet;
1887     UnicodeSet  *fMidNumSet;
1888     UnicodeSet  *fNumericSet;
1889     UnicodeSet  *fFormatSet;
1890     UnicodeSet  *fOtherSet = nullptr;
1891     UnicodeSet  *fExtendSet;
1892     UnicodeSet  *fExtendNumLetSet;
1893     UnicodeSet  *fWSegSpaceSet;
1894     UnicodeSet  *fDictionarySet = nullptr;
1895     UnicodeSet  *fZWJSet;
1896     UnicodeSet  *fExtendedPictSet;
1897 
1898     const UnicodeString  *fText;
1899 };
1900 
1901 
RBBIWordMonkey()1902 RBBIWordMonkey::RBBIWordMonkey()
1903 {
1904     UErrorCode  status = U_ZERO_ERROR;
1905 
1906     fSets            = new UVector(status);
1907 
1908     fCRSet            = new UnicodeSet(u"[\\p{Word_Break = CR}]",           status);
1909     fLFSet            = new UnicodeSet(u"[\\p{Word_Break = LF}]",           status);
1910     fNewlineSet       = new UnicodeSet(u"[\\p{Word_Break = Newline}]",      status);
1911     fKatakanaSet      = new UnicodeSet(u"[\\p{Word_Break = Katakana}]",     status);
1912     fRegionalIndicatorSet =  new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
1913     fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
1914     fALetterSet       = new UnicodeSet(u"[\\p{Word_Break = ALetter}]", status);
1915     fSingle_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]",    status);
1916     fDouble_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]",    status);
1917     fMidNumLetSet     = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]",    status);
1918     fMidLetterSet     = new UnicodeSet(u"[\\p{Word_Break = MidLetter}]",    status);
1919     fMidNumSet        = new UnicodeSet(u"[\\p{Word_Break = MidNum}]",       status);
1920     fNumericSet       = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
1921     fFormatSet        = new UnicodeSet(u"[\\p{Word_Break = Format}]",       status);
1922     fExtendNumLetSet  = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
1923     // There are some sc=Hani characters with WB=Extend.
1924     // The break rules need to pick one or the other because
1925     // Extend overlapping with something else is messy.
1926     // For Unicode 13, we chose to keep U+16FF0 & U+16FF1
1927     // in $Han (for $dictionary) and out of $Extend.
1928     fExtendSet        = new UnicodeSet(u"[\\p{Word_Break = Extend}-[:Hani:]]", status);
1929     fWSegSpaceSet     = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]",    status);
1930 
1931     fZWJSet           = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]",          status);
1932     fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1933     if(U_FAILURE(status)) {
1934         IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
1935         deferredStatus = status;
1936         return;
1937     }
1938 
1939     fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
1940     fDictionarySet->addAll(*fKatakanaSet);
1941     fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
1942 
1943     fALetterSet->removeAll(*fDictionarySet);
1944 
1945     fOtherSet        = new UnicodeSet();
1946     if(U_FAILURE(status)) {
1947         IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
1948         deferredStatus = status;
1949         return;
1950     }
1951 
1952     fOtherSet->complement();
1953     fOtherSet->removeAll(*fCRSet);
1954     fOtherSet->removeAll(*fLFSet);
1955     fOtherSet->removeAll(*fNewlineSet);
1956     fOtherSet->removeAll(*fKatakanaSet);
1957     fOtherSet->removeAll(*fHebrew_LetterSet);
1958     fOtherSet->removeAll(*fALetterSet);
1959     fOtherSet->removeAll(*fSingle_QuoteSet);
1960     fOtherSet->removeAll(*fDouble_QuoteSet);
1961     fOtherSet->removeAll(*fMidLetterSet);
1962     fOtherSet->removeAll(*fMidNumSet);
1963     fOtherSet->removeAll(*fNumericSet);
1964     fOtherSet->removeAll(*fExtendNumLetSet);
1965     fOtherSet->removeAll(*fWSegSpaceSet);
1966     fOtherSet->removeAll(*fFormatSet);
1967     fOtherSet->removeAll(*fExtendSet);
1968     fOtherSet->removeAll(*fRegionalIndicatorSet);
1969     fOtherSet->removeAll(*fZWJSet);
1970     fOtherSet->removeAll(*fExtendedPictSet);
1971 
1972     // Inhibit dictionary characters from being tested at all.
1973     fOtherSet->removeAll(*fDictionarySet);
1974 
1975     // Add classes and their names
1976     fSets->addElement(fCRSet, status); classNames.push_back("CR");
1977     fSets->addElement(fLFSet, status); classNames.push_back("LF");
1978     fSets->addElement(fNewlineSet, status); classNames.push_back("Newline");
1979     fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
1980     fSets->addElement(fHebrew_LetterSet, status); classNames.push_back("Hebrew");
1981     fSets->addElement(fALetterSet, status); classNames.push_back("ALetter");
1982     fSets->addElement(fSingle_QuoteSet, status); classNames.push_back("Single Quote");
1983     fSets->addElement(fDouble_QuoteSet, status); classNames.push_back("Double Quote");
1984     // Omit Katakana from fSets, which omits Katakana characters
1985     // from the test data. They are all in the dictionary set,
1986     // which this (old, to be retired) monkey test cannot handle.
1987     //fSets->addElement(fKatakanaSet, status);
1988 
1989     fSets->addElement(fMidLetterSet, status); classNames.push_back("MidLetter");
1990     fSets->addElement(fMidNumLetSet, status); classNames.push_back("MidNumLet");
1991     fSets->addElement(fMidNumSet, status); classNames.push_back("MidNum");
1992     fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
1993     fSets->addElement(fFormatSet, status); classNames.push_back("Format");
1994     fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
1995     fSets->addElement(fOtherSet, status); classNames.push_back("Other");
1996     fSets->addElement(fExtendNumLetSet, status); classNames.push_back("ExtendNumLet");
1997     fSets->addElement(fWSegSpaceSet, status); classNames.push_back("WSegSpace");
1998 
1999     fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
2000     fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
2001 
2002     if (U_FAILURE(status)) {
2003         deferredStatus = status;
2004     }
2005 }
2006 
setText(const UnicodeString & s)2007 void RBBIWordMonkey::setText(const UnicodeString &s) {
2008     fText       = &s;
2009     prepareAppliedRules(s.length());
2010 }
2011 
2012 
next(int32_t prevPos)2013 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2014     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2015                               //   break position being tested.  The candidate break
2016                               //   location is before p2.
2017 
2018     int     breakPos = -1;
2019 
2020     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2021 
2022     if (U_FAILURE(deferredStatus)) {
2023         return -1;
2024     }
2025 
2026     // Prev break at end of string.  return DONE.
2027     if (prevPos >= fText->length()) {
2028         return -1;
2029     }
2030     p0 = p1 = p2 = p3 = prevPos;
2031     c3 =  fText->char32At(prevPos);
2032     c0 = c1 = c2 = 0;
2033     (void)p0;       // Suppress set but not used warning.
2034 
2035     // Loop runs once per "significant" character position in the input text.
2036     for (;;) {
2037         // Move all of the positions forward in the input string.
2038         p0 = p1;  c0 = c1;
2039         p1 = p2;  c1 = c2;
2040         p2 = p3;  c2 = c3;
2041 
2042         // Advance p3 by    X(Extend | Format)*   Rule 4
2043         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2044         do {
2045             p3 = fText->moveIndex32(p3, 1);
2046             c3 = fText->char32At(p3);
2047             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2048                break;
2049             }
2050         }
2051         while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
2052 
2053 
2054         if (p1 == p2) {
2055             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2056             continue;
2057         }
2058 
2059         if (p2 == fText->length()) {
2060             // Reached end of string.  Always a break position.
2061             break;
2062         }
2063 
2064         //     No Extend or Format characters may appear between the CR and LF,
2065         //     which requires the additional check for p2 immediately following p1.
2066         //
2067         if (c1==0x0D && c2==0x0A) {
2068           setAppliedRule(p2, "WB3   CR x LF");
2069           continue;
2070         }
2071 
2072         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2073             setAppliedRule(p2, "WB3a  Break before and after newlines (including CR and LF)");
2074             break;
2075         }
2076         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2077             setAppliedRule(p2, "WB3a  Break before and after newlines (including CR and LF)");
2078             break;
2079         }
2080 
2081         //              Not ignoring extend chars, so peek into input text to
2082         //              get the potential ZWJ, the character immediately preceding c2.
2083         //              Sloppy UChar32 indexing: p2-1 may reference trail half
2084         //              but char32At will get the full code point.
2085         if (fZWJSet->contains(fText->char32At(p2 - 1)) && fExtendedPictSet->contains(c2)){
2086             setAppliedRule(p2, "WB3c  ZWJ x Extended_Pictographic");
2087             continue;
2088         }
2089 
2090         if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
2091             setAppliedRule(p2, "WB3d  Keep horizontal whitespace together.");
2092             continue;
2093         }
2094 
2095         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2096             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2097             setAppliedRule(p2, "WB4   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)");
2098             continue;
2099         }
2100 
2101         if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
2102              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2103              (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2104             setAppliedRule(p2,
2105                            "WB6   (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter _Letter)");
2106             continue;
2107         }
2108 
2109         if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2110             (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2111             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2112             setAppliedRule(p2,
2113                            "WB7   (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)");
2114             continue;
2115         }
2116 
2117         if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2118             setAppliedRule(p2, "WB7a  Hebrew_Letter x Single_Quote");
2119             continue;
2120         }
2121 
2122           if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2123             setAppliedRule(p2, "WB7b  Hebrew_Letter x Double_Quote Hebrew_Letter");
2124             continue;
2125         }
2126 
2127         if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2128             setAppliedRule(p2, "WB7c  Hebrew_Letter Double_Quote x Hebrew_Letter");
2129             continue;
2130         }
2131 
2132         if (fNumericSet->contains(c1) &&
2133             fNumericSet->contains(c2)) {
2134             setAppliedRule(p2, "WB8   Numeric x Numeric");
2135             continue;
2136         }
2137 
2138         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2139             fNumericSet->contains(c2)) {
2140             setAppliedRule(p2, "WB9   (ALetter | Hebrew_Letter) x Numeric");
2141             continue;
2142         }
2143 
2144         if (fNumericSet->contains(c1) &&
2145             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2146             setAppliedRule(p2, "WB10   Numeric x (ALetter | Hebrew_Letter)");
2147             continue;
2148         }
2149 
2150           if (fNumericSet->contains(c0) &&
2151             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
2152             fNumericSet->contains(c2)) {
2153             setAppliedRule(p2, "WB11  Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric");
2154             continue;
2155         }
2156 
2157         if (fNumericSet->contains(c1) &&
2158             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
2159             fNumericSet->contains(c3)) {
2160             setAppliedRule(p2, "WB12  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric");
2161             continue;
2162         }
2163 
2164         //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
2165         //                  all Katakana are handled by the dictionary breaker.
2166         if (fKatakanaSet->contains(c1) &&
2167             fKatakanaSet->contains(c2))  {
2168             setAppliedRule(p2, "WB13  Katakana x Katakana");
2169             continue;
2170         }
2171 
2172         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2173              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2174              fExtendNumLetSet->contains(c2)) {
2175             setAppliedRule(p2,
2176                            "WB13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet");
2177             continue;
2178         }
2179 
2180         if (fExtendNumLetSet->contains(c1) &&
2181                 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2182                  fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
2183             setAppliedRule(p2, "WB13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)");
2184             continue;
2185         }
2186 
2187         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2188             setAppliedRule(p2, "WB15 - WB17   Group pairs of Regional Indicators.");
2189             break;
2190         }
2191         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2192             setAppliedRule(p2, "WB15 - WB17   Group pairs of Regional Indicators.");
2193             continue;
2194         }
2195 
2196         setAppliedRule(p2, "WB999");
2197         break;
2198     }
2199 
2200     breakPos = p2;
2201     return breakPos;
2202 }
2203 
2204 
charClasses()2205 UVector  *RBBIWordMonkey::charClasses() {
2206     return fSets;
2207 }
2208 
~RBBIWordMonkey()2209 RBBIWordMonkey::~RBBIWordMonkey() {
2210     delete fSets;
2211     delete fCRSet;
2212     delete fLFSet;
2213     delete fNewlineSet;
2214     delete fKatakanaSet;
2215     delete fHebrew_LetterSet;
2216     delete fALetterSet;
2217     delete fSingle_QuoteSet;
2218     delete fDouble_QuoteSet;
2219     delete fMidNumLetSet;
2220     delete fMidLetterSet;
2221     delete fMidNumSet;
2222     delete fNumericSet;
2223     delete fFormatSet;
2224     delete fExtendSet;
2225     delete fExtendNumLetSet;
2226     delete fWSegSpaceSet;
2227     delete fRegionalIndicatorSet;
2228     delete fDictionarySet;
2229     delete fOtherSet;
2230     delete fZWJSet;
2231     delete fExtendedPictSet;
2232 }
2233 
2234 
2235 
2236 
2237 //------------------------------------------------------------------------------------------
2238 //
2239 //   class RBBISentMonkey      Sentence Break specific implementation
2240 //                             of RBBIMonkeyKind.
2241 //
2242 //------------------------------------------------------------------------------------------
2243 class RBBISentMonkey: public RBBIMonkeyKind {
2244 public:
2245     RBBISentMonkey();
2246     virtual          ~RBBISentMonkey();
2247     virtual  UVector *charClasses() override;
2248     virtual  void     setText(const UnicodeString &s) override;
2249     virtual int32_t   next(int32_t i) override;
2250 private:
2251     int               moveBack(int posFrom);
2252     int               moveForward(int posFrom);
2253     UChar32           cAt(int pos);
2254 
2255     UVector      *fSets;
2256 
2257     UnicodeSet  *fSepSet;
2258     UnicodeSet  *fFormatSet;
2259     UnicodeSet  *fSpSet;
2260     UnicodeSet  *fLowerSet;
2261     UnicodeSet  *fUpperSet;
2262     UnicodeSet  *fOLetterSet;
2263     UnicodeSet  *fNumericSet;
2264     UnicodeSet  *fATermSet;
2265     UnicodeSet  *fSContinueSet;
2266     UnicodeSet  *fSTermSet;
2267     UnicodeSet  *fCloseSet;
2268     UnicodeSet  *fOtherSet;
2269     UnicodeSet  *fExtendSet;
2270 
2271     const UnicodeString  *fText;
2272 };
2273 
RBBISentMonkey()2274 RBBISentMonkey::RBBISentMonkey()
2275 {
2276     UErrorCode  status = U_ZERO_ERROR;
2277 
2278     fSets            = new UVector(status);
2279 
2280     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2281     //                       set and made into character classes of their own.  For the monkey impl,
2282     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2283     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2284     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2285     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2286     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2287     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2288     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2289     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2290     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2291     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2292     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2293     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2294     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2295     fOtherSet        = new UnicodeSet();
2296 
2297     if(U_FAILURE(status)) {
2298       deferredStatus = status;
2299       return;
2300     }
2301 
2302     fOtherSet->complement();
2303     fOtherSet->removeAll(*fSepSet);
2304     fOtherSet->removeAll(*fFormatSet);
2305     fOtherSet->removeAll(*fSpSet);
2306     fOtherSet->removeAll(*fLowerSet);
2307     fOtherSet->removeAll(*fUpperSet);
2308     fOtherSet->removeAll(*fOLetterSet);
2309     fOtherSet->removeAll(*fNumericSet);
2310     fOtherSet->removeAll(*fATermSet);
2311     fOtherSet->removeAll(*fSContinueSet);
2312     fOtherSet->removeAll(*fSTermSet);
2313     fOtherSet->removeAll(*fCloseSet);
2314     fOtherSet->removeAll(*fExtendSet);
2315 
2316     fSets->addElement(fSepSet, status); classNames.push_back("Sep");
2317     fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2318     fSets->addElement(fSpSet, status); classNames.push_back("Sp");
2319     fSets->addElement(fLowerSet, status); classNames.push_back("Lower");
2320     fSets->addElement(fUpperSet, status); classNames.push_back("Upper");
2321     fSets->addElement(fOLetterSet, status); classNames.push_back("OLetter");
2322     fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
2323     fSets->addElement(fATermSet, status); classNames.push_back("ATerm");
2324     fSets->addElement(fSContinueSet, status); classNames.push_back("SContinue");
2325     fSets->addElement(fSTermSet, status); classNames.push_back("STerm");
2326     fSets->addElement(fCloseSet, status); classNames.push_back("Close");
2327     fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2328     fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2329 
2330     if (U_FAILURE(status)) {
2331         deferredStatus = status;
2332     }
2333 }
2334 
2335 
2336 
setText(const UnicodeString & s)2337 void RBBISentMonkey::setText(const UnicodeString &s) {
2338     fText       = &s;
2339     prepareAppliedRules(s.length());
2340 }
2341 
charClasses()2342 UVector  *RBBISentMonkey::charClasses() {
2343     return fSets;
2344 }
2345 
2346 //  moveBack()   Find the "significant" code point preceding the index i.
2347 //               Skips over ($Extend | $Format)* .
2348 //
moveBack(int i)2349 int RBBISentMonkey::moveBack(int i) {
2350     if (i <= 0) {
2351         return -1;
2352     }
2353     UChar32   c;
2354     int32_t   j = i;
2355     do {
2356         j = fText->moveIndex32(j, -1);
2357         c = fText->char32At(j);
2358     }
2359     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2360     return j;
2361 
2362  }
2363 
2364 
moveForward(int i)2365 int RBBISentMonkey::moveForward(int i) {
2366     if (i>=fText->length()) {
2367         return fText->length();
2368     }
2369     UChar32   c;
2370     int32_t   j = i;
2371     do {
2372         j = fText->moveIndex32(j, 1);
2373         c = cAt(j);
2374     }
2375     while (fFormatSet->contains(c) || fExtendSet->contains(c));
2376     return j;
2377 }
2378 
cAt(int pos)2379 UChar32 RBBISentMonkey::cAt(int pos) {
2380     if (pos<0 || pos>=fText->length()) {
2381         return -1;
2382     } else {
2383         return fText->char32At(pos);
2384     }
2385 }
2386 
next(int32_t prevPos)2387 int32_t RBBISentMonkey::next(int32_t prevPos) {
2388     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2389                               //   break position being tested.  The candidate break
2390                               //   location is before p2.
2391 
2392     int     breakPos = -1;
2393 
2394     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2395     UChar32 c;
2396 
2397     if (U_FAILURE(deferredStatus)) {
2398         return -1;
2399     }
2400 
2401     // Prev break at end of string.  return DONE.
2402     if (prevPos >= fText->length()) {
2403         return -1;
2404     }
2405     p0 = p1 = p2 = p3 = prevPos;
2406     c3 =  fText->char32At(prevPos);
2407     c0 = c1 = c2 = 0;
2408     (void)p0;     // Suppress set but not used warning.
2409 
2410     // Loop runs once per "significant" character position in the input text.
2411     for (;;) {
2412         // Move all of the positions forward in the input string.
2413         p0 = p1;  c0 = c1;
2414         p1 = p2;  c1 = c2;
2415         p2 = p3;  c2 = c3;
2416 
2417         // Advance p3 by    X(Extend | Format)*   Rule 4
2418         p3 = moveForward(p3);
2419         c3 = cAt(p3);
2420 
2421         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2422             setAppliedRule(p2, "SB3   CR x LF");
2423             continue;
2424         }
2425 
2426         if (fSepSet->contains(c1)) {
2427             p2 = p1+1;   // Separators don't combine with Extend or Format.
2428 
2429             setAppliedRule(p2, "SB4   Sep  <break>");
2430             break;
2431         }
2432 
2433         if (p2 >= fText->length()) {
2434             // Reached end of string.  Always a break position.
2435             setAppliedRule(p2, "SB4   Sep  <break>");
2436             break;
2437         }
2438 
2439         if (p2 == prevPos) {
2440             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2441             setAppliedRule(p2, "SB4   Sep  <break>");
2442             continue;
2443         }
2444 
2445         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
2446             setAppliedRule(p2, "SB6   ATerm x Numeric");
2447             continue;
2448         }
2449 
2450           if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2451                 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2452             setAppliedRule(p2, "SB7   (Upper | Lower) ATerm  x  Uppper");
2453             continue;
2454         }
2455 
2456         //           Note:  STerm | ATerm are added to the negated part of the expression by a
2457         //                  note to the Unicode 5.0 documents.
2458         int p8 = p1;
2459         while (fSpSet->contains(cAt(p8))) {
2460             p8 = moveBack(p8);
2461         }
2462         while (fCloseSet->contains(cAt(p8))) {
2463             p8 = moveBack(p8);
2464         }
2465         if (fATermSet->contains(cAt(p8))) {
2466             p8=p2;
2467             for (;;) {
2468                 c = cAt(p8);
2469                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2470                     fLowerSet->contains(c) || fSepSet->contains(c) ||
2471                     fATermSet->contains(c) || fSTermSet->contains(c))  {
2472 
2473                     setAppliedRule(p2,
2474                                    "SB8   ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2475                     break;
2476                 }
2477                 p8 = moveForward(p8);
2478             }
2479             if (fLowerSet->contains(cAt(p8))) {
2480 
2481                 setAppliedRule(p2,
2482                                "SB8   ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2483                 continue;
2484             }
2485         }
2486 
2487         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2488             p8 = p1;
2489             while (fSpSet->contains(cAt(p8))) {
2490                 p8 = moveBack(p8);
2491             }
2492             while (fCloseSet->contains(cAt(p8))) {
2493                 p8 = moveBack(p8);
2494             }
2495             c = cAt(p8);
2496             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2497                 setAppliedRule(p2, "SB8a  (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm)");
2498                 continue;
2499             }
2500         }
2501 
2502         int p9 = p1;
2503         while (fCloseSet->contains(cAt(p9))) {
2504             p9 = moveBack(p9);
2505         }
2506         c = cAt(p9);
2507         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2508             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2509 
2510                 setAppliedRule(p2, "SB9  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)");
2511                 continue;
2512             }
2513         }
2514 
2515         int p10 = p1;
2516         while (fSpSet->contains(cAt(p10))) {
2517             p10 = moveBack(p10);
2518         }
2519         while (fCloseSet->contains(cAt(p10))) {
2520             p10 = moveBack(p10);
2521         }
2522         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2523             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2524                 setAppliedRule(p2, "SB10  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)");
2525                 continue;
2526             }
2527         }
2528 
2529         int p11 = p1;
2530         if (fSepSet->contains(cAt(p11))) {
2531             p11 = moveBack(p11);
2532         }
2533         while (fSpSet->contains(cAt(p11))) {
2534             p11 = moveBack(p11);
2535         }
2536         while (fCloseSet->contains(cAt(p11))) {
2537             p11 = moveBack(p11);
2538         }
2539         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2540           setAppliedRule(p2, "SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>");
2541             break;
2542         }
2543 
2544         setAppliedRule(p2, "SB12  Any x Any");
2545         continue;
2546     }
2547 
2548     breakPos = p2;
2549     return breakPos;
2550 }
2551 
~RBBISentMonkey()2552 RBBISentMonkey::~RBBISentMonkey() {
2553     delete fSets;
2554     delete fSepSet;
2555     delete fFormatSet;
2556     delete fSpSet;
2557     delete fLowerSet;
2558     delete fUpperSet;
2559     delete fOLetterSet;
2560     delete fNumericSet;
2561     delete fATermSet;
2562     delete fSContinueSet;
2563     delete fSTermSet;
2564     delete fCloseSet;
2565     delete fOtherSet;
2566     delete fExtendSet;
2567 }
2568 
2569 
2570 
2571 //-------------------------------------------------------------------------------------------
2572 //
2573 //  RBBILineMonkey
2574 //
2575 //-------------------------------------------------------------------------------------------
2576 
2577 class RBBILineMonkey: public RBBIMonkeyKind {
2578 public:
2579     RBBILineMonkey();
2580     virtual          ~RBBILineMonkey();
2581     virtual  UVector *charClasses() override;
2582     virtual  void     setText(const UnicodeString &s) override;
2583     virtual  int32_t  next(int32_t i) override;
2584     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2585 private:
2586     UVector      *fSets;
2587 
2588     UnicodeSet  *fBK;
2589     UnicodeSet  *fCR;
2590     UnicodeSet  *fLF;
2591     UnicodeSet  *fCM;
2592     UnicodeSet  *fNL;
2593     UnicodeSet  *fSG;
2594     UnicodeSet  *fWJ;
2595     UnicodeSet  *fZW;
2596     UnicodeSet  *fGL;
2597     UnicodeSet  *fCB;
2598     UnicodeSet  *fSP;
2599     UnicodeSet  *fB2;
2600     UnicodeSet  *fBA;
2601     UnicodeSet  *fBB;
2602     UnicodeSet  *fHH;
2603     UnicodeSet  *fHY;
2604     UnicodeSet  *fH2;
2605     UnicodeSet  *fH3;
2606     UnicodeSet  *fCL;
2607     UnicodeSet  *fCP;
2608     UnicodeSet  *fEX;
2609     UnicodeSet  *fIN;
2610     UnicodeSet  *fJL;
2611     UnicodeSet  *fJV;
2612     UnicodeSet  *fJT;
2613     UnicodeSet  *fNS;
2614     UnicodeSet  *fOP;
2615     UnicodeSet  *fQU;
2616     UnicodeSet  *fIS;
2617     UnicodeSet  *fNU;
2618     UnicodeSet  *fPO;
2619     UnicodeSet  *fPR;
2620     UnicodeSet  *fSY;
2621     UnicodeSet  *fAI;
2622     UnicodeSet  *fAL;
2623     UnicodeSet  *fCJ;
2624     UnicodeSet  *fHL;
2625     UnicodeSet  *fID;
2626     UnicodeSet  *fRI;
2627     UnicodeSet  *fXX;
2628     UnicodeSet  *fEB;
2629     UnicodeSet  *fEM;
2630     UnicodeSet  *fZWJ;
2631     UnicodeSet  *fOP30;
2632     UnicodeSet  *fCP30;
2633     UnicodeSet  *fExtPictUnassigned;
2634 
2635     BreakIterator        *fCharBI;
2636     const UnicodeString  *fText;
2637     RegexMatcher         *fNumberMatcher;
2638 };
2639 
RBBILineMonkey()2640 RBBILineMonkey::RBBILineMonkey() :
2641     RBBIMonkeyKind(),
2642     fSets(NULL),
2643 
2644     fCharBI(NULL),
2645     fText(NULL),
2646     fNumberMatcher(NULL)
2647 
2648 {
2649     if (U_FAILURE(deferredStatus)) {
2650         return;
2651     }
2652 
2653     UErrorCode  status = U_ZERO_ERROR;
2654 
2655     fSets  = new UVector(status);
2656 
2657     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2658     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2659     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2660     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2661     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2662     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2663     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2664     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2665     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2666     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2667     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2668     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2669     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2670     fHH    = new UnicodeSet();
2671     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2672     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2673     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2674     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2675     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2676     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2677     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2678     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2679     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2680     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2681     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2682     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2683     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2684     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2685     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2686     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2687     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2688     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2689     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2690     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2691     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2692     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2693     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2694     fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2695     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2696     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2697     fEB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status);
2698     fEM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
2699     fZWJ   = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
2700     fOP30  = new UnicodeSet(u"[\\p{Line_break=OP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2701     fCP30  = new UnicodeSet(u"[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2702     fExtPictUnassigned = new UnicodeSet(u"[\\p{Extended_Pictographic}&\\p{Cn}]", status);
2703 
2704     if (U_FAILURE(status)) {
2705         deferredStatus = status;
2706         return;
2707     }
2708 
2709     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
2710     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
2711     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
2712 
2713     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
2714     fCM->addAll(*fZWJ);    // ZWJ behaves as a CM.
2715 
2716     fHH->add(u'\u2010');   // Hyphen, '‐'
2717 
2718     // Sets and names.
2719     fSets->addElement(fBK, status); classNames.push_back("fBK");
2720     fSets->addElement(fCR, status); classNames.push_back("fCR");
2721     fSets->addElement(fLF, status); classNames.push_back("fLF");
2722     fSets->addElement(fCM, status); classNames.push_back("fCM");
2723     fSets->addElement(fNL, status); classNames.push_back("fNL");
2724     fSets->addElement(fWJ, status); classNames.push_back("fWJ");
2725     fSets->addElement(fZW, status); classNames.push_back("fZW");
2726     fSets->addElement(fGL, status); classNames.push_back("fGL");
2727     fSets->addElement(fCB, status); classNames.push_back("fCB");
2728     fSets->addElement(fSP, status); classNames.push_back("fSP");
2729     fSets->addElement(fB2, status); classNames.push_back("fB2");
2730     fSets->addElement(fBA, status); classNames.push_back("fBA");
2731     fSets->addElement(fBB, status); classNames.push_back("fBB");
2732     fSets->addElement(fHY, status); classNames.push_back("fHY");
2733     fSets->addElement(fH2, status); classNames.push_back("fH2");
2734     fSets->addElement(fH3, status); classNames.push_back("fH3");
2735     fSets->addElement(fCL, status); classNames.push_back("fCL");
2736     fSets->addElement(fCP, status); classNames.push_back("fCP");
2737     fSets->addElement(fEX, status); classNames.push_back("fEX");
2738     fSets->addElement(fIN, status); classNames.push_back("fIN");
2739     fSets->addElement(fJL, status); classNames.push_back("fJL");
2740     fSets->addElement(fJT, status); classNames.push_back("fJT");
2741     fSets->addElement(fJV, status); classNames.push_back("fJV");
2742     fSets->addElement(fNS, status); classNames.push_back("fNS");
2743     fSets->addElement(fOP, status); classNames.push_back("fOP");
2744     fSets->addElement(fQU, status); classNames.push_back("fQU");
2745     fSets->addElement(fIS, status); classNames.push_back("fIS");
2746     fSets->addElement(fNU, status); classNames.push_back("fNU");
2747     fSets->addElement(fPO, status); classNames.push_back("fPO");
2748     fSets->addElement(fPR, status); classNames.push_back("fPR");
2749     fSets->addElement(fSY, status); classNames.push_back("fSY");
2750     fSets->addElement(fAI, status); classNames.push_back("fAI");
2751     fSets->addElement(fAL, status); classNames.push_back("fAL");
2752     fSets->addElement(fHL, status); classNames.push_back("fHL");
2753     fSets->addElement(fID, status); classNames.push_back("fID");
2754     fSets->addElement(fRI, status); classNames.push_back("fRI");
2755     fSets->addElement(fSG, status); classNames.push_back("fSG");
2756     fSets->addElement(fEB, status); classNames.push_back("fEB");
2757     fSets->addElement(fEM, status); classNames.push_back("fEM");
2758     fSets->addElement(fZWJ, status); classNames.push_back("fZWJ");
2759     // TODO: fOP30 & fCP30 overlap with plain fOP. Probably OK, but fOP/CP chars will be over-represented.
2760     fSets->addElement(fOP30, status); classNames.push_back("fOP30");
2761     fSets->addElement(fCP30, status); classNames.push_back("fCP30");
2762     fSets->addElement(fExtPictUnassigned, status); classNames.push_back("fExtPictUnassigned");
2763 
2764     const char *rules =
2765             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
2766             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
2767             "((\\p{Line_Break=IS})(\\p{Line_Break=CM}|\\u200d)*)?"
2768             "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
2769             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
2770             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
2771             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
2772 
2773     fNumberMatcher = new RegexMatcher(
2774         UnicodeString(rules, -1, US_INV), 0, status);
2775 
2776     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2777 
2778     if (U_FAILURE(status)) {
2779         deferredStatus = status;
2780     }
2781 
2782 }
2783 
2784 
setText(const UnicodeString & s)2785 void RBBILineMonkey::setText(const UnicodeString &s) {
2786     fText       = &s;
2787     fCharBI->setText(s);
2788     prepareAppliedRules(s.length());
2789     fNumberMatcher->reset(s);
2790 }
2791 
2792 //
2793 //  rule9Adjust
2794 //     Line Break TR rules 9 and 10 implementation.
2795 //     This deals with combining marks and other sequences that
2796 //     that must be treated as if they were something other than what they actually are.
2797 //
2798 //     This is factored out into a separate function because it must be applied twice for
2799 //     each potential break, once to the chars before the position being checked, then
2800 //     again to the text following the possible break.
2801 //
rule9Adjust(int32_t pos,UChar32 * posChar,int32_t * nextPos,UChar32 * nextChar)2802 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2803     if (pos == -1) {
2804         // Invalid initial position.  Happens during the warmup iteration of the
2805         //   main loop in next().
2806         return;
2807     }
2808 
2809     int32_t  nPos = *nextPos;
2810 
2811     // LB 9  Keep combining sequences together.
2812     // advance over any CM class chars.  Note that Line Break CM is different
2813     // from the normal Grapheme Extend property.
2814     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2815           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2816         for (;;) {
2817             *nextChar = fText->char32At(nPos);
2818             if (!fCM->contains(*nextChar)) {
2819                 break;
2820             }
2821             nPos = fText->moveIndex32(nPos, 1);
2822         }
2823     }
2824 
2825 
2826     // LB 9 Treat X CM* as if it were x.
2827     //       No explicit action required.
2828 
2829     // LB 10  Treat any remaining combining mark as AL
2830     if (fCM->contains(*posChar)) {
2831         *posChar = u'A';
2832     }
2833 
2834     // Push the updated nextPos and nextChar back to our caller.
2835     // This only makes a difference if posChar got bigger by consuming a
2836     // combining sequence.
2837     *nextPos  = nPos;
2838     *nextChar = fText->char32At(nPos);
2839 }
2840 
2841 
2842 
next(int32_t startPos)2843 int32_t RBBILineMonkey::next(int32_t startPos) {
2844     UErrorCode status = U_ZERO_ERROR;
2845     int32_t    pos;       //  Index of the char following a potential break position
2846     UChar32    thisChar;  //  Character at above position "pos"
2847 
2848     int32_t    prevPos;   //  Index of the char preceding a potential break position
2849     UChar32    prevChar;  //  Character at above position.  Note that prevChar
2850                           //   and thisChar may not be adjacent because combining
2851                           //   characters between them will be ignored.
2852 
2853     int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
2854     UChar32    prevCharX2;
2855 
2856     int32_t    nextPos;   //  Index of the next character following pos.
2857                           //     Usually skips over combining marks.
2858     int32_t    nextCPPos; //  Index of the code point following "pos."
2859                           //     May point to a combining mark.
2860     int32_t    tPos;      //  temp value.
2861     UChar32    c;
2862 
2863     if (U_FAILURE(deferredStatus)) {
2864         return -1;
2865     }
2866 
2867     if (startPos >= fText->length()) {
2868         return -1;
2869     }
2870 
2871 
2872     // Initial values for loop.  Loop will run the first time without finding breaks,
2873     //                           while the invalid values shift out and the "this" and
2874     //                           "prev" positions are filled in with good values.
2875     pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
2876     thisChar = prevChar  = prevCharX2 = 0;
2877     nextPos  = nextCPPos = startPos;
2878 
2879 
2880     // Loop runs once per position in the test text, until a break position
2881     //  is found.
2882     for (;;) {
2883         prevPosX2 = prevPos;
2884         prevCharX2 = prevChar;
2885 
2886         prevPos   = pos;
2887         prevChar  = thisChar;
2888 
2889         pos       = nextPos;
2890         thisChar  = fText->char32At(pos);
2891 
2892         nextCPPos = fText->moveIndex32(pos, 1);
2893         nextPos   = nextCPPos;
2894 
2895 
2896         if (pos >= fText->length()) {
2897             setAppliedRule(pos, "LB2 - Break at end of text.");
2898             break;
2899         }
2900 
2901 
2902         //             We do this one out-of-order because the adjustment does not change anything
2903         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
2904         //             be applied.
2905         rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
2906         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
2907         c = fText->char32At(nextPos);
2908         rule9Adjust(pos, &thisChar, &nextPos, &c);
2909 
2910         // If the loop is still warming up - if we haven't shifted the initial
2911         //   -1 positions out of prevPos yet - loop back to advance the
2912         //    position in the input without any further looking for breaks.
2913         if (prevPos == -1) {
2914           setAppliedRule(pos, "LB 9 - adjust for combining sequences.");
2915             continue;
2916         }
2917 
2918 
2919         if (fBK->contains(prevChar)) {
2920             setAppliedRule(pos, "LB 4  Always break after hard line breaks");
2921             break;
2922         }
2923 
2924 
2925         if (prevChar == 0x0d && thisChar == 0x0a) {
2926             setAppliedRule(pos, "LB 5  Break after CR, LF, NL, but not inside CR LF");
2927             continue;
2928         }
2929         if (prevChar == 0x0d ||
2930             prevChar == 0x0a ||
2931             prevChar == 0x85)  {
2932             setAppliedRule(pos, "LB 5  Break after CR, LF, NL, but not inside CR LF");
2933             break;
2934         }
2935 
2936 
2937         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
2938             fBK->contains(thisChar)) {
2939             setAppliedRule(pos, "LB 6  Don't break before hard line breaks");
2940             continue;
2941         }
2942 
2943 
2944         if (fSP->contains(thisChar)) {
2945             setAppliedRule(pos, "LB 7  Don't break before spaces or zero-width space.");
2946             continue;
2947         }
2948 
2949         // !!! ??? Is this the right text for the applied rule?
2950         if (fZW->contains(thisChar)) {
2951             setAppliedRule(pos, "LB 7  Don't break before spaces or zero-width space.");
2952             continue;
2953         }
2954 
2955 
2956         //       ZW SP* ÷
2957         //       Scan backwards from prevChar for SP* ZW
2958         tPos = prevPos;
2959         while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
2960             tPos = fText->moveIndex32(tPos, -1);
2961         }
2962         if (fZW->contains(fText->char32At(tPos))) {
2963             setAppliedRule(pos, "LB 8  Break after zero width space");
2964             break;
2965         }
2966 
2967 
2968         //          Move this test up, before LB8a, because numbers can match a longer sequence that would
2969         //          also match 8a.  e.g. NU ZWJ IS PO     (ZWJ acts like CM)
2970         if (fNumberMatcher->lookingAt(prevPos, status)) {
2971             if (U_FAILURE(status)) {
2972                 setAppliedRule(pos, "LB 25 Numbers");
2973                 break;
2974             }
2975             // Matched a number.  But could have been just a single digit, which would
2976             //    not represent a "no break here" between prevChar and thisChar
2977             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
2978             if (numEndIdx > pos) {
2979                 // Number match includes at least our two chars being checked
2980                 if (numEndIdx > nextPos) {
2981                     // Number match includes additional chars.  Update pos and nextPos
2982                     //   so that next loop iteration will continue at the end of the number,
2983                     //   checking for breaks between last char in number & whatever follows.
2984                     pos = nextPos = numEndIdx;
2985                     do {
2986                         pos = fText->moveIndex32(pos, -1);
2987                         thisChar = fText->char32At(pos);
2988                     } while (fCM->contains(thisChar));
2989                 }
2990                 setAppliedRule(pos, "LB 25 Numbers");
2991                 continue;
2992             }
2993         }
2994 
2995 
2996         //       The monkey test's way of ignoring combining characters doesn't work
2997         //       for this rule. ZJ is also a CM. Need to get the actual character
2998         //       preceding "thisChar", not ignoring combining marks, possibly ZJ.
2999         {
3000             int32_t prevIdx = fText->moveIndex32(pos, -1);
3001             UChar32 prevC = fText->char32At(prevIdx);
3002             if (fZWJ->contains(prevC)) {
3003                 setAppliedRule(pos, "LB 8a ZWJ x");
3004                 continue;
3005             }
3006         }
3007 
3008 
3009         // appliedRule: "LB 9, 10"; //  Already done, at top of loop.";
3010         //
3011 
3012 
3013         //    x  WJ
3014         //    WJ  x
3015         //
3016         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3017             setAppliedRule(pos, "LB 11  Do not break before or after WORD JOINER and related characters.");
3018             continue;
3019         }
3020 
3021 
3022         if (fGL->contains(prevChar)) {
3023             setAppliedRule(pos, "LB 12  GL  x");
3024             continue;
3025         }
3026 
3027 
3028           if (!(fSP->contains(prevChar) ||
3029               fBA->contains(prevChar) ||
3030               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
3031               setAppliedRule(pos, "LB 12a  [^SP BA HY] x GL");
3032               continue;
3033         }
3034 
3035 
3036         if (fCL->contains(thisChar) ||
3037                 fCP->contains(thisChar) ||
3038                 fEX->contains(thisChar) ||
3039                 fSY->contains(thisChar)) {
3040             setAppliedRule(pos, "LB 13  Don't break before closings.");
3041             continue;
3042         }
3043 
3044 
3045         //       Scan backwards, checking for this sequence.
3046         //       The OP char could include combining marks, so we actually check for
3047         //           OP CM* SP*
3048         //       Another Twist: The Rule 9 fixes may have changed a SP CM
3049         //       sequence into a ID char, so before scanning back through spaces,
3050         //       verify that prevChar is indeed a space.  The prevChar variable
3051         //       may differ from fText[prevPos]
3052         tPos = prevPos;
3053         if (fSP->contains(prevChar)) {
3054             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3055                 tPos=fText->moveIndex32(tPos, -1);
3056             }
3057         }
3058         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3059             tPos=fText->moveIndex32(tPos, -1);
3060         }
3061         if (fOP->contains(fText->char32At(tPos))) {
3062             setAppliedRule(pos, "LB 14 Don't break after OP SP*");
3063             continue;
3064         }
3065 
3066 
3067         if (nextPos < fText->length()) {
3068             // note: UnicodeString::char32At(length) returns ffff, not distinguishable
3069             //       from a legit ffff character. So test length separately.
3070             UChar32 nextChar = fText->char32At(nextPos);
3071             if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) {
3072                 setAppliedRule(pos, "LB 14a Break before an IS that begins a number and follows a space");
3073                 break;
3074             }
3075         }
3076 
3077 
3078           if (fIS->contains(thisChar)) {
3079               setAppliedRule(pos, "LB 14b  Do not break before numeric separators, even after spaces.");
3080               continue;
3081         }
3082 
3083 
3084         if (fOP->contains(thisChar)) {
3085             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3086             int tPos = prevPos;
3087             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3088                 tPos = fText->moveIndex32(tPos, -1);
3089             }
3090             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3091                 tPos = fText->moveIndex32(tPos, -1);
3092             }
3093             if (fQU->contains(fText->char32At(tPos))) {
3094                 setAppliedRule(pos, "LB 15    QU SP* x OP");
3095                 continue;
3096             }
3097         }
3098 
3099 
3100         //    Scan backwards for SP* CM* (CL | CP)
3101         if (fNS->contains(thisChar)) {
3102             int tPos = prevPos;
3103             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3104                 tPos = fText->moveIndex32(tPos, -1);
3105             }
3106             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3107                 tPos = fText->moveIndex32(tPos, -1);
3108             }
3109             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3110                 setAppliedRule(pos, "LB 16   (CL | CP) SP* x NS");
3111                 continue;
3112             }
3113         }
3114 
3115 
3116         if (fB2->contains(thisChar)) {
3117             //  Scan backwards, checking for the B2 CM* SP* sequence.
3118             tPos = prevPos;
3119             if (fSP->contains(prevChar)) {
3120                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3121                     tPos=fText->moveIndex32(tPos, -1);
3122                 }
3123             }
3124             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3125                 tPos=fText->moveIndex32(tPos, -1);
3126             }
3127             if (fB2->contains(fText->char32At(tPos))) {
3128                 setAppliedRule(pos, "LB 17   B2 SP* x B2");
3129                 continue;
3130             }
3131         }
3132 
3133 
3134         if (fSP->contains(prevChar)) {
3135             setAppliedRule(pos, "LB 18    break after space");
3136             break;
3137         }
3138 
3139         //    x   QU
3140         //    QU  x
3141         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3142             setAppliedRule(pos, "LB 19");
3143             continue;
3144         }
3145 
3146         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3147             setAppliedRule(pos, "LB 20  Break around a CB");
3148             break;
3149         }
3150 
3151         //           Don't break between Hyphens and letters if a break precedes the hyphen.
3152         //           Formerly this was a Finnish tailoring.
3153         //           Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
3154         //           ^($HY | $HH) $AL;
3155         if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
3156                 prevPosX2 == -1) {
3157             setAppliedRule(pos, "LB 20.09");
3158             continue;
3159         }
3160 
3161         if (fBA->contains(thisChar) ||
3162             fHY->contains(thisChar) ||
3163             fNS->contains(thisChar) ||
3164             fBB->contains(prevChar) )   {
3165             setAppliedRule(pos, "LB 21");
3166             continue;
3167         }
3168 
3169         if (fHL->contains(prevCharX2) &&
3170                 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3171             setAppliedRule(pos, "LB 21a   HL (HY | BA) x");
3172             continue;
3173         }
3174 
3175         if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3176             setAppliedRule(pos, "LB 21b SY x HL");
3177             continue;
3178         }
3179 
3180         if (fIN->contains(thisChar))   {
3181             setAppliedRule(pos, "LB 22");
3182             continue;
3183         }
3184 
3185 
3186         //          (AL | HL) x NU
3187         //          NU x (AL | HL)
3188         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
3189             setAppliedRule(pos, "LB 23");
3190             continue;
3191         }
3192         if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3193             setAppliedRule(pos, "LB 23");
3194             continue;
3195         }
3196 
3197         // Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3198         //      PR x (ID | EB | EM)
3199         //     (ID | EB | EM) x PO
3200         if (fPR->contains(prevChar) &&
3201                 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar)))  {
3202             setAppliedRule(pos, "LB 23a");
3203             continue;
3204         }
3205         if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
3206                 fPO->contains(thisChar)) {
3207             setAppliedRule(pos, "LB 23a");
3208             continue;
3209         }
3210 
3211         //   Do not break between prefix and letters or ideographs.
3212         //         (PR | PO) x (AL | HL)
3213         //         (AL | HL) x (PR | PO)
3214         if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
3215                 (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3216             setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3217             continue;
3218         }
3219         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
3220                 (fPR->contains(thisChar) || fPO->contains(thisChar))) {
3221             setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3222             continue;
3223         }
3224 
3225         // appliedRule: "LB 25 numbers match"; // moved up, before LB 8a,
3226 
3227         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3228                                         fJV->contains(thisChar) ||
3229                                         fH2->contains(thisChar) ||
3230                                         fH3->contains(thisChar))) {
3231             setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3232             continue;
3233                                         }
3234 
3235         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3236             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3237             setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3238             continue;
3239         }
3240 
3241         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3242             fJT->contains(thisChar)) {
3243             setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3244             continue;
3245         }
3246 
3247         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3248             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3249             fPO->contains(thisChar)) {
3250             setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3251             continue;
3252         }
3253         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3254             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3255             setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3256             continue;
3257         }
3258 
3259 
3260 
3261         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3262             setAppliedRule(pos, "LB 28  Do not break between alphabetics (\"at\").");
3263             continue;
3264         }
3265 
3266           if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3267               setAppliedRule(pos, "LB 29  Do not break between numeric punctuation and alphabetics (\"e.g.\").");
3268               continue;
3269         }
3270 
3271         //          (AL | NU) x OP
3272         //          CP x (AL | NU)
3273         if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP30->contains(thisChar)) {
3274             setAppliedRule(pos,  "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3275             continue;
3276         }
3277         if (fCP30->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3278             setAppliedRule(pos,  "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3279             continue;
3280         }
3281 
3282         //             RI  x  RI
3283         if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3284             setAppliedRule(pos, "LB30a    RI RI  ÷  RI");
3285             break;
3286         }
3287         if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3288             // Two Regional Indicators have been paired.
3289             // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
3290             // following RI. This is a hack.
3291             thisChar = -1;
3292             setAppliedRule(pos, "LB30a    RI RI  ÷  RI");
3293             continue;
3294         }
3295 
3296         // LB30b Do not break between an emoji base (or potential emoji) and an emoji modifier.
3297         if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3298             setAppliedRule(pos, "LB30b    Emoji Base x Emoji Modifier");
3299             continue;
3300         }
3301 
3302         if (fExtPictUnassigned->contains(prevChar) && fEM->contains(thisChar)) {
3303             setAppliedRule(pos, "LB30b    [\\p{Extended_Pictographic}&\\p{Cn}] × EM");
3304             continue;
3305         }
3306 
3307         setAppliedRule(pos, "LB 31    Break everywhere else");
3308         break;
3309     }
3310 
3311     return pos;
3312 }
3313 
3314 
charClasses()3315 UVector  *RBBILineMonkey::charClasses() {
3316     return fSets;
3317 }
3318 
3319 
~RBBILineMonkey()3320 RBBILineMonkey::~RBBILineMonkey() {
3321     delete fSets;
3322 
3323     delete fBK;
3324     delete fCR;
3325     delete fLF;
3326     delete fCM;
3327     delete fNL;
3328     delete fWJ;
3329     delete fZW;
3330     delete fGL;
3331     delete fCB;
3332     delete fSP;
3333     delete fB2;
3334     delete fBA;
3335     delete fBB;
3336     delete fHH;
3337     delete fHY;
3338     delete fH2;
3339     delete fH3;
3340     delete fCL;
3341     delete fCP;
3342     delete fEX;
3343     delete fIN;
3344     delete fJL;
3345     delete fJV;
3346     delete fJT;
3347     delete fNS;
3348     delete fOP;
3349     delete fQU;
3350     delete fIS;
3351     delete fNU;
3352     delete fPO;
3353     delete fPR;
3354     delete fSY;
3355     delete fAI;
3356     delete fAL;
3357     delete fCJ;
3358     delete fHL;
3359     delete fID;
3360     delete fRI;
3361     delete fSG;
3362     delete fXX;
3363     delete fEB;
3364     delete fEM;
3365     delete fZWJ;
3366     delete fOP30;
3367     delete fCP30;
3368     delete fExtPictUnassigned;
3369 
3370     delete fCharBI;
3371     delete fNumberMatcher;
3372 }
3373 
3374 
3375 //-------------------------------------------------------------------------------------------
3376 //
3377 //   TestMonkey
3378 //
3379 //     params
3380 //       seed=nnnnn        Random number starting seed.
3381 //                         Setting the seed allows errors to be reproduced.
3382 //       loop=nnn          Looping count.  Controls running time.
3383 //                         -1:  run forever.
3384 //                          0 or greater:  run length.
3385 //
3386 //       type = char | word | line | sent | title
3387 //
3388 //  Example:
3389 //     intltest  rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3390 //
3391 //-------------------------------------------------------------------------------------------
3392 
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)3393 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3394     int32_t val = defaultVal;
3395     name.append(" *= *(-?\\d+)");
3396     UErrorCode status = U_ZERO_ERROR;
3397     RegexMatcher m(name, params, 0, status);
3398     if (m.find()) {
3399         // The param exists.  Convert the string to an int.
3400         char valString[100];
3401         int32_t paramLength = m.end(1, status) - m.start(1, status);
3402         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3403             paramLength = (int32_t)(sizeof(valString)-2);
3404         }
3405         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3406         val = strtol(valString, NULL, 10);
3407 
3408         // Delete this parameter from the params string.
3409         m.reset();
3410         params = m.replaceFirst("", status);
3411     }
3412     U_ASSERT(U_SUCCESS(status));
3413     return val;
3414 }
3415 #endif
3416 
3417 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
testBreakBoundPreceding(RBBITest * test,UnicodeString ustr,BreakIterator * bi,int expected[],int expectedcount)3418 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3419                                     BreakIterator *bi,
3420                                     int expected[],
3421                                     int expectedcount)
3422 {
3423     int count = 0;
3424     int i = 0;
3425     int forward[50];
3426     bi->setText(ustr);
3427     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3428         forward[count] = i;
3429         if (count < expectedcount && expected[count] != i) {
3430             test->errln("%s:%d break forward test failed: expected %d but got %d",
3431                         __FILE__, __LINE__, expected[count], i);
3432             break;
3433         }
3434         count ++;
3435     }
3436     if (count != expectedcount) {
3437         printStringBreaks(ustr, expected, expectedcount);
3438         test->errln("%s:%d break forward test failed: missed %d match",
3439                     __FILE__, __LINE__, expectedcount - count);
3440         return;
3441     }
3442     // testing boundaries
3443     for (i = 1; i < expectedcount; i ++) {
3444         int j = expected[i - 1];
3445         if (!bi->isBoundary(j)) {
3446             printStringBreaks(ustr, expected, expectedcount);
3447             test->errln("%s:%d isBoundary() failed.  Expected boundary at position %d",
3448                     __FILE__, __LINE__, j);
3449             return;
3450         }
3451         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3452             if (bi->isBoundary(j)) {
3453                 printStringBreaks(ustr, expected, expectedcount);
3454                 test->errln("%s:%d isBoundary() failed.  Not expecting boundary at position %d",
3455                     __FILE__, __LINE__, j);
3456                 return;
3457             }
3458         }
3459     }
3460 
3461     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3462         count --;
3463         if (forward[count] != i) {
3464             printStringBreaks(ustr, expected, expectedcount);
3465             test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
3466                         __FILE__, __LINE__, forward[count], i);
3467             break;
3468         }
3469     }
3470     if (count != 0) {
3471         printStringBreaks(ustr, expected, expectedcount);
3472         test->errln("break test previous() failed: missed a match");
3473         return;
3474     }
3475 
3476     // testing preceding
3477     for (i = 0; i < expectedcount - 1; i ++) {
3478         // int j = expected[i] + 1;
3479         int j = ustr.moveIndex32(expected[i], 1);
3480         for (; j <= expected[i + 1]; j ++) {
3481             int32_t expectedPreceding = expected[i];
3482             int32_t actualPreceding = bi->preceding(j);
3483             if (actualPreceding != expectedPreceding) {
3484                 printStringBreaks(ustr, expected, expectedcount);
3485                 test->errln("%s:%d preceding(%d): expected %d, got %d",
3486                         __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
3487                 return;
3488             }
3489         }
3490     }
3491 }
3492 #endif
3493 
TestWordBreaks(void)3494 void RBBITest::TestWordBreaks(void)
3495 {
3496 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3497 
3498     Locale        locale("en");
3499     UErrorCode    status = U_ZERO_ERROR;
3500     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3501     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3502     // Replaced any C+J characters in a row with a random sequence of characters
3503     // of the same length to make our C+J segmentation not get in the way.
3504     static const char *strlist[] =
3505     {
3506     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3507     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3508     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3509     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3510     "\\uac00\\u3588\\u009c\\u0953\\u194b",
3511     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3512     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3513     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3514     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3515     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3516     "\\u2027\\U000e0067\\u0a47\\u00b7",
3517     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3518     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3519     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3520     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3521     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3522     "\\u0027\\u11af\\U000e0057\\u0602",
3523     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3524     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3525     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3526     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3527     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3528     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3529     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3530     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3531     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3532     "\\u18f4\\U000e0049\\u20e7\\u2027",
3533     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3534     "\\ua183\\u102d\\u0bec\\u003a",
3535     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3536     "\\u003a\\u0e57\\u0fad\\u002e",
3537     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3538     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3539     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3540     "\\u003a\\u0664\\u00b7\\u1fba",
3541     "\\u003b\\u0027\\u00b7\\u47a3",
3542     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3543     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3544     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3545     };
3546     int loop;
3547     if (U_FAILURE(status)) {
3548         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3549         return;
3550     }
3551     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3552         // printf("looping %d\n", loop);
3553         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3554         // RBBICharMonkey monkey;
3555         RBBIWordMonkey monkey;
3556 
3557         int expected[50];
3558         int expectedcount = 0;
3559 
3560         monkey.setText(ustr);
3561         int i;
3562         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3563             expected[expectedcount ++] = i;
3564         }
3565 
3566         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3567     }
3568     delete bi;
3569 #endif
3570 }
3571 
TestWordBoundary(void)3572 void RBBITest::TestWordBoundary(void)
3573 {
3574     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3575     Locale        locale("en");
3576     UErrorCode    status = U_ZERO_ERROR;
3577     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3578     LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
3579     if (U_FAILURE(status)) {
3580         errcheckln(status, "%s:%d Creation of break iterator failed %s",
3581                 __FILE__, __LINE__, u_errorName(status));
3582         return;
3583     }
3584     UChar         str[50];
3585     static const char *strlist[] =
3586     {
3587     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3588     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3589     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3590     "\\u2027\\U000e0067\\u0a47\\u00b7",
3591     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3592     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3593     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3594     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3595     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3596     "\\u0027\\u11af\\U000e0057\\u0602",
3597     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3598     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3599     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3600     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3601     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3602     "\\U000e0065\\u302c\\u09ee\\U000e0068",
3603     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3604     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3605     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3606     "\\u58f4\\U000e0049\\u20e7\\u2027",
3607     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3608     "\\ua183\\u102d\\u0bec\\u003a",
3609     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3610     "\\u003a\\u0e57\\u0fad\\u002e",
3611     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3612     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3613     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3614     "\\u003a\\u0664\\u00b7\\u1fba",
3615     "\\u003b\\u0027\\u00b7\\u47a3",
3616     };
3617     int loop;
3618     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3619         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3620         UnicodeString ustr(str);
3621         int forward[50];
3622         int count = 0;
3623 
3624         bi->setText(ustr);
3625         int prev = -1;
3626         for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
3627             ++count;
3628             if (count >= UPRV_LENGTHOF(forward)) {
3629                 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
3630                         __FILE__, __LINE__, loop, count, boundary);
3631                 return;
3632             }
3633             forward[count] = boundary;
3634             if (boundary <= prev) {
3635                 errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
3636                         __FILE__, __LINE__, loop, prev, boundary);
3637                 break;
3638             }
3639             for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
3640                 if (bi->isBoundary(nonBoundary)) {
3641                     printStringBreaks(ustr, forward, count);
3642                     errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
3643                            __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
3644                     return;
3645                 }
3646             }
3647             if (!bi->isBoundary(boundary)) {
3648                 printStringBreaks(ustr, forward, count);
3649                 errln("%s:%d happy boundary test failed: expected %d a boundary",
3650                        __FILE__, __LINE__, boundary);
3651                 return;
3652             }
3653             prev = boundary;
3654         }
3655     }
3656 }
3657 
TestLineBreaks(void)3658 void RBBITest::TestLineBreaks(void)
3659 {
3660 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3661     Locale        locale("en");
3662     UErrorCode    status = U_ZERO_ERROR;
3663     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3664     const int32_t  STRSIZE = 50;
3665     UChar         str[STRSIZE];
3666     static const char *strlist[] =
3667     {
3668      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3669      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3670              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3671      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3672              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3673      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3674      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3675      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3676      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3677      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3678      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3679      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3680      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3681      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3682      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3683      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3684      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3685      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3686      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3687      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3688      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3689      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3690      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3691      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3692      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3693      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3694      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3695      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3696      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3697      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3698      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3699      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3700      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3701      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3702      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3703      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3704      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3705      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3706      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3707          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3708     };
3709     int loop;
3710     TEST_ASSERT_SUCCESS(status);
3711     if (U_FAILURE(status)) {
3712         return;
3713     }
3714     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3715         // printf("looping %d\n", loop);
3716         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3717         if (t >= STRSIZE) {
3718             TEST_ASSERT(FALSE);
3719             continue;
3720         }
3721 
3722 
3723         UnicodeString ustr(str);
3724         RBBILineMonkey monkey;
3725         if (U_FAILURE(monkey.deferredStatus)) {
3726             continue;
3727         }
3728 
3729         const int EXPECTEDSIZE = 50;
3730         int expected[EXPECTEDSIZE];
3731         int expectedcount = 0;
3732 
3733         monkey.setText(ustr);
3734 
3735         int i;
3736         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3737             if (expectedcount >= EXPECTEDSIZE) {
3738                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3739                 return;
3740             }
3741             expected[expectedcount ++] = i;
3742         }
3743 
3744         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3745     }
3746     delete bi;
3747 #endif
3748 }
3749 
TestSentBreaks(void)3750 void RBBITest::TestSentBreaks(void)
3751 {
3752 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3753     Locale        locale("en");
3754     UErrorCode    status = U_ZERO_ERROR;
3755     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3756     UChar         str[200];
3757     static const char *strlist[] =
3758     {
3759      "Now\ris\nthe\r\ntime\n\rfor\r\r",
3760      "This\n",
3761      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3762      "\"Sentence ending with a quote.\" Bye.",
3763      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
3764      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3765      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3766      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3767      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3768      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3769      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3770              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3771              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3772              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3773      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3774              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3775              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3776              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3777              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3778              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3779     };
3780     int loop;
3781     if (U_FAILURE(status)) {
3782         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3783         return;
3784     }
3785     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3786         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3787         UnicodeString ustr(str);
3788 
3789         RBBISentMonkey monkey;
3790         if (U_FAILURE(monkey.deferredStatus)) {
3791             continue;
3792         }
3793 
3794         const int EXPECTEDSIZE = 50;
3795         int expected[EXPECTEDSIZE];
3796         int expectedcount = 0;
3797 
3798         monkey.setText(ustr);
3799 
3800         int i;
3801         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3802             if (expectedcount >= EXPECTEDSIZE) {
3803                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3804                 return;
3805             }
3806             expected[expectedcount ++] = i;
3807         }
3808 
3809         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3810     }
3811     delete bi;
3812 #endif
3813 }
3814 
TestMonkey()3815 void RBBITest::TestMonkey() {
3816 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3817 
3818     UErrorCode     status    = U_ZERO_ERROR;
3819     int32_t        loopCount = 500;
3820     int32_t        seed      = 1;
3821     UnicodeString  breakType = "all";
3822     Locale         locale("en");
3823     UBool          useUText  = FALSE;
3824 
3825     if (quick == FALSE) {
3826         loopCount = 10000;
3827     }
3828 
3829     if (fTestParams) {
3830         UnicodeString p(fTestParams);
3831         loopCount = getIntParam("loop", p, loopCount);
3832         seed      = getIntParam("seed", p, seed);
3833 
3834         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3835         if (m.find()) {
3836             breakType = m.group(1, status);
3837             m.reset();
3838             p = m.replaceFirst("", status);
3839         }
3840 
3841         RegexMatcher u(" *utext", p, 0, status);
3842         if (u.find()) {
3843             useUText = TRUE;
3844             u.reset();
3845             p = u.replaceFirst("", status);
3846         }
3847 
3848 
3849         // m.reset(p);
3850         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3851             // Each option is stripped out of the option string as it is processed.
3852             // All options have been checked.  The option string should have been completely emptied..
3853             char buf[100];
3854             p.extract(buf, sizeof(buf), NULL, status);
3855             buf[sizeof(buf)-1] = 0;
3856             errln("Unrecognized or extra parameter:  %s\n", buf);
3857             return;
3858         }
3859 
3860     }
3861 
3862     if (breakType == "char" || breakType == "all") {
3863         RBBICharMonkey  m;
3864         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3865         if (U_SUCCESS(status)) {
3866             RunMonkey(bi, m, "char", seed, loopCount, useUText);
3867             if (breakType == "all" && useUText==FALSE) {
3868                 // Also run a quick test with UText when "all" is specified
3869                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3870             }
3871         }
3872         else {
3873             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3874         }
3875         delete bi;
3876     }
3877 
3878     if (breakType == "word" || breakType == "all") {
3879         logln("Word Break Monkey Test");
3880         RBBIWordMonkey  m;
3881         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
3882         if (U_SUCCESS(status)) {
3883             RunMonkey(bi, m, "word", seed, loopCount, useUText);
3884         }
3885         else {
3886             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3887         }
3888         delete bi;
3889     }
3890 
3891     if (breakType == "line" || breakType == "all") {
3892         logln("Line Break Monkey Test");
3893         RBBILineMonkey  m;
3894         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
3895         if (loopCount >= 10) {
3896             loopCount = loopCount / 5;   // Line break runs slower than the others.
3897         }
3898         if (U_SUCCESS(status)) {
3899             RunMonkey(bi, m, "line", seed, loopCount, useUText);
3900         }
3901         else {
3902             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3903         }
3904         delete bi;
3905     }
3906 
3907     if (breakType == "sent" || breakType == "all"  ) {
3908         logln("Sentence Break Monkey Test");
3909         RBBISentMonkey  m;
3910         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
3911         if (loopCount >= 10) {
3912             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
3913         }
3914         if (U_SUCCESS(status)) {
3915             RunMonkey(bi, m, "sent", seed, loopCount, useUText);
3916         }
3917         else {
3918             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3919         }
3920         delete bi;
3921     }
3922 
3923 #endif
3924 }
3925 
3926 //
3927 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
3928 //    Parameters:
3929 //       bi      - the break iterator to use
3930 //       mk      - MonkeyKind, abstraction for obtaining expected results
3931 //       name    - Name of test (char, word, etc.) for use in error messages
3932 //       seed    - Seed for starting random number generator (parameter from user)
3933 //       numIterations
3934 //
RunMonkey(BreakIterator * bi,RBBIMonkeyKind & mk,const char * name,uint32_t seed,int32_t numIterations,UBool useUText)3935 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
3936                          int32_t numIterations, UBool useUText) {
3937 
3938 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3939 
3940     const int32_t    TESTSTRINGLEN = 500;
3941     UnicodeString    testText;
3942     int32_t          numCharClasses;
3943     UVector          *chClasses;
3944     int              expectedCount = 0;
3945     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
3946     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
3947     char             reverseBreaks[TESTSTRINGLEN*2+1];
3948     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
3949     char             followingBreaks[TESTSTRINGLEN*2+1];
3950     char             precedingBreaks[TESTSTRINGLEN*2+1];
3951     int              i;
3952     int              loopCount = 0;
3953 
3954 
3955     m_seed = seed;
3956 
3957     numCharClasses = mk.charClasses()->size();
3958     chClasses      = mk.charClasses();
3959 
3960     // Check for errors that occurred during the construction of the MonkeyKind object.
3961     //  Can't report them where they occurred because errln() is a method coming from intlTest,
3962     //  and is not visible outside of RBBITest :-(
3963     if (U_FAILURE(mk.deferredStatus)) {
3964         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
3965         return;
3966     }
3967 
3968     // Verify that the character classes all have at least one member.
3969     for (i=0; i<numCharClasses; i++) {
3970         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
3971         if (s == NULL || s->size() == 0) {
3972             errln("Character Class #%d is null or of zero size.", i);
3973             return;
3974         }
3975     }
3976 
3977     // For minimizing width of class name output.
3978     int classNameSize = mk.maxClassNameSize();
3979 
3980     while (loopCount < numIterations || numIterations == -1) {
3981         if (numIterations == -1 && loopCount % 10 == 0) {
3982             // If test is running in an infinite loop, display a periodic tic so
3983             //   we can tell that it is making progress.
3984             fprintf(stderr, ".");
3985         }
3986         // Save current random number seed, so that we can recreate the random numbers
3987         //   for this loop iteration in event of an error.
3988         seed = m_seed;
3989 
3990         // Populate a test string with data.
3991         testText.truncate(0);
3992         for (i=0; i<TESTSTRINGLEN; i++) {
3993             int32_t  aClassNum = m_rand() % numCharClasses;
3994             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
3995             int32_t   charIdx = m_rand() % classSet->size();
3996             UChar32   c = classSet->charAt(charIdx);
3997             if (c < 0) {   // TODO:  deal with sets containing strings.
3998                 errln("%s:%d c < 0", __FILE__, __LINE__);
3999                 break;
4000             }
4001             // Do not assemble a supplementary character from randomly generated separate surrogates.
4002             //   (It could be a dictionary character)
4003             if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4004                 continue;
4005             }
4006 
4007             testText.append(c);
4008         }
4009 
4010         // Calculate the expected results for this test string and reset applied rules.
4011         mk.setText(testText);
4012 
4013         memset(expectedBreaks, 0, sizeof(expectedBreaks));
4014         expectedBreaks[0] = 1;
4015         int32_t breakPos = 0;
4016         expectedCount = 0;
4017         for (;;) {
4018             breakPos = mk.next(breakPos);
4019             if (breakPos == -1) {
4020                 break;
4021             }
4022             if (breakPos > testText.length()) {
4023                 errln("breakPos > testText.length()");
4024             }
4025             expectedBreaks[breakPos] = 1;
4026             expectedCount++;
4027             U_ASSERT(expectedCount<testText.length());
4028         }
4029 
4030         // Find the break positions using forward iteration
4031         memset(forwardBreaks, 0, sizeof(forwardBreaks));
4032         if (useUText) {
4033             UErrorCode status = U_ZERO_ERROR;
4034             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4035             // testUText = utext_openUnicodeString(testUText, &testText, &status);
4036             bi->setText(testUText, status);
4037             TEST_ASSERT_SUCCESS(status);
4038             utext_close(testUText);   // The break iterator does a shallow clone of the UText
4039                                       //  This UText can be closed immediately, so long as the
4040                                       //  testText string continues to exist.
4041         } else {
4042             bi->setText(testText);
4043         }
4044 
4045         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4046             if (i < 0 || i > testText.length()) {
4047                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4048                 break;
4049             }
4050             forwardBreaks[i] = 1;
4051         }
4052 
4053         // Find the break positions using reverse iteration
4054         memset(reverseBreaks, 0, sizeof(reverseBreaks));
4055         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4056             if (i < 0 || i > testText.length()) {
4057                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4058                 break;
4059             }
4060             reverseBreaks[i] = 1;
4061         }
4062 
4063         // Find the break positions using isBoundary() tests.
4064         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4065         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4066         for (i=0; i<=testText.length(); i++) {
4067             isBoundaryBreaks[i] = bi->isBoundary(i);
4068         }
4069 
4070 
4071         // Find the break positions using the following() function.
4072         // printf(".");
4073         memset(followingBreaks, 0, sizeof(followingBreaks));
4074         int32_t   lastBreakPos = 0;
4075         followingBreaks[0] = 1;
4076         for (i=0; i<testText.length(); i++) {
4077             breakPos = bi->following(i);
4078             if (breakPos <= i ||
4079                 breakPos < lastBreakPos ||
4080                 breakPos > testText.length() ||
4081                 (breakPos > lastBreakPos && lastBreakPos > i)) {
4082                 errln("%s break monkey test: "
4083                     "Out of range value returned by BreakIterator::following().\n"
4084                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4085                          name, seed, i, breakPos, lastBreakPos);
4086                 break;
4087             }
4088             followingBreaks[breakPos] = 1;
4089             lastBreakPos = breakPos;
4090         }
4091 
4092         // Find the break positions using the preceding() function.
4093         memset(precedingBreaks, 0, sizeof(precedingBreaks));
4094         lastBreakPos = testText.length();
4095         precedingBreaks[testText.length()] = 1;
4096         for (i=testText.length(); i>0; i--) {
4097             breakPos = bi->preceding(i);
4098             if (breakPos >= i ||
4099                 breakPos > lastBreakPos ||
4100                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4101                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4102                 errln("%s break monkey test: "
4103                     "Out of range value returned by BreakIterator::preceding().\n"
4104                     "index=%d;  prev returned %d; lastBreak=%d" ,
4105                     name,  i, breakPos, lastBreakPos);
4106                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4107                     precedingBreaks[i] = 2;   // Forces an error.
4108                 }
4109             } else {
4110                 if (breakPos >= 0) {
4111                     precedingBreaks[breakPos] = 1;
4112                 }
4113                 lastBreakPos = breakPos;
4114             }
4115         }
4116 
4117         // Compare the expected and actual results.
4118         for (i=0; i<=testText.length(); i++) {
4119             const char *errorType = NULL;
4120             const char* currentBreakData = NULL;
4121             if  (forwardBreaks[i] != expectedBreaks[i]) {
4122                 errorType = "next()";
4123                 currentBreakData = forwardBreaks;
4124             } else if (reverseBreaks[i] != forwardBreaks[i]) {
4125                 errorType = "previous()";
4126                 currentBreakData = reverseBreaks;
4127            } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4128                 errorType = "isBoundary()";
4129                 currentBreakData = isBoundaryBreaks;
4130             } else if (followingBreaks[i] != expectedBreaks[i]) {
4131                 errorType = "following()";
4132                 currentBreakData = followingBreaks;
4133             } else if (precedingBreaks[i] != expectedBreaks[i]) {
4134                 errorType = "preceding()";
4135                 currentBreakData = precedingBreaks;
4136             }
4137 
4138             if (errorType != NULL) {
4139                 // Format a range of the test text that includes the failure as
4140                 //  a data item that can be included in the rbbi test data file.
4141 
4142                 // Start of the range is the last point where expected and actual results
4143                 //  both agreed that there was a break position.
4144 
4145                 int startContext = i;
4146                 int32_t count = 0;
4147                 for (;;) {
4148                     if (startContext==0) { break; }
4149                     startContext --;
4150                     if (expectedBreaks[startContext] != 0) {
4151                         if (count == 2) break;
4152                         count ++;
4153                     }
4154                 }
4155 
4156                 // End of range is two expected breaks past the start position.
4157                 int endContext = i + 1;
4158                 int ci;
4159                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4160                     for (;;) {
4161                         if (endContext >= testText.length()) {break;}
4162                         if (expectedBreaks[endContext-1] != 0) {
4163                             if (count == 0) break;
4164                             count --;
4165                         }
4166                         endContext ++;
4167                     }
4168                 }
4169 
4170                 // Formatting of each line includes:
4171                 //   character code
4172                 //   reference break: '|' -> a break, '.' -> no break
4173                 //   actual break:    '|' -> a break, '.' -> no break
4174                 //   (name of character clase)
4175                 //   Unicode name of character
4176                 //   '-->' indicates location of the difference.
4177 
4178                 MONKEY_ERROR(
4179                     (expectedBreaks[i] ? "Break expected but not found" :
4180                        "Break found but not expected"),
4181                     name, i, seed);
4182 
4183                 for (ci=startContext; (ci = testText.moveIndex32(ci, 1));) {
4184                     UChar32  c;
4185                     c = testText.char32At(ci);
4186 
4187                     std::string currentLineFlag = "   ";
4188                     if (ci == i) {
4189                         currentLineFlag = "-->";  // Error position
4190                     }
4191 
4192                     // BMP or SMP character in hex
4193                     char hexCodePoint[12];
4194                     std::string format = "    \\u%04x";
4195                     if (c >= 0x10000) {
4196                         format = "\\U%08x";
4197                     }
4198                     sprintf(hexCodePoint, format.c_str(), c);
4199 
4200                     // Get the class name and character name for the character.
4201                     char cName[200];
4202                     UErrorCode status = U_ZERO_ERROR;
4203                     u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
4204 
4205                     char buffer[200];
4206                     auto ret = snprintf(buffer, UPRV_LENGTHOF(buffer),
4207                              "%4s %3i :  %1s  %1s  %10s  %-*s  %-40s  %-40s",
4208                              currentLineFlag.c_str(),
4209                              ci,
4210                              expectedBreaks[ci] == 0 ? "." : "|",  // Reference break
4211                              currentBreakData[ci] == 0 ? "." : "|",  // Actual break
4212                              hexCodePoint,
4213                              classNameSize,
4214                              mk.classNameFromCodepoint(c).c_str(),
4215                              mk.getAppliedRule(ci).c_str(), cName);
4216                     (void)ret;
4217                     U_ASSERT(0 <= ret && ret < UPRV_LENGTHOF(buffer));
4218 
4219                     // Output the error
4220                     if (ci == i) {
4221                         errln(buffer);
4222                     } else {
4223                         infoln(buffer);
4224                     }
4225 
4226                     if (ci >= endContext) { break; }
4227                 }
4228                 break;
4229             }
4230         }
4231 
4232         loopCount++;
4233     }
4234 #endif
4235 }
4236 
4237 
4238 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
4239 //             This test checks the initial patch,
4240 //             which is to just keep it from crashing.  Correct word boundaries
4241 //             await a proper fix to the dictionary code.
4242 //
TestBug5532(void)4243 void RBBITest::TestBug5532(void)  {
4244    // Text includes a mixture of Thai and Latin.
4245    const unsigned char utf8Data[] = {
4246            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4247            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4248            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4249            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4250            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4251            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4252            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4253            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4254            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4255            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4256            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4257 
4258     UErrorCode status = U_ZERO_ERROR;
4259     UText utext=UTEXT_INITIALIZER;
4260     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4261     TEST_ASSERT_SUCCESS(status);
4262 
4263     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4264     TEST_ASSERT_SUCCESS(status);
4265     if (U_SUCCESS(status)) {
4266         bi->setText(&utext, status);
4267         TEST_ASSERT_SUCCESS(status);
4268 
4269         int32_t breakCount = 0;
4270         int32_t previousBreak = -1;
4271         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4272             // For now, just make sure that the break iterator doesn't hang.
4273             TEST_ASSERT(previousBreak < bi->current());
4274             previousBreak = bi->current();
4275         }
4276         TEST_ASSERT(breakCount > 0);
4277     }
4278     delete bi;
4279     utext_close(&utext);
4280 }
4281 
4282 
TestBug9983(void)4283 void RBBITest::TestBug9983(void)  {
4284     UnicodeString text = UnicodeString("\\u002A"  // * Other
4285                                        "\\uFF65"  //   Other
4286                                        "\\u309C"  //   Katakana
4287                                        "\\uFF9F"  //   Extend
4288                                        "\\uFF65"  //   Other
4289                                        "\\u0020"  //   Other
4290                                        "\\u0000").unescape();
4291 
4292     UErrorCode status = U_ZERO_ERROR;
4293     LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4294         BreakIterator::createWordInstance(Locale::getRoot(), status)));
4295     TEST_ASSERT_SUCCESS(status);
4296     LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4297         BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4298     TEST_ASSERT_SUCCESS(status);
4299     if (U_FAILURE(status)) {
4300         return;
4301     }
4302     int32_t offset, rstatus, iterationCount;
4303 
4304     brkiter->setText(text);
4305     brkiter->last();
4306     iterationCount = 0;
4307     while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4308         iterationCount++;
4309         rstatus = brkiter->getRuleStatus();
4310         (void)rstatus;     // Suppress set but not used warning.
4311         if (iterationCount >= 10) {
4312            break;
4313         }
4314     }
4315     TEST_ASSERT(iterationCount == 6);
4316 
4317     brkiterPOSIX->setText(text);
4318     brkiterPOSIX->last();
4319     iterationCount = 0;
4320     while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4321         iterationCount++;
4322         rstatus = brkiterPOSIX->getRuleStatus();
4323         (void)rstatus;     // Suppress set but not used warning.
4324         if (iterationCount >= 10) {
4325            break;
4326         }
4327     }
4328     TEST_ASSERT(iterationCount == 6);
4329 }
4330 
4331 // Bug 7547 - verify that building a break itereator from empty rules produces an error.
4332 //
TestBug7547()4333 void RBBITest::TestBug7547() {
4334     UnicodeString rules;
4335     UErrorCode status = U_ZERO_ERROR;
4336     UParseError parseError;
4337     RuleBasedBreakIterator breakIterator(rules, parseError, status);
4338     if (status != U_BRK_RULE_SYNTAX) {
4339         errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4340     }
4341     if (parseError.line != 1 || parseError.offset != 0) {
4342         errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4343     }
4344 }
4345 
4346 
TestBug12797()4347 void RBBITest::TestBug12797() {
4348     UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4349     UErrorCode status = U_ZERO_ERROR;
4350     UParseError parseError;
4351     RuleBasedBreakIterator bi(rules, parseError, status);
4352     if (U_FAILURE(status)) {
4353         errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4354         return;
4355     }
4356     UnicodeString text = "abc";
4357     bi.setText(text);
4358     bi.first();
4359     int32_t boundary = bi.next();
4360     if (boundary != 3) {
4361         errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4362     }
4363 }
4364 
TestBug12918()4365 void RBBITest::TestBug12918() {
4366     // This test triggers an assertion failure in dictbe.cpp
4367     const UChar *crasherString = u"\u3325\u4a16";
4368     UErrorCode status = U_ZERO_ERROR;
4369     UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
4370     if (U_FAILURE(status)) {
4371         dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4372         return;
4373     }
4374     ubrk_first(iter);
4375     int32_t pos = 0;
4376     int32_t lastPos = -1;
4377     while((pos = ubrk_next(iter)) != UBRK_DONE) {
4378         if (pos <= lastPos) {
4379             errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4380             break;
4381         }
4382     }
4383     ubrk_close(iter);
4384 }
4385 
TestBug12932()4386 void RBBITest::TestBug12932() {
4387     // Node Stack overflow in the RBBI rule parser caused a seg fault.
4388     UnicodeString ruleStr(
4389             "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4390             "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4391             "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4392             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4393             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4394             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4395 
4396     UErrorCode status = U_ZERO_ERROR;
4397     UParseError parseError;
4398     RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
4399     if (status != U_BRK_RULE_SYNTAX) {
4400         errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4401                 __FILE__, __LINE__, u_errorName(status));
4402     }
4403 }
4404 
4405 
4406 // Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
4407 //             remain undevided by ICU char, word and line break.
TestEmoji()4408 void RBBITest::TestEmoji() {
4409 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4410     UErrorCode  status = U_ZERO_ERROR;
4411 
4412     CharString testFileName;
4413     testFileName.append(IntlTest::getSourceTestData(status), status);
4414     testFileName.appendPathPart("emoji-test.txt", status);
4415     if (U_FAILURE(status)) {
4416         errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4417         return;
4418     }
4419     logln("Opening data file %s\n", testFileName.data());
4420 
4421     int    len;
4422     UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
4423     if (U_FAILURE(status) || testFile == NULL) {
4424         errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4425         return;
4426     }
4427     UnicodeString testFileAsString(testFile, len);
4428     delete [] testFile;
4429 
4430     RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
4431     RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
4432     //           hexMatcher group(1) is a hex number, or empty string if no hex number present.
4433     int32_t lineNumber = 0;
4434 
4435     LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4436     LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4437     LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
4438     if (U_FAILURE(status)) {
4439         dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
4440         return;
4441     }
4442 
4443     while (lineMatcher.find()) {
4444         ++lineNumber;
4445         UnicodeString line = lineMatcher.group(status);
4446         hexMatcher.reset(line);
4447         UnicodeString testString;   // accumulates the emoji sequence.
4448         while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
4449             UnicodeString hex = hexMatcher.group(1, status);
4450             if (hex.length() > 8) {
4451                 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
4452                 break;
4453             }
4454             CharString hex8;
4455             hex8.appendInvariantChars(hex, status);
4456             UChar32 c = (UChar32)strtol(hex8.data(), NULL, 16);
4457             if (c<=0x10ffff) {
4458                 testString.append(c);
4459             } else {
4460                 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4461                         __FILE__, __LINE__, lineNumber, hex8.data());
4462                 break;
4463             }
4464         }
4465 
4466         if (testString.length() > 1) {
4467             charBreaks->setText(testString);
4468             charBreaks->first();
4469             int32_t firstBreak = charBreaks->next();
4470             if (testString.length() != firstBreak) {
4471                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4472                         __FILE__, __LINE__, lineNumber, firstBreak);
4473             }
4474             wordBreaks->setText(testString);
4475             wordBreaks->first();
4476             firstBreak = wordBreaks->next();
4477             if (testString.length() != firstBreak) {
4478                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4479                         __FILE__, __LINE__, lineNumber, firstBreak);
4480             }
4481             lineBreaks->setText(testString);
4482             lineBreaks->first();
4483             firstBreak = lineBreaks->next();
4484             if (testString.length() != firstBreak) {
4485                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4486                         __FILE__, __LINE__, lineNumber, firstBreak);
4487             }
4488         }
4489     }
4490 #endif
4491 }
4492 
4493 
4494 // TestBug12519  -  Correct handling of Locales by assignment / copy / clone
4495 
TestBug12519()4496 void RBBITest::TestBug12519() {
4497     UErrorCode status = U_ZERO_ERROR;
4498     LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4499     LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status));
4500     if (!assertSuccess(WHERE, status)) {
4501         dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
4502         return;
4503     }
4504     assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
4505 
4506     assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
4507     assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
4508 
4509     LocalPointer<RuleBasedBreakIterator>cloneEn(biEn->clone());
4510     assertTrue(WHERE, *biEn == *cloneEn);
4511     assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
4512 
4513     LocalPointer<RuleBasedBreakIterator>cloneFr(biFr->clone());
4514     assertTrue(WHERE, *biFr == *cloneFr);
4515     assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
4516 
4517     LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status));
4518     UnicodeString text("Hallo Welt");
4519     biDe->setText(text);
4520     assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
4521     *biDe = *biFr;
4522     assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
4523 }
4524 
TestBug12677()4525 void RBBITest::TestBug12677() {
4526     // Check that stripping of comments from rules for getRules() is not confused by
4527     // the presence of '#' characters in the rules that do not introduce comments.
4528     UnicodeString rules(u"!!forward; \n"
4529                          "$x = [ab#];  # a set with a # literal. \n"
4530                          " # .;        # a comment that looks sort of like a rule.   \n"
4531                          " '#' '?';    # a rule with a quoted #   \n"
4532                        );
4533 
4534     UErrorCode status = U_ZERO_ERROR;
4535     UParseError pe;
4536     RuleBasedBreakIterator bi(rules, pe, status);
4537     assertSuccess(WHERE, status);
4538     UnicodeString rtRules = bi.getRules();
4539     assertEquals(WHERE, UnicodeString(u"!!forward;$x=[ab#];'#''?';"),  rtRules);
4540 }
4541 
4542 
TestTableRedundancies()4543 void RBBITest::TestTableRedundancies() {
4544     UErrorCode status = U_ZERO_ERROR;
4545 
4546     LocalPointer<RuleBasedBreakIterator> bi (
4547         (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
4548     assertSuccess(WHERE, status);
4549     if (U_FAILURE(status)) return;
4550 
4551     RBBIDataWrapper *dw = bi->fData;
4552     const RBBIStateTable *fwtbl = dw->fForwardTable;
4553     UBool in8Bits = fwtbl->fFlags & RBBI_8BITS_ROWS;
4554     int32_t numCharClasses = dw->fHeader->fCatCount;
4555     // printf("Char Classes: %d     states: %d\n", numCharClasses, fwtbl->fNumStates);
4556 
4557     // Check for duplicate columns (character categories)
4558 
4559     std::vector<UnicodeString> columns;
4560     for (int32_t column = 0; column < numCharClasses; column++) {
4561         UnicodeString s;
4562         for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
4563             RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4564             s.append(in8Bits ? row->r8.fNextState[column] : row->r16.fNextState[column]);
4565         }
4566         columns.push_back(s);
4567     }
4568     // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
4569     for (int c1=1; c1<numCharClasses; c1++) {
4570         int limit = c1 < (int)fwtbl->fDictCategoriesStart ? fwtbl->fDictCategoriesStart : numCharClasses;
4571         for (int c2 = c1+1; c2 < limit; c2++) {
4572             if (columns.at(c1) == columns.at(c2)) {
4573                 errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
4574                 goto out;
4575             }
4576         }
4577     }
4578   out:
4579 
4580     // Check for duplicate states
4581     std::vector<UnicodeString> rows;
4582     for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
4583         UnicodeString s;
4584         RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4585         if (in8Bits) {
4586             s.append(row->r8.fAccepting);
4587             s.append(row->r8.fLookAhead);
4588             s.append(row->r8.fTagsIdx);
4589             for (int32_t column = 0; column < numCharClasses; column++) {
4590                 s.append(row->r8.fNextState[column]);
4591             }
4592         } else {
4593             s.append(row->r16.fAccepting);
4594             s.append(row->r16.fLookAhead);
4595             s.append(row->r16.fTagsIdx);
4596             for (int32_t column = 0; column < numCharClasses; column++) {
4597                 s.append(row->r16.fNextState[column]);
4598             }
4599         }
4600         rows.push_back(s);
4601     }
4602     for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
4603         for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
4604             if (rows.at(r1) == rows.at(r2)) {
4605                 errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
4606                 return;
4607             }
4608         }
4609     }
4610 }
4611 
4612 // Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
4613 //            even after next() has returned DONE.
4614 
TestBug13447()4615 void RBBITest::TestBug13447() {
4616     UErrorCode status = U_ZERO_ERROR;
4617     LocalPointer<RuleBasedBreakIterator> bi(
4618         (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4619     assertSuccess(WHERE, status);
4620     if (U_FAILURE(status)) return;
4621     UnicodeString data(u"1234");
4622     bi->setText(data);
4623     assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus());
4624     assertEquals(WHERE, 4, bi->next());
4625     assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4626     assertEquals(WHERE, UBRK_DONE, bi->next());
4627     assertEquals(WHERE, 4, bi->current());
4628     assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4629 }
4630 
4631 //  TestReverse exercises both the synthesized safe reverse rules and the logic
4632 //  for filling the break iterator cache when starting from random positions
4633 //  in the text.
4634 //
4635 //  It's a monkey test, working on random data, with the expected data obtained
4636 //  from forward iteration (no safe rules involved), comparing with results
4637 //  when indexing into the interior of the string (safe rules needed).
4638 
TestReverse()4639 void RBBITest::TestReverse() {
4640     UErrorCode status = U_ZERO_ERROR;
4641 
4642     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4643             BreakIterator::createCharacterInstance(Locale::getEnglish(), status)));
4644     assertSuccess(WHERE, status, true);
4645     status = U_ZERO_ERROR;
4646     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4647             BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4648     assertSuccess(WHERE, status, true);
4649     status = U_ZERO_ERROR;
4650     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4651             BreakIterator::createLineInstance(Locale::getEnglish(), status)));
4652     assertSuccess(WHERE, status, true);
4653     status = U_ZERO_ERROR;
4654     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4655             BreakIterator::createSentenceInstance(Locale::getEnglish(), status)));
4656     assertSuccess(WHERE, status, true);
4657 }
4658 
TestReverse(std::unique_ptr<RuleBasedBreakIterator> bi)4659 void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
4660     if (!bi) {
4661         return;
4662     }
4663 
4664     // From the mapping trie in the break iterator's internal data, create a
4665     // vector of UnicodeStrings, one for each character category, containing
4666     // all of the code points that map to that category. Unicode planes 0 and 1 only,
4667     // to avoid an execess of unassigned code points.
4668 
4669     RBBIDataWrapper *data = bi->fData;
4670     int32_t categoryCount = data->fHeader->fCatCount;
4671     UCPTrie *trie = data->fTrie;
4672     bool use8BitsTrie = ucptrie_getValueWidth(trie) == UCPTRIE_VALUE_BITS_8;
4673     uint32_t dictBit = use8BitsTrie ? 0x0080 : 0x4000;
4674 
4675     std::vector<UnicodeString> strings(categoryCount, UnicodeString());
4676     for (int cp=0; cp<0x1fff0; ++cp) {
4677         int cat = ucptrie_get(trie, cp);
4678         cat &= ~dictBit;    // And off the dictionary bit from the category.
4679         assertTrue(WHERE, cat < categoryCount && cat >= 0);
4680         if (cat < 0 || cat >= categoryCount) return;
4681         strings[cat].append(cp);
4682     }
4683 
4684     icu_rand randomGen;
4685     const int testStringLength = 10000;
4686     UnicodeString testString;
4687 
4688     for (int i=0; i<testStringLength; ++i) {
4689         int charClass = randomGen() % categoryCount;
4690         if (strings[charClass].length() > 0) {
4691             int cp = strings[charClass].char32At(randomGen() % strings[charClass].length());
4692             testString.append(cp);
4693         }
4694     }
4695 
4696     typedef std::pair<UBool, int32_t> Result;
4697     std::vector<Result> expectedResults;
4698     bi->setText(testString);
4699     for (int i=0; i<testString.length(); ++i) {
4700         bool isboundary = bi->isBoundary(i);
4701         int  ruleStatus = bi->getRuleStatus();
4702         expectedResults.push_back(std::make_pair(isboundary, ruleStatus));
4703     }
4704 
4705     for (int i=testString.length()-1; i>=0; --i) {
4706         bi->setText(testString);   // clears the internal break cache
4707         Result expected = expectedResults[i];
4708         assertEquals(WHERE, expected.first, bi->isBoundary(i));
4709         assertEquals(WHERE, expected.second, bi->getRuleStatus());
4710     }
4711 }
4712 
4713 
4714 // Ticket 13692 - finding word boundaries in very large numbers or words could
4715 //                be very time consuming. When the problem was present, this void test
4716 //                would run more than fifteen minutes, which is to say, the failure was noticeale.
4717 
TestBug13692()4718 void RBBITest::TestBug13692() {
4719     UErrorCode status = U_ZERO_ERROR;
4720     LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4721             BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4722     if (!assertSuccess(WHERE, status, true)) {
4723         return;
4724     }
4725     constexpr int32_t LENGTH = 1000000;
4726     UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH);
4727     for (int i=0; i<20; i+=2) {
4728         longNumber.setCharAt(i, u' ');
4729     }
4730     bi->setText(longNumber);
4731     assertFalse(WHERE, bi->isBoundary(LENGTH-5));
4732     assertSuccess(WHERE, status);
4733 }
4734 
4735 
TestProperties()4736 void RBBITest::TestProperties() {
4737     UErrorCode errorCode = U_ZERO_ERROR;
4738     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4739     if (!prependSet.isEmpty()) {
4740         errln(
4741             "[:GCB=Prepend:] is not empty any more. "
4742             "Uncomment relevant lines in source/data/brkitr/char.txt and "
4743             "change this test to the opposite condition.");
4744     }
4745 }
4746 
4747 
4748 //
4749 //  TestDebug    -  A place-holder test for debugging purposes.
4750 //                  For putting in fragments of other tests that can be invoked
4751 //                  for tracing  without a lot of unwanted extra stuff happening.
4752 //
TestDebug(void)4753 void RBBITest::TestDebug(void) {
4754     UErrorCode status = U_ZERO_ERROR;
4755     LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4756             BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4757     if (!assertSuccess(WHERE, status, true)) {
4758         return;
4759     }
4760     const UnicodeString &rules = bi->getRules();
4761     UParseError pe;
4762     LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
4763     assertSuccess(WHERE, status);
4764 }
4765 
4766 
//
//  TestDebugRules   A stub test for use in debugging rule compilation problems.
//                   Can be freely altered as needed or convenient.
//                   Leave disabled - #ifdef'ed out - when not actively debugging. The rule source
//                   data files may not be available in all environments.
//                   Any permanent test cases should be moved to rbbitst.txt
//                   (see Bug 20303 in that file, for example), or to another test function in this file.
//
//                   With the body disabled, as it is here, the function does nothing.
//
void RBBITest::TestDebugRules() {
#if 0
    // Rules under investigation. Edit freely while debugging.
    const char16_t *rules = u""
        "!!quoted_literals_only; \n"
        "!!chain; \n"
        "!!lookAheadHardBreak; \n"
        " \n"
        // "[a] / ; \n"
        "[a] [b] / [c] [d]; \n"
        "[a] [b] / [c] [d] {100}; \n"
        "[x] [a] [b] / [c] [d] {100}; \n"
        "[a] [b] [c] / [d] {100}; \n"
        //" [c] [d] / [e] [f]; \n"
        //"[a] [b] / [c]; \n"
        ;

    // Read brkitr/rules/line.txt from the data directory, as an alternative rule source.
    UErrorCode status = U_ZERO_ERROR;
    CharString path(pathToDataDirectory(), status);
    path.appendPathPart("brkitr", status);
    path.appendPathPart("rules", status);
    path.appendPathPart("line.txt", status);
    int    len;
    std::unique_ptr<UChar []> testFile(ReadAndConvertFile(path.data(), len, "UTF-8", status));
    if (!assertSuccess(WHERE, status)) {
        return;
    }

    UParseError pe;
    // Uncomment to compile the rules read from the file instead of the literal above.
    // rules = testFile.get();
    RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rules, pe, status);

    if (!assertSuccess(WHERE, status)) {
        delete bi;
        return;
    }
    // bi->dumpTables();

    delete bi;
#endif
}
4815 
testTrieStateTable(int32_t numChar,bool expectedTrieWidthIn8Bits,bool expectedStateRowIn8Bits)4816 void RBBITest::testTrieStateTable(int32_t numChar, bool expectedTrieWidthIn8Bits, bool expectedStateRowIn8Bits) {
4817     UCPTrieValueWidth expectedTrieWidth = expectedTrieWidthIn8Bits ? UCPTRIE_VALUE_BITS_8 : UCPTRIE_VALUE_BITS_16;
4818     int32_t expectedStateRowBits = expectedStateRowIn8Bits ? RBBI_8BITS_ROWS : 0;
4819     // Text are duplicate characters from U+4E00 to U+4FFF
4820     UnicodeString text;
4821     for (UChar c = 0x4e00; c < 0x5000; c++) {
4822         text.append(c).append(c);
4823     }
4824     // Generate rule which will caused length+4 character classes and
4825     // length+3 states
4826     UnicodeString rules(u"!!quoted_literals_only;");
4827     for (UChar c = 0x4e00; c < 0x4e00 + numChar; c++) {
4828         rules.append(u'\'').append(c).append(c).append(u"';");
4829     }
4830     rules.append(u".;");
4831     UErrorCode status = U_ZERO_ERROR;
4832     UParseError parseError;
4833     RuleBasedBreakIterator bi(rules, parseError, status);
4834 
4835     assertEquals(WHERE, numChar + 4, bi.fData->fHeader->fCatCount);
4836     assertEquals(WHERE, numChar + 3, bi.fData->fForwardTable->fNumStates);
4837     assertEquals(WHERE, expectedTrieWidth, ucptrie_getValueWidth(bi.fData->fTrie));
4838     assertEquals(WHERE, expectedStateRowBits, bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS);
4839     assertEquals(WHERE, expectedStateRowBits, bi.fData->fReverseTable->fFlags & RBBI_8BITS_ROWS);
4840 
4841     bi.setText(text);
4842 
4843     int32_t pos;
4844     int32_t i = 0;
4845     while ((pos = bi.next()) > 0) {
4846         // The first numChar should not break between the pair
4847         if (i++ < numChar) {
4848             assertEquals(WHERE, i * 2, pos);
4849         } else {
4850             // After the first numChar next(), break on each character.
4851             assertEquals(WHERE, i + numChar, pos);
4852         }
4853     }
4854     while ((pos = bi.previous()) > 0) {
4855         // The first numChar should not break between the pair
4856         if (--i < numChar) {
4857             assertEquals(WHERE, i * 2, pos);
4858         } else {
4859             // After the first numChar next(), break on each character.
4860             assertEquals(WHERE, i + numChar, pos);
4861         }
4862     }
4863 }
4864 
Test8BitsTrieWith8BitStateTable()4865 void RBBITest::Test8BitsTrieWith8BitStateTable() {
4866     testTrieStateTable(251, true /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
4867 }
4868 
Test16BitsTrieWith8BitStateTable()4869 void RBBITest::Test16BitsTrieWith8BitStateTable() {
4870     testTrieStateTable(252, false /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
4871 }
4872 
Test16BitsTrieWith16BitStateTable()4873 void RBBITest::Test16BitsTrieWith16BitStateTable() {
4874     testTrieStateTable(253, false /* expectedTrieWidthIn8Bits */, false /* expectedStateRowIn8Bits */);
4875 }
4876 
Test8BitsTrieWith16BitStateTable()4877 void RBBITest::Test8BitsTrieWith16BitStateTable() {
4878     // Test UCPTRIE_VALUE_BITS_8 with 16 bits rows. Use a different approach to
4879     // create state table in 16 bits.
4880 
4881     // Generate 510 'a' as text
4882     UnicodeString text;
4883     for (int32_t i = 0; i < 510; i++) {
4884         text.append(u'a');
4885     }
4886 
4887     UnicodeString rules(u"!!quoted_literals_only;'");
4888     // 254 'a' in the rule will cause 256 states
4889     for (int32_t i = 0; i < 254; i++) {
4890         rules.append(u'a');
4891     }
4892     rules.append(u"';.;");
4893 
4894     UErrorCode status = U_ZERO_ERROR;
4895     UParseError parseError;
4896     LocalPointer<RuleBasedBreakIterator> bi(new RuleBasedBreakIterator(rules, parseError, status));
4897 
4898     assertEquals(WHERE, 256, bi->fData->fForwardTable->fNumStates);
4899     assertEquals(WHERE, UCPTRIE_VALUE_BITS_8, ucptrie_getValueWidth(bi->fData->fTrie));
4900     assertEquals(WHERE,
4901                  false, RBBI_8BITS_ROWS == (bi->fData->fForwardTable->fFlags & RBBI_8BITS_ROWS));
4902     bi->setText(text);
4903 
4904     // break positions:
4905     // 254, 508, 509, ... 510
4906     assertEquals("next()", 254, bi->next());
4907     int32_t i = 0;
4908     int32_t pos;
4909     while ((pos = bi->next()) > 0) {
4910         assertEquals(WHERE, 508 + i , pos);
4911         i++;
4912     }
4913     i = 0;
4914     while ((pos = bi->previous()) > 0) {
4915         i++;
4916         if (pos >= 508) {
4917             assertEquals(WHERE, 510 - i , pos);
4918         } else {
4919             assertEquals(WHERE, 254 , pos);
4920         }
4921     }
4922 }
4923 
4924 // Test that both compact (8 bit) and full sized (16 bit) rbbi tables work, and
4925 // that there are no problems with rules at the size that transitions between the two.
4926 //
4927 // A rule that matches a literal string, like 'abcdefghij', will require one state and
4928 // one character class per character in the string. So we can make a rule to tickle the
4929 // boundaries by using literal strings of various lengths.
4930 //
4931 // For both the number of states and the number of character classes, the eight bit format
4932 // only has 7 bits available, allowing for 128 values. For both, a few values are reserved,
4933 // leaving 120 something available. This test runs the string over the range of 120 - 130,
4934 // which allows some margin for changes to the number of values reserved by the rule builder
4935 // without breaking the test.
4936 
TestTable_8_16_Bits()4937 void RBBITest::TestTable_8_16_Bits() {
4938 
4939     // testStr serves as both the source of the rule string (truncated to the desired length)
4940     // and as test data to check matching behavior. A break rule consisting of the first 120
4941     // characters of testStr will match the first 120 chars of the full-length testStr.
4942     UnicodeString testStr;
4943     for (UChar c=0x3000; c<0x3200; ++c) {
4944         testStr.append(c);
4945     }
4946 
4947     const int32_t startLength = 120;   // The shortest rule string to test.
4948     const int32_t endLength = 260;     // The longest rule string to test
4949     const int32_t increment = this->quick ? endLength - startLength : 1;
4950 
4951     for (int32_t ruleLen=startLength; ruleLen <= endLength; ruleLen += increment) {
4952         UParseError parseError;
4953         UErrorCode status = U_ZERO_ERROR;
4954 
4955         UnicodeString ruleString{u"!!quoted_literals_only; '#';"};
4956         ruleString.findAndReplace(UnicodeString(u"#"), UnicodeString(testStr, 0, ruleLen));
4957         RuleBasedBreakIterator bi(ruleString, parseError, status);
4958         if (!assertSuccess(WHERE, status)) {
4959             errln(ruleString);
4960             break;
4961         }
4962         // bi.dumpTables();
4963 
4964         // Verify that the break iterator is functioning - that the first boundary found
4965         // in testStr is at the length of the rule string.
4966         bi.setText(testStr);
4967         assertEquals(WHERE, ruleLen, bi.next());
4968 
4969         // Reverse iteration. Do a setText() first, to flush the break iterator's internal cache
4970         // of previously detected boundaries, thus forcing the engine to run the safe reverse rules.
4971         bi.setText(testStr);
4972         int32_t result = bi.preceding(ruleLen);
4973         assertEquals(WHERE, 0, result);
4974 
4975         // Verify that the range of rule lengths being tested cover the translations
4976         // from 8 to 16 bit data.
4977         bool has8BitRowData = bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS;
4978         bool has8BitsTrie = ucptrie_getValueWidth(bi.fData->fTrie) == UCPTRIE_VALUE_BITS_8;
4979 
4980         if (ruleLen == startLength) {
4981             assertEquals(WHERE, true, has8BitRowData);
4982             assertEquals(WHERE, true, has8BitsTrie);
4983         }
4984         if (ruleLen == endLength) {
4985             assertEquals(WHERE, false, has8BitRowData);
4986             assertEquals(WHERE, false, has8BitsTrie);
4987         }
4988     }
4989 }
4990 
4991 /* Test handling of a large number of look-ahead rules.
4992  * The number of rules in the test exceeds the implementation limits prior to the
4993  * improvements introduced with #13590.
4994  *
4995  * The test look-ahead rules have the form "AB / CE"; "CD / EG"; ...
4996  * The text being matched is sequential, "ABCDEFGHI..."
4997  *
4998  * The upshot is that the look-ahead rules all match on their preceding context,
4999  * and consequently must save a potential result, but then fail to match on their
5000  * trailing context, so that they don't actually cause a boundary.
5001  *
5002  * Additionally, add a ".*" rule, so there are no boundaries unless a
5003  * look-ahead hard-break rule forces one.
5004  */
TestBug13590()5005 void RBBITest::TestBug13590() {
5006     UnicodeString rules {u"!!quoted_literals_only; !!chain; .*;\n"};
5007 
5008     const int NUM_LOOKAHEAD_RULES = 50;
5009     const char16_t STARTING_CHAR = u'\u5000';
5010     char16_t firstChar;
5011     for (int ruleNum = 0; ruleNum < NUM_LOOKAHEAD_RULES; ++ruleNum) {
5012         firstChar = STARTING_CHAR + ruleNum*2;
5013         rules.append(u'\'') .append(firstChar) .append(firstChar+1) .append(u'\'')
5014              .append(u' ') .append(u'/') .append(u' ')
5015              .append(u'\'') .append(firstChar+2) .append(firstChar+4) .append(u'\'')
5016              .append(u';') .append(u'\n');
5017     }
5018 
5019     // Change the last rule added from the form "UV / WY" to "UV / WX".
5020     // Changes the rule so that it will match - all 4 chars are in ascending sequence.
5021     rules.findAndReplace(UnicodeString(firstChar+4), UnicodeString(firstChar+3));
5022 
5023     UErrorCode status = U_ZERO_ERROR;
5024     UParseError parseError;
5025     RuleBasedBreakIterator bi(rules, parseError, status);
5026     if (!assertSuccess(WHERE, status)) {
5027         errln(rules);
5028         return;
5029     }
5030     // bi.dumpTables();
5031 
5032     UnicodeString testString;
5033     for (char16_t c = STARTING_CHAR-200; c < STARTING_CHAR + NUM_LOOKAHEAD_RULES*4; ++c) {
5034         testString.append(c);
5035     }
5036     bi.setText(testString);
5037 
5038     int breaksFound = 0;
5039     while (bi.next() != UBRK_DONE) {
5040         ++breaksFound;
5041     }
5042 
5043     // Two matches are expected, one from the last rule that was explicitly modified,
5044     // and one at the end of the text.
5045     assertEquals(WHERE, 2, breaksFound);
5046 }
5047 
5048 
5049 #if U_ENABLE_TRACING
// Records collected by the trace callbacks defined below (traceData, traceEntry,
// traceExit), for inspection by the test afterwards.
static std::vector<std::string> gData;      // String argument captured by each traceData() call.
static std::vector<int32_t> gEntryFn;       // Function numbers passed to traceEntry().
static std::vector<int32_t> gExitFn;        // Function numbers passed to traceExit().
static std::vector<int32_t> gDataFn;        // Function numbers passed to traceData().
5054 
traceData(const void *,int32_t fnNumber,int32_t,const char *,va_list args)5055 static void U_CALLCONV traceData(
5056         const void*,
5057         int32_t fnNumber,
5058         int32_t,
5059         const char *,
5060         va_list args) {
5061     if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5062         const char* data = va_arg(args, const char*);
5063         gDataFn.push_back(fnNumber);
5064         gData.push_back(data);
5065     }
5066 }
5067 
traceEntry(const void *,int32_t fnNumber)5068 static void traceEntry(const void *, int32_t fnNumber) {
5069     if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5070         gEntryFn.push_back(fnNumber);
5071     }
5072 }
5073 
traceExit(const void *,int32_t fnNumber,const char *,va_list)5074 static void traceExit(const void *, int32_t fnNumber, const char *, va_list) {
5075     if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5076         gExitFn.push_back(fnNumber);
5077     }
5078 }
5079 
5080 
assertTestTraceResult(int32_t fnNumber,const char * expectedData)5081 void RBBITest::assertTestTraceResult(int32_t fnNumber, const char* expectedData) {
5082     assertEquals("utrace_entry should be called ", 1, gEntryFn.size());
5083     assertEquals("utrace_entry should be called with ", fnNumber, gEntryFn[0]);
5084     assertEquals("utrace_exit should be called ", 1, gExitFn.size());
5085     assertEquals("utrace_exit should be called with ", fnNumber, gExitFn[0]);
5086 
5087     if (expectedData == nullptr) {
5088       assertEquals("utrace_data should not be called ", 0, gDataFn.size());
5089       assertEquals("utrace_data should not be called ", 0, gData.size());
5090     } else {
5091       assertEquals("utrace_data should be called ", 1, gDataFn.size());
5092       assertEquals("utrace_data should be called with ", fnNumber, gDataFn[0]);
5093       assertEquals("utrace_data should be called ", 1, gData.size());
5094       assertEquals("utrace_data should pass in ", expectedData, gData[0].c_str());
5095     }
5096 }
5097 
SetupTestTrace()5098 void SetupTestTrace() {
5099     gEntryFn.clear();
5100     gExitFn.clear();
5101     gDataFn.clear();
5102     gData.clear();
5103 
5104     const void* context = nullptr;
5105     utrace_setFunctions(context, traceEntry, traceExit, traceData);
5106     utrace_setLevel(UTRACE_INFO);
5107 }
5108 
TestTraceCreateCharacter(void)5109 void RBBITest::TestTraceCreateCharacter(void) {
5110     SetupTestTrace();
5111     IcuTestErrorCode status(*this, "TestTraceCreateCharacter");
5112     LocalPointer<BreakIterator> brkitr(
5113         BreakIterator::createCharacterInstance("zh-CN", status));
5114     status.errIfFailureAndReset();
5115     assertTestTraceResult(UTRACE_UBRK_CREATE_CHARACTER, nullptr);
5116 }
5117 
TestTraceCreateTitle(void)5118 void RBBITest::TestTraceCreateTitle(void) {
5119     SetupTestTrace();
5120     IcuTestErrorCode status(*this, "TestTraceCreateTitle");
5121     LocalPointer<BreakIterator> brkitr(
5122         BreakIterator::createTitleInstance("zh-CN", status));
5123     status.errIfFailureAndReset();
5124     assertTestTraceResult(UTRACE_UBRK_CREATE_TITLE, nullptr);
5125 }
5126 
TestTraceCreateSentence(void)5127 void RBBITest::TestTraceCreateSentence(void) {
5128     SetupTestTrace();
5129     IcuTestErrorCode status(*this, "TestTraceCreateSentence");
5130     LocalPointer<BreakIterator> brkitr(
5131         BreakIterator::createSentenceInstance("zh-CN", status));
5132     status.errIfFailureAndReset();
5133     assertTestTraceResult(UTRACE_UBRK_CREATE_SENTENCE, nullptr);
5134 }
5135 
TestTraceCreateWord(void)5136 void RBBITest::TestTraceCreateWord(void) {
5137     SetupTestTrace();
5138     IcuTestErrorCode status(*this, "TestTraceCreateWord");
5139     LocalPointer<BreakIterator> brkitr(
5140         BreakIterator::createWordInstance("zh-CN", status));
5141     status.errIfFailureAndReset();
5142     assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5143 }
5144 
TestTraceCreateLine(void)5145 void RBBITest::TestTraceCreateLine(void) {
5146     SetupTestTrace();
5147     IcuTestErrorCode status(*this, "TestTraceCreateLine");
5148     LocalPointer<BreakIterator> brkitr(
5149         BreakIterator::createLineInstance("zh-CN", status));
5150     status.errIfFailureAndReset();
5151     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line");
5152 }
5153 
TestTraceCreateLineStrict(void)5154 void RBBITest::TestTraceCreateLineStrict(void) {
5155     SetupTestTrace();
5156     IcuTestErrorCode status(*this, "TestTraceCreateLineStrict");
5157     LocalPointer<BreakIterator> brkitr(
5158         BreakIterator::createLineInstance("zh-CN-u-lb-strict", status));
5159     status.errIfFailureAndReset();
5160     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_strict");
5161 }
5162 
TestTraceCreateLineNormal(void)5163 void RBBITest::TestTraceCreateLineNormal(void) {
5164     SetupTestTrace();
5165     IcuTestErrorCode status(*this, "TestTraceCreateLineNormal");
5166     LocalPointer<BreakIterator> brkitr(
5167         BreakIterator::createLineInstance("zh-CN-u-lb-normal", status));
5168     status.errIfFailureAndReset();
5169     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_normal");
5170 }
5171 
TestTraceCreateLineLoose(void)5172 void RBBITest::TestTraceCreateLineLoose(void) {
5173     SetupTestTrace();
5174     IcuTestErrorCode status(*this, "TestTraceCreateLineLoose");
5175     LocalPointer<BreakIterator> brkitr(
5176         BreakIterator::createLineInstance("zh-CN-u-lb-loose", status));
5177     status.errIfFailureAndReset();
5178     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_loose");
5179 }
5180 
TestTraceCreateLineLoosePhrase(void)5181 void RBBITest::TestTraceCreateLineLoosePhrase(void) {
5182     SetupTestTrace();
5183     IcuTestErrorCode status(*this, "TestTraceCreateLineLoosePhrase");
5184     LocalPointer<BreakIterator> brkitr(
5185         BreakIterator::createLineInstance("ja-u-lb-loose-lw-phrase", status));
5186     status.errIfFailureAndReset();
5187     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_loose_phrase");
5188 }
5189 
TestTraceCreateLineNormalPhrase(void)5190 void RBBITest::TestTraceCreateLineNormalPhrase(void) {
5191     SetupTestTrace();
5192     IcuTestErrorCode status(*this, "TestTraceCreateLineNormalPhrase");
5193     LocalPointer<BreakIterator> brkitr(
5194         BreakIterator::createLineInstance("ja-u-lb-normal-lw-phrase", status));
5195     status.errIfFailureAndReset();
5196     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_normal_phrase");
5197 }
5198 
TestTraceCreateLineStrictPhrase(void)5199 void RBBITest::TestTraceCreateLineStrictPhrase(void) {
5200     SetupTestTrace();
5201     IcuTestErrorCode status(*this, "TestTraceCreateLineStrictPhrase");
5202     LocalPointer<BreakIterator> brkitr(
5203         BreakIterator::createLineInstance("ja-u-lb-strict-lw-phrase", status));
5204     status.errIfFailureAndReset();
5205     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_strict_phrase");
5206 }
5207 
TestTraceCreateLinePhrase(void)5208 void RBBITest::TestTraceCreateLinePhrase(void) {
5209     SetupTestTrace();
5210     IcuTestErrorCode status(*this, "TestTraceCreateLinePhrase");
5211     LocalPointer<BreakIterator> brkitr(
5212         BreakIterator::createLineInstance("ja-u-lw-phrase", status));
5213     status.errIfFailureAndReset();
5214     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_phrase");
5215 }
5216 
TestTraceCreateBreakEngine(void)5217 void RBBITest::TestTraceCreateBreakEngine(void) {
5218     rbbi_cleanup();
5219     SetupTestTrace();
5220     IcuTestErrorCode status(*this, "TestTraceCreateBreakEngine");
5221     LocalPointer<BreakIterator> brkitr(
5222         BreakIterator::createWordInstance("zh-CN", status));
5223     status.errIfFailureAndReset();
5224     assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5225 
5226     // To word break the following text, BreakIterator will create 5 dictionary
5227     // break engine internally.
5228     brkitr->setText(
5229         u"test "
5230         u"測試 " // Hani
5231         u"សាកល្បង " // Khmr
5232         u"ທົດສອບ " // Laoo
5233         u"စမ်းသပ်မှု " // Mymr
5234         u"ทดสอบ " // Thai
5235         u"test "
5236     );
5237 
5238     // Loop through all the text.
5239     while (brkitr->next() > 0) ;
5240 
5241     assertEquals("utrace_entry should be called ", 6, gEntryFn.size());
5242     assertEquals("utrace_exit should be called ", 6, gExitFn.size());
5243     assertEquals("utrace_data should be called ", 5, gDataFn.size());
5244 
5245     for (std::vector<int>::size_type i = 0; i < gDataFn.size(); i++) {
5246         assertEquals("utrace_entry should be called ",
5247                      UTRACE_UBRK_CREATE_BREAK_ENGINE, gEntryFn[i+1]);
5248         assertEquals("utrace_exit should be called ",
5249                      UTRACE_UBRK_CREATE_BREAK_ENGINE, gExitFn[i+1]);
5250         assertEquals("utrace_data should be called ",
5251                      UTRACE_UBRK_CREATE_BREAK_ENGINE, gDataFn[i]);
5252     }
5253 
5254     assertEquals("utrace_data should pass ", "Hani", gData[0].c_str());
5255     assertEquals("utrace_data should pass ", "Khmr", gData[1].c_str());
5256     assertEquals("utrace_data should pass ", "Laoo", gData[2].c_str());
5257     assertEquals("utrace_data should pass ", "Mymr", gData[3].c_str());
5258     assertEquals("utrace_data should pass ", "Thai", gData[4].c_str());
5259 
5260 }
5261 #endif
5262 
TestUnpairedSurrogate()5263 void RBBITest::TestUnpairedSurrogate() {
5264     UnicodeString rules(u"ab;");
5265 
5266     UErrorCode status = U_ZERO_ERROR;
5267     UParseError pe;
5268     RuleBasedBreakIterator bi1(rules, pe, status);
5269     assertSuccess(WHERE, status);
5270     UnicodeString rtRules = bi1.getRules();
5271     // make sure the simple one work first.
5272     assertEquals(WHERE, rules,  rtRules);
5273 
5274 
5275     rules = UnicodeString(u"a\\ud800b;").unescape();
5276     pe.line = 0;
5277     pe.offset = 0;
5278     RuleBasedBreakIterator bi2(rules, pe, status);
5279     assertEquals(WHERE "unpaired lead surrogate", U_ILLEGAL_CHAR_FOUND , status);
5280     if (pe.line != 1 || pe.offset != 1) {
5281         errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
5282     }
5283 
5284     status = U_ZERO_ERROR;
5285     rules = UnicodeString(u"a\\ude00b;").unescape();
5286     pe.line = 0;
5287     pe.offset = 0;
5288     RuleBasedBreakIterator bi3(rules, pe, status);
5289     assertEquals(WHERE "unpaired tail surrogate", U_ILLEGAL_CHAR_FOUND , status);
5290     if (pe.line != 1 || pe.offset != 1) {
5291         errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
5292     }
5293 
5294     // make sure the surrogate one work too.
5295     status = U_ZERO_ERROR;
5296     rules = UnicodeString(u"a��b;");
5297     RuleBasedBreakIterator bi4(rules, pe, status);
5298     rtRules = bi4.getRules();
5299     assertEquals(WHERE, rules, rtRules);
5300 }
5301 
5302 // Read file generated by
5303 // https://github.com/unicode-org/lstm_word_segmentation/blob/master/segment_text.py
5304 // as test cases and compare the Output.
5305 // Format of the file
5306 //   Model:\t[Model Name (such as 'Thai_graphclust_model4_heavy')]
5307 //   Embedding:\t[Embedding type (such as 'grapheme_clusters_tf')]
5308 //   Input:\t[source text]
5309 //   Output:\t[expected output separated by | ]
5310 //   Input: ...
5311 //   Output: ...
5312 
runLSTMTestFromFile(const char * filename,UScriptCode script)5313 void RBBITest::runLSTMTestFromFile(const char* filename, UScriptCode script) {
5314     // The expectation in this test depends on LSTM, skip the test if the
5315     // configuration is not build with LSTM data.
5316     if (skipLSTMTest()) {
5317         return;
5318     }
5319     UErrorCode   status = U_ZERO_ERROR;
5320     LocalPointer<BreakIterator> iterator(BreakIterator::createWordInstance(Locale(), status));
5321     if (U_FAILURE(status)) {
5322         errln("%s:%d Error %s Cannot create Word BreakIterator", __FILE__, __LINE__, u_errorName(status));
5323         return;
5324     }
5325     //  Open and read the test data file.
5326     const char *testDataDirectory = IntlTest::getSourceTestData(status);
5327     CharString testFileName(testDataDirectory, -1, status);
5328     testFileName.append(filename, -1, status);
5329 
5330     int len;
5331     UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
5332     if (U_FAILURE(status)) {
5333         errln("%s:%d Error %s opening test file %s", __FILE__, __LINE__, u_errorName(status), filename);
5334         return;
5335     }
5336 
5337     //  Put the test data into a UnicodeString
5338     UnicodeString testString(FALSE, testFile, len);
5339 
5340     int32_t start = 0;
5341 
5342     UnicodeString line;
5343     int32_t end;
5344     std::string actual_sep_str;
5345     int32_t caseNum = 0;
5346     // Iterate through all the lines in the test file.
5347     do {
5348         int32_t cr = testString.indexOf(u'\r', start);
5349         int32_t lf = testString.indexOf(u'\n', start);
5350         end = cr >= 0 ? (lf >= 0 ? std::min(cr, lf) : cr) : lf;
5351         line = testString.tempSubString(start, end < 0 ? INT32_MAX : end - start);
5352         if (line.length() > 0) {
5353             // Separate each line to key and value by TAB.
5354             int32_t tab = line.indexOf(u'\t');
5355             UnicodeString key = line.tempSubString(0, tab);
5356             const UnicodeString value = line.tempSubString(tab+1);
5357 
5358             if (key == "Model:") {
5359                 // Verify the expectation in the test file match the LSTM model
5360                 // we are using now.
5361                 const LSTMData* data = CreateLSTMDataForScript(script, status);
5362                 if (U_FAILURE(status)) {
5363                     dataerrln("%s:%d Error %s Cannot create LSTM data for script %s",
5364                               __FILE__, __LINE__, u_errorName(status), uscript_getName(script));
5365                     return;
5366                 }
5367                 UnicodeString name(LSTMDataName(data));
5368                 DeleteLSTMData(data);
5369                 if (value != name) {
5370                     std::string utf8Name, utf8Value;
5371                     dataerrln("%s:%d Error %s The LSTM data for script %s is %s instead of %s",
5372                               __FILE__, __LINE__, u_errorName(status), uscript_getName(script),
5373                               name.toUTF8String<std::string>(utf8Name).c_str(),
5374                               value.toUTF8String<std::string>(utf8Value).c_str());
5375                     return;
5376                 }
5377             } else if (key == "Input:") {
5378                 UnicodeString input("prefix ");
5379                 input += value + " suffix";
5380                 std::stringstream ss;
5381 
5382                 // Construct the UText which is expected by the the engine as
5383                 // input from the UnicodeString.
5384                 UText ut = UTEXT_INITIALIZER;
5385                 utext_openConstUnicodeString(&ut, &input, &status);
5386                 if (U_FAILURE(status)) {
5387                     dataerrln("Could not utext_openConstUnicodeString for " + value + UnicodeString(u_errorName(status)));
5388                     return;
5389                 }
5390 
5391                 iterator->setText(&ut, status);
5392                 if (U_FAILURE(status)) {
5393                     errln("%s:%d Error %s Could not setText to BreakIterator", __FILE__, __LINE__, u_errorName(status));
5394                     return;
5395                 }
5396 
5397                 int32_t bp;
5398                 for (bp = iterator->first(); bp != BreakIterator::DONE; bp = iterator->next()) {
5399                     ss << bp;
5400                     if (bp != input.length()) {
5401                         ss << ", ";
5402                     }
5403                 }
5404 
5405                 utext_close(&ut);
5406                 // Turn the break points into a string for easy comparison
5407                 // output.
5408                 actual_sep_str = "{" + ss.str() + "}";
5409             } else if (key == "Output:" && !actual_sep_str.empty()) {
5410                 UnicodeString input("prefix| |");
5411                 input += value + "| |suffix";
5412                 std::string d;
5413                 int32_t sep;
5414                 int32_t start = 0;
5415                 int32_t curr = 0;
5416                 std::stringstream ss;
5417                 // Include 0 as the break point.
5418                 ss << "0, ";
5419                 while ((sep = input.indexOf(u'|', start)) >= 0) {
5420                     int32_t len = sep - start;
5421                     if (len > 0) {
5422                         if (curr > 0) {
5423                             ss << ", ";
5424                         }
5425                         curr += len;
5426                         ss << curr;
5427                     }
5428                     start = sep + 1;
5429                 }
5430                 // Include end of the string as break point.
5431                 ss << ", " << curr + input.length() - start;
5432                 // Turn the break points into a string for easy comparison
5433                 // output.
5434                 std::string expected = "{" + ss.str() + "}";
5435                 std::string utf8;
5436 
5437                 assertEquals((input + " Test Case#" + caseNum).toUTF8String<std::string>(utf8).c_str(),
5438                              expected.c_str(), actual_sep_str.c_str());
5439                 actual_sep_str.clear();
5440             }
5441         }
5442         start = std::max(cr, lf) + 1;
5443     } while (end >= 0);
5444 
5445     delete [] testFile;
5446 }
5447 
TestLSTMThai()5448 void RBBITest::TestLSTMThai() {
5449     runLSTMTestFromFile("Thai_graphclust_model4_heavy_Test.txt", USCRIPT_THAI);
5450 }
5451 
TestLSTMBurmese()5452 void RBBITest::TestLSTMBurmese() {
5453     runLSTMTestFromFile("Burmese_graphclust_model5_heavy_Test.txt", USCRIPT_MYANMAR);
5454 }
5455 
5456 #endif // #if !UCONFIG_NO_BREAK_ITERATION
5457