• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4  * COPYRIGHT:
5  * Copyright (c) 1999-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  ********************************************************************/
8 /************************************************************************
9 *   Date        Name        Description
10 *   12/15/99    Madhu        Creation.
11 *   01/12/2000  Madhu        Updated for changed API and added new tests
12 ************************************************************************/
13 
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_BREAK_ITERATION
16 
17 #include <sstream>
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #include <utility>
22 #include <vector>
23 
24 #include "unicode/brkiter.h"
25 #include "unicode/localpointer.h"
26 #include "unicode/numfmt.h"
27 #include "unicode/rbbi.h"
28 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
29 #include "unicode/regex.h"
30 #endif
31 #include "unicode/schriter.h"
32 #include "unicode/uchar.h"
33 #include "unicode/utf16.h"
34 #include "unicode/ucnv.h"
35 #include "unicode/uniset.h"
36 #include "unicode/uscript.h"
37 #include "unicode/ustring.h"
38 #include "unicode/utext.h"
39 #include "unicode/utrace.h"
40 
41 #include "charstr.h"
42 #include "cmemory.h"
43 #include "cstr.h"
44 #include "intltest.h"
45 #include "rbbitst.h"
46 #include "rbbidata.h"
47 #include "utypeinfo.h"  // for 'typeid' to work
48 #include "uvector.h"
49 #include "uvectr32.h"
50 
51 
52 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
53 #include "unicode/filteredbrk.h"
54 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
55 
// TEST_ASSERT(x): if the condition `x` evaluates to false, report a test
// failure through errln(), identifying the source file and line of the
// assertion. The macro does not return or abort; execution continues with the
// statement that follows it.
#define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
} UPRV_BLOCK_MACRO_END
61 
// TEST_ASSERT_SUCCESS(errcode): if `errcode` is a failing UErrorCode, report it
// through errcheckln() together with the source file, line, and the symbolic
// name of the error. The macro does not return; callers that must stop on
// failure test U_FAILURE(errcode) themselves afterwards.
#define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
    } \
} UPRV_BLOCK_MACRO_END
67 
// MONKEY_ERROR(msg, fRuleFileName, index, seed): report a failure through the
// global IntlTest instance (IntlTest::gTest), printing the source location,
// the message, the failing index, and a "type=... seed=... loop=1" parameter
// string intended for reproducing the failure.
// NOTE(review): unlike the macros above, this one expands to a bare { } block
// rather than UPRV_BLOCK_MACRO_BEGIN/END, so `if (c) MONKEY_ERROR(...); else`
// would not compile — confirm call sites before changing.
#define MONKEY_ERROR(msg, fRuleFileName, index, seed) { \
    IntlTest::gTest->errln("\n%s:%d %s at index %d. Parameters to reproduce: @\"type=%s seed=%u loop=1\"", \
                    __FILE__, __LINE__, msg, index, fRuleFileName, seed); \
}
72 
73 //---------------------------------------------
74 // runIndexedTest
75 //---------------------------------------------
76 
77 
//  Note:  Before adding new tests to this file, check whether the desired test data can
//         simply be added to the file testdata/rbbitst.txt.  In most cases it can;
//         doing so is much less work than writing a new test, the diagnostic output in the
//         event of failures is good, and the test data file is shared with ICU4J, so
//         eventually the test will run there as well, without additional effort.
83 
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)84 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
85 {
86     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
87     fTestParams = params;
88 
89     TESTCASE_AUTO_BEGIN;
90 #if !UCONFIG_NO_FILE_IO
91     TESTCASE_AUTO(TestBug4153072);
92 #endif
93 #if !UCONFIG_NO_FILE_IO
94     TESTCASE_AUTO(TestUnicodeFiles);
95 #endif
96     TESTCASE_AUTO(TestGetAvailableLocales);
97     TESTCASE_AUTO(TestGetDisplayName);
98 #if !UCONFIG_NO_FILE_IO
99     TESTCASE_AUTO(TestEndBehaviour);
100     TESTCASE_AUTO(TestWordBreaks);
101     TESTCASE_AUTO(TestWordBoundary);
102     TESTCASE_AUTO(TestLineBreaks);
103     TESTCASE_AUTO(TestSentBreaks);
104     TESTCASE_AUTO(TestExtended);
105 #endif
106 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
107     TESTCASE_AUTO(TestMonkey);
108 #endif
109 #if !UCONFIG_NO_FILE_IO
110     TESTCASE_AUTO(TestBug3818);
111 #endif
112     TESTCASE_AUTO(TestDebug);
113 #if !UCONFIG_NO_FILE_IO
114     TESTCASE_AUTO(TestBug5775);
115 #endif
116     TESTCASE_AUTO(TestBug9983);
117     TESTCASE_AUTO(TestDictRules);
118     TESTCASE_AUTO(TestBug5532);
119     TESTCASE_AUTO(TestBug7547);
120     TESTCASE_AUTO(TestBug12797);
121     TESTCASE_AUTO(TestBug12918);
122     TESTCASE_AUTO(TestBug12932);
123     TESTCASE_AUTO(TestEmoji);
124     TESTCASE_AUTO(TestBug12519);
125     TESTCASE_AUTO(TestBug12677);
126     TESTCASE_AUTO(TestTableRedundancies);
127     TESTCASE_AUTO(TestBug13447);
128     TESTCASE_AUTO(TestReverse);
129     TESTCASE_AUTO(TestBug13692);
130     TESTCASE_AUTO(TestDebugRules);
131     TESTCASE_AUTO(Test8BitsTrieWith8BitStateTable);
132     TESTCASE_AUTO(Test8BitsTrieWith16BitStateTable);
133     TESTCASE_AUTO(Test16BitsTrieWith8BitStateTable);
134     TESTCASE_AUTO(Test16BitsTrieWith16BitStateTable);
135     TESTCASE_AUTO(TestTable_8_16_Bits);
136     TESTCASE_AUTO(TestBug13590);
137     TESTCASE_AUTO(TestUnpairedSurrogate);
138 
139 #if U_ENABLE_TRACING
140     TESTCASE_AUTO(TestTraceCreateCharacter);
141     TESTCASE_AUTO(TestTraceCreateWord);
142     TESTCASE_AUTO(TestTraceCreateSentence);
143     TESTCASE_AUTO(TestTraceCreateTitle);
144     TESTCASE_AUTO(TestTraceCreateLine);
145     TESTCASE_AUTO(TestTraceCreateLineNormal);
146     TESTCASE_AUTO(TestTraceCreateLineLoose);
147     TESTCASE_AUTO(TestTraceCreateLineStrict);
148     TESTCASE_AUTO(TestTraceCreateBreakEngine);
149 #endif
150 
151     TESTCASE_AUTO_END;
152 }
153 
154 
155 //--------------------------------------------------------------------------------------
156 //
157 //    RBBITest    constructor and destructor
158 //
159 //--------------------------------------------------------------------------------------
160 
RBBITest()161 RBBITest::RBBITest() {
162     fTestParams = NULL;
163 }
164 
165 
~RBBITest()166 RBBITest::~RBBITest() {
167 }
168 
169 
printStringBreaks(UText * tstr,int expected[],int expectedCount)170 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
171     UErrorCode status = U_ZERO_ERROR;
172     char name[100];
173     printf("code    alpha extend alphanum type word sent line name\n");
174     int nextExpectedIndex = 0;
175     utext_setNativeIndex(tstr, 0);
176     for (int j = 0; j < static_cast<int>(utext_nativeLength(tstr)); j=static_cast<int>(utext_getNativeIndex(tstr))) {
177         if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
178             printf("------------------------------------------------ %d\n", j);
179             ++nextExpectedIndex;
180         }
181 
182         UChar32 c = utext_next32(tstr);
183         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
184         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
185                            u_isUAlphabetic(c),
186                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
187                            u_isalnum(c),
188                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
189                                                   u_charType(c),
190                                                   U_SHORT_PROPERTY_NAME),
191                            u_getPropertyValueName(UCHAR_WORD_BREAK,
192                                                   u_getIntPropertyValue(c,
193                                                           UCHAR_WORD_BREAK),
194                                                   U_SHORT_PROPERTY_NAME),
195                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
196                                    u_getIntPropertyValue(c,
197                                            UCHAR_SENTENCE_BREAK),
198                                    U_SHORT_PROPERTY_NAME),
199                            u_getPropertyValueName(UCHAR_LINE_BREAK,
200                                    u_getIntPropertyValue(c,
201                                            UCHAR_LINE_BREAK),
202                                    U_SHORT_PROPERTY_NAME),
203                            name);
204     }
205 }
206 
207 
printStringBreaks(const UnicodeString & ustr,int expected[],int expectedCount)208 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
209    UErrorCode status = U_ZERO_ERROR;
210    UText *tstr = NULL;
211    tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
212    if (U_FAILURE(status)) {
213        printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
214        return;
215     }
216    printStringBreaks(tstr, expected, expectedCount);
217    utext_close(tstr);
218 }
219 
220 
TestBug3818()221 void RBBITest::TestBug3818() {
222     UErrorCode  status = U_ZERO_ERROR;
223 
224     // Four Thai words...
225     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
226                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
227     UnicodeString  thaiStr(thaiWordData);
228 
229     BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
230     if (U_FAILURE(status) || bi == NULL) {
231         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
232         return;
233     }
234     bi->setText(thaiStr);
235 
236     int32_t  startOfSecondWord = bi->following(1);
237     if (startOfSecondWord != 4) {
238         errln("Fail at file %s, line %d expected start of word at 4, got %d",
239             __FILE__, __LINE__, startOfSecondWord);
240     }
241     startOfSecondWord = bi->following(0);
242     if (startOfSecondWord != 4) {
243         errln("Fail at file %s, line %d expected start of word at 4, got %d",
244             __FILE__, __LINE__, startOfSecondWord);
245     }
246     delete bi;
247 }
248 
249 
250 //---------------------------------------------
251 //
252 //     other tests
253 //
254 //---------------------------------------------
255 
TestGetAvailableLocales()256 void RBBITest::TestGetAvailableLocales()
257 {
258     int32_t locCount = 0;
259     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
260 
261     if (locCount == 0)
262         dataerrln("getAvailableLocales() returned an empty list!");
263     // Just make sure that it's returning good memory.
264     int32_t i;
265     for (i = 0; i < locCount; ++i) {
266         logln(locList[i].getName());
267     }
268 }
269 
270 //Testing the BreakIterator::getDisplayName() function
TestGetDisplayName()271 void RBBITest::TestGetDisplayName()
272 {
273     UnicodeString   result;
274 
275     BreakIterator::getDisplayName(Locale::getUS(), result);
276     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
277         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
278                 + result);
279 
280     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
281     if (result != "French (France)")
282         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
283                 + result);
284 }
285 /**
286  * Test End Behaviour
287  * @bug 4068137
288  */
TestEndBehaviour()289 void RBBITest::TestEndBehaviour()
290 {
291     UErrorCode status = U_ZERO_ERROR;
292     UnicodeString testString("boo.");
293     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
294     if (U_FAILURE(status))
295     {
296         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
297         return;
298     }
299     wb->setText(testString);
300 
301     if (wb->first() != 0)
302         errln("Didn't get break at beginning of string.");
303     if (wb->next() != 3)
304         errln("Didn't get break before period in \"boo.\"");
305     if (wb->current() != 4 && wb->next() != 4)
306         errln("Didn't get break at end of string.");
307     delete wb;
308 }
309 /*
310  * @bug 4153072
311  */
TestBug4153072()312 void RBBITest::TestBug4153072() {
313     UErrorCode status = U_ZERO_ERROR;
314     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
315     if (U_FAILURE(status))
316     {
317         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
318         return;
319     }
320     UnicodeString str("...Hello, World!...");
321     int32_t begin = 3;
322     int32_t end = str.length() - 3;
323     UBool onBoundary;
324 
325     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
326     iter->adoptText(textIterator);
327     int index;
328     // Note: with the switch to UText, there is no way to restrict the
329     //       iteration range to begin at an index other than zero.
330     //       String character iterators created with a non-zero bound are
331     //         treated by RBBI as being empty.
332     for (index = -1; index < begin + 1; ++index) {
333         onBoundary = iter->isBoundary(index);
334         if (index == 0?  !onBoundary : onBoundary) {
335             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
336                             " and begin index = " + begin);
337         }
338     }
339     delete iter;
340 }
341 
342 
343 //
// Test for a problem reported by Ashok Matoria on 9 July 2007:
//    One.<kSoftHyphen><kSpace>Two.
//
//    The sentence break iterator starts at 0, and the first call to next()
//    breaks at the 'T' of "Two". If next() and then previous() are called from
//    that point, it breaks at <kSoftHyphen> instead of at the 'T' of "Two".
350 //
TestBug5775()351 void RBBITest::TestBug5775() {
352     UErrorCode status = U_ZERO_ERROR;
353     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
354     TEST_ASSERT_SUCCESS(status);
355     if (U_FAILURE(status)) {
356         return;
357     }
358 // Check for status first for better handling of no data errors.
359     TEST_ASSERT(bi != NULL);
360     if (bi == NULL) {
361         return;
362     }
363 
364     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
365     //               01234      56789
366     s = s.unescape();
367     bi->setText(s);
368     int pos = bi->next();
369     TEST_ASSERT(pos == 6);
370     pos = bi->next();
371     TEST_ASSERT(pos == 10);
372     pos = bi->previous();
373     TEST_ASSERT(pos == 6);
374     delete bi;
375 }
376 
377 
378 
379 //------------------------------------------------------------------------------
380 //
381 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
382 //
383 //------------------------------------------------------------------------------
384 
385 struct TestParams {
386     BreakIterator   *bi;                   // Break iterator is set while parsing test source.
387                                            //   Changed out whenever test data changes break type.
388 
389     UnicodeString    dataToBreak;          // Data that is built up while parsing the test.
390     UVector32       *expectedBreaks;       // Expected break positions, matches dataToBreak UnicodeString.
391     UVector32       *srcLine;              // Positions in source file, indexed same as dataToBreak.
392     UVector32       *srcCol;
393 
394     UText           *textToBreak;          // UText, could be UTF8 or UTF16.
395     UVector32       *textMap;              // Map from UTF-16 dataToBreak offsets to UText offsets.
396     CharString       utf8String;           // UTF-8 form of text to break.
397 
TestParamsTestParams398     TestParams(UErrorCode &status) : dataToBreak() {
399         bi               = NULL;
400         expectedBreaks   = new UVector32(status);
401         srcLine          = new UVector32(status);
402         srcCol           = new UVector32(status);
403         textToBreak      = NULL;
404         textMap          = new UVector32(status);
405     }
406 
~TestParamsTestParams407     ~TestParams() {
408         delete bi;
409         delete expectedBreaks;
410         delete srcLine;
411         delete srcCol;
412         utext_close(textToBreak);
413         delete textMap;
414     }
415 
416     int32_t getSrcLine(int32_t bp);
417     int32_t getExpectedBreak(int32_t bp);
418     int32_t getSrcCol(int32_t bp);
419 
420     void setUTF16(UErrorCode &status);
421     void setUTF8(UErrorCode &status);
422 };
423 
424 // Append a UnicodeString to a CharString with UTF-8 encoding.
425 // Substitute any invalid chars.
426 //   Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
CharStringAppend(CharString & dest,const UnicodeString & src,UErrorCode & status)427 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
428     if (U_FAILURE(status)) {
429         return;
430     }
431     int32_t utf8Length;
432     u_strToUTF8WithSub(NULL, 0, &utf8Length,            // Output Buffer, NULL for preflight.
433                        src.getBuffer(), src.length(),   // UTF-16 data
434                        0xfffd, NULL,                    // Substitution char, number of subs.
435                        &status);
436     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
437         return;
438     }
439     status = U_ZERO_ERROR;
440     int32_t capacity;
441     char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
442     u_strToUTF8WithSub(buffer, utf8Length, NULL,
443                        src.getBuffer(), src.length(),
444                        0xfffd, NULL, &status);
445     dest.append(buffer, utf8Length, status);
446 }
447 
448 
setUTF16(UErrorCode & status)449 void TestParams::setUTF16(UErrorCode &status) {
450     textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
451     textMap->removeAllElements();
452     for (int32_t i=0; i<dataToBreak.length(); i++) {
453         if (i == dataToBreak.getChar32Start(i)) {
454             textMap->addElement(i, status);
455         } else {
456             textMap->addElement(-1, status);
457         }
458     }
459     textMap->addElement(dataToBreak.length(), status);
460     U_ASSERT(dataToBreak.length() + 1 == textMap->size());
461 }
462 
463 
setUTF8(UErrorCode & status)464 void TestParams::setUTF8(UErrorCode &status) {
465     if (U_FAILURE(status)) {
466         return;
467     }
468     utf8String.clear();
469     CharStringAppend(utf8String, dataToBreak, status);
470     textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
471     if (U_FAILURE(status)) {
472         return;
473     }
474 
475     textMap->removeAllElements();
476     int32_t utf16Index = 0;
477     for (;;) {
478         textMap->addElement(utf16Index, status);
479         UChar32 c32 = utext_current32(textToBreak);
480         if (c32 < 0) {
481             break;
482         }
483         utf16Index += U16_LENGTH(c32);
484         utext_next32(textToBreak);
485         while (textMap->size() < utext_getNativeIndex(textToBreak)) {
486             textMap->addElement(-1, status);
487         }
488     }
489     U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
490 }
491 
492 
getSrcLine(int32_t bp)493 int32_t TestParams::getSrcLine(int32_t bp) {
494     if (bp >= textMap->size()) {
495         bp = textMap->size() - 1;
496     }
497     int32_t i = 0;
498     for(; bp >= 0 ; --bp) {
499         // Move to a character boundary if we are not on one already.
500         i = textMap->elementAti(bp);
501         if (i >= 0) {
502             break;
503         }
504     }
505     return srcLine->elementAti(i);
506 }
507 
508 
getExpectedBreak(int32_t bp)509 int32_t TestParams::getExpectedBreak(int32_t bp) {
510     if (bp >= textMap->size()) {
511         return 0;
512     }
513     int32_t i = textMap->elementAti(bp);
514     int32_t retVal = 0;
515     if (i >= 0) {
516         retVal = expectedBreaks->elementAti(i);
517     }
518     return retVal;
519 }
520 
521 
getSrcCol(int32_t bp)522 int32_t TestParams::getSrcCol(int32_t bp) {
523     if (bp >= textMap->size()) {
524         bp = textMap->size() - 1;
525     }
526     int32_t i = 0;
527     for(; bp >= 0; --bp) {
528         // Move bp to a character boundary if we are not on one already.
529         i = textMap->elementAti(bp);
530         if (i >= 0) {
531             break;
532         }
533     }
534     return srcCol->elementAti(i);
535 }
536 
537 
executeTest(TestParams * t,UErrorCode & status)538 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
539     int32_t    bp;
540     int32_t    prevBP;
541     int32_t    i;
542 
543     TEST_ASSERT_SUCCESS(status);
544     if (U_FAILURE(status)) {
545         return;
546     }
547 
548     if (t->bi == NULL) {
549         return;
550     }
551 
552     t->bi->setText(t->textToBreak, status);
553     //
554     //  Run the iterator forward
555     //
556     prevBP = -1;
557     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
558         if (prevBP ==  bp) {
559             // Fail for lack of forward progress.
560             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
561                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
562             break;
563         }
564 
565         // Check that there we didn't miss an expected break between the last one
566         //  and this one.
567         for (i=prevBP+1; i<bp; i++) {
568             if (t->getExpectedBreak(i) != 0) {
569                 int expected[] = {0, i};
570                 printStringBreaks(t->dataToBreak, expected, 2);
571                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
572                       i, t->getSrcLine(i), t->getSrcCol(i));
573             }
574         }
575 
576         // Check that the break we did find was expected
577         if (t->getExpectedBreak(bp) == 0) {
578             int expected[] = {0, bp};
579             printStringBreaks(t->textToBreak, expected, 2);
580             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
581                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
582         } else {
583             // The break was expected.
584             //   Check that the {nnn} tag value is correct.
585             int32_t expectedTagVal = t->getExpectedBreak(bp);
586             if (expectedTagVal == -1) {
587                 expectedTagVal = 0;
588             }
589             int32_t line = t->getSrcLine(bp);
590             int32_t rs = t->bi->getRuleStatus();
591             if (rs != expectedTagVal) {
592                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
593                       "          Actual, Expected status = %4d, %4d",
594                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
595             }
596         }
597 
598         prevBP = bp;
599     }
600 
601     // Verify that there were no missed expected breaks after the last one found
602     for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
603         if (t->getExpectedBreak(i) != 0) {
604             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
605                       i, t->getSrcLine(i), t->getSrcCol(i));
606         }
607     }
608 
609     //
610     //  Run the iterator backwards, verify that the same breaks are found.
611     //
612     prevBP = static_cast<int32_t>(utext_nativeLength(t->textToBreak) + 2); // start with a phony value for the last break pos seen.
613     bp = t->bi->last();
614     while (bp != BreakIterator::DONE) {
615         if (prevBP ==  bp) {
616             // Fail for lack of progress.
617             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
618                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
619             break;
620         }
621 
622         // Check that we didn't miss an expected break between the last one
623         //  and this one.  (UVector returns zeros for index out of bounds.)
624         for (i=prevBP-1; i>bp; i--) {
625             if (t->getExpectedBreak(i) != 0) {
626                 errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
627                       i, t->getSrcLine(i), t->getSrcCol(i));
628             }
629         }
630 
631         // Check that the break we did find was expected
632         if (t->getExpectedBreak(bp) == 0) {
633             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
634                    bp, t->getSrcLine(bp), t->getSrcCol(bp));
635         } else {
636             // The break was expected.
637             //   Check that the {nnn} tag value is correct.
638             int32_t expectedTagVal = t->getExpectedBreak(bp);
639             if (expectedTagVal == -1) {
640                 expectedTagVal = 0;
641             }
642             int line = t->getSrcLine(bp);
643             int32_t rs = t->bi->getRuleStatus();
644             if (rs != expectedTagVal) {
645                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
646                       "          Actual, Expected status = %4d, %4d",
647                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
648             }
649         }
650 
651         prevBP = bp;
652         bp = t->bi->previous();
653     }
654 
655     // Verify that there were no missed breaks prior to the last one found
656     for (i=prevBP-1; i>=0; i--) {
657         if (t->getExpectedBreak(i) != 0) {
658             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
659                       i, t->getSrcLine(i), t->getSrcCol(i));
660         }
661     }
662 
663     // Check isBoundary()
664     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
665         UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
666         UBool boundaryFound    = t->bi->isBoundary(i);
667         if (boundaryExpected != boundaryFound) {
668             errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
669                   "        Expected, Actual= %s, %s",
670                   i, t->getSrcLine(i), t->getSrcCol(i),
671                   boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
672         }
673     }
674 
675     // Check following()
676     for (i=0; i < static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i++) {
677         int32_t actualBreak = t->bi->following(i);
678         int32_t expectedBreak = BreakIterator::DONE;
679         for (int32_t j=i+1; j <= static_cast<int32_t>(utext_nativeLength(t->textToBreak)); j++) {
680             if (t->getExpectedBreak(j) != 0) {
681                 expectedBreak = j;
682                 break;
683             }
684         }
685         if (expectedBreak != actualBreak) {
686             errln("following(%d) incorrect. File line,col= %4d,%4d\n"
687                   "        Expected, Actual= %d, %d",
688                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
689         }
690     }
691 
692     // Check preceding()
693     for (i=static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i>=0; i--) {
694         int32_t actualBreak = t->bi->preceding(i);
695         int32_t expectedBreak = BreakIterator::DONE;
696 
697         // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
698         // preceding(trailing byte) will return the index of some preceding code point,
699         // not the lead byte of the current code point, even though that has a smaller index.
700         // Therefore, start looking at the expected break data not at i-1, but at
701         // the start of code point index - 1.
702         utext_setNativeIndex(t->textToBreak, i);
703         int32_t j = static_cast<int32_t>(utext_getNativeIndex(t->textToBreak) - 1);
704         for (; j >= 0; j--) {
705             if (t->getExpectedBreak(j) != 0) {
706                 expectedBreak = j;
707                 break;
708             }
709         }
710         if (expectedBreak != actualBreak) {
711             errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
712                   "        Expected, Actual= %d, %d",
713                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
714         }
715     }
716 }
717 
718 
TestExtended()719 void RBBITest::TestExtended() {
720   // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
721   // data driven test closely entangles filtered and regular data.
722 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
723     UErrorCode      status  = U_ZERO_ERROR;
724     Locale          locale("");
725 
726     TestParams          tp(status);
727 
728     RegexMatcher      localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
729     if (U_FAILURE(status)) {
730         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
731     }
732 
733     //
734     //  Open and read the test data file.
735     //
736     const char *testDataDirectory = IntlTest::getSourceTestData(status);
737     CharString testFileName(testDataDirectory, -1, status);
738     testFileName.append("rbbitst.txt", -1, status);
739 
740     int    len;
741     UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
742     if (U_FAILURE(status)) {
743         errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
744         return;
745     }
746 
747     bool skipTest = false; // Skip this test?
748 
749     //
750     //  Put the test data into a UnicodeString
751     //
752     UnicodeString testString(FALSE, testFile, len);
753 
754     enum EParseState{
755         PARSE_COMMENT,
756         PARSE_TAG,
757         PARSE_DATA,
758         PARSE_NUM,
759         PARSE_RULES
760     }
761     parseState = PARSE_TAG;
762 
763     EParseState savedState = PARSE_TAG;
764 
765     int32_t    lineNum  = 1;
766     int32_t    colStart = 0;
767     int32_t    column   = 0;
768     int32_t    charIdx  = 0;
769 
770     int32_t    tagValue = 0;             // The numeric value of a <nnn> tag.
771 
772     UnicodeString       rules;           // Holds rules from a <rules> ... </rules> block
773     int32_t             rulesFirstLine = 0;  // Line number of the start of current <rules> block
774 
775     for (charIdx = 0; charIdx < len; ) {
776         status = U_ZERO_ERROR;
777         UChar  c = testString.charAt(charIdx);
778         charIdx++;
779         if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
780             // treat CRLF as a unit
781             c = u'\n';
782             charIdx++;
783         }
784         if (c == u'\n' || c == u'\r') {
785             lineNum++;
786             colStart = charIdx;
787         }
788         column = charIdx - colStart + 1;
789 
790         switch (parseState) {
791         case PARSE_COMMENT:
792             if (c == u'\n' || c == u'\r') {
793                 parseState = savedState;
794             }
795             break;
796 
797         case PARSE_TAG:
798             {
799             if (c == u'#') {
800                 parseState = PARSE_COMMENT;
801                 savedState = PARSE_TAG;
802                 break;
803             }
804             if (u_isUWhiteSpace(c)) {
805                 break;
806             }
807             if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
808                 delete tp.bi;
809                 tp.bi = BreakIterator::createWordInstance(locale,  status);
810                 skipTest = false;
811                 charIdx += 5;
812                 break;
813             }
814             if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
815                 delete tp.bi;
816                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
817                 skipTest = false;
818                 charIdx += 5;
819                 break;
820             }
821             if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
822                 delete tp.bi;
823                 tp.bi = BreakIterator::createLineInstance(locale,  status);
824                 skipTest = false;
825                 charIdx += 5;
826                 break;
827             }
828             if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
829                 delete tp.bi;
830                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
831                 skipTest = false;
832                 charIdx += 5;
833                 break;
834             }
835             if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
836                 delete tp.bi;
837                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
838                 charIdx += 6;
839                 break;
840             }
841 
842             if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
843                 testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
844                 charIdx = testString.indexOf(u'>', charIdx) + 1;
845                 parseState = PARSE_RULES;
846                 rules.remove();
847                 rulesFirstLine = lineNum;
848                 break;
849             }
850 
851             // <locale  loc_name>
852             localeMatcher.reset(testString);
853             if (localeMatcher.lookingAt(charIdx-1, status)) {
854                 UnicodeString localeName = localeMatcher.group(1, status);
855                 char localeName8[100];
856                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
857                 locale = Locale::createFromName(localeName8);
858                 charIdx += localeMatcher.group(0, status).length() - 1;
859                 TEST_ASSERT_SUCCESS(status);
860                 break;
861             }
862             if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
863                 parseState = PARSE_DATA;
864                 charIdx += 5;
865                 tp.dataToBreak = "";
866                 tp.expectedBreaks->removeAllElements();
867                 tp.srcCol ->removeAllElements();
868                 tp.srcLine->removeAllElements();
869                 break;
870             }
871 
872             errln("line %d: Tag expected in test file.", lineNum);
873             parseState = PARSE_COMMENT;
874             savedState = PARSE_DATA;
875             goto end_test; // Stop the test.
876             }
877             break;
878 
879         case PARSE_RULES:
880             if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
881                 charIdx += 7;
882                 parseState = PARSE_TAG;
883                 delete tp.bi;
884                 UParseError pe;
885                 tp.bi = new RuleBasedBreakIterator(rules, pe, status);
886                 skipTest = U_FAILURE(status);
887                 if (U_FAILURE(status)) {
888                     errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
889                         rulesFirstLine + pe.line - 1, u_errorName(status));
890                 }
891             } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
892                 charIdx += 10;
893                 parseState = PARSE_TAG;
894                 UErrorCode ec = U_ZERO_ERROR;
895                 UParseError pe;
896                 RuleBasedBreakIterator bi(rules, pe, ec);
897                 if (U_SUCCESS(ec)) {
898                     errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
899                         rulesFirstLine + pe.line - 1);
900                 }
901             } else {
902                 rules.append(c);
903             }
904             break;
905 
906         case PARSE_DATA:
907             if (c == u'•') {
908                 int32_t  breakIdx = tp.dataToBreak.length();
909                 if (tp.expectedBreaks->size() > breakIdx) {
910                     errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
911                           lineNum, column);
912                 }
913                 tp.expectedBreaks->setSize(breakIdx+1);
914                 tp.expectedBreaks->setElementAt(-1, breakIdx);
915                 tp.srcLine->setSize(breakIdx+1);
916                 tp.srcLine->setElementAt(lineNum, breakIdx);
917                 tp.srcCol ->setSize(breakIdx+1);
918                 tp.srcCol ->setElementAt(column, breakIdx);
919                 break;
920             }
921 
922             if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
923                 // Add final entry to mappings from break location to source file position.
924                 //  Need one extra because last break position returned is after the
925                 //    last char in the data, not at the last char.
926                 tp.srcLine->addElement(lineNum, status);
927                 tp.srcCol ->addElement(column, status);
928 
929                 parseState = PARSE_TAG;
930                 charIdx += 6;
931 
932                 if (!skipTest) {
933                     // RUN THE TEST!
934                     status = U_ZERO_ERROR;
935                     tp.setUTF16(status);
936                     executeTest(&tp, status);
937                     TEST_ASSERT_SUCCESS(status);
938 
939                     // Run again, this time with UTF-8 text wrapped in a UText.
940                     status = U_ZERO_ERROR;
941                     tp.setUTF8(status);
942                     TEST_ASSERT_SUCCESS(status);
943                     executeTest(&tp, status);
944                 }
945                 break;
946             }
947 
948             if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
949                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
950                 // Get the code point from the name and insert it into the test data.
951                 //   (Damn, no API takes names in Unicode  !!!
952                 //    we've got to take it back to char *)
953                 int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
954                 int32_t nameLength = nameEndIdx - (charIdx+2);
955                 char charNameBuf[200];
956                 UChar32 theChar = -1;
957                 if (nameEndIdx != -1) {
958                     UErrorCode status = U_ZERO_ERROR;
959                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
960                     charNameBuf[sizeof(charNameBuf)-1] = 0;
961                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
962                     if (U_FAILURE(status)) {
963                         theChar = -1;
964                     }
965                 }
966                 if (theChar == -1) {
967                     errln("Error in named character in test file at line %d, col %d",
968                         lineNum, column);
969                 } else {
970                     // Named code point was recognized.  Insert it
971                     //   into the test data.
972                     tp.dataToBreak.append(theChar);
973                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
974                         tp.srcLine->addElement(lineNum, status);
975                         tp.srcCol ->addElement(column, status);
976                     }
977                 }
978                 if (nameEndIdx > charIdx) {
979                     charIdx = nameEndIdx+1;
980 
981                 }
982                 break;
983             }
984 
985 
986 
987             if (testString.compare(charIdx-1, 2, u"<>") == 0) {
988                 charIdx++;
989                 int32_t  breakIdx = tp.dataToBreak.length();
990                 tp.expectedBreaks->setSize(breakIdx+1);
991                 tp.expectedBreaks->setElementAt(-1, breakIdx);
992                 tp.srcLine->setSize(breakIdx+1);
993                 tp.srcLine->setElementAt(lineNum, breakIdx);
994                 tp.srcCol ->setSize(breakIdx+1);
995                 tp.srcCol ->setElementAt(column, breakIdx);
996                 break;
997             }
998 
999             if (c == u'<') {
1000                 tagValue   = 0;
1001                 parseState = PARSE_NUM;
1002                 break;
1003             }
1004 
1005             if (c == u'#' && column==3) {   // TODO:  why is column off so far?
1006                 parseState = PARSE_COMMENT;
1007                 savedState = PARSE_DATA;
1008                 break;
1009             }
1010 
1011             if (c == u'\\') {
1012                 // Check for \ at end of line, a line continuation.
1013                 //     Advance over (discard) the newline
1014                 UChar32 cp = testString.char32At(charIdx);
1015                 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
1016                     // We have a CR LF
1017                     //  Need an extra increment of the input ptr to move over both of them
1018                     charIdx++;
1019                 }
1020                 if (cp == u'\n' || cp == u'\r') {
1021                     lineNum++;
1022                     colStart = charIdx;
1023                     charIdx++;
1024                     break;
1025                 }
1026 
1027                 // Let unescape handle the back slash.
1028                 cp = testString.unescapeAt(charIdx);
1029                 if (cp != -1) {
1030                     // Escape sequence was recognized.  Insert the char
1031                     //   into the test data.
1032                     tp.dataToBreak.append(cp);
1033                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1034                         tp.srcLine->addElement(lineNum, status);
1035                         tp.srcCol ->addElement(column, status);
1036                     }
1037                     break;
1038                 }
1039 
1040 
1041                 // Not a recognized backslash escape sequence.
1042                 // Take the next char as a literal.
1043                 //  TODO:  Should this be an error?
1044                 c = testString.charAt(charIdx);
1045                 charIdx = testString.moveIndex32(charIdx, 1);
1046             }
1047 
1048             // Normal, non-escaped data char.
1049             tp.dataToBreak.append(c);
1050 
1051             // Save the mapping from offset in the data to line/column numbers in
1052             //   the original input file.  Will be used for better error messages only.
1053             //   If there's an expected break before this char, the slot in the mapping
1054             //     vector will already be set for this char; don't overwrite it.
1055             if (tp.dataToBreak.length() > tp.srcLine->size()) {
1056                 tp.srcLine->addElement(lineNum, status);
1057                 tp.srcCol ->addElement(column, status);
1058             }
1059             break;
1060 
1061 
1062         case PARSE_NUM:
1063             // We are parsing an expected numeric tag value, like <1234>,
1064             //   within a chunk of data.
1065             if (u_isUWhiteSpace(c)) {
1066                 break;
1067             }
1068 
1069             if (c == u'>') {
1070                 // Finished the number.  Add the info to the expected break data,
1071                 //   and switch parse state back to doing plain data.
1072                 parseState = PARSE_DATA;
1073                 if (tagValue == 0) {
1074                     tagValue = -1;
1075                 }
1076                 int32_t  breakIdx = tp.dataToBreak.length();
1077                 if (tp.expectedBreaks->size() > breakIdx) {
1078                     errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
1079                           lineNum, column);
1080                 }
1081                 tp.expectedBreaks->setSize(breakIdx+1);
1082                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1083                 tp.srcLine->setSize(breakIdx+1);
1084                 tp.srcLine->setElementAt(lineNum, breakIdx);
1085                 tp.srcCol ->setSize(breakIdx+1);
1086                 tp.srcCol ->setElementAt(column, breakIdx);
1087                 break;
1088             }
1089 
1090             if (u_isdigit(c)) {
1091                 tagValue = tagValue*10 + u_charDigitValue(c);
1092                 break;
1093             }
1094 
1095             errln("Syntax Error in test file at line %d, col %d",
1096                 lineNum, column);
1097             parseState = PARSE_COMMENT;
1098             goto end_test; // Stop the test
1099             break;
1100         }
1101 
1102 
1103         if (U_FAILURE(status)) {
1104             dataerrln("ICU Error %s while parsing test file at line %d.",
1105                 u_errorName(status), lineNum);
1106             status = U_ZERO_ERROR;
1107             goto end_test; // Stop the test
1108         }
1109 
1110     }
1111 
1112     // Reached end of test file. Raise an error if parseState indicates that we are
1113     //   within a block that should have been terminated.
1114 
1115     if (parseState == PARSE_RULES) {
1116         errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
1117             lineNum, rulesFirstLine);
1118     }
1119     if (parseState == PARSE_DATA) {
1120         errln("rbbitst.txt:%d <data> block not closed.", lineNum);
1121     }
1122 
1123 
1124 end_test:
1125     delete [] testFile;
1126 #endif
1127 }
1128 
1129 
//-------------------------------------------------------------------------------
//
//  TestDictRules   Create a break iterator from source rules that include a
//                  dictionary range.  Regression test for bug #7130.  The source
//                  rules do not declare a break iterator type (word, line,
//                  sentence, etc.), and the dictionary code, without a type,
//                  would loop.
//
//-------------------------------------------------------------------------------
TestDictRules()1138 void RBBITest::TestDictRules() {
1139     const char *rules =  "$dictionary = [a-z]; \n"
1140                          "!!forward; \n"
1141                          "$dictionary $dictionary; \n"
1142                          "!!reverse; \n"
1143                          "$dictionary $dictionary; \n";
1144     const char *text = "aa";
1145     UErrorCode status = U_ZERO_ERROR;
1146     UParseError parseError;
1147 
1148     RuleBasedBreakIterator bi(rules, parseError, status);
1149     if (U_SUCCESS(status)) {
1150         UnicodeString utext = text;
1151         bi.setText(utext);
1152         int32_t position;
1153         int32_t loops;
1154         for (loops = 0; loops<10; loops++) {
1155             position = bi.next();
1156             if (position == RuleBasedBreakIterator::DONE) {
1157                 break;
1158             }
1159         }
1160         TEST_ASSERT(loops == 1);
1161     } else {
1162         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1163     }
1164 }
1165 
1166 
1167 
1168 //-------------------------------------------------------------------------------
1169 //
1170 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
1171 //    return the data in one big UChar * buffer, which the caller must delete.
1172 //
1173 //    parameters:
1174 //          fileName:   the name of the file, with no directory part.  The test data directory
1175 //                      is assumed.
1176 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
1177 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
1178 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
1179 //                      Pass NULL for the system default encoding.
1180 //          status
1181 //    returns:
1182 //                      The file data, converted to UChar.
1183 //                      The caller must delete this when done with
1184 //                           delete [] theBuffer;
1185 //
1186 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
1187 //           Move this function to some common place.
1188 //
1189 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int & ulen,const char * encoding,UErrorCode & status)1190 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1191     UChar       *retPtr  = NULL;
1192     char        *fileBuf = NULL;
1193     UConverter* conv     = NULL;
1194     FILE        *f       = NULL;
1195 
1196     ulen = 0;
1197     if (U_FAILURE(status)) {
1198         return retPtr;
1199     }
1200 
1201     //
1202     //  Open the file.
1203     //
1204     f = fopen(fileName, "rb");
1205     if (f == 0) {
1206         dataerrln("Error opening test data file %s\n", fileName);
1207         status = U_FILE_ACCESS_ERROR;
1208         return NULL;
1209     }
1210     //
1211     //  Read it in
1212     //
1213     int   fileSize;
1214     int   amt_read;
1215 
1216     fseek( f, 0, SEEK_END);
1217     fileSize = ftell(f);
1218     fileBuf = new char[fileSize];
1219     fseek(f, 0, SEEK_SET);
1220     amt_read = static_cast<int>(fread(fileBuf, 1, fileSize, f));
1221     if (amt_read != fileSize || fileSize <= 0) {
1222         errln("Error reading test data file.");
1223         goto cleanUpAndReturn;
1224     }
1225 
1226     //
1227     // Look for a Unicode Signature (BOM) on the data just read
1228     //
1229     int32_t        signatureLength;
1230     const char *   fileBufC;
1231     const char*    bomEncoding;
1232 
1233     fileBufC = fileBuf;
1234     bomEncoding = ucnv_detectUnicodeSignature(
1235         fileBuf, fileSize, &signatureLength, &status);
1236     if(bomEncoding!=NULL ){
1237         fileBufC  += signatureLength;
1238         fileSize  -= signatureLength;
1239         encoding = bomEncoding;
1240     }
1241 
1242     //
1243     // Open a converter to take the rule file to UTF-16
1244     //
1245     conv = ucnv_open(encoding, &status);
1246     if (U_FAILURE(status)) {
1247         goto cleanUpAndReturn;
1248     }
1249 
1250     //
1251     // Convert the rules to UChar.
1252     //  Preflight first to determine required buffer size.
1253     //
1254     ulen = ucnv_toUChars(conv,
1255         NULL,           //  dest,
1256         0,              //  destCapacity,
1257         fileBufC,
1258         fileSize,
1259         &status);
1260     if (status == U_BUFFER_OVERFLOW_ERROR) {
1261         // Buffer Overflow is expected from the preflight operation.
1262         status = U_ZERO_ERROR;
1263 
1264         retPtr = new UChar[ulen+1];
1265         ucnv_toUChars(conv,
1266             retPtr,       //  dest,
1267             ulen+1,
1268             fileBufC,
1269             fileSize,
1270             &status);
1271     }
1272 
1273 cleanUpAndReturn:
1274     fclose(f);
1275     delete []fileBuf;
1276     ucnv_close(conv);
1277     if (U_FAILURE(status)) {
1278         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1279         delete []retPtr;
1280         retPtr = 0;
1281         ulen   = 0;
1282     }
1283     return retPtr;
1284 }
1285 
1286 
1287 
1288 //--------------------------------------------------------------------------------------------
1289 //
1290 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1291 //
1292 //-------------------------------------------------------------------------------------------
TestUnicodeFiles()1293 void RBBITest::TestUnicodeFiles() {
1294     RuleBasedBreakIterator  *bi;
1295     UErrorCode               status = U_ZERO_ERROR;
1296 
1297     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1298     TEST_ASSERT_SUCCESS(status);
1299     if (U_SUCCESS(status)) {
1300         runUnicodeTestData("GraphemeBreakTest.txt", bi);
1301     }
1302     delete bi;
1303 
1304     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1305     TEST_ASSERT_SUCCESS(status);
1306     if (U_SUCCESS(status)) {
1307         runUnicodeTestData("WordBreakTest.txt", bi);
1308     }
1309     delete bi;
1310 
1311     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1312     TEST_ASSERT_SUCCESS(status);
1313     if (U_SUCCESS(status)) {
1314         runUnicodeTestData("SentenceBreakTest.txt", bi);
1315     }
1316     delete bi;
1317 
1318     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1319     TEST_ASSERT_SUCCESS(status);
1320     if (U_SUCCESS(status)) {
1321         runUnicodeTestData("LineBreakTest.txt", bi);
1322     }
1323     delete bi;
1324 }
1325 
1326 
1327 // Check for test cases from the Unicode test data files that are known to fail
1328 // and should be skipped as known issues because ICU does not fully implement
1329 // the Unicode specifications, or because ICU includes tailorings that differ from
1330 // the Unicode standard.
1331 //
1332 // Test cases are identified by the test data sequence, which tends to be more stable
1333 // across Unicode versions than the test file line numbers.
1334 //
1335 // The test case with ticket "10666" is a dummy, included as an example.
1336 
testCaseIsKnownIssue(const UnicodeString & testCase,const char * fileName)1337 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1338     static struct TestCase {
1339         const char *fTicketNum;
1340         const char *fFileName;
1341         const UChar *fString;
1342     } badTestCases[] = {
1343         {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"},    // Fake example, for illustration.
1344         // The following tests were originally for
1345         // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
1346         // However, that ticket has been closed as fixed but these tests still fail, so
1347         // ICU-21097 has been created to investigate and address these remaining issues.
1348         {"21097",  "LineBreakTest.txt", u"-#"},
1349         {"21097",  "LineBreakTest.txt", u"\u002d\u0308\u0023"},
1350         {"21097",  "LineBreakTest.txt", u"\u002d\u00a7"},
1351         {"21097",  "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
1352         {"21097",  "LineBreakTest.txt", u"\u002d\U00050005"},
1353         {"21097",  "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
1354         {"21097",  "LineBreakTest.txt", u"\u002d\u0e01"},
1355         {"21097",  "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
1356 
1357         // The following tests were originally for
1358         // Issue ICU-12017 Improve line break around numbers.
1359         // However, that ticket has been closed as fixed but these tests still fail, so
1360         // ICU-21097 has been created to investigate and address these remaining issues.
1361         {"21097", "LineBreakTest.txt", u"\u002C\u0030"},   // ",0"
1362         {"21097", "LineBreakTest.txt", u"\u002C\u0308\u0030"},
1363         {"21097", "LineBreakTest.txt", u"equals .35 cents"},
1364         {"21097", "LineBreakTest.txt", u"a.2 "},
1365         {"21097", "LineBreakTest.txt", u"a.2 \u0915"},
1366         {"21097", "LineBreakTest.txt", u"a.2 \u672C"},
1367         {"21097", "LineBreakTest.txt", u"a.2\u3000\u672C"},
1368         {"21097", "LineBreakTest.txt", u"a.2\u3000\u307E"},
1369         {"21097", "LineBreakTest.txt", u"a.2\u3000\u0033"},
1370         {"21097", "LineBreakTest.txt", u"A.1 \uBABB"},
1371         {"21097", "LineBreakTest.txt", u"\uBD24\uC5B4\u002E\u0020\u0041\u002E\u0032\u0020\uBCFC"},
1372         {"21097", "LineBreakTest.txt", u"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"},
1373         {"21097", "LineBreakTest.txt", u"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"},
1374         {"21097", "LineBreakTest.txt", u"a.2\u3000\u300C"},
1375     };
1376 
1377     for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
1378         const TestCase &badCase = badTestCases[n];
1379         if (!strcmp(fileName, badCase.fFileName) &&
1380                 testCase == UnicodeString(badCase.fString)) {
1381             return logKnownIssue(badCase.fTicketNum);
1382         }
1383     }
1384     return FALSE;
1385 }
1386 
1387 
1388 //--------------------------------------------------------------------------------------------
1389 //
1390 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1391 //
1392 //-------------------------------------------------------------------------------------------
runUnicodeTestData(const char * fileName,RuleBasedBreakIterator * bi)1393 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1394 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1395     UErrorCode  status = U_ZERO_ERROR;
1396 
1397     //
1398     //  Open and read the test data file, put it into a UnicodeString.
1399     //
1400     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1401     char testFileName[1000];
1402     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1403         dataerrln("Can't open test data.  Path too long.");
1404         return;
1405     }
1406     strcpy(testFileName, testDataDirectory);
1407     strcat(testFileName, fileName);
1408 
1409     logln("Opening data file %s\n", fileName);
1410 
1411     int    len;
1412     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1413     if (status != U_FILE_ACCESS_ERROR) {
1414         TEST_ASSERT_SUCCESS(status);
1415         TEST_ASSERT(testFile != NULL);
1416     }
1417     if (U_FAILURE(status) || testFile == NULL) {
1418         return; /* something went wrong, error already output */
1419     }
1420     UnicodeString testFileAsString(TRUE, testFile, len);
1421 
1422     //
1423     //  Parse the test data file using a regular expression.
1424     //  Each kind of token is recognized in its own capture group; what type of item was scanned
1425     //     is identified by which group had a match.
1426     //
1427     //    Caputure Group #                  1          2            3            4           5
1428     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1429     //
1430     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1431     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1432     UnicodeString   testString;
1433     UVector32       breakPositions(status);
1434     int             lineNumber = 1;
1435     TEST_ASSERT_SUCCESS(status);
1436     if (U_FAILURE(status)) {
1437         return;
1438     }
1439 
1440     //
1441     //  Scan through each test case, building up the string to be broken in testString,
1442     //   and the positions that should be boundaries in the breakPositions vector.
1443     //
1444     int spin = 0;
1445     while (tokenMatcher.find()) {
1446         if(tokenMatcher.hitEnd()) {
1447           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
1448              This occurred when the text file was corrupt (wasn't marked as UTF-8)
1449              and caused an infinite loop here on EBCDIC systems!
1450           */
1451           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1452           //       return;
1453         }
1454         if (tokenMatcher.start(1, status) >= 0) {
1455             // Scanned a divide sign, indicating a break position in the test data.
1456             if (testString.length()>0) {
1457                 breakPositions.addElement(testString.length(), status);
1458             }
1459         }
1460         else if (tokenMatcher.start(2, status) >= 0) {
1461             // Scanned an 'x', meaning no break at this position in the test data
1462             //   Nothing to be done here.
1463             }
1464         else if (tokenMatcher.start(3, status) >= 0) {
1465             // Scanned Hex digits.  Convert them to binary, append to the character data string.
1466             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1467             int length = hexNumber.length();
1468             if (length<=8) {
1469                 char buf[10];
1470                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1471                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1472                 if (c<=0x10ffff) {
1473                     testString.append(c);
1474                 } else {
1475                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1476                        fileName, lineNumber);
1477                 }
1478             } else {
1479                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1480                        fileName, lineNumber);
1481              }
1482         }
1483         else if (tokenMatcher.start(4, status) >= 0) {
1484             // Scanned to end of a line, possibly skipping over a comment in the process.
1485             //   If the line from the file contained test data, run the test now.
1486             if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1487                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1488             }
1489 
1490             // Clear out this test case.
1491             //    The string and breakPositions vector will be refilled as the next
1492             //       test case is parsed.
1493             testString.remove();
1494             breakPositions.removeAllElements();
1495             lineNumber++;
1496         } else {
1497             // Scanner catchall.  Something unrecognized appeared on the line.
1498             char token[16];
1499             UnicodeString uToken = tokenMatcher.group(0, status);
1500             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1501             token[sizeof(token)-1] = 0;
1502             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1503 
1504             // Clean up, in preparation for continuing with the next line.
1505             testString.remove();
1506             breakPositions.removeAllElements();
1507             lineNumber++;
1508         }
1509         TEST_ASSERT_SUCCESS(status);
1510         if (U_FAILURE(status)) {
1511             break;
1512         }
1513     }
1514 
1515     delete [] testFile;
1516  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1517 }
1518 
1519 //--------------------------------------------------------------------------------------------
1520 //
1521 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1522 //                            test data files.  Do only a simple, forward-only check -
1523 //                            this test is mostly to check that ICU and the Unicode
1524 //                            data agree with each other.
1525 //
1526 //--------------------------------------------------------------------------------------------
checkUnicodeTestCase(const char * testFileName,int lineNumber,const UnicodeString & testString,UVector32 * breakPositions,RuleBasedBreakIterator * bi)1527 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1528                          const UnicodeString &testString,   // Text data to be broken
1529                          UVector32 *breakPositions,         // Positions where breaks should be found.
1530                          RuleBasedBreakIterator *bi) {
1531     int32_t pos;                 // Break Position in the test string
1532     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1533     int32_t expectedPos;         // Expected break position (index into test string)
1534 
1535     bi->setText(testString);
1536     pos = bi->first();
1537     pos = bi->next();
1538 
1539     while (pos != BreakIterator::DONE) {
1540         if (expectedI >= breakPositions->size()) {
1541             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1542                 testFileName, lineNumber, pos);
1543             break;
1544         }
1545         expectedPos = breakPositions->elementAti(expectedI);
1546         if (pos < expectedPos) {
1547             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1548                 testFileName, lineNumber, pos);
1549             break;
1550         }
1551         if (pos > expectedPos) {
1552             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1553                 testFileName, lineNumber, expectedPos);
1554             break;
1555         }
1556         pos = bi->next();
1557         expectedI++;
1558     }
1559 
1560     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1561         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1562             testFileName, lineNumber, breakPositions->elementAti(expectedI));
1563     }
1564 }
1565 
1566 
1567 
1568 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1569 //---------------------------------------------------------------------------------------
1570 //
//   class RBBIMonkeyKind
1572 //
1573 //      Monkey Test for Break Iteration
1574 //      Abstract interface class.   Concrete derived classes independently
1575 //      implement the break rules for different iterator types.
1576 //
//      The Monkey Test itself doesn't know which type of break iterator it is
1578 //      testing, but works purely in terms of the interface defined here.
1579 //
1580 //---------------------------------------------------------------------------------------
1581 class RBBIMonkeyKind {
1582 public:
1583     // Return a UVector of UnicodeSets, representing the character classes used
1584     //   for this type of iterator.
1585     virtual  UVector  *charClasses() = 0;
1586 
1587     // Set the test text on which subsequent calls to next() will operate
1588     virtual  void      setText(const UnicodeString &s) = 0;
1589 
1590     // Find the next break postion, starting from the prev break position, or from zero.
1591     // Return -1 after reaching end of string.
1592     virtual  int32_t   next(int32_t i) = 0;
1593 
1594     // Name of each character class, parallel with charClasses. Used for debugging output
1595     // of characters.
1596     virtual  std::vector<std::string>&     characterClassNames();
1597 
1598     void setAppliedRule(int32_t position, const char* value);
1599 
1600     std::string getAppliedRule(int32_t position);
1601 
1602     virtual ~RBBIMonkeyKind();
1603     UErrorCode deferredStatus;
1604 
1605     std::string classNameFromCodepoint(const UChar32 c);
1606     unsigned int maxClassNameSize();
1607 
1608  protected:
1609      RBBIMonkeyKind();
1610      std::vector<std::string> classNames;
1611      std::vector<std::string> appliedRules;
1612 
1613     // Clear `appliedRules` and fill it with empty strings in the size of test text.
1614     void prepareAppliedRules(int32_t size );
1615 
1616  private:
1617 
1618 };
1619 
RBBIMonkeyKind()1620 RBBIMonkeyKind::RBBIMonkeyKind() {
1621     deferredStatus = U_ZERO_ERROR;
1622 }
1623 
~RBBIMonkeyKind()1624 RBBIMonkeyKind::~RBBIMonkeyKind() {
1625 }
1626 
characterClassNames()1627 std::vector<std::string>& RBBIMonkeyKind::characterClassNames() {
1628     return classNames;
1629 }
1630 
prepareAppliedRules(int32_t size)1631 void RBBIMonkeyKind::prepareAppliedRules(int32_t size) {
1632     // Remove all the information in the `appliedRules`.
1633     appliedRules.clear();
1634     appliedRules.resize(size + 1);
1635 }
1636 
setAppliedRule(int32_t position,const char * value)1637 void RBBIMonkeyKind::setAppliedRule(int32_t position, const char* value) {
1638     appliedRules[position] = value;
1639 }
1640 
getAppliedRule(int32_t position)1641 std::string RBBIMonkeyKind::getAppliedRule(int32_t position){
1642     return appliedRules[position];
1643 }
1644 
classNameFromCodepoint(const UChar32 c)1645 std::string RBBIMonkeyKind::classNameFromCodepoint(const UChar32 c) {
1646     // Simply iterate through charClasses to find character's class
1647     for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1648         UnicodeSet *classSet = (UnicodeSet *)charClasses()->elementAt(aClassNum);
1649         if (classSet->contains(c)) {
1650             return classNames[aClassNum];
1651         }
1652     }
1653     U_ASSERT(FALSE);  // This should not happen.
1654     return "bad class name";
1655 }
1656 
maxClassNameSize()1657 unsigned int RBBIMonkeyKind::maxClassNameSize() {
1658     unsigned int maxSize = 0;
1659     for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1660         auto aClassNumSize = static_cast<unsigned int>(classNames[aClassNum].size());
1661         if (aClassNumSize > maxSize) {
1662             maxSize = aClassNumSize;
1663         }
1664     }
1665     return maxSize;
1666 }
1667 
1668 //----------------------------------------------------------------------------------------
1669 //
1670 //   Random Numbers.  Similar to standard lib rand() and srand()
1671 //                    Not using library to
1672 //                      1.  Get same results on all platforms.
1673 //                      2.  Get access to current seed, to more easily reproduce failures.
1674 //
1675 //---------------------------------------------------------------------------------------
// Current generator state.  Assign to it to reseed; read it to reproduce a run.
static uint32_t m_seed = 1;

// Advance the linear congruential generator one step (the arithmetic wraps
// modulo 2^32) and return bits 16..30 of the new state, a value in 0..32767.
static uint32_t m_rand()
{
    m_seed = m_seed * 1103515245u + 12345u;
    return (m_seed >> 16) & 0x7FFFu;
}
1683 
1684 
1685 //------------------------------------------------------------------------------------------
1686 //
1687 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
1688 //                             of RBBIMonkeyKind.
1689 //
1690 //------------------------------------------------------------------------------------------
1691 class RBBICharMonkey: public RBBIMonkeyKind {
1692 public:
1693     RBBICharMonkey();
1694     virtual          ~RBBICharMonkey();
1695     virtual  UVector *charClasses();
1696     virtual  void     setText(const UnicodeString &s);
1697     virtual  int32_t  next(int32_t i);
1698 private:
1699     UVector   *fSets;
1700 
1701     UnicodeSet  *fCRLFSet;
1702     UnicodeSet  *fControlSet;
1703     UnicodeSet  *fExtendSet;
1704     UnicodeSet  *fZWJSet;
1705     UnicodeSet  *fRegionalIndicatorSet;
1706     UnicodeSet  *fPrependSet;
1707     UnicodeSet  *fSpacingSet;
1708     UnicodeSet  *fLSet;
1709     UnicodeSet  *fVSet;
1710     UnicodeSet  *fTSet;
1711     UnicodeSet  *fLVSet;
1712     UnicodeSet  *fLVTSet;
1713     UnicodeSet  *fHangulSet;
1714     UnicodeSet  *fExtendedPictSet;
1715     UnicodeSet  *fViramaSet;
1716     UnicodeSet  *fLinkingConsonantSet;
1717     UnicodeSet  *fExtCccZwjSet;
1718     UnicodeSet  *fAnySet;
1719 
1720     const UnicodeString *fText;
1721 };
1722 
1723 
RBBICharMonkey()1724 RBBICharMonkey::RBBICharMonkey() {
1725     UErrorCode  status = U_ZERO_ERROR;
1726 
1727     fText = NULL;
1728 
1729     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1730     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
1731     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
1732     fZWJSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
1733     fRegionalIndicatorSet =
1734                   new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1735     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1736     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1737     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1738     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1739     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1740     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1741     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1742     fHangulSet  = new UnicodeSet();
1743     fHangulSet->addAll(*fLSet);
1744     fHangulSet->addAll(*fVSet);
1745     fHangulSet->addAll(*fTSet);
1746     fHangulSet->addAll(*fLVSet);
1747     fHangulSet->addAll(*fLVTSet);
1748 
1749     fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1750     fViramaSet        = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1751                                         "\\p{Indic_Syllabic_Category=Virama}]", status);
1752     fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1753                                         "\\p{Indic_Syllabic_Category=Consonant}]", status);
1754     fExtCccZwjSet     = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status);
1755     fAnySet           = new UnicodeSet(0, 0x10ffff);
1756 
1757     // Create sets of characters, and add the names of the above character sets.
1758     // In each new ICU release, add new names corresponding to the sets above.
1759     fSets             = new UVector(status);
1760 
1761     // Important: Keep class names the same as the class contents.
1762     fSets->addElement(fCRLFSet, status); classNames.push_back("CRLF");
1763     fSets->addElement(fControlSet, status); classNames.push_back("Control");
1764     fSets->addElement(fExtendSet, status); classNames.push_back("Extended");
1765     fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
1766     if (!fPrependSet->isEmpty()) {
1767         fSets->addElement(fPrependSet, status); classNames.push_back("Prepend");
1768     }
1769     fSets->addElement(fSpacingSet, status); classNames.push_back("Spacing");
1770     fSets->addElement(fHangulSet, status); classNames.push_back("Hangul");
1771     fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
1772     fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
1773     fSets->addElement(fViramaSet, status); classNames.push_back("Virama");
1774     fSets->addElement(fLinkingConsonantSet, status); classNames.push_back("LinkingConsonant");
1775     fSets->addElement(fExtCccZwjSet, status); classNames.push_back("ExtCcccZwj");
1776     fSets->addElement(fAnySet, status); classNames.push_back("Any");
1777 
1778     if (U_FAILURE(status)) {
1779         deferredStatus = status;
1780     }
1781 }
1782 
1783 
setText(const UnicodeString & s)1784 void RBBICharMonkey::setText(const UnicodeString &s) {
1785     fText = &s;
1786     prepareAppliedRules(s.length());
1787 }
1788 
1789 
1790 
next(int32_t prevPos)1791 int32_t RBBICharMonkey::next(int32_t prevPos) {
1792     int    p0, p1, p2, p3;    // Indices of the significant code points around the
1793                               //   break position being tested.  The candidate break
1794                               //   location is before p2.
1795 
1796     int     breakPos = -1;
1797 
1798     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
1799     UChar32 cBase;            // for (X Extend*) patterns, the X character.
1800 
1801     if (U_FAILURE(deferredStatus)) {
1802         return -1;
1803     }
1804 
1805     // Previous break at end of string.  return DONE.
1806     if (prevPos >= fText->length()) {
1807         return -1;
1808     }
1809 
1810     p0 = p1 = p2 = p3 = prevPos;
1811     c3 =  fText->char32At(prevPos);
1812     c0 = c1 = c2 = cBase = 0;
1813     (void)p0;   // suppress set but not used warning.
1814     (void)c0;
1815 
1816     // Loop runs once per "significant" character position in the input text.
1817     for (;;) {
1818         // Move all of the positions forward in the input string.
1819         p0 = p1;  c0 = c1;
1820         p1 = p2;  c1 = c2;
1821         p2 = p3;  c2 = c3;
1822 
1823         // Advance p3 by one codepoint
1824         p3 = fText->moveIndex32(p3, 1);
1825         c3 = fText->char32At(p3);
1826 
1827         if (p1 == p2) {
1828             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
1829             continue;
1830         }
1831 
1832         if (p2 == fText->length()) {
1833             setAppliedRule(p2, "End of String");
1834             break;
1835         }
1836 
1837         //     No Extend or Format characters may appear between the CR and LF,
1838         //     which requires the additional check for p2 immediately following p1.
1839         //
1840         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
1841           setAppliedRule(p2, "GB3   CR x LF");
1842           continue;
1843         }
1844 
1845         if (fControlSet->contains(c1) ||
1846             c1 == 0x0D ||
1847             c1 == 0x0A)  {
1848           setAppliedRule(p2, "GB4   ( Control | CR | LF ) <break>");
1849           break;
1850         }
1851 
1852         if (fControlSet->contains(c2) ||
1853             c2 == 0x0D ||
1854             c2 == 0x0A)  {
1855             setAppliedRule(p2, "GB5   <break>  ( Control | CR | LF )");
1856             break;
1857         }
1858 
1859         if (fLSet->contains(c1) &&
1860                (fLSet->contains(c2)  ||
1861                 fVSet->contains(c2)  ||
1862                 fLVSet->contains(c2) ||
1863                 fLVTSet->contains(c2))) {
1864             setAppliedRule(p2, "GB6   L x ( L | V | LV | LVT )");
1865             continue;
1866         }
1867 
1868         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1869             (fVSet->contains(c2) || fTSet->contains(c2)))  {
1870             setAppliedRule(p2, "GB7    ( LV | V )  x  ( V | T )");
1871             continue;
1872         }
1873 
1874         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1875             fTSet->contains(c2))  {
1876             setAppliedRule(p2, "GB8   ( LVT | T)  x T");
1877             continue;
1878         }
1879 
1880         if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
1881             if (!fExtendSet->contains(c1)) {
1882                 cBase = c1;
1883             }
1884             setAppliedRule(p2, "GB9   x (Extend | ZWJ)");
1885             continue;
1886         }
1887 
1888         if (fSpacingSet->contains(c2)) {
1889             setAppliedRule(p2, "GB9a  x  SpacingMark");
1890             continue;
1891         }
1892 
1893         if (fPrependSet->contains(c1)) {
1894             setAppliedRule(p2, "GB9b  Prepend x");
1895             continue;
1896         }
1897 
1898         //   Note: Viramas are also included in the ExtCccZwj class.
1899         if (fLinkingConsonantSet->contains(c2)) {
1900             int pi = p1;
1901             bool sawVirama = false;
1902             while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) {
1903                 if (fViramaSet->contains(fText->char32At(pi))) {
1904                     sawVirama = true;
1905                 }
1906                 pi = fText->moveIndex32(pi, -1);
1907             }
1908             if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) {
1909               setAppliedRule(p2, "GB9.3  LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant");
1910               continue;
1911             }
1912         }
1913 
1914         if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
1915           setAppliedRule(p2, "GB11  Extended_Pictographic Extend * ZWJ x Extended_Pictographic");
1916           continue;
1917         }
1918 
1919         //                   Note: The first if condition is a little tricky. We only need to force
1920         //                      a break if there are three or more contiguous RIs. If there are
1921         //                      only two, a break following will occur via other rules, and will include
1922         //                      any trailing extend characters, which is needed behavior.
1923         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
1924                 && fRegionalIndicatorSet->contains(c2)) {
1925           setAppliedRule(p2, "GB12-13  Regional_Indicator x Regional_Indicator");
1926           break;
1927         }
1928         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
1929           setAppliedRule(p2, "GB12-13  Regional_Indicator x Regional_Indicator");
1930           continue;
1931         }
1932 
1933         setAppliedRule(p2, "GB999 Any <break> Any");
1934         break;
1935     }
1936 
1937     breakPos = p2;
1938     return breakPos;
1939 }
1940 
1941 
1942 
charClasses()1943 UVector  *RBBICharMonkey::charClasses() {
1944     return fSets;
1945 }
1946 
~RBBICharMonkey()1947 RBBICharMonkey::~RBBICharMonkey() {
1948     delete fSets;
1949     delete fCRLFSet;
1950     delete fControlSet;
1951     delete fExtendSet;
1952     delete fRegionalIndicatorSet;
1953     delete fPrependSet;
1954     delete fSpacingSet;
1955     delete fLSet;
1956     delete fVSet;
1957     delete fTSet;
1958     delete fLVSet;
1959     delete fLVTSet;
1960     delete fHangulSet;
1961     delete fAnySet;
1962     delete fZWJSet;
1963     delete fExtendedPictSet;
1964     delete fViramaSet;
1965     delete fLinkingConsonantSet;
1966     delete fExtCccZwjSet;
1967 }
1968 
1969 //------------------------------------------------------------------------------------------
1970 //
1971 //   class RBBIWordMonkey      Word Break specific implementation
1972 //                             of RBBIMonkeyKind.
1973 //
1974 //------------------------------------------------------------------------------------------
1975 class RBBIWordMonkey: public RBBIMonkeyKind {
1976 public:
1977     RBBIWordMonkey();
1978     virtual          ~RBBIWordMonkey();
1979     virtual  UVector *charClasses();
1980     virtual  void     setText(const UnicodeString &s);
1981     virtual int32_t   next(int32_t i);
1982 private:
1983     UVector      *fSets;
1984 
1985     UnicodeSet  *fCRSet;
1986     UnicodeSet  *fLFSet;
1987     UnicodeSet  *fNewlineSet;
1988     UnicodeSet  *fRegionalIndicatorSet;
1989     UnicodeSet  *fKatakanaSet;
1990     UnicodeSet  *fHebrew_LetterSet;
1991     UnicodeSet  *fALetterSet;
1992     UnicodeSet  *fSingle_QuoteSet;
1993     UnicodeSet  *fDouble_QuoteSet;
1994     UnicodeSet  *fMidNumLetSet;
1995     UnicodeSet  *fMidLetterSet;
1996     UnicodeSet  *fMidNumSet;
1997     UnicodeSet  *fNumericSet;
1998     UnicodeSet  *fFormatSet;
1999     UnicodeSet  *fOtherSet;
2000     UnicodeSet  *fExtendSet;
2001     UnicodeSet  *fExtendNumLetSet;
2002     UnicodeSet  *fWSegSpaceSet;
2003     UnicodeSet  *fDictionarySet;
2004     UnicodeSet  *fZWJSet;
2005     UnicodeSet  *fExtendedPictSet;
2006 
2007     const UnicodeString  *fText;
2008 };
2009 
2010 
RBBIWordMonkey()2011 RBBIWordMonkey::RBBIWordMonkey()
2012 {
2013     UErrorCode  status = U_ZERO_ERROR;
2014 
2015     fSets            = new UVector(status);
2016 
2017     fCRSet            = new UnicodeSet(u"[\\p{Word_Break = CR}]",           status);
2018     fLFSet            = new UnicodeSet(u"[\\p{Word_Break = LF}]",           status);
2019     fNewlineSet       = new UnicodeSet(u"[\\p{Word_Break = Newline}]",      status);
2020     fKatakanaSet      = new UnicodeSet(u"[\\p{Word_Break = Katakana}]",     status);
2021     fRegionalIndicatorSet =  new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
2022     fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
2023     fALetterSet       = new UnicodeSet(u"[\\p{Word_Break = ALetter}]", status);
2024     fSingle_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]",    status);
2025     fDouble_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]",    status);
2026     fMidNumLetSet     = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]",    status);
2027     fMidLetterSet     = new UnicodeSet(u"[\\p{Word_Break = MidLetter}]",    status);
2028     fMidNumSet        = new UnicodeSet(u"[\\p{Word_Break = MidNum}]",       status);
2029     fNumericSet       = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
2030     fFormatSet        = new UnicodeSet(u"[\\p{Word_Break = Format}]",       status);
2031     fExtendNumLetSet  = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
2032     // There are some sc=Hani characters with WB=Extend.
2033     // The break rules need to pick one or the other because
2034     // Extend overlapping with something else is messy.
2035     // For Unicode 13, we chose to keep U+16FF0 & U+16FF1
2036     // in $Han (for $dictionary) and out of $Extend.
2037     fExtendSet        = new UnicodeSet(u"[\\p{Word_Break = Extend}-[:Hani:]]", status);
2038     fWSegSpaceSet     = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]",    status);
2039 
2040     fZWJSet           = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]",          status);
2041     fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);
2042 
2043     fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
2044     fDictionarySet->addAll(*fKatakanaSet);
2045     fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
2046 
2047     fALetterSet->removeAll(*fDictionarySet);
2048 
2049     fOtherSet        = new UnicodeSet();
2050     if(U_FAILURE(status)) {
2051         IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
2052         deferredStatus = status;
2053         return;
2054     }
2055 
2056     fOtherSet->complement();
2057     fOtherSet->removeAll(*fCRSet);
2058     fOtherSet->removeAll(*fLFSet);
2059     fOtherSet->removeAll(*fNewlineSet);
2060     fOtherSet->removeAll(*fKatakanaSet);
2061     fOtherSet->removeAll(*fHebrew_LetterSet);
2062     fOtherSet->removeAll(*fALetterSet);
2063     fOtherSet->removeAll(*fSingle_QuoteSet);
2064     fOtherSet->removeAll(*fDouble_QuoteSet);
2065     fOtherSet->removeAll(*fMidLetterSet);
2066     fOtherSet->removeAll(*fMidNumSet);
2067     fOtherSet->removeAll(*fNumericSet);
2068     fOtherSet->removeAll(*fExtendNumLetSet);
2069     fOtherSet->removeAll(*fWSegSpaceSet);
2070     fOtherSet->removeAll(*fFormatSet);
2071     fOtherSet->removeAll(*fExtendSet);
2072     fOtherSet->removeAll(*fRegionalIndicatorSet);
2073     fOtherSet->removeAll(*fZWJSet);
2074     fOtherSet->removeAll(*fExtendedPictSet);
2075 
2076     // Inhibit dictionary characters from being tested at all.
2077     fOtherSet->removeAll(*fDictionarySet);
2078 
2079     // Add classes and their names
2080     fSets->addElement(fCRSet, status); classNames.push_back("CR");
2081     fSets->addElement(fLFSet, status); classNames.push_back("LF");
2082     fSets->addElement(fNewlineSet, status); classNames.push_back("Newline");
2083     fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
2084     fSets->addElement(fHebrew_LetterSet, status); classNames.push_back("Hebrew");
2085     fSets->addElement(fALetterSet, status); classNames.push_back("ALetter");
2086     fSets->addElement(fSingle_QuoteSet, status); classNames.push_back("Single Quote");
2087     fSets->addElement(fDouble_QuoteSet, status); classNames.push_back("Double Quote");
2088     // Omit Katakana from fSets, which omits Katakana characters
2089     // from the test data. They are all in the dictionary set,
2090     // which this (old, to be retired) monkey test cannot handle.
2091     //fSets->addElement(fKatakanaSet, status);
2092 
2093     fSets->addElement(fMidLetterSet, status); classNames.push_back("MidLetter");
2094     fSets->addElement(fMidNumLetSet, status); classNames.push_back("MidNumLet");
2095     fSets->addElement(fMidNumSet, status); classNames.push_back("MidNum");
2096     fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
2097     fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2098     fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2099     fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2100     fSets->addElement(fExtendNumLetSet, status); classNames.push_back("ExtendNumLet");
2101     fSets->addElement(fWSegSpaceSet, status); classNames.push_back("WSegSpace");
2102 
2103     fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
2104     fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
2105 
2106     if (U_FAILURE(status)) {
2107         deferredStatus = status;
2108     }
2109 }
2110 
setText(const UnicodeString & s)2111 void RBBIWordMonkey::setText(const UnicodeString &s) {
2112     fText       = &s;
2113     prepareAppliedRules(s.length());
2114 }
2115 
2116 
next(int32_t prevPos)2117 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2118     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2119                               //   break position being tested.  The candidate break
2120                               //   location is before p2.
2121 
2122     int     breakPos = -1;
2123 
2124     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2125 
2126     if (U_FAILURE(deferredStatus)) {
2127         return -1;
2128     }
2129 
2130     // Prev break at end of string.  return DONE.
2131     if (prevPos >= fText->length()) {
2132         return -1;
2133     }
2134     p0 = p1 = p2 = p3 = prevPos;
2135     c3 =  fText->char32At(prevPos);
2136     c0 = c1 = c2 = 0;
2137     (void)p0;       // Suppress set but not used warning.
2138 
2139     // Loop runs once per "significant" character position in the input text.
2140     for (;;) {
2141         // Move all of the positions forward in the input string.
2142         p0 = p1;  c0 = c1;
2143         p1 = p2;  c1 = c2;
2144         p2 = p3;  c2 = c3;
2145 
2146         // Advance p3 by    X(Extend | Format)*   Rule 4
2147         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2148         do {
2149             p3 = fText->moveIndex32(p3, 1);
2150             c3 = fText->char32At(p3);
2151             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2152                break;
2153             }
2154         }
2155         while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
2156 
2157 
2158         if (p1 == p2) {
2159             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2160             continue;
2161         }
2162 
2163         if (p2 == fText->length()) {
2164             // Reached end of string.  Always a break position.
2165             break;
2166         }
2167 
2168         //     No Extend or Format characters may appear between the CR and LF,
2169         //     which requires the additional check for p2 immediately following p1.
2170         //
2171         if (c1==0x0D && c2==0x0A) {
2172           setAppliedRule(p2, "WB3   CR x LF");
2173           continue;
2174         }
2175 
2176         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2177             setAppliedRule(p2, "WB3a  Break before and after newlines (including CR and LF)");
2178             break;
2179         }
2180         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2181             setAppliedRule(p2, "WB3a  Break before and after newlines (including CR and LF)");
2182             break;
2183         }
2184 
2185         //              Not ignoring extend chars, so peek into input text to
2186         //              get the potential ZWJ, the character immediately preceding c2.
2187         //              Sloppy UChar32 indexing: p2-1 may reference trail half
2188         //              but char32At will get the full code point.
2189         if (fZWJSet->contains(fText->char32At(p2 - 1)) && fExtendedPictSet->contains(c2)){
2190             setAppliedRule(p2, "WB3c  ZWJ x Extended_Pictographic");
2191             continue;
2192         }
2193 
2194         if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
2195             setAppliedRule(p2, "WB3d  Keep horizontal whitespace together.");
2196             continue;
2197         }
2198 
2199         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2200             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2201             setAppliedRule(p2, "WB4   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)");
2202             continue;
2203         }
2204 
2205         if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
2206              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2207              (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2208             setAppliedRule(p2,
2209                            "WB6   (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter _Letter)");
2210             continue;
2211         }
2212 
2213         if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2214             (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2215             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2216             setAppliedRule(p2,
2217                            "WB7   (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)");
2218             continue;
2219         }
2220 
2221         if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2222             setAppliedRule(p2, "WB7a  Hebrew_Letter x Single_Quote");
2223             continue;
2224         }
2225 
2226           if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2227             setAppliedRule(p2, "WB7b  Hebrew_Letter x Double_Quote Hebrew_Letter");
2228             continue;
2229         }
2230 
2231         if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2232             setAppliedRule(p2, "WB7c  Hebrew_Letter Double_Quote x Hebrew_Letter");
2233             continue;
2234         }
2235 
2236         if (fNumericSet->contains(c1) &&
2237             fNumericSet->contains(c2)) {
2238             setAppliedRule(p2, "WB8   Numeric x Numeric");
2239             continue;
2240         }
2241 
2242         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2243             fNumericSet->contains(c2)) {
2244             setAppliedRule(p2, "WB9   (ALetter | Hebrew_Letter) x Numeric");
2245             continue;
2246         }
2247 
2248         if (fNumericSet->contains(c1) &&
2249             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2250             setAppliedRule(p2, "WB10   Numeric x (ALetter | Hebrew_Letter)");
2251             continue;
2252         }
2253 
2254           if (fNumericSet->contains(c0) &&
2255             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
2256             fNumericSet->contains(c2)) {
2257             setAppliedRule(p2, "WB11  Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric");
2258             continue;
2259         }
2260 
2261         if (fNumericSet->contains(c1) &&
2262             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
2263             fNumericSet->contains(c3)) {
2264             setAppliedRule(p2, "WB12  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric");
2265             continue;
2266         }
2267 
2268         //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
2269         //                  all Katakana are handled by the dictionary breaker.
2270         if (fKatakanaSet->contains(c1) &&
2271             fKatakanaSet->contains(c2))  {
2272             setAppliedRule(p2, "WB13  Katakana x Katakana");
2273             continue;
2274         }
2275 
2276         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2277              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2278              fExtendNumLetSet->contains(c2)) {
2279             setAppliedRule(p2,
2280                            "WB13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet");
2281             continue;
2282         }
2283 
2284         if (fExtendNumLetSet->contains(c1) &&
2285                 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2286                  fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
2287             setAppliedRule(p2, "WB13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)");
2288             continue;
2289         }
2290 
2291         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2292             setAppliedRule(p2, "WB15 - WB17   Group pairs of Regional Indicators.");
2293             break;
2294         }
2295         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2296             setAppliedRule(p2, "WB15 - WB17   Group pairs of Regional Indicators.");
2297             continue;
2298         }
2299 
2300         setAppliedRule(p2, "WB999");
2301         break;
2302     }
2303 
2304     breakPos = p2;
2305     return breakPos;
2306 }
2307 
2308 
charClasses()2309 UVector  *RBBIWordMonkey::charClasses() {
2310     return fSets;
2311 }
2312 
~RBBIWordMonkey()2313 RBBIWordMonkey::~RBBIWordMonkey() {
2314     delete fSets;
2315     delete fCRSet;
2316     delete fLFSet;
2317     delete fNewlineSet;
2318     delete fKatakanaSet;
2319     delete fHebrew_LetterSet;
2320     delete fALetterSet;
2321     delete fSingle_QuoteSet;
2322     delete fDouble_QuoteSet;
2323     delete fMidNumLetSet;
2324     delete fMidLetterSet;
2325     delete fMidNumSet;
2326     delete fNumericSet;
2327     delete fFormatSet;
2328     delete fExtendSet;
2329     delete fExtendNumLetSet;
2330     delete fWSegSpaceSet;
2331     delete fRegionalIndicatorSet;
2332     delete fDictionarySet;
2333     delete fOtherSet;
2334     delete fZWJSet;
2335     delete fExtendedPictSet;
2336 }
2337 
2338 
2339 
2340 
2341 //------------------------------------------------------------------------------------------
2342 //
2343 //   class RBBISentMonkey      Sentence Break specific implementation
2344 //                             of RBBIMonkeyKind.
2345 //
2346 //------------------------------------------------------------------------------------------
2347 class RBBISentMonkey: public RBBIMonkeyKind {
2348 public:
2349     RBBISentMonkey();
2350     virtual          ~RBBISentMonkey();
2351     virtual  UVector *charClasses();
2352     virtual  void     setText(const UnicodeString &s);
2353     virtual int32_t   next(int32_t i);
2354 private:
2355     int               moveBack(int posFrom);
2356     int               moveForward(int posFrom);
2357     UChar32           cAt(int pos);
2358 
2359     UVector      *fSets;
2360 
2361     UnicodeSet  *fSepSet;
2362     UnicodeSet  *fFormatSet;
2363     UnicodeSet  *fSpSet;
2364     UnicodeSet  *fLowerSet;
2365     UnicodeSet  *fUpperSet;
2366     UnicodeSet  *fOLetterSet;
2367     UnicodeSet  *fNumericSet;
2368     UnicodeSet  *fATermSet;
2369     UnicodeSet  *fSContinueSet;
2370     UnicodeSet  *fSTermSet;
2371     UnicodeSet  *fCloseSet;
2372     UnicodeSet  *fOtherSet;
2373     UnicodeSet  *fExtendSet;
2374 
2375     const UnicodeString  *fText;
2376 };
2377 
RBBISentMonkey()2378 RBBISentMonkey::RBBISentMonkey()
2379 {
2380     UErrorCode  status = U_ZERO_ERROR;
2381 
2382     fSets            = new UVector(status);
2383 
2384     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2385     //                       set and made into character classes of their own.  For the monkey impl,
2386     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2387     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2388     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2389     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2390     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2391     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2392     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2393     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2394     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2395     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2396     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2397     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2398     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2399     fOtherSet        = new UnicodeSet();
2400 
2401     if(U_FAILURE(status)) {
2402       deferredStatus = status;
2403       return;
2404     }
2405 
2406     fOtherSet->complement();
2407     fOtherSet->removeAll(*fSepSet);
2408     fOtherSet->removeAll(*fFormatSet);
2409     fOtherSet->removeAll(*fSpSet);
2410     fOtherSet->removeAll(*fLowerSet);
2411     fOtherSet->removeAll(*fUpperSet);
2412     fOtherSet->removeAll(*fOLetterSet);
2413     fOtherSet->removeAll(*fNumericSet);
2414     fOtherSet->removeAll(*fATermSet);
2415     fOtherSet->removeAll(*fSContinueSet);
2416     fOtherSet->removeAll(*fSTermSet);
2417     fOtherSet->removeAll(*fCloseSet);
2418     fOtherSet->removeAll(*fExtendSet);
2419 
2420     fSets->addElement(fSepSet, status); classNames.push_back("Sep");
2421     fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2422     fSets->addElement(fSpSet, status); classNames.push_back("Sp");
2423     fSets->addElement(fLowerSet, status); classNames.push_back("Lower");
2424     fSets->addElement(fUpperSet, status); classNames.push_back("Upper");
2425     fSets->addElement(fOLetterSet, status); classNames.push_back("OLetter");
2426     fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
2427     fSets->addElement(fATermSet, status); classNames.push_back("ATerm");
2428     fSets->addElement(fSContinueSet, status); classNames.push_back("SContinue");
2429     fSets->addElement(fSTermSet, status); classNames.push_back("STerm");
2430     fSets->addElement(fCloseSet, status); classNames.push_back("Close");
2431     fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2432     fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2433 
2434     if (U_FAILURE(status)) {
2435         deferredStatus = status;
2436     }
2437 }
2438 
2439 
2440 
setText(const UnicodeString & s)2441 void RBBISentMonkey::setText(const UnicodeString &s) {
2442     fText       = &s;
2443     prepareAppliedRules(s.length());
2444 }
2445 
charClasses()2446 UVector  *RBBISentMonkey::charClasses() {
2447     return fSets;
2448 }
2449 
2450 //  moveBack()   Find the "significant" code point preceding the index i.
2451 //               Skips over ($Extend | $Format)* .
2452 //
moveBack(int i)2453 int RBBISentMonkey::moveBack(int i) {
2454     if (i <= 0) {
2455         return -1;
2456     }
2457     UChar32   c;
2458     int32_t   j = i;
2459     do {
2460         j = fText->moveIndex32(j, -1);
2461         c = fText->char32At(j);
2462     }
2463     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2464     return j;
2465 
2466  }
2467 
2468 
moveForward(int i)2469 int RBBISentMonkey::moveForward(int i) {
2470     if (i>=fText->length()) {
2471         return fText->length();
2472     }
2473     UChar32   c;
2474     int32_t   j = i;
2475     do {
2476         j = fText->moveIndex32(j, 1);
2477         c = cAt(j);
2478     }
2479     while (fFormatSet->contains(c) || fExtendSet->contains(c));
2480     return j;
2481 }
2482 
cAt(int pos)2483 UChar32 RBBISentMonkey::cAt(int pos) {
2484     if (pos<0 || pos>=fText->length()) {
2485         return -1;
2486     } else {
2487         return fText->char32At(pos);
2488     }
2489 }
2490 
next(int32_t prevPos)2491 int32_t RBBISentMonkey::next(int32_t prevPos) {
2492     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2493                               //   break position being tested.  The candidate break
2494                               //   location is before p2.
2495 
2496     int     breakPos = -1;
2497 
2498     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2499     UChar32 c;
2500 
2501     if (U_FAILURE(deferredStatus)) {
2502         return -1;
2503     }
2504 
2505     // Prev break at end of string.  return DONE.
2506     if (prevPos >= fText->length()) {
2507         return -1;
2508     }
2509     p0 = p1 = p2 = p3 = prevPos;
2510     c3 =  fText->char32At(prevPos);
2511     c0 = c1 = c2 = 0;
2512     (void)p0;     // Suppress set but not used warning.
2513 
2514     // Loop runs once per "significant" character position in the input text.
2515     for (;;) {
2516         // Move all of the positions forward in the input string.
2517         p0 = p1;  c0 = c1;
2518         p1 = p2;  c1 = c2;
2519         p2 = p3;  c2 = c3;
2520 
2521         // Advance p3 by    X(Extend | Format)*   Rule 4
2522         p3 = moveForward(p3);
2523         c3 = cAt(p3);
2524 
2525         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2526             setAppliedRule(p2, "SB3   CR x LF");
2527             continue;
2528         }
2529 
2530         if (fSepSet->contains(c1)) {
2531             p2 = p1+1;   // Separators don't combine with Extend or Format.
2532 
2533             setAppliedRule(p2, "SB4   Sep  <break>");
2534             break;
2535         }
2536 
2537         if (p2 >= fText->length()) {
2538             // Reached end of string.  Always a break position.
2539             setAppliedRule(p2, "SB4   Sep  <break>");
2540             break;
2541         }
2542 
2543         if (p2 == prevPos) {
2544             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2545             setAppliedRule(p2, "SB4   Sep  <break>");
2546             continue;
2547         }
2548 
2549         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
2550             setAppliedRule(p2, "SB6   ATerm x Numeric");
2551             continue;
2552         }
2553 
2554           if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2555                 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2556             setAppliedRule(p2, "SB7   (Upper | Lower) ATerm  x  Uppper");
2557             continue;
2558         }
2559 
2560         //           Note:  STerm | ATerm are added to the negated part of the expression by a
2561         //                  note to the Unicode 5.0 documents.
2562         int p8 = p1;
2563         while (fSpSet->contains(cAt(p8))) {
2564             p8 = moveBack(p8);
2565         }
2566         while (fCloseSet->contains(cAt(p8))) {
2567             p8 = moveBack(p8);
2568         }
2569         if (fATermSet->contains(cAt(p8))) {
2570             p8=p2;
2571             for (;;) {
2572                 c = cAt(p8);
2573                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2574                     fLowerSet->contains(c) || fSepSet->contains(c) ||
2575                     fATermSet->contains(c) || fSTermSet->contains(c))  {
2576 
2577                     setAppliedRule(p2,
2578                                    "SB8   ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2579                     break;
2580                 }
2581                 p8 = moveForward(p8);
2582             }
2583             if (fLowerSet->contains(cAt(p8))) {
2584 
2585                 setAppliedRule(p2,
2586                                "SB8   ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2587                 continue;
2588             }
2589         }
2590 
2591         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2592             p8 = p1;
2593             while (fSpSet->contains(cAt(p8))) {
2594                 p8 = moveBack(p8);
2595             }
2596             while (fCloseSet->contains(cAt(p8))) {
2597                 p8 = moveBack(p8);
2598             }
2599             c = cAt(p8);
2600             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2601                 setAppliedRule(p2, "SB8a  (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm)");
2602                 continue;
2603             }
2604         }
2605 
2606         int p9 = p1;
2607         while (fCloseSet->contains(cAt(p9))) {
2608             p9 = moveBack(p9);
2609         }
2610         c = cAt(p9);
2611         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2612             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2613 
2614                 setAppliedRule(p2, "SB9  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)");
2615                 continue;
2616             }
2617         }
2618 
2619         int p10 = p1;
2620         while (fSpSet->contains(cAt(p10))) {
2621             p10 = moveBack(p10);
2622         }
2623         while (fCloseSet->contains(cAt(p10))) {
2624             p10 = moveBack(p10);
2625         }
2626         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2627             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2628                 setAppliedRule(p2, "SB10  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)");
2629                 continue;
2630             }
2631         }
2632 
2633         int p11 = p1;
2634         if (fSepSet->contains(cAt(p11))) {
2635             p11 = moveBack(p11);
2636         }
2637         while (fSpSet->contains(cAt(p11))) {
2638             p11 = moveBack(p11);
2639         }
2640         while (fCloseSet->contains(cAt(p11))) {
2641             p11 = moveBack(p11);
2642         }
2643         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2644           setAppliedRule(p2, "SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>");
2645             break;
2646         }
2647 
2648         setAppliedRule(p2, "SB12  Any x Any");
2649         continue;
2650     }
2651 
2652     breakPos = p2;
2653     return breakPos;
2654 }
2655 
~RBBISentMonkey()2656 RBBISentMonkey::~RBBISentMonkey() {
2657     delete fSets;
2658     delete fSepSet;
2659     delete fFormatSet;
2660     delete fSpSet;
2661     delete fLowerSet;
2662     delete fUpperSet;
2663     delete fOLetterSet;
2664     delete fNumericSet;
2665     delete fATermSet;
2666     delete fSContinueSet;
2667     delete fSTermSet;
2668     delete fCloseSet;
2669     delete fOtherSet;
2670     delete fExtendSet;
2671 }
2672 
2673 
2674 
2675 //-------------------------------------------------------------------------------------------
2676 //
2677 //  RBBILineMonkey
2678 //
2679 //-------------------------------------------------------------------------------------------
2680 
2681 class RBBILineMonkey: public RBBIMonkeyKind {
2682 public:
2683     RBBILineMonkey();
2684     virtual          ~RBBILineMonkey();
2685     virtual  UVector *charClasses();
2686     virtual  void     setText(const UnicodeString &s);
2687     virtual  int32_t  next(int32_t i);
2688     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2689 private:
2690     UVector      *fSets;
2691 
2692     UnicodeSet  *fBK;
2693     UnicodeSet  *fCR;
2694     UnicodeSet  *fLF;
2695     UnicodeSet  *fCM;
2696     UnicodeSet  *fNL;
2697     UnicodeSet  *fSG;
2698     UnicodeSet  *fWJ;
2699     UnicodeSet  *fZW;
2700     UnicodeSet  *fGL;
2701     UnicodeSet  *fCB;
2702     UnicodeSet  *fSP;
2703     UnicodeSet  *fB2;
2704     UnicodeSet  *fBA;
2705     UnicodeSet  *fBB;
2706     UnicodeSet  *fHH;
2707     UnicodeSet  *fHY;
2708     UnicodeSet  *fH2;
2709     UnicodeSet  *fH3;
2710     UnicodeSet  *fCL;
2711     UnicodeSet  *fCP;
2712     UnicodeSet  *fEX;
2713     UnicodeSet  *fIN;
2714     UnicodeSet  *fJL;
2715     UnicodeSet  *fJV;
2716     UnicodeSet  *fJT;
2717     UnicodeSet  *fNS;
2718     UnicodeSet  *fOP;
2719     UnicodeSet  *fQU;
2720     UnicodeSet  *fIS;
2721     UnicodeSet  *fNU;
2722     UnicodeSet  *fPO;
2723     UnicodeSet  *fPR;
2724     UnicodeSet  *fSY;
2725     UnicodeSet  *fAI;
2726     UnicodeSet  *fAL;
2727     UnicodeSet  *fCJ;
2728     UnicodeSet  *fHL;
2729     UnicodeSet  *fID;
2730     UnicodeSet  *fRI;
2731     UnicodeSet  *fXX;
2732     UnicodeSet  *fEB;
2733     UnicodeSet  *fEM;
2734     UnicodeSet  *fZWJ;
2735     UnicodeSet  *fOP30;
2736     UnicodeSet  *fCP30;
2737 
2738     BreakIterator        *fCharBI;
2739     const UnicodeString  *fText;
2740     RegexMatcher         *fNumberMatcher;
2741 };
2742 
RBBILineMonkey()2743 RBBILineMonkey::RBBILineMonkey() :
2744     RBBIMonkeyKind(),
2745     fSets(NULL),
2746 
2747     fCharBI(NULL),
2748     fText(NULL),
2749     fNumberMatcher(NULL)
2750 
2751 {
2752     if (U_FAILURE(deferredStatus)) {
2753         return;
2754     }
2755 
2756     UErrorCode  status = U_ZERO_ERROR;
2757 
2758     fSets  = new UVector(status);
2759 
2760     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2761     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2762     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2763     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2764     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2765     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2766     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2767     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2768     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2769     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2770     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2771     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2772     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2773     fHH    = new UnicodeSet();
2774     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2775     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2776     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2777     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2778     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2779     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2780     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2781     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2782     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2783     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2784     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2785     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2786     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2787     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2788     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2789     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2790     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2791     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2792     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2793     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2794     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2795     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2796     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2797     fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2798     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2799     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2800     fEB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status);
2801     fEM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
2802     fZWJ   = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
2803     fOP30  = new UnicodeSet(u"[\\p{Line_break=OP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2804     fCP30  = new UnicodeSet(u"[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2805 
2806     if (U_FAILURE(status)) {
2807         deferredStatus = status;
2808         return;
2809     }
2810 
2811     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
2812     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
2813     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
2814 
2815     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
2816     fCM->addAll(*fZWJ);    // ZWJ behaves as a CM.
2817 
2818     fHH->add(u'\u2010');   // Hyphen, '‐'
2819 
2820     // Sets and names.
2821     fSets->addElement(fBK, status); classNames.push_back("fBK");
2822     fSets->addElement(fCR, status); classNames.push_back("fCR");
2823     fSets->addElement(fLF, status); classNames.push_back("fLF");
2824     fSets->addElement(fCM, status); classNames.push_back("fCM");
2825     fSets->addElement(fNL, status); classNames.push_back("fNL");
2826     fSets->addElement(fWJ, status); classNames.push_back("fWJ");
2827     fSets->addElement(fZW, status); classNames.push_back("fZW");
2828     fSets->addElement(fGL, status); classNames.push_back("fGL");
2829     fSets->addElement(fCB, status); classNames.push_back("fCB");
2830     fSets->addElement(fSP, status); classNames.push_back("fSP");
2831     fSets->addElement(fB2, status); classNames.push_back("fB2");
2832     fSets->addElement(fBA, status); classNames.push_back("fBA");
2833     fSets->addElement(fBB, status); classNames.push_back("fBB");
2834     fSets->addElement(fHY, status); classNames.push_back("fHY");
2835     fSets->addElement(fH2, status); classNames.push_back("fH2");
2836     fSets->addElement(fH3, status); classNames.push_back("fH3");
2837     fSets->addElement(fCL, status); classNames.push_back("fCL");
2838     fSets->addElement(fCP, status); classNames.push_back("fCP");
2839     fSets->addElement(fEX, status); classNames.push_back("fEX");
2840     fSets->addElement(fIN, status); classNames.push_back("fIN");
2841     fSets->addElement(fJL, status); classNames.push_back("fJL");
2842     fSets->addElement(fJT, status); classNames.push_back("fJT");
2843     fSets->addElement(fJV, status); classNames.push_back("fJV");
2844     fSets->addElement(fNS, status); classNames.push_back("fNS");
2845     fSets->addElement(fOP, status); classNames.push_back("fOP");
2846     fSets->addElement(fQU, status); classNames.push_back("fQU");
2847     fSets->addElement(fIS, status); classNames.push_back("fIS");
2848     fSets->addElement(fNU, status); classNames.push_back("fNU");
2849     fSets->addElement(fPO, status); classNames.push_back("fPO");
2850     fSets->addElement(fPR, status); classNames.push_back("fPR");
2851     fSets->addElement(fSY, status); classNames.push_back("fSY");
2852     fSets->addElement(fAI, status); classNames.push_back("fAI");
2853     fSets->addElement(fAL, status); classNames.push_back("fAL");
2854     fSets->addElement(fHL, status); classNames.push_back("fHL");
2855     fSets->addElement(fID, status); classNames.push_back("fID");
2856     fSets->addElement(fWJ, status); classNames.push_back("fWJ");
2857     fSets->addElement(fRI, status); classNames.push_back("fRI");
2858     fSets->addElement(fSG, status); classNames.push_back("fSG");
2859     fSets->addElement(fEB, status); classNames.push_back("fEB");
2860     fSets->addElement(fEM, status); classNames.push_back("fEM");
2861     fSets->addElement(fZWJ, status); classNames.push_back("fZWJ");
2862     // TODO: fOP30 & fCP30 overlap with plain fOP. Probably OK, but fOP/CP chars will be over-represented.
2863     fSets->addElement(fOP30, status); classNames.push_back("fOP30");
2864     fSets->addElement(fCP30, status); classNames.push_back("fCP30");
2865 
2866     const char *rules =
2867             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
2868             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
2869             "((\\p{Line_Break=IS})(\\p{Line_Break=CM}|\\u200d)*)?"
2870             "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
2871             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
2872             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
2873             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
2874 
2875     fNumberMatcher = new RegexMatcher(
2876         UnicodeString(rules, -1, US_INV), 0, status);
2877 
2878     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2879 
2880     if (U_FAILURE(status)) {
2881         deferredStatus = status;
2882     }
2883 
2884 }
2885 
2886 
setText(const UnicodeString & s)2887 void RBBILineMonkey::setText(const UnicodeString &s) {
2888     fText       = &s;
2889     fCharBI->setText(s);
2890     prepareAppliedRules(s.length());
2891     fNumberMatcher->reset(s);
2892 }
2893 
2894 //
2895 //  rule9Adjust
2896 //     Line Break TR rules 9 and 10 implementation.
2897 //     This deals with combining marks and other sequences that
2898 //     that must be treated as if they were something other than what they actually are.
2899 //
2900 //     This is factored out into a separate function because it must be applied twice for
2901 //     each potential break, once to the chars before the position being checked, then
2902 //     again to the text following the possible break.
2903 //
rule9Adjust(int32_t pos,UChar32 * posChar,int32_t * nextPos,UChar32 * nextChar)2904 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2905     if (pos == -1) {
2906         // Invalid initial position.  Happens during the warmup iteration of the
2907         //   main loop in next().
2908         return;
2909     }
2910 
2911     int32_t  nPos = *nextPos;
2912 
2913     // LB 9  Keep combining sequences together.
2914     // advance over any CM class chars.  Note that Line Break CM is different
2915     // from the normal Grapheme Extend property.
2916     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2917           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2918         for (;;) {
2919             *nextChar = fText->char32At(nPos);
2920             if (!fCM->contains(*nextChar)) {
2921                 break;
2922             }
2923             nPos = fText->moveIndex32(nPos, 1);
2924         }
2925     }
2926 
2927 
2928     // LB 9 Treat X CM* as if it were x.
2929     //       No explicit action required.
2930 
2931     // LB 10  Treat any remaining combining mark as AL
2932     if (fCM->contains(*posChar)) {
2933         *posChar = u'A';
2934     }
2935 
2936     // Push the updated nextPos and nextChar back to our caller.
2937     // This only makes a difference if posChar got bigger by consuming a
2938     // combining sequence.
2939     *nextPos  = nPos;
2940     *nextChar = fText->char32At(nPos);
2941 }
2942 
2943 
2944 
// Reference ("monkey test") implementation of line break detection.
//
// Starting at startPos, walks fText one position at a time and tests the
// line break rules coded below, in the order they appear, until one of them
// says "break here".  The rule that decided each examined position is recorded
// through setAppliedRule().
//
//   startPos   Index in fText at which to begin looking for the next break.
//   returns    The index of the break that was found (the end of the text
//              counts as a break, LB2), or -1 if deferredStatus already holds
//              a failure or startPos is at or beyond the end of fText.
//
// Within the loop, "break" means "a line break is at pos, stop and return it",
// while "continue" means "no break at pos, advance to the next position".
int32_t RBBILineMonkey::next(int32_t startPos) {
    UErrorCode status = U_ZERO_ERROR;
    int32_t    pos;       //  Index of the char following a potential break position
    UChar32    thisChar;  //  Character at above position "pos"

    int32_t    prevPos;   //  Index of the char preceding a potential break position
    UChar32    prevChar;  //  Character at above position.  Note that prevChar
                          //   and thisChar may not be adjacent because combining
                          //   characters between them will be ignored.

    int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
    UChar32    prevCharX2;

    int32_t    nextPos;   //  Index of the next character following pos.
                          //     Usually skips over combining marks.
    int32_t    nextCPPos; //  Index of the code point following "pos."
                          //     May point to a combining mark.
                          //  NOTE(review): assigned below but never read in this function.
    int32_t    tPos;      //  temp value.
    UChar32    c;

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    if (startPos >= fText->length()) {
        return -1;
    }


    // Initial values for loop.  Loop will run the first time without finding breaks,
    //                           while the invalid values shift out and the "this" and
    //                           "prev" positions are filled in with good values.
    pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
    thisChar = prevChar  = prevCharX2 = 0;
    nextPos  = nextCPPos = startPos;


    // Loop runs once per position in the test text, until a break position
    //  is found.
    for (;;) {
        // Shift the sliding window forward by one position:
        //   prevX2 <- prev <- this <- next
        prevPosX2 = prevPos;
        prevCharX2 = prevChar;

        prevPos   = pos;
        prevChar  = thisChar;

        pos       = nextPos;
        thisChar  = fText->char32At(pos);

        nextCPPos = fText->moveIndex32(pos, 1);
        nextPos   = nextCPPos;


        if (pos >= fText->length()) {
            setAppliedRule(pos, "LB2 - Break at end of text.");
            break;
        }


        //             We do this one out-of-order because the adjustment does not change anything
        //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
        //             be applied.
        rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
        // The first adjustment may have moved pos, so recompute what follows it,
        // then apply the same adjustment to the (thisChar, following char) pair.
        nextCPPos = nextPos = fText->moveIndex32(pos, 1);
        c = fText->char32At(nextPos);
        rule9Adjust(pos, &thisChar, &nextPos, &c);

        // If the loop is still warming up - if we haven't shifted the initial
        //   -1 positions out of prevPos yet - loop back to advance the
        //    position in the input without any further looking for breaks.
        if (prevPos == -1) {
          setAppliedRule(pos, "LB 9 - adjust for combining sequences.");
            continue;
        }


        if (fBK->contains(prevChar)) {
            setAppliedRule(pos, "LB 4  Always break after hard line breaks");
            break;
        }


        // No break between CR and LF; this test must come before the
        // "break after CR / LF / NL" test that follows.
        if (prevChar == 0x0d && thisChar == 0x0a) {
            setAppliedRule(pos, "LB 5  Break after CR, LF, NL, but not inside CR LF");
            continue;
        }
        if (prevChar == 0x0d ||
            prevChar == 0x0a ||
            prevChar == 0x85)  {
            setAppliedRule(pos, "LB 5  Break after CR, LF, NL, but not inside CR LF");
            break;
        }


        if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
            fBK->contains(thisChar)) {
            setAppliedRule(pos, "LB 6  Don't break before hard line breaks");
            continue;
        }


        if (fSP->contains(thisChar)) {
            setAppliedRule(pos, "LB 7  Don't break before spaces or zero-width space.");
            continue;
        }

        // !!! ??? Is this the right text for the applied rule?
        if (fZW->contains(thisChar)) {
            setAppliedRule(pos, "LB 7  Don't break before spaces or zero-width space.");
            continue;
        }


        //       ZW SP* ÷
        //       Scan backwards from prevChar for SP* ZW
        tPos = prevPos;
        while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
            tPos = fText->moveIndex32(tPos, -1);
        }
        if (fZW->contains(fText->char32At(tPos))) {
            setAppliedRule(pos, "LB 8  Break after zero width space");
            break;
        }


        //          Move this test up, before LB8a, because numbers can match a longer sequence that would
        //          also match 8a.  e.g. NU ZWJ IS PO     (ZWJ acts like CM)
        if (fNumberMatcher->lookingAt(prevPos, status)) {
            // A matcher failure ends the scan: pos is returned as a break position.
            if (U_FAILURE(status)) {
                setAppliedRule(pos, "LB 25 Numbers");
                break;
            }
            // Matched a number.  But could have been just a single digit, which would
            //    not represent a "no break here" between prevChar and thisChar
            int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
            if (numEndIdx > pos) {
                // Number match includes at least our two chars being checked
                if (numEndIdx > nextPos) {
                    // Number match includes additional chars.  Update pos and nextPos
                    //   so that next loop iteration will continue at the end of the number,
                    //   checking for breaks between last char in number & whatever follows.
                    pos = nextPos = numEndIdx;
                    // Back pos up onto the last character of the number that is
                    // not in the CM set.
                    do {
                        pos = fText->moveIndex32(pos, -1);
                        thisChar = fText->char32At(pos);
                    } while (fCM->contains(thisChar));
                }
                setAppliedRule(pos, "LB 25 Numbers");
                continue;
            }
        }


        //       The monkey test's way of ignoring combining characters doesn't work
        //       for this rule. ZJ is also a CM. Need to get the actual character
        //       preceding "thisChar", not ignoring combining marks, possibly ZJ.
        {
            int32_t prevIdx = fText->moveIndex32(pos, -1);
            UChar32 prevC = fText->char32At(prevIdx);
            if (fZWJ->contains(prevC)) {
                setAppliedRule(pos, "LB 8a ZWJ x");
                continue;
            }
        }


        // appliedRule: "LB 9, 10"; //  Already done, at top of loop.";
        //


        //    x  WJ
        //    WJ  x
        //
        if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
            setAppliedRule(pos, "LB 11  Do not break before or after WORD JOINER and related characters.");
            continue;
        }


        if (fGL->contains(prevChar)) {
            setAppliedRule(pos, "LB 12  GL  x");
            continue;
        }


          if (!(fSP->contains(prevChar) ||
              fBA->contains(prevChar) ||
              fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
              setAppliedRule(pos, "LB 12a  [^SP BA HY] x GL");
              continue;
        }


        if (fCL->contains(thisChar) ||
                fCP->contains(thisChar) ||
                fEX->contains(thisChar) ||
                fSY->contains(thisChar)) {
            setAppliedRule(pos, "LB 13  Don't break before closings.");
            continue;
        }


        //       Scan backwards, checking for this sequence.
        //       The OP char could include combining marks, so we actually check for
        //           OP CM* SP*
        //       Another Twist: The Rule 9 fixes may have changed a SP CM
        //       sequence into a ID char, so before scanning back through spaces,
        //       verify that prevChar is indeed a space.  The prevChar variable
        //       may differ from fText[prevPos]
        tPos = prevPos;
        if (fSP->contains(prevChar)) {
            while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
                tPos=fText->moveIndex32(tPos, -1);
            }
        }
        while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
            tPos=fText->moveIndex32(tPos, -1);
        }
        if (fOP->contains(fText->char32At(tPos))) {
            setAppliedRule(pos, "LB 14 Don't break after OP SP*");
            continue;
        }


        if (nextPos < fText->length()) {
            // note: UnicodeString::char32At(length) returns ffff, not distinguishable
            //       from a legit ffff character. So test length separately.
            UChar32 nextChar = fText->char32At(nextPos);
            if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) {
                setAppliedRule(pos, "LB 14a Break before an IS that begins a number and follows a space");
                break;
            }
        }


          if (fIS->contains(thisChar)) {
              setAppliedRule(pos, "LB 14b  Do not break before numeric separators, even after spaces.");
              continue;
        }


        if (fOP->contains(thisChar)) {
            // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
            // NOTE(review): this declaration shadows the function-level tPos.
            int tPos = prevPos;
            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            if (fQU->contains(fText->char32At(tPos))) {
                setAppliedRule(pos, "LB 15    QU SP* x OP");
                continue;
            }
        }


        //    Scan backwards for SP* CM* (CL | CP)
        if (fNS->contains(thisChar)) {
            // NOTE(review): this declaration shadows the function-level tPos.
            int tPos = prevPos;
            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
                setAppliedRule(pos, "LB 16   (CL | CP) SP* x NS");
                continue;
            }
        }


        if (fB2->contains(thisChar)) {
            //  Scan backwards, checking for the B2 CM* SP* sequence.
            tPos = prevPos;
            if (fSP->contains(prevChar)) {
                while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
                    tPos=fText->moveIndex32(tPos, -1);
                }
            }
            while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
                tPos=fText->moveIndex32(tPos, -1);
            }
            if (fB2->contains(fText->char32At(tPos))) {
                setAppliedRule(pos, "LB 17   B2 SP* x B2");
                continue;
            }
        }


        if (fSP->contains(prevChar)) {
            setAppliedRule(pos, "LB 18    break after space");
            break;
        }

        //    x   QU
        //    QU  x
        if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
            setAppliedRule(pos, "LB 19");
            continue;
        }

        if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
            setAppliedRule(pos, "LB 20  Break around a CB");
            break;
        }

        //           Don't break between Hyphens and letters if a break precedes the hyphen.
        //           Formerly this was a Finnish tailoring.
        //           Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
        //           ^($HY | $HH) $AL;
        //           prevPosX2 is still -1 only while prevChar is the first character
        //           examined by this call, i.e. the hyphen sits at startPos.
        if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
                prevPosX2 == -1) {
            setAppliedRule(pos, "LB 20.09");
            continue;
        }

        if (fBA->contains(thisChar) ||
            fHY->contains(thisChar) ||
            fNS->contains(thisChar) ||
            fBB->contains(prevChar) )   {
            setAppliedRule(pos, "LB 21");
            continue;
        }

        if (fHL->contains(prevCharX2) &&
                (fHY->contains(prevChar) || fBA->contains(prevChar))) {
            setAppliedRule(pos, "LB 21a   HL (HY | BA) x");
            continue;
        }

        if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
            setAppliedRule(pos, "LB 21b SY x HL");
            continue;
        }

        if (fIN->contains(thisChar))   {
            setAppliedRule(pos, "LB 22");
            continue;
        }


        //          (AL | HL) x NU
        //          NU x (AL | HL)
        if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
            setAppliedRule(pos, "LB 23");
            continue;
        }
        if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
            setAppliedRule(pos, "LB 23");
            continue;
        }

        // Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
        //      PR x (ID | EB | EM)
        //     (ID | EB | EM) x PO
        if (fPR->contains(prevChar) &&
                (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar)))  {
            setAppliedRule(pos, "LB 23a");
            continue;
        }
        if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
                fPO->contains(thisChar)) {
            setAppliedRule(pos, "LB 23a");
            continue;
        }

        //   Do not break between prefix and letters or ideographs.
        //         (PR | PO) x (AL | HL)
        //         (AL | HL) x (PR | PO)
        if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
                (fAL->contains(thisChar) || fHL->contains(thisChar))) {
            setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
            continue;
        }
        if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
                (fPR->contains(thisChar) || fPO->contains(thisChar))) {
            setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
            continue;
        }

        // appliedRule: "LB 25 numbers match"; // moved up, before LB 8a,

        //         JL x (JL | JV | H2 | H3)
        if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
                                        fJV->contains(thisChar) ||
                                        fH2->contains(thisChar) ||
                                        fH3->contains(thisChar))) {
            setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
            continue;
                                        }

        //         (JV | H2) x (JV | JT)
        if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
            (fJV->contains(thisChar) || fJT->contains(thisChar))) {
            setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
            continue;
        }

        //         (JT | H3) x JT
        if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
            fJT->contains(thisChar)) {
            setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
            continue;
        }

        //         (JL | JV | JT | H2 | H3) x IN
        //         (JL | JV | JT | H2 | H3) x PO
        //         PR x (JL | JV | JT | H2 | H3)
        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
            fIN->contains(thisChar)) {
            setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
            continue;
            }
        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
            fPO->contains(thisChar)) {
            setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
            continue;
            }
        if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
            fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
            setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
            continue;
            }



        if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
            setAppliedRule(pos, "LB 28  Do not break between alphabetics (\"at\").");
            continue;
        }

          if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
              setAppliedRule(pos, "LB 29  Do not break between numeric punctuation and alphabetics (\"e.g.\").");
              continue;
        }

        //          (AL | NU) x OP
        //          CP x (AL | NU)
        //          Note that the tests use the fOP30 / fCP30 sets here, not fOP / fCP.
        if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP30->contains(thisChar)) {
            setAppliedRule(pos,  "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
            continue;
        }
        if (fCP30->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
            setAppliedRule(pos,  "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
            continue;
        }

        //             RI  x  RI
        //             Three RIs in a row: break before the third.
        if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
            setAppliedRule(pos, "LB30a    RI RI  ÷  RI");
            break;
        }
        if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
            // Two Regional Indicators have been paired.
            // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
            // following RI. This is a hack.
            // NOTE(review): this no-break case records the same label text as the
            // break case just above.
            thisChar = -1;
            setAppliedRule(pos, "LB30a    RI RI  ÷  RI");
            continue;
        }

        if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
            setAppliedRule(pos, "LB30b    Emoji Base x Emoji Modifier");
            continue;
        }

        // No earlier rule applied, so a break is allowed here.
        setAppliedRule(pos, "LB 31    Break everywhere else");
        break;
    }

    return pos;
}
3415 
3416 
charClasses()3417 UVector  *RBBILineMonkey::charClasses() {
3418     return fSets;
3419 }
3420 
3421 
// Destructor.  Deletes every object this monkey holds through a raw pointer:
// the fSets vector, each individual character class member (fBK ... fCP30),
// fCharBI, and the fNumberMatcher that next() uses for LB 25.
RBBILineMonkey::~RBBILineMonkey() {
    // NOTE(review): presumably fSets only references the sets deleted one by one
    // below and does not own them - confirm against the constructor.
    delete fSets;

    // Individual character class members.
    delete fBK;
    delete fCR;
    delete fLF;
    delete fCM;
    delete fNL;
    delete fWJ;
    delete fZW;
    delete fGL;
    delete fCB;
    delete fSP;
    delete fB2;
    delete fBA;
    delete fBB;
    delete fHH;
    delete fHY;
    delete fH2;
    delete fH3;
    delete fCL;
    delete fCP;
    delete fEX;
    delete fIN;
    delete fJL;
    delete fJV;
    delete fJT;
    delete fNS;
    delete fOP;
    delete fQU;
    delete fIS;
    delete fNU;
    delete fPO;
    delete fPR;
    delete fSY;
    delete fAI;
    delete fAL;
    delete fCJ;
    delete fHL;
    delete fID;
    delete fRI;
    delete fSG;
    delete fXX;
    delete fEB;
    delete fEM;
    delete fZWJ;
    delete fOP30;
    delete fCP30;

    // Helper objects.
    delete fCharBI;
    delete fNumberMatcher;
}
3474 
3475 
3476 //-------------------------------------------------------------------------------------------
3477 //
3478 //   TestMonkey
3479 //
3480 //     params
3481 //       seed=nnnnn        Random number starting seed.
3482 //                         Setting the seed allows errors to be reproduced.
3483 //       loop=nnn          Looping count.  Controls running time.
3484 //                         -1:  run forever.
3485 //                          0 or greater:  run length.
3486 //
3487 //       type = char | word | line | sent | title
3488 //
3489 //  Example:
3490 //     intltest  rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3491 //
3492 //-------------------------------------------------------------------------------------------
3493 
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)3494 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3495     int32_t val = defaultVal;
3496     name.append(" *= *(-?\\d+)");
3497     UErrorCode status = U_ZERO_ERROR;
3498     RegexMatcher m(name, params, 0, status);
3499     if (m.find()) {
3500         // The param exists.  Convert the string to an int.
3501         char valString[100];
3502         int32_t paramLength = m.end(1, status) - m.start(1, status);
3503         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3504             paramLength = (int32_t)(sizeof(valString)-2);
3505         }
3506         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3507         val = strtol(valString, NULL, 10);
3508 
3509         // Delete this parameter from the params string.
3510         m.reset();
3511         params = m.replaceFirst("", status);
3512     }
3513     U_ASSERT(U_SUCCESS(status));
3514     return val;
3515 }
3516 #endif
3517 
3518 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
testBreakBoundPreceding(RBBITest * test,UnicodeString ustr,BreakIterator * bi,int expected[],int expectedcount)3519 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3520                                     BreakIterator *bi,
3521                                     int expected[],
3522                                     int expectedcount)
3523 {
3524     int count = 0;
3525     int i = 0;
3526     int forward[50];
3527     bi->setText(ustr);
3528     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3529         forward[count] = i;
3530         if (count < expectedcount && expected[count] != i) {
3531             test->errln("%s:%d break forward test failed: expected %d but got %d",
3532                         __FILE__, __LINE__, expected[count], i);
3533             break;
3534         }
3535         count ++;
3536     }
3537     if (count != expectedcount) {
3538         printStringBreaks(ustr, expected, expectedcount);
3539         test->errln("%s:%d break forward test failed: missed %d match",
3540                     __FILE__, __LINE__, expectedcount - count);
3541         return;
3542     }
3543     // testing boundaries
3544     for (i = 1; i < expectedcount; i ++) {
3545         int j = expected[i - 1];
3546         if (!bi->isBoundary(j)) {
3547             printStringBreaks(ustr, expected, expectedcount);
3548             test->errln("%s:%d isBoundary() failed.  Expected boundary at position %d",
3549                     __FILE__, __LINE__, j);
3550             return;
3551         }
3552         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3553             if (bi->isBoundary(j)) {
3554                 printStringBreaks(ustr, expected, expectedcount);
3555                 test->errln("%s:%d isBoundary() failed.  Not expecting boundary at position %d",
3556                     __FILE__, __LINE__, j);
3557                 return;
3558             }
3559         }
3560     }
3561 
3562     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3563         count --;
3564         if (forward[count] != i) {
3565             printStringBreaks(ustr, expected, expectedcount);
3566             test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
3567                         __FILE__, __LINE__, forward[count], i);
3568             break;
3569         }
3570     }
3571     if (count != 0) {
3572         printStringBreaks(ustr, expected, expectedcount);
3573         test->errln("break test previous() failed: missed a match");
3574         return;
3575     }
3576 
3577     // testing preceding
3578     for (i = 0; i < expectedcount - 1; i ++) {
3579         // int j = expected[i] + 1;
3580         int j = ustr.moveIndex32(expected[i], 1);
3581         for (; j <= expected[i + 1]; j ++) {
3582             int32_t expectedPreceding = expected[i];
3583             int32_t actualPreceding = bi->preceding(j);
3584             if (actualPreceding != expectedPreceding) {
3585                 printStringBreaks(ustr, expected, expectedcount);
3586                 test->errln("%s:%d preceding(%d): expected %d, got %d",
3587                         __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
3588                 return;
3589             }
3590         }
3591     }
3592 }
3593 #endif
3594 
TestWordBreaks(void)3595 void RBBITest::TestWordBreaks(void)
3596 {
3597 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3598 
3599     Locale        locale("en");
3600     UErrorCode    status = U_ZERO_ERROR;
3601     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3602     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3603     // Replaced any C+J characters in a row with a random sequence of characters
3604     // of the same length to make our C+J segmentation not get in the way.
3605     static const char *strlist[] =
3606     {
3607     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3608     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3609     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3610     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3611     "\\uac00\\u3588\\u009c\\u0953\\u194b",
3612     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3613     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3614     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3615     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3616     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3617     "\\u2027\\U000e0067\\u0a47\\u00b7",
3618     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3619     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3620     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3621     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3622     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3623     "\\u0027\\u11af\\U000e0057\\u0602",
3624     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3625     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3626     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3627     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3628     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3629     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3630     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3631     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3632     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3633     "\\u18f4\\U000e0049\\u20e7\\u2027",
3634     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3635     "\\ua183\\u102d\\u0bec\\u003a",
3636     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3637     "\\u003a\\u0e57\\u0fad\\u002e",
3638     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3639     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3640     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3641     "\\u003a\\u0664\\u00b7\\u1fba",
3642     "\\u003b\\u0027\\u00b7\\u47a3",
3643     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3644     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3645     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3646     };
3647     int loop;
3648     if (U_FAILURE(status)) {
3649         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3650         return;
3651     }
3652     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3653         // printf("looping %d\n", loop);
3654         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3655         // RBBICharMonkey monkey;
3656         RBBIWordMonkey monkey;
3657 
3658         int expected[50];
3659         int expectedcount = 0;
3660 
3661         monkey.setText(ustr);
3662         int i;
3663         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3664             expected[expectedcount ++] = i;
3665         }
3666 
3667         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3668     }
3669     delete bi;
3670 #endif
3671 }
3672 
TestWordBoundary(void)3673 void RBBITest::TestWordBoundary(void)
3674 {
3675     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3676     Locale        locale("en");
3677     UErrorCode    status = U_ZERO_ERROR;
3678     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3679     LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
3680     if (U_FAILURE(status)) {
3681         errcheckln(status, "%s:%d Creation of break iterator failed %s",
3682                 __FILE__, __LINE__, u_errorName(status));
3683         return;
3684     }
3685     UChar         str[50];
3686     static const char *strlist[] =
3687     {
3688     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3689     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3690     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3691     "\\u2027\\U000e0067\\u0a47\\u00b7",
3692     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3693     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3694     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3695     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3696     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3697     "\\u0027\\u11af\\U000e0057\\u0602",
3698     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3699     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3700     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3701     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3702     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3703     "\\U000e0065\\u302c\\u09ee\\U000e0068",
3704     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3705     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3706     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3707     "\\u58f4\\U000e0049\\u20e7\\u2027",
3708     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3709     "\\ua183\\u102d\\u0bec\\u003a",
3710     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3711     "\\u003a\\u0e57\\u0fad\\u002e",
3712     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3713     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3714     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3715     "\\u003a\\u0664\\u00b7\\u1fba",
3716     "\\u003b\\u0027\\u00b7\\u47a3",
3717     };
3718     int loop;
3719     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3720         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3721         UnicodeString ustr(str);
3722         int forward[50];
3723         int count = 0;
3724 
3725         bi->setText(ustr);
3726         int prev = -1;
3727         for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
3728             ++count;
3729             if (count >= UPRV_LENGTHOF(forward)) {
3730                 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
3731                         __FILE__, __LINE__, loop, count, boundary);
3732                 return;
3733             }
3734             forward[count] = boundary;
3735             if (boundary <= prev) {
3736                 errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
3737                         __FILE__, __LINE__, loop, prev, boundary);
3738                 break;
3739             }
3740             for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
3741                 if (bi->isBoundary(nonBoundary)) {
3742                     printStringBreaks(ustr, forward, count);
3743                     errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
3744                            __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
3745                     return;
3746                 }
3747             }
3748             if (!bi->isBoundary(boundary)) {
3749                 printStringBreaks(ustr, forward, count);
3750                 errln("%s:%d happy boundary test failed: expected %d a boundary",
3751                        __FILE__, __LINE__, boundary);
3752                 return;
3753             }
3754             prev = boundary;
3755         }
3756     }
3757 }
3758 
TestLineBreaks(void)3759 void RBBITest::TestLineBreaks(void)
3760 {
3761 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3762     Locale        locale("en");
3763     UErrorCode    status = U_ZERO_ERROR;
3764     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3765     const int32_t  STRSIZE = 50;
3766     UChar         str[STRSIZE];
3767     static const char *strlist[] =
3768     {
3769      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3770      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3771              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3772      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3773              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3774      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3775      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3776      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3777      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3778      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3779      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3780      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3781      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3782      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3783      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3784      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3785      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3786      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3787      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3788      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3789      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3790      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3791      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3792      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3793      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3794      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3795      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3796      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3797      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3798      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3799      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3800      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3801      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3802      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3803      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3804      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3805      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3806      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3807      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3808          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3809     };
3810     int loop;
3811     TEST_ASSERT_SUCCESS(status);
3812     if (U_FAILURE(status)) {
3813         return;
3814     }
3815     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3816         // printf("looping %d\n", loop);
3817         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3818         if (t >= STRSIZE) {
3819             TEST_ASSERT(FALSE);
3820             continue;
3821         }
3822 
3823 
3824         UnicodeString ustr(str);
3825         RBBILineMonkey monkey;
3826         if (U_FAILURE(monkey.deferredStatus)) {
3827             continue;
3828         }
3829 
3830         const int EXPECTEDSIZE = 50;
3831         int expected[EXPECTEDSIZE];
3832         int expectedcount = 0;
3833 
3834         monkey.setText(ustr);
3835 
3836         int i;
3837         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3838             if (expectedcount >= EXPECTEDSIZE) {
3839                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3840                 return;
3841             }
3842             expected[expectedcount ++] = i;
3843         }
3844 
3845         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3846     }
3847     delete bi;
3848 #endif
3849 }
3850 
TestSentBreaks(void)3851 void RBBITest::TestSentBreaks(void)
3852 {
3853 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3854     Locale        locale("en");
3855     UErrorCode    status = U_ZERO_ERROR;
3856     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3857     UChar         str[200];
3858     static const char *strlist[] =
3859     {
3860      "Now\ris\nthe\r\ntime\n\rfor\r\r",
3861      "This\n",
3862      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3863      "\"Sentence ending with a quote.\" Bye.",
3864      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
3865      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3866      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3867      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3868      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3869      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3870      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3871              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3872              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3873              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3874      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3875              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3876              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3877              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3878              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3879              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3880     };
3881     int loop;
3882     if (U_FAILURE(status)) {
3883         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3884         return;
3885     }
3886     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3887         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3888         UnicodeString ustr(str);
3889 
3890         RBBISentMonkey monkey;
3891         if (U_FAILURE(monkey.deferredStatus)) {
3892             continue;
3893         }
3894 
3895         const int EXPECTEDSIZE = 50;
3896         int expected[EXPECTEDSIZE];
3897         int expectedcount = 0;
3898 
3899         monkey.setText(ustr);
3900 
3901         int i;
3902         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3903             if (expectedcount >= EXPECTEDSIZE) {
3904                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3905                 return;
3906             }
3907             expected[expectedcount ++] = i;
3908         }
3909 
3910         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3911     }
3912     delete bi;
3913 #endif
3914 }
3915 
TestMonkey()3916 void RBBITest::TestMonkey() {
3917 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3918 
3919     UErrorCode     status    = U_ZERO_ERROR;
3920     int32_t        loopCount = 500;
3921     int32_t        seed      = 1;
3922     UnicodeString  breakType = "all";
3923     Locale         locale("en");
3924     UBool          useUText  = FALSE;
3925 
3926     if (quick == FALSE) {
3927         loopCount = 10000;
3928     }
3929 
3930     if (fTestParams) {
3931         UnicodeString p(fTestParams);
3932         loopCount = getIntParam("loop", p, loopCount);
3933         seed      = getIntParam("seed", p, seed);
3934 
3935         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3936         if (m.find()) {
3937             breakType = m.group(1, status);
3938             m.reset();
3939             p = m.replaceFirst("", status);
3940         }
3941 
3942         RegexMatcher u(" *utext", p, 0, status);
3943         if (u.find()) {
3944             useUText = TRUE;
3945             u.reset();
3946             p = u.replaceFirst("", status);
3947         }
3948 
3949 
3950         // m.reset(p);
3951         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3952             // Each option is stripped out of the option string as it is processed.
3953             // All options have been checked.  The option string should have been completely emptied..
3954             char buf[100];
3955             p.extract(buf, sizeof(buf), NULL, status);
3956             buf[sizeof(buf)-1] = 0;
3957             errln("Unrecognized or extra parameter:  %s\n", buf);
3958             return;
3959         }
3960 
3961     }
3962 
3963     if (breakType == "char" || breakType == "all") {
3964         RBBICharMonkey  m;
3965         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3966         if (U_SUCCESS(status)) {
3967             RunMonkey(bi, m, "char", seed, loopCount, useUText);
3968             if (breakType == "all" && useUText==FALSE) {
3969                 // Also run a quick test with UText when "all" is specified
3970                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3971             }
3972         }
3973         else {
3974             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3975         }
3976         delete bi;
3977     }
3978 
3979     if (breakType == "word" || breakType == "all") {
3980         logln("Word Break Monkey Test");
3981         RBBIWordMonkey  m;
3982         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
3983         if (U_SUCCESS(status)) {
3984             RunMonkey(bi, m, "word", seed, loopCount, useUText);
3985         }
3986         else {
3987             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3988         }
3989         delete bi;
3990     }
3991 
3992     if (breakType == "line" || breakType == "all") {
3993         logln("Line Break Monkey Test");
3994         RBBILineMonkey  m;
3995         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
3996         if (loopCount >= 10) {
3997             loopCount = loopCount / 5;   // Line break runs slower than the others.
3998         }
3999         if (U_SUCCESS(status)) {
4000             RunMonkey(bi, m, "line", seed, loopCount, useUText);
4001         }
4002         else {
4003             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4004         }
4005         delete bi;
4006     }
4007 
4008     if (breakType == "sent" || breakType == "all"  ) {
4009         logln("Sentence Break Monkey Test");
4010         RBBISentMonkey  m;
4011         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
4012         if (loopCount >= 10) {
4013             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
4014         }
4015         if (U_SUCCESS(status)) {
4016             RunMonkey(bi, m, "sent", seed, loopCount, useUText);
4017         }
4018         else {
4019             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4020         }
4021         delete bi;
4022     }
4023 
4024 #endif
4025 }
4026 
4027 //
4028 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
4029 //    Parameters:
4030 //       bi      - the break iterator to use
4031 //       mk      - MonkeyKind, abstraction for obtaining expected results
4032 //       name    - Name of test (char, word, etc.) for use in error messages
4033 //       seed    - Seed for starting random number generator (parameter from user)
4034 //       numIterations
4035 //
RunMonkey(BreakIterator * bi,RBBIMonkeyKind & mk,const char * name,uint32_t seed,int32_t numIterations,UBool useUText)4036 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
4037                          int32_t numIterations, UBool useUText) {
4038 
4039 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4040 
4041     const int32_t    TESTSTRINGLEN = 500;
4042     UnicodeString    testText;
4043     int32_t          numCharClasses;
4044     UVector          *chClasses;
4045     int              expectedCount = 0;
4046     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
4047     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
4048     char             reverseBreaks[TESTSTRINGLEN*2+1];
4049     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
4050     char             followingBreaks[TESTSTRINGLEN*2+1];
4051     char             precedingBreaks[TESTSTRINGLEN*2+1];
4052     int              i;
4053     int              loopCount = 0;
4054 
4055 
4056     m_seed = seed;
4057 
4058     numCharClasses = mk.charClasses()->size();
4059     chClasses      = mk.charClasses();
4060 
4061     // Check for errors that occured during the construction of the MonkeyKind object.
4062     //  Can't report them where they occured because errln() is a method coming from intlTest,
4063     //  and is not visible outside of RBBITest :-(
4064     if (U_FAILURE(mk.deferredStatus)) {
4065         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4066         return;
4067     }
4068 
4069     // Verify that the character classes all have at least one member.
4070     for (i=0; i<numCharClasses; i++) {
4071         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4072         if (s == NULL || s->size() == 0) {
4073             errln("Character Class #%d is null or of zero size.", i);
4074             return;
4075         }
4076     }
4077 
4078     // For minimizing width of class name output.
4079     int classNameSize = mk.maxClassNameSize();
4080 
4081     while (loopCount < numIterations || numIterations == -1) {
4082         if (numIterations == -1 && loopCount % 10 == 0) {
4083             // If test is running in an infinite loop, display a periodic tic so
4084             //   we can tell that it is making progress.
4085             fprintf(stderr, ".");
4086         }
4087         // Save current random number seed, so that we can recreate the random numbers
4088         //   for this loop iteration in event of an error.
4089         seed = m_seed;
4090 
4091         // Populate a test string with data.
4092         testText.truncate(0);
4093         for (i=0; i<TESTSTRINGLEN; i++) {
4094             int32_t  aClassNum = m_rand() % numCharClasses;
4095             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4096             int32_t   charIdx = m_rand() % classSet->size();
4097             UChar32   c = classSet->charAt(charIdx);
4098             if (c < 0) {   // TODO:  deal with sets containing strings.
4099                 errln("%s:%d c < 0", __FILE__, __LINE__);
4100                 break;
4101             }
4102             // Do not assemble a supplementary character from randomly generated separate surrogates.
4103             //   (It could be a dictionary character)
4104             if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4105                 continue;
4106             }
4107 
4108             testText.append(c);
4109         }
4110 
4111         // Calculate the expected results for this test string and reset applied rules.
4112         mk.setText(testText);
4113 
4114         memset(expectedBreaks, 0, sizeof(expectedBreaks));
4115         expectedBreaks[0] = 1;
4116         int32_t breakPos = 0;
4117         expectedCount = 0;
4118         for (;;) {
4119             breakPos = mk.next(breakPos);
4120             if (breakPos == -1) {
4121                 break;
4122             }
4123             if (breakPos > testText.length()) {
4124                 errln("breakPos > testText.length()");
4125             }
4126             expectedBreaks[breakPos] = 1;
4127             U_ASSERT(expectedCount<testText.length());
4128         }
4129 
4130         // Find the break positions using forward iteration
4131         memset(forwardBreaks, 0, sizeof(forwardBreaks));
4132         if (useUText) {
4133             UErrorCode status = U_ZERO_ERROR;
4134             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4135             // testUText = utext_openUnicodeString(testUText, &testText, &status);
4136             bi->setText(testUText, status);
4137             TEST_ASSERT_SUCCESS(status);
4138             utext_close(testUText);   // The break iterator does a shallow clone of the UText
4139                                       //  This UText can be closed immediately, so long as the
4140                                       //  testText string continues to exist.
4141         } else {
4142             bi->setText(testText);
4143         }
4144 
4145         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4146             if (i < 0 || i > testText.length()) {
4147                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4148                 break;
4149             }
4150             forwardBreaks[i] = 1;
4151         }
4152 
4153         // Find the break positions using reverse iteration
4154         memset(reverseBreaks, 0, sizeof(reverseBreaks));
4155         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4156             if (i < 0 || i > testText.length()) {
4157                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4158                 break;
4159             }
4160             reverseBreaks[i] = 1;
4161         }
4162 
4163         // Find the break positions using isBoundary() tests.
4164         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4165         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4166         for (i=0; i<=testText.length(); i++) {
4167             isBoundaryBreaks[i] = bi->isBoundary(i);
4168         }
4169 
4170 
4171         // Find the break positions using the following() function.
4172         // printf(".");
4173         memset(followingBreaks, 0, sizeof(followingBreaks));
4174         int32_t   lastBreakPos = 0;
4175         followingBreaks[0] = 1;
4176         for (i=0; i<testText.length(); i++) {
4177             breakPos = bi->following(i);
4178             if (breakPos <= i ||
4179                 breakPos < lastBreakPos ||
4180                 breakPos > testText.length() ||
4181                 (breakPos > lastBreakPos && lastBreakPos > i)) {
4182                 errln("%s break monkey test: "
4183                     "Out of range value returned by BreakIterator::following().\n"
4184                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4185                          name, seed, i, breakPos, lastBreakPos);
4186                 break;
4187             }
4188             followingBreaks[breakPos] = 1;
4189             lastBreakPos = breakPos;
4190         }
4191 
4192         // Find the break positions using the preceding() function.
4193         memset(precedingBreaks, 0, sizeof(precedingBreaks));
4194         lastBreakPos = testText.length();
4195         precedingBreaks[testText.length()] = 1;
4196         for (i=testText.length(); i>0; i--) {
4197             breakPos = bi->preceding(i);
4198             if (breakPos >= i ||
4199                 breakPos > lastBreakPos ||
4200                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4201                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4202                 errln("%s break monkey test: "
4203                     "Out of range value returned by BreakIterator::preceding().\n"
4204                     "index=%d;  prev returned %d; lastBreak=%d" ,
4205                     name,  i, breakPos, lastBreakPos);
4206                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4207                     precedingBreaks[i] = 2;   // Forces an error.
4208                 }
4209             } else {
4210                 if (breakPos >= 0) {
4211                     precedingBreaks[breakPos] = 1;
4212                 }
4213                 lastBreakPos = breakPos;
4214             }
4215         }
4216 
4217         // Compare the expected and actual results.
4218         for (i=0; i<=testText.length(); i++) {
4219             const char *errorType = NULL;
4220             const char* currentBreakData = NULL;
4221             if  (forwardBreaks[i] != expectedBreaks[i]) {
4222                 errorType = "next()";
4223                 currentBreakData = forwardBreaks;
4224             } else if (reverseBreaks[i] != forwardBreaks[i]) {
4225                 errorType = "previous()";
4226                 currentBreakData = reverseBreaks;
4227            } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4228                 errorType = "isBoundary()";
4229                 currentBreakData = isBoundaryBreaks;
4230             } else if (followingBreaks[i] != expectedBreaks[i]) {
4231                 errorType = "following()";
4232                 currentBreakData = followingBreaks;
4233             } else if (precedingBreaks[i] != expectedBreaks[i]) {
4234                 errorType = "preceding()";
4235                 currentBreakData = precedingBreaks;
4236             }
4237 
4238             if (errorType != NULL) {
4239                 // Format a range of the test text that includes the failure as
4240                 //  a data item that can be included in the rbbi test data file.
4241 
4242                 // Start of the range is the last point where expected and actual results
4243                 //  both agreed that there was a break position.
4244 
4245                 int startContext = i;
4246                 int32_t count = 0;
4247                 for (;;) {
4248                     if (startContext==0) { break; }
4249                     startContext --;
4250                     if (expectedBreaks[startContext] != 0) {
4251                         if (count == 2) break;
4252                         count ++;
4253                     }
4254                 }
4255 
4256                 // End of range is two expected breaks past the start position.
4257                 int endContext = i + 1;
4258                 int ci;
4259                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4260                     for (;;) {
4261                         if (endContext >= testText.length()) {break;}
4262                         if (expectedBreaks[endContext-1] != 0) {
4263                             if (count == 0) break;
4264                             count --;
4265                         }
4266                         endContext ++;
4267                     }
4268                 }
4269 
4270                 // Formatting of each line includes:
4271                 //   character code
4272                 //   reference break: '|' -> a break, '.' -> no break
4273                 //   actual break:    '|' -> a break, '.' -> no break
4274                 //   (name of character clase)
4275                 //   Unicode name of character
4276                 //   '-->' indicates location of the difference.
4277 
4278                 MONKEY_ERROR(
4279                     (expectedBreaks[i] ? "Break expected but not found" :
4280                        "Break found but not expected"),
4281                     name, i, seed);
4282 
4283                 for (ci=startContext; (ci = testText.moveIndex32(ci, 1));) {
4284                     UChar32  c;
4285                     c = testText.char32At(ci);
4286 
4287                     std::string currentLineFlag = "   ";
4288                     if (ci == i) {
4289                         currentLineFlag = "-->";  // Error position
4290                     }
4291 
4292                     // BMP or SMP character in hex
4293                     char hexCodePoint[12];
4294                     std::string format = "    \\u%04x";
4295                     if (c >= 0x10000) {
4296                         format = "\\U%08x";
4297                     }
4298                     sprintf(hexCodePoint, format.c_str(), c);
4299 
4300                     // Get the class name and character name for the character.
4301                     char cName[200];
4302                     UErrorCode status = U_ZERO_ERROR;
4303                     u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
4304 
4305                     char buffer[200];
4306                     auto ret = snprintf(buffer, UPRV_LENGTHOF(buffer),
4307                              "%4s %3i :  %1s  %1s  %10s  %-*s  %-40s  %-40s",
4308                              currentLineFlag.c_str(),
4309                              ci,
4310                              expectedBreaks[ci] == 0 ? "." : "|",  // Reference break
4311                              currentBreakData[ci] == 0 ? "." : "|",  // Actual break
4312                              hexCodePoint,
4313                              classNameSize,
4314                              mk.classNameFromCodepoint(c).c_str(),
4315                              mk.getAppliedRule(ci).c_str(), cName);
4316                     (void)ret;
4317                     U_ASSERT(0 <= ret && ret < UPRV_LENGTHOF(buffer));
4318 
4319                     // Output the error
4320                     if (ci == i) {
4321                         errln(buffer);
4322                     } else {
4323                         infoln(buffer);
4324                     }
4325 
4326                     if (ci >= endContext) { break; }
4327                 }
4328                 break;
4329             }
4330         }
4331 
4332         loopCount++;
4333     }
4334 #endif
4335 }
4336 
4337 
4338 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
4339 //             This test checks the initial patch,
4340 //             which is to just keep it from crashing.  Correct word boundaries
4341 //             await a proper fix to the dictionary code.
4342 //
TestBug5532(void)4343 void RBBITest::TestBug5532(void)  {
4344    // Text includes a mixture of Thai and Latin.
4345    const unsigned char utf8Data[] = {
4346            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4347            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4348            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4349            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4350            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4351            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4352            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4353            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4354            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4355            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4356            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4357 
4358     UErrorCode status = U_ZERO_ERROR;
4359     UText utext=UTEXT_INITIALIZER;
4360     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4361     TEST_ASSERT_SUCCESS(status);
4362 
4363     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4364     TEST_ASSERT_SUCCESS(status);
4365     if (U_SUCCESS(status)) {
4366         bi->setText(&utext, status);
4367         TEST_ASSERT_SUCCESS(status);
4368 
4369         int32_t breakCount = 0;
4370         int32_t previousBreak = -1;
4371         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4372             // For now, just make sure that the break iterator doesn't hang.
4373             TEST_ASSERT(previousBreak < bi->current());
4374             previousBreak = bi->current();
4375         }
4376         TEST_ASSERT(breakCount > 0);
4377     }
4378     delete bi;
4379     utext_close(&utext);
4380 }
4381 
4382 
TestBug9983(void)4383 void RBBITest::TestBug9983(void)  {
4384     UnicodeString text = UnicodeString("\\u002A"  // * Other
4385                                        "\\uFF65"  //   Other
4386                                        "\\u309C"  //   Katakana
4387                                        "\\uFF9F"  //   Extend
4388                                        "\\uFF65"  //   Other
4389                                        "\\u0020"  //   Other
4390                                        "\\u0000").unescape();
4391 
4392     UErrorCode status = U_ZERO_ERROR;
4393     LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4394         BreakIterator::createWordInstance(Locale::getRoot(), status)));
4395     TEST_ASSERT_SUCCESS(status);
4396     LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4397         BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4398     TEST_ASSERT_SUCCESS(status);
4399     if (U_FAILURE(status)) {
4400         return;
4401     }
4402     int32_t offset, rstatus, iterationCount;
4403 
4404     brkiter->setText(text);
4405     brkiter->last();
4406     iterationCount = 0;
4407     while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4408         iterationCount++;
4409         rstatus = brkiter->getRuleStatus();
4410         (void)rstatus;     // Suppress set but not used warning.
4411         if (iterationCount >= 10) {
4412            break;
4413         }
4414     }
4415     TEST_ASSERT(iterationCount == 6);
4416 
4417     brkiterPOSIX->setText(text);
4418     brkiterPOSIX->last();
4419     iterationCount = 0;
4420     while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4421         iterationCount++;
4422         rstatus = brkiterPOSIX->getRuleStatus();
4423         (void)rstatus;     // Suppress set but not used warning.
4424         if (iterationCount >= 10) {
4425            break;
4426         }
4427     }
4428     TEST_ASSERT(iterationCount == 6);
4429 }
4430 
4431 // Bug 7547 - verify that building a break itereator from empty rules produces an error.
4432 //
TestBug7547()4433 void RBBITest::TestBug7547() {
4434     UnicodeString rules;
4435     UErrorCode status = U_ZERO_ERROR;
4436     UParseError parseError;
4437     RuleBasedBreakIterator breakIterator(rules, parseError, status);
4438     if (status != U_BRK_RULE_SYNTAX) {
4439         errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4440     }
4441     if (parseError.line != 1 || parseError.offset != 0) {
4442         errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4443     }
4444 }
4445 
4446 
TestBug12797()4447 void RBBITest::TestBug12797() {
4448     UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4449     UErrorCode status = U_ZERO_ERROR;
4450     UParseError parseError;
4451     RuleBasedBreakIterator bi(rules, parseError, status);
4452     if (U_FAILURE(status)) {
4453         errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4454         return;
4455     }
4456     UnicodeString text = "abc";
4457     bi.setText(text);
4458     bi.first();
4459     int32_t boundary = bi.next();
4460     if (boundary != 3) {
4461         errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4462     }
4463 }
4464 
TestBug12918()4465 void RBBITest::TestBug12918() {
4466     // This test triggers an assertion failure in dictbe.cpp
4467     const UChar *crasherString = u"\u3325\u4a16";
4468     UErrorCode status = U_ZERO_ERROR;
4469     UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
4470     if (U_FAILURE(status)) {
4471         dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4472         return;
4473     }
4474     ubrk_first(iter);
4475     int32_t pos = 0;
4476     int32_t lastPos = -1;
4477     while((pos = ubrk_next(iter)) != UBRK_DONE) {
4478         if (pos <= lastPos) {
4479             errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4480             break;
4481         }
4482     }
4483     ubrk_close(iter);
4484 }
4485 
TestBug12932()4486 void RBBITest::TestBug12932() {
4487     // Node Stack overflow in the RBBI rule parser caused a seg fault.
4488     UnicodeString ruleStr(
4489             "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4490             "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4491             "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4492             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4493             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4494             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4495 
4496     UErrorCode status = U_ZERO_ERROR;
4497     UParseError parseError;
4498     RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
4499     if (status != U_BRK_RULE_SYNTAX) {
4500         errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4501                 __FILE__, __LINE__, u_errorName(status));
4502     }
4503 }
4504 
4505 
4506 // Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
4507 //             remain undevided by ICU char, word and line break.
TestEmoji()4508 void RBBITest::TestEmoji() {
4509 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4510     UErrorCode  status = U_ZERO_ERROR;
4511 
4512     CharString testFileName;
4513     testFileName.append(IntlTest::getSourceTestData(status), status);
4514     testFileName.appendPathPart("emoji-test.txt", status);
4515     if (U_FAILURE(status)) {
4516         errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4517         return;
4518     }
4519     logln("Opening data file %s\n", testFileName.data());
4520 
4521     int    len;
4522     UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
4523     if (U_FAILURE(status) || testFile == NULL) {
4524         errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4525         return;
4526     }
4527     UnicodeString testFileAsString(testFile, len);
4528     delete [] testFile;
4529 
4530     RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
4531     RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
4532     //           hexMatcher group(1) is a hex number, or empty string if no hex number present.
4533     int32_t lineNumber = 0;
4534 
4535     LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4536     LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4537     LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
4538     if (U_FAILURE(status)) {
4539         dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
4540         return;
4541     }
4542 
4543     while (lineMatcher.find()) {
4544         ++lineNumber;
4545         UnicodeString line = lineMatcher.group(status);
4546         hexMatcher.reset(line);
4547         UnicodeString testString;   // accumulates the emoji sequence.
4548         while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
4549             UnicodeString hex = hexMatcher.group(1, status);
4550             if (hex.length() > 8) {
4551                 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
4552                 break;
4553             }
4554             CharString hex8;
4555             hex8.appendInvariantChars(hex, status);
4556             UChar32 c = (UChar32)strtol(hex8.data(), NULL, 16);
4557             if (c<=0x10ffff) {
4558                 testString.append(c);
4559             } else {
4560                 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4561                         __FILE__, __LINE__, lineNumber, hex8.data());
4562                 break;
4563             }
4564         }
4565 
4566         if (testString.length() > 1) {
4567             charBreaks->setText(testString);
4568             charBreaks->first();
4569             int32_t firstBreak = charBreaks->next();
4570             if (testString.length() != firstBreak) {
4571                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4572                         __FILE__, __LINE__, lineNumber, firstBreak);
4573             }
4574             wordBreaks->setText(testString);
4575             wordBreaks->first();
4576             firstBreak = wordBreaks->next();
4577             if (testString.length() != firstBreak) {
4578                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4579                         __FILE__, __LINE__, lineNumber, firstBreak);
4580             }
4581             lineBreaks->setText(testString);
4582             lineBreaks->first();
4583             firstBreak = lineBreaks->next();
4584             if (testString.length() != firstBreak) {
4585                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4586                         __FILE__, __LINE__, lineNumber, firstBreak);
4587             }
4588         }
4589     }
4590 #endif
4591 }
4592 
4593 
4594 // TestBug12519  -  Correct handling of Locales by assignment / copy / clone
4595 
TestBug12519()4596 void RBBITest::TestBug12519() {
4597     UErrorCode status = U_ZERO_ERROR;
4598     LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4599     LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status));
4600     if (!assertSuccess(WHERE, status)) {
4601         dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
4602         return;
4603     }
4604     assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
4605 
4606     assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
4607     assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
4608 
4609     LocalPointer<RuleBasedBreakIterator>cloneEn(biEn->clone());
4610     assertTrue(WHERE, *biEn == *cloneEn);
4611     assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
4612 
4613     LocalPointer<RuleBasedBreakIterator>cloneFr(biFr->clone());
4614     assertTrue(WHERE, *biFr == *cloneFr);
4615     assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
4616 
4617     LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status));
4618     UnicodeString text("Hallo Welt");
4619     biDe->setText(text);
4620     assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
4621     *biDe = *biFr;
4622     assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
4623 }
4624 
TestBug12677()4625 void RBBITest::TestBug12677() {
4626     // Check that stripping of comments from rules for getRules() is not confused by
4627     // the presence of '#' characters in the rules that do not introduce comments.
4628     UnicodeString rules(u"!!forward; \n"
4629                          "$x = [ab#];  # a set with a # literal. \n"
4630                          " # .;        # a comment that looks sort of like a rule.   \n"
4631                          " '#' '?';    # a rule with a quoted #   \n"
4632                        );
4633 
4634     UErrorCode status = U_ZERO_ERROR;
4635     UParseError pe;
4636     RuleBasedBreakIterator bi(rules, pe, status);
4637     assertSuccess(WHERE, status);
4638     UnicodeString rtRules = bi.getRules();
4639     assertEquals(WHERE, UnicodeString(u"!!forward;$x=[ab#];'#''?';"),  rtRules);
4640 }
4641 
4642 
TestTableRedundancies()4643 void RBBITest::TestTableRedundancies() {
4644     UErrorCode status = U_ZERO_ERROR;
4645 
4646     LocalPointer<RuleBasedBreakIterator> bi (
4647         (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
4648     assertSuccess(WHERE, status);
4649     if (U_FAILURE(status)) return;
4650 
4651     RBBIDataWrapper *dw = bi->fData;
4652     const RBBIStateTable *fwtbl = dw->fForwardTable;
4653     UBool in8Bits = fwtbl->fFlags & RBBI_8BITS_ROWS;
4654     int32_t numCharClasses = dw->fHeader->fCatCount;
4655     // printf("Char Classes: %d     states: %d\n", numCharClasses, fwtbl->fNumStates);
4656 
4657     // Check for duplicate columns (character categories)
4658 
4659     std::vector<UnicodeString> columns;
4660     for (int32_t column = 0; column < numCharClasses; column++) {
4661         UnicodeString s;
4662         for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
4663             RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4664             s.append(in8Bits ? row->r8.fNextState[column] : row->r16.fNextState[column]);
4665         }
4666         columns.push_back(s);
4667     }
4668     // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
4669     for (int c1=1; c1<numCharClasses; c1++) {
4670         int limit = c1 < (int)fwtbl->fDictCategoriesStart ? fwtbl->fDictCategoriesStart : numCharClasses;
4671         for (int c2 = c1+1; c2 < limit; c2++) {
4672             if (columns.at(c1) == columns.at(c2)) {
4673                 errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
4674                 goto out;
4675             }
4676         }
4677     }
4678   out:
4679 
4680     // Check for duplicate states
4681     std::vector<UnicodeString> rows;
4682     for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
4683         UnicodeString s;
4684         RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4685         if (in8Bits) {
4686             s.append(row->r8.fAccepting);
4687             s.append(row->r8.fLookAhead);
4688             s.append(row->r8.fTagsIdx);
4689             for (int32_t column = 0; column < numCharClasses; column++) {
4690                 s.append(row->r8.fNextState[column]);
4691             }
4692         } else {
4693             s.append(row->r16.fAccepting);
4694             s.append(row->r16.fLookAhead);
4695             s.append(row->r16.fTagsIdx);
4696             for (int32_t column = 0; column < numCharClasses; column++) {
4697                 s.append(row->r16.fNextState[column]);
4698             }
4699         }
4700         rows.push_back(s);
4701     }
4702     for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
4703         for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
4704             if (rows.at(r1) == rows.at(r2)) {
4705                 errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
4706                 return;
4707             }
4708         }
4709     }
4710 }
4711 
4712 // Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
4713 //            even after next() has returned DONE.
4714 
TestBug13447()4715 void RBBITest::TestBug13447() {
4716     UErrorCode status = U_ZERO_ERROR;
4717     LocalPointer<RuleBasedBreakIterator> bi(
4718         (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4719     assertSuccess(WHERE, status);
4720     if (U_FAILURE(status)) return;
4721     UnicodeString data(u"1234");
4722     bi->setText(data);
4723     assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus());
4724     assertEquals(WHERE, 4, bi->next());
4725     assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4726     assertEquals(WHERE, UBRK_DONE, bi->next());
4727     assertEquals(WHERE, 4, bi->current());
4728     assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4729 }
4730 
4731 //  TestReverse exercises both the synthesized safe reverse rules and the logic
4732 //  for filling the break iterator cache when starting from random positions
4733 //  in the text.
4734 //
4735 //  It's a monkey test, working on random data, with the expected data obtained
4736 //  from forward iteration (no safe rules involved), comparing with results
4737 //  when indexing into the interior of the string (safe rules needed).
4738 
TestReverse()4739 void RBBITest::TestReverse() {
4740     UErrorCode status = U_ZERO_ERROR;
4741 
4742     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4743             BreakIterator::createCharacterInstance(Locale::getEnglish(), status)));
4744     assertSuccess(WHERE, status, true);
4745     status = U_ZERO_ERROR;
4746     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4747             BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4748     assertSuccess(WHERE, status, true);
4749     status = U_ZERO_ERROR;
4750     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4751             BreakIterator::createLineInstance(Locale::getEnglish(), status)));
4752     assertSuccess(WHERE, status, true);
4753     status = U_ZERO_ERROR;
4754     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4755             BreakIterator::createSentenceInstance(Locale::getEnglish(), status)));
4756     assertSuccess(WHERE, status, true);
4757 }
4758 
TestReverse(std::unique_ptr<RuleBasedBreakIterator> bi)4759 void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
4760     if (!bi) {
4761         return;
4762     }
4763 
4764     // From the mapping trie in the break iterator's internal data, create a
4765     // vector of UnicodeStrings, one for each character category, containing
4766     // all of the code points that map to that category. Unicode planes 0 and 1 only,
4767     // to avoid an execess of unassigned code points.
4768 
4769     RBBIDataWrapper *data = bi->fData;
4770     int32_t categoryCount = data->fHeader->fCatCount;
4771     UCPTrie *trie = data->fTrie;
4772     bool use8BitsTrie = ucptrie_getValueWidth(trie) == UCPTRIE_VALUE_BITS_8;
4773     uint32_t dictBit = use8BitsTrie ? 0x0080 : 0x4000;
4774 
4775     std::vector<UnicodeString> strings(categoryCount, UnicodeString());
4776     for (int cp=0; cp<0x1fff0; ++cp) {
4777         int cat = ucptrie_get(trie, cp);
4778         cat &= ~dictBit;    // And off the dictionary bit from the category.
4779         assertTrue(WHERE, cat < categoryCount && cat >= 0);
4780         if (cat < 0 || cat >= categoryCount) return;
4781         strings[cat].append(cp);
4782     }
4783 
4784     icu_rand randomGen;
4785     const int testStringLength = 10000;
4786     UnicodeString testString;
4787 
4788     for (int i=0; i<testStringLength; ++i) {
4789         int charClass = randomGen() % categoryCount;
4790         if (strings[charClass].length() > 0) {
4791             int cp = strings[charClass].char32At(randomGen() % strings[charClass].length());
4792             testString.append(cp);
4793         }
4794     }
4795 
4796     typedef std::pair<UBool, int32_t> Result;
4797     std::vector<Result> expectedResults;
4798     bi->setText(testString);
4799     for (int i=0; i<testString.length(); ++i) {
4800         bool isboundary = bi->isBoundary(i);
4801         int  ruleStatus = bi->getRuleStatus();
4802         expectedResults.push_back(std::make_pair(isboundary, ruleStatus));
4803     }
4804 
4805     for (int i=testString.length()-1; i>=0; --i) {
4806         bi->setText(testString);   // clears the internal break cache
4807         Result expected = expectedResults[i];
4808         assertEquals(WHERE, expected.first, bi->isBoundary(i));
4809         assertEquals(WHERE, expected.second, bi->getRuleStatus());
4810     }
4811 }
4812 
4813 
4814 // Ticket 13692 - finding word boundaries in very large numbers or words could
4815 //                be very time consuming. When the problem was present, this void test
4816 //                would run more than fifteen minutes, which is to say, the failure was noticeale.
4817 
TestBug13692()4818 void RBBITest::TestBug13692() {
4819     UErrorCode status = U_ZERO_ERROR;
4820     LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4821             BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4822     if (!assertSuccess(WHERE, status, true)) {
4823         return;
4824     }
4825     constexpr int32_t LENGTH = 1000000;
4826     UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH);
4827     for (int i=0; i<20; i+=2) {
4828         longNumber.setCharAt(i, u' ');
4829     }
4830     bi->setText(longNumber);
4831     assertFalse(WHERE, bi->isBoundary(LENGTH-5));
4832     assertSuccess(WHERE, status);
4833 }
4834 
4835 
TestProperties()4836 void RBBITest::TestProperties() {
4837     UErrorCode errorCode = U_ZERO_ERROR;
4838     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4839     if (!prependSet.isEmpty()) {
4840         errln(
4841             "[:GCB=Prepend:] is not empty any more. "
4842             "Uncomment relevant lines in source/data/brkitr/char.txt and "
4843             "change this test to the opposite condition.");
4844     }
4845 }
4846 
4847 
4848 //
4849 //  TestDebug    -  A place-holder test for debugging purposes.
4850 //                  For putting in fragments of other tests that can be invoked
4851 //                  for tracing  without a lot of unwanted extra stuff happening.
4852 //
TestDebug(void)4853 void RBBITest::TestDebug(void) {
4854     UErrorCode status = U_ZERO_ERROR;
4855     LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4856             BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4857     if (!assertSuccess(WHERE, status, true)) {
4858         return;
4859     }
4860     const UnicodeString &rules = bi->getRules();
4861     UParseError pe;
4862     LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
4863     assertSuccess(WHERE, status);
4864 }
4865 
4866 
4867 //
4868 //  TestDebugRules   A stub test for use in debugging rule compilation problems.
4869 //                   Can be freely altered as needed or convenient.
4870 //                   Leave disabled - #ifdef'ed out - when not activley debugging. The rule source
4871 //                   data files may not be available in all environments.
4872 //                   Any permanent test cases should be moved to rbbitst.txt
4873 //                   (see Bug 20303 in that file, for example), or to another test function in this file.
4874 //
TestDebugRules()4875 void RBBITest::TestDebugRules() {
4876 #if 0
4877     const char16_t *rules = u""
4878         "!!quoted_literals_only; \n"
4879         "!!chain; \n"
4880         "!!lookAheadHardBreak; \n"
4881         " \n"
4882         // "[a] / ; \n"
4883         "[a] [b] / [c] [d]; \n"
4884         "[a] [b] / [c] [d] {100}; \n"
4885         "[x] [a] [b] / [c] [d] {100}; \n"
4886         "[a] [b] [c] / [d] {100}; \n"
4887         //" [c] [d] / [e] [f]; \n"
4888         //"[a] [b] / [c]; \n"
4889         ;
4890 
4891     UErrorCode status = U_ZERO_ERROR;
4892     CharString path(pathToDataDirectory(), status);
4893     path.appendPathPart("brkitr", status);
4894     path.appendPathPart("rules", status);
4895     path.appendPathPart("line.txt", status);
4896     int    len;
4897     std::unique_ptr<UChar []> testFile(ReadAndConvertFile(path.data(), len, "UTF-8", status));
4898     if (!assertSuccess(WHERE, status)) {
4899         return;
4900     }
4901 
4902     UParseError pe;
4903     // rules = testFile.get();
4904     RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rules, pe, status);
4905 
4906     if (!assertSuccess(WHERE, status)) {
4907         delete bi;
4908         return;
4909     }
4910     // bi->dumpTables();
4911 
4912     delete bi;
4913 #endif
4914 }
4915 
testTrieStateTable(int32_t numChar,bool expectedTrieWidthIn8Bits,bool expectedStateRowIn8Bits)4916 void RBBITest::testTrieStateTable(int32_t numChar, bool expectedTrieWidthIn8Bits, bool expectedStateRowIn8Bits) {
4917     UCPTrieValueWidth expectedTrieWidth = expectedTrieWidthIn8Bits ? UCPTRIE_VALUE_BITS_8 : UCPTRIE_VALUE_BITS_16;
4918     int32_t expectedStateRowBits = expectedStateRowIn8Bits ? RBBI_8BITS_ROWS : 0;
4919     // Text are duplicate characters from U+4E00 to U+4FFF
4920     UnicodeString text;
4921     for (UChar c = 0x4e00; c < 0x5000; c++) {
4922         text.append(c).append(c);
4923     }
4924     // Generate rule which will caused length+4 character classes and
4925     // length+3 states
4926     UnicodeString rules(u"!!quoted_literals_only;");
4927     for (UChar c = 0x4e00; c < 0x4e00 + numChar; c++) {
4928         rules.append(u'\'').append(c).append(c).append(u"';");
4929     }
4930     rules.append(u".;");
4931     UErrorCode status = U_ZERO_ERROR;
4932     UParseError parseError;
4933     RuleBasedBreakIterator bi(rules, parseError, status);
4934 
4935     assertEquals(WHERE, numChar + 4, bi.fData->fHeader->fCatCount);
4936     assertEquals(WHERE, numChar + 3, bi.fData->fForwardTable->fNumStates);
4937     assertEquals(WHERE, expectedTrieWidth, ucptrie_getValueWidth(bi.fData->fTrie));
4938     assertEquals(WHERE, expectedStateRowBits, bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS);
4939     assertEquals(WHERE, expectedStateRowBits, bi.fData->fReverseTable->fFlags & RBBI_8BITS_ROWS);
4940 
4941     bi.setText(text);
4942 
4943     int32_t pos;
4944     int32_t i = 0;
4945     while ((pos = bi.next()) > 0) {
4946         // The first numChar should not break between the pair
4947         if (i++ < numChar) {
4948             assertEquals(WHERE, i * 2, pos);
4949         } else {
4950             // After the first numChar next(), break on each character.
4951             assertEquals(WHERE, i + numChar, pos);
4952         }
4953     }
4954     while ((pos = bi.previous()) > 0) {
4955         // The first numChar should not break between the pair
4956         if (--i < numChar) {
4957             assertEquals(WHERE, i * 2, pos);
4958         } else {
4959             // After the first numChar next(), break on each character.
4960             assertEquals(WHERE, i + numChar, pos);
4961         }
4962     }
4963 }
4964 
Test8BitsTrieWith8BitStateTable()4965 void RBBITest::Test8BitsTrieWith8BitStateTable() {
4966     testTrieStateTable(251, true /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
4967 }
4968 
Test16BitsTrieWith8BitStateTable()4969 void RBBITest::Test16BitsTrieWith8BitStateTable() {
4970     testTrieStateTable(252, false /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
4971 }
4972 
Test16BitsTrieWith16BitStateTable()4973 void RBBITest::Test16BitsTrieWith16BitStateTable() {
4974     testTrieStateTable(253, false /* expectedTrieWidthIn8Bits */, false /* expectedStateRowIn8Bits */);
4975 }
4976 
Test8BitsTrieWith16BitStateTable()4977 void RBBITest::Test8BitsTrieWith16BitStateTable() {
4978     // Test UCPTRIE_VALUE_BITS_8 with 16 bits rows. Use a different approach to
4979     // create state table in 16 bits.
4980 
4981     // Generate 510 'a' as text
4982     UnicodeString text;
4983     for (int32_t i = 0; i < 510; i++) {
4984         text.append(u'a');
4985     }
4986 
4987     UnicodeString rules(u"!!quoted_literals_only;'");
4988     // 254 'a' in the rule will cause 256 states
4989     for (int32_t i = 0; i < 254; i++) {
4990         rules.append(u'a');
4991     }
4992     rules.append(u"';.;");
4993 
4994     UErrorCode status = U_ZERO_ERROR;
4995     UParseError parseError;
4996     LocalPointer<RuleBasedBreakIterator> bi(new RuleBasedBreakIterator(rules, parseError, status));
4997 
4998     assertEquals(WHERE, 256, bi->fData->fForwardTable->fNumStates);
4999     assertEquals(WHERE, UCPTRIE_VALUE_BITS_8, ucptrie_getValueWidth(bi->fData->fTrie));
5000     assertEquals(WHERE,
5001                  false, RBBI_8BITS_ROWS == (bi->fData->fForwardTable->fFlags & RBBI_8BITS_ROWS));
5002     bi->setText(text);
5003 
5004     // break positions:
5005     // 254, 508, 509, ... 510
5006     assertEquals("next()", 254, bi->next());
5007     int32_t i = 0;
5008     int32_t pos;
5009     while ((pos = bi->next()) > 0) {
5010         assertEquals(WHERE, 508 + i , pos);
5011         i++;
5012     }
5013     i = 0;
5014     while ((pos = bi->previous()) > 0) {
5015         i++;
5016         if (pos >= 508) {
5017             assertEquals(WHERE, 510 - i , pos);
5018         } else {
5019             assertEquals(WHERE, 254 , pos);
5020         }
5021     }
5022 }
5023 
5024 // Test that both compact (8 bit) and full sized (16 bit) rbbi tables work, and
5025 // that there are no problems with rules at the size that transitions between the two.
5026 //
5027 // A rule that matches a literal string, like 'abcdefghij', will require one state and
5028 // one character class per character in the string. So we can make a rule to tickle the
5029 // boundaries by using literal strings of various lengths.
5030 //
5031 // For both the number of states and the number of character classes, the eight bit format
5032 // only has 7 bits available, allowing for 128 values. For both, a few values are reserved,
5033 // leaving 120 something available. This test runs the string over the range of 120 - 130,
5034 // which allows some margin for changes to the number of values reserved by the rule builder
5035 // without breaking the test.
5036 
TestTable_8_16_Bits()5037 void RBBITest::TestTable_8_16_Bits() {
5038 
5039     // testStr serves as both the source of the rule string (truncated to the desired length)
5040     // and as test data to check matching behavior. A break rule consisting of the first 120
5041     // characters of testStr will match the first 120 chars of the full-length testStr.
5042     UnicodeString testStr;
5043     for (UChar c=0x3000; c<0x3200; ++c) {
5044         testStr.append(c);
5045     }
5046 
5047     const int32_t startLength = 120;   // The shortest rule string to test.
5048     const int32_t endLength = 260;     // The longest rule string to test
5049     const int32_t increment = this->quick ? endLength - startLength : 1;
5050 
5051     for (int32_t ruleLen=startLength; ruleLen <= endLength; ruleLen += increment) {
5052         UParseError parseError;
5053         UErrorCode status = U_ZERO_ERROR;
5054 
5055         UnicodeString ruleString{u"!!quoted_literals_only; '#';"};
5056         ruleString.findAndReplace(UnicodeString(u"#"), UnicodeString(testStr, 0, ruleLen));
5057         RuleBasedBreakIterator bi(ruleString, parseError, status);
5058         if (!assertSuccess(WHERE, status)) {
5059             errln(ruleString);
5060             break;
5061         }
5062         // bi.dumpTables();
5063 
5064         // Verify that the break iterator is functioning - that the first boundary found
5065         // in testStr is at the length of the rule string.
5066         bi.setText(testStr);
5067         assertEquals(WHERE, ruleLen, bi.next());
5068 
5069         // Reverse iteration. Do a setText() first, to flush the break iterator's internal cache
5070         // of previously detected boundaries, thus forcing the engine to run the safe reverse rules.
5071         bi.setText(testStr);
5072         int32_t result = bi.preceding(ruleLen);
5073         assertEquals(WHERE, 0, result);
5074 
5075         // Verify that the range of rule lengths being tested cover the transations
5076         // from 8 to 16 bit data.
5077         bool has8BitRowData = bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS;
5078         bool has8BitsTrie = ucptrie_getValueWidth(bi.fData->fTrie) == UCPTRIE_VALUE_BITS_8;
5079 
5080         if (ruleLen == startLength) {
5081             assertEquals(WHERE, true, has8BitRowData);
5082             assertEquals(WHERE, true, has8BitsTrie);
5083         }
5084         if (ruleLen == endLength) {
5085             assertEquals(WHERE, false, has8BitRowData);
5086             assertEquals(WHERE, false, has8BitsTrie);
5087         }
5088     }
5089 }
5090 
5091 /* Test handling of a large number of look-ahead rules.
5092  * The number of rules in the test exceeds the implementation limits prior to the
5093  * improvements introduced with #13590.
5094  *
5095  * The test look-ahead rules have the form "AB / CE"; "CD / EG"; ...
5096  * The text being matched is sequential, "ABCDEFGHI..."
5097  *
5098  * The upshot is that the look-ahead rules all match on their preceding context,
5099  * and consequently must save a potential result, but then fail to match on their
5100  * trailing context, so that they don't actually cause a boundary.
5101  *
5102  * Additionally, add a ".*" rule, so there are no boundaries unless a
5103  * look-ahead hard-break rule forces one.
5104  */
TestBug13590()5105 void RBBITest::TestBug13590() {
5106     UnicodeString rules {u"!!quoted_literals_only; !!chain; .*;\n"};
5107 
5108     const int NUM_LOOKAHEAD_RULES = 50;
5109     const char16_t STARTING_CHAR = u'\u5000';
5110     char16_t firstChar;
5111     for (int ruleNum = 0; ruleNum < NUM_LOOKAHEAD_RULES; ++ruleNum) {
5112         firstChar = STARTING_CHAR + ruleNum*2;
5113         rules.append(u'\'') .append(firstChar) .append(firstChar+1) .append(u'\'')
5114              .append(u' ') .append(u'/') .append(u' ')
5115              .append(u'\'') .append(firstChar+2) .append(firstChar+4) .append(u'\'')
5116              .append(u';') .append(u'\n');
5117     }
5118 
5119     // Change the last rule added from the form "UV / WY" to "UV / WX".
5120     // Changes the rule so that it will match - all 4 chars are in ascending sequence.
5121     rules.findAndReplace(UnicodeString(firstChar+4), UnicodeString(firstChar+3));
5122 
5123     UErrorCode status = U_ZERO_ERROR;
5124     UParseError parseError;
5125     RuleBasedBreakIterator bi(rules, parseError, status);
5126     if (!assertSuccess(WHERE, status)) {
5127         errln(rules);
5128         return;
5129     }
5130     // bi.dumpTables();
5131 
5132     UnicodeString testString;
5133     for (char16_t c = STARTING_CHAR-200; c < STARTING_CHAR + NUM_LOOKAHEAD_RULES*4; ++c) {
5134         testString.append(c);
5135     }
5136     bi.setText(testString);
5137 
5138     int breaksFound = 0;
5139     while (bi.next() != UBRK_DONE) {
5140         ++breaksFound;
5141     }
5142 
5143     // Two matches are expected, one from the last rule that was explicitly modified,
5144     // and one at the end of the text.
5145     assertEquals(WHERE, 2, breaksFound);
5146 }
5147 
5148 
5149 #if U_ENABLE_TRACING
// Records captured by the utrace callbacks (traceData / traceEntry / traceExit)
// in this file. Only function numbers within
// [UTRACE_UBRK_START, UTRACE_UBRK_LIMIT] are recorded. SetupTestTrace() clears
// all four vectors.
//
// gData:    string argument taken from each recorded traceData call.
5150 static std::vector<std::string> gData;
// gEntryFn: function number of each recorded traceEntry call, in call order.
5151 static std::vector<int32_t> gEntryFn;
// gExitFn:  function number of each recorded traceExit call, in call order.
5152 static std::vector<int32_t> gExitFn;
// gDataFn:  function number of each recorded traceData call; pushed together
//           with the matching gData element, so the two stay the same length.
5153 static std::vector<int32_t> gDataFn;
5154 
traceData(const void *,int32_t fnNumber,int32_t,const char *,va_list args)5155 static void U_CALLCONV traceData(
5156         const void*,
5157         int32_t fnNumber,
5158         int32_t,
5159         const char *,
5160         va_list args) {
5161     if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5162         const char* data = va_arg(args, const char*);
5163         gDataFn.push_back(fnNumber);
5164         gData.push_back(data);
5165     }
5166 }
5167 
traceEntry(const void *,int32_t fnNumber)5168 static void traceEntry(const void *, int32_t fnNumber) {
5169     if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5170         gEntryFn.push_back(fnNumber);
5171     }
5172 }
5173 
traceExit(const void *,int32_t fnNumber,const char *,va_list)5174 static void traceExit(const void *, int32_t fnNumber, const char *, va_list) {
5175     if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5176         gExitFn.push_back(fnNumber);
5177     }
5178 }
5179 
5180 
assertTestTraceResult(int32_t fnNumber,const char * expectedData)5181 void RBBITest::assertTestTraceResult(int32_t fnNumber, const char* expectedData) {
5182     assertEquals("utrace_entry should be called ", 1, gEntryFn.size());
5183     assertEquals("utrace_entry should be called with ", fnNumber, gEntryFn[0]);
5184     assertEquals("utrace_exit should be called ", 1, gExitFn.size());
5185     assertEquals("utrace_exit should be called with ", fnNumber, gExitFn[0]);
5186 
5187     if (expectedData == nullptr) {
5188       assertEquals("utrace_data should not be called ", 0, gDataFn.size());
5189       assertEquals("utrace_data should not be called ", 0, gData.size());
5190     } else {
5191       assertEquals("utrace_data should be called ", 1, gDataFn.size());
5192       assertEquals("utrace_data should be called with ", fnNumber, gDataFn[0]);
5193       assertEquals("utrace_data should be called ", 1, gData.size());
5194       assertEquals("utrace_data should pass in ", expectedData, gData[0].c_str());
5195     }
5196 }
5197 
SetupTestTrace()5198 void SetupTestTrace() {
5199     gEntryFn.clear();
5200     gExitFn.clear();
5201     gDataFn.clear();
5202     gData.clear();
5203 
5204     const void* context = nullptr;
5205     utrace_setFunctions(context, traceEntry, traceExit, traceData);
5206     utrace_setLevel(UTRACE_INFO);
5207 }
5208 
TestTraceCreateCharacter(void)5209 void RBBITest::TestTraceCreateCharacter(void) {
5210     SetupTestTrace();
5211     IcuTestErrorCode status(*this, "TestTraceCreateCharacter");
5212     LocalPointer<BreakIterator> brkitr(
5213         BreakIterator::createCharacterInstance("zh-CN", status));
5214     status.errIfFailureAndReset();
5215     assertTestTraceResult(UTRACE_UBRK_CREATE_CHARACTER, nullptr);
5216 }
5217 
TestTraceCreateTitle(void)5218 void RBBITest::TestTraceCreateTitle(void) {
5219     SetupTestTrace();
5220     IcuTestErrorCode status(*this, "TestTraceCreateTitle");
5221     LocalPointer<BreakIterator> brkitr(
5222         BreakIterator::createTitleInstance("zh-CN", status));
5223     status.errIfFailureAndReset();
5224     assertTestTraceResult(UTRACE_UBRK_CREATE_TITLE, nullptr);
5225 }
5226 
TestTraceCreateSentence(void)5227 void RBBITest::TestTraceCreateSentence(void) {
5228     SetupTestTrace();
5229     IcuTestErrorCode status(*this, "TestTraceCreateSentence");
5230     LocalPointer<BreakIterator> brkitr(
5231         BreakIterator::createSentenceInstance("zh-CN", status));
5232     status.errIfFailureAndReset();
5233     assertTestTraceResult(UTRACE_UBRK_CREATE_SENTENCE, nullptr);
5234 }
5235 
TestTraceCreateWord(void)5236 void RBBITest::TestTraceCreateWord(void) {
5237     SetupTestTrace();
5238     IcuTestErrorCode status(*this, "TestTraceCreateWord");
5239     LocalPointer<BreakIterator> brkitr(
5240         BreakIterator::createWordInstance("zh-CN", status));
5241     status.errIfFailureAndReset();
5242     assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5243 }
5244 
TestTraceCreateLine(void)5245 void RBBITest::TestTraceCreateLine(void) {
5246     SetupTestTrace();
5247     IcuTestErrorCode status(*this, "TestTraceCreateLine");
5248     LocalPointer<BreakIterator> brkitr(
5249         BreakIterator::createLineInstance("zh-CN", status));
5250     status.errIfFailureAndReset();
5251     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "");
5252 }
5253 
TestTraceCreateLineStrict(void)5254 void RBBITest::TestTraceCreateLineStrict(void) {
5255     SetupTestTrace();
5256     IcuTestErrorCode status(*this, "TestTraceCreateLineStrict");
5257     LocalPointer<BreakIterator> brkitr(
5258         BreakIterator::createLineInstance("zh-CN-u-lb-strict", status));
5259     status.errIfFailureAndReset();
5260     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "strict");
5261 }
5262 
TestTraceCreateLineNormal(void)5263 void RBBITest::TestTraceCreateLineNormal(void) {
5264     SetupTestTrace();
5265     IcuTestErrorCode status(*this, "TestTraceCreateLineNormal");
5266     LocalPointer<BreakIterator> brkitr(
5267         BreakIterator::createLineInstance("zh-CN-u-lb-normal", status));
5268     status.errIfFailureAndReset();
5269     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "normal");
5270 }
5271 
TestTraceCreateLineLoose(void)5272 void RBBITest::TestTraceCreateLineLoose(void) {
5273     SetupTestTrace();
5274     IcuTestErrorCode status(*this, "TestTraceCreateLineLoose");
5275     LocalPointer<BreakIterator> brkitr(
5276         BreakIterator::createLineInstance("zh-CN-u-lb-loose", status));
5277     status.errIfFailureAndReset();
5278     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "loose");
5279 }
5280 
TestTraceCreateBreakEngine(void)5281 void RBBITest::TestTraceCreateBreakEngine(void) {
5282     rbbi_cleanup();
5283     SetupTestTrace();
5284     IcuTestErrorCode status(*this, "TestTraceCreateBreakEngine");
5285     LocalPointer<BreakIterator> brkitr(
5286         BreakIterator::createWordInstance("zh-CN", status));
5287     status.errIfFailureAndReset();
5288     assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5289 
5290     // To word break the following text, BreakIterator will create 5 dictionary
5291     // break engine internally.
5292     brkitr->setText(
5293         u"test "
5294         u"測試 " // Hani
5295         u"សាកល្បង " // Khmr
5296         u"ທົດສອບ " // Laoo
5297         u"စမ်းသပ်မှု " // Mymr
5298         u"ทดสอบ " // Thai
5299         u"test "
5300     );
5301 
5302     // Loop through all the text.
5303     while (brkitr->next() > 0) ;
5304 
5305     assertEquals("utrace_entry should be called ", 6, gEntryFn.size());
5306     assertEquals("utrace_exit should be called ", 6, gExitFn.size());
5307     assertEquals("utrace_data should be called ", 5, gDataFn.size());
5308 
5309     for (std::vector<int>::size_type i = 0; i < gDataFn.size(); i++) {
5310         assertEquals("utrace_entry should be called ",
5311                      UTRACE_UBRK_CREATE_BREAK_ENGINE, gEntryFn[i+1]);
5312         assertEquals("utrace_exit should be called ",
5313                      UTRACE_UBRK_CREATE_BREAK_ENGINE, gExitFn[i+1]);
5314         assertEquals("utrace_data should be called ",
5315                      UTRACE_UBRK_CREATE_BREAK_ENGINE, gDataFn[i]);
5316     }
5317 
5318     assertEquals("utrace_data should pass ", "Hani", gData[0].c_str());
5319     assertEquals("utrace_data should pass ", "Khmr", gData[1].c_str());
5320     assertEquals("utrace_data should pass ", "Laoo", gData[2].c_str());
5321     assertEquals("utrace_data should pass ", "Mymr", gData[3].c_str());
5322     assertEquals("utrace_data should pass ", "Thai", gData[4].c_str());
5323 
5324 }
5325 #endif
5326 
TestUnpairedSurrogate()5327 void RBBITest::TestUnpairedSurrogate() {
5328     UnicodeString rules(u"ab;");
5329 
5330     UErrorCode status = U_ZERO_ERROR;
5331     UParseError pe;
5332     RuleBasedBreakIterator bi1(rules, pe, status);
5333     assertSuccess(WHERE, status);
5334     UnicodeString rtRules = bi1.getRules();
5335     // make sure the simple one work first.
5336     assertEquals(WHERE, rules,  rtRules);
5337 
5338 
5339     rules = UnicodeString(u"a\\ud800b;").unescape();
5340     pe.line = 0;
5341     pe.offset = 0;
5342     RuleBasedBreakIterator bi2(rules, pe, status);
5343     assertEquals(WHERE "unpaired lead surrogate", U_ILLEGAL_CHAR_FOUND , status);
5344     if (pe.line != 1 || pe.offset != 1) {
5345         errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
5346     }
5347 
5348     status = U_ZERO_ERROR;
5349     rules = UnicodeString(u"a\\ude00b;").unescape();
5350     pe.line = 0;
5351     pe.offset = 0;
5352     RuleBasedBreakIterator bi3(rules, pe, status);
5353     assertEquals(WHERE "unpaired tail surrogate", U_ILLEGAL_CHAR_FOUND , status);
5354     if (pe.line != 1 || pe.offset != 1) {
5355         errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
5356     }
5357 
5358     // make sure the surrogate one work too.
5359     status = U_ZERO_ERROR;
5360     rules = UnicodeString(u"a��b;");
5361     RuleBasedBreakIterator bi4(rules, pe, status);
5362     rtRules = bi4.getRules();
5363     assertEquals(WHERE, rules, rtRules);
5364 }
5365 
5366 #endif // #if !UCONFIG_NO_BREAK_ITERATION
5367