• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4  * COPYRIGHT:
5  * Copyright (c) 1999-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  ********************************************************************/
8 /************************************************************************
9 *   Date        Name        Description
10 *   12/15/99    Madhu        Creation.
11 *   01/12/2000  Madhu        Updated for changed API and added new tests
12 ************************************************************************/
13 
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_BREAK_ITERATION
16 
17 #include <sstream>
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #include <utility>
22 #include <vector>
23 
24 #include "unicode/brkiter.h"
25 #include "unicode/localpointer.h"
26 #include "unicode/numfmt.h"
27 #include "unicode/rbbi.h"
28 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
29 #include "unicode/regex.h"
30 #endif
31 #include "unicode/schriter.h"
32 #include "unicode/uchar.h"
33 #include "unicode/utf16.h"
34 #include "unicode/ucnv.h"
35 #include "unicode/uniset.h"
36 #include "unicode/uscript.h"
37 #include "unicode/ustring.h"
38 #include "unicode/utext.h"
39 #include "unicode/utrace.h"
40 
41 #include "charstr.h"
42 #include "cmemory.h"
43 #include "cstr.h"
44 #include "intltest.h"
45 #include "rbbitst.h"
46 #include "rbbidata.h"
47 #include "utypeinfo.h"  // for 'typeid' to work
48 #include "uvector.h"
49 #include "uvectr32.h"
50 
51 
52 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
53 #include "unicode/filteredbrk.h"
54 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
55 
// TEST_ASSERT(x): when the expression x evaluates to false, report a test
// failure through errln(), including the source file and line of the macro use.
// x is evaluated exactly once. Execution continues after a failure; callers
// that must stop have to test the condition themselves.
// NOTE(review): UPRV_BLOCK_MACRO_BEGIN/END are defined elsewhere; presumably they
// wrap the body so the macro behaves as a single statement - confirm.
56 #define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
57     if (!(x)) { \
58         errln("Failure in file %s, line %d", __FILE__, __LINE__); \
59     } \
60 } UPRV_BLOCK_MACRO_END
61 
// TEST_ASSERT_SUCCESS(errcode): when errcode is a failure code (U_FAILURE),
// report it through errcheckln() together with the source file, line and the
// symbolic name of the error. The macro does not return from the calling
// function; callers check U_FAILURE(errcode) themselves when they need to bail out.
// Note that errcode appears more than once in the expansion, so it should be a
// plain variable rather than an expression with side effects.
62 #define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \
63     if (U_FAILURE(errcode)) { \
64         errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
65     } \
66 } UPRV_BLOCK_MACRO_END
67 
// MONKEY_ERROR(msg, fRuleFileName, index, seed): report a monkey-test failure
// through IntlTest::gTest->errln(). The message carries the source location,
// msg, the failing index, and a parameter string (type=<fRuleFileName>,
// seed=<seed>, loop=1) that can be used to reproduce the failure.
// NOTE(review): unlike TEST_ASSERT / TEST_ASSERT_SUCCESS this expands to a bare
// { ... } block rather than UPRV_BLOCK_MACRO_BEGIN/END, so "MONKEY_ERROR(...);"
// directly before an "else" would not parse as intended - confirm call sites
// before changing it.
68 #define MONKEY_ERROR(msg, fRuleFileName, index, seed) { \
69     IntlTest::gTest->errln("\n%s:%d %s at index %d. Parameters to reproduce: @\"type=%s seed=%u loop=1\"", \
70                     __FILE__, __LINE__, msg, index, fRuleFileName, seed); \
71 }
72 
73 //---------------------------------------------
74 // runIndexedTest
75 //---------------------------------------------
76 
77 
//  Note:  Before adding new tests to this file, check whether the desired test data can
//         simply be added to the file testdata/rbbitst.txt.  In most cases it can;
//         it's much less work than writing a new test, diagnostic output in the event of failures
//         is good, and the test data file is shared with ICU4J, so eventually the test
//         will run there as well, without additional effort.
83 
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)84 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
85 {
86     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
87     fTestParams = params;
88 
89     TESTCASE_AUTO_BEGIN;
90 #if !UCONFIG_NO_FILE_IO
91     TESTCASE_AUTO(TestBug4153072);
92 #endif
93 #if !UCONFIG_NO_FILE_IO
94     TESTCASE_AUTO(TestUnicodeFiles);
95 #endif
96     TESTCASE_AUTO(TestGetAvailableLocales);
97     TESTCASE_AUTO(TestGetDisplayName);
98 #if !UCONFIG_NO_FILE_IO
99     TESTCASE_AUTO(TestEndBehaviour);
100     TESTCASE_AUTO(TestWordBreaks);
101     TESTCASE_AUTO(TestWordBoundary);
102     TESTCASE_AUTO(TestLineBreaks);
103     TESTCASE_AUTO(TestSentBreaks);
104     TESTCASE_AUTO(TestExtended);
105 #endif
106 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
107     TESTCASE_AUTO(TestMonkey);
108 #endif
109 #if !UCONFIG_NO_FILE_IO
110     TESTCASE_AUTO(TestBug3818);
111 #endif
112     TESTCASE_AUTO(TestDebug);
113 #if !UCONFIG_NO_FILE_IO
114     TESTCASE_AUTO(TestBug5775);
115 #endif
116     TESTCASE_AUTO(TestBug9983);
117     TESTCASE_AUTO(TestDictRules);
118     TESTCASE_AUTO(TestBug5532);
119     TESTCASE_AUTO(TestBug7547);
120     TESTCASE_AUTO(TestBug12797);
121     TESTCASE_AUTO(TestBug12918);
122     TESTCASE_AUTO(TestBug12932);
123     TESTCASE_AUTO(TestEmoji);
124     TESTCASE_AUTO(TestBug12519);
125     TESTCASE_AUTO(TestBug12677);
126     TESTCASE_AUTO(TestTableRedundancies);
127     TESTCASE_AUTO(TestBug13447);
128     TESTCASE_AUTO(TestReverse);
129     TESTCASE_AUTO(TestBug13692);
130     TESTCASE_AUTO(TestDebugRules);
131     TESTCASE_AUTO(Test8BitsTrieWith8BitStateTable);
132     TESTCASE_AUTO(Test8BitsTrieWith16BitStateTable);
133     TESTCASE_AUTO(Test16BitsTrieWith8BitStateTable);
134     TESTCASE_AUTO(Test16BitsTrieWith16BitStateTable);
135     TESTCASE_AUTO(TestTable_8_16_Bits);
136     TESTCASE_AUTO(TestBug13590);
137 
138 #if U_ENABLE_TRACING
139     TESTCASE_AUTO(TestTraceCreateCharacter);
140     TESTCASE_AUTO(TestTraceCreateWord);
141     TESTCASE_AUTO(TestTraceCreateSentence);
142     TESTCASE_AUTO(TestTraceCreateTitle);
143     TESTCASE_AUTO(TestTraceCreateLine);
144     TESTCASE_AUTO(TestTraceCreateLineNormal);
145     TESTCASE_AUTO(TestTraceCreateLineLoose);
146     TESTCASE_AUTO(TestTraceCreateLineStrict);
147     TESTCASE_AUTO(TestTraceCreateBreakEngine);
148 #endif
149 
150     TESTCASE_AUTO_END;
151 }
152 
153 
154 //--------------------------------------------------------------------------------------
155 //
156 //    RBBITest    constructor and destructor
157 //
158 //--------------------------------------------------------------------------------------
159 
RBBITest()160 RBBITest::RBBITest() {
161     fTestParams = NULL;
162 }
163 
164 
~RBBITest()165 RBBITest::~RBBITest() {
166 }
167 
168 
printStringBreaks(UText * tstr,int expected[],int expectedCount)169 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
170     UErrorCode status = U_ZERO_ERROR;
171     char name[100];
172     printf("code    alpha extend alphanum type word sent line name\n");
173     int nextExpectedIndex = 0;
174     utext_setNativeIndex(tstr, 0);
175     for (int j = 0; j < static_cast<int>(utext_nativeLength(tstr)); j=static_cast<int>(utext_getNativeIndex(tstr))) {
176         if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
177             printf("------------------------------------------------ %d\n", j);
178             ++nextExpectedIndex;
179         }
180 
181         UChar32 c = utext_next32(tstr);
182         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
183         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
184                            u_isUAlphabetic(c),
185                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
186                            u_isalnum(c),
187                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
188                                                   u_charType(c),
189                                                   U_SHORT_PROPERTY_NAME),
190                            u_getPropertyValueName(UCHAR_WORD_BREAK,
191                                                   u_getIntPropertyValue(c,
192                                                           UCHAR_WORD_BREAK),
193                                                   U_SHORT_PROPERTY_NAME),
194                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
195                                    u_getIntPropertyValue(c,
196                                            UCHAR_SENTENCE_BREAK),
197                                    U_SHORT_PROPERTY_NAME),
198                            u_getPropertyValueName(UCHAR_LINE_BREAK,
199                                    u_getIntPropertyValue(c,
200                                            UCHAR_LINE_BREAK),
201                                    U_SHORT_PROPERTY_NAME),
202                            name);
203     }
204 }
205 
206 
printStringBreaks(const UnicodeString & ustr,int expected[],int expectedCount)207 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
208    UErrorCode status = U_ZERO_ERROR;
209    UText *tstr = NULL;
210    tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
211    if (U_FAILURE(status)) {
212        printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
213        return;
214     }
215    printStringBreaks(tstr, expected, expectedCount);
216    utext_close(tstr);
217 }
218 
219 
TestBug3818()220 void RBBITest::TestBug3818() {
221     UErrorCode  status = U_ZERO_ERROR;
222 
223     // Four Thai words...
224     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
225                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
226     UnicodeString  thaiStr(thaiWordData);
227 
228     BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
229     if (U_FAILURE(status) || bi == NULL) {
230         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
231         return;
232     }
233     bi->setText(thaiStr);
234 
235     int32_t  startOfSecondWord = bi->following(1);
236     if (startOfSecondWord != 4) {
237         errln("Fail at file %s, line %d expected start of word at 4, got %d",
238             __FILE__, __LINE__, startOfSecondWord);
239     }
240     startOfSecondWord = bi->following(0);
241     if (startOfSecondWord != 4) {
242         errln("Fail at file %s, line %d expected start of word at 4, got %d",
243             __FILE__, __LINE__, startOfSecondWord);
244     }
245     delete bi;
246 }
247 
248 
249 //---------------------------------------------
250 //
251 //     other tests
252 //
253 //---------------------------------------------
254 
TestGetAvailableLocales()255 void RBBITest::TestGetAvailableLocales()
256 {
257     int32_t locCount = 0;
258     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
259 
260     if (locCount == 0)
261         dataerrln("getAvailableLocales() returned an empty list!");
262     // Just make sure that it's returning good memory.
263     int32_t i;
264     for (i = 0; i < locCount; ++i) {
265         logln(locList[i].getName());
266     }
267 }
268 
269 //Testing the BreakIterator::getDisplayName() function
TestGetDisplayName()270 void RBBITest::TestGetDisplayName()
271 {
272     UnicodeString   result;
273 
274     BreakIterator::getDisplayName(Locale::getUS(), result);
275     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
276         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
277                 + result);
278 
279     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
280     if (result != "French (France)")
281         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
282                 + result);
283 }
284 /**
285  * Test End Behaviour
286  * @bug 4068137
287  */
TestEndBehaviour()288 void RBBITest::TestEndBehaviour()
289 {
290     UErrorCode status = U_ZERO_ERROR;
291     UnicodeString testString("boo.");
292     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
293     if (U_FAILURE(status))
294     {
295         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
296         return;
297     }
298     wb->setText(testString);
299 
300     if (wb->first() != 0)
301         errln("Didn't get break at beginning of string.");
302     if (wb->next() != 3)
303         errln("Didn't get break before period in \"boo.\"");
304     if (wb->current() != 4 && wb->next() != 4)
305         errln("Didn't get break at end of string.");
306     delete wb;
307 }
308 /*
309  * @bug 4153072
310  */
TestBug4153072()311 void RBBITest::TestBug4153072() {
312     UErrorCode status = U_ZERO_ERROR;
313     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
314     if (U_FAILURE(status))
315     {
316         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
317         return;
318     }
319     UnicodeString str("...Hello, World!...");
320     int32_t begin = 3;
321     int32_t end = str.length() - 3;
322     UBool onBoundary;
323 
324     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
325     iter->adoptText(textIterator);
326     int index;
327     // Note: with the switch to UText, there is no way to restrict the
328     //       iteration range to begin at an index other than zero.
329     //       String character iterators created with a non-zero bound are
330     //         treated by RBBI as being empty.
331     for (index = -1; index < begin + 1; ++index) {
332         onBoundary = iter->isBoundary(index);
333         if (index == 0?  !onBoundary : onBoundary) {
334             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
335                             " and begin index = " + begin);
336         }
337     }
338     delete iter;
339 }
340 
341 
//
// Test for problem reported by Ashok Matoria on 9 July 2007
//    One.<kSoftHyphen><kSpace>Two.
//
//    Sentence break at start (0) and then on calling next() it breaks at
//   'T' of "Two". Now, at this point if I do next() and
//    then previous(), it breaks at <kSoftHyphen> instead of 'T' of "Two".
//
TestBug5775()350 void RBBITest::TestBug5775() {
351     UErrorCode status = U_ZERO_ERROR;
352     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
353     TEST_ASSERT_SUCCESS(status);
354     if (U_FAILURE(status)) {
355         return;
356     }
357 // Check for status first for better handling of no data errors.
358     TEST_ASSERT(bi != NULL);
359     if (bi == NULL) {
360         return;
361     }
362 
363     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
364     //               01234      56789
365     s = s.unescape();
366     bi->setText(s);
367     int pos = bi->next();
368     TEST_ASSERT(pos == 6);
369     pos = bi->next();
370     TEST_ASSERT(pos == 10);
371     pos = bi->previous();
372     TEST_ASSERT(pos == 6);
373     delete bi;
374 }
375 
376 
377 
378 //------------------------------------------------------------------------------
379 //
380 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
381 //
382 //------------------------------------------------------------------------------
383 
384 struct TestParams {
385     BreakIterator   *bi;                   // Break iterator is set while parsing test source.
386                                            //   Changed out whenever test data changes break type.
387 
388     UnicodeString    dataToBreak;          // Data that is built up while parsing the test.
389     UVector32       *expectedBreaks;       // Expected break positions, matches dataToBreak UnicodeString.
390     UVector32       *srcLine;              // Positions in source file, indexed same as dataToBreak.
391     UVector32       *srcCol;
392 
393     UText           *textToBreak;          // UText, could be UTF8 or UTF16.
394     UVector32       *textMap;              // Map from UTF-16 dataToBreak offsets to UText offsets.
395     CharString       utf8String;           // UTF-8 form of text to break.
396 
TestParamsTestParams397     TestParams(UErrorCode &status) : dataToBreak() {
398         bi               = NULL;
399         expectedBreaks   = new UVector32(status);
400         srcLine          = new UVector32(status);
401         srcCol           = new UVector32(status);
402         textToBreak      = NULL;
403         textMap          = new UVector32(status);
404     }
405 
~TestParamsTestParams406     ~TestParams() {
407         delete bi;
408         delete expectedBreaks;
409         delete srcLine;
410         delete srcCol;
411         utext_close(textToBreak);
412         delete textMap;
413     }
414 
415     int32_t getSrcLine(int32_t bp);
416     int32_t getExpectedBreak(int32_t bp);
417     int32_t getSrcCol(int32_t bp);
418 
419     void setUTF16(UErrorCode &status);
420     void setUTF8(UErrorCode &status);
421 };
422 
423 // Append a UnicodeString to a CharString with UTF-8 encoding.
424 // Substitute any invalid chars.
425 //   Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
CharStringAppend(CharString & dest,const UnicodeString & src,UErrorCode & status)426 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
427     if (U_FAILURE(status)) {
428         return;
429     }
430     int32_t utf8Length;
431     u_strToUTF8WithSub(NULL, 0, &utf8Length,            // Output Buffer, NULL for preflight.
432                        src.getBuffer(), src.length(),   // UTF-16 data
433                        0xfffd, NULL,                    // Substitution char, number of subs.
434                        &status);
435     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
436         return;
437     }
438     status = U_ZERO_ERROR;
439     int32_t capacity;
440     char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
441     u_strToUTF8WithSub(buffer, utf8Length, NULL,
442                        src.getBuffer(), src.length(),
443                        0xfffd, NULL, &status);
444     dest.append(buffer, utf8Length, status);
445 }
446 
447 
setUTF16(UErrorCode & status)448 void TestParams::setUTF16(UErrorCode &status) {
449     textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
450     textMap->removeAllElements();
451     for (int32_t i=0; i<dataToBreak.length(); i++) {
452         if (i == dataToBreak.getChar32Start(i)) {
453             textMap->addElement(i, status);
454         } else {
455             textMap->addElement(-1, status);
456         }
457     }
458     textMap->addElement(dataToBreak.length(), status);
459     U_ASSERT(dataToBreak.length() + 1 == textMap->size());
460 }
461 
462 
setUTF8(UErrorCode & status)463 void TestParams::setUTF8(UErrorCode &status) {
464     if (U_FAILURE(status)) {
465         return;
466     }
467     utf8String.clear();
468     CharStringAppend(utf8String, dataToBreak, status);
469     textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
470     if (U_FAILURE(status)) {
471         return;
472     }
473 
474     textMap->removeAllElements();
475     int32_t utf16Index = 0;
476     for (;;) {
477         textMap->addElement(utf16Index, status);
478         UChar32 c32 = utext_current32(textToBreak);
479         if (c32 < 0) {
480             break;
481         }
482         utf16Index += U16_LENGTH(c32);
483         utext_next32(textToBreak);
484         while (textMap->size() < utext_getNativeIndex(textToBreak)) {
485             textMap->addElement(-1, status);
486         }
487     }
488     U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
489 }
490 
491 
getSrcLine(int32_t bp)492 int32_t TestParams::getSrcLine(int32_t bp) {
493     if (bp >= textMap->size()) {
494         bp = textMap->size() - 1;
495     }
496     int32_t i = 0;
497     for(; bp >= 0 ; --bp) {
498         // Move to a character boundary if we are not on one already.
499         i = textMap->elementAti(bp);
500         if (i >= 0) {
501             break;
502         }
503     }
504     return srcLine->elementAti(i);
505 }
506 
507 
getExpectedBreak(int32_t bp)508 int32_t TestParams::getExpectedBreak(int32_t bp) {
509     if (bp >= textMap->size()) {
510         return 0;
511     }
512     int32_t i = textMap->elementAti(bp);
513     int32_t retVal = 0;
514     if (i >= 0) {
515         retVal = expectedBreaks->elementAti(i);
516     }
517     return retVal;
518 }
519 
520 
getSrcCol(int32_t bp)521 int32_t TestParams::getSrcCol(int32_t bp) {
522     if (bp >= textMap->size()) {
523         bp = textMap->size() - 1;
524     }
525     int32_t i = 0;
526     for(; bp >= 0; --bp) {
527         // Move bp to a character boundary if we are not on one already.
528         i = textMap->elementAti(bp);
529         if (i >= 0) {
530             break;
531         }
532     }
533     return srcCol->elementAti(i);
534 }
535 
536 
executeTest(TestParams * t,UErrorCode & status)537 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
538     int32_t    bp;
539     int32_t    prevBP;
540     int32_t    i;
541 
542     TEST_ASSERT_SUCCESS(status);
543     if (U_FAILURE(status)) {
544         return;
545     }
546 
547     if (t->bi == NULL) {
548         return;
549     }
550 
551     t->bi->setText(t->textToBreak, status);
552     //
553     //  Run the iterator forward
554     //
555     prevBP = -1;
556     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
557         if (prevBP ==  bp) {
558             // Fail for lack of forward progress.
559             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
560                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
561             break;
562         }
563 
564         // Check that there we didn't miss an expected break between the last one
565         //  and this one.
566         for (i=prevBP+1; i<bp; i++) {
567             if (t->getExpectedBreak(i) != 0) {
568                 int expected[] = {0, i};
569                 printStringBreaks(t->dataToBreak, expected, 2);
570                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
571                       i, t->getSrcLine(i), t->getSrcCol(i));
572             }
573         }
574 
575         // Check that the break we did find was expected
576         if (t->getExpectedBreak(bp) == 0) {
577             int expected[] = {0, bp};
578             printStringBreaks(t->textToBreak, expected, 2);
579             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
580                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
581         } else {
582             // The break was expected.
583             //   Check that the {nnn} tag value is correct.
584             int32_t expectedTagVal = t->getExpectedBreak(bp);
585             if (expectedTagVal == -1) {
586                 expectedTagVal = 0;
587             }
588             int32_t line = t->getSrcLine(bp);
589             int32_t rs = t->bi->getRuleStatus();
590             if (rs != expectedTagVal) {
591                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
592                       "          Actual, Expected status = %4d, %4d",
593                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
594             }
595         }
596 
597         prevBP = bp;
598     }
599 
600     // Verify that there were no missed expected breaks after the last one found
601     for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
602         if (t->getExpectedBreak(i) != 0) {
603             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
604                       i, t->getSrcLine(i), t->getSrcCol(i));
605         }
606     }
607 
608     //
609     //  Run the iterator backwards, verify that the same breaks are found.
610     //
611     prevBP = static_cast<int32_t>(utext_nativeLength(t->textToBreak) + 2); // start with a phony value for the last break pos seen.
612     bp = t->bi->last();
613     while (bp != BreakIterator::DONE) {
614         if (prevBP ==  bp) {
615             // Fail for lack of progress.
616             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
617                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
618             break;
619         }
620 
621         // Check that we didn't miss an expected break between the last one
622         //  and this one.  (UVector returns zeros for index out of bounds.)
623         for (i=prevBP-1; i>bp; i--) {
624             if (t->getExpectedBreak(i) != 0) {
625                 errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
626                       i, t->getSrcLine(i), t->getSrcCol(i));
627             }
628         }
629 
630         // Check that the break we did find was expected
631         if (t->getExpectedBreak(bp) == 0) {
632             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
633                    bp, t->getSrcLine(bp), t->getSrcCol(bp));
634         } else {
635             // The break was expected.
636             //   Check that the {nnn} tag value is correct.
637             int32_t expectedTagVal = t->getExpectedBreak(bp);
638             if (expectedTagVal == -1) {
639                 expectedTagVal = 0;
640             }
641             int line = t->getSrcLine(bp);
642             int32_t rs = t->bi->getRuleStatus();
643             if (rs != expectedTagVal) {
644                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
645                       "          Actual, Expected status = %4d, %4d",
646                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
647             }
648         }
649 
650         prevBP = bp;
651         bp = t->bi->previous();
652     }
653 
654     // Verify that there were no missed breaks prior to the last one found
655     for (i=prevBP-1; i>=0; i--) {
656         if (t->getExpectedBreak(i) != 0) {
657             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
658                       i, t->getSrcLine(i), t->getSrcCol(i));
659         }
660     }
661 
662     // Check isBoundary()
663     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
664         UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
665         UBool boundaryFound    = t->bi->isBoundary(i);
666         if (boundaryExpected != boundaryFound) {
667             errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
668                   "        Expected, Actual= %s, %s",
669                   i, t->getSrcLine(i), t->getSrcCol(i),
670                   boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
671         }
672     }
673 
674     // Check following()
675     for (i=0; i < static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i++) {
676         int32_t actualBreak = t->bi->following(i);
677         int32_t expectedBreak = BreakIterator::DONE;
678         for (int32_t j=i+1; j <= static_cast<int32_t>(utext_nativeLength(t->textToBreak)); j++) {
679             if (t->getExpectedBreak(j) != 0) {
680                 expectedBreak = j;
681                 break;
682             }
683         }
684         if (expectedBreak != actualBreak) {
685             errln("following(%d) incorrect. File line,col= %4d,%4d\n"
686                   "        Expected, Actual= %d, %d",
687                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
688         }
689     }
690 
691     // Check preceding()
692     for (i=static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i>=0; i--) {
693         int32_t actualBreak = t->bi->preceding(i);
694         int32_t expectedBreak = BreakIterator::DONE;
695 
696         // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
697         // preceding(trailing byte) will return the index of some preceding code point,
698         // not the lead byte of the current code point, even though that has a smaller index.
699         // Therefore, start looking at the expected break data not at i-1, but at
700         // the start of code point index - 1.
701         utext_setNativeIndex(t->textToBreak, i);
702         int32_t j = static_cast<int32_t>(utext_getNativeIndex(t->textToBreak) - 1);
703         for (; j >= 0; j--) {
704             if (t->getExpectedBreak(j) != 0) {
705                 expectedBreak = j;
706                 break;
707             }
708         }
709         if (expectedBreak != actualBreak) {
710             errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
711                   "        Expected, Actual= %d, %d",
712                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
713         }
714     }
715 }
716 
717 
TestExtended()718 void RBBITest::TestExtended() {
719   // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
720   // data driven test closely entangles filtered and regular data.
721 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
722     UErrorCode      status  = U_ZERO_ERROR;
723     Locale          locale("");
724 
725     TestParams          tp(status);
726 
727     RegexMatcher      localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
728     if (U_FAILURE(status)) {
729         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
730     }
731 
732     //
733     //  Open and read the test data file.
734     //
735     const char *testDataDirectory = IntlTest::getSourceTestData(status);
736     CharString testFileName(testDataDirectory, -1, status);
737     testFileName.append("rbbitst.txt", -1, status);
738 
739     int    len;
740     UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
741     if (U_FAILURE(status)) {
742         errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
743         return;
744     }
745 
746     bool skipTest = false; // Skip this test?
747 
748     //
749     //  Put the test data into a UnicodeString
750     //
751     UnicodeString testString(FALSE, testFile, len);
752 
753     enum EParseState{
754         PARSE_COMMENT,
755         PARSE_TAG,
756         PARSE_DATA,
757         PARSE_NUM,
758         PARSE_RULES
759     }
760     parseState = PARSE_TAG;
761 
762     EParseState savedState = PARSE_TAG;
763 
764     int32_t    lineNum  = 1;
765     int32_t    colStart = 0;
766     int32_t    column   = 0;
767     int32_t    charIdx  = 0;
768 
769     int32_t    tagValue = 0;             // The numeric value of a <nnn> tag.
770 
771     UnicodeString       rules;           // Holds rules from a <rules> ... </rules> block
772     int32_t             rulesFirstLine = 0;  // Line number of the start of current <rules> block
773 
774     for (charIdx = 0; charIdx < len; ) {
775         status = U_ZERO_ERROR;
776         UChar  c = testString.charAt(charIdx);
777         charIdx++;
778         if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
779             // treat CRLF as a unit
780             c = u'\n';
781             charIdx++;
782         }
783         if (c == u'\n' || c == u'\r') {
784             lineNum++;
785             colStart = charIdx;
786         }
787         column = charIdx - colStart + 1;
788 
789         switch (parseState) {
790         case PARSE_COMMENT:
791             if (c == u'\n' || c == u'\r') {
792                 parseState = savedState;
793             }
794             break;
795 
796         case PARSE_TAG:
797             {
798             if (c == u'#') {
799                 parseState = PARSE_COMMENT;
800                 savedState = PARSE_TAG;
801                 break;
802             }
803             if (u_isUWhiteSpace(c)) {
804                 break;
805             }
806             if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
807                 delete tp.bi;
808                 tp.bi = BreakIterator::createWordInstance(locale,  status);
809                 skipTest = false;
810                 charIdx += 5;
811                 break;
812             }
813             if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
814                 delete tp.bi;
815                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
816                 skipTest = false;
817                 charIdx += 5;
818                 break;
819             }
820             if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
821                 delete tp.bi;
822                 tp.bi = BreakIterator::createLineInstance(locale,  status);
823                 skipTest = false;
824                 charIdx += 5;
825                 break;
826             }
827             if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
828                 delete tp.bi;
829                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
830                 skipTest = false;
831                 charIdx += 5;
832                 break;
833             }
834             if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
835                 delete tp.bi;
836                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
837                 charIdx += 6;
838                 break;
839             }
840 
841             if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
842                 testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
843                 charIdx = testString.indexOf(u'>', charIdx) + 1;
844                 parseState = PARSE_RULES;
845                 rules.remove();
846                 rulesFirstLine = lineNum;
847                 break;
848             }
849 
850             // <locale  loc_name>
851             localeMatcher.reset(testString);
852             if (localeMatcher.lookingAt(charIdx-1, status)) {
853                 UnicodeString localeName = localeMatcher.group(1, status);
854                 char localeName8[100];
855                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
856                 locale = Locale::createFromName(localeName8);
857                 charIdx += localeMatcher.group(0, status).length() - 1;
858                 TEST_ASSERT_SUCCESS(status);
859                 break;
860             }
861             if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
862                 parseState = PARSE_DATA;
863                 charIdx += 5;
864                 tp.dataToBreak = "";
865                 tp.expectedBreaks->removeAllElements();
866                 tp.srcCol ->removeAllElements();
867                 tp.srcLine->removeAllElements();
868                 break;
869             }
870 
871             errln("line %d: Tag expected in test file.", lineNum);
872             parseState = PARSE_COMMENT;
873             savedState = PARSE_DATA;
874             goto end_test; // Stop the test.
875             }
876             break;
877 
878         case PARSE_RULES:
879             if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
880                 charIdx += 7;
881                 parseState = PARSE_TAG;
882                 delete tp.bi;
883                 UParseError pe;
884                 tp.bi = new RuleBasedBreakIterator(rules, pe, status);
885                 skipTest = U_FAILURE(status);
886                 if (U_FAILURE(status)) {
887                     errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
888                         rulesFirstLine + pe.line - 1, u_errorName(status));
889                 }
890             } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
891                 charIdx += 10;
892                 parseState = PARSE_TAG;
893                 UErrorCode ec = U_ZERO_ERROR;
894                 UParseError pe;
895                 RuleBasedBreakIterator bi(rules, pe, ec);
896                 if (U_SUCCESS(ec)) {
897                     errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
898                         rulesFirstLine + pe.line - 1);
899                 }
900             } else {
901                 rules.append(c);
902             }
903             break;
904 
905         case PARSE_DATA:
906             if (c == u'•') {
907                 int32_t  breakIdx = tp.dataToBreak.length();
908                 if (tp.expectedBreaks->size() > breakIdx) {
909                     errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
910                           lineNum, column);
911                 }
912                 tp.expectedBreaks->setSize(breakIdx+1);
913                 tp.expectedBreaks->setElementAt(-1, breakIdx);
914                 tp.srcLine->setSize(breakIdx+1);
915                 tp.srcLine->setElementAt(lineNum, breakIdx);
916                 tp.srcCol ->setSize(breakIdx+1);
917                 tp.srcCol ->setElementAt(column, breakIdx);
918                 break;
919             }
920 
921             if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
922                 // Add final entry to mappings from break location to source file position.
923                 //  Need one extra because last break position returned is after the
924                 //    last char in the data, not at the last char.
925                 tp.srcLine->addElement(lineNum, status);
926                 tp.srcCol ->addElement(column, status);
927 
928                 parseState = PARSE_TAG;
929                 charIdx += 6;
930 
931                 if (!skipTest) {
932                     // RUN THE TEST!
933                     status = U_ZERO_ERROR;
934                     tp.setUTF16(status);
935                     executeTest(&tp, status);
936                     TEST_ASSERT_SUCCESS(status);
937 
938                     // Run again, this time with UTF-8 text wrapped in a UText.
939                     status = U_ZERO_ERROR;
940                     tp.setUTF8(status);
941                     TEST_ASSERT_SUCCESS(status);
942                     executeTest(&tp, status);
943                 }
944                 break;
945             }
946 
947             if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
948                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
949                 // Get the code point from the name and insert it into the test data.
950                 //   (Damn, no API takes names in Unicode  !!!
951                 //    we've got to take it back to char *)
952                 int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
953                 int32_t nameLength = nameEndIdx - (charIdx+2);
954                 char charNameBuf[200];
955                 UChar32 theChar = -1;
956                 if (nameEndIdx != -1) {
957                     UErrorCode status = U_ZERO_ERROR;
958                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
959                     charNameBuf[sizeof(charNameBuf)-1] = 0;
960                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
961                     if (U_FAILURE(status)) {
962                         theChar = -1;
963                     }
964                 }
965                 if (theChar == -1) {
966                     errln("Error in named character in test file at line %d, col %d",
967                         lineNum, column);
968                 } else {
969                     // Named code point was recognized.  Insert it
970                     //   into the test data.
971                     tp.dataToBreak.append(theChar);
972                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
973                         tp.srcLine->addElement(lineNum, status);
974                         tp.srcCol ->addElement(column, status);
975                     }
976                 }
977                 if (nameEndIdx > charIdx) {
978                     charIdx = nameEndIdx+1;
979 
980                 }
981                 break;
982             }
983 
984 
985 
986             if (testString.compare(charIdx-1, 2, u"<>") == 0) {
987                 charIdx++;
988                 int32_t  breakIdx = tp.dataToBreak.length();
989                 tp.expectedBreaks->setSize(breakIdx+1);
990                 tp.expectedBreaks->setElementAt(-1, breakIdx);
991                 tp.srcLine->setSize(breakIdx+1);
992                 tp.srcLine->setElementAt(lineNum, breakIdx);
993                 tp.srcCol ->setSize(breakIdx+1);
994                 tp.srcCol ->setElementAt(column, breakIdx);
995                 break;
996             }
997 
998             if (c == u'<') {
999                 tagValue   = 0;
1000                 parseState = PARSE_NUM;
1001                 break;
1002             }
1003 
1004             if (c == u'#' && column==3) {   // TODO:  why is column off so far?
1005                 parseState = PARSE_COMMENT;
1006                 savedState = PARSE_DATA;
1007                 break;
1008             }
1009 
1010             if (c == u'\\') {
1011                 // Check for \ at end of line, a line continuation.
1012                 //     Advance over (discard) the newline
1013                 UChar32 cp = testString.char32At(charIdx);
1014                 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
1015                     // We have a CR LF
1016                     //  Need an extra increment of the input ptr to move over both of them
1017                     charIdx++;
1018                 }
1019                 if (cp == u'\n' || cp == u'\r') {
1020                     lineNum++;
1021                     colStart = charIdx;
1022                     charIdx++;
1023                     break;
1024                 }
1025 
1026                 // Let unescape handle the back slash.
1027                 cp = testString.unescapeAt(charIdx);
1028                 if (cp != -1) {
1029                     // Escape sequence was recognized.  Insert the char
1030                     //   into the test data.
1031                     tp.dataToBreak.append(cp);
1032                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1033                         tp.srcLine->addElement(lineNum, status);
1034                         tp.srcCol ->addElement(column, status);
1035                     }
1036                     break;
1037                 }
1038 
1039 
1040                 // Not a recognized backslash escape sequence.
1041                 // Take the next char as a literal.
1042                 //  TODO:  Should this be an error?
1043                 c = testString.charAt(charIdx);
1044                 charIdx = testString.moveIndex32(charIdx, 1);
1045             }
1046 
1047             // Normal, non-escaped data char.
1048             tp.dataToBreak.append(c);
1049 
1050             // Save the mapping from offset in the data to line/column numbers in
1051             //   the original input file.  Will be used for better error messages only.
1052             //   If there's an expected break before this char, the slot in the mapping
1053             //     vector will already be set for this char; don't overwrite it.
1054             if (tp.dataToBreak.length() > tp.srcLine->size()) {
1055                 tp.srcLine->addElement(lineNum, status);
1056                 tp.srcCol ->addElement(column, status);
1057             }
1058             break;
1059 
1060 
1061         case PARSE_NUM:
1062             // We are parsing an expected numeric tag value, like <1234>,
1063             //   within a chunk of data.
1064             if (u_isUWhiteSpace(c)) {
1065                 break;
1066             }
1067 
1068             if (c == u'>') {
1069                 // Finished the number.  Add the info to the expected break data,
1070                 //   and switch parse state back to doing plain data.
1071                 parseState = PARSE_DATA;
1072                 if (tagValue == 0) {
1073                     tagValue = -1;
1074                 }
1075                 int32_t  breakIdx = tp.dataToBreak.length();
1076                 if (tp.expectedBreaks->size() > breakIdx) {
1077                     errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
1078                           lineNum, column);
1079                 }
1080                 tp.expectedBreaks->setSize(breakIdx+1);
1081                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1082                 tp.srcLine->setSize(breakIdx+1);
1083                 tp.srcLine->setElementAt(lineNum, breakIdx);
1084                 tp.srcCol ->setSize(breakIdx+1);
1085                 tp.srcCol ->setElementAt(column, breakIdx);
1086                 break;
1087             }
1088 
1089             if (u_isdigit(c)) {
1090                 tagValue = tagValue*10 + u_charDigitValue(c);
1091                 break;
1092             }
1093 
1094             errln("Syntax Error in test file at line %d, col %d",
1095                 lineNum, column);
1096             parseState = PARSE_COMMENT;
1097             goto end_test; // Stop the test
1098             break;
1099         }
1100 
1101 
1102         if (U_FAILURE(status)) {
1103             dataerrln("ICU Error %s while parsing test file at line %d.",
1104                 u_errorName(status), lineNum);
1105             status = U_ZERO_ERROR;
1106             goto end_test; // Stop the test
1107         }
1108 
1109     }
1110 
1111     // Reached end of test file. Raise an error if parseState indicates that we are
1112     //   within a block that should have been terminated.
1113 
1114     if (parseState == PARSE_RULES) {
1115         errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
1116             lineNum, rulesFirstLine);
1117     }
1118     if (parseState == PARSE_DATA) {
1119         errln("rbbitst.txt:%d <data> block not closed.", lineNum);
1120     }
1121 
1122 
1123 end_test:
1124     delete [] testFile;
1125 #endif
1126 }
1127 
1128 
1129 //-------------------------------------------------------------------------------
1130 //
1131 //  TestDictRules   create a break iterator from source rules that includes a
1132 //                  dictionary range.   Regression for bug #7130.  Source rules
1133 //                  do not declare a break iterator type (word, line, sentence, etc.),
1134 //                  but the dictionary code, without a type, would loop.
1135 //
1136 //-------------------------------------------------------------------------------
TestDictRules()1137 void RBBITest::TestDictRules() {
1138     const char *rules =  "$dictionary = [a-z]; \n"
1139                          "!!forward; \n"
1140                          "$dictionary $dictionary; \n"
1141                          "!!reverse; \n"
1142                          "$dictionary $dictionary; \n";
1143     const char *text = "aa";
1144     UErrorCode status = U_ZERO_ERROR;
1145     UParseError parseError;
1146 
1147     RuleBasedBreakIterator bi(rules, parseError, status);
1148     if (U_SUCCESS(status)) {
1149         UnicodeString utext = text;
1150         bi.setText(utext);
1151         int32_t position;
1152         int32_t loops;
1153         for (loops = 0; loops<10; loops++) {
1154             position = bi.next();
1155             if (position == RuleBasedBreakIterator::DONE) {
1156                 break;
1157             }
1158         }
1159         TEST_ASSERT(loops == 1);
1160     } else {
1161         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1162     }
1163 }
1164 
1165 
1166 
1167 //-------------------------------------------------------------------------------
1168 //
1169 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
1170 //    return the data in one big UChar * buffer, which the caller must delete.
1171 //
1172 //    parameters:
1173 //          fileName:   the name of the file, with no directory part.  The test data directory
1174 //                      is assumed.
1175 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
1176 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
1177 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
1178 //                      Pass NULL for the system default encoding.
1179 //          status
1180 //    returns:
1181 //                      The file data, converted to UChar.
1182 //                      The caller must delete this when done with
1183 //                           delete [] theBuffer;
1184 //
1185 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
1186 //           Move this function to some common place.
1187 //
1188 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int & ulen,const char * encoding,UErrorCode & status)1189 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1190     UChar       *retPtr  = NULL;
1191     char        *fileBuf = NULL;
1192     UConverter* conv     = NULL;
1193     FILE        *f       = NULL;
1194 
1195     ulen = 0;
1196     if (U_FAILURE(status)) {
1197         return retPtr;
1198     }
1199 
1200     //
1201     //  Open the file.
1202     //
1203     f = fopen(fileName, "rb");
1204     if (f == 0) {
1205         dataerrln("Error opening test data file %s\n", fileName);
1206         status = U_FILE_ACCESS_ERROR;
1207         return NULL;
1208     }
1209     //
1210     //  Read it in
1211     //
1212     int   fileSize;
1213     int   amt_read;
1214 
1215     fseek( f, 0, SEEK_END);
1216     fileSize = ftell(f);
1217     fileBuf = new char[fileSize];
1218     fseek(f, 0, SEEK_SET);
1219     amt_read = static_cast<int>(fread(fileBuf, 1, fileSize, f));
1220     if (amt_read != fileSize || fileSize <= 0) {
1221         errln("Error reading test data file.");
1222         goto cleanUpAndReturn;
1223     }
1224 
1225     //
1226     // Look for a Unicode Signature (BOM) on the data just read
1227     //
1228     int32_t        signatureLength;
1229     const char *   fileBufC;
1230     const char*    bomEncoding;
1231 
1232     fileBufC = fileBuf;
1233     bomEncoding = ucnv_detectUnicodeSignature(
1234         fileBuf, fileSize, &signatureLength, &status);
1235     if(bomEncoding!=NULL ){
1236         fileBufC  += signatureLength;
1237         fileSize  -= signatureLength;
1238         encoding = bomEncoding;
1239     }
1240 
1241     //
1242     // Open a converter to take the rule file to UTF-16
1243     //
1244     conv = ucnv_open(encoding, &status);
1245     if (U_FAILURE(status)) {
1246         goto cleanUpAndReturn;
1247     }
1248 
1249     //
1250     // Convert the rules to UChar.
1251     //  Preflight first to determine required buffer size.
1252     //
1253     ulen = ucnv_toUChars(conv,
1254         NULL,           //  dest,
1255         0,              //  destCapacity,
1256         fileBufC,
1257         fileSize,
1258         &status);
1259     if (status == U_BUFFER_OVERFLOW_ERROR) {
1260         // Buffer Overflow is expected from the preflight operation.
1261         status = U_ZERO_ERROR;
1262 
1263         retPtr = new UChar[ulen+1];
1264         ucnv_toUChars(conv,
1265             retPtr,       //  dest,
1266             ulen+1,
1267             fileBufC,
1268             fileSize,
1269             &status);
1270     }
1271 
1272 cleanUpAndReturn:
1273     fclose(f);
1274     delete []fileBuf;
1275     ucnv_close(conv);
1276     if (U_FAILURE(status)) {
1277         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1278         delete []retPtr;
1279         retPtr = 0;
1280         ulen   = 0;
1281     }
1282     return retPtr;
1283 }
1284 
1285 
1286 
1287 //--------------------------------------------------------------------------------------------
1288 //
1289 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1290 //
1291 //-------------------------------------------------------------------------------------------
TestUnicodeFiles()1292 void RBBITest::TestUnicodeFiles() {
1293     RuleBasedBreakIterator  *bi;
1294     UErrorCode               status = U_ZERO_ERROR;
1295 
1296     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1297     TEST_ASSERT_SUCCESS(status);
1298     if (U_SUCCESS(status)) {
1299         runUnicodeTestData("GraphemeBreakTest.txt", bi);
1300     }
1301     delete bi;
1302 
1303     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1304     TEST_ASSERT_SUCCESS(status);
1305     if (U_SUCCESS(status)) {
1306         runUnicodeTestData("WordBreakTest.txt", bi);
1307     }
1308     delete bi;
1309 
1310     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1311     TEST_ASSERT_SUCCESS(status);
1312     if (U_SUCCESS(status)) {
1313         runUnicodeTestData("SentenceBreakTest.txt", bi);
1314     }
1315     delete bi;
1316 
1317     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1318     TEST_ASSERT_SUCCESS(status);
1319     if (U_SUCCESS(status)) {
1320         runUnicodeTestData("LineBreakTest.txt", bi);
1321     }
1322     delete bi;
1323 }
1324 
1325 
1326 // Check for test cases from the Unicode test data files that are known to fail
1327 // and should be skipped as known issues because ICU does not fully implement
1328 // the Unicode specifications, or because ICU includes tailorings that differ from
1329 // the Unicode standard.
1330 //
1331 // Test cases are identified by the test data sequence, which tends to be more stable
1332 // across Unicode versions than the test file line numbers.
1333 //
1334 // The test case with ticket "10666" is a dummy, included as an example.
1335 
testCaseIsKnownIssue(const UnicodeString & testCase,const char * fileName)1336 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1337     static struct TestCase {
1338         const char *fTicketNum;
1339         const char *fFileName;
1340         const UChar *fString;
1341     } badTestCases[] = {
1342         {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"},    // Fake example, for illustration.
1343         // The following tests were originally for
1344         // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
1345         // However, that ticket has been closed as fixed but these tests still fail, so
1346         // ICU-21097 has been created to investigate and address these remaining issues.
1347         {"21097",  "LineBreakTest.txt", u"-#"},
1348         {"21097",  "LineBreakTest.txt", u"\u002d\u0308\u0023"},
1349         {"21097",  "LineBreakTest.txt", u"\u002d\u00a7"},
1350         {"21097",  "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
1351         {"21097",  "LineBreakTest.txt", u"\u002d\U00050005"},
1352         {"21097",  "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
1353         {"21097",  "LineBreakTest.txt", u"\u002d\u0e01"},
1354         {"21097",  "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
1355 
1356         // The following tests were originally for
1357         // Issue ICU-12017 Improve line break around numbers.
1358         // However, that ticket has been closed as fixed but these tests still fail, so
1359         // ICU-21097 has been created to investigate and address these remaining issues.
1360         {"21097", "LineBreakTest.txt", u"\u002C\u0030"},   // ",0"
1361         {"21097", "LineBreakTest.txt", u"\u002C\u0308\u0030"},
1362         {"21097", "LineBreakTest.txt", u"equals .35 cents"},
1363         {"21097", "LineBreakTest.txt", u"a.2 "},
1364         {"21097", "LineBreakTest.txt", u"a.2 \u0915"},
1365         {"21097", "LineBreakTest.txt", u"a.2 \u672C"},
1366         {"21097", "LineBreakTest.txt", u"a.2\u3000\u672C"},
1367         {"21097", "LineBreakTest.txt", u"a.2\u3000\u307E"},
1368         {"21097", "LineBreakTest.txt", u"a.2\u3000\u0033"},
1369         {"21097", "LineBreakTest.txt", u"A.1 \uBABB"},
1370         {"21097", "LineBreakTest.txt", u"\uBD24\uC5B4\u002E\u0020\u0041\u002E\u0032\u0020\uBCFC"},
1371         {"21097", "LineBreakTest.txt", u"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"},
1372         {"21097", "LineBreakTest.txt", u"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"},
1373         {"21097", "LineBreakTest.txt", u"a.2\u3000\u300C"},
1374     };
1375 
1376     for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
1377         const TestCase &badCase = badTestCases[n];
1378         if (!strcmp(fileName, badCase.fFileName) &&
1379                 testCase == UnicodeString(badCase.fString)) {
1380             return logKnownIssue(badCase.fTicketNum);
1381         }
1382     }
1383     return FALSE;
1384 }
1385 
1386 
1387 //--------------------------------------------------------------------------------------------
1388 //
1389 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1390 //
1391 //-------------------------------------------------------------------------------------------
runUnicodeTestData(const char * fileName,RuleBasedBreakIterator * bi)1392 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1393 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1394     UErrorCode  status = U_ZERO_ERROR;
1395 
1396     //
1397     //  Open and read the test data file, put it into a UnicodeString.
1398     //
1399     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1400     char testFileName[1000];
1401     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1402         dataerrln("Can't open test data.  Path too long.");
1403         return;
1404     }
1405     strcpy(testFileName, testDataDirectory);
1406     strcat(testFileName, fileName);
1407 
1408     logln("Opening data file %s\n", fileName);
1409 
1410     int    len;
1411     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1412     if (status != U_FILE_ACCESS_ERROR) {
1413         TEST_ASSERT_SUCCESS(status);
1414         TEST_ASSERT(testFile != NULL);
1415     }
1416     if (U_FAILURE(status) || testFile == NULL) {
1417         return; /* something went wrong, error already output */
1418     }
1419     UnicodeString testFileAsString(TRUE, testFile, len);
1420 
1421     //
1422     //  Parse the test data file using a regular expression.
1423     //  Each kind of token is recognized in its own capture group; what type of item was scanned
1424     //     is identified by which group had a match.
1425     //
1426     //    Caputure Group #                  1          2            3            4           5
1427     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1428     //
1429     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1430     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1431     UnicodeString   testString;
1432     UVector32       breakPositions(status);
1433     int             lineNumber = 1;
1434     TEST_ASSERT_SUCCESS(status);
1435     if (U_FAILURE(status)) {
1436         return;
1437     }
1438 
1439     //
1440     //  Scan through each test case, building up the string to be broken in testString,
1441     //   and the positions that should be boundaries in the breakPositions vector.
1442     //
1443     int spin = 0;
1444     while (tokenMatcher.find()) {
1445         if(tokenMatcher.hitEnd()) {
1446           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
1447              This occurred when the text file was corrupt (wasn't marked as UTF-8)
1448              and caused an infinite loop here on EBCDIC systems!
1449           */
1450           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1451           //       return;
1452         }
1453         if (tokenMatcher.start(1, status) >= 0) {
1454             // Scanned a divide sign, indicating a break position in the test data.
1455             if (testString.length()>0) {
1456                 breakPositions.addElement(testString.length(), status);
1457             }
1458         }
1459         else if (tokenMatcher.start(2, status) >= 0) {
1460             // Scanned an 'x', meaning no break at this position in the test data
1461             //   Nothing to be done here.
1462             }
1463         else if (tokenMatcher.start(3, status) >= 0) {
1464             // Scanned Hex digits.  Convert them to binary, append to the character data string.
1465             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1466             int length = hexNumber.length();
1467             if (length<=8) {
1468                 char buf[10];
1469                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1470                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1471                 if (c<=0x10ffff) {
1472                     testString.append(c);
1473                 } else {
1474                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1475                        fileName, lineNumber);
1476                 }
1477             } else {
1478                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1479                        fileName, lineNumber);
1480              }
1481         }
1482         else if (tokenMatcher.start(4, status) >= 0) {
1483             // Scanned to end of a line, possibly skipping over a comment in the process.
1484             //   If the line from the file contained test data, run the test now.
1485             if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1486                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1487             }
1488 
1489             // Clear out this test case.
1490             //    The string and breakPositions vector will be refilled as the next
1491             //       test case is parsed.
1492             testString.remove();
1493             breakPositions.removeAllElements();
1494             lineNumber++;
1495         } else {
1496             // Scanner catchall.  Something unrecognized appeared on the line.
1497             char token[16];
1498             UnicodeString uToken = tokenMatcher.group(0, status);
1499             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1500             token[sizeof(token)-1] = 0;
1501             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1502 
1503             // Clean up, in preparation for continuing with the next line.
1504             testString.remove();
1505             breakPositions.removeAllElements();
1506             lineNumber++;
1507         }
1508         TEST_ASSERT_SUCCESS(status);
1509         if (U_FAILURE(status)) {
1510             break;
1511         }
1512     }
1513 
1514     delete [] testFile;
1515  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1516 }
1517 
1518 //--------------------------------------------------------------------------------------------
1519 //
1520 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1521 //                            test data files.  Do only a simple, forward-only check -
1522 //                            this test is mostly to check that ICU and the Unicode
1523 //                            data agree with each other.
1524 //
1525 //--------------------------------------------------------------------------------------------
checkUnicodeTestCase(const char * testFileName,int lineNumber,const UnicodeString & testString,UVector32 * breakPositions,RuleBasedBreakIterator * bi)1526 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1527                          const UnicodeString &testString,   // Text data to be broken
1528                          UVector32 *breakPositions,         // Positions where breaks should be found.
1529                          RuleBasedBreakIterator *bi) {
1530     int32_t pos;                 // Break Position in the test string
1531     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1532     int32_t expectedPos;         // Expected break position (index into test string)
1533 
1534     bi->setText(testString);
1535     pos = bi->first();
1536     pos = bi->next();
1537 
1538     while (pos != BreakIterator::DONE) {
1539         if (expectedI >= breakPositions->size()) {
1540             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1541                 testFileName, lineNumber, pos);
1542             break;
1543         }
1544         expectedPos = breakPositions->elementAti(expectedI);
1545         if (pos < expectedPos) {
1546             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1547                 testFileName, lineNumber, pos);
1548             break;
1549         }
1550         if (pos > expectedPos) {
1551             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1552                 testFileName, lineNumber, expectedPos);
1553             break;
1554         }
1555         pos = bi->next();
1556         expectedI++;
1557     }
1558 
1559     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1560         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1561             testFileName, lineNumber, breakPositions->elementAti(expectedI));
1562     }
1563 }
1564 
1565 
1566 
1567 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
//---------------------------------------------------------------------------------------
//
//   class RBBIMonkeyKind
//
//      Monkey Test for Break Iteration
//      Abstract interface class.   Concrete derived classes independently
//      implement the break rules for different iterator types.
//
//      The Monkey Test itself doesn't know which type of break iterator it is
//      testing, but works purely in terms of the interface defined here.
//
//---------------------------------------------------------------------------------------
1580 class RBBIMonkeyKind {
1581 public:
1582     // Return a UVector of UnicodeSets, representing the character classes used
1583     //   for this type of iterator.
1584     virtual  UVector  *charClasses() = 0;
1585 
1586     // Set the test text on which subsequent calls to next() will operate
1587     virtual  void      setText(const UnicodeString &s) = 0;
1588 
1589     // Find the next break postion, starting from the prev break position, or from zero.
1590     // Return -1 after reaching end of string.
1591     virtual  int32_t   next(int32_t i) = 0;
1592 
1593     // Name of each character class, parallel with charClasses. Used for debugging output
1594     // of characters.
1595     virtual  std::vector<std::string>&     characterClassNames();
1596 
1597     void setAppliedRule(int32_t position, const char* value);
1598 
1599     std::string getAppliedRule(int32_t position);
1600 
1601     virtual ~RBBIMonkeyKind();
1602     UErrorCode deferredStatus;
1603 
1604     std::string classNameFromCodepoint(const UChar32 c);
1605     unsigned int maxClassNameSize();
1606 
1607  protected:
1608      RBBIMonkeyKind();
1609      std::vector<std::string> classNames;
1610      std::vector<std::string> appliedRules;
1611 
1612     // Clear `appliedRules` and fill it with empty strings in the size of test text.
1613     void prepareAppliedRules(int32_t size );
1614 
1615  private:
1616 
1617 };
1618 
RBBIMonkeyKind()1619 RBBIMonkeyKind::RBBIMonkeyKind() {
1620     deferredStatus = U_ZERO_ERROR;
1621 }
1622 
~RBBIMonkeyKind()1623 RBBIMonkeyKind::~RBBIMonkeyKind() {
1624 }
1625 
characterClassNames()1626 std::vector<std::string>& RBBIMonkeyKind::characterClassNames() {
1627     return classNames;
1628 }
1629 
prepareAppliedRules(int32_t size)1630 void RBBIMonkeyKind::prepareAppliedRules(int32_t size) {
1631     // Remove all the information in the `appliedRules`.
1632     appliedRules.clear();
1633     appliedRules.resize(size + 1);
1634 }
1635 
setAppliedRule(int32_t position,const char * value)1636 void RBBIMonkeyKind::setAppliedRule(int32_t position, const char* value) {
1637     appliedRules[position] = value;
1638 }
1639 
getAppliedRule(int32_t position)1640 std::string RBBIMonkeyKind::getAppliedRule(int32_t position){
1641     return appliedRules[position];
1642 }
1643 
classNameFromCodepoint(const UChar32 c)1644 std::string RBBIMonkeyKind::classNameFromCodepoint(const UChar32 c) {
1645     // Simply iterate through charClasses to find character's class
1646     for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1647         UnicodeSet *classSet = (UnicodeSet *)charClasses()->elementAt(aClassNum);
1648         if (classSet->contains(c)) {
1649             return classNames[aClassNum];
1650         }
1651     }
1652     U_ASSERT(FALSE);  // This should not happen.
1653     return "bad class name";
1654 }
1655 
maxClassNameSize()1656 unsigned int RBBIMonkeyKind::maxClassNameSize() {
1657     unsigned int maxSize = 0;
1658     for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1659         auto aClassNumSize = static_cast<unsigned int>(classNames[aClassNum].size());
1660         if (aClassNumSize > maxSize) {
1661             maxSize = aClassNumSize;
1662         }
1663     }
1664     return maxSize;
1665 }
1666 
1667 //----------------------------------------------------------------------------------------
1668 //
1669 //   Random Numbers.  Similar to standard lib rand() and srand()
1670 //                    Not using library to
1671 //                      1.  Get same results on all platforms.
1672 //                      2.  Get access to current seed, to more easily reproduce failures.
1673 //
1674 //---------------------------------------------------------------------------------------
// Current generator state.  Assign to it to replay a particular sequence.
static uint32_t m_seed = 1;

// Advance the linear congruential generator one step and return a value in
// the range 0..32767, taken from bits 16..30 of the new state.
static uint32_t m_rand()
{
    const uint32_t nextState = m_seed * 1103515245 + 12345;   // wraps modulo 2^32
    m_seed = nextState;
    return (nextState >> 16) & 0x7fff;
}
1682 
1683 
1684 //------------------------------------------------------------------------------------------
1685 //
1686 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
1687 //                             of RBBIMonkeyKind.
1688 //
1689 //------------------------------------------------------------------------------------------
1690 class RBBICharMonkey: public RBBIMonkeyKind {
1691 public:
1692     RBBICharMonkey();
1693     virtual          ~RBBICharMonkey();
1694     virtual  UVector *charClasses();
1695     virtual  void     setText(const UnicodeString &s);
1696     virtual  int32_t  next(int32_t i);
1697 private:
1698     UVector   *fSets;
1699 
1700     UnicodeSet  *fCRLFSet;
1701     UnicodeSet  *fControlSet;
1702     UnicodeSet  *fExtendSet;
1703     UnicodeSet  *fZWJSet;
1704     UnicodeSet  *fRegionalIndicatorSet;
1705     UnicodeSet  *fPrependSet;
1706     UnicodeSet  *fSpacingSet;
1707     UnicodeSet  *fLSet;
1708     UnicodeSet  *fVSet;
1709     UnicodeSet  *fTSet;
1710     UnicodeSet  *fLVSet;
1711     UnicodeSet  *fLVTSet;
1712     UnicodeSet  *fHangulSet;
1713     UnicodeSet  *fExtendedPictSet;
1714     UnicodeSet  *fViramaSet;
1715     UnicodeSet  *fLinkingConsonantSet;
1716     UnicodeSet  *fExtCccZwjSet;
1717     UnicodeSet  *fAnySet;
1718 
1719     const UnicodeString *fText;
1720 };
1721 
1722 
RBBICharMonkey()1723 RBBICharMonkey::RBBICharMonkey() {
1724     UErrorCode  status = U_ZERO_ERROR;
1725 
1726     fText = NULL;
1727 
1728     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1729     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
1730     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
1731     fZWJSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
1732     fRegionalIndicatorSet =
1733                   new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1734     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1735     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1736     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1737     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1738     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1739     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1740     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1741     fHangulSet  = new UnicodeSet();
1742     fHangulSet->addAll(*fLSet);
1743     fHangulSet->addAll(*fVSet);
1744     fHangulSet->addAll(*fTSet);
1745     fHangulSet->addAll(*fLVSet);
1746     fHangulSet->addAll(*fLVTSet);
1747 
1748     fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1749     fViramaSet        = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1750                                         "\\p{Indic_Syllabic_Category=Virama}]", status);
1751     fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1752                                         "\\p{Indic_Syllabic_Category=Consonant}]", status);
1753     fExtCccZwjSet     = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status);
1754     fAnySet           = new UnicodeSet(0, 0x10ffff);
1755 
1756     // Create sets of characters, and add the names of the above character sets.
1757     // In each new ICU release, add new names corresponding to the sets above.
1758     fSets             = new UVector(status);
1759 
1760     // Important: Keep class names the same as the class contents.
1761     fSets->addElement(fCRLFSet, status); classNames.push_back("CRLF");
1762     fSets->addElement(fControlSet, status); classNames.push_back("Control");
1763     fSets->addElement(fExtendSet, status); classNames.push_back("Extended");
1764     fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
1765     if (!fPrependSet->isEmpty()) {
1766         fSets->addElement(fPrependSet, status); classNames.push_back("Prepend");
1767     }
1768     fSets->addElement(fSpacingSet, status); classNames.push_back("Spacing");
1769     fSets->addElement(fHangulSet, status); classNames.push_back("Hangul");
1770     fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
1771     fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
1772     fSets->addElement(fViramaSet, status); classNames.push_back("Virama");
1773     fSets->addElement(fLinkingConsonantSet, status); classNames.push_back("LinkingConsonant");
1774     fSets->addElement(fExtCccZwjSet, status); classNames.push_back("ExtCcccZwj");
1775     fSets->addElement(fAnySet, status); classNames.push_back("Any");
1776 
1777     if (U_FAILURE(status)) {
1778         deferredStatus = status;
1779     }
1780 }
1781 
1782 
setText(const UnicodeString & s)1783 void RBBICharMonkey::setText(const UnicodeString &s) {
1784     fText = &s;
1785     prepareAppliedRules(s.length());
1786 }
1787 
1788 
1789 
next(int32_t prevPos)1790 int32_t RBBICharMonkey::next(int32_t prevPos) {
1791     int    p0, p1, p2, p3;    // Indices of the significant code points around the
1792                               //   break position being tested.  The candidate break
1793                               //   location is before p2.
1794 
1795     int     breakPos = -1;
1796 
1797     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
1798     UChar32 cBase;            // for (X Extend*) patterns, the X character.
1799 
1800     if (U_FAILURE(deferredStatus)) {
1801         return -1;
1802     }
1803 
1804     // Previous break at end of string.  return DONE.
1805     if (prevPos >= fText->length()) {
1806         return -1;
1807     }
1808 
1809     p0 = p1 = p2 = p3 = prevPos;
1810     c3 =  fText->char32At(prevPos);
1811     c0 = c1 = c2 = cBase = 0;
1812     (void)p0;   // suppress set but not used warning.
1813     (void)c0;
1814 
1815     // Loop runs once per "significant" character position in the input text.
1816     for (;;) {
1817         // Move all of the positions forward in the input string.
1818         p0 = p1;  c0 = c1;
1819         p1 = p2;  c1 = c2;
1820         p2 = p3;  c2 = c3;
1821 
1822         // Advance p3 by one codepoint
1823         p3 = fText->moveIndex32(p3, 1);
1824         c3 = fText->char32At(p3);
1825 
1826         if (p1 == p2) {
1827             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
1828             continue;
1829         }
1830 
1831         if (p2 == fText->length()) {
1832             setAppliedRule(p2, "End of String");
1833             break;
1834         }
1835 
1836         //     No Extend or Format characters may appear between the CR and LF,
1837         //     which requires the additional check for p2 immediately following p1.
1838         //
1839         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
1840           setAppliedRule(p2, "GB3   CR x LF");
1841           continue;
1842         }
1843 
1844         if (fControlSet->contains(c1) ||
1845             c1 == 0x0D ||
1846             c1 == 0x0A)  {
1847           setAppliedRule(p2, "GB4   ( Control | CR | LF ) <break>");
1848           break;
1849         }
1850 
1851         if (fControlSet->contains(c2) ||
1852             c2 == 0x0D ||
1853             c2 == 0x0A)  {
1854             setAppliedRule(p2, "GB5   <break>  ( Control | CR | LF )");
1855             break;
1856         }
1857 
1858         if (fLSet->contains(c1) &&
1859                (fLSet->contains(c2)  ||
1860                 fVSet->contains(c2)  ||
1861                 fLVSet->contains(c2) ||
1862                 fLVTSet->contains(c2))) {
1863             setAppliedRule(p2, "GB6   L x ( L | V | LV | LVT )");
1864             continue;
1865         }
1866 
1867         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1868             (fVSet->contains(c2) || fTSet->contains(c2)))  {
1869             setAppliedRule(p2, "GB7    ( LV | V )  x  ( V | T )");
1870             continue;
1871         }
1872 
1873         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1874             fTSet->contains(c2))  {
1875             setAppliedRule(p2, "GB8   ( LVT | T)  x T");
1876             continue;
1877         }
1878 
1879         if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
1880             if (!fExtendSet->contains(c1)) {
1881                 cBase = c1;
1882             }
1883             setAppliedRule(p2, "GB9   x (Extend | ZWJ)");
1884             continue;
1885         }
1886 
1887         if (fSpacingSet->contains(c2)) {
1888             setAppliedRule(p2, "GB9a  x  SpacingMark");
1889             continue;
1890         }
1891 
1892         if (fPrependSet->contains(c1)) {
1893             setAppliedRule(p2, "GB9b  Prepend x");
1894             continue;
1895         }
1896 
1897         //   Note: Viramas are also included in the ExtCccZwj class.
1898         if (fLinkingConsonantSet->contains(c2)) {
1899             int pi = p1;
1900             bool sawVirama = false;
1901             while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) {
1902                 if (fViramaSet->contains(fText->char32At(pi))) {
1903                     sawVirama = true;
1904                 }
1905                 pi = fText->moveIndex32(pi, -1);
1906             }
1907             if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) {
1908               setAppliedRule(p2, "GB9.3  LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant");
1909               continue;
1910             }
1911         }
1912 
1913         if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
1914           setAppliedRule(p2, "GB11  Extended_Pictographic Extend * ZWJ x Extended_Pictographic");
1915           continue;
1916         }
1917 
1918         //                   Note: The first if condition is a little tricky. We only need to force
1919         //                      a break if there are three or more contiguous RIs. If there are
1920         //                      only two, a break following will occur via other rules, and will include
1921         //                      any trailing extend characters, which is needed behavior.
1922         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
1923                 && fRegionalIndicatorSet->contains(c2)) {
1924           setAppliedRule(p2, "GB12-13  Regional_Indicator x Regional_Indicator");
1925           break;
1926         }
1927         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
1928           setAppliedRule(p2, "GB12-13  Regional_Indicator x Regional_Indicator");
1929           continue;
1930         }
1931 
1932         setAppliedRule(p2, "GB999 Any <break> Any");
1933         break;
1934     }
1935 
1936     breakPos = p2;
1937     return breakPos;
1938 }
1939 
1940 
1941 
charClasses()1942 UVector  *RBBICharMonkey::charClasses() {
1943     return fSets;
1944 }
1945 
~RBBICharMonkey()1946 RBBICharMonkey::~RBBICharMonkey() {
1947     delete fSets;
1948     delete fCRLFSet;
1949     delete fControlSet;
1950     delete fExtendSet;
1951     delete fRegionalIndicatorSet;
1952     delete fPrependSet;
1953     delete fSpacingSet;
1954     delete fLSet;
1955     delete fVSet;
1956     delete fTSet;
1957     delete fLVSet;
1958     delete fLVTSet;
1959     delete fHangulSet;
1960     delete fAnySet;
1961     delete fZWJSet;
1962     delete fExtendedPictSet;
1963     delete fViramaSet;
1964     delete fLinkingConsonantSet;
1965     delete fExtCccZwjSet;
1966 }
1967 
1968 //------------------------------------------------------------------------------------------
1969 //
1970 //   class RBBIWordMonkey      Word Break specific implementation
1971 //                             of RBBIMonkeyKind.
1972 //
1973 //------------------------------------------------------------------------------------------
1974 class RBBIWordMonkey: public RBBIMonkeyKind {
1975 public:
1976     RBBIWordMonkey();
1977     virtual          ~RBBIWordMonkey();
1978     virtual  UVector *charClasses();
1979     virtual  void     setText(const UnicodeString &s);
1980     virtual int32_t   next(int32_t i);
1981 private:
1982     UVector      *fSets;
1983 
1984     UnicodeSet  *fCRSet;
1985     UnicodeSet  *fLFSet;
1986     UnicodeSet  *fNewlineSet;
1987     UnicodeSet  *fRegionalIndicatorSet;
1988     UnicodeSet  *fKatakanaSet;
1989     UnicodeSet  *fHebrew_LetterSet;
1990     UnicodeSet  *fALetterSet;
1991     UnicodeSet  *fSingle_QuoteSet;
1992     UnicodeSet  *fDouble_QuoteSet;
1993     UnicodeSet  *fMidNumLetSet;
1994     UnicodeSet  *fMidLetterSet;
1995     UnicodeSet  *fMidNumSet;
1996     UnicodeSet  *fNumericSet;
1997     UnicodeSet  *fFormatSet;
1998     UnicodeSet  *fOtherSet;
1999     UnicodeSet  *fExtendSet;
2000     UnicodeSet  *fExtendNumLetSet;
2001     UnicodeSet  *fWSegSpaceSet;
2002     UnicodeSet  *fDictionarySet;
2003     UnicodeSet  *fZWJSet;
2004     UnicodeSet  *fExtendedPictSet;
2005 
2006     const UnicodeString  *fText;
2007 };
2008 
2009 
RBBIWordMonkey()2010 RBBIWordMonkey::RBBIWordMonkey()
2011 {
2012     UErrorCode  status = U_ZERO_ERROR;
2013 
2014     fSets            = new UVector(status);
2015 
2016     fCRSet            = new UnicodeSet(u"[\\p{Word_Break = CR}]",           status);
2017     fLFSet            = new UnicodeSet(u"[\\p{Word_Break = LF}]",           status);
2018     fNewlineSet       = new UnicodeSet(u"[\\p{Word_Break = Newline}]",      status);
2019     fKatakanaSet      = new UnicodeSet(u"[\\p{Word_Break = Katakana}]",     status);
2020     fRegionalIndicatorSet =  new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
2021     fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
2022     fALetterSet       = new UnicodeSet(u"[\\p{Word_Break = ALetter}]", status);
2023     fSingle_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]",    status);
2024     fDouble_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]",    status);
2025     fMidNumLetSet     = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]",    status);
2026     fMidLetterSet     = new UnicodeSet(u"[\\p{Word_Break = MidLetter}]",    status);
2027     fMidNumSet        = new UnicodeSet(u"[\\p{Word_Break = MidNum}]",       status);
2028     fNumericSet       = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
2029     fFormatSet        = new UnicodeSet(u"[\\p{Word_Break = Format}]",       status);
2030     fExtendNumLetSet  = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
2031     // There are some sc=Hani characters with WB=Extend.
2032     // The break rules need to pick one or the other because
2033     // Extend overlapping with something else is messy.
2034     // For Unicode 13, we chose to keep U+16FF0 & U+16FF1
2035     // in $Han (for $dictionary) and out of $Extend.
2036     fExtendSet        = new UnicodeSet(u"[\\p{Word_Break = Extend}-[:Hani:]]", status);
2037     fWSegSpaceSet     = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]",    status);
2038 
2039     fZWJSet           = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]",          status);
2040     fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);
2041 
2042     fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
2043     fDictionarySet->addAll(*fKatakanaSet);
2044     fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
2045 
2046     fALetterSet->removeAll(*fDictionarySet);
2047 
2048     fOtherSet        = new UnicodeSet();
2049     if(U_FAILURE(status)) {
2050         IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
2051         deferredStatus = status;
2052         return;
2053     }
2054 
2055     fOtherSet->complement();
2056     fOtherSet->removeAll(*fCRSet);
2057     fOtherSet->removeAll(*fLFSet);
2058     fOtherSet->removeAll(*fNewlineSet);
2059     fOtherSet->removeAll(*fKatakanaSet);
2060     fOtherSet->removeAll(*fHebrew_LetterSet);
2061     fOtherSet->removeAll(*fALetterSet);
2062     fOtherSet->removeAll(*fSingle_QuoteSet);
2063     fOtherSet->removeAll(*fDouble_QuoteSet);
2064     fOtherSet->removeAll(*fMidLetterSet);
2065     fOtherSet->removeAll(*fMidNumSet);
2066     fOtherSet->removeAll(*fNumericSet);
2067     fOtherSet->removeAll(*fExtendNumLetSet);
2068     fOtherSet->removeAll(*fWSegSpaceSet);
2069     fOtherSet->removeAll(*fFormatSet);
2070     fOtherSet->removeAll(*fExtendSet);
2071     fOtherSet->removeAll(*fRegionalIndicatorSet);
2072     fOtherSet->removeAll(*fZWJSet);
2073     fOtherSet->removeAll(*fExtendedPictSet);
2074 
2075     // Inhibit dictionary characters from being tested at all.
2076     fOtherSet->removeAll(*fDictionarySet);
2077 
2078     // Add classes and their names
2079     fSets->addElement(fCRSet, status); classNames.push_back("CR");
2080     fSets->addElement(fLFSet, status); classNames.push_back("LF");
2081     fSets->addElement(fNewlineSet, status); classNames.push_back("Newline");
2082     fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
2083     fSets->addElement(fHebrew_LetterSet, status); classNames.push_back("Hebrew");
2084     fSets->addElement(fALetterSet, status); classNames.push_back("ALetter");
2085     fSets->addElement(fSingle_QuoteSet, status); classNames.push_back("Single Quote");
2086     fSets->addElement(fDouble_QuoteSet, status); classNames.push_back("Double Quote");
2087     // Omit Katakana from fSets, which omits Katakana characters
2088     // from the test data. They are all in the dictionary set,
2089     // which this (old, to be retired) monkey test cannot handle.
2090     //fSets->addElement(fKatakanaSet, status);
2091 
2092     fSets->addElement(fMidLetterSet, status); classNames.push_back("MidLetter");
2093     fSets->addElement(fMidNumLetSet, status); classNames.push_back("MidNumLet");
2094     fSets->addElement(fMidNumSet, status); classNames.push_back("MidNum");
2095     fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
2096     fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2097     fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2098     fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2099     fSets->addElement(fExtendNumLetSet, status); classNames.push_back("ExtendNumLet");
2100     fSets->addElement(fWSegSpaceSet, status); classNames.push_back("WSegSpace");
2101 
2102     fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
2103     fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
2104 
2105     if (U_FAILURE(status)) {
2106         deferredStatus = status;
2107     }
2108 }
2109 
setText(const UnicodeString & s)2110 void RBBIWordMonkey::setText(const UnicodeString &s) {
2111     fText       = &s;
2112     prepareAppliedRules(s.length());
2113 }
2114 
2115 
next(int32_t prevPos)2116 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2117     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2118                               //   break position being tested.  The candidate break
2119                               //   location is before p2.
2120 
2121     int     breakPos = -1;
2122 
2123     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2124 
2125     if (U_FAILURE(deferredStatus)) {
2126         return -1;
2127     }
2128 
2129     // Prev break at end of string.  return DONE.
2130     if (prevPos >= fText->length()) {
2131         return -1;
2132     }
2133     p0 = p1 = p2 = p3 = prevPos;
2134     c3 =  fText->char32At(prevPos);
2135     c0 = c1 = c2 = 0;
2136     (void)p0;       // Suppress set but not used warning.
2137 
2138     // Loop runs once per "significant" character position in the input text.
2139     for (;;) {
2140         // Move all of the positions forward in the input string.
2141         p0 = p1;  c0 = c1;
2142         p1 = p2;  c1 = c2;
2143         p2 = p3;  c2 = c3;
2144 
2145         // Advance p3 by    X(Extend | Format)*   Rule 4
2146         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2147         do {
2148             p3 = fText->moveIndex32(p3, 1);
2149             c3 = fText->char32At(p3);
2150             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2151                break;
2152             }
2153         }
2154         while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
2155 
2156 
2157         if (p1 == p2) {
2158             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2159             continue;
2160         }
2161 
2162         if (p2 == fText->length()) {
2163             // Reached end of string.  Always a break position.
2164             break;
2165         }
2166 
2167         //     No Extend or Format characters may appear between the CR and LF,
2168         //     which requires the additional check for p2 immediately following p1.
2169         //
2170         if (c1==0x0D && c2==0x0A) {
2171           setAppliedRule(p2, "WB3   CR x LF");
2172           continue;
2173         }
2174 
2175         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2176             setAppliedRule(p2, "WB3a  Break before and after newlines (including CR and LF)");
2177             break;
2178         }
2179         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2180             setAppliedRule(p2, "WB3a  Break before and after newlines (including CR and LF)");
2181             break;
2182         }
2183 
2184         //              Not ignoring extend chars, so peek into input text to
2185         //              get the potential ZWJ, the character immediately preceding c2.
2186         //              Sloppy UChar32 indexing: p2-1 may reference trail half
2187         //              but char32At will get the full code point.
2188         if (fZWJSet->contains(fText->char32At(p2 - 1)) && fExtendedPictSet->contains(c2)){
2189             setAppliedRule(p2, "WB3c  ZWJ x Extended_Pictographic");
2190             continue;
2191         }
2192 
2193         if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
2194             setAppliedRule(p2, "WB3d  Keep horizontal whitespace together.");
2195             continue;
2196         }
2197 
2198         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2199             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2200             setAppliedRule(p2, "WB4   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)");
2201             continue;
2202         }
2203 
2204         if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
2205              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2206              (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2207             setAppliedRule(p2,
2208                            "WB6   (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter _Letter)");
2209             continue;
2210         }
2211 
2212         if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2213             (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2214             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2215             setAppliedRule(p2,
2216                            "WB7   (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)");
2217             continue;
2218         }
2219 
2220         if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2221             setAppliedRule(p2, "WB7a  Hebrew_Letter x Single_Quote");
2222             continue;
2223         }
2224 
2225           if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2226             setAppliedRule(p2, "WB7b  Hebrew_Letter x Double_Quote Hebrew_Letter");
2227             continue;
2228         }
2229 
2230         if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2231             setAppliedRule(p2, "WB7c  Hebrew_Letter Double_Quote x Hebrew_Letter");
2232             continue;
2233         }
2234 
2235         if (fNumericSet->contains(c1) &&
2236             fNumericSet->contains(c2)) {
2237             setAppliedRule(p2, "WB8   Numeric x Numeric");
2238             continue;
2239         }
2240 
2241         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2242             fNumericSet->contains(c2)) {
2243             setAppliedRule(p2, "WB9   (ALetter | Hebrew_Letter) x Numeric");
2244             continue;
2245         }
2246 
2247         if (fNumericSet->contains(c1) &&
2248             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2249             setAppliedRule(p2, "WB10   Numeric x (ALetter | Hebrew_Letter)");
2250             continue;
2251         }
2252 
2253           if (fNumericSet->contains(c0) &&
2254             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
2255             fNumericSet->contains(c2)) {
2256             setAppliedRule(p2, "WB11  Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric");
2257             continue;
2258         }
2259 
2260         if (fNumericSet->contains(c1) &&
2261             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
2262             fNumericSet->contains(c3)) {
2263             setAppliedRule(p2, "WB12  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric");
2264             continue;
2265         }
2266 
2267         //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
2268         //                  all Katakana are handled by the dictionary breaker.
2269         if (fKatakanaSet->contains(c1) &&
2270             fKatakanaSet->contains(c2))  {
2271             setAppliedRule(p2, "WB13  Katakana x Katakana");
2272             continue;
2273         }
2274 
2275         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2276              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2277              fExtendNumLetSet->contains(c2)) {
2278             setAppliedRule(p2,
2279                            "WB13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet");
2280             continue;
2281         }
2282 
2283         if (fExtendNumLetSet->contains(c1) &&
2284                 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2285                  fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
2286             setAppliedRule(p2, "WB13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)");
2287             continue;
2288         }
2289 
2290         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2291             setAppliedRule(p2, "WB15 - WB17   Group pairs of Regional Indicators.");
2292             break;
2293         }
2294         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2295             setAppliedRule(p2, "WB15 - WB17   Group pairs of Regional Indicators.");
2296             continue;
2297         }
2298 
2299         setAppliedRule(p2, "WB999");
2300         break;
2301     }
2302 
2303     breakPos = p2;
2304     return breakPos;
2305 }
2306 
2307 
charClasses()2308 UVector  *RBBIWordMonkey::charClasses() {
2309     return fSets;
2310 }
2311 
~RBBIWordMonkey()2312 RBBIWordMonkey::~RBBIWordMonkey() {
2313     delete fSets;
2314     delete fCRSet;
2315     delete fLFSet;
2316     delete fNewlineSet;
2317     delete fKatakanaSet;
2318     delete fHebrew_LetterSet;
2319     delete fALetterSet;
2320     delete fSingle_QuoteSet;
2321     delete fDouble_QuoteSet;
2322     delete fMidNumLetSet;
2323     delete fMidLetterSet;
2324     delete fMidNumSet;
2325     delete fNumericSet;
2326     delete fFormatSet;
2327     delete fExtendSet;
2328     delete fExtendNumLetSet;
2329     delete fWSegSpaceSet;
2330     delete fRegionalIndicatorSet;
2331     delete fDictionarySet;
2332     delete fOtherSet;
2333     delete fZWJSet;
2334     delete fExtendedPictSet;
2335 }
2336 
2337 
2338 
2339 
2340 //------------------------------------------------------------------------------------------
2341 //
2342 //   class RBBISentMonkey      Sentence Break specific implementation
2343 //                             of RBBIMonkeyKind.
2344 //
2345 //------------------------------------------------------------------------------------------
2346 class RBBISentMonkey: public RBBIMonkeyKind {
2347 public:
2348     RBBISentMonkey();
2349     virtual          ~RBBISentMonkey();
2350     virtual  UVector *charClasses();
2351     virtual  void     setText(const UnicodeString &s);
2352     virtual int32_t   next(int32_t i);
2353 private:
2354     int               moveBack(int posFrom);
2355     int               moveForward(int posFrom);
2356     UChar32           cAt(int pos);
2357 
2358     UVector      *fSets;
2359 
2360     UnicodeSet  *fSepSet;
2361     UnicodeSet  *fFormatSet;
2362     UnicodeSet  *fSpSet;
2363     UnicodeSet  *fLowerSet;
2364     UnicodeSet  *fUpperSet;
2365     UnicodeSet  *fOLetterSet;
2366     UnicodeSet  *fNumericSet;
2367     UnicodeSet  *fATermSet;
2368     UnicodeSet  *fSContinueSet;
2369     UnicodeSet  *fSTermSet;
2370     UnicodeSet  *fCloseSet;
2371     UnicodeSet  *fOtherSet;
2372     UnicodeSet  *fExtendSet;
2373 
2374     const UnicodeString  *fText;
2375 };
2376 
RBBISentMonkey()2377 RBBISentMonkey::RBBISentMonkey()
2378 {
2379     UErrorCode  status = U_ZERO_ERROR;
2380 
2381     fSets            = new UVector(status);
2382 
2383     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2384     //                       set and made into character classes of their own.  For the monkey impl,
2385     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2386     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2387     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2388     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2389     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2390     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2391     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2392     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2393     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2394     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2395     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2396     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2397     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2398     fOtherSet        = new UnicodeSet();
2399 
2400     if(U_FAILURE(status)) {
2401       deferredStatus = status;
2402       return;
2403     }
2404 
2405     fOtherSet->complement();
2406     fOtherSet->removeAll(*fSepSet);
2407     fOtherSet->removeAll(*fFormatSet);
2408     fOtherSet->removeAll(*fSpSet);
2409     fOtherSet->removeAll(*fLowerSet);
2410     fOtherSet->removeAll(*fUpperSet);
2411     fOtherSet->removeAll(*fOLetterSet);
2412     fOtherSet->removeAll(*fNumericSet);
2413     fOtherSet->removeAll(*fATermSet);
2414     fOtherSet->removeAll(*fSContinueSet);
2415     fOtherSet->removeAll(*fSTermSet);
2416     fOtherSet->removeAll(*fCloseSet);
2417     fOtherSet->removeAll(*fExtendSet);
2418 
2419     fSets->addElement(fSepSet, status); classNames.push_back("Sep");
2420     fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2421     fSets->addElement(fSpSet, status); classNames.push_back("Sp");
2422     fSets->addElement(fLowerSet, status); classNames.push_back("Lower");
2423     fSets->addElement(fUpperSet, status); classNames.push_back("Upper");
2424     fSets->addElement(fOLetterSet, status); classNames.push_back("OLetter");
2425     fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
2426     fSets->addElement(fATermSet, status); classNames.push_back("ATerm");
2427     fSets->addElement(fSContinueSet, status); classNames.push_back("SContinue");
2428     fSets->addElement(fSTermSet, status); classNames.push_back("STerm");
2429     fSets->addElement(fCloseSet, status); classNames.push_back("Close");
2430     fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2431     fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2432 
2433     if (U_FAILURE(status)) {
2434         deferredStatus = status;
2435     }
2436 }
2437 
2438 
2439 
setText(const UnicodeString & s)2440 void RBBISentMonkey::setText(const UnicodeString &s) {
2441     fText       = &s;
2442     prepareAppliedRules(s.length());
2443 }
2444 
charClasses()2445 UVector  *RBBISentMonkey::charClasses() {
2446     return fSets;
2447 }
2448 
2449 //  moveBack()   Find the "significant" code point preceding the index i.
2450 //               Skips over ($Extend | $Format)* .
2451 //
moveBack(int i)2452 int RBBISentMonkey::moveBack(int i) {
2453     if (i <= 0) {
2454         return -1;
2455     }
2456     UChar32   c;
2457     int32_t   j = i;
2458     do {
2459         j = fText->moveIndex32(j, -1);
2460         c = fText->char32At(j);
2461     }
2462     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2463     return j;
2464 
2465  }
2466 
2467 
moveForward(int i)2468 int RBBISentMonkey::moveForward(int i) {
2469     if (i>=fText->length()) {
2470         return fText->length();
2471     }
2472     UChar32   c;
2473     int32_t   j = i;
2474     do {
2475         j = fText->moveIndex32(j, 1);
2476         c = cAt(j);
2477     }
2478     while (fFormatSet->contains(c) || fExtendSet->contains(c));
2479     return j;
2480 }
2481 
cAt(int pos)2482 UChar32 RBBISentMonkey::cAt(int pos) {
2483     if (pos<0 || pos>=fText->length()) {
2484         return -1;
2485     } else {
2486         return fText->char32At(pos);
2487     }
2488 }
2489 
next(int32_t prevPos)2490 int32_t RBBISentMonkey::next(int32_t prevPos) {
2491     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2492                               //   break position being tested.  The candidate break
2493                               //   location is before p2.
2494 
2495     int     breakPos = -1;
2496 
2497     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2498     UChar32 c;
2499 
2500     if (U_FAILURE(deferredStatus)) {
2501         return -1;
2502     }
2503 
2504     // Prev break at end of string.  return DONE.
2505     if (prevPos >= fText->length()) {
2506         return -1;
2507     }
2508     p0 = p1 = p2 = p3 = prevPos;
2509     c3 =  fText->char32At(prevPos);
2510     c0 = c1 = c2 = 0;
2511     (void)p0;     // Suppress set but not used warning.
2512 
2513     // Loop runs once per "significant" character position in the input text.
2514     for (;;) {
2515         // Move all of the positions forward in the input string.
2516         p0 = p1;  c0 = c1;
2517         p1 = p2;  c1 = c2;
2518         p2 = p3;  c2 = c3;
2519 
2520         // Advance p3 by    X(Extend | Format)*   Rule 4
2521         p3 = moveForward(p3);
2522         c3 = cAt(p3);
2523 
2524         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2525             setAppliedRule(p2, "SB3   CR x LF");
2526             continue;
2527         }
2528 
2529         if (fSepSet->contains(c1)) {
2530             p2 = p1+1;   // Separators don't combine with Extend or Format.
2531 
2532             setAppliedRule(p2, "SB4   Sep  <break>");
2533             break;
2534         }
2535 
2536         if (p2 >= fText->length()) {
2537             // Reached end of string.  Always a break position.
2538             setAppliedRule(p2, "SB4   Sep  <break>");
2539             break;
2540         }
2541 
2542         if (p2 == prevPos) {
2543             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2544             setAppliedRule(p2, "SB4   Sep  <break>");
2545             continue;
2546         }
2547 
2548         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
2549             setAppliedRule(p2, "SB6   ATerm x Numeric");
2550             continue;
2551         }
2552 
2553           if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2554                 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2555             setAppliedRule(p2, "SB7   (Upper | Lower) ATerm  x  Uppper");
2556             continue;
2557         }
2558 
2559         //           Note:  STerm | ATerm are added to the negated part of the expression by a
2560         //                  note to the Unicode 5.0 documents.
2561         int p8 = p1;
2562         while (fSpSet->contains(cAt(p8))) {
2563             p8 = moveBack(p8);
2564         }
2565         while (fCloseSet->contains(cAt(p8))) {
2566             p8 = moveBack(p8);
2567         }
2568         if (fATermSet->contains(cAt(p8))) {
2569             p8=p2;
2570             for (;;) {
2571                 c = cAt(p8);
2572                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2573                     fLowerSet->contains(c) || fSepSet->contains(c) ||
2574                     fATermSet->contains(c) || fSTermSet->contains(c))  {
2575 
2576                     setAppliedRule(p2,
2577                                    "SB8   ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2578                     break;
2579                 }
2580                 p8 = moveForward(p8);
2581             }
2582             if (fLowerSet->contains(cAt(p8))) {
2583 
2584                 setAppliedRule(p2,
2585                                "SB8   ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2586                 continue;
2587             }
2588         }
2589 
2590         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2591             p8 = p1;
2592             while (fSpSet->contains(cAt(p8))) {
2593                 p8 = moveBack(p8);
2594             }
2595             while (fCloseSet->contains(cAt(p8))) {
2596                 p8 = moveBack(p8);
2597             }
2598             c = cAt(p8);
2599             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2600                 setAppliedRule(p2, "SB8a  (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm)");
2601                 continue;
2602             }
2603         }
2604 
2605         int p9 = p1;
2606         while (fCloseSet->contains(cAt(p9))) {
2607             p9 = moveBack(p9);
2608         }
2609         c = cAt(p9);
2610         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2611             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2612 
2613                 setAppliedRule(p2, "SB9  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)");
2614                 continue;
2615             }
2616         }
2617 
2618         int p10 = p1;
2619         while (fSpSet->contains(cAt(p10))) {
2620             p10 = moveBack(p10);
2621         }
2622         while (fCloseSet->contains(cAt(p10))) {
2623             p10 = moveBack(p10);
2624         }
2625         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2626             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2627                 setAppliedRule(p2, "SB10  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)");
2628                 continue;
2629             }
2630         }
2631 
2632         int p11 = p1;
2633         if (fSepSet->contains(cAt(p11))) {
2634             p11 = moveBack(p11);
2635         }
2636         while (fSpSet->contains(cAt(p11))) {
2637             p11 = moveBack(p11);
2638         }
2639         while (fCloseSet->contains(cAt(p11))) {
2640             p11 = moveBack(p11);
2641         }
2642         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2643           setAppliedRule(p2, "SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>");
2644             break;
2645         }
2646 
2647         setAppliedRule(p2, "SB12  Any x Any");
2648         continue;
2649     }
2650 
2651     breakPos = p2;
2652     return breakPos;
2653 }
2654 
~RBBISentMonkey()2655 RBBISentMonkey::~RBBISentMonkey() {
2656     delete fSets;
2657     delete fSepSet;
2658     delete fFormatSet;
2659     delete fSpSet;
2660     delete fLowerSet;
2661     delete fUpperSet;
2662     delete fOLetterSet;
2663     delete fNumericSet;
2664     delete fATermSet;
2665     delete fSContinueSet;
2666     delete fSTermSet;
2667     delete fCloseSet;
2668     delete fOtherSet;
2669     delete fExtendSet;
2670 }
2671 
2672 
2673 
2674 //-------------------------------------------------------------------------------------------
2675 //
2676 //  RBBILineMonkey
2677 //
2678 //-------------------------------------------------------------------------------------------
2679 
2680 class RBBILineMonkey: public RBBIMonkeyKind {
2681 public:
2682     RBBILineMonkey();
2683     virtual          ~RBBILineMonkey();
2684     virtual  UVector *charClasses();
2685     virtual  void     setText(const UnicodeString &s);
2686     virtual  int32_t  next(int32_t i);
2687     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2688 private:
2689     UVector      *fSets;
2690 
2691     UnicodeSet  *fBK;
2692     UnicodeSet  *fCR;
2693     UnicodeSet  *fLF;
2694     UnicodeSet  *fCM;
2695     UnicodeSet  *fNL;
2696     UnicodeSet  *fSG;
2697     UnicodeSet  *fWJ;
2698     UnicodeSet  *fZW;
2699     UnicodeSet  *fGL;
2700     UnicodeSet  *fCB;
2701     UnicodeSet  *fSP;
2702     UnicodeSet  *fB2;
2703     UnicodeSet  *fBA;
2704     UnicodeSet  *fBB;
2705     UnicodeSet  *fHH;
2706     UnicodeSet  *fHY;
2707     UnicodeSet  *fH2;
2708     UnicodeSet  *fH3;
2709     UnicodeSet  *fCL;
2710     UnicodeSet  *fCP;
2711     UnicodeSet  *fEX;
2712     UnicodeSet  *fIN;
2713     UnicodeSet  *fJL;
2714     UnicodeSet  *fJV;
2715     UnicodeSet  *fJT;
2716     UnicodeSet  *fNS;
2717     UnicodeSet  *fOP;
2718     UnicodeSet  *fQU;
2719     UnicodeSet  *fIS;
2720     UnicodeSet  *fNU;
2721     UnicodeSet  *fPO;
2722     UnicodeSet  *fPR;
2723     UnicodeSet  *fSY;
2724     UnicodeSet  *fAI;
2725     UnicodeSet  *fAL;
2726     UnicodeSet  *fCJ;
2727     UnicodeSet  *fHL;
2728     UnicodeSet  *fID;
2729     UnicodeSet  *fRI;
2730     UnicodeSet  *fXX;
2731     UnicodeSet  *fEB;
2732     UnicodeSet  *fEM;
2733     UnicodeSet  *fZWJ;
2734     UnicodeSet  *fOP30;
2735     UnicodeSet  *fCP30;
2736 
2737     BreakIterator        *fCharBI;
2738     const UnicodeString  *fText;
2739     RegexMatcher         *fNumberMatcher;
2740 };
2741 
RBBILineMonkey()2742 RBBILineMonkey::RBBILineMonkey() :
2743     RBBIMonkeyKind(),
2744     fSets(NULL),
2745 
2746     fCharBI(NULL),
2747     fText(NULL),
2748     fNumberMatcher(NULL)
2749 
2750 {
2751     if (U_FAILURE(deferredStatus)) {
2752         return;
2753     }
2754 
2755     UErrorCode  status = U_ZERO_ERROR;
2756 
2757     fSets  = new UVector(status);
2758 
2759     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2760     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2761     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2762     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2763     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2764     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2765     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2766     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2767     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2768     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2769     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2770     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2771     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2772     fHH    = new UnicodeSet();
2773     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2774     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2775     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2776     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2777     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2778     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2779     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2780     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2781     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2782     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2783     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2784     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2785     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2786     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2787     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2788     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2789     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2790     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2791     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2792     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2793     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2794     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2795     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2796     fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2797     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2798     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2799     fEB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status);
2800     fEM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
2801     fZWJ   = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
2802     fOP30  = new UnicodeSet(u"[\\p{Line_break=OP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2803     fCP30  = new UnicodeSet(u"[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2804 
2805     if (U_FAILURE(status)) {
2806         deferredStatus = status;
2807         return;
2808     }
2809 
2810     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
2811     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
2812     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
2813 
2814     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
2815     fCM->addAll(*fZWJ);    // ZWJ behaves as a CM.
2816 
2817     fHH->add(u'\u2010');   // Hyphen, '‐'
2818 
2819     // Sets and names.
2820     fSets->addElement(fBK, status); classNames.push_back("fBK");
2821     fSets->addElement(fCR, status); classNames.push_back("fCR");
2822     fSets->addElement(fLF, status); classNames.push_back("fLF");
2823     fSets->addElement(fCM, status); classNames.push_back("fCM");
2824     fSets->addElement(fNL, status); classNames.push_back("fNL");
2825     fSets->addElement(fWJ, status); classNames.push_back("fWJ");
2826     fSets->addElement(fZW, status); classNames.push_back("fZW");
2827     fSets->addElement(fGL, status); classNames.push_back("fGL");
2828     fSets->addElement(fCB, status); classNames.push_back("fCB");
2829     fSets->addElement(fSP, status); classNames.push_back("fSP");
2830     fSets->addElement(fB2, status); classNames.push_back("fB2");
2831     fSets->addElement(fBA, status); classNames.push_back("fBA");
2832     fSets->addElement(fBB, status); classNames.push_back("fBB");
2833     fSets->addElement(fHY, status); classNames.push_back("fHY");
2834     fSets->addElement(fH2, status); classNames.push_back("fH2");
2835     fSets->addElement(fH3, status); classNames.push_back("fH3");
2836     fSets->addElement(fCL, status); classNames.push_back("fCL");
2837     fSets->addElement(fCP, status); classNames.push_back("fCP");
2838     fSets->addElement(fEX, status); classNames.push_back("fEX");
2839     fSets->addElement(fIN, status); classNames.push_back("fIN");
2840     fSets->addElement(fJL, status); classNames.push_back("fJL");
2841     fSets->addElement(fJT, status); classNames.push_back("fJT");
2842     fSets->addElement(fJV, status); classNames.push_back("fJV");
2843     fSets->addElement(fNS, status); classNames.push_back("fNS");
2844     fSets->addElement(fOP, status); classNames.push_back("fOP");
2845     fSets->addElement(fQU, status); classNames.push_back("fQU");
2846     fSets->addElement(fIS, status); classNames.push_back("fIS");
2847     fSets->addElement(fNU, status); classNames.push_back("fNU");
2848     fSets->addElement(fPO, status); classNames.push_back("fPO");
2849     fSets->addElement(fPR, status); classNames.push_back("fPR");
2850     fSets->addElement(fSY, status); classNames.push_back("fSY");
2851     fSets->addElement(fAI, status); classNames.push_back("fAI");
2852     fSets->addElement(fAL, status); classNames.push_back("fAL");
2853     fSets->addElement(fHL, status); classNames.push_back("fHL");
2854     fSets->addElement(fID, status); classNames.push_back("fID");
2855     fSets->addElement(fWJ, status); classNames.push_back("fWJ");
2856     fSets->addElement(fRI, status); classNames.push_back("fRI");
2857     fSets->addElement(fSG, status); classNames.push_back("fSG");
2858     fSets->addElement(fEB, status); classNames.push_back("fEB");
2859     fSets->addElement(fEM, status); classNames.push_back("fEM");
2860     fSets->addElement(fZWJ, status); classNames.push_back("fZWJ");
2861     // TODO: fOP30 & fCP30 overlap with plain fOP. Probably OK, but fOP/CP chars will be over-represented.
2862     fSets->addElement(fOP30, status); classNames.push_back("fOP30");
2863     fSets->addElement(fCP30, status); classNames.push_back("fCP30");
2864 
2865     const char *rules =
2866             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
2867             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
2868             "((\\p{Line_Break=IS})(\\p{Line_Break=CM}|\\u200d)*)?"
2869             "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
2870             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
2871             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
2872             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
2873 
2874     fNumberMatcher = new RegexMatcher(
2875         UnicodeString(rules, -1, US_INV), 0, status);
2876 
2877     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2878 
2879     if (U_FAILURE(status)) {
2880         deferredStatus = status;
2881     }
2882 
2883 }
2884 
2885 
setText(const UnicodeString & s)2886 void RBBILineMonkey::setText(const UnicodeString &s) {
2887     fText       = &s;
2888     fCharBI->setText(s);
2889     prepareAppliedRules(s.length());
2890     fNumberMatcher->reset(s);
2891 }
2892 
2893 //
2894 //  rule9Adjust
2895 //     Line Break TR rules 9 and 10 implementation.
2896 //     This deals with combining marks and other sequences that
2897 //     that must be treated as if they were something other than what they actually are.
2898 //
2899 //     This is factored out into a separate function because it must be applied twice for
2900 //     each potential break, once to the chars before the position being checked, then
2901 //     again to the text following the possible break.
2902 //
rule9Adjust(int32_t pos,UChar32 * posChar,int32_t * nextPos,UChar32 * nextChar)2903 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2904     if (pos == -1) {
2905         // Invalid initial position.  Happens during the warmup iteration of the
2906         //   main loop in next().
2907         return;
2908     }
2909 
2910     int32_t  nPos = *nextPos;
2911 
2912     // LB 9  Keep combining sequences together.
2913     // advance over any CM class chars.  Note that Line Break CM is different
2914     // from the normal Grapheme Extend property.
2915     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2916           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2917         for (;;) {
2918             *nextChar = fText->char32At(nPos);
2919             if (!fCM->contains(*nextChar)) {
2920                 break;
2921             }
2922             nPos = fText->moveIndex32(nPos, 1);
2923         }
2924     }
2925 
2926 
2927     // LB 9 Treat X CM* as if it were x.
2928     //       No explicit action required.
2929 
2930     // LB 10  Treat any remaining combining mark as AL
2931     if (fCM->contains(*posChar)) {
2932         *posChar = u'A';
2933     }
2934 
2935     // Push the updated nextPos and nextChar back to our caller.
2936     // This only makes a difference if posChar got bigger by consuming a
2937     // combining sequence.
2938     *nextPos  = nPos;
2939     *nextChar = fText->char32At(nPos);
2940 }
2941 
2942 
2943 
next(int32_t startPos)2944 int32_t RBBILineMonkey::next(int32_t startPos) {
2945     UErrorCode status = U_ZERO_ERROR;
2946     int32_t    pos;       //  Index of the char following a potential break position
2947     UChar32    thisChar;  //  Character at above position "pos"
2948 
2949     int32_t    prevPos;   //  Index of the char preceding a potential break position
2950     UChar32    prevChar;  //  Character at above position.  Note that prevChar
2951                           //   and thisChar may not be adjacent because combining
2952                           //   characters between them will be ignored.
2953 
2954     int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
2955     UChar32    prevCharX2;
2956 
2957     int32_t    nextPos;   //  Index of the next character following pos.
2958                           //     Usually skips over combining marks.
2959     int32_t    nextCPPos; //  Index of the code point following "pos."
2960                           //     May point to a combining mark.
2961     int32_t    tPos;      //  temp value.
2962     UChar32    c;
2963 
2964     if (U_FAILURE(deferredStatus)) {
2965         return -1;
2966     }
2967 
2968     if (startPos >= fText->length()) {
2969         return -1;
2970     }
2971 
2972 
2973     // Initial values for loop.  Loop will run the first time without finding breaks,
2974     //                           while the invalid values shift out and the "this" and
2975     //                           "prev" positions are filled in with good values.
2976     pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
2977     thisChar = prevChar  = prevCharX2 = 0;
2978     nextPos  = nextCPPos = startPos;
2979 
2980 
2981     // Loop runs once per position in the test text, until a break position
2982     //  is found.
2983     for (;;) {
2984         prevPosX2 = prevPos;
2985         prevCharX2 = prevChar;
2986 
2987         prevPos   = pos;
2988         prevChar  = thisChar;
2989 
2990         pos       = nextPos;
2991         thisChar  = fText->char32At(pos);
2992 
2993         nextCPPos = fText->moveIndex32(pos, 1);
2994         nextPos   = nextCPPos;
2995 
2996 
2997         if (pos >= fText->length()) {
2998             setAppliedRule(pos, "LB2 - Break at end of text.");
2999             break;
3000         }
3001 
3002 
3003         //             We do this one out-of-order because the adjustment does not change anything
3004         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3005         //             be applied.
3006         rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
3007         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3008         c = fText->char32At(nextPos);
3009         rule9Adjust(pos, &thisChar, &nextPos, &c);
3010 
3011         // If the loop is still warming up - if we haven't shifted the initial
3012         //   -1 positions out of prevPos yet - loop back to advance the
3013         //    position in the input without any further looking for breaks.
3014         if (prevPos == -1) {
3015           setAppliedRule(pos, "LB 9 - adjust for combining sequences.");
3016             continue;
3017         }
3018 
3019 
3020         if (fBK->contains(prevChar)) {
3021             setAppliedRule(pos, "LB 4  Always break after hard line breaks");
3022             break;
3023         }
3024 
3025 
3026         if (prevChar == 0x0d && thisChar == 0x0a) {
3027             setAppliedRule(pos, "LB 5  Break after CR, LF, NL, but not inside CR LF");
3028             continue;
3029         }
3030         if (prevChar == 0x0d ||
3031             prevChar == 0x0a ||
3032             prevChar == 0x85)  {
3033             setAppliedRule(pos, "LB 5  Break after CR, LF, NL, but not inside CR LF");
3034             break;
3035         }
3036 
3037 
3038         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3039             fBK->contains(thisChar)) {
3040             setAppliedRule(pos, "LB 6  Don't break before hard line breaks");
3041             continue;
3042         }
3043 
3044 
3045         if (fSP->contains(thisChar)) {
3046             setAppliedRule(pos, "LB 7  Don't break before spaces or zero-width space.");
3047             continue;
3048         }
3049 
3050         // !!! ??? Is this the right text for the applied rule?
3051         if (fZW->contains(thisChar)) {
3052             setAppliedRule(pos, "LB 7  Don't break before spaces or zero-width space.");
3053             continue;
3054         }
3055 
3056 
3057         //       ZW SP* ÷
3058         //       Scan backwards from prevChar for SP* ZW
3059         tPos = prevPos;
3060         while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3061             tPos = fText->moveIndex32(tPos, -1);
3062         }
3063         if (fZW->contains(fText->char32At(tPos))) {
3064             setAppliedRule(pos, "LB 8  Break after zero width space");
3065             break;
3066         }
3067 
3068 
3069         //          Move this test up, before LB8a, because numbers can match a longer sequence that would
3070         //          also match 8a.  e.g. NU ZWJ IS PO     (ZWJ acts like CM)
3071         if (fNumberMatcher->lookingAt(prevPos, status)) {
3072             if (U_FAILURE(status)) {
3073                 setAppliedRule(pos, "LB 25 Numbers");
3074                 break;
3075             }
3076             // Matched a number.  But could have been just a single digit, which would
3077             //    not represent a "no break here" between prevChar and thisChar
3078             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
3079             if (numEndIdx > pos) {
3080                 // Number match includes at least our two chars being checked
3081                 if (numEndIdx > nextPos) {
3082                     // Number match includes additional chars.  Update pos and nextPos
3083                     //   so that next loop iteration will continue at the end of the number,
3084                     //   checking for breaks between last char in number & whatever follows.
3085                     pos = nextPos = numEndIdx;
3086                     do {
3087                         pos = fText->moveIndex32(pos, -1);
3088                         thisChar = fText->char32At(pos);
3089                     } while (fCM->contains(thisChar));
3090                 }
3091                 setAppliedRule(pos, "LB 25 Numbers");
3092                 continue;
3093             }
3094         }
3095 
3096 
3097         //       The monkey test's way of ignoring combining characters doesn't work
3098         //       for this rule. ZJ is also a CM. Need to get the actual character
3099         //       preceding "thisChar", not ignoring combining marks, possibly ZJ.
3100         {
3101             int32_t prevIdx = fText->moveIndex32(pos, -1);
3102             UChar32 prevC = fText->char32At(prevIdx);
3103             if (fZWJ->contains(prevC)) {
3104                 setAppliedRule(pos, "LB 8a ZWJ x");
3105                 continue;
3106             }
3107         }
3108 
3109 
3110         // appliedRule: "LB 9, 10"; //  Already done, at top of loop.";
3111         //
3112 
3113 
3114         //    x  WJ
3115         //    WJ  x
3116         //
3117         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3118             setAppliedRule(pos, "LB 11  Do not break before or after WORD JOINER and related characters.");
3119             continue;
3120         }
3121 
3122 
3123         if (fGL->contains(prevChar)) {
3124             setAppliedRule(pos, "LB 12  GL  x");
3125             continue;
3126         }
3127 
3128 
3129           if (!(fSP->contains(prevChar) ||
3130               fBA->contains(prevChar) ||
3131               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
3132               setAppliedRule(pos, "LB 12a  [^SP BA HY] x GL");
3133               continue;
3134         }
3135 
3136 
3137         if (fCL->contains(thisChar) ||
3138                 fCP->contains(thisChar) ||
3139                 fEX->contains(thisChar) ||
3140                 fSY->contains(thisChar)) {
3141             setAppliedRule(pos, "LB 13  Don't break before closings.");
3142             continue;
3143         }
3144 
3145 
3146         //       Scan backwards, checking for this sequence.
3147         //       The OP char could include combining marks, so we actually check for
3148         //           OP CM* SP*
3149         //       Another Twist: The Rule 9 fixes may have changed a SP CM
3150         //       sequence into a ID char, so before scanning back through spaces,
3151         //       verify that prevChar is indeed a space.  The prevChar variable
3152         //       may differ from fText[prevPos]
3153         tPos = prevPos;
3154         if (fSP->contains(prevChar)) {
3155             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3156                 tPos=fText->moveIndex32(tPos, -1);
3157             }
3158         }
3159         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3160             tPos=fText->moveIndex32(tPos, -1);
3161         }
3162         if (fOP->contains(fText->char32At(tPos))) {
3163             setAppliedRule(pos, "LB 14 Don't break after OP SP*");
3164             continue;
3165         }
3166 
3167 
3168         if (nextPos < fText->length()) {
3169             // note: UnicodeString::char32At(length) returns ffff, not distinguishable
3170             //       from a legit ffff character. So test length separately.
3171             UChar32 nextChar = fText->char32At(nextPos);
3172             if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) {
3173                 setAppliedRule(pos, "LB 14a Break before an IS that begins a number and follows a space");
3174                 break;
3175             }
3176         }
3177 
3178 
3179           if (fIS->contains(thisChar)) {
3180               setAppliedRule(pos, "LB 14b  Do not break before numeric separators, even after spaces.");
3181               continue;
3182         }
3183 
3184 
3185         if (fOP->contains(thisChar)) {
3186             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3187             int tPos = prevPos;
3188             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3189                 tPos = fText->moveIndex32(tPos, -1);
3190             }
3191             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3192                 tPos = fText->moveIndex32(tPos, -1);
3193             }
3194             if (fQU->contains(fText->char32At(tPos))) {
3195                 setAppliedRule(pos, "LB 15    QU SP* x OP");
3196                 continue;
3197             }
3198         }
3199 
3200 
3201         //    Scan backwards for SP* CM* (CL | CP)
3202         if (fNS->contains(thisChar)) {
3203             int tPos = prevPos;
3204             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3205                 tPos = fText->moveIndex32(tPos, -1);
3206             }
3207             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3208                 tPos = fText->moveIndex32(tPos, -1);
3209             }
3210             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3211                 setAppliedRule(pos, "LB 16   (CL | CP) SP* x NS");
3212                 continue;
3213             }
3214         }
3215 
3216 
3217         if (fB2->contains(thisChar)) {
3218             //  Scan backwards, checking for the B2 CM* SP* sequence.
3219             tPos = prevPos;
3220             if (fSP->contains(prevChar)) {
3221                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3222                     tPos=fText->moveIndex32(tPos, -1);
3223                 }
3224             }
3225             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3226                 tPos=fText->moveIndex32(tPos, -1);
3227             }
3228             if (fB2->contains(fText->char32At(tPos))) {
3229                 setAppliedRule(pos, "LB 17   B2 SP* x B2");
3230                 continue;
3231             }
3232         }
3233 
3234 
3235         if (fSP->contains(prevChar)) {
3236             setAppliedRule(pos, "LB 18    break after space");
3237             break;
3238         }
3239 
3240         //    x   QU
3241         //    QU  x
3242         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3243             setAppliedRule(pos, "LB 19");
3244             continue;
3245         }
3246 
3247         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3248             setAppliedRule(pos, "LB 20  Break around a CB");
3249             break;
3250         }
3251 
3252         //           Don't break between Hyphens and letters if a break precedes the hyphen.
3253         //           Formerly this was a Finnish tailoring.
3254         //           Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
3255         //           ^($HY | $HH) $AL;
3256         if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
3257                 prevPosX2 == -1) {
3258             setAppliedRule(pos, "LB 20.09");
3259             continue;
3260         }
3261 
3262         if (fBA->contains(thisChar) ||
3263             fHY->contains(thisChar) ||
3264             fNS->contains(thisChar) ||
3265             fBB->contains(prevChar) )   {
3266             setAppliedRule(pos, "LB 21");
3267             continue;
3268         }
3269 
3270         if (fHL->contains(prevCharX2) &&
3271                 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3272             setAppliedRule(pos, "LB 21a   HL (HY | BA) x");
3273             continue;
3274         }
3275 
3276         if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3277             setAppliedRule(pos, "LB 21b SY x HL");
3278             continue;
3279         }
3280 
3281         if (fIN->contains(thisChar))   {
3282             setAppliedRule(pos, "LB 22");
3283             continue;
3284         }
3285 
3286 
3287         //          (AL | HL) x NU
3288         //          NU x (AL | HL)
3289         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
3290             setAppliedRule(pos, "LB 23");
3291             continue;
3292         }
3293         if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3294             setAppliedRule(pos, "LB 23");
3295             continue;
3296         }
3297 
3298         // Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3299         //      PR x (ID | EB | EM)
3300         //     (ID | EB | EM) x PO
3301         if (fPR->contains(prevChar) &&
3302                 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar)))  {
3303             setAppliedRule(pos, "LB 23a");
3304             continue;
3305         }
3306         if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
3307                 fPO->contains(thisChar)) {
3308             setAppliedRule(pos, "LB 23a");
3309             continue;
3310         }
3311 
3312         //   Do not break between prefix and letters or ideographs.
3313         //         (PR | PO) x (AL | HL)
3314         //         (AL | HL) x (PR | PO)
3315         if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
3316                 (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3317             setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3318             continue;
3319         }
3320         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
3321                 (fPR->contains(thisChar) || fPO->contains(thisChar))) {
3322             setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3323             continue;
3324         }
3325 
3326         // appliedRule: "LB 25 numbers match"; // moved up, before LB 8a,
3327 
3328         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3329                                         fJV->contains(thisChar) ||
3330                                         fH2->contains(thisChar) ||
3331                                         fH3->contains(thisChar))) {
3332             setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3333             continue;
3334                                         }
3335 
3336         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3337             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3338             setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3339             continue;
3340         }
3341 
3342         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3343             fJT->contains(thisChar)) {
3344             setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3345             continue;
3346         }
3347 
3348         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3349             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3350             fIN->contains(thisChar)) {
3351             setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3352             continue;
3353             }
3354         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3355             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3356             fPO->contains(thisChar)) {
3357             setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3358             continue;
3359             }
3360         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3361             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3362             setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3363             continue;
3364             }
3365 
3366 
3367 
3368         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3369             setAppliedRule(pos, "LB 28  Do not break between alphabetics (\"at\").");
3370             continue;
3371         }
3372 
3373           if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3374               setAppliedRule(pos, "LB 29  Do not break between numeric punctuation and alphabetics (\"e.g.\").");
3375               continue;
3376         }
3377 
3378         //          (AL | NU) x OP
3379         //          CP x (AL | NU)
3380         if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP30->contains(thisChar)) {
3381             setAppliedRule(pos,  "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3382             continue;
3383         }
3384         if (fCP30->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3385             setAppliedRule(pos,  "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3386             continue;
3387         }
3388 
3389         //             RI  x  RI
3390         if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3391             setAppliedRule(pos, "LB30a    RI RI  ÷  RI");
3392             break;
3393         }
3394         if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3395             // Two Regional Indicators have been paired.
3396             // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
3397             // following RI. This is a hack.
3398             thisChar = -1;
3399             setAppliedRule(pos, "LB30a    RI RI  ÷  RI");
3400             continue;
3401         }
3402 
3403         if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3404             setAppliedRule(pos, "LB30b    Emoji Base x Emoji Modifier");
3405             continue;
3406         }
3407 
3408         setAppliedRule(pos, "LB 31    Break everywhere else");
3409         break;
3410     }
3411 
3412     return pos;
3413 }
3414 
3415 
charClasses()3416 UVector  *RBBILineMonkey::charClasses() {
3417     return fSets;
3418 }
3419 
3420 
~RBBILineMonkey()3421 RBBILineMonkey::~RBBILineMonkey() {
3422     delete fSets;
3423 
3424     delete fBK;
3425     delete fCR;
3426     delete fLF;
3427     delete fCM;
3428     delete fNL;
3429     delete fWJ;
3430     delete fZW;
3431     delete fGL;
3432     delete fCB;
3433     delete fSP;
3434     delete fB2;
3435     delete fBA;
3436     delete fBB;
3437     delete fHH;
3438     delete fHY;
3439     delete fH2;
3440     delete fH3;
3441     delete fCL;
3442     delete fCP;
3443     delete fEX;
3444     delete fIN;
3445     delete fJL;
3446     delete fJV;
3447     delete fJT;
3448     delete fNS;
3449     delete fOP;
3450     delete fQU;
3451     delete fIS;
3452     delete fNU;
3453     delete fPO;
3454     delete fPR;
3455     delete fSY;
3456     delete fAI;
3457     delete fAL;
3458     delete fCJ;
3459     delete fHL;
3460     delete fID;
3461     delete fRI;
3462     delete fSG;
3463     delete fXX;
3464     delete fEB;
3465     delete fEM;
3466     delete fZWJ;
3467     delete fOP30;
3468     delete fCP30;
3469 
3470     delete fCharBI;
3471     delete fNumberMatcher;
3472 }
3473 
3474 
3475 //-------------------------------------------------------------------------------------------
3476 //
3477 //   TestMonkey
3478 //
3479 //     params
3480 //       seed=nnnnn        Random number starting seed.
3481 //                         Setting the seed allows errors to be reproduced.
3482 //       loop=nnn          Looping count.  Controls running time.
3483 //                         -1:  run forever.
3484 //                          0 or greater:  run length.
3485 //
3486 //       type = char | word | line | sent | title
3487 //
3488 //  Example:
3489 //     intltest  rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3490 //
3491 //-------------------------------------------------------------------------------------------
3492 
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)3493 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3494     int32_t val = defaultVal;
3495     name.append(" *= *(-?\\d+)");
3496     UErrorCode status = U_ZERO_ERROR;
3497     RegexMatcher m(name, params, 0, status);
3498     if (m.find()) {
3499         // The param exists.  Convert the string to an int.
3500         char valString[100];
3501         int32_t paramLength = m.end(1, status) - m.start(1, status);
3502         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3503             paramLength = (int32_t)(sizeof(valString)-2);
3504         }
3505         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3506         val = strtol(valString, NULL, 10);
3507 
3508         // Delete this parameter from the params string.
3509         m.reset();
3510         params = m.replaceFirst("", status);
3511     }
3512     U_ASSERT(U_SUCCESS(status));
3513     return val;
3514 }
3515 #endif
3516 
3517 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
testBreakBoundPreceding(RBBITest * test,UnicodeString ustr,BreakIterator * bi,int expected[],int expectedcount)3518 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3519                                     BreakIterator *bi,
3520                                     int expected[],
3521                                     int expectedcount)
3522 {
3523     int count = 0;
3524     int i = 0;
3525     int forward[50];
3526     bi->setText(ustr);
3527     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3528         forward[count] = i;
3529         if (count < expectedcount && expected[count] != i) {
3530             test->errln("%s:%d break forward test failed: expected %d but got %d",
3531                         __FILE__, __LINE__, expected[count], i);
3532             break;
3533         }
3534         count ++;
3535     }
3536     if (count != expectedcount) {
3537         printStringBreaks(ustr, expected, expectedcount);
3538         test->errln("%s:%d break forward test failed: missed %d match",
3539                     __FILE__, __LINE__, expectedcount - count);
3540         return;
3541     }
3542     // testing boundaries
3543     for (i = 1; i < expectedcount; i ++) {
3544         int j = expected[i - 1];
3545         if (!bi->isBoundary(j)) {
3546             printStringBreaks(ustr, expected, expectedcount);
3547             test->errln("%s:%d isBoundary() failed.  Expected boundary at position %d",
3548                     __FILE__, __LINE__, j);
3549             return;
3550         }
3551         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3552             if (bi->isBoundary(j)) {
3553                 printStringBreaks(ustr, expected, expectedcount);
3554                 test->errln("%s:%d isBoundary() failed.  Not expecting boundary at position %d",
3555                     __FILE__, __LINE__, j);
3556                 return;
3557             }
3558         }
3559     }
3560 
3561     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3562         count --;
3563         if (forward[count] != i) {
3564             printStringBreaks(ustr, expected, expectedcount);
3565             test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
3566                         __FILE__, __LINE__, forward[count], i);
3567             break;
3568         }
3569     }
3570     if (count != 0) {
3571         printStringBreaks(ustr, expected, expectedcount);
3572         test->errln("break test previous() failed: missed a match");
3573         return;
3574     }
3575 
3576     // testing preceding
3577     for (i = 0; i < expectedcount - 1; i ++) {
3578         // int j = expected[i] + 1;
3579         int j = ustr.moveIndex32(expected[i], 1);
3580         for (; j <= expected[i + 1]; j ++) {
3581             int32_t expectedPreceding = expected[i];
3582             int32_t actualPreceding = bi->preceding(j);
3583             if (actualPreceding != expectedPreceding) {
3584                 printStringBreaks(ustr, expected, expectedcount);
3585                 test->errln("%s:%d preceding(%d): expected %d, got %d",
3586                         __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
3587                 return;
3588             }
3589         }
3590     }
3591 }
3592 #endif
3593 
TestWordBreaks(void)3594 void RBBITest::TestWordBreaks(void)
3595 {
3596 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3597 
3598     Locale        locale("en");
3599     UErrorCode    status = U_ZERO_ERROR;
3600     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3601     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3602     // Replaced any C+J characters in a row with a random sequence of characters
3603     // of the same length to make our C+J segmentation not get in the way.
3604     static const char *strlist[] =
3605     {
3606     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3607     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3608     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3609     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3610     "\\uac00\\u3588\\u009c\\u0953\\u194b",
3611     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3612     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3613     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3614     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3615     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3616     "\\u2027\\U000e0067\\u0a47\\u00b7",
3617     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3618     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3619     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3620     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3621     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3622     "\\u0027\\u11af\\U000e0057\\u0602",
3623     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3624     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3625     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3626     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3627     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3628     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3629     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3630     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3631     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3632     "\\u18f4\\U000e0049\\u20e7\\u2027",
3633     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3634     "\\ua183\\u102d\\u0bec\\u003a",
3635     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3636     "\\u003a\\u0e57\\u0fad\\u002e",
3637     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3638     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3639     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3640     "\\u003a\\u0664\\u00b7\\u1fba",
3641     "\\u003b\\u0027\\u00b7\\u47a3",
3642     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3643     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3644     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3645     };
3646     int loop;
3647     if (U_FAILURE(status)) {
3648         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3649         return;
3650     }
3651     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3652         // printf("looping %d\n", loop);
3653         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3654         // RBBICharMonkey monkey;
3655         RBBIWordMonkey monkey;
3656 
3657         int expected[50];
3658         int expectedcount = 0;
3659 
3660         monkey.setText(ustr);
3661         int i;
3662         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3663             expected[expectedcount ++] = i;
3664         }
3665 
3666         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3667     }
3668     delete bi;
3669 #endif
3670 }
3671 
TestWordBoundary(void)3672 void RBBITest::TestWordBoundary(void)
3673 {
3674     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3675     Locale        locale("en");
3676     UErrorCode    status = U_ZERO_ERROR;
3677     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3678     LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
3679     if (U_FAILURE(status)) {
3680         errcheckln(status, "%s:%d Creation of break iterator failed %s",
3681                 __FILE__, __LINE__, u_errorName(status));
3682         return;
3683     }
3684     UChar         str[50];
3685     static const char *strlist[] =
3686     {
3687     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3688     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3689     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3690     "\\u2027\\U000e0067\\u0a47\\u00b7",
3691     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3692     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3693     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3694     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3695     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3696     "\\u0027\\u11af\\U000e0057\\u0602",
3697     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3698     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3699     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3700     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3701     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3702     "\\U000e0065\\u302c\\u09ee\\U000e0068",
3703     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3704     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3705     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3706     "\\u58f4\\U000e0049\\u20e7\\u2027",
3707     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3708     "\\ua183\\u102d\\u0bec\\u003a",
3709     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3710     "\\u003a\\u0e57\\u0fad\\u002e",
3711     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3712     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3713     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3714     "\\u003a\\u0664\\u00b7\\u1fba",
3715     "\\u003b\\u0027\\u00b7\\u47a3",
3716     };
3717     int loop;
3718     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3719         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3720         UnicodeString ustr(str);
3721         int forward[50];
3722         int count = 0;
3723 
3724         bi->setText(ustr);
3725         int prev = -1;
3726         for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
3727             ++count;
3728             if (count >= UPRV_LENGTHOF(forward)) {
3729                 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
3730                         __FILE__, __LINE__, loop, count, boundary);
3731                 return;
3732             }
3733             forward[count] = boundary;
3734             if (boundary <= prev) {
3735                 errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
3736                         __FILE__, __LINE__, loop, prev, boundary);
3737                 break;
3738             }
3739             for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
3740                 if (bi->isBoundary(nonBoundary)) {
3741                     printStringBreaks(ustr, forward, count);
3742                     errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
3743                            __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
3744                     return;
3745                 }
3746             }
3747             if (!bi->isBoundary(boundary)) {
3748                 printStringBreaks(ustr, forward, count);
3749                 errln("%s:%d happy boundary test failed: expected %d a boundary",
3750                        __FILE__, __LINE__, boundary);
3751                 return;
3752             }
3753             prev = boundary;
3754         }
3755     }
3756 }
3757 
TestLineBreaks(void)3758 void RBBITest::TestLineBreaks(void)
3759 {
3760 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3761     Locale        locale("en");
3762     UErrorCode    status = U_ZERO_ERROR;
3763     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3764     const int32_t  STRSIZE = 50;
3765     UChar         str[STRSIZE];
3766     static const char *strlist[] =
3767     {
3768      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3769      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3770              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3771      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3772              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3773      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3774      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3775      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3776      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3777      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3778      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3779      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3780      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3781      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3782      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3783      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3784      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3785      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3786      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3787      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3788      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3789      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3790      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3791      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3792      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3793      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3794      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3795      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3796      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3797      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3798      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3799      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3800      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3801      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3802      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3803      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3804      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3805      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3806      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3807          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3808     };
3809     int loop;
3810     TEST_ASSERT_SUCCESS(status);
3811     if (U_FAILURE(status)) {
3812         return;
3813     }
3814     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3815         // printf("looping %d\n", loop);
3816         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3817         if (t >= STRSIZE) {
3818             TEST_ASSERT(FALSE);
3819             continue;
3820         }
3821 
3822 
3823         UnicodeString ustr(str);
3824         RBBILineMonkey monkey;
3825         if (U_FAILURE(monkey.deferredStatus)) {
3826             continue;
3827         }
3828 
3829         const int EXPECTEDSIZE = 50;
3830         int expected[EXPECTEDSIZE];
3831         int expectedcount = 0;
3832 
3833         monkey.setText(ustr);
3834 
3835         int i;
3836         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3837             if (expectedcount >= EXPECTEDSIZE) {
3838                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3839                 return;
3840             }
3841             expected[expectedcount ++] = i;
3842         }
3843 
3844         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3845     }
3846     delete bi;
3847 #endif
3848 }
3849 
TestSentBreaks(void)3850 void RBBITest::TestSentBreaks(void)
3851 {
3852 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3853     Locale        locale("en");
3854     UErrorCode    status = U_ZERO_ERROR;
3855     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3856     UChar         str[200];
3857     static const char *strlist[] =
3858     {
3859      "Now\ris\nthe\r\ntime\n\rfor\r\r",
3860      "This\n",
3861      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3862      "\"Sentence ending with a quote.\" Bye.",
3863      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
3864      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3865      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3866      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3867      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3868      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3869      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3870              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3871              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3872              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3873      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3874              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3875              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3876              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3877              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3878              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3879     };
3880     int loop;
3881     if (U_FAILURE(status)) {
3882         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3883         return;
3884     }
3885     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3886         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3887         UnicodeString ustr(str);
3888 
3889         RBBISentMonkey monkey;
3890         if (U_FAILURE(monkey.deferredStatus)) {
3891             continue;
3892         }
3893 
3894         const int EXPECTEDSIZE = 50;
3895         int expected[EXPECTEDSIZE];
3896         int expectedcount = 0;
3897 
3898         monkey.setText(ustr);
3899 
3900         int i;
3901         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3902             if (expectedcount >= EXPECTEDSIZE) {
3903                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3904                 return;
3905             }
3906             expected[expectedcount ++] = i;
3907         }
3908 
3909         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3910     }
3911     delete bi;
3912 #endif
3913 }
3914 
TestMonkey()3915 void RBBITest::TestMonkey() {
3916 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3917 
3918     UErrorCode     status    = U_ZERO_ERROR;
3919     int32_t        loopCount = 500;
3920     int32_t        seed      = 1;
3921     UnicodeString  breakType = "all";
3922     Locale         locale("en");
3923     UBool          useUText  = FALSE;
3924 
3925     if (quick == FALSE) {
3926         loopCount = 10000;
3927     }
3928 
3929     if (fTestParams) {
3930         UnicodeString p(fTestParams);
3931         loopCount = getIntParam("loop", p, loopCount);
3932         seed      = getIntParam("seed", p, seed);
3933 
3934         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3935         if (m.find()) {
3936             breakType = m.group(1, status);
3937             m.reset();
3938             p = m.replaceFirst("", status);
3939         }
3940 
3941         RegexMatcher u(" *utext", p, 0, status);
3942         if (u.find()) {
3943             useUText = TRUE;
3944             u.reset();
3945             p = u.replaceFirst("", status);
3946         }
3947 
3948 
3949         // m.reset(p);
3950         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3951             // Each option is stripped out of the option string as it is processed.
3952             // All options have been checked.  The option string should have been completely emptied..
3953             char buf[100];
3954             p.extract(buf, sizeof(buf), NULL, status);
3955             buf[sizeof(buf)-1] = 0;
3956             errln("Unrecognized or extra parameter:  %s\n", buf);
3957             return;
3958         }
3959 
3960     }
3961 
3962     if (breakType == "char" || breakType == "all") {
3963         RBBICharMonkey  m;
3964         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3965         if (U_SUCCESS(status)) {
3966             RunMonkey(bi, m, "char", seed, loopCount, useUText);
3967             if (breakType == "all" && useUText==FALSE) {
3968                 // Also run a quick test with UText when "all" is specified
3969                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3970             }
3971         }
3972         else {
3973             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3974         }
3975         delete bi;
3976     }
3977 
3978     if (breakType == "word" || breakType == "all") {
3979         logln("Word Break Monkey Test");
3980         RBBIWordMonkey  m;
3981         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
3982         if (U_SUCCESS(status)) {
3983             RunMonkey(bi, m, "word", seed, loopCount, useUText);
3984         }
3985         else {
3986             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3987         }
3988         delete bi;
3989     }
3990 
3991     if (breakType == "line" || breakType == "all") {
3992         logln("Line Break Monkey Test");
3993         RBBILineMonkey  m;
3994         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
3995         if (loopCount >= 10) {
3996             loopCount = loopCount / 5;   // Line break runs slower than the others.
3997         }
3998         if (U_SUCCESS(status)) {
3999             RunMonkey(bi, m, "line", seed, loopCount, useUText);
4000         }
4001         else {
4002             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4003         }
4004         delete bi;
4005     }
4006 
4007     if (breakType == "sent" || breakType == "all"  ) {
4008         logln("Sentence Break Monkey Test");
4009         RBBISentMonkey  m;
4010         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
4011         if (loopCount >= 10) {
4012             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
4013         }
4014         if (U_SUCCESS(status)) {
4015             RunMonkey(bi, m, "sent", seed, loopCount, useUText);
4016         }
4017         else {
4018             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4019         }
4020         delete bi;
4021     }
4022 
4023 #endif
4024 }
4025 
4026 //
4027 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
4028 //    Parameters:
4029 //       bi      - the break iterator to use
4030 //       mk      - MonkeyKind, abstraction for obtaining expected results
4031 //       name    - Name of test (char, word, etc.) for use in error messages
4032 //       seed    - Seed for starting random number generator (parameter from user)
4033 //       numIterations
4034 //
RunMonkey(BreakIterator * bi,RBBIMonkeyKind & mk,const char * name,uint32_t seed,int32_t numIterations,UBool useUText)4035 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
4036                          int32_t numIterations, UBool useUText) {
4037 
4038 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4039 
4040     const int32_t    TESTSTRINGLEN = 500;
4041     UnicodeString    testText;
4042     int32_t          numCharClasses;
4043     UVector          *chClasses;
4044     int              expectedCount = 0;
4045     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
4046     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
4047     char             reverseBreaks[TESTSTRINGLEN*2+1];
4048     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
4049     char             followingBreaks[TESTSTRINGLEN*2+1];
4050     char             precedingBreaks[TESTSTRINGLEN*2+1];
4051     int              i;
4052     int              loopCount = 0;
4053 
4054 
4055     m_seed = seed;
4056 
4057     numCharClasses = mk.charClasses()->size();
4058     chClasses      = mk.charClasses();
4059 
4060     // Check for errors that occured during the construction of the MonkeyKind object.
4061     //  Can't report them where they occured because errln() is a method coming from intlTest,
4062     //  and is not visible outside of RBBITest :-(
4063     if (U_FAILURE(mk.deferredStatus)) {
4064         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4065         return;
4066     }
4067 
4068     // Verify that the character classes all have at least one member.
4069     for (i=0; i<numCharClasses; i++) {
4070         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4071         if (s == NULL || s->size() == 0) {
4072             errln("Character Class #%d is null or of zero size.", i);
4073             return;
4074         }
4075     }
4076 
4077     // For minimizing width of class name output.
4078     int classNameSize = mk.maxClassNameSize();
4079 
4080     while (loopCount < numIterations || numIterations == -1) {
4081         if (numIterations == -1 && loopCount % 10 == 0) {
4082             // If test is running in an infinite loop, display a periodic tic so
4083             //   we can tell that it is making progress.
4084             fprintf(stderr, ".");
4085         }
4086         // Save current random number seed, so that we can recreate the random numbers
4087         //   for this loop iteration in event of an error.
4088         seed = m_seed;
4089 
4090         // Populate a test string with data.
4091         testText.truncate(0);
4092         for (i=0; i<TESTSTRINGLEN; i++) {
4093             int32_t  aClassNum = m_rand() % numCharClasses;
4094             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4095             int32_t   charIdx = m_rand() % classSet->size();
4096             UChar32   c = classSet->charAt(charIdx);
4097             if (c < 0) {   // TODO:  deal with sets containing strings.
4098                 errln("%s:%d c < 0", __FILE__, __LINE__);
4099                 break;
4100             }
4101             // Do not assemble a supplementary character from randomly generated separate surrogates.
4102             //   (It could be a dictionary character)
4103             if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4104                 continue;
4105             }
4106 
4107             testText.append(c);
4108         }
4109 
4110         // Calculate the expected results for this test string and reset applied rules.
4111         mk.setText(testText);
4112 
4113         memset(expectedBreaks, 0, sizeof(expectedBreaks));
4114         expectedBreaks[0] = 1;
4115         int32_t breakPos = 0;
4116         expectedCount = 0;
4117         for (;;) {
4118             breakPos = mk.next(breakPos);
4119             if (breakPos == -1) {
4120                 break;
4121             }
4122             if (breakPos > testText.length()) {
4123                 errln("breakPos > testText.length()");
4124             }
4125             expectedBreaks[breakPos] = 1;
4126             U_ASSERT(expectedCount<testText.length());
4127         }
4128 
4129         // Find the break positions using forward iteration
4130         memset(forwardBreaks, 0, sizeof(forwardBreaks));
4131         if (useUText) {
4132             UErrorCode status = U_ZERO_ERROR;
4133             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4134             // testUText = utext_openUnicodeString(testUText, &testText, &status);
4135             bi->setText(testUText, status);
4136             TEST_ASSERT_SUCCESS(status);
4137             utext_close(testUText);   // The break iterator does a shallow clone of the UText
4138                                       //  This UText can be closed immediately, so long as the
4139                                       //  testText string continues to exist.
4140         } else {
4141             bi->setText(testText);
4142         }
4143 
4144         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4145             if (i < 0 || i > testText.length()) {
4146                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4147                 break;
4148             }
4149             forwardBreaks[i] = 1;
4150         }
4151 
4152         // Find the break positions using reverse iteration
4153         memset(reverseBreaks, 0, sizeof(reverseBreaks));
4154         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4155             if (i < 0 || i > testText.length()) {
4156                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4157                 break;
4158             }
4159             reverseBreaks[i] = 1;
4160         }
4161 
4162         // Find the break positions using isBoundary() tests.
4163         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4164         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4165         for (i=0; i<=testText.length(); i++) {
4166             isBoundaryBreaks[i] = bi->isBoundary(i);
4167         }
4168 
4169 
4170         // Find the break positions using the following() function.
4171         // printf(".");
4172         memset(followingBreaks, 0, sizeof(followingBreaks));
4173         int32_t   lastBreakPos = 0;
4174         followingBreaks[0] = 1;
4175         for (i=0; i<testText.length(); i++) {
4176             breakPos = bi->following(i);
4177             if (breakPos <= i ||
4178                 breakPos < lastBreakPos ||
4179                 breakPos > testText.length() ||
4180                 (breakPos > lastBreakPos && lastBreakPos > i)) {
4181                 errln("%s break monkey test: "
4182                     "Out of range value returned by BreakIterator::following().\n"
4183                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4184                          name, seed, i, breakPos, lastBreakPos);
4185                 break;
4186             }
4187             followingBreaks[breakPos] = 1;
4188             lastBreakPos = breakPos;
4189         }
4190 
4191         // Find the break positions using the preceding() function.
4192         memset(precedingBreaks, 0, sizeof(precedingBreaks));
4193         lastBreakPos = testText.length();
4194         precedingBreaks[testText.length()] = 1;
4195         for (i=testText.length(); i>0; i--) {
4196             breakPos = bi->preceding(i);
4197             if (breakPos >= i ||
4198                 breakPos > lastBreakPos ||
4199                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4200                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4201                 errln("%s break monkey test: "
4202                     "Out of range value returned by BreakIterator::preceding().\n"
4203                     "index=%d;  prev returned %d; lastBreak=%d" ,
4204                     name,  i, breakPos, lastBreakPos);
4205                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4206                     precedingBreaks[i] = 2;   // Forces an error.
4207                 }
4208             } else {
4209                 if (breakPos >= 0) {
4210                     precedingBreaks[breakPos] = 1;
4211                 }
4212                 lastBreakPos = breakPos;
4213             }
4214         }
4215 
4216         // Compare the expected and actual results.
4217         for (i=0; i<=testText.length(); i++) {
4218             const char *errorType = NULL;
4219             const char* currentBreakData = NULL;
4220             if  (forwardBreaks[i] != expectedBreaks[i]) {
4221                 errorType = "next()";
4222                 currentBreakData = forwardBreaks;
4223             } else if (reverseBreaks[i] != forwardBreaks[i]) {
4224                 errorType = "previous()";
4225                 currentBreakData = reverseBreaks;
4226            } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4227                 errorType = "isBoundary()";
4228                 currentBreakData = isBoundaryBreaks;
4229             } else if (followingBreaks[i] != expectedBreaks[i]) {
4230                 errorType = "following()";
4231                 currentBreakData = followingBreaks;
4232             } else if (precedingBreaks[i] != expectedBreaks[i]) {
4233                 errorType = "preceding()";
4234                 currentBreakData = precedingBreaks;
4235             }
4236 
4237             if (errorType != NULL) {
4238                 // Format a range of the test text that includes the failure as
4239                 //  a data item that can be included in the rbbi test data file.
4240 
4241                 // Start of the range is the last point where expected and actual results
4242                 //  both agreed that there was a break position.
4243 
4244                 int startContext = i;
4245                 int32_t count = 0;
4246                 for (;;) {
4247                     if (startContext==0) { break; }
4248                     startContext --;
4249                     if (expectedBreaks[startContext] != 0) {
4250                         if (count == 2) break;
4251                         count ++;
4252                     }
4253                 }
4254 
4255                 // End of range is two expected breaks past the start position.
4256                 int endContext = i + 1;
4257                 int ci;
4258                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4259                     for (;;) {
4260                         if (endContext >= testText.length()) {break;}
4261                         if (expectedBreaks[endContext-1] != 0) {
4262                             if (count == 0) break;
4263                             count --;
4264                         }
4265                         endContext ++;
4266                     }
4267                 }
4268 
4269                 // Formatting of each line includes:
4270                 //   character code
4271                 //   reference break: '|' -> a break, '.' -> no break
4272                 //   actual break:    '|' -> a break, '.' -> no break
4273                 //   (name of character clase)
4274                 //   Unicode name of character
4275                 //   '-->' indicates location of the difference.
4276 
4277                 MONKEY_ERROR(
4278                     (expectedBreaks[i] ? "Break expected but not found" :
4279                        "Break found but not expected"),
4280                     name, i, seed);
4281 
4282                 for (ci=startContext; (ci = testText.moveIndex32(ci, 1));) {
4283                     UChar32  c;
4284                     c = testText.char32At(ci);
4285 
4286                     std::string currentLineFlag = "   ";
4287                     if (ci == i) {
4288                         currentLineFlag = "-->";  // Error position
4289                     }
4290 
4291                     // BMP or SMP character in hex
4292                     char hexCodePoint[12];
4293                     std::string format = "    \\u%04x";
4294                     if (c >= 0x10000) {
4295                         format = "\\U%08x";
4296                     }
4297                     sprintf(hexCodePoint, format.c_str(), c);
4298 
4299                     // Get the class name and character name for the character.
4300                     char cName[200];
4301                     UErrorCode status = U_ZERO_ERROR;
4302                     u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
4303 
4304                     char buffer[200];
4305                     auto ret = snprintf(buffer, UPRV_LENGTHOF(buffer),
4306                              "%4s %3i :  %1s  %1s  %10s  %-*s  %-40s  %-40s",
4307                              currentLineFlag.c_str(),
4308                              ci,
4309                              expectedBreaks[ci] == 0 ? "." : "|",  // Reference break
4310                              currentBreakData[ci] == 0 ? "." : "|",  // Actual break
4311                              hexCodePoint,
4312                              classNameSize,
4313                              mk.classNameFromCodepoint(c).c_str(),
4314                              mk.getAppliedRule(ci).c_str(), cName);
4315                     (void)ret;
4316                     U_ASSERT(0 <= ret && ret < UPRV_LENGTHOF(buffer));
4317 
4318                     // Output the error
4319                     if (ci == i) {
4320                         errln(buffer);
4321                     } else {
4322                         infoln(buffer);
4323                     }
4324 
4325                     if (ci >= endContext) { break; }
4326                 }
4327                 break;
4328             }
4329         }
4330 
4331         loopCount++;
4332     }
4333 #endif
4334 }
4335 
4336 
4337 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
4338 //             This test checks the initial patch,
4339 //             which is to just keep it from crashing.  Correct word boundaries
4340 //             await a proper fix to the dictionary code.
4341 //
TestBug5532(void)4342 void RBBITest::TestBug5532(void)  {
4343    // Text includes a mixture of Thai and Latin.
4344    const unsigned char utf8Data[] = {
4345            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4346            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4347            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4348            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4349            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4350            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4351            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4352            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4353            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4354            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4355            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4356 
4357     UErrorCode status = U_ZERO_ERROR;
4358     UText utext=UTEXT_INITIALIZER;
4359     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4360     TEST_ASSERT_SUCCESS(status);
4361 
4362     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4363     TEST_ASSERT_SUCCESS(status);
4364     if (U_SUCCESS(status)) {
4365         bi->setText(&utext, status);
4366         TEST_ASSERT_SUCCESS(status);
4367 
4368         int32_t breakCount = 0;
4369         int32_t previousBreak = -1;
4370         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4371             // For now, just make sure that the break iterator doesn't hang.
4372             TEST_ASSERT(previousBreak < bi->current());
4373             previousBreak = bi->current();
4374         }
4375         TEST_ASSERT(breakCount > 0);
4376     }
4377     delete bi;
4378     utext_close(&utext);
4379 }
4380 
4381 
TestBug9983(void)4382 void RBBITest::TestBug9983(void)  {
4383     UnicodeString text = UnicodeString("\\u002A"  // * Other
4384                                        "\\uFF65"  //   Other
4385                                        "\\u309C"  //   Katakana
4386                                        "\\uFF9F"  //   Extend
4387                                        "\\uFF65"  //   Other
4388                                        "\\u0020"  //   Other
4389                                        "\\u0000").unescape();
4390 
4391     UErrorCode status = U_ZERO_ERROR;
4392     LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4393         BreakIterator::createWordInstance(Locale::getRoot(), status)));
4394     TEST_ASSERT_SUCCESS(status);
4395     LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4396         BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4397     TEST_ASSERT_SUCCESS(status);
4398     if (U_FAILURE(status)) {
4399         return;
4400     }
4401     int32_t offset, rstatus, iterationCount;
4402 
4403     brkiter->setText(text);
4404     brkiter->last();
4405     iterationCount = 0;
4406     while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4407         iterationCount++;
4408         rstatus = brkiter->getRuleStatus();
4409         (void)rstatus;     // Suppress set but not used warning.
4410         if (iterationCount >= 10) {
4411            break;
4412         }
4413     }
4414     TEST_ASSERT(iterationCount == 6);
4415 
4416     brkiterPOSIX->setText(text);
4417     brkiterPOSIX->last();
4418     iterationCount = 0;
4419     while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4420         iterationCount++;
4421         rstatus = brkiterPOSIX->getRuleStatus();
4422         (void)rstatus;     // Suppress set but not used warning.
4423         if (iterationCount >= 10) {
4424            break;
4425         }
4426     }
4427     TEST_ASSERT(iterationCount == 6);
4428 }
4429 
4430 // Bug 7547 - verify that building a break iterator from empty rules produces an error.
4431 //
TestBug7547()4432 void RBBITest::TestBug7547() {
4433     UnicodeString rules;
4434     UErrorCode status = U_ZERO_ERROR;
4435     UParseError parseError;
4436     RuleBasedBreakIterator breakIterator(rules, parseError, status);
4437     if (status != U_BRK_RULE_SYNTAX) {
4438         errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4439     }
4440     if (parseError.line != 1 || parseError.offset != 0) {
4441         errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4442     }
4443 }
4444 
4445 
TestBug12797()4446 void RBBITest::TestBug12797() {
4447     UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4448     UErrorCode status = U_ZERO_ERROR;
4449     UParseError parseError;
4450     RuleBasedBreakIterator bi(rules, parseError, status);
4451     if (U_FAILURE(status)) {
4452         errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4453         return;
4454     }
4455     UnicodeString text = "abc";
4456     bi.setText(text);
4457     bi.first();
4458     int32_t boundary = bi.next();
4459     if (boundary != 3) {
4460         errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4461     }
4462 }
4463 
TestBug12918()4464 void RBBITest::TestBug12918() {
4465     // This test triggers an assertion failure in dictbe.cpp
4466     const UChar *crasherString = u"\u3325\u4a16";
4467     UErrorCode status = U_ZERO_ERROR;
4468     UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
4469     if (U_FAILURE(status)) {
4470         dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4471         return;
4472     }
4473     ubrk_first(iter);
4474     int32_t pos = 0;
4475     int32_t lastPos = -1;
4476     while((pos = ubrk_next(iter)) != UBRK_DONE) {
4477         if (pos <= lastPos) {
4478             errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4479             break;
4480         }
4481     }
4482     ubrk_close(iter);
4483 }
4484 
TestBug12932()4485 void RBBITest::TestBug12932() {
4486     // Node Stack overflow in the RBBI rule parser caused a seg fault.
4487     UnicodeString ruleStr(
4488             "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4489             "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4490             "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4491             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4492             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4493             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4494 
4495     UErrorCode status = U_ZERO_ERROR;
4496     UParseError parseError;
4497     RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
4498     if (status != U_BRK_RULE_SYNTAX) {
4499         errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4500                 __FILE__, __LINE__, u_errorName(status));
4501     }
4502 }
4503 
4504 
4505 // Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
4506 //             remain undivided by ICU char, word and line break.
TestEmoji()4507 void RBBITest::TestEmoji() {
4508 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4509     UErrorCode  status = U_ZERO_ERROR;
4510 
4511     CharString testFileName;
4512     testFileName.append(IntlTest::getSourceTestData(status), status);
4513     testFileName.appendPathPart("emoji-test.txt", status);
4514     if (U_FAILURE(status)) {
4515         errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4516         return;
4517     }
4518     logln("Opening data file %s\n", testFileName.data());
4519 
4520     int    len;
4521     UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
4522     if (U_FAILURE(status) || testFile == NULL) {
4523         errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4524         return;
4525     }
4526     UnicodeString testFileAsString(testFile, len);
4527     delete [] testFile;
4528 
4529     RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
4530     RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
4531     //           hexMatcher group(1) is a hex number, or empty string if no hex number present.
4532     int32_t lineNumber = 0;
4533 
4534     LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4535     LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4536     LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
4537     if (U_FAILURE(status)) {
4538         dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
4539         return;
4540     }
4541 
4542     while (lineMatcher.find()) {
4543         ++lineNumber;
4544         UnicodeString line = lineMatcher.group(status);
4545         hexMatcher.reset(line);
4546         UnicodeString testString;   // accumulates the emoji sequence.
4547         while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
4548             UnicodeString hex = hexMatcher.group(1, status);
4549             if (hex.length() > 8) {
4550                 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
4551                 break;
4552             }
4553             CharString hex8;
4554             hex8.appendInvariantChars(hex, status);
4555             UChar32 c = (UChar32)strtol(hex8.data(), NULL, 16);
4556             if (c<=0x10ffff) {
4557                 testString.append(c);
4558             } else {
4559                 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4560                         __FILE__, __LINE__, lineNumber, hex8.data());
4561                 break;
4562             }
4563         }
4564 
4565         if (testString.length() > 1) {
4566             charBreaks->setText(testString);
4567             charBreaks->first();
4568             int32_t firstBreak = charBreaks->next();
4569             if (testString.length() != firstBreak) {
4570                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4571                         __FILE__, __LINE__, lineNumber, firstBreak);
4572             }
4573             wordBreaks->setText(testString);
4574             wordBreaks->first();
4575             firstBreak = wordBreaks->next();
4576             if (testString.length() != firstBreak) {
4577                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4578                         __FILE__, __LINE__, lineNumber, firstBreak);
4579             }
4580             lineBreaks->setText(testString);
4581             lineBreaks->first();
4582             firstBreak = lineBreaks->next();
4583             if (testString.length() != firstBreak) {
4584                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4585                         __FILE__, __LINE__, lineNumber, firstBreak);
4586             }
4587         }
4588     }
4589 #endif
4590 }
4591 
4592 
4593 // TestBug12519  -  Correct handling of Locales by assignment / copy / clone
4594 
TestBug12519()4595 void RBBITest::TestBug12519() {
4596     UErrorCode status = U_ZERO_ERROR;
4597     LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4598     LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status));
4599     if (!assertSuccess(WHERE, status)) {
4600         dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
4601         return;
4602     }
4603     assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
4604 
4605     assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
4606     assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
4607 
4608     LocalPointer<RuleBasedBreakIterator>cloneEn(biEn->clone());
4609     assertTrue(WHERE, *biEn == *cloneEn);
4610     assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
4611 
4612     LocalPointer<RuleBasedBreakIterator>cloneFr(biFr->clone());
4613     assertTrue(WHERE, *biFr == *cloneFr);
4614     assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
4615 
4616     LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status));
4617     UnicodeString text("Hallo Welt");
4618     biDe->setText(text);
4619     assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
4620     *biDe = *biFr;
4621     assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
4622 }
4623 
TestBug12677()4624 void RBBITest::TestBug12677() {
4625     // Check that stripping of comments from rules for getRules() is not confused by
4626     // the presence of '#' characters in the rules that do not introduce comments.
4627     UnicodeString rules(u"!!forward; \n"
4628                          "$x = [ab#];  # a set with a # literal. \n"
4629                          " # .;        # a comment that looks sort of like a rule.   \n"
4630                          " '#' '?';    # a rule with a quoted #   \n"
4631                        );
4632 
4633     UErrorCode status = U_ZERO_ERROR;
4634     UParseError pe;
4635     RuleBasedBreakIterator bi(rules, pe, status);
4636     assertSuccess(WHERE, status);
4637     UnicodeString rtRules = bi.getRules();
4638     assertEquals(WHERE, UnicodeString(u"!!forward;$x=[ab#];'#''?';"),  rtRules);
4639 }
4640 
4641 
TestTableRedundancies()4642 void RBBITest::TestTableRedundancies() {
4643     UErrorCode status = U_ZERO_ERROR;
4644 
4645     LocalPointer<RuleBasedBreakIterator> bi (
4646         (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
4647     assertSuccess(WHERE, status);
4648     if (U_FAILURE(status)) return;
4649 
4650     RBBIDataWrapper *dw = bi->fData;
4651     const RBBIStateTable *fwtbl = dw->fForwardTable;
4652     UBool in8Bits = fwtbl->fFlags & RBBI_8BITS_ROWS;
4653     int32_t numCharClasses = dw->fHeader->fCatCount;
4654     // printf("Char Classes: %d     states: %d\n", numCharClasses, fwtbl->fNumStates);
4655 
4656     // Check for duplicate columns (character categories)
4657 
4658     std::vector<UnicodeString> columns;
4659     for (int32_t column = 0; column < numCharClasses; column++) {
4660         UnicodeString s;
4661         for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
4662             RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4663             s.append(in8Bits ? row->r8.fNextState[column] : row->r16.fNextState[column]);
4664         }
4665         columns.push_back(s);
4666     }
4667     // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
4668     for (int c1=1; c1<numCharClasses; c1++) {
4669         int limit = c1 < (int)fwtbl->fDictCategoriesStart ? fwtbl->fDictCategoriesStart : numCharClasses;
4670         for (int c2 = c1+1; c2 < limit; c2++) {
4671             if (columns.at(c1) == columns.at(c2)) {
4672                 errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
4673                 goto out;
4674             }
4675         }
4676     }
4677   out:
4678 
4679     // Check for duplicate states
4680     std::vector<UnicodeString> rows;
4681     for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
4682         UnicodeString s;
4683         RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4684         if (in8Bits) {
4685             s.append(row->r8.fAccepting);
4686             s.append(row->r8.fLookAhead);
4687             s.append(row->r8.fTagsIdx);
4688             for (int32_t column = 0; column < numCharClasses; column++) {
4689                 s.append(row->r8.fNextState[column]);
4690             }
4691         } else {
4692             s.append(row->r16.fAccepting);
4693             s.append(row->r16.fLookAhead);
4694             s.append(row->r16.fTagsIdx);
4695             for (int32_t column = 0; column < numCharClasses; column++) {
4696                 s.append(row->r16.fNextState[column]);
4697             }
4698         }
4699         rows.push_back(s);
4700     }
4701     for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
4702         for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
4703             if (rows.at(r1) == rows.at(r2)) {
4704                 errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
4705                 return;
4706             }
4707         }
4708     }
4709 }
4710 
4711 // Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
4712 //            even after next() has returned DONE.
4713 
TestBug13447()4714 void RBBITest::TestBug13447() {
4715     UErrorCode status = U_ZERO_ERROR;
4716     LocalPointer<RuleBasedBreakIterator> bi(
4717         (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4718     assertSuccess(WHERE, status);
4719     if (U_FAILURE(status)) return;
4720     UnicodeString data(u"1234");
4721     bi->setText(data);
4722     assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus());
4723     assertEquals(WHERE, 4, bi->next());
4724     assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4725     assertEquals(WHERE, UBRK_DONE, bi->next());
4726     assertEquals(WHERE, 4, bi->current());
4727     assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4728 }
4729 
4730 //  TestReverse exercises both the synthesized safe reverse rules and the logic
4731 //  for filling the break iterator cache when starting from random positions
4732 //  in the text.
4733 //
4734 //  It's a monkey test, working on random data, with the expected data obtained
4735 //  from forward iteration (no safe rules involved), comparing with results
4736 //  when indexing into the interior of the string (safe rules needed).
4737 
TestReverse()4738 void RBBITest::TestReverse() {
4739     UErrorCode status = U_ZERO_ERROR;
4740 
4741     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4742             BreakIterator::createCharacterInstance(Locale::getEnglish(), status)));
4743     assertSuccess(WHERE, status, true);
4744     status = U_ZERO_ERROR;
4745     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4746             BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4747     assertSuccess(WHERE, status, true);
4748     status = U_ZERO_ERROR;
4749     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4750             BreakIterator::createLineInstance(Locale::getEnglish(), status)));
4751     assertSuccess(WHERE, status, true);
4752     status = U_ZERO_ERROR;
4753     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4754             BreakIterator::createSentenceInstance(Locale::getEnglish(), status)));
4755     assertSuccess(WHERE, status, true);
4756 }
4757 
TestReverse(std::unique_ptr<RuleBasedBreakIterator> bi)4758 void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
4759     if (!bi) {
4760         return;
4761     }
4762 
4763     // From the mapping trie in the break iterator's internal data, create a
4764     // vector of UnicodeStrings, one for each character category, containing
4765     // all of the code points that map to that category. Unicode planes 0 and 1 only,
4766     // to avoid an execess of unassigned code points.
4767 
4768     RBBIDataWrapper *data = bi->fData;
4769     int32_t categoryCount = data->fHeader->fCatCount;
4770     UCPTrie *trie = data->fTrie;
4771     bool use8BitsTrie = ucptrie_getValueWidth(trie) == UCPTRIE_VALUE_BITS_8;
4772     uint32_t dictBit = use8BitsTrie ? 0x0080 : 0x4000;
4773 
4774     std::vector<UnicodeString> strings(categoryCount, UnicodeString());
4775     for (int cp=0; cp<0x1fff0; ++cp) {
4776         int cat = ucptrie_get(trie, cp);
4777         cat &= ~dictBit;    // And off the dictionary bit from the category.
4778         assertTrue(WHERE, cat < categoryCount && cat >= 0);
4779         if (cat < 0 || cat >= categoryCount) return;
4780         strings[cat].append(cp);
4781     }
4782 
4783     icu_rand randomGen;
4784     const int testStringLength = 10000;
4785     UnicodeString testString;
4786 
4787     for (int i=0; i<testStringLength; ++i) {
4788         int charClass = randomGen() % categoryCount;
4789         if (strings[charClass].length() > 0) {
4790             int cp = strings[charClass].char32At(randomGen() % strings[charClass].length());
4791             testString.append(cp);
4792         }
4793     }
4794 
4795     typedef std::pair<UBool, int32_t> Result;
4796     std::vector<Result> expectedResults;
4797     bi->setText(testString);
4798     for (int i=0; i<testString.length(); ++i) {
4799         bool isboundary = bi->isBoundary(i);
4800         int  ruleStatus = bi->getRuleStatus();
4801         expectedResults.push_back(std::make_pair(isboundary, ruleStatus));
4802     }
4803 
4804     for (int i=testString.length()-1; i>=0; --i) {
4805         bi->setText(testString);   // clears the internal break cache
4806         Result expected = expectedResults[i];
4807         assertEquals(WHERE, expected.first, bi->isBoundary(i));
4808         assertEquals(WHERE, expected.second, bi->getRuleStatus());
4809     }
4810 }
4811 
4812 
4813 // Ticket 13692 - finding word boundaries in very large numbers or words could
4814 //                be very time consuming. When the problem was present, this test
4815 //                would run more than fifteen minutes, which is to say, the failure was noticeable.
4816 
TestBug13692()4817 void RBBITest::TestBug13692() {
4818     UErrorCode status = U_ZERO_ERROR;
4819     LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4820             BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4821     if (!assertSuccess(WHERE, status, true)) {
4822         return;
4823     }
4824     constexpr int32_t LENGTH = 1000000;
4825     UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH);
4826     for (int i=0; i<20; i+=2) {
4827         longNumber.setCharAt(i, u' ');
4828     }
4829     bi->setText(longNumber);
4830     assertFalse(WHERE, bi->isBoundary(LENGTH-5));
4831     assertSuccess(WHERE, status);
4832 }
4833 
4834 
TestProperties()4835 void RBBITest::TestProperties() {
4836     UErrorCode errorCode = U_ZERO_ERROR;
4837     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4838     if (!prependSet.isEmpty()) {
4839         errln(
4840             "[:GCB=Prepend:] is not empty any more. "
4841             "Uncomment relevant lines in source/data/brkitr/char.txt and "
4842             "change this test to the opposite condition.");
4843     }
4844 }
4845 
4846 
4847 //
4848 //  TestDebug    -  A place-holder test for debugging purposes.
4849 //                  For putting in fragments of other tests that can be invoked
4850 //                  for tracing  without a lot of unwanted extra stuff happening.
4851 //
TestDebug(void)4852 void RBBITest::TestDebug(void) {
4853     UErrorCode status = U_ZERO_ERROR;
4854     LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4855             BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4856     if (!assertSuccess(WHERE, status, true)) {
4857         return;
4858     }
4859     const UnicodeString &rules = bi->getRules();
4860     UParseError pe;
4861     LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
4862     assertSuccess(WHERE, status);
4863 }
4864 
4865 
4866 //
4867 //  TestDebugRules   A stub test for use in debugging rule compilation problems.
4868 //                   Can be freely altered as needed or convenient.
4869 //                   Leave disabled - #ifdef'ed out - when not actively debugging. The rule source
4870 //                   data files may not be available in all environments.
4871 //                   Any permanent test cases should be moved to rbbitst.txt
4872 //                   (see Bug 20303 in that file, for example), or to another test function in this file.
4873 //
TestDebugRules()4874 void RBBITest::TestDebugRules() {
4875 #if 0
4876     const char16_t *rules = u""
4877         "!!quoted_literals_only; \n"
4878         "!!chain; \n"
4879         "!!lookAheadHardBreak; \n"
4880         " \n"
4881         // "[a] / ; \n"
4882         "[a] [b] / [c] [d]; \n"
4883         "[a] [b] / [c] [d] {100}; \n"
4884         "[x] [a] [b] / [c] [d] {100}; \n"
4885         "[a] [b] [c] / [d] {100}; \n"
4886         //" [c] [d] / [e] [f]; \n"
4887         //"[a] [b] / [c]; \n"
4888         ;
4889 
4890     UErrorCode status = U_ZERO_ERROR;
4891     CharString path(pathToDataDirectory(), status);
4892     path.appendPathPart("brkitr", status);
4893     path.appendPathPart("rules", status);
4894     path.appendPathPart("line.txt", status);
4895     int    len;
4896     std::unique_ptr<UChar []> testFile(ReadAndConvertFile(path.data(), len, "UTF-8", status));
4897     if (!assertSuccess(WHERE, status)) {
4898         return;
4899     }
4900 
4901     UParseError pe;
4902     // rules = testFile.get();
4903     RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rules, pe, status);
4904 
4905     if (!assertSuccess(WHERE, status)) {
4906         delete bi;
4907         return;
4908     }
4909     // bi->dumpTables();
4910 
4911     delete bi;
4912 #endif
4913 }
4914 
testTrieStateTable(int32_t numChar,bool expectedTrieWidthIn8Bits,bool expectedStateRowIn8Bits)4915 void RBBITest::testTrieStateTable(int32_t numChar, bool expectedTrieWidthIn8Bits, bool expectedStateRowIn8Bits) {
4916     UCPTrieValueWidth expectedTrieWidth = expectedTrieWidthIn8Bits ? UCPTRIE_VALUE_BITS_8 : UCPTRIE_VALUE_BITS_16;
4917     int32_t expectedStateRowBits = expectedStateRowIn8Bits ? RBBI_8BITS_ROWS : 0;
4918     // Text are duplicate characters from U+4E00 to U+4FFF
4919     UnicodeString text;
4920     for (UChar c = 0x4e00; c < 0x5000; c++) {
4921         text.append(c).append(c);
4922     }
4923     // Generate rule which will caused length+4 character classes and
4924     // length+3 states
4925     UnicodeString rules(u"!!quoted_literals_only;");
4926     for (UChar c = 0x4e00; c < 0x4e00 + numChar; c++) {
4927         rules.append(u'\'').append(c).append(c).append(u"';");
4928     }
4929     rules.append(u".;");
4930     UErrorCode status = U_ZERO_ERROR;
4931     UParseError parseError;
4932     RuleBasedBreakIterator bi(rules, parseError, status);
4933 
4934     assertEquals(WHERE, numChar + 4, bi.fData->fHeader->fCatCount);
4935     assertEquals(WHERE, numChar + 3, bi.fData->fForwardTable->fNumStates);
4936     assertEquals(WHERE, expectedTrieWidth, ucptrie_getValueWidth(bi.fData->fTrie));
4937     assertEquals(WHERE, expectedStateRowBits, bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS);
4938     assertEquals(WHERE, expectedStateRowBits, bi.fData->fReverseTable->fFlags & RBBI_8BITS_ROWS);
4939 
4940     bi.setText(text);
4941 
4942     int32_t pos;
4943     int32_t i = 0;
4944     while ((pos = bi.next()) > 0) {
4945         // The first numChar should not break between the pair
4946         if (i++ < numChar) {
4947             assertEquals(WHERE, i * 2, pos);
4948         } else {
4949             // After the first numChar next(), break on each character.
4950             assertEquals(WHERE, i + numChar, pos);
4951         }
4952     }
4953     while ((pos = bi.previous()) > 0) {
4954         // The first numChar should not break between the pair
4955         if (--i < numChar) {
4956             assertEquals(WHERE, i * 2, pos);
4957         } else {
4958             // After the first numChar next(), break on each character.
4959             assertEquals(WHERE, i + numChar, pos);
4960         }
4961     }
4962 }
4963 
Test8BitsTrieWith8BitStateTable()4964 void RBBITest::Test8BitsTrieWith8BitStateTable() {
4965     testTrieStateTable(251, true /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
4966 }
4967 
Test16BitsTrieWith8BitStateTable()4968 void RBBITest::Test16BitsTrieWith8BitStateTable() {
4969     testTrieStateTable(252, false /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
4970 }
4971 
Test16BitsTrieWith16BitStateTable()4972 void RBBITest::Test16BitsTrieWith16BitStateTable() {
4973     testTrieStateTable(253, false /* expectedTrieWidthIn8Bits */, false /* expectedStateRowIn8Bits */);
4974 }
4975 
Test8BitsTrieWith16BitStateTable()4976 void RBBITest::Test8BitsTrieWith16BitStateTable() {
4977     // Test UCPTRIE_VALUE_BITS_8 with 16 bits rows. Use a different approach to
4978     // create state table in 16 bits.
4979 
4980     // Generate 510 'a' as text
4981     UnicodeString text;
4982     for (int32_t i = 0; i < 510; i++) {
4983         text.append(u'a');
4984     }
4985 
4986     UnicodeString rules(u"!!quoted_literals_only;'");
4987     // 254 'a' in the rule will cause 256 states
4988     for (int32_t i = 0; i < 254; i++) {
4989         rules.append(u'a');
4990     }
4991     rules.append(u"';.;");
4992 
4993     UErrorCode status = U_ZERO_ERROR;
4994     UParseError parseError;
4995     LocalPointer<RuleBasedBreakIterator> bi(new RuleBasedBreakIterator(rules, parseError, status));
4996 
4997     assertEquals(WHERE, 256, bi->fData->fForwardTable->fNumStates);
4998     assertEquals(WHERE, UCPTRIE_VALUE_BITS_8, ucptrie_getValueWidth(bi->fData->fTrie));
4999     assertEquals(WHERE,
5000                  false, RBBI_8BITS_ROWS == (bi->fData->fForwardTable->fFlags & RBBI_8BITS_ROWS));
5001     bi->setText(text);
5002 
5003     // break positions:
5004     // 254, 508, 509, ... 510
5005     assertEquals("next()", 254, bi->next());
5006     int32_t i = 0;
5007     int32_t pos;
5008     while ((pos = bi->next()) > 0) {
5009         assertEquals(WHERE, 508 + i , pos);
5010         i++;
5011     }
5012     i = 0;
5013     while ((pos = bi->previous()) > 0) {
5014         i++;
5015         if (pos >= 508) {
5016             assertEquals(WHERE, 510 - i , pos);
5017         } else {
5018             assertEquals(WHERE, 254 , pos);
5019         }
5020     }
5021 }
5022 
5023 // Test that both compact (8 bit) and full sized (16 bit) rbbi tables work, and
5024 // that there are no problems with rules at the size that transitions between the two.
5025 //
5026 // A rule that matches a literal string, like 'abcdefghij', will require one state and
5027 // one character class per character in the string. So we can make a rule to tickle the
5028 // boundaries by using literal strings of various lengths.
5029 //
5030 // For both the number of states and the number of character classes, the eight bit format
5031 // only has 7 bits available, allowing for 128 values. For both, a few values are reserved,
5032 // leaving 120 something available. This test runs the string over the range of 120 - 260,
5033 // which allows some margin for changes to the number of values reserved by the rule builder
5034 // without breaking the test.
5035 
TestTable_8_16_Bits()5036 void RBBITest::TestTable_8_16_Bits() {
5037 
5038     // testStr serves as both the source of the rule string (truncated to the desired length)
5039     // and as test data to check matching behavior. A break rule consisting of the first 120
5040     // characters of testStr will match the first 120 chars of the full-length testStr.
5041     UnicodeString testStr;
5042     for (UChar c=0x3000; c<0x3200; ++c) {
5043         testStr.append(c);
5044     }
5045 
5046     const int32_t startLength = 120;   // The shortest rule string to test.
5047     const int32_t endLength = 260;     // The longest rule string to test
5048     const int32_t increment = this->quick ? endLength - startLength : 1;
5049 
5050     for (int32_t ruleLen=startLength; ruleLen <= endLength; ruleLen += increment) {
5051         UParseError parseError;
5052         UErrorCode status = U_ZERO_ERROR;
5053 
5054         UnicodeString ruleString{u"!!quoted_literals_only; '#';"};
5055         ruleString.findAndReplace(UnicodeString(u"#"), UnicodeString(testStr, 0, ruleLen));
5056         RuleBasedBreakIterator bi(ruleString, parseError, status);
5057         if (!assertSuccess(WHERE, status)) {
5058             errln(ruleString);
5059             break;
5060         }
5061         // bi.dumpTables();
5062 
5063         // Verify that the break iterator is functioning - that the first boundary found
5064         // in testStr is at the length of the rule string.
5065         bi.setText(testStr);
5066         assertEquals(WHERE, ruleLen, bi.next());
5067 
5068         // Reverse iteration. Do a setText() first, to flush the break iterator's internal cache
5069         // of previously detected boundaries, thus forcing the engine to run the safe reverse rules.
5070         bi.setText(testStr);
5071         int32_t result = bi.preceding(ruleLen);
5072         assertEquals(WHERE, 0, result);
5073 
5074         // Verify that the range of rule lengths being tested cover the transations
5075         // from 8 to 16 bit data.
5076         bool has8BitRowData = bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS;
5077         bool has8BitsTrie = ucptrie_getValueWidth(bi.fData->fTrie) == UCPTRIE_VALUE_BITS_8;
5078 
5079         if (ruleLen == startLength) {
5080             assertEquals(WHERE, true, has8BitRowData);
5081             assertEquals(WHERE, true, has8BitsTrie);
5082         }
5083         if (ruleLen == endLength) {
5084             assertEquals(WHERE, false, has8BitRowData);
5085             assertEquals(WHERE, false, has8BitsTrie);
5086         }
5087     }
5088 }
5089 
5090 /* Test handling of a large number of look-ahead rules.
5091  * The number of rules in the test exceeds the implementation limits prior to the
5092  * improvements introduced with #13590.
5093  *
5094  * The test look-ahead rules have the form "AB / CE"; "CD / EG"; ...
5095  * The text being matched is sequential, "ABCDEFGHI..."
5096  *
5097  * The upshot is that the look-ahead rules all match on their preceding context,
5098  * and consequently must save a potential result, but then fail to match on their
5099  * trailing context, so that they don't actually cause a boundary.
5100  *
5101  * Additionally, add a ".*" rule, so there are no boundaries unless a
5102  * look-ahead hard-break rule forces one.
5103  */
TestBug13590()5104 void RBBITest::TestBug13590() {
5105     UnicodeString rules {u"!!quoted_literals_only; !!chain; .*;\n"};
5106 
5107     const int NUM_LOOKAHEAD_RULES = 50;
5108     const char16_t STARTING_CHAR = u'\u5000';
5109     char16_t firstChar;
5110     for (int ruleNum = 0; ruleNum < NUM_LOOKAHEAD_RULES; ++ruleNum) {
5111         firstChar = STARTING_CHAR + ruleNum*2;
5112         rules.append(u'\'') .append(firstChar) .append(firstChar+1) .append(u'\'')
5113              .append(u' ') .append(u'/') .append(u' ')
5114              .append(u'\'') .append(firstChar+2) .append(firstChar+4) .append(u'\'')
5115              .append(u';') .append(u'\n');
5116     }
5117 
5118     // Change the last rule added from the form "UV / WY" to "UV / WX".
5119     // Changes the rule so that it will match - all 4 chars are in ascending sequence.
5120     rules.findAndReplace(UnicodeString(firstChar+4), UnicodeString(firstChar+3));
5121 
5122     UErrorCode status = U_ZERO_ERROR;
5123     UParseError parseError;
5124     RuleBasedBreakIterator bi(rules, parseError, status);
5125     if (!assertSuccess(WHERE, status)) {
5126         errln(rules);
5127         return;
5128     }
5129     // bi.dumpTables();
5130 
5131     UnicodeString testString;
5132     for (char16_t c = STARTING_CHAR-200; c < STARTING_CHAR + NUM_LOOKAHEAD_RULES*4; ++c) {
5133         testString.append(c);
5134     }
5135     bi.setText(testString);
5136 
5137     int breaksFound = 0;
5138     while (bi.next() != UBRK_DONE) {
5139         ++breaksFound;
5140     }
5141 
5142     // Two matches are expected, one from the last rule that was explicitly modified,
5143     // and one at the end of the text.
5144     assertEquals(WHERE, 2, breaksFound);
5145 }
5146 
5147 
5148 #if U_ENABLE_TRACING
// Recordings made by the trace callbacks below; SetupTestTrace() empties all four.
// Function numbers passed to traceEntry(), traceExit() and traceData() respectively:
static std::vector<int32_t> gEntryFn;
static std::vector<int32_t> gExitFn;
static std::vector<int32_t> gDataFn;
// String argument captured by each recorded traceData() call (parallel to gDataFn):
static std::vector<std::string> gData;
5153 
traceData(const void *,int32_t fnNumber,int32_t,const char *,va_list args)5154 static void U_CALLCONV traceData(
5155         const void*,
5156         int32_t fnNumber,
5157         int32_t,
5158         const char *,
5159         va_list args) {
5160     if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5161         const char* data = va_arg(args, const char*);
5162         gDataFn.push_back(fnNumber);
5163         gData.push_back(data);
5164     }
5165 }
5166 
traceEntry(const void *,int32_t fnNumber)5167 static void traceEntry(const void *, int32_t fnNumber) {
5168     if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5169         gEntryFn.push_back(fnNumber);
5170     }
5171 }
5172 
traceExit(const void *,int32_t fnNumber,const char *,va_list)5173 static void traceExit(const void *, int32_t fnNumber, const char *, va_list) {
5174     if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5175         gExitFn.push_back(fnNumber);
5176     }
5177 }
5178 
5179 
assertTestTraceResult(int32_t fnNumber,const char * expectedData)5180 void RBBITest::assertTestTraceResult(int32_t fnNumber, const char* expectedData) {
5181     assertEquals("utrace_entry should be called ", 1, gEntryFn.size());
5182     assertEquals("utrace_entry should be called with ", fnNumber, gEntryFn[0]);
5183     assertEquals("utrace_exit should be called ", 1, gExitFn.size());
5184     assertEquals("utrace_exit should be called with ", fnNumber, gExitFn[0]);
5185 
5186     if (expectedData == nullptr) {
5187       assertEquals("utrace_data should not be called ", 0, gDataFn.size());
5188       assertEquals("utrace_data should not be called ", 0, gData.size());
5189     } else {
5190       assertEquals("utrace_data should be called ", 1, gDataFn.size());
5191       assertEquals("utrace_data should be called with ", fnNumber, gDataFn[0]);
5192       assertEquals("utrace_data should be called ", 1, gData.size());
5193       assertEquals("utrace_data should pass in ", expectedData, gData[0].c_str());
5194     }
5195 }
5196 
SetupTestTrace()5197 void SetupTestTrace() {
5198     gEntryFn.clear();
5199     gExitFn.clear();
5200     gDataFn.clear();
5201     gData.clear();
5202 
5203     const void* context = nullptr;
5204     utrace_setFunctions(context, traceEntry, traceExit, traceData);
5205     utrace_setLevel(UTRACE_INFO);
5206 }
5207 
TestTraceCreateCharacter(void)5208 void RBBITest::TestTraceCreateCharacter(void) {
5209     SetupTestTrace();
5210     IcuTestErrorCode status(*this, "TestTraceCreateCharacter");
5211     LocalPointer<BreakIterator> brkitr(
5212         BreakIterator::createCharacterInstance("zh-CN", status));
5213     status.errIfFailureAndReset();
5214     assertTestTraceResult(UTRACE_UBRK_CREATE_CHARACTER, nullptr);
5215 }
5216 
TestTraceCreateTitle(void)5217 void RBBITest::TestTraceCreateTitle(void) {
5218     SetupTestTrace();
5219     IcuTestErrorCode status(*this, "TestTraceCreateTitle");
5220     LocalPointer<BreakIterator> brkitr(
5221         BreakIterator::createTitleInstance("zh-CN", status));
5222     status.errIfFailureAndReset();
5223     assertTestTraceResult(UTRACE_UBRK_CREATE_TITLE, nullptr);
5224 }
5225 
TestTraceCreateSentence(void)5226 void RBBITest::TestTraceCreateSentence(void) {
5227     SetupTestTrace();
5228     IcuTestErrorCode status(*this, "TestTraceCreateSentence");
5229     LocalPointer<BreakIterator> brkitr(
5230         BreakIterator::createSentenceInstance("zh-CN", status));
5231     status.errIfFailureAndReset();
5232     assertTestTraceResult(UTRACE_UBRK_CREATE_SENTENCE, nullptr);
5233 }
5234 
TestTraceCreateWord(void)5235 void RBBITest::TestTraceCreateWord(void) {
5236     SetupTestTrace();
5237     IcuTestErrorCode status(*this, "TestTraceCreateWord");
5238     LocalPointer<BreakIterator> brkitr(
5239         BreakIterator::createWordInstance("zh-CN", status));
5240     status.errIfFailureAndReset();
5241     assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5242 }
5243 
TestTraceCreateLine(void)5244 void RBBITest::TestTraceCreateLine(void) {
5245     SetupTestTrace();
5246     IcuTestErrorCode status(*this, "TestTraceCreateLine");
5247     LocalPointer<BreakIterator> brkitr(
5248         BreakIterator::createLineInstance("zh-CN", status));
5249     status.errIfFailureAndReset();
5250     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "");
5251 }
5252 
TestTraceCreateLineStrict(void)5253 void RBBITest::TestTraceCreateLineStrict(void) {
5254     SetupTestTrace();
5255     IcuTestErrorCode status(*this, "TestTraceCreateLineStrict");
5256     LocalPointer<BreakIterator> brkitr(
5257         BreakIterator::createLineInstance("zh-CN-u-lb-strict", status));
5258     status.errIfFailureAndReset();
5259     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "strict");
5260 }
5261 
TestTraceCreateLineNormal(void)5262 void RBBITest::TestTraceCreateLineNormal(void) {
5263     SetupTestTrace();
5264     IcuTestErrorCode status(*this, "TestTraceCreateLineNormal");
5265     LocalPointer<BreakIterator> brkitr(
5266         BreakIterator::createLineInstance("zh-CN-u-lb-normal", status));
5267     status.errIfFailureAndReset();
5268     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "normal");
5269 }
5270 
TestTraceCreateLineLoose(void)5271 void RBBITest::TestTraceCreateLineLoose(void) {
5272     SetupTestTrace();
5273     IcuTestErrorCode status(*this, "TestTraceCreateLineLoose");
5274     LocalPointer<BreakIterator> brkitr(
5275         BreakIterator::createLineInstance("zh-CN-u-lb-loose", status));
5276     status.errIfFailureAndReset();
5277     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "loose");
5278 }
5279 
TestTraceCreateBreakEngine(void)5280 void RBBITest::TestTraceCreateBreakEngine(void) {
5281     rbbi_cleanup();
5282     SetupTestTrace();
5283     IcuTestErrorCode status(*this, "TestTraceCreateBreakEngine");
5284     LocalPointer<BreakIterator> brkitr(
5285         BreakIterator::createWordInstance("zh-CN", status));
5286     status.errIfFailureAndReset();
5287     assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5288 
5289     // To word break the following text, BreakIterator will create 5 dictionary
5290     // break engine internally.
5291     brkitr->setText(
5292         u"test "
5293         u"測試 " // Hani
5294         u"សាកល្បង " // Khmr
5295         u"ທົດສອບ " // Laoo
5296         u"စမ်းသပ်မှု " // Mymr
5297         u"ทดสอบ " // Thai
5298         u"test "
5299     );
5300 
5301     // Loop through all the text.
5302     while (brkitr->next() > 0) ;
5303 
5304     assertEquals("utrace_entry should be called ", 6, gEntryFn.size());
5305     assertEquals("utrace_exit should be called ", 6, gExitFn.size());
5306     assertEquals("utrace_data should be called ", 5, gDataFn.size());
5307 
5308     for (std::vector<int>::size_type i = 0; i < gDataFn.size(); i++) {
5309         assertEquals("utrace_entry should be called ",
5310                      UTRACE_UBRK_CREATE_BREAK_ENGINE, gEntryFn[i+1]);
5311         assertEquals("utrace_exit should be called ",
5312                      UTRACE_UBRK_CREATE_BREAK_ENGINE, gExitFn[i+1]);
5313         assertEquals("utrace_data should be called ",
5314                      UTRACE_UBRK_CREATE_BREAK_ENGINE, gDataFn[i]);
5315     }
5316 
5317     assertEquals("utrace_data should pass ", "Hani", gData[0].c_str());
5318     assertEquals("utrace_data should pass ", "Khmr", gData[1].c_str());
5319     assertEquals("utrace_data should pass ", "Laoo", gData[2].c_str());
5320     assertEquals("utrace_data should pass ", "Mymr", gData[3].c_str());
5321     assertEquals("utrace_data should pass ", "Thai", gData[4].c_str());
5322 
5323 }
5324 #endif
5325 
5326 #endif // #if !UCONFIG_NO_BREAK_ITERATION
5327