• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4  * COPYRIGHT:
5  * Copyright (c) 1999-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  ********************************************************************/
8 /************************************************************************
9 *   Date        Name        Description
10 *   12/15/99    Madhu        Creation.
11 *   01/12/2000  Madhu        Updated for changed API and added new tests
12 ************************************************************************/
13 
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_BREAK_ITERATION
16 
17 #include <sstream>
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #include <utility>
22 #include <vector>
23 
24 #include "unicode/brkiter.h"
25 #include "unicode/localpointer.h"
26 #include "unicode/numfmt.h"
27 #include "unicode/rbbi.h"
28 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
29 #include "unicode/regex.h"
30 #endif
31 #include "unicode/schriter.h"
32 #include "unicode/uchar.h"
33 #include "unicode/utf16.h"
34 #include "unicode/ucnv.h"
35 #include "unicode/uniset.h"
36 #include "unicode/uscript.h"
37 #include "unicode/ustring.h"
38 #include "unicode/utext.h"
39 #include "unicode/utrace.h"
40 
41 #include "charstr.h"
42 #include "cmemory.h"
43 #include "cstr.h"
44 #include "intltest.h"
45 #include "rbbitst.h"
46 #include "rbbidata.h"
47 #include "utypeinfo.h"  // for 'typeid' to work
48 #include "uvector.h"
49 #include "uvectr32.h"
50 
51 
52 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
53 #include "unicode/filteredbrk.h"
54 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
55 
// TEST_ASSERT(condition)
//   Reports a failure, tagged with the source file and line of the expansion site,
//   when `condition` evaluates to false. The condition is evaluated exactly once.
//   errln() is called unqualified, so it must be visible where the macro is expanded.
#define TEST_ASSERT(condition) \
    UPRV_BLOCK_MACRO_BEGIN { \
        if (!(condition)) { \
            errln("Failure in file %s, line %d", __FILE__, __LINE__); \
        } \
    } UPRV_BLOCK_MACRO_END
61 
// TEST_ASSERT_SUCCESS(errorCode)
//   Reports a failure through errcheckln(), including the file, line and the
//   u_errorName() of the code, when `errorCode` is an ICU failure code.
//   Does nothing for success and warning codes.
#define TEST_ASSERT_SUCCESS(errorCode) \
    UPRV_BLOCK_MACRO_BEGIN { \
        if (U_FAILURE(errorCode)) { \
            errcheckln(errorCode, "Failure in file %s, line %d, status = \"%s\"", \
                       __FILE__, __LINE__, u_errorName(errorCode)); \
        } \
    } UPRV_BLOCK_MACRO_END
67 
// MONKEY_ERROR(message, ruleFileName, position, seed)
//   Logs an error through IntlTest::gTest. The text contains the failure message, the
//   index at which it occurred, and a "type=... seed=... loop=1" parameter string that
//   is meant to be used to reproduce the failing run.
//   Note: expands to a plain brace block, not a UPRV_BLOCK_MACRO_BEGIN/END pair.
#define MONKEY_ERROR(message, ruleFileName, position, seed) { \
    IntlTest::gTest->errln( \
        "\n%s:%d %s at index %d. Parameters to reproduce: @\"type=%s seed=%u loop=1\"", \
        __FILE__, __LINE__, message, position, ruleFileName, seed); \
}
72 
73 //---------------------------------------------
74 // runIndexedTest
75 //---------------------------------------------
76 
77 
//  Note:  Before adding new tests to this file, check whether the desired test data can
//         simply be added to the file testdata/rbbitst.txt.  In most cases it can;
//         that is much less work than writing a new test, the diagnostic output in the
//         event of failures is good, and the test data file is shared with ICU4J, so
//         eventually the test will run there as well, without additional effort.
83 
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)84 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
85 {
86     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
87     fTestParams = params;
88 
89     TESTCASE_AUTO_BEGIN;
90 #if !UCONFIG_NO_FILE_IO
91     TESTCASE_AUTO(TestBug4153072);
92 #endif
93 #if !UCONFIG_NO_FILE_IO
94     TESTCASE_AUTO(TestUnicodeFiles);
95 #endif
96     TESTCASE_AUTO(TestGetAvailableLocales);
97     TESTCASE_AUTO(TestGetDisplayName);
98 #if !UCONFIG_NO_FILE_IO
99     TESTCASE_AUTO(TestEndBehaviour);
100     TESTCASE_AUTO(TestWordBreaks);
101     TESTCASE_AUTO(TestWordBoundary);
102     TESTCASE_AUTO(TestLineBreaks);
103     TESTCASE_AUTO(TestSentBreaks);
104     TESTCASE_AUTO(TestExtended);
105 #endif
106 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
107     TESTCASE_AUTO(TestMonkey);
108 #endif
109 #if !UCONFIG_NO_FILE_IO
110     TESTCASE_AUTO(TestBug3818);
111 #endif
112     TESTCASE_AUTO(TestDebug);
113 #if !UCONFIG_NO_FILE_IO
114     TESTCASE_AUTO(TestBug5775);
115 #endif
116     TESTCASE_AUTO(TestBug9983);
117     TESTCASE_AUTO(TestDictRules);
118     TESTCASE_AUTO(TestBug5532);
119     TESTCASE_AUTO(TestBug7547);
120     TESTCASE_AUTO(TestBug12797);
121     TESTCASE_AUTO(TestBug12918);
122     TESTCASE_AUTO(TestBug12932);
123     TESTCASE_AUTO(TestEmoji);
124     TESTCASE_AUTO(TestBug12519);
125     TESTCASE_AUTO(TestBug12677);
126     TESTCASE_AUTO(TestTableRedundancies);
127     TESTCASE_AUTO(TestBug13447);
128     TESTCASE_AUTO(TestReverse);
129     TESTCASE_AUTO(TestBug13692);
130     TESTCASE_AUTO(TestDebugRules);
131 
132 #if U_ENABLE_TRACING
133     TESTCASE_AUTO(TestTraceCreateCharacter);
134     TESTCASE_AUTO(TestTraceCreateWord);
135     TESTCASE_AUTO(TestTraceCreateSentence);
136     TESTCASE_AUTO(TestTraceCreateTitle);
137     TESTCASE_AUTO(TestTraceCreateLine);
138     TESTCASE_AUTO(TestTraceCreateLineNormal);
139     TESTCASE_AUTO(TestTraceCreateLineLoose);
140     TESTCASE_AUTO(TestTraceCreateLineStrict);
141     TESTCASE_AUTO(TestTraceCreateBreakEngine);
142 #endif
143 
144     TESTCASE_AUTO_END;
145 }
146 
147 
148 //--------------------------------------------------------------------------------------
149 //
150 //    RBBITest    constructor and destructor
151 //
152 //--------------------------------------------------------------------------------------
153 
RBBITest()154 RBBITest::RBBITest() {
155     fTestParams = NULL;
156 }
157 
158 
~RBBITest()159 RBBITest::~RBBITest() {
160 }
161 
162 
printStringBreaks(UText * tstr,int expected[],int expectedCount)163 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
164     UErrorCode status = U_ZERO_ERROR;
165     char name[100];
166     printf("code    alpha extend alphanum type word sent line name\n");
167     int nextExpectedIndex = 0;
168     utext_setNativeIndex(tstr, 0);
169     for (int j = 0; j < static_cast<int>(utext_nativeLength(tstr)); j=static_cast<int>(utext_getNativeIndex(tstr))) {
170         if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
171             printf("------------------------------------------------ %d\n", j);
172             ++nextExpectedIndex;
173         }
174 
175         UChar32 c = utext_next32(tstr);
176         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
177         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
178                            u_isUAlphabetic(c),
179                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
180                            u_isalnum(c),
181                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
182                                                   u_charType(c),
183                                                   U_SHORT_PROPERTY_NAME),
184                            u_getPropertyValueName(UCHAR_WORD_BREAK,
185                                                   u_getIntPropertyValue(c,
186                                                           UCHAR_WORD_BREAK),
187                                                   U_SHORT_PROPERTY_NAME),
188                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
189                                    u_getIntPropertyValue(c,
190                                            UCHAR_SENTENCE_BREAK),
191                                    U_SHORT_PROPERTY_NAME),
192                            u_getPropertyValueName(UCHAR_LINE_BREAK,
193                                    u_getIntPropertyValue(c,
194                                            UCHAR_LINE_BREAK),
195                                    U_SHORT_PROPERTY_NAME),
196                            name);
197     }
198 }
199 
200 
printStringBreaks(const UnicodeString & ustr,int expected[],int expectedCount)201 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
202    UErrorCode status = U_ZERO_ERROR;
203    UText *tstr = NULL;
204    tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
205    if (U_FAILURE(status)) {
206        printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
207        return;
208     }
209    printStringBreaks(tstr, expected, expectedCount);
210    utext_close(tstr);
211 }
212 
213 
TestBug3818()214 void RBBITest::TestBug3818() {
215     UErrorCode  status = U_ZERO_ERROR;
216 
217     // Four Thai words...
218     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
219                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
220     UnicodeString  thaiStr(thaiWordData);
221 
222     BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
223     if (U_FAILURE(status) || bi == NULL) {
224         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
225         return;
226     }
227     bi->setText(thaiStr);
228 
229     int32_t  startOfSecondWord = bi->following(1);
230     if (startOfSecondWord != 4) {
231         errln("Fail at file %s, line %d expected start of word at 4, got %d",
232             __FILE__, __LINE__, startOfSecondWord);
233     }
234     startOfSecondWord = bi->following(0);
235     if (startOfSecondWord != 4) {
236         errln("Fail at file %s, line %d expected start of word at 4, got %d",
237             __FILE__, __LINE__, startOfSecondWord);
238     }
239     delete bi;
240 }
241 
242 
243 //---------------------------------------------
244 //
245 //     other tests
246 //
247 //---------------------------------------------
248 
TestGetAvailableLocales()249 void RBBITest::TestGetAvailableLocales()
250 {
251     int32_t locCount = 0;
252     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
253 
254     if (locCount == 0)
255         dataerrln("getAvailableLocales() returned an empty list!");
256     // Just make sure that it's returning good memory.
257     int32_t i;
258     for (i = 0; i < locCount; ++i) {
259         logln(locList[i].getName());
260     }
261 }
262 
263 //Testing the BreakIterator::getDisplayName() function
TestGetDisplayName()264 void RBBITest::TestGetDisplayName()
265 {
266     UnicodeString   result;
267 
268     BreakIterator::getDisplayName(Locale::getUS(), result);
269     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
270         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
271                 + result);
272 
273     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
274     if (result != "French (France)")
275         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
276                 + result);
277 }
278 /**
279  * Test End Behaviour
280  * @bug 4068137
281  */
TestEndBehaviour()282 void RBBITest::TestEndBehaviour()
283 {
284     UErrorCode status = U_ZERO_ERROR;
285     UnicodeString testString("boo.");
286     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
287     if (U_FAILURE(status))
288     {
289         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
290         return;
291     }
292     wb->setText(testString);
293 
294     if (wb->first() != 0)
295         errln("Didn't get break at beginning of string.");
296     if (wb->next() != 3)
297         errln("Didn't get break before period in \"boo.\"");
298     if (wb->current() != 4 && wb->next() != 4)
299         errln("Didn't get break at end of string.");
300     delete wb;
301 }
302 /*
303  * @bug 4153072
304  */
TestBug4153072()305 void RBBITest::TestBug4153072() {
306     UErrorCode status = U_ZERO_ERROR;
307     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
308     if (U_FAILURE(status))
309     {
310         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
311         return;
312     }
313     UnicodeString str("...Hello, World!...");
314     int32_t begin = 3;
315     int32_t end = str.length() - 3;
316     UBool onBoundary;
317 
318     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
319     iter->adoptText(textIterator);
320     int index;
321     // Note: with the switch to UText, there is no way to restrict the
322     //       iteration range to begin at an index other than zero.
323     //       String character iterators created with a non-zero bound are
324     //         treated by RBBI as being empty.
325     for (index = -1; index < begin + 1; ++index) {
326         onBoundary = iter->isBoundary(index);
327         if (index == 0?  !onBoundary : onBoundary) {
328             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
329                             " and begin index = " + begin);
330         }
331     }
332     delete iter;
333 }
334 
335 
336 //
337 // Test for problem reported by Ashok Matoria on 9 July 2007
338 //    One.<kSoftHyphen><kSpace>Two.
339 //
340 //    Sentence break at start (0) and then on calling next() it breaks at
341 //   'T' of "Two". Now, at this point if I do next() and
342 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
343 //
TestBug5775()344 void RBBITest::TestBug5775() {
345     UErrorCode status = U_ZERO_ERROR;
346     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
347     TEST_ASSERT_SUCCESS(status);
348     if (U_FAILURE(status)) {
349         return;
350     }
351 // Check for status first for better handling of no data errors.
352     TEST_ASSERT(bi != NULL);
353     if (bi == NULL) {
354         return;
355     }
356 
357     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
358     //               01234      56789
359     s = s.unescape();
360     bi->setText(s);
361     int pos = bi->next();
362     TEST_ASSERT(pos == 6);
363     pos = bi->next();
364     TEST_ASSERT(pos == 10);
365     pos = bi->previous();
366     TEST_ASSERT(pos == 6);
367     delete bi;
368 }
369 
370 
371 
372 //------------------------------------------------------------------------------
373 //
374 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
375 //
376 //------------------------------------------------------------------------------
377 
378 struct TestParams {
379     BreakIterator   *bi;                   // Break iterator is set while parsing test source.
380                                            //   Changed out whenever test data changes break type.
381 
382     UnicodeString    dataToBreak;          // Data that is built up while parsing the test.
383     UVector32       *expectedBreaks;       // Expected break positions, matches dataToBreak UnicodeString.
384     UVector32       *srcLine;              // Positions in source file, indexed same as dataToBreak.
385     UVector32       *srcCol;
386 
387     UText           *textToBreak;          // UText, could be UTF8 or UTF16.
388     UVector32       *textMap;              // Map from UTF-16 dataToBreak offsets to UText offsets.
389     CharString       utf8String;           // UTF-8 form of text to break.
390 
TestParamsTestParams391     TestParams(UErrorCode &status) : dataToBreak() {
392         bi               = NULL;
393         expectedBreaks   = new UVector32(status);
394         srcLine          = new UVector32(status);
395         srcCol           = new UVector32(status);
396         textToBreak      = NULL;
397         textMap          = new UVector32(status);
398     }
399 
~TestParamsTestParams400     ~TestParams() {
401         delete bi;
402         delete expectedBreaks;
403         delete srcLine;
404         delete srcCol;
405         utext_close(textToBreak);
406         delete textMap;
407     }
408 
409     int32_t getSrcLine(int32_t bp);
410     int32_t getExpectedBreak(int32_t bp);
411     int32_t getSrcCol(int32_t bp);
412 
413     void setUTF16(UErrorCode &status);
414     void setUTF8(UErrorCode &status);
415 };
416 
417 // Append a UnicodeString to a CharString with UTF-8 encoding.
418 // Substitute any invalid chars.
419 //   Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
CharStringAppend(CharString & dest,const UnicodeString & src,UErrorCode & status)420 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
421     if (U_FAILURE(status)) {
422         return;
423     }
424     int32_t utf8Length;
425     u_strToUTF8WithSub(NULL, 0, &utf8Length,            // Output Buffer, NULL for preflight.
426                        src.getBuffer(), src.length(),   // UTF-16 data
427                        0xfffd, NULL,                    // Substitution char, number of subs.
428                        &status);
429     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
430         return;
431     }
432     status = U_ZERO_ERROR;
433     int32_t capacity;
434     char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
435     u_strToUTF8WithSub(buffer, utf8Length, NULL,
436                        src.getBuffer(), src.length(),
437                        0xfffd, NULL, &status);
438     dest.append(buffer, utf8Length, status);
439 }
440 
441 
setUTF16(UErrorCode & status)442 void TestParams::setUTF16(UErrorCode &status) {
443     textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
444     textMap->removeAllElements();
445     for (int32_t i=0; i<dataToBreak.length(); i++) {
446         if (i == dataToBreak.getChar32Start(i)) {
447             textMap->addElement(i, status);
448         } else {
449             textMap->addElement(-1, status);
450         }
451     }
452     textMap->addElement(dataToBreak.length(), status);
453     U_ASSERT(dataToBreak.length() + 1 == textMap->size());
454 }
455 
456 
setUTF8(UErrorCode & status)457 void TestParams::setUTF8(UErrorCode &status) {
458     if (U_FAILURE(status)) {
459         return;
460     }
461     utf8String.clear();
462     CharStringAppend(utf8String, dataToBreak, status);
463     textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
464     if (U_FAILURE(status)) {
465         return;
466     }
467 
468     textMap->removeAllElements();
469     int32_t utf16Index = 0;
470     for (;;) {
471         textMap->addElement(utf16Index, status);
472         UChar32 c32 = utext_current32(textToBreak);
473         if (c32 < 0) {
474             break;
475         }
476         utf16Index += U16_LENGTH(c32);
477         utext_next32(textToBreak);
478         while (textMap->size() < utext_getNativeIndex(textToBreak)) {
479             textMap->addElement(-1, status);
480         }
481     }
482     U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
483 }
484 
485 
getSrcLine(int32_t bp)486 int32_t TestParams::getSrcLine(int32_t bp) {
487     if (bp >= textMap->size()) {
488         bp = textMap->size() - 1;
489     }
490     int32_t i = 0;
491     for(; bp >= 0 ; --bp) {
492         // Move to a character boundary if we are not on one already.
493         i = textMap->elementAti(bp);
494         if (i >= 0) {
495             break;
496         }
497     }
498     return srcLine->elementAti(i);
499 }
500 
501 
getExpectedBreak(int32_t bp)502 int32_t TestParams::getExpectedBreak(int32_t bp) {
503     if (bp >= textMap->size()) {
504         return 0;
505     }
506     int32_t i = textMap->elementAti(bp);
507     int32_t retVal = 0;
508     if (i >= 0) {
509         retVal = expectedBreaks->elementAti(i);
510     }
511     return retVal;
512 }
513 
514 
getSrcCol(int32_t bp)515 int32_t TestParams::getSrcCol(int32_t bp) {
516     if (bp >= textMap->size()) {
517         bp = textMap->size() - 1;
518     }
519     int32_t i = 0;
520     for(; bp >= 0; --bp) {
521         // Move bp to a character boundary if we are not on one already.
522         i = textMap->elementAti(bp);
523         if (i >= 0) {
524             break;
525         }
526     }
527     return srcCol->elementAti(i);
528 }
529 
530 
executeTest(TestParams * t,UErrorCode & status)531 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
532     int32_t    bp;
533     int32_t    prevBP;
534     int32_t    i;
535 
536     TEST_ASSERT_SUCCESS(status);
537     if (U_FAILURE(status)) {
538         return;
539     }
540 
541     if (t->bi == NULL) {
542         return;
543     }
544 
545     t->bi->setText(t->textToBreak, status);
546     //
547     //  Run the iterator forward
548     //
549     prevBP = -1;
550     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
551         if (prevBP ==  bp) {
552             // Fail for lack of forward progress.
553             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
554                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
555             break;
556         }
557 
558         // Check that there we didn't miss an expected break between the last one
559         //  and this one.
560         for (i=prevBP+1; i<bp; i++) {
561             if (t->getExpectedBreak(i) != 0) {
562                 int expected[] = {0, i};
563                 printStringBreaks(t->dataToBreak, expected, 2);
564                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
565                       i, t->getSrcLine(i), t->getSrcCol(i));
566             }
567         }
568 
569         // Check that the break we did find was expected
570         if (t->getExpectedBreak(bp) == 0) {
571             int expected[] = {0, bp};
572             printStringBreaks(t->textToBreak, expected, 2);
573             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
574                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
575         } else {
576             // The break was expected.
577             //   Check that the {nnn} tag value is correct.
578             int32_t expectedTagVal = t->getExpectedBreak(bp);
579             if (expectedTagVal == -1) {
580                 expectedTagVal = 0;
581             }
582             int32_t line = t->getSrcLine(bp);
583             int32_t rs = t->bi->getRuleStatus();
584             if (rs != expectedTagVal) {
585                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
586                       "          Actual, Expected status = %4d, %4d",
587                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
588             }
589         }
590 
591         prevBP = bp;
592     }
593 
594     // Verify that there were no missed expected breaks after the last one found
595     for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
596         if (t->getExpectedBreak(i) != 0) {
597             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
598                       i, t->getSrcLine(i), t->getSrcCol(i));
599         }
600     }
601 
602     //
603     //  Run the iterator backwards, verify that the same breaks are found.
604     //
605     prevBP = static_cast<int32_t>(utext_nativeLength(t->textToBreak) + 2); // start with a phony value for the last break pos seen.
606     bp = t->bi->last();
607     while (bp != BreakIterator::DONE) {
608         if (prevBP ==  bp) {
609             // Fail for lack of progress.
610             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
611                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
612             break;
613         }
614 
615         // Check that we didn't miss an expected break between the last one
616         //  and this one.  (UVector returns zeros for index out of bounds.)
617         for (i=prevBP-1; i>bp; i--) {
618             if (t->getExpectedBreak(i) != 0) {
619                 errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
620                       i, t->getSrcLine(i), t->getSrcCol(i));
621             }
622         }
623 
624         // Check that the break we did find was expected
625         if (t->getExpectedBreak(bp) == 0) {
626             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
627                    bp, t->getSrcLine(bp), t->getSrcCol(bp));
628         } else {
629             // The break was expected.
630             //   Check that the {nnn} tag value is correct.
631             int32_t expectedTagVal = t->getExpectedBreak(bp);
632             if (expectedTagVal == -1) {
633                 expectedTagVal = 0;
634             }
635             int line = t->getSrcLine(bp);
636             int32_t rs = t->bi->getRuleStatus();
637             if (rs != expectedTagVal) {
638                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
639                       "          Actual, Expected status = %4d, %4d",
640                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
641             }
642         }
643 
644         prevBP = bp;
645         bp = t->bi->previous();
646     }
647 
648     // Verify that there were no missed breaks prior to the last one found
649     for (i=prevBP-1; i>=0; i--) {
650         if (t->getExpectedBreak(i) != 0) {
651             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
652                       i, t->getSrcLine(i), t->getSrcCol(i));
653         }
654     }
655 
656     // Check isBoundary()
657     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
658         UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
659         UBool boundaryFound    = t->bi->isBoundary(i);
660         if (boundaryExpected != boundaryFound) {
661             errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
662                   "        Expected, Actual= %s, %s",
663                   i, t->getSrcLine(i), t->getSrcCol(i),
664                   boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
665         }
666     }
667 
668     // Check following()
669     for (i=0; i < static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i++) {
670         int32_t actualBreak = t->bi->following(i);
671         int32_t expectedBreak = BreakIterator::DONE;
672         for (int32_t j=i+1; j <= static_cast<int32_t>(utext_nativeLength(t->textToBreak)); j++) {
673             if (t->getExpectedBreak(j) != 0) {
674                 expectedBreak = j;
675                 break;
676             }
677         }
678         if (expectedBreak != actualBreak) {
679             errln("following(%d) incorrect. File line,col= %4d,%4d\n"
680                   "        Expected, Actual= %d, %d",
681                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
682         }
683     }
684 
685     // Check preceding()
686     for (i=static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i>=0; i--) {
687         int32_t actualBreak = t->bi->preceding(i);
688         int32_t expectedBreak = BreakIterator::DONE;
689 
690         // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
691         // preceding(trailing byte) will return the index of some preceding code point,
692         // not the lead byte of the current code point, even though that has a smaller index.
693         // Therefore, start looking at the expected break data not at i-1, but at
694         // the start of code point index - 1.
695         utext_setNativeIndex(t->textToBreak, i);
696         int32_t j = static_cast<int32_t>(utext_getNativeIndex(t->textToBreak) - 1);
697         for (; j >= 0; j--) {
698             if (t->getExpectedBreak(j) != 0) {
699                 expectedBreak = j;
700                 break;
701             }
702         }
703         if (expectedBreak != actualBreak) {
704             errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
705                   "        Expected, Actual= %d, %d",
706                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
707         }
708     }
709 }
710 
711 
TestExtended()712 void RBBITest::TestExtended() {
713   // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
714   // data driven test closely entangles filtered and regular data.
715 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
716     UErrorCode      status  = U_ZERO_ERROR;
717     Locale          locale("");
718 
719     TestParams          tp(status);
720 
721     RegexMatcher      localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
722     if (U_FAILURE(status)) {
723         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
724     }
725 
726     //
727     //  Open and read the test data file.
728     //
729     const char *testDataDirectory = IntlTest::getSourceTestData(status);
730     CharString testFileName(testDataDirectory, -1, status);
731     testFileName.append("rbbitst.txt", -1, status);
732 
733     int    len;
734     UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
735     if (U_FAILURE(status)) {
736         errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
737         return;
738     }
739 
740     bool skipTest = false; // Skip this test?
741 
742     //
743     //  Put the test data into a UnicodeString
744     //
745     UnicodeString testString(FALSE, testFile, len);
746 
747     enum EParseState{
748         PARSE_COMMENT,
749         PARSE_TAG,
750         PARSE_DATA,
751         PARSE_NUM,
752         PARSE_RULES
753     }
754     parseState = PARSE_TAG;
755 
756     EParseState savedState = PARSE_TAG;
757 
758     int32_t    lineNum  = 1;
759     int32_t    colStart = 0;
760     int32_t    column   = 0;
761     int32_t    charIdx  = 0;
762 
763     int32_t    tagValue = 0;             // The numeric value of a <nnn> tag.
764 
765     UnicodeString       rules;           // Holds rules from a <rules> ... </rules> block
766     int32_t             rulesFirstLine = 0;  // Line number of the start of current <rules> block
767 
768     for (charIdx = 0; charIdx < len; ) {
769         status = U_ZERO_ERROR;
770         UChar  c = testString.charAt(charIdx);
771         charIdx++;
772         if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
773             // treat CRLF as a unit
774             c = u'\n';
775             charIdx++;
776         }
777         if (c == u'\n' || c == u'\r') {
778             lineNum++;
779             colStart = charIdx;
780         }
781         column = charIdx - colStart + 1;
782 
783         switch (parseState) {
784         case PARSE_COMMENT:
785             if (c == u'\n' || c == u'\r') {
786                 parseState = savedState;
787             }
788             break;
789 
790         case PARSE_TAG:
791             {
792             if (c == u'#') {
793                 parseState = PARSE_COMMENT;
794                 savedState = PARSE_TAG;
795                 break;
796             }
797             if (u_isUWhiteSpace(c)) {
798                 break;
799             }
800             if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
801                 delete tp.bi;
802                 tp.bi = BreakIterator::createWordInstance(locale,  status);
803                 skipTest = false;
804                 charIdx += 5;
805                 break;
806             }
807             if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
808                 delete tp.bi;
809                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
810                 skipTest = false;
811                 charIdx += 5;
812                 break;
813             }
814             if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
815                 delete tp.bi;
816                 tp.bi = BreakIterator::createLineInstance(locale,  status);
817                 skipTest = false;
818                 charIdx += 5;
819                 break;
820             }
821             if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
822                 delete tp.bi;
823                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
824                 skipTest = false;
825                 charIdx += 5;
826                 break;
827             }
828             if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
829                 delete tp.bi;
830                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
831                 charIdx += 6;
832                 break;
833             }
834 
835             if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
836                 testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
837                 charIdx = testString.indexOf(u'>', charIdx) + 1;
838                 parseState = PARSE_RULES;
839                 rules.remove();
840                 rulesFirstLine = lineNum;
841                 break;
842             }
843 
844             // <locale  loc_name>
845             localeMatcher.reset(testString);
846             if (localeMatcher.lookingAt(charIdx-1, status)) {
847                 UnicodeString localeName = localeMatcher.group(1, status);
848                 char localeName8[100];
849                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
850                 locale = Locale::createFromName(localeName8);
851                 charIdx += localeMatcher.group(0, status).length() - 1;
852                 TEST_ASSERT_SUCCESS(status);
853                 break;
854             }
855             if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
856                 parseState = PARSE_DATA;
857                 charIdx += 5;
858                 tp.dataToBreak = "";
859                 tp.expectedBreaks->removeAllElements();
860                 tp.srcCol ->removeAllElements();
861                 tp.srcLine->removeAllElements();
862                 break;
863             }
864 
865             errln("line %d: Tag expected in test file.", lineNum);
866             parseState = PARSE_COMMENT;
867             savedState = PARSE_DATA;
868             goto end_test; // Stop the test.
869             }
870             break;
871 
872         case PARSE_RULES:
873             if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
874                 charIdx += 7;
875                 parseState = PARSE_TAG;
876                 delete tp.bi;
877                 UParseError pe;
878                 tp.bi = new RuleBasedBreakIterator(rules, pe, status);
879                 skipTest = U_FAILURE(status);
880                 if (U_FAILURE(status)) {
881                     errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
882                         rulesFirstLine + pe.line - 1, u_errorName(status));
883                 }
884             } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
885                 charIdx += 10;
886                 parseState = PARSE_TAG;
887                 UErrorCode ec = U_ZERO_ERROR;
888                 UParseError pe;
889                 RuleBasedBreakIterator bi(rules, pe, ec);
890                 if (U_SUCCESS(ec)) {
891                     errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
892                         rulesFirstLine + pe.line - 1);
893                 }
894             } else {
895                 rules.append(c);
896             }
897             break;
898 
899         case PARSE_DATA:
900             if (c == u'•') {
901                 int32_t  breakIdx = tp.dataToBreak.length();
902                 tp.expectedBreaks->setSize(breakIdx+1);
903                 tp.expectedBreaks->setElementAt(-1, breakIdx);
904                 tp.srcLine->setSize(breakIdx+1);
905                 tp.srcLine->setElementAt(lineNum, breakIdx);
906                 tp.srcCol ->setSize(breakIdx+1);
907                 tp.srcCol ->setElementAt(column, breakIdx);
908                 break;
909             }
910 
911             if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
912                 // Add final entry to mappings from break location to source file position.
913                 //  Need one extra because last break position returned is after the
914                 //    last char in the data, not at the last char.
915                 tp.srcLine->addElement(lineNum, status);
916                 tp.srcCol ->addElement(column, status);
917 
918                 parseState = PARSE_TAG;
919                 charIdx += 6;
920 
921                 if (!skipTest) {
922                     // RUN THE TEST!
923                     status = U_ZERO_ERROR;
924                     tp.setUTF16(status);
925                     executeTest(&tp, status);
926                     TEST_ASSERT_SUCCESS(status);
927 
928                     // Run again, this time with UTF-8 text wrapped in a UText.
929                     status = U_ZERO_ERROR;
930                     tp.setUTF8(status);
931                     TEST_ASSERT_SUCCESS(status);
932                     executeTest(&tp, status);
933                 }
934                 break;
935             }
936 
937             if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
938                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
939                 // Get the code point from the name and insert it into the test data.
940                 //   (Damn, no API takes names in Unicode  !!!
941                 //    we've got to take it back to char *)
942                 int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
943                 int32_t nameLength = nameEndIdx - (charIdx+2);
944                 char charNameBuf[200];
945                 UChar32 theChar = -1;
946                 if (nameEndIdx != -1) {
947                     UErrorCode status = U_ZERO_ERROR;
948                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
949                     charNameBuf[sizeof(charNameBuf)-1] = 0;
950                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
951                     if (U_FAILURE(status)) {
952                         theChar = -1;
953                     }
954                 }
955                 if (theChar == -1) {
956                     errln("Error in named character in test file at line %d, col %d",
957                         lineNum, column);
958                 } else {
959                     // Named code point was recognized.  Insert it
960                     //   into the test data.
961                     tp.dataToBreak.append(theChar);
962                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
963                         tp.srcLine->addElement(lineNum, status);
964                         tp.srcCol ->addElement(column, status);
965                     }
966                 }
967                 if (nameEndIdx > charIdx) {
968                     charIdx = nameEndIdx+1;
969 
970                 }
971                 break;
972             }
973 
974 
975 
976             if (testString.compare(charIdx-1, 2, u"<>") == 0) {
977                 charIdx++;
978                 int32_t  breakIdx = tp.dataToBreak.length();
979                 tp.expectedBreaks->setSize(breakIdx+1);
980                 tp.expectedBreaks->setElementAt(-1, breakIdx);
981                 tp.srcLine->setSize(breakIdx+1);
982                 tp.srcLine->setElementAt(lineNum, breakIdx);
983                 tp.srcCol ->setSize(breakIdx+1);
984                 tp.srcCol ->setElementAt(column, breakIdx);
985                 break;
986             }
987 
988             if (c == u'<') {
989                 tagValue   = 0;
990                 parseState = PARSE_NUM;
991                 break;
992             }
993 
994             if (c == u'#' && column==3) {   // TODO:  why is column off so far?
995                 parseState = PARSE_COMMENT;
996                 savedState = PARSE_DATA;
997                 break;
998             }
999 
1000             if (c == u'\\') {
1001                 // Check for \ at end of line, a line continuation.
1002                 //     Advance over (discard) the newline
1003                 UChar32 cp = testString.char32At(charIdx);
1004                 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
1005                     // We have a CR LF
1006                     //  Need an extra increment of the input ptr to move over both of them
1007                     charIdx++;
1008                 }
1009                 if (cp == u'\n' || cp == u'\r') {
1010                     lineNum++;
1011                     colStart = charIdx;
1012                     charIdx++;
1013                     break;
1014                 }
1015 
1016                 // Let unescape handle the back slash.
1017                 cp = testString.unescapeAt(charIdx);
1018                 if (cp != -1) {
1019                     // Escape sequence was recognized.  Insert the char
1020                     //   into the test data.
1021                     tp.dataToBreak.append(cp);
1022                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1023                         tp.srcLine->addElement(lineNum, status);
1024                         tp.srcCol ->addElement(column, status);
1025                     }
1026                     break;
1027                 }
1028 
1029 
1030                 // Not a recognized backslash escape sequence.
1031                 // Take the next char as a literal.
1032                 //  TODO:  Should this be an error?
1033                 c = testString.charAt(charIdx);
1034                 charIdx = testString.moveIndex32(charIdx, 1);
1035             }
1036 
1037             // Normal, non-escaped data char.
1038             tp.dataToBreak.append(c);
1039 
1040             // Save the mapping from offset in the data to line/column numbers in
1041             //   the original input file.  Will be used for better error messages only.
1042             //   If there's an expected break before this char, the slot in the mapping
1043             //     vector will already be set for this char; don't overwrite it.
1044             if (tp.dataToBreak.length() > tp.srcLine->size()) {
1045                 tp.srcLine->addElement(lineNum, status);
1046                 tp.srcCol ->addElement(column, status);
1047             }
1048             break;
1049 
1050 
1051         case PARSE_NUM:
1052             // We are parsing an expected numeric tag value, like <1234>,
1053             //   within a chunk of data.
1054             if (u_isUWhiteSpace(c)) {
1055                 break;
1056             }
1057 
1058             if (c == u'>') {
1059                 // Finished the number.  Add the info to the expected break data,
1060                 //   and switch parse state back to doing plain data.
1061                 parseState = PARSE_DATA;
1062                 if (tagValue == 0) {
1063                     tagValue = -1;
1064                 }
1065                 int32_t  breakIdx = tp.dataToBreak.length();
1066                 tp.expectedBreaks->setSize(breakIdx+1);
1067                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1068                 tp.srcLine->setSize(breakIdx+1);
1069                 tp.srcLine->setElementAt(lineNum, breakIdx);
1070                 tp.srcCol ->setSize(breakIdx+1);
1071                 tp.srcCol ->setElementAt(column, breakIdx);
1072                 break;
1073             }
1074 
1075             if (u_isdigit(c)) {
1076                 tagValue = tagValue*10 + u_charDigitValue(c);
1077                 break;
1078             }
1079 
1080             errln("Syntax Error in test file at line %d, col %d",
1081                 lineNum, column);
1082             parseState = PARSE_COMMENT;
1083             goto end_test; // Stop the test
1084             break;
1085         }
1086 
1087 
1088         if (U_FAILURE(status)) {
1089             dataerrln("ICU Error %s while parsing test file at line %d.",
1090                 u_errorName(status), lineNum);
1091             status = U_ZERO_ERROR;
1092             goto end_test; // Stop the test
1093         }
1094 
1095     }
1096 
1097     // Reached end of test file. Raise an error if parseState indicates that we are
1098     //   within a block that should have been terminated.
1099 
1100     if (parseState == PARSE_RULES) {
1101         errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
1102             lineNum, rulesFirstLine);
1103     }
1104     if (parseState == PARSE_DATA) {
1105         errln("rbbitst.txt:%d <data> block not closed.", lineNum);
1106     }
1107 
1108 
1109 end_test:
1110     delete [] testFile;
1111 #endif
1112 }
1113 
1114 
1115 //-------------------------------------------------------------------------------
1116 //
1117 //  TestDictRules   create a break iterator from source rules that includes a
1118 //                  dictionary range.   Regression for bug #7130.  Source rules
1119 //                  do not declare a break iterator type (word, line, sentence, etc.
1120 //                  but the dictionary code, without a type, would loop.
1121 //
1122 //-------------------------------------------------------------------------------
TestDictRules()1123 void RBBITest::TestDictRules() {
1124     const char *rules =  "$dictionary = [a-z]; \n"
1125                          "!!forward; \n"
1126                          "$dictionary $dictionary; \n"
1127                          "!!reverse; \n"
1128                          "$dictionary $dictionary; \n";
1129     const char *text = "aa";
1130     UErrorCode status = U_ZERO_ERROR;
1131     UParseError parseError;
1132 
1133     RuleBasedBreakIterator bi(rules, parseError, status);
1134     if (U_SUCCESS(status)) {
1135         UnicodeString utext = text;
1136         bi.setText(utext);
1137         int32_t position;
1138         int32_t loops;
1139         for (loops = 0; loops<10; loops++) {
1140             position = bi.next();
1141             if (position == RuleBasedBreakIterator::DONE) {
1142                 break;
1143             }
1144         }
1145         TEST_ASSERT(loops == 1);
1146     } else {
1147         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1148     }
1149 }
1150 
1151 
1152 
1153 //-------------------------------------------------------------------------------
1154 //
1155 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
1156 //    return the data in one big UChar * buffer, which the caller must delete.
1157 //
1158 //    parameters:
1159 //          fileName:   the name of the file, with no directory part.  The test data directory
1160 //                      is assumed.
1161 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
1162 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
1163 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
1164 //                      Pass NULL for the system default encoding.
1165 //          status
1166 //    returns:
1167 //                      The file data, converted to UChar.
1168 //                      The caller must delete this when done with
1169 //                           delete [] theBuffer;
1170 //
1171 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
1172 //           Move this function to some common place.
1173 //
1174 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int & ulen,const char * encoding,UErrorCode & status)1175 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1176     UChar       *retPtr  = NULL;
1177     char        *fileBuf = NULL;
1178     UConverter* conv     = NULL;
1179     FILE        *f       = NULL;
1180 
1181     ulen = 0;
1182     if (U_FAILURE(status)) {
1183         return retPtr;
1184     }
1185 
1186     //
1187     //  Open the file.
1188     //
1189     f = fopen(fileName, "rb");
1190     if (f == 0) {
1191         dataerrln("Error opening test data file %s\n", fileName);
1192         status = U_FILE_ACCESS_ERROR;
1193         return NULL;
1194     }
1195     //
1196     //  Read it in
1197     //
1198     int   fileSize;
1199     int   amt_read;
1200 
1201     fseek( f, 0, SEEK_END);
1202     fileSize = ftell(f);
1203     fileBuf = new char[fileSize];
1204     fseek(f, 0, SEEK_SET);
1205     amt_read = static_cast<int>(fread(fileBuf, 1, fileSize, f));
1206     if (amt_read != fileSize || fileSize <= 0) {
1207         errln("Error reading test data file.");
1208         goto cleanUpAndReturn;
1209     }
1210 
1211     //
1212     // Look for a Unicode Signature (BOM) on the data just read
1213     //
1214     int32_t        signatureLength;
1215     const char *   fileBufC;
1216     const char*    bomEncoding;
1217 
1218     fileBufC = fileBuf;
1219     bomEncoding = ucnv_detectUnicodeSignature(
1220         fileBuf, fileSize, &signatureLength, &status);
1221     if(bomEncoding!=NULL ){
1222         fileBufC  += signatureLength;
1223         fileSize  -= signatureLength;
1224         encoding = bomEncoding;
1225     }
1226 
1227     //
1228     // Open a converter to take the rule file to UTF-16
1229     //
1230     conv = ucnv_open(encoding, &status);
1231     if (U_FAILURE(status)) {
1232         goto cleanUpAndReturn;
1233     }
1234 
1235     //
1236     // Convert the rules to UChar.
1237     //  Preflight first to determine required buffer size.
1238     //
1239     ulen = ucnv_toUChars(conv,
1240         NULL,           //  dest,
1241         0,              //  destCapacity,
1242         fileBufC,
1243         fileSize,
1244         &status);
1245     if (status == U_BUFFER_OVERFLOW_ERROR) {
1246         // Buffer Overflow is expected from the preflight operation.
1247         status = U_ZERO_ERROR;
1248 
1249         retPtr = new UChar[ulen+1];
1250         ucnv_toUChars(conv,
1251             retPtr,       //  dest,
1252             ulen+1,
1253             fileBufC,
1254             fileSize,
1255             &status);
1256     }
1257 
1258 cleanUpAndReturn:
1259     fclose(f);
1260     delete []fileBuf;
1261     ucnv_close(conv);
1262     if (U_FAILURE(status)) {
1263         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1264         delete []retPtr;
1265         retPtr = 0;
1266         ulen   = 0;
1267     }
1268     return retPtr;
1269 }
1270 
1271 
1272 
1273 //--------------------------------------------------------------------------------------------
1274 //
1275 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1276 //
1277 //-------------------------------------------------------------------------------------------
TestUnicodeFiles()1278 void RBBITest::TestUnicodeFiles() {
1279     RuleBasedBreakIterator  *bi;
1280     UErrorCode               status = U_ZERO_ERROR;
1281 
1282     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1283     TEST_ASSERT_SUCCESS(status);
1284     if (U_SUCCESS(status)) {
1285         runUnicodeTestData("GraphemeBreakTest.txt", bi);
1286     }
1287     delete bi;
1288 
1289     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1290     TEST_ASSERT_SUCCESS(status);
1291     if (U_SUCCESS(status)) {
1292         runUnicodeTestData("WordBreakTest.txt", bi);
1293     }
1294     delete bi;
1295 
1296     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1297     TEST_ASSERT_SUCCESS(status);
1298     if (U_SUCCESS(status)) {
1299         runUnicodeTestData("SentenceBreakTest.txt", bi);
1300     }
1301     delete bi;
1302 
1303     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1304     TEST_ASSERT_SUCCESS(status);
1305     if (U_SUCCESS(status)) {
1306         runUnicodeTestData("LineBreakTest.txt", bi);
1307     }
1308     delete bi;
1309 }
1310 
1311 
1312 // Check for test cases from the Unicode test data files that are known to fail
1313 // and should be skipped as known issues because ICU does not fully implement
1314 // the Unicode specifications, or because ICU includes tailorings that differ from
1315 // the Unicode standard.
1316 //
1317 // Test cases are identified by the test data sequence, which tends to be more stable
1318 // across Unicode versions than the test file line numbers.
1319 //
1320 // The test case with ticket "10666" is a dummy, included as an example.
1321 
testCaseIsKnownIssue(const UnicodeString & testCase,const char * fileName)1322 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1323     static struct TestCase {
1324         const char *fTicketNum;
1325         const char *fFileName;
1326         const UChar *fString;
1327     } badTestCases[] = {
1328         {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"},    // Fake example, for illustration.
1329         // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
1330         // This probably ultimately wants to be resolved by updating UAX-14, but in the mean time
1331         // ICU is out of sync with Unicode.
1332         {"8151",  "LineBreakTest.txt", u"-#"},
1333         {"8151",  "LineBreakTest.txt", u"\u002d\u0308\u0023"},
1334         {"8151",  "LineBreakTest.txt", u"\u002d\u00a7"},
1335         {"8151",  "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
1336         {"8151",  "LineBreakTest.txt", u"\u002d\U00050005"},
1337         {"8151",  "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
1338         {"8151",  "LineBreakTest.txt", u"\u002d\u0e01"},
1339         {"8151",  "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
1340 
1341         // Issue ICU-12017 Improve line break around numbers
1342         {"12017", "LineBreakTest.txt", u"\u002C\u0030"},   // ",0"
1343         {"12017", "LineBreakTest.txt", u"\u002C\u0308\u0030"},
1344         {"12017", "LineBreakTest.txt", u"find .com"},
1345         {"12017", "LineBreakTest.txt", u"equals .35 cents"},
1346         {"12017", "LineBreakTest.txt", u"a.2 "},
1347         {"12017", "LineBreakTest.txt", u"a.2 \u0915"},
1348         {"12017", "LineBreakTest.txt", u"a.2 \u672C"},
1349         {"12017", "LineBreakTest.txt", u"a.2\u3000\u672C"},
1350         {"12017", "LineBreakTest.txt", u"a.2\u3000\u307E"},
1351         {"12017", "LineBreakTest.txt", u"a.2\u3000\u0033"},
1352         {"12017", "LineBreakTest.txt", u"A.1 \uBABB"},
1353         {"12017", "LineBreakTest.txt", u"\uBD24\uC5B4\u002E\u0020\u0041\u002E\u0032\u0020\uBCFC"},
1354         {"12017", "LineBreakTest.txt", u"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"},
1355         {"12017", "LineBreakTest.txt", u"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"},
1356         {"12017", "LineBreakTest.txt", u"a.2\u3000\u300C"},
1357     };
1358 
1359     for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
1360         const TestCase &badCase = badTestCases[n];
1361         if (!strcmp(fileName, badCase.fFileName) &&
1362                 testCase == UnicodeString(badCase.fString)) {
1363             return logKnownIssue(badCase.fTicketNum);
1364         }
1365     }
1366     return FALSE;
1367 }
1368 
1369 
1370 //--------------------------------------------------------------------------------------------
1371 //
1372 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1373 //
1374 //-------------------------------------------------------------------------------------------
runUnicodeTestData(const char * fileName,RuleBasedBreakIterator * bi)1375 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1376 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1377     UErrorCode  status = U_ZERO_ERROR;
1378 
1379     //
1380     //  Open and read the test data file, put it into a UnicodeString.
1381     //
1382     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1383     char testFileName[1000];
1384     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1385         dataerrln("Can't open test data.  Path too long.");
1386         return;
1387     }
1388     strcpy(testFileName, testDataDirectory);
1389     strcat(testFileName, fileName);
1390 
1391     logln("Opening data file %s\n", fileName);
1392 
1393     int    len;
1394     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1395     if (status != U_FILE_ACCESS_ERROR) {
1396         TEST_ASSERT_SUCCESS(status);
1397         TEST_ASSERT(testFile != NULL);
1398     }
1399     if (U_FAILURE(status) || testFile == NULL) {
1400         return; /* something went wrong, error already output */
1401     }
1402     UnicodeString testFileAsString(TRUE, testFile, len);
1403 
1404     //
1405     //  Parse the test data file using a regular expression.
1406     //  Each kind of token is recognized in its own capture group; what type of item was scanned
1407     //     is identified by which group had a match.
1408     //
1409     //    Caputure Group #                  1          2            3            4           5
1410     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1411     //
1412     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1413     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1414     UnicodeString   testString;
1415     UVector32       breakPositions(status);
1416     int             lineNumber = 1;
1417     TEST_ASSERT_SUCCESS(status);
1418     if (U_FAILURE(status)) {
1419         return;
1420     }
1421 
1422     //
1423     //  Scan through each test case, building up the string to be broken in testString,
1424     //   and the positions that should be boundaries in the breakPositions vector.
1425     //
1426     int spin = 0;
1427     while (tokenMatcher.find()) {
1428         if(tokenMatcher.hitEnd()) {
1429           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
1430              This occurred when the text file was corrupt (wasn't marked as UTF-8)
1431              and caused an infinite loop here on EBCDIC systems!
1432           */
1433           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1434           //       return;
1435         }
1436         if (tokenMatcher.start(1, status) >= 0) {
1437             // Scanned a divide sign, indicating a break position in the test data.
1438             if (testString.length()>0) {
1439                 breakPositions.addElement(testString.length(), status);
1440             }
1441         }
1442         else if (tokenMatcher.start(2, status) >= 0) {
1443             // Scanned an 'x', meaning no break at this position in the test data
1444             //   Nothing to be done here.
1445             }
1446         else if (tokenMatcher.start(3, status) >= 0) {
1447             // Scanned Hex digits.  Convert them to binary, append to the character data string.
1448             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1449             int length = hexNumber.length();
1450             if (length<=8) {
1451                 char buf[10];
1452                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1453                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1454                 if (c<=0x10ffff) {
1455                     testString.append(c);
1456                 } else {
1457                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1458                        fileName, lineNumber);
1459                 }
1460             } else {
1461                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1462                        fileName, lineNumber);
1463              }
1464         }
1465         else if (tokenMatcher.start(4, status) >= 0) {
1466             // Scanned to end of a line, possibly skipping over a comment in the process.
1467             //   If the line from the file contained test data, run the test now.
1468             if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1469                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1470             }
1471 
1472             // Clear out this test case.
1473             //    The string and breakPositions vector will be refilled as the next
1474             //       test case is parsed.
1475             testString.remove();
1476             breakPositions.removeAllElements();
1477             lineNumber++;
1478         } else {
1479             // Scanner catchall.  Something unrecognized appeared on the line.
1480             char token[16];
1481             UnicodeString uToken = tokenMatcher.group(0, status);
1482             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1483             token[sizeof(token)-1] = 0;
1484             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1485 
1486             // Clean up, in preparation for continuing with the next line.
1487             testString.remove();
1488             breakPositions.removeAllElements();
1489             lineNumber++;
1490         }
1491         TEST_ASSERT_SUCCESS(status);
1492         if (U_FAILURE(status)) {
1493             break;
1494         }
1495     }
1496 
1497     delete [] testFile;
1498  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1499 }
1500 
1501 //--------------------------------------------------------------------------------------------
1502 //
1503 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1504 //                            test data files.  Do only a simple, forward-only check -
1505 //                            this test is mostly to check that ICU and the Unicode
1506 //                            data agree with each other.
1507 //
1508 //--------------------------------------------------------------------------------------------
checkUnicodeTestCase(const char * testFileName,int lineNumber,const UnicodeString & testString,UVector32 * breakPositions,RuleBasedBreakIterator * bi)1509 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1510                          const UnicodeString &testString,   // Text data to be broken
1511                          UVector32 *breakPositions,         // Positions where breaks should be found.
1512                          RuleBasedBreakIterator *bi) {
1513     int32_t pos;                 // Break Position in the test string
1514     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1515     int32_t expectedPos;         // Expected break position (index into test string)
1516 
1517     bi->setText(testString);
1518     pos = bi->first();
1519     pos = bi->next();
1520 
1521     while (pos != BreakIterator::DONE) {
1522         if (expectedI >= breakPositions->size()) {
1523             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1524                 testFileName, lineNumber, pos);
1525             break;
1526         }
1527         expectedPos = breakPositions->elementAti(expectedI);
1528         if (pos < expectedPos) {
1529             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1530                 testFileName, lineNumber, pos);
1531             break;
1532         }
1533         if (pos > expectedPos) {
1534             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1535                 testFileName, lineNumber, expectedPos);
1536             break;
1537         }
1538         pos = bi->next();
1539         expectedI++;
1540     }
1541 
1542     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1543         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1544             testFileName, lineNumber, breakPositions->elementAti(expectedI));
1545     }
1546 }
1547 
1548 
1549 
1550 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1551 //---------------------------------------------------------------------------------------
1552 //
1553 //   class RBBIMonkeyKind
1554 //
1555 //      Monkey Test for Break Iteration
1556 //      Abstract interface class.   Concrete derived classes independently
1557 //      implement the break rules for different iterator types.
1558 //
1559 //      The Monkey Test itself doesn't know which type of break iterator it is
1560 //      testing, but works purely in terms of the interface defined here.
1561 //
1562 //---------------------------------------------------------------------------------------
1563 class RBBIMonkeyKind {
1564 public:
1565     // Return a UVector of UnicodeSets, representing the character classes used
1566     //   for this type of iterator.
1567     virtual  UVector  *charClasses() = 0;
1568 
1569     // Set the test text on which subsequent calls to next() will operate
1570     virtual  void      setText(const UnicodeString &s) = 0;
1571 
1572     // Find the next break postion, starting from the prev break position, or from zero.
1573     // Return -1 after reaching end of string.
1574     virtual  int32_t   next(int32_t i) = 0;
1575 
1576     // Name of each character class, parallel with charClasses. Used for debugging output
1577     // of characters.
1578     virtual  std::vector<std::string>&     characterClassNames();
1579 
1580     void setAppliedRule(int32_t position, const char* value);
1581 
1582     std::string getAppliedRule(int32_t position);
1583 
1584     virtual ~RBBIMonkeyKind();
1585     UErrorCode deferredStatus;
1586 
1587     std::string classNameFromCodepoint(const UChar32 c);
1588     unsigned int maxClassNameSize();
1589 
1590  protected:
1591      RBBIMonkeyKind();
1592      std::vector<std::string> classNames;
1593      std::vector<std::string> appliedRules;
1594 
1595     // Clear `appliedRules` and fill it with empty strings in the size of test text.
1596     void prepareAppliedRules(int32_t size );
1597 
1598  private:
1599 
1600 };
1601 
RBBIMonkeyKind()1602 RBBIMonkeyKind::RBBIMonkeyKind() {
1603     deferredStatus = U_ZERO_ERROR;
1604 }
1605 
~RBBIMonkeyKind()1606 RBBIMonkeyKind::~RBBIMonkeyKind() {
1607 }
1608 
characterClassNames()1609 std::vector<std::string>& RBBIMonkeyKind::characterClassNames() {
1610     return classNames;
1611 }
1612 
prepareAppliedRules(int32_t size)1613 void RBBIMonkeyKind::prepareAppliedRules(int32_t size) {
1614     // Remove all the information in the `appliedRules`.
1615     appliedRules.clear();
1616     appliedRules.resize(size + 1);
1617 }
1618 
setAppliedRule(int32_t position,const char * value)1619 void RBBIMonkeyKind::setAppliedRule(int32_t position, const char* value) {
1620     appliedRules[position] = value;
1621 }
1622 
getAppliedRule(int32_t position)1623 std::string RBBIMonkeyKind::getAppliedRule(int32_t position){
1624     return appliedRules[position];
1625 }
1626 
classNameFromCodepoint(const UChar32 c)1627 std::string RBBIMonkeyKind::classNameFromCodepoint(const UChar32 c) {
1628     // Simply iterate through charClasses to find character's class
1629     for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1630         UnicodeSet *classSet = (UnicodeSet *)charClasses()->elementAt(aClassNum);
1631         if (classSet->contains(c)) {
1632             return classNames[aClassNum];
1633         }
1634     }
1635     U_ASSERT(FALSE);  // This should not happen.
1636     return "bad class name";
1637 }
1638 
maxClassNameSize()1639 unsigned int RBBIMonkeyKind::maxClassNameSize() {
1640     unsigned int maxSize = 0;
1641     for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1642         auto aClassNumSize = static_cast<unsigned int>(classNames[aClassNum].size());
1643         if (aClassNumSize > maxSize) {
1644             maxSize = aClassNumSize;
1645         }
1646     }
1647     return maxSize;
1648 }
1649 
1650 //----------------------------------------------------------------------------------------
1651 //
1652 //   Random Numbers.  Similar to standard lib rand() and srand()
1653 //                    Not using library to
1654 //                      1.  Get same results on all platforms.
1655 //                      2.  Get access to current seed, to more easily reproduce failures.
1656 //
1657 //---------------------------------------------------------------------------------------
// Generator state.  Tests may read or overwrite it to reproduce a run.
static uint32_t m_seed = 1;

// Step the generator and return the next value, in the range 0 .. 32767.
// Arithmetic is unsigned 32-bit, so the state wraps modulo 2^32.
static uint32_t m_rand()
{
    constexpr uint32_t kMultiplier = 1103515245u;
    constexpr uint32_t kIncrement  = 12345u;

    m_seed = m_seed * kMultiplier + kIncrement;

    // Bits 16..30 of the state: the same as (m_seed / 65536) % 32768.
    return (m_seed >> 16) & 0x7FFFu;
}
1665 
1666 
1667 //------------------------------------------------------------------------------------------
1668 //
1669 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
1670 //                             of RBBIMonkeyKind.
1671 //
1672 //------------------------------------------------------------------------------------------
1673 class RBBICharMonkey: public RBBIMonkeyKind {
1674 public:
1675     RBBICharMonkey();
1676     virtual          ~RBBICharMonkey();
1677     virtual  UVector *charClasses();
1678     virtual  void     setText(const UnicodeString &s);
1679     virtual  int32_t  next(int32_t i);
1680 private:
1681     UVector   *fSets;
1682 
1683     UnicodeSet  *fCRLFSet;
1684     UnicodeSet  *fControlSet;
1685     UnicodeSet  *fExtendSet;
1686     UnicodeSet  *fZWJSet;
1687     UnicodeSet  *fRegionalIndicatorSet;
1688     UnicodeSet  *fPrependSet;
1689     UnicodeSet  *fSpacingSet;
1690     UnicodeSet  *fLSet;
1691     UnicodeSet  *fVSet;
1692     UnicodeSet  *fTSet;
1693     UnicodeSet  *fLVSet;
1694     UnicodeSet  *fLVTSet;
1695     UnicodeSet  *fHangulSet;
1696     UnicodeSet  *fExtendedPictSet;
1697     UnicodeSet  *fViramaSet;
1698     UnicodeSet  *fLinkingConsonantSet;
1699     UnicodeSet  *fExtCccZwjSet;
1700     UnicodeSet  *fAnySet;
1701 
1702     const UnicodeString *fText;
1703 };
1704 
1705 
RBBICharMonkey()1706 RBBICharMonkey::RBBICharMonkey() {
1707     UErrorCode  status = U_ZERO_ERROR;
1708 
1709     fText = NULL;
1710 
1711     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1712     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
1713     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
1714     fZWJSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
1715     fRegionalIndicatorSet =
1716                   new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1717     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1718     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1719     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1720     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1721     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1722     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1723     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1724     fHangulSet  = new UnicodeSet();
1725     fHangulSet->addAll(*fLSet);
1726     fHangulSet->addAll(*fVSet);
1727     fHangulSet->addAll(*fTSet);
1728     fHangulSet->addAll(*fLVSet);
1729     fHangulSet->addAll(*fLVTSet);
1730 
1731     fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1732     fViramaSet        = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1733                                         "\\p{Indic_Syllabic_Category=Virama}]", status);
1734     fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1735                                         "\\p{Indic_Syllabic_Category=Consonant}]", status);
1736     fExtCccZwjSet     = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status);
1737     fAnySet           = new UnicodeSet(0, 0x10ffff);
1738 
1739     // Create sets of characters, and add the names of the above character sets.
1740     // In each new ICU release, add new names corresponding to the sets above.
1741     fSets             = new UVector(status);
1742 
1743     // Important: Keep class names the same as the class contents.
1744     fSets->addElement(fCRLFSet, status); classNames.push_back("CRLF");
1745     fSets->addElement(fControlSet, status); classNames.push_back("Control");
1746     fSets->addElement(fExtendSet, status); classNames.push_back("Extended");
1747     fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
1748     if (!fPrependSet->isEmpty()) {
1749         fSets->addElement(fPrependSet, status); classNames.push_back("Prepend");
1750     }
1751     fSets->addElement(fSpacingSet, status); classNames.push_back("Spacing");
1752     fSets->addElement(fHangulSet, status); classNames.push_back("Hangul");
1753     fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
1754     fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
1755     fSets->addElement(fViramaSet, status); classNames.push_back("Virama");
1756     fSets->addElement(fLinkingConsonantSet, status); classNames.push_back("LinkingConsonant");
1757     fSets->addElement(fExtCccZwjSet, status); classNames.push_back("ExtCcccZwj");
1758     fSets->addElement(fAnySet, status); classNames.push_back("Any");
1759 
1760     if (U_FAILURE(status)) {
1761         deferredStatus = status;
1762     }
1763 }
1764 
1765 
setText(const UnicodeString & s)1766 void RBBICharMonkey::setText(const UnicodeString &s) {
1767     fText = &s;
1768     prepareAppliedRules(s.length());
1769 }
1770 
1771 
1772 
next(int32_t prevPos)1773 int32_t RBBICharMonkey::next(int32_t prevPos) {
1774     int    p0, p1, p2, p3;    // Indices of the significant code points around the
1775                               //   break position being tested.  The candidate break
1776                               //   location is before p2.
1777 
1778     int     breakPos = -1;
1779 
1780     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
1781     UChar32 cBase;            // for (X Extend*) patterns, the X character.
1782 
1783     if (U_FAILURE(deferredStatus)) {
1784         return -1;
1785     }
1786 
1787     // Previous break at end of string.  return DONE.
1788     if (prevPos >= fText->length()) {
1789         return -1;
1790     }
1791 
1792     p0 = p1 = p2 = p3 = prevPos;
1793     c3 =  fText->char32At(prevPos);
1794     c0 = c1 = c2 = cBase = 0;
1795     (void)p0;   // suppress set but not used warning.
1796     (void)c0;
1797 
1798     // Loop runs once per "significant" character position in the input text.
1799     for (;;) {
1800         // Move all of the positions forward in the input string.
1801         p0 = p1;  c0 = c1;
1802         p1 = p2;  c1 = c2;
1803         p2 = p3;  c2 = c3;
1804 
1805         // Advance p3 by one codepoint
1806         p3 = fText->moveIndex32(p3, 1);
1807         c3 = fText->char32At(p3);
1808 
1809         if (p1 == p2) {
1810             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
1811             continue;
1812         }
1813 
1814         if (p2 == fText->length()) {
1815             setAppliedRule(p2, "End of String");
1816             break;
1817         }
1818 
1819         //     No Extend or Format characters may appear between the CR and LF,
1820         //     which requires the additional check for p2 immediately following p1.
1821         //
1822         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
1823           setAppliedRule(p2, "GB3   CR x LF");
1824           continue;
1825         }
1826 
1827         if (fControlSet->contains(c1) ||
1828             c1 == 0x0D ||
1829             c1 == 0x0A)  {
1830           setAppliedRule(p2, "GB4   ( Control | CR | LF ) <break>");
1831           break;
1832         }
1833 
1834         if (fControlSet->contains(c2) ||
1835             c2 == 0x0D ||
1836             c2 == 0x0A)  {
1837             setAppliedRule(p2, "GB5   <break>  ( Control | CR | LF )");
1838             break;
1839         }
1840 
1841         if (fLSet->contains(c1) &&
1842                (fLSet->contains(c2)  ||
1843                 fVSet->contains(c2)  ||
1844                 fLVSet->contains(c2) ||
1845                 fLVTSet->contains(c2))) {
1846             setAppliedRule(p2, "GB6   L x ( L | V | LV | LVT )");
1847             continue;
1848         }
1849 
1850         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1851             (fVSet->contains(c2) || fTSet->contains(c2)))  {
1852             setAppliedRule(p2, "GB7    ( LV | V )  x  ( V | T )");
1853             continue;
1854         }
1855 
1856         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1857             fTSet->contains(c2))  {
1858             setAppliedRule(p2, "GB8   ( LVT | T)  x T");
1859             continue;
1860         }
1861 
1862         if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
1863             if (!fExtendSet->contains(c1)) {
1864                 cBase = c1;
1865             }
1866             setAppliedRule(p2, "GB9   x (Extend | ZWJ)");
1867             continue;
1868         }
1869 
1870         if (fSpacingSet->contains(c2)) {
1871             setAppliedRule(p2, "GB9a  x  SpacingMark");
1872             continue;
1873         }
1874 
1875         if (fPrependSet->contains(c1)) {
1876             setAppliedRule(p2, "GB9b  Prepend x");
1877             continue;
1878         }
1879 
1880         //   Note: Viramas are also included in the ExtCccZwj class.
1881         if (fLinkingConsonantSet->contains(c2)) {
1882             int pi = p1;
1883             bool sawVirama = false;
1884             while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) {
1885                 if (fViramaSet->contains(fText->char32At(pi))) {
1886                     sawVirama = true;
1887                 }
1888                 pi = fText->moveIndex32(pi, -1);
1889             }
1890             if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) {
1891               setAppliedRule(p2, "GB9.3  LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant");
1892               continue;
1893             }
1894         }
1895 
1896         if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
1897           setAppliedRule(p2, "GB11  Extended_Pictographic Extend * ZWJ x Extended_Pictographic");
1898           continue;
1899         }
1900 
1901         //                   Note: The first if condition is a little tricky. We only need to force
1902         //                      a break if there are three or more contiguous RIs. If there are
1903         //                      only two, a break following will occur via other rules, and will include
1904         //                      any trailing extend characters, which is needed behavior.
1905         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
1906                 && fRegionalIndicatorSet->contains(c2)) {
1907           setAppliedRule(p2, "GB12-13  Regional_Indicator x Regional_Indicator");
1908           break;
1909         }
1910         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
1911           setAppliedRule(p2, "GB12-13  Regional_Indicator x Regional_Indicator");
1912           continue;
1913         }
1914 
1915         setAppliedRule(p2, "GB999 Any <break> Any");
1916         break;
1917     }
1918 
1919     breakPos = p2;
1920     return breakPos;
1921 }
1922 
1923 
1924 
charClasses()1925 UVector  *RBBICharMonkey::charClasses() {
1926     return fSets;
1927 }
1928 
~RBBICharMonkey()1929 RBBICharMonkey::~RBBICharMonkey() {
1930     delete fSets;
1931     delete fCRLFSet;
1932     delete fControlSet;
1933     delete fExtendSet;
1934     delete fRegionalIndicatorSet;
1935     delete fPrependSet;
1936     delete fSpacingSet;
1937     delete fLSet;
1938     delete fVSet;
1939     delete fTSet;
1940     delete fLVSet;
1941     delete fLVTSet;
1942     delete fHangulSet;
1943     delete fAnySet;
1944     delete fZWJSet;
1945     delete fExtendedPictSet;
1946     delete fViramaSet;
1947     delete fLinkingConsonantSet;
1948     delete fExtCccZwjSet;
1949 }
1950 
1951 //------------------------------------------------------------------------------------------
1952 //
1953 //   class RBBIWordMonkey      Word Break specific implementation
1954 //                             of RBBIMonkeyKind.
1955 //
1956 //------------------------------------------------------------------------------------------
1957 class RBBIWordMonkey: public RBBIMonkeyKind {
1958 public:
1959     RBBIWordMonkey();
1960     virtual          ~RBBIWordMonkey();
1961     virtual  UVector *charClasses();
1962     virtual  void     setText(const UnicodeString &s);
1963     virtual int32_t   next(int32_t i);
1964 private:
1965     UVector      *fSets;
1966 
1967     UnicodeSet  *fCRSet;
1968     UnicodeSet  *fLFSet;
1969     UnicodeSet  *fNewlineSet;
1970     UnicodeSet  *fRegionalIndicatorSet;
1971     UnicodeSet  *fKatakanaSet;
1972     UnicodeSet  *fHebrew_LetterSet;
1973     UnicodeSet  *fALetterSet;
1974     UnicodeSet  *fSingle_QuoteSet;
1975     UnicodeSet  *fDouble_QuoteSet;
1976     UnicodeSet  *fMidNumLetSet;
1977     UnicodeSet  *fMidLetterSet;
1978     UnicodeSet  *fMidNumSet;
1979     UnicodeSet  *fNumericSet;
1980     UnicodeSet  *fFormatSet;
1981     UnicodeSet  *fOtherSet;
1982     UnicodeSet  *fExtendSet;
1983     UnicodeSet  *fExtendNumLetSet;
1984     UnicodeSet  *fWSegSpaceSet;
1985     UnicodeSet  *fDictionarySet;
1986     UnicodeSet  *fZWJSet;
1987     UnicodeSet  *fExtendedPictSet;
1988 
1989     const UnicodeString  *fText;
1990 };
1991 
1992 
RBBIWordMonkey()1993 RBBIWordMonkey::RBBIWordMonkey()
1994 {
1995     UErrorCode  status = U_ZERO_ERROR;
1996 
1997     fSets            = new UVector(status);
1998 
1999     fCRSet            = new UnicodeSet(u"[\\p{Word_Break = CR}]",           status);
2000     fLFSet            = new UnicodeSet(u"[\\p{Word_Break = LF}]",           status);
2001     fNewlineSet       = new UnicodeSet(u"[\\p{Word_Break = Newline}]",      status);
2002     fKatakanaSet      = new UnicodeSet(u"[\\p{Word_Break = Katakana}]",     status);
2003     fRegionalIndicatorSet =  new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
2004     fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
2005     fALetterSet       = new UnicodeSet(u"[\\p{Word_Break = ALetter}]", status);
2006     fSingle_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]",    status);
2007     fDouble_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]",    status);
2008     fMidNumLetSet     = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]",    status);
2009     fMidLetterSet     = new UnicodeSet(u"[\\p{Word_Break = MidLetter}]",    status);
2010     fMidNumSet        = new UnicodeSet(u"[\\p{Word_Break = MidNum}]",       status);
2011     fNumericSet       = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
2012     fFormatSet        = new UnicodeSet(u"[\\p{Word_Break = Format}]",       status);
2013     fExtendNumLetSet  = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
2014     // There are some sc=Hani characters with WB=Extend.
2015     // The break rules need to pick one or the other because
2016     // Extend overlapping with something else is messy.
2017     // For Unicode 13, we chose to keep U+16FF0 & U+16FF1
2018     // in $Han (for $dictionary) and out of $Extend.
2019     fExtendSet        = new UnicodeSet(u"[\\p{Word_Break = Extend}-[:Hani:]]", status);
2020     fWSegSpaceSet     = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]",    status);
2021 
2022     fZWJSet           = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]",          status);
2023     fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);
2024 
2025     fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
2026     fDictionarySet->addAll(*fKatakanaSet);
2027     fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
2028 
2029     fALetterSet->removeAll(*fDictionarySet);
2030 
2031     fOtherSet        = new UnicodeSet();
2032     if(U_FAILURE(status)) {
2033         IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
2034         deferredStatus = status;
2035         return;
2036     }
2037 
2038     fOtherSet->complement();
2039     fOtherSet->removeAll(*fCRSet);
2040     fOtherSet->removeAll(*fLFSet);
2041     fOtherSet->removeAll(*fNewlineSet);
2042     fOtherSet->removeAll(*fKatakanaSet);
2043     fOtherSet->removeAll(*fHebrew_LetterSet);
2044     fOtherSet->removeAll(*fALetterSet);
2045     fOtherSet->removeAll(*fSingle_QuoteSet);
2046     fOtherSet->removeAll(*fDouble_QuoteSet);
2047     fOtherSet->removeAll(*fMidLetterSet);
2048     fOtherSet->removeAll(*fMidNumSet);
2049     fOtherSet->removeAll(*fNumericSet);
2050     fOtherSet->removeAll(*fExtendNumLetSet);
2051     fOtherSet->removeAll(*fWSegSpaceSet);
2052     fOtherSet->removeAll(*fFormatSet);
2053     fOtherSet->removeAll(*fExtendSet);
2054     fOtherSet->removeAll(*fRegionalIndicatorSet);
2055     fOtherSet->removeAll(*fZWJSet);
2056     fOtherSet->removeAll(*fExtendedPictSet);
2057 
2058     // Inhibit dictionary characters from being tested at all.
2059     fOtherSet->removeAll(*fDictionarySet);
2060 
2061     // Add classes and their names
2062     fSets->addElement(fCRSet, status); classNames.push_back("CR");
2063     fSets->addElement(fLFSet, status); classNames.push_back("LF");
2064     fSets->addElement(fNewlineSet, status); classNames.push_back("Newline");
2065     fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
2066     fSets->addElement(fHebrew_LetterSet, status); classNames.push_back("Hebrew");
2067     fSets->addElement(fALetterSet, status); classNames.push_back("ALetter");
2068     fSets->addElement(fSingle_QuoteSet, status); classNames.push_back("Single Quote");
2069     fSets->addElement(fDouble_QuoteSet, status); classNames.push_back("Double Quote");
2070     // Omit Katakana from fSets, which omits Katakana characters
2071     // from the test data. They are all in the dictionary set,
2072     // which this (old, to be retired) monkey test cannot handle.
2073     //fSets->addElement(fKatakanaSet, status);
2074 
2075     fSets->addElement(fMidLetterSet, status); classNames.push_back("MidLetter");
2076     fSets->addElement(fMidNumLetSet, status); classNames.push_back("MidNumLet");
2077     fSets->addElement(fMidNumSet, status); classNames.push_back("MidNum");
2078     fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
2079     fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2080     fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2081     fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2082     fSets->addElement(fExtendNumLetSet, status); classNames.push_back("ExtendNumLet");
2083     fSets->addElement(fWSegSpaceSet, status); classNames.push_back("WSegSpace");
2084 
2085     fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
2086     fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
2087 
2088     if (U_FAILURE(status)) {
2089         deferredStatus = status;
2090     }
2091 }
2092 
setText(const UnicodeString & s)2093 void RBBIWordMonkey::setText(const UnicodeString &s) {
2094     fText       = &s;
2095     prepareAppliedRules(s.length());
2096 }
2097 
2098 
next(int32_t prevPos)2099 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2100     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2101                               //   break position being tested.  The candidate break
2102                               //   location is before p2.
2103 
2104     int     breakPos = -1;
2105 
2106     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2107 
2108     if (U_FAILURE(deferredStatus)) {
2109         return -1;
2110     }
2111 
2112     // Prev break at end of string.  return DONE.
2113     if (prevPos >= fText->length()) {
2114         return -1;
2115     }
2116     p0 = p1 = p2 = p3 = prevPos;
2117     c3 =  fText->char32At(prevPos);
2118     c0 = c1 = c2 = 0;
2119     (void)p0;       // Suppress set but not used warning.
2120 
2121     // Loop runs once per "significant" character position in the input text.
2122     for (;;) {
2123         // Move all of the positions forward in the input string.
2124         p0 = p1;  c0 = c1;
2125         p1 = p2;  c1 = c2;
2126         p2 = p3;  c2 = c3;
2127 
2128         // Advance p3 by    X(Extend | Format)*   Rule 4
2129         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2130         do {
2131             p3 = fText->moveIndex32(p3, 1);
2132             c3 = fText->char32At(p3);
2133             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2134                break;
2135             }
2136         }
2137         while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
2138 
2139 
2140         if (p1 == p2) {
2141             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2142             continue;
2143         }
2144 
2145         if (p2 == fText->length()) {
2146             // Reached end of string.  Always a break position.
2147             break;
2148         }
2149 
2150         //     No Extend or Format characters may appear between the CR and LF,
2151         //     which requires the additional check for p2 immediately following p1.
2152         //
2153         if (c1==0x0D && c2==0x0A) {
2154           setAppliedRule(p2, "WB3   CR x LF");
2155           continue;
2156         }
2157 
2158         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2159             setAppliedRule(p2, "WB3a  Break before and after newlines (including CR and LF)");
2160             break;
2161         }
2162         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2163             setAppliedRule(p2, "WB3a  Break before and after newlines (including CR and LF)");
2164             break;
2165         }
2166 
2167         //              Not ignoring extend chars, so peek into input text to
2168         //              get the potential ZWJ, the character immediately preceding c2.
2169         //              Sloppy UChar32 indexing: p2-1 may reference trail half
2170         //              but char32At will get the full code point.
2171         if (fZWJSet->contains(fText->char32At(p2 - 1)) && fExtendedPictSet->contains(c2)){
2172             setAppliedRule(p2, "WB3c  ZWJ x Extended_Pictographic");
2173             continue;
2174         }
2175 
2176         if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
2177             setAppliedRule(p2, "WB3d  Keep horizontal whitespace together.");
2178             continue;
2179         }
2180 
2181         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2182             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2183             setAppliedRule(p2, "WB4   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)");
2184             continue;
2185         }
2186 
2187         if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
2188              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2189              (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2190             setAppliedRule(p2,
2191                            "WB6   (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter _Letter)");
2192             continue;
2193         }
2194 
2195         if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2196             (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2197             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2198             setAppliedRule(p2,
2199                            "WB7   (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)");
2200             continue;
2201         }
2202 
2203         if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2204             setAppliedRule(p2, "WB7a  Hebrew_Letter x Single_Quote");
2205             continue;
2206         }
2207 
2208           if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2209             setAppliedRule(p2, "WB7b  Hebrew_Letter x Double_Quote Hebrew_Letter");
2210             continue;
2211         }
2212 
2213         if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2214             setAppliedRule(p2, "WB7c  Hebrew_Letter Double_Quote x Hebrew_Letter");
2215             continue;
2216         }
2217 
2218         if (fNumericSet->contains(c1) &&
2219             fNumericSet->contains(c2)) {
2220             setAppliedRule(p2, "WB8   Numeric x Numeric");
2221             continue;
2222         }
2223 
2224         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2225             fNumericSet->contains(c2)) {
2226             setAppliedRule(p2, "WB9   (ALetter | Hebrew_Letter) x Numeric");
2227             continue;
2228         }
2229 
2230         if (fNumericSet->contains(c1) &&
2231             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2232             setAppliedRule(p2, "WB10   Numeric x (ALetter | Hebrew_Letter)");
2233             continue;
2234         }
2235 
2236           if (fNumericSet->contains(c0) &&
2237             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
2238             fNumericSet->contains(c2)) {
2239             setAppliedRule(p2, "WB11  Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric");
2240             continue;
2241         }
2242 
2243         if (fNumericSet->contains(c1) &&
2244             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
2245             fNumericSet->contains(c3)) {
2246             setAppliedRule(p2, "WB12  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric");
2247             continue;
2248         }
2249 
2250         //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
2251         //                  all Katakana are handled by the dictionary breaker.
2252         if (fKatakanaSet->contains(c1) &&
2253             fKatakanaSet->contains(c2))  {
2254             setAppliedRule(p2, "WB13  Katakana x Katakana");
2255             continue;
2256         }
2257 
2258         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2259              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2260              fExtendNumLetSet->contains(c2)) {
2261             setAppliedRule(p2,
2262                            "WB13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet");
2263             continue;
2264         }
2265 
2266         if (fExtendNumLetSet->contains(c1) &&
2267                 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2268                  fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
2269             setAppliedRule(p2, "WB13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)");
2270             continue;
2271         }
2272 
2273         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2274             setAppliedRule(p2, "WB15 - WB17   Group pairs of Regional Indicators.");
2275             break;
2276         }
2277         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2278             setAppliedRule(p2, "WB15 - WB17   Group pairs of Regional Indicators.");
2279             continue;
2280         }
2281 
2282         setAppliedRule(p2, "WB999");
2283         break;
2284     }
2285 
2286     breakPos = p2;
2287     return breakPos;
2288 }
2289 
2290 
charClasses()2291 UVector  *RBBIWordMonkey::charClasses() {
2292     return fSets;
2293 }
2294 
~RBBIWordMonkey()2295 RBBIWordMonkey::~RBBIWordMonkey() {
2296     delete fSets;
2297     delete fCRSet;
2298     delete fLFSet;
2299     delete fNewlineSet;
2300     delete fKatakanaSet;
2301     delete fHebrew_LetterSet;
2302     delete fALetterSet;
2303     delete fSingle_QuoteSet;
2304     delete fDouble_QuoteSet;
2305     delete fMidNumLetSet;
2306     delete fMidLetterSet;
2307     delete fMidNumSet;
2308     delete fNumericSet;
2309     delete fFormatSet;
2310     delete fExtendSet;
2311     delete fExtendNumLetSet;
2312     delete fWSegSpaceSet;
2313     delete fRegionalIndicatorSet;
2314     delete fDictionarySet;
2315     delete fOtherSet;
2316     delete fZWJSet;
2317     delete fExtendedPictSet;
2318 }
2319 
2320 
2321 
2322 
2323 //------------------------------------------------------------------------------------------
2324 //
2325 //   class RBBISentMonkey      Sentence Break specific implementation
2326 //                             of RBBIMonkeyKind.
2327 //
2328 //------------------------------------------------------------------------------------------
2329 class RBBISentMonkey: public RBBIMonkeyKind {
2330 public:
2331     RBBISentMonkey();
2332     virtual          ~RBBISentMonkey();
2333     virtual  UVector *charClasses();
2334     virtual  void     setText(const UnicodeString &s);
2335     virtual int32_t   next(int32_t i);
2336 private:
2337     int               moveBack(int posFrom);
2338     int               moveForward(int posFrom);
2339     UChar32           cAt(int pos);
2340 
2341     UVector      *fSets;
2342 
2343     UnicodeSet  *fSepSet;
2344     UnicodeSet  *fFormatSet;
2345     UnicodeSet  *fSpSet;
2346     UnicodeSet  *fLowerSet;
2347     UnicodeSet  *fUpperSet;
2348     UnicodeSet  *fOLetterSet;
2349     UnicodeSet  *fNumericSet;
2350     UnicodeSet  *fATermSet;
2351     UnicodeSet  *fSContinueSet;
2352     UnicodeSet  *fSTermSet;
2353     UnicodeSet  *fCloseSet;
2354     UnicodeSet  *fOtherSet;
2355     UnicodeSet  *fExtendSet;
2356 
2357     const UnicodeString  *fText;
2358 };
2359 
RBBISentMonkey()2360 RBBISentMonkey::RBBISentMonkey()
2361 {
2362     UErrorCode  status = U_ZERO_ERROR;
2363 
2364     fSets            = new UVector(status);
2365 
2366     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2367     //                       set and made into character classes of their own.  For the monkey impl,
2368     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2369     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2370     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2371     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2372     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2373     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2374     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2375     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2376     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2377     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2378     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2379     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2380     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2381     fOtherSet        = new UnicodeSet();
2382 
2383     if(U_FAILURE(status)) {
2384       deferredStatus = status;
2385       return;
2386     }
2387 
2388     fOtherSet->complement();
2389     fOtherSet->removeAll(*fSepSet);
2390     fOtherSet->removeAll(*fFormatSet);
2391     fOtherSet->removeAll(*fSpSet);
2392     fOtherSet->removeAll(*fLowerSet);
2393     fOtherSet->removeAll(*fUpperSet);
2394     fOtherSet->removeAll(*fOLetterSet);
2395     fOtherSet->removeAll(*fNumericSet);
2396     fOtherSet->removeAll(*fATermSet);
2397     fOtherSet->removeAll(*fSContinueSet);
2398     fOtherSet->removeAll(*fSTermSet);
2399     fOtherSet->removeAll(*fCloseSet);
2400     fOtherSet->removeAll(*fExtendSet);
2401 
2402     fSets->addElement(fSepSet, status); classNames.push_back("Sep");
2403     fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2404     fSets->addElement(fSpSet, status); classNames.push_back("Sp");
2405     fSets->addElement(fLowerSet, status); classNames.push_back("Lower");
2406     fSets->addElement(fUpperSet, status); classNames.push_back("Upper");
2407     fSets->addElement(fOLetterSet, status); classNames.push_back("OLetter");
2408     fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
2409     fSets->addElement(fATermSet, status); classNames.push_back("ATerm");
2410     fSets->addElement(fSContinueSet, status); classNames.push_back("SContinue");
2411     fSets->addElement(fSTermSet, status); classNames.push_back("STerm");
2412     fSets->addElement(fCloseSet, status); classNames.push_back("Close");
2413     fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2414     fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2415 
2416     if (U_FAILURE(status)) {
2417         deferredStatus = status;
2418     }
2419 }
2420 
2421 
2422 
setText(const UnicodeString & s)2423 void RBBISentMonkey::setText(const UnicodeString &s) {
2424     fText       = &s;
2425     prepareAppliedRules(s.length());
2426 }
2427 
charClasses()2428 UVector  *RBBISentMonkey::charClasses() {
2429     return fSets;
2430 }
2431 
2432 //  moveBack()   Find the "significant" code point preceding the index i.
2433 //               Skips over ($Extend | $Format)* .
2434 //
moveBack(int i)2435 int RBBISentMonkey::moveBack(int i) {
2436     if (i <= 0) {
2437         return -1;
2438     }
2439     UChar32   c;
2440     int32_t   j = i;
2441     do {
2442         j = fText->moveIndex32(j, -1);
2443         c = fText->char32At(j);
2444     }
2445     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2446     return j;
2447 
2448  }
2449 
2450 
moveForward(int i)2451 int RBBISentMonkey::moveForward(int i) {
2452     if (i>=fText->length()) {
2453         return fText->length();
2454     }
2455     UChar32   c;
2456     int32_t   j = i;
2457     do {
2458         j = fText->moveIndex32(j, 1);
2459         c = cAt(j);
2460     }
2461     while (fFormatSet->contains(c) || fExtendSet->contains(c));
2462     return j;
2463 }
2464 
cAt(int pos)2465 UChar32 RBBISentMonkey::cAt(int pos) {
2466     if (pos<0 || pos>=fText->length()) {
2467         return -1;
2468     } else {
2469         return fText->char32At(pos);
2470     }
2471 }
2472 
next(int32_t prevPos)2473 int32_t RBBISentMonkey::next(int32_t prevPos) {
2474     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2475                               //   break position being tested.  The candidate break
2476                               //   location is before p2.
2477 
2478     int     breakPos = -1;
2479 
2480     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2481     UChar32 c;
2482 
2483     if (U_FAILURE(deferredStatus)) {
2484         return -1;
2485     }
2486 
2487     // Prev break at end of string.  return DONE.
2488     if (prevPos >= fText->length()) {
2489         return -1;
2490     }
2491     p0 = p1 = p2 = p3 = prevPos;
2492     c3 =  fText->char32At(prevPos);
2493     c0 = c1 = c2 = 0;
2494     (void)p0;     // Suppress set but not used warning.
2495 
2496     // Loop runs once per "significant" character position in the input text.
2497     for (;;) {
2498         // Move all of the positions forward in the input string.
2499         p0 = p1;  c0 = c1;
2500         p1 = p2;  c1 = c2;
2501         p2 = p3;  c2 = c3;
2502 
2503         // Advance p3 by    X(Extend | Format)*   Rule 4
2504         p3 = moveForward(p3);
2505         c3 = cAt(p3);
2506 
2507         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2508             setAppliedRule(p2, "SB3   CR x LF");
2509             continue;
2510         }
2511 
2512         if (fSepSet->contains(c1)) {
2513             p2 = p1+1;   // Separators don't combine with Extend or Format.
2514 
2515             setAppliedRule(p2, "SB4   Sep  <break>");
2516             break;
2517         }
2518 
2519         if (p2 >= fText->length()) {
2520             // Reached end of string.  Always a break position.
2521             setAppliedRule(p2, "SB4   Sep  <break>");
2522             break;
2523         }
2524 
2525         if (p2 == prevPos) {
2526             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2527             setAppliedRule(p2, "SB4   Sep  <break>");
2528             continue;
2529         }
2530 
2531         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
2532             setAppliedRule(p2, "SB6   ATerm x Numeric");
2533             continue;
2534         }
2535 
2536           if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2537                 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2538             setAppliedRule(p2, "SB7   (Upper | Lower) ATerm  x  Uppper");
2539             continue;
2540         }
2541 
2542         //           Note:  STerm | ATerm are added to the negated part of the expression by a
2543         //                  note to the Unicode 5.0 documents.
2544         int p8 = p1;
2545         while (fSpSet->contains(cAt(p8))) {
2546             p8 = moveBack(p8);
2547         }
2548         while (fCloseSet->contains(cAt(p8))) {
2549             p8 = moveBack(p8);
2550         }
2551         if (fATermSet->contains(cAt(p8))) {
2552             p8=p2;
2553             for (;;) {
2554                 c = cAt(p8);
2555                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2556                     fLowerSet->contains(c) || fSepSet->contains(c) ||
2557                     fATermSet->contains(c) || fSTermSet->contains(c))  {
2558 
2559                     setAppliedRule(p2,
2560                                    "SB8   ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2561                     break;
2562                 }
2563                 p8 = moveForward(p8);
2564             }
2565             if (fLowerSet->contains(cAt(p8))) {
2566 
2567                 setAppliedRule(p2,
2568                                "SB8   ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2569                 continue;
2570             }
2571         }
2572 
2573         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2574             p8 = p1;
2575             while (fSpSet->contains(cAt(p8))) {
2576                 p8 = moveBack(p8);
2577             }
2578             while (fCloseSet->contains(cAt(p8))) {
2579                 p8 = moveBack(p8);
2580             }
2581             c = cAt(p8);
2582             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2583                 setAppliedRule(p2, "SB8a  (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm)");
2584                 continue;
2585             }
2586         }
2587 
2588         int p9 = p1;
2589         while (fCloseSet->contains(cAt(p9))) {
2590             p9 = moveBack(p9);
2591         }
2592         c = cAt(p9);
2593         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2594             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2595 
2596                 setAppliedRule(p2, "SB9  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)");
2597                 continue;
2598             }
2599         }
2600 
2601         int p10 = p1;
2602         while (fSpSet->contains(cAt(p10))) {
2603             p10 = moveBack(p10);
2604         }
2605         while (fCloseSet->contains(cAt(p10))) {
2606             p10 = moveBack(p10);
2607         }
2608         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2609             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2610                 setAppliedRule(p2, "SB10  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)");
2611                 continue;
2612             }
2613         }
2614 
2615         int p11 = p1;
2616         if (fSepSet->contains(cAt(p11))) {
2617             p11 = moveBack(p11);
2618         }
2619         while (fSpSet->contains(cAt(p11))) {
2620             p11 = moveBack(p11);
2621         }
2622         while (fCloseSet->contains(cAt(p11))) {
2623             p11 = moveBack(p11);
2624         }
2625         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2626           setAppliedRule(p2, "SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>");
2627             break;
2628         }
2629 
2630         setAppliedRule(p2, "SB12  Any x Any");
2631         continue;
2632     }
2633 
2634     breakPos = p2;
2635     return breakPos;
2636 }
2637 
~RBBISentMonkey()2638 RBBISentMonkey::~RBBISentMonkey() {
2639     delete fSets;
2640     delete fSepSet;
2641     delete fFormatSet;
2642     delete fSpSet;
2643     delete fLowerSet;
2644     delete fUpperSet;
2645     delete fOLetterSet;
2646     delete fNumericSet;
2647     delete fATermSet;
2648     delete fSContinueSet;
2649     delete fSTermSet;
2650     delete fCloseSet;
2651     delete fOtherSet;
2652     delete fExtendSet;
2653 }
2654 
2655 
2656 
2657 //-------------------------------------------------------------------------------------------
2658 //
2659 //  RBBILineMonkey
2660 //
2661 //-------------------------------------------------------------------------------------------
2662 
2663 class RBBILineMonkey: public RBBIMonkeyKind {
2664 public:
2665     RBBILineMonkey();
2666     virtual          ~RBBILineMonkey();
2667     virtual  UVector *charClasses();
2668     virtual  void     setText(const UnicodeString &s);
2669     virtual  int32_t  next(int32_t i);
2670     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2671 private:
2672     UVector      *fSets;
2673 
2674     UnicodeSet  *fBK;
2675     UnicodeSet  *fCR;
2676     UnicodeSet  *fLF;
2677     UnicodeSet  *fCM;
2678     UnicodeSet  *fNL;
2679     UnicodeSet  *fSG;
2680     UnicodeSet  *fWJ;
2681     UnicodeSet  *fZW;
2682     UnicodeSet  *fGL;
2683     UnicodeSet  *fCB;
2684     UnicodeSet  *fSP;
2685     UnicodeSet  *fB2;
2686     UnicodeSet  *fBA;
2687     UnicodeSet  *fBB;
2688     UnicodeSet  *fHH;
2689     UnicodeSet  *fHY;
2690     UnicodeSet  *fH2;
2691     UnicodeSet  *fH3;
2692     UnicodeSet  *fCL;
2693     UnicodeSet  *fCP;
2694     UnicodeSet  *fEX;
2695     UnicodeSet  *fIN;
2696     UnicodeSet  *fJL;
2697     UnicodeSet  *fJV;
2698     UnicodeSet  *fJT;
2699     UnicodeSet  *fNS;
2700     UnicodeSet  *fOP;
2701     UnicodeSet  *fQU;
2702     UnicodeSet  *fIS;
2703     UnicodeSet  *fNU;
2704     UnicodeSet  *fPO;
2705     UnicodeSet  *fPR;
2706     UnicodeSet  *fSY;
2707     UnicodeSet  *fAI;
2708     UnicodeSet  *fAL;
2709     UnicodeSet  *fCJ;
2710     UnicodeSet  *fHL;
2711     UnicodeSet  *fID;
2712     UnicodeSet  *fRI;
2713     UnicodeSet  *fXX;
2714     UnicodeSet  *fEB;
2715     UnicodeSet  *fEM;
2716     UnicodeSet  *fZWJ;
2717     UnicodeSet  *fOP30;
2718     UnicodeSet  *fCP30;
2719 
2720     BreakIterator        *fCharBI;
2721     const UnicodeString  *fText;
2722     RegexMatcher         *fNumberMatcher;
2723 };
2724 
RBBILineMonkey()2725 RBBILineMonkey::RBBILineMonkey() :
2726     RBBIMonkeyKind(),
2727     fSets(NULL),
2728 
2729     fCharBI(NULL),
2730     fText(NULL),
2731     fNumberMatcher(NULL)
2732 
2733 {
2734     if (U_FAILURE(deferredStatus)) {
2735         return;
2736     }
2737 
2738     UErrorCode  status = U_ZERO_ERROR;
2739 
2740     fSets  = new UVector(status);
2741 
2742     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2743     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2744     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2745     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2746     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2747     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2748     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2749     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2750     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2751     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2752     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2753     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2754     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2755     fHH    = new UnicodeSet();
2756     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2757     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2758     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2759     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2760     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2761     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2762     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2763     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2764     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2765     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2766     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2767     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2768     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2769     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2770     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2771     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2772     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2773     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2774     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2775     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2776     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2777     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2778     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2779     fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2780     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2781     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2782     fEB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status);
2783     fEM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
2784     fZWJ   = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
2785     fOP30  = new UnicodeSet(u"[\\p{Line_break=OP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2786     fCP30  = new UnicodeSet(u"[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2787 
2788     if (U_FAILURE(status)) {
2789         deferredStatus = status;
2790         return;
2791     }
2792 
2793     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
2794     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
2795     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
2796 
2797     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
2798     fCM->addAll(*fZWJ);    // ZWJ behaves as a CM.
2799 
2800     fHH->add(u'\u2010');   // Hyphen, '‐'
2801 
2802     // Sets and names.
2803     fSets->addElement(fBK, status); classNames.push_back("fBK");
2804     fSets->addElement(fCR, status); classNames.push_back("fCR");
2805     fSets->addElement(fLF, status); classNames.push_back("fLF");
2806     fSets->addElement(fCM, status); classNames.push_back("fCM");
2807     fSets->addElement(fNL, status); classNames.push_back("fNL");
2808     fSets->addElement(fWJ, status); classNames.push_back("fWJ");
2809     fSets->addElement(fZW, status); classNames.push_back("fZW");
2810     fSets->addElement(fGL, status); classNames.push_back("fGL");
2811     fSets->addElement(fCB, status); classNames.push_back("fCB");
2812     fSets->addElement(fSP, status); classNames.push_back("fSP");
2813     fSets->addElement(fB2, status); classNames.push_back("fB2");
2814     fSets->addElement(fBA, status); classNames.push_back("fBA");
2815     fSets->addElement(fBB, status); classNames.push_back("fBB");
2816     fSets->addElement(fHY, status); classNames.push_back("fHY");
2817     fSets->addElement(fH2, status); classNames.push_back("fH2");
2818     fSets->addElement(fH3, status); classNames.push_back("fH3");
2819     fSets->addElement(fCL, status); classNames.push_back("fCL");
2820     fSets->addElement(fCP, status); classNames.push_back("fCP");
2821     fSets->addElement(fEX, status); classNames.push_back("fEX");
2822     fSets->addElement(fIN, status); classNames.push_back("fIN");
2823     fSets->addElement(fJL, status); classNames.push_back("fJL");
2824     fSets->addElement(fJT, status); classNames.push_back("fJT");
2825     fSets->addElement(fJV, status); classNames.push_back("fJV");
2826     fSets->addElement(fNS, status); classNames.push_back("fNS");
2827     fSets->addElement(fOP, status); classNames.push_back("fOP");
2828     fSets->addElement(fQU, status); classNames.push_back("fQU");
2829     fSets->addElement(fIS, status); classNames.push_back("fIS");
2830     fSets->addElement(fNU, status); classNames.push_back("fNU");
2831     fSets->addElement(fPO, status); classNames.push_back("fPO");
2832     fSets->addElement(fPR, status); classNames.push_back("fPR");
2833     fSets->addElement(fSY, status); classNames.push_back("fSY");
2834     fSets->addElement(fAI, status); classNames.push_back("fAI");
2835     fSets->addElement(fAL, status); classNames.push_back("fAL");
2836     fSets->addElement(fHL, status); classNames.push_back("fHL");
2837     fSets->addElement(fID, status); classNames.push_back("fID");
2838     fSets->addElement(fWJ, status); classNames.push_back("fWJ");
2839     fSets->addElement(fRI, status); classNames.push_back("fRI");
2840     fSets->addElement(fSG, status); classNames.push_back("fSG");
2841     fSets->addElement(fEB, status); classNames.push_back("fEB");
2842     fSets->addElement(fEM, status); classNames.push_back("fEM");
2843     fSets->addElement(fZWJ, status); classNames.push_back("fZWJ");
2844     // TODO: fOP30 & fCP30 overlap with plain fOP. Probably OK, but fOP/CP chars will be over-represented.
2845     fSets->addElement(fOP30, status); classNames.push_back("fOP30");
2846     fSets->addElement(fCP30, status); classNames.push_back("fCP30");
2847 
2848     const char *rules =
2849             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
2850             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
2851             "((\\p{Line_Break=IS})(\\p{Line_Break=CM}|\\u200d)*)?"
2852             "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
2853             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
2854             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
2855             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
2856 
2857     fNumberMatcher = new RegexMatcher(
2858         UnicodeString(rules, -1, US_INV), 0, status);
2859 
2860     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2861 
2862     if (U_FAILURE(status)) {
2863         deferredStatus = status;
2864     }
2865 
2866 }
2867 
2868 
setText(const UnicodeString & s)2869 void RBBILineMonkey::setText(const UnicodeString &s) {
2870     fText       = &s;
2871     fCharBI->setText(s);
2872     prepareAppliedRules(s.length());
2873     fNumberMatcher->reset(s);
2874 }
2875 
2876 //
2877 //  rule9Adjust
2878 //     Line Break TR rules 9 and 10 implementation.
2879 //     This deals with combining marks and other sequences that
2880 //     that must be treated as if they were something other than what they actually are.
2881 //
2882 //     This is factored out into a separate function because it must be applied twice for
2883 //     each potential break, once to the chars before the position being checked, then
2884 //     again to the text following the possible break.
2885 //
rule9Adjust(int32_t pos,UChar32 * posChar,int32_t * nextPos,UChar32 * nextChar)2886 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2887     if (pos == -1) {
2888         // Invalid initial position.  Happens during the warmup iteration of the
2889         //   main loop in next().
2890         return;
2891     }
2892 
2893     int32_t  nPos = *nextPos;
2894 
2895     // LB 9  Keep combining sequences together.
2896     // advance over any CM class chars.  Note that Line Break CM is different
2897     // from the normal Grapheme Extend property.
2898     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2899           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2900         for (;;) {
2901             *nextChar = fText->char32At(nPos);
2902             if (!fCM->contains(*nextChar)) {
2903                 break;
2904             }
2905             nPos = fText->moveIndex32(nPos, 1);
2906         }
2907     }
2908 
2909 
2910     // LB 9 Treat X CM* as if it were x.
2911     //       No explicit action required.
2912 
2913     // LB 10  Treat any remaining combining mark as AL
2914     if (fCM->contains(*posChar)) {
2915         *posChar = u'A';
2916     }
2917 
2918     // Push the updated nextPos and nextChar back to our caller.
2919     // This only makes a difference if posChar got bigger by consuming a
2920     // combining sequence.
2921     *nextPos  = nPos;
2922     *nextChar = fText->char32At(nPos);
2923 }
2924 
2925 
2926 
next(int32_t startPos)2927 int32_t RBBILineMonkey::next(int32_t startPos) {
2928     UErrorCode status = U_ZERO_ERROR;
2929     int32_t    pos;       //  Index of the char following a potential break position
2930     UChar32    thisChar;  //  Character at above position "pos"
2931 
2932     int32_t    prevPos;   //  Index of the char preceding a potential break position
2933     UChar32    prevChar;  //  Character at above position.  Note that prevChar
2934                           //   and thisChar may not be adjacent because combining
2935                           //   characters between them will be ignored.
2936 
2937     int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
2938     UChar32    prevCharX2;
2939 
2940     int32_t    nextPos;   //  Index of the next character following pos.
2941                           //     Usually skips over combining marks.
2942     int32_t    nextCPPos; //  Index of the code point following "pos."
2943                           //     May point to a combining mark.
2944     int32_t    tPos;      //  temp value.
2945     UChar32    c;
2946 
2947     if (U_FAILURE(deferredStatus)) {
2948         return -1;
2949     }
2950 
2951     if (startPos >= fText->length()) {
2952         return -1;
2953     }
2954 
2955 
2956     // Initial values for loop.  Loop will run the first time without finding breaks,
2957     //                           while the invalid values shift out and the "this" and
2958     //                           "prev" positions are filled in with good values.
2959     pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
2960     thisChar = prevChar  = prevCharX2 = 0;
2961     nextPos  = nextCPPos = startPos;
2962 
2963 
2964     // Loop runs once per position in the test text, until a break position
2965     //  is found.
2966     for (;;) {
2967         prevPosX2 = prevPos;
2968         prevCharX2 = prevChar;
2969 
2970         prevPos   = pos;
2971         prevChar  = thisChar;
2972 
2973         pos       = nextPos;
2974         thisChar  = fText->char32At(pos);
2975 
2976         nextCPPos = fText->moveIndex32(pos, 1);
2977         nextPos   = nextCPPos;
2978 
2979 
2980         if (pos >= fText->length()) {
2981             setAppliedRule(pos, "LB2 - Break at end of text.");
2982             break;
2983         }
2984 
2985 
2986         //             We do this one out-of-order because the adjustment does not change anything
2987         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
2988         //             be applied.
2989         rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
2990         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
2991         c = fText->char32At(nextPos);
2992         rule9Adjust(pos, &thisChar, &nextPos, &c);
2993 
2994         // If the loop is still warming up - if we haven't shifted the initial
2995         //   -1 positions out of prevPos yet - loop back to advance the
2996         //    position in the input without any further looking for breaks.
2997         if (prevPos == -1) {
2998           setAppliedRule(pos, "LB 9 - adjust for combining sequences.");
2999             continue;
3000         }
3001 
3002 
3003         if (fBK->contains(prevChar)) {
3004             setAppliedRule(pos, "LB 4  Always break after hard line breaks");
3005             break;
3006         }
3007 
3008 
3009         if (prevChar == 0x0d && thisChar == 0x0a) {
3010             setAppliedRule(pos, "LB 5  Break after CR, LF, NL, but not inside CR LF");
3011             continue;
3012         }
3013         if (prevChar == 0x0d ||
3014             prevChar == 0x0a ||
3015             prevChar == 0x85)  {
3016             setAppliedRule(pos, "LB 5  Break after CR, LF, NL, but not inside CR LF");
3017             break;
3018         }
3019 
3020 
3021         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3022             fBK->contains(thisChar)) {
3023             setAppliedRule(pos, "LB 6  Don't break before hard line breaks");
3024             continue;
3025         }
3026 
3027 
3028         if (fSP->contains(thisChar)) {
3029             setAppliedRule(pos, "LB 7  Don't break before spaces or zero-width space.");
3030             continue;
3031         }
3032 
3033         // !!! ??? Is this the right text for the applied rule?
3034         if (fZW->contains(thisChar)) {
3035             setAppliedRule(pos, "LB 7  Don't break before spaces or zero-width space.");
3036             continue;
3037         }
3038 
3039 
3040         //       ZW SP* ÷
3041         //       Scan backwards from prevChar for SP* ZW
3042         tPos = prevPos;
3043         while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3044             tPos = fText->moveIndex32(tPos, -1);
3045         }
3046         if (fZW->contains(fText->char32At(tPos))) {
3047             setAppliedRule(pos, "LB 8  Break after zero width space");
3048             break;
3049         }
3050 
3051 
3052         //          Move this test up, before LB8a, because numbers can match a longer sequence that would
3053         //          also match 8a.  e.g. NU ZWJ IS PO     (ZWJ acts like CM)
3054         if (fNumberMatcher->lookingAt(prevPos, status)) {
3055             if (U_FAILURE(status)) {
3056                 setAppliedRule(pos, "LB 25 Numbers");
3057                 break;
3058             }
3059             // Matched a number.  But could have been just a single digit, which would
3060             //    not represent a "no break here" between prevChar and thisChar
3061             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
3062             if (numEndIdx > pos) {
3063                 // Number match includes at least our two chars being checked
3064                 if (numEndIdx > nextPos) {
3065                     // Number match includes additional chars.  Update pos and nextPos
3066                     //   so that next loop iteration will continue at the end of the number,
3067                     //   checking for breaks between last char in number & whatever follows.
3068                     pos = nextPos = numEndIdx;
3069                     do {
3070                         pos = fText->moveIndex32(pos, -1);
3071                         thisChar = fText->char32At(pos);
3072                     } while (fCM->contains(thisChar));
3073                 }
3074                 setAppliedRule(pos, "LB 25 Numbers");
3075                 continue;
3076             }
3077         }
3078 
3079 
3080         //       The monkey test's way of ignoring combining characters doesn't work
3081         //       for this rule. ZJ is also a CM. Need to get the actual character
3082         //       preceding "thisChar", not ignoring combining marks, possibly ZJ.
3083         {
3084             int32_t prevIdx = fText->moveIndex32(pos, -1);
3085             UChar32 prevC = fText->char32At(prevIdx);
3086             if (fZWJ->contains(prevC)) {
3087                 setAppliedRule(pos, "LB 8a ZWJ x");
3088                 continue;
3089             }
3090         }
3091 
3092 
3093         // appliedRule: "LB 9, 10"; //  Already done, at top of loop.";
3094         //
3095 
3096 
3097         //    x  WJ
3098         //    WJ  x
3099         //
3100         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3101             setAppliedRule(pos, "LB 11  Do not break before or after WORD JOINER and related characters.");
3102             continue;
3103         }
3104 
3105 
3106         if (fGL->contains(prevChar)) {
3107             setAppliedRule(pos, "LB 12  GL  x");
3108             continue;
3109         }
3110 
3111 
3112           if (!(fSP->contains(prevChar) ||
3113               fBA->contains(prevChar) ||
3114               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
3115               setAppliedRule(pos, "LB 12a  [^SP BA HY] x GL");
3116               continue;
3117         }
3118 
3119 
3120         if (fCL->contains(thisChar) ||
3121                 fCP->contains(thisChar) ||
3122                 fEX->contains(thisChar) ||
3123                 fSY->contains(thisChar)) {
3124             setAppliedRule(pos, "LB 13  Don't break before closings.");
3125             continue;
3126         }
3127 
3128 
3129         //       Scan backwards, checking for this sequence.
3130         //       The OP char could include combining marks, so we actually check for
3131         //           OP CM* SP*
3132         //       Another Twist: The Rule 9 fixes may have changed a SP CM
3133         //       sequence into a ID char, so before scanning back through spaces,
3134         //       verify that prevChar is indeed a space.  The prevChar variable
3135         //       may differ from fText[prevPos]
3136         tPos = prevPos;
3137         if (fSP->contains(prevChar)) {
3138             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3139                 tPos=fText->moveIndex32(tPos, -1);
3140             }
3141         }
3142         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3143             tPos=fText->moveIndex32(tPos, -1);
3144         }
3145         if (fOP->contains(fText->char32At(tPos))) {
3146             setAppliedRule(pos, "LB 14 Don't break after OP SP*");
3147             continue;
3148         }
3149 
3150 
3151         if (nextPos < fText->length()) {
3152             // note: UnicodeString::char32At(length) returns ffff, not distinguishable
3153             //       from a legit ffff character. So test length separately.
3154             UChar32 nextChar = fText->char32At(nextPos);
3155             if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) {
3156                 setAppliedRule(pos, "LB 14a Break before an IS that begins a number and follows a space");
3157                 break;
3158             }
3159         }
3160 
3161 
3162           if (fIS->contains(thisChar)) {
3163               setAppliedRule(pos, "LB 14b  Do not break before numeric separators, even after spaces.");
3164               continue;
3165         }
3166 
3167 
3168         if (fOP->contains(thisChar)) {
3169             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3170             int tPos = prevPos;
3171             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3172                 tPos = fText->moveIndex32(tPos, -1);
3173             }
3174             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3175                 tPos = fText->moveIndex32(tPos, -1);
3176             }
3177             if (fQU->contains(fText->char32At(tPos))) {
3178                 setAppliedRule(pos, "LB 15    QU SP* x OP");
3179                 continue;
3180             }
3181         }
3182 
3183 
3184         //    Scan backwards for SP* CM* (CL | CP)
3185         if (fNS->contains(thisChar)) {
3186             int tPos = prevPos;
3187             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3188                 tPos = fText->moveIndex32(tPos, -1);
3189             }
3190             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3191                 tPos = fText->moveIndex32(tPos, -1);
3192             }
3193             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3194                 setAppliedRule(pos, "LB 16   (CL | CP) SP* x NS");
3195                 continue;
3196             }
3197         }
3198 
3199 
3200         if (fB2->contains(thisChar)) {
3201             //  Scan backwards, checking for the B2 CM* SP* sequence.
3202             tPos = prevPos;
3203             if (fSP->contains(prevChar)) {
3204                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3205                     tPos=fText->moveIndex32(tPos, -1);
3206                 }
3207             }
3208             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3209                 tPos=fText->moveIndex32(tPos, -1);
3210             }
3211             if (fB2->contains(fText->char32At(tPos))) {
3212                 setAppliedRule(pos, "LB 17   B2 SP* x B2");
3213                 continue;
3214             }
3215         }
3216 
3217 
3218         if (fSP->contains(prevChar)) {
3219             setAppliedRule(pos, "LB 18    break after space");
3220             break;
3221         }
3222 
3223         //    x   QU
3224         //    QU  x
3225         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3226             setAppliedRule(pos, "LB 19");
3227             continue;
3228         }
3229 
3230         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3231             setAppliedRule(pos, "LB 20  Break around a CB");
3232             break;
3233         }
3234 
3235         //           Don't break between Hyphens and letters if a break precedes the hyphen.
3236         //           Formerly this was a Finnish tailoring.
3237         //           Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
3238         //           ^($HY | $HH) $AL;
3239         if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
3240                 prevPosX2 == -1) {
3241             setAppliedRule(pos, "LB 20.09");
3242             continue;
3243         }
3244 
3245         if (fBA->contains(thisChar) ||
3246             fHY->contains(thisChar) ||
3247             fNS->contains(thisChar) ||
3248             fBB->contains(prevChar) )   {
3249             setAppliedRule(pos, "LB 21");
3250             continue;
3251         }
3252 
3253         if (fHL->contains(prevCharX2) &&
3254                 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3255             setAppliedRule(pos, "LB 21a   HL (HY | BA) x");
3256             continue;
3257         }
3258 
3259         if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3260             setAppliedRule(pos, "LB 21b SY x HL");
3261             continue;
3262         }
3263 
3264         if (fIN->contains(thisChar))   {
3265             setAppliedRule(pos, "LB 22");
3266             continue;
3267         }
3268 
3269 
3270         //          (AL | HL) x NU
3271         //          NU x (AL | HL)
3272         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
3273             setAppliedRule(pos, "LB 23");
3274             continue;
3275         }
3276         if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3277             setAppliedRule(pos, "LB 23");
3278             continue;
3279         }
3280 
3281         // Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3282         //      PR x (ID | EB | EM)
3283         //     (ID | EB | EM) x PO
3284         if (fPR->contains(prevChar) &&
3285                 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar)))  {
3286             setAppliedRule(pos, "LB 23a");
3287             continue;
3288         }
3289         if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
3290                 fPO->contains(thisChar)) {
3291             setAppliedRule(pos, "LB 23a");
3292             continue;
3293         }
3294 
3295         //   Do not break between prefix and letters or ideographs.
3296         //         (PR | PO) x (AL | HL)
3297         //         (AL | HL) x (PR | PO)
3298         if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
3299                 (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3300             setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3301             continue;
3302         }
3303         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
3304                 (fPR->contains(thisChar) || fPO->contains(thisChar))) {
3305             setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3306             continue;
3307         }
3308 
3309         // appliedRule: "LB 25 numbers match"; // moved up, before LB 8a,
3310 
3311         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3312                                         fJV->contains(thisChar) ||
3313                                         fH2->contains(thisChar) ||
3314                                         fH3->contains(thisChar))) {
3315             setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3316             continue;
3317                                         }
3318 
3319         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3320             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3321             setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3322             continue;
3323         }
3324 
3325         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3326             fJT->contains(thisChar)) {
3327             setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3328             continue;
3329         }
3330 
3331         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3332             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3333             fIN->contains(thisChar)) {
3334             setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3335             continue;
3336             }
3337         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3338             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3339             fPO->contains(thisChar)) {
3340             setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3341             continue;
3342             }
3343         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3344             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3345             setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3346             continue;
3347             }
3348 
3349 
3350 
3351         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3352             setAppliedRule(pos, "LB 28  Do not break between alphabetics (\"at\").");
3353             continue;
3354         }
3355 
3356           if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3357               setAppliedRule(pos, "LB 29  Do not break between numeric punctuation and alphabetics (\"e.g.\").");
3358               continue;
3359         }
3360 
3361         //          (AL | NU) x OP
3362         //          CP x (AL | NU)
3363         if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP30->contains(thisChar)) {
3364             setAppliedRule(pos,  "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3365             continue;
3366         }
3367         if (fCP30->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3368             setAppliedRule(pos,  "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3369             continue;
3370         }
3371 
3372         //             RI  x  RI
3373         if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3374             setAppliedRule(pos, "LB30a    RI RI  ÷  RI");
3375             break;
3376         }
3377         if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3378             // Two Regional Indicators have been paired.
3379             // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
3380             // following RI. This is a hack.
3381             thisChar = -1;
3382             setAppliedRule(pos, "LB30a    RI RI  ÷  RI");
3383             continue;
3384         }
3385 
3386         if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3387             setAppliedRule(pos, "LB30b    Emoji Base x Emoji Modifier");
3388             continue;
3389         }
3390 
3391         setAppliedRule(pos, "LB 31    Break everywhere else");
3392         break;
3393     }
3394 
3395     return pos;
3396 }
3397 
3398 
charClasses()3399 UVector  *RBBILineMonkey::charClasses() {
3400     return fSets;
3401 }
3402 
3403 
~RBBILineMonkey()3404 RBBILineMonkey::~RBBILineMonkey() {
3405     delete fSets;
3406 
3407     delete fBK;
3408     delete fCR;
3409     delete fLF;
3410     delete fCM;
3411     delete fNL;
3412     delete fWJ;
3413     delete fZW;
3414     delete fGL;
3415     delete fCB;
3416     delete fSP;
3417     delete fB2;
3418     delete fBA;
3419     delete fBB;
3420     delete fHH;
3421     delete fHY;
3422     delete fH2;
3423     delete fH3;
3424     delete fCL;
3425     delete fCP;
3426     delete fEX;
3427     delete fIN;
3428     delete fJL;
3429     delete fJV;
3430     delete fJT;
3431     delete fNS;
3432     delete fOP;
3433     delete fQU;
3434     delete fIS;
3435     delete fNU;
3436     delete fPO;
3437     delete fPR;
3438     delete fSY;
3439     delete fAI;
3440     delete fAL;
3441     delete fCJ;
3442     delete fHL;
3443     delete fID;
3444     delete fRI;
3445     delete fSG;
3446     delete fXX;
3447     delete fEB;
3448     delete fEM;
3449     delete fZWJ;
3450     delete fOP30;
3451     delete fCP30;
3452 
3453     delete fCharBI;
3454     delete fNumberMatcher;
3455 }
3456 
3457 
3458 //-------------------------------------------------------------------------------------------
3459 //
3460 //   TestMonkey
3461 //
3462 //     params
3463 //       seed=nnnnn        Random number starting seed.
3464 //                         Setting the seed allows errors to be reproduced.
3465 //       loop=nnn          Looping count.  Controls running time.
3466 //                         -1:  run forever.
3467 //                          0 or greater:  run length.
3468 //
3469 //       type = char | word | line | sent | title
3470 //
3471 //  Example:
3472 //     intltest  rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3473 //
3474 //-------------------------------------------------------------------------------------------
3475 
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)3476 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3477     int32_t val = defaultVal;
3478     name.append(" *= *(-?\\d+)");
3479     UErrorCode status = U_ZERO_ERROR;
3480     RegexMatcher m(name, params, 0, status);
3481     if (m.find()) {
3482         // The param exists.  Convert the string to an int.
3483         char valString[100];
3484         int32_t paramLength = m.end(1, status) - m.start(1, status);
3485         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3486             paramLength = (int32_t)(sizeof(valString)-2);
3487         }
3488         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3489         val = strtol(valString, NULL, 10);
3490 
3491         // Delete this parameter from the params string.
3492         m.reset();
3493         params = m.replaceFirst("", status);
3494     }
3495     U_ASSERT(U_SUCCESS(status));
3496     return val;
3497 }
3498 #endif
3499 
3500 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
testBreakBoundPreceding(RBBITest * test,UnicodeString ustr,BreakIterator * bi,int expected[],int expectedcount)3501 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3502                                     BreakIterator *bi,
3503                                     int expected[],
3504                                     int expectedcount)
3505 {
3506     int count = 0;
3507     int i = 0;
3508     int forward[50];
3509     bi->setText(ustr);
3510     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3511         forward[count] = i;
3512         if (count < expectedcount && expected[count] != i) {
3513             test->errln("%s:%d break forward test failed: expected %d but got %d",
3514                         __FILE__, __LINE__, expected[count], i);
3515             break;
3516         }
3517         count ++;
3518     }
3519     if (count != expectedcount) {
3520         printStringBreaks(ustr, expected, expectedcount);
3521         test->errln("%s:%d break forward test failed: missed %d match",
3522                     __FILE__, __LINE__, expectedcount - count);
3523         return;
3524     }
3525     // testing boundaries
3526     for (i = 1; i < expectedcount; i ++) {
3527         int j = expected[i - 1];
3528         if (!bi->isBoundary(j)) {
3529             printStringBreaks(ustr, expected, expectedcount);
3530             test->errln("%s:%d isBoundary() failed.  Expected boundary at position %d",
3531                     __FILE__, __LINE__, j);
3532             return;
3533         }
3534         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3535             if (bi->isBoundary(j)) {
3536                 printStringBreaks(ustr, expected, expectedcount);
3537                 test->errln("%s:%d isBoundary() failed.  Not expecting boundary at position %d",
3538                     __FILE__, __LINE__, j);
3539                 return;
3540             }
3541         }
3542     }
3543 
3544     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3545         count --;
3546         if (forward[count] != i) {
3547             printStringBreaks(ustr, expected, expectedcount);
3548             test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
3549                         __FILE__, __LINE__, forward[count], i);
3550             break;
3551         }
3552     }
3553     if (count != 0) {
3554         printStringBreaks(ustr, expected, expectedcount);
3555         test->errln("break test previous() failed: missed a match");
3556         return;
3557     }
3558 
3559     // testing preceding
3560     for (i = 0; i < expectedcount - 1; i ++) {
3561         // int j = expected[i] + 1;
3562         int j = ustr.moveIndex32(expected[i], 1);
3563         for (; j <= expected[i + 1]; j ++) {
3564             int32_t expectedPreceding = expected[i];
3565             int32_t actualPreceding = bi->preceding(j);
3566             if (actualPreceding != expectedPreceding) {
3567                 printStringBreaks(ustr, expected, expectedcount);
3568                 test->errln("%s:%d preceding(%d): expected %d, got %d",
3569                         __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
3570                 return;
3571             }
3572         }
3573     }
3574 }
3575 #endif
3576 
TestWordBreaks(void)3577 void RBBITest::TestWordBreaks(void)
3578 {
3579 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3580 
3581     Locale        locale("en");
3582     UErrorCode    status = U_ZERO_ERROR;
3583     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3584     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3585     // Replaced any C+J characters in a row with a random sequence of characters
3586     // of the same length to make our C+J segmentation not get in the way.
3587     static const char *strlist[] =
3588     {
3589     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3590     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3591     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3592     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3593     "\\uac00\\u3588\\u009c\\u0953\\u194b",
3594     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3595     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3596     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3597     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3598     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3599     "\\u2027\\U000e0067\\u0a47\\u00b7",
3600     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3601     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3602     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3603     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3604     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3605     "\\u0027\\u11af\\U000e0057\\u0602",
3606     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3607     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3608     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3609     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3610     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3611     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3612     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3613     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3614     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3615     "\\u18f4\\U000e0049\\u20e7\\u2027",
3616     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3617     "\\ua183\\u102d\\u0bec\\u003a",
3618     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3619     "\\u003a\\u0e57\\u0fad\\u002e",
3620     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3621     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3622     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3623     "\\u003a\\u0664\\u00b7\\u1fba",
3624     "\\u003b\\u0027\\u00b7\\u47a3",
3625     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3626     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3627     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3628     };
3629     int loop;
3630     if (U_FAILURE(status)) {
3631         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3632         return;
3633     }
3634     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3635         // printf("looping %d\n", loop);
3636         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3637         // RBBICharMonkey monkey;
3638         RBBIWordMonkey monkey;
3639 
3640         int expected[50];
3641         int expectedcount = 0;
3642 
3643         monkey.setText(ustr);
3644         int i;
3645         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3646             expected[expectedcount ++] = i;
3647         }
3648 
3649         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3650     }
3651     delete bi;
3652 #endif
3653 }
3654 
TestWordBoundary(void)3655 void RBBITest::TestWordBoundary(void)
3656 {
3657     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3658     Locale        locale("en");
3659     UErrorCode    status = U_ZERO_ERROR;
3660     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3661     LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
3662     if (U_FAILURE(status)) {
3663         errcheckln(status, "%s:%d Creation of break iterator failed %s",
3664                 __FILE__, __LINE__, u_errorName(status));
3665         return;
3666     }
3667     UChar         str[50];
3668     static const char *strlist[] =
3669     {
3670     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3671     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3672     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3673     "\\u2027\\U000e0067\\u0a47\\u00b7",
3674     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3675     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3676     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3677     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3678     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3679     "\\u0027\\u11af\\U000e0057\\u0602",
3680     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3681     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3682     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3683     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3684     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3685     "\\U000e0065\\u302c\\u09ee\\U000e0068",
3686     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3687     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3688     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3689     "\\u58f4\\U000e0049\\u20e7\\u2027",
3690     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3691     "\\ua183\\u102d\\u0bec\\u003a",
3692     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3693     "\\u003a\\u0e57\\u0fad\\u002e",
3694     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3695     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3696     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3697     "\\u003a\\u0664\\u00b7\\u1fba",
3698     "\\u003b\\u0027\\u00b7\\u47a3",
3699     };
3700     int loop;
3701     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3702         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3703         UnicodeString ustr(str);
3704         int forward[50];
3705         int count = 0;
3706 
3707         bi->setText(ustr);
3708         int prev = -1;
3709         for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
3710             ++count;
3711             if (count >= UPRV_LENGTHOF(forward)) {
3712                 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
3713                         __FILE__, __LINE__, loop, count, boundary);
3714                 return;
3715             }
3716             forward[count] = boundary;
3717             if (boundary <= prev) {
3718                 errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
3719                         __FILE__, __LINE__, loop, prev, boundary);
3720                 break;
3721             }
3722             for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
3723                 if (bi->isBoundary(nonBoundary)) {
3724                     printStringBreaks(ustr, forward, count);
3725                     errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
3726                            __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
3727                     return;
3728                 }
3729             }
3730             if (!bi->isBoundary(boundary)) {
3731                 printStringBreaks(ustr, forward, count);
3732                 errln("%s:%d happy boundary test failed: expected %d a boundary",
3733                        __FILE__, __LINE__, boundary);
3734                 return;
3735             }
3736             prev = boundary;
3737         }
3738     }
3739 }
3740 
TestLineBreaks(void)3741 void RBBITest::TestLineBreaks(void)
3742 {
3743 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3744     Locale        locale("en");
3745     UErrorCode    status = U_ZERO_ERROR;
3746     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3747     const int32_t  STRSIZE = 50;
3748     UChar         str[STRSIZE];
3749     static const char *strlist[] =
3750     {
3751      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3752      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3753              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3754      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3755              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3756      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3757      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3758      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3759      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3760      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3761      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3762      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3763      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3764      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3765      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3766      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3767      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3768      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3769      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3770      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3771      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3772      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3773      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3774      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3775      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3776      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3777      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3778      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3779      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3780      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3781      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3782      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3783      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3784      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3785      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3786      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3787      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3788      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3789      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3790          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3791     };
3792     int loop;
3793     TEST_ASSERT_SUCCESS(status);
3794     if (U_FAILURE(status)) {
3795         return;
3796     }
3797     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3798         // printf("looping %d\n", loop);
3799         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3800         if (t >= STRSIZE) {
3801             TEST_ASSERT(FALSE);
3802             continue;
3803         }
3804 
3805 
3806         UnicodeString ustr(str);
3807         RBBILineMonkey monkey;
3808         if (U_FAILURE(monkey.deferredStatus)) {
3809             continue;
3810         }
3811 
3812         const int EXPECTEDSIZE = 50;
3813         int expected[EXPECTEDSIZE];
3814         int expectedcount = 0;
3815 
3816         monkey.setText(ustr);
3817 
3818         int i;
3819         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3820             if (expectedcount >= EXPECTEDSIZE) {
3821                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3822                 return;
3823             }
3824             expected[expectedcount ++] = i;
3825         }
3826 
3827         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3828     }
3829     delete bi;
3830 #endif
3831 }
3832 
TestSentBreaks(void)3833 void RBBITest::TestSentBreaks(void)
3834 {
3835 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3836     Locale        locale("en");
3837     UErrorCode    status = U_ZERO_ERROR;
3838     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3839     UChar         str[200];
3840     static const char *strlist[] =
3841     {
3842      "Now\ris\nthe\r\ntime\n\rfor\r\r",
3843      "This\n",
3844      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3845      "\"Sentence ending with a quote.\" Bye.",
3846      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
3847      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3848      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3849      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3850      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3851      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3852      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3853              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3854              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3855              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3856      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3857              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3858              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3859              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3860              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3861              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3862     };
3863     int loop;
3864     if (U_FAILURE(status)) {
3865         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3866         return;
3867     }
3868     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3869         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3870         UnicodeString ustr(str);
3871 
3872         RBBISentMonkey monkey;
3873         if (U_FAILURE(monkey.deferredStatus)) {
3874             continue;
3875         }
3876 
3877         const int EXPECTEDSIZE = 50;
3878         int expected[EXPECTEDSIZE];
3879         int expectedcount = 0;
3880 
3881         monkey.setText(ustr);
3882 
3883         int i;
3884         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3885             if (expectedcount >= EXPECTEDSIZE) {
3886                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3887                 return;
3888             }
3889             expected[expectedcount ++] = i;
3890         }
3891 
3892         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3893     }
3894     delete bi;
3895 #endif
3896 }
3897 
TestMonkey()3898 void RBBITest::TestMonkey() {
3899 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3900 
3901     UErrorCode     status    = U_ZERO_ERROR;
3902     int32_t        loopCount = 500;
3903     int32_t        seed      = 1;
3904     UnicodeString  breakType = "all";
3905     Locale         locale("en");
3906     UBool          useUText  = FALSE;
3907 
3908     if (quick == FALSE) {
3909         loopCount = 10000;
3910     }
3911 
3912     if (fTestParams) {
3913         UnicodeString p(fTestParams);
3914         loopCount = getIntParam("loop", p, loopCount);
3915         seed      = getIntParam("seed", p, seed);
3916 
3917         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3918         if (m.find()) {
3919             breakType = m.group(1, status);
3920             m.reset();
3921             p = m.replaceFirst("", status);
3922         }
3923 
3924         RegexMatcher u(" *utext", p, 0, status);
3925         if (u.find()) {
3926             useUText = TRUE;
3927             u.reset();
3928             p = u.replaceFirst("", status);
3929         }
3930 
3931 
3932         // m.reset(p);
3933         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3934             // Each option is stripped out of the option string as it is processed.
3935             // All options have been checked.  The option string should have been completely emptied..
3936             char buf[100];
3937             p.extract(buf, sizeof(buf), NULL, status);
3938             buf[sizeof(buf)-1] = 0;
3939             errln("Unrecognized or extra parameter:  %s\n", buf);
3940             return;
3941         }
3942 
3943     }
3944 
3945     if (breakType == "char" || breakType == "all") {
3946         RBBICharMonkey  m;
3947         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3948         if (U_SUCCESS(status)) {
3949             RunMonkey(bi, m, "char", seed, loopCount, useUText);
3950             if (breakType == "all" && useUText==FALSE) {
3951                 // Also run a quick test with UText when "all" is specified
3952                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3953             }
3954         }
3955         else {
3956             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3957         }
3958         delete bi;
3959     }
3960 
3961     if (breakType == "word" || breakType == "all") {
3962         logln("Word Break Monkey Test");
3963         RBBIWordMonkey  m;
3964         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
3965         if (U_SUCCESS(status)) {
3966             RunMonkey(bi, m, "word", seed, loopCount, useUText);
3967         }
3968         else {
3969             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3970         }
3971         delete bi;
3972     }
3973 
3974     if (breakType == "line" || breakType == "all") {
3975         logln("Line Break Monkey Test");
3976         RBBILineMonkey  m;
3977         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
3978         if (loopCount >= 10) {
3979             loopCount = loopCount / 5;   // Line break runs slower than the others.
3980         }
3981         if (U_SUCCESS(status)) {
3982             RunMonkey(bi, m, "line", seed, loopCount, useUText);
3983         }
3984         else {
3985             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3986         }
3987         delete bi;
3988     }
3989 
3990     if (breakType == "sent" || breakType == "all"  ) {
3991         logln("Sentence Break Monkey Test");
3992         RBBISentMonkey  m;
3993         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
3994         if (loopCount >= 10) {
3995             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
3996         }
3997         if (U_SUCCESS(status)) {
3998             RunMonkey(bi, m, "sent", seed, loopCount, useUText);
3999         }
4000         else {
4001             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4002         }
4003         delete bi;
4004     }
4005 
4006 #endif
4007 }
4008 
4009 //
4010 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
4011 //    Parameters:
4012 //       bi      - the break iterator to use
4013 //       mk      - MonkeyKind, abstraction for obtaining expected results
4014 //       name    - Name of test (char, word, etc.) for use in error messages
4015 //       seed    - Seed for starting random number generator (parameter from user)
4016 //       numIterations
4017 //
RunMonkey(BreakIterator * bi,RBBIMonkeyKind & mk,const char * name,uint32_t seed,int32_t numIterations,UBool useUText)4018 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
4019                          int32_t numIterations, UBool useUText) {
4020 
4021 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4022 
4023     const int32_t    TESTSTRINGLEN = 500;
4024     UnicodeString    testText;
4025     int32_t          numCharClasses;
4026     UVector          *chClasses;
4027     int              expectedCount = 0;
4028     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
4029     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
4030     char             reverseBreaks[TESTSTRINGLEN*2+1];
4031     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
4032     char             followingBreaks[TESTSTRINGLEN*2+1];
4033     char             precedingBreaks[TESTSTRINGLEN*2+1];
4034     int              i;
4035     int              loopCount = 0;
4036 
4037 
4038     m_seed = seed;
4039 
4040     numCharClasses = mk.charClasses()->size();
4041     chClasses      = mk.charClasses();
4042 
4043     // Check for errors that occured during the construction of the MonkeyKind object.
4044     //  Can't report them where they occured because errln() is a method coming from intlTest,
4045     //  and is not visible outside of RBBITest :-(
4046     if (U_FAILURE(mk.deferredStatus)) {
4047         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4048         return;
4049     }
4050 
4051     // Verify that the character classes all have at least one member.
4052     for (i=0; i<numCharClasses; i++) {
4053         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4054         if (s == NULL || s->size() == 0) {
4055             errln("Character Class #%d is null or of zero size.", i);
4056             return;
4057         }
4058     }
4059 
4060     // For minimizing width of class name output.
4061     int classNameSize = mk.maxClassNameSize();
4062 
4063     while (loopCount < numIterations || numIterations == -1) {
4064         if (numIterations == -1 && loopCount % 10 == 0) {
4065             // If test is running in an infinite loop, display a periodic tic so
4066             //   we can tell that it is making progress.
4067             fprintf(stderr, ".");
4068         }
4069         // Save current random number seed, so that we can recreate the random numbers
4070         //   for this loop iteration in event of an error.
4071         seed = m_seed;
4072 
4073         // Populate a test string with data.
4074         testText.truncate(0);
4075         for (i=0; i<TESTSTRINGLEN; i++) {
4076             int32_t  aClassNum = m_rand() % numCharClasses;
4077             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4078             int32_t   charIdx = m_rand() % classSet->size();
4079             UChar32   c = classSet->charAt(charIdx);
4080             if (c < 0) {   // TODO:  deal with sets containing strings.
4081                 errln("%s:%d c < 0", __FILE__, __LINE__);
4082                 break;
4083             }
4084             // Do not assemble a supplementary character from randomly generated separate surrogates.
4085             //   (It could be a dictionary character)
4086             if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4087                 continue;
4088             }
4089 
4090             testText.append(c);
4091         }
4092 
4093         // Calculate the expected results for this test string and reset applied rules.
4094         mk.setText(testText);
4095 
4096         memset(expectedBreaks, 0, sizeof(expectedBreaks));
4097         expectedBreaks[0] = 1;
4098         int32_t breakPos = 0;
4099         expectedCount = 0;
4100         for (;;) {
4101             breakPos = mk.next(breakPos);
4102             if (breakPos == -1) {
4103                 break;
4104             }
4105             if (breakPos > testText.length()) {
4106                 errln("breakPos > testText.length()");
4107             }
4108             expectedBreaks[breakPos] = 1;
4109             U_ASSERT(expectedCount<testText.length());
4110         }
4111 
4112         // Find the break positions using forward iteration
4113         memset(forwardBreaks, 0, sizeof(forwardBreaks));
4114         if (useUText) {
4115             UErrorCode status = U_ZERO_ERROR;
4116             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4117             // testUText = utext_openUnicodeString(testUText, &testText, &status);
4118             bi->setText(testUText, status);
4119             TEST_ASSERT_SUCCESS(status);
4120             utext_close(testUText);   // The break iterator does a shallow clone of the UText
4121                                       //  This UText can be closed immediately, so long as the
4122                                       //  testText string continues to exist.
4123         } else {
4124             bi->setText(testText);
4125         }
4126 
4127         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4128             if (i < 0 || i > testText.length()) {
4129                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4130                 break;
4131             }
4132             forwardBreaks[i] = 1;
4133         }
4134 
4135         // Find the break positions using reverse iteration
4136         memset(reverseBreaks, 0, sizeof(reverseBreaks));
4137         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4138             if (i < 0 || i > testText.length()) {
4139                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4140                 break;
4141             }
4142             reverseBreaks[i] = 1;
4143         }
4144 
4145         // Find the break positions using isBoundary() tests.
4146         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4147         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4148         for (i=0; i<=testText.length(); i++) {
4149             isBoundaryBreaks[i] = bi->isBoundary(i);
4150         }
4151 
4152 
4153         // Find the break positions using the following() function.
4154         // printf(".");
4155         memset(followingBreaks, 0, sizeof(followingBreaks));
4156         int32_t   lastBreakPos = 0;
4157         followingBreaks[0] = 1;
4158         for (i=0; i<testText.length(); i++) {
4159             breakPos = bi->following(i);
4160             if (breakPos <= i ||
4161                 breakPos < lastBreakPos ||
4162                 breakPos > testText.length() ||
4163                 (breakPos > lastBreakPos && lastBreakPos > i)) {
4164                 errln("%s break monkey test: "
4165                     "Out of range value returned by BreakIterator::following().\n"
4166                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4167                          name, seed, i, breakPos, lastBreakPos);
4168                 break;
4169             }
4170             followingBreaks[breakPos] = 1;
4171             lastBreakPos = breakPos;
4172         }
4173 
4174         // Find the break positions using the preceding() function.
4175         memset(precedingBreaks, 0, sizeof(precedingBreaks));
4176         lastBreakPos = testText.length();
4177         precedingBreaks[testText.length()] = 1;
4178         for (i=testText.length(); i>0; i--) {
4179             breakPos = bi->preceding(i);
4180             if (breakPos >= i ||
4181                 breakPos > lastBreakPos ||
4182                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4183                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4184                 errln("%s break monkey test: "
4185                     "Out of range value returned by BreakIterator::preceding().\n"
4186                     "index=%d;  prev returned %d; lastBreak=%d" ,
4187                     name,  i, breakPos, lastBreakPos);
4188                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4189                     precedingBreaks[i] = 2;   // Forces an error.
4190                 }
4191             } else {
4192                 if (breakPos >= 0) {
4193                     precedingBreaks[breakPos] = 1;
4194                 }
4195                 lastBreakPos = breakPos;
4196             }
4197         }
4198 
4199         // Compare the expected and actual results.
4200         for (i=0; i<=testText.length(); i++) {
4201             const char *errorType = NULL;
4202             const char* currentBreakData = NULL;
4203             if  (forwardBreaks[i] != expectedBreaks[i]) {
4204                 errorType = "next()";
4205                 currentBreakData = forwardBreaks;
4206             } else if (reverseBreaks[i] != forwardBreaks[i]) {
4207                 errorType = "previous()";
4208                 currentBreakData = reverseBreaks;
4209            } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4210                 errorType = "isBoundary()";
4211                 currentBreakData = isBoundaryBreaks;
4212             } else if (followingBreaks[i] != expectedBreaks[i]) {
4213                 errorType = "following()";
4214                 currentBreakData = followingBreaks;
4215             } else if (precedingBreaks[i] != expectedBreaks[i]) {
4216                 errorType = "preceding()";
4217                 currentBreakData = precedingBreaks;
4218             }
4219 
4220             if (errorType != NULL) {
4221                 // Format a range of the test text that includes the failure as
4222                 //  a data item that can be included in the rbbi test data file.
4223 
4224                 // Start of the range is the last point where expected and actual results
4225                 //  both agreed that there was a break position.
4226 
4227                 int startContext = i;
4228                 int32_t count = 0;
4229                 for (;;) {
4230                     if (startContext==0) { break; }
4231                     startContext --;
4232                     if (expectedBreaks[startContext] != 0) {
4233                         if (count == 2) break;
4234                         count ++;
4235                     }
4236                 }
4237 
4238                 // End of range is two expected breaks past the start position.
4239                 int endContext = i + 1;
4240                 int ci;
4241                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4242                     for (;;) {
4243                         if (endContext >= testText.length()) {break;}
4244                         if (expectedBreaks[endContext-1] != 0) {
4245                             if (count == 0) break;
4246                             count --;
4247                         }
4248                         endContext ++;
4249                     }
4250                 }
4251 
4252                 // Formatting of each line includes:
4253                 //   character code
4254                 //   reference break: '|' -> a break, '.' -> no break
4255                 //   actual break:    '|' -> a break, '.' -> no break
4256                 //   (name of character clase)
4257                 //   Unicode name of character
4258                 //   '-->' indicates location of the difference.
4259 
4260                 MONKEY_ERROR(
4261                     (expectedBreaks[i] ? "Break expected but not found" :
4262                        "Break found but not expected"),
4263                     name, i, seed);
4264 
4265                 for (ci=startContext; (ci = testText.moveIndex32(ci, 1));) {
4266                     UChar32  c;
4267                     c = testText.char32At(ci);
4268 
4269                     std::string currentLineFlag = "   ";
4270                     if (ci == i) {
4271                         currentLineFlag = "-->";  // Error position
4272                     }
4273 
4274                     // BMP or SMP character in hex
4275                     char hexCodePoint[12];
4276                     std::string format = "    \\u%04x";
4277                     if (c >= 0x10000) {
4278                         format = "\\U%08x";
4279                     }
4280                     sprintf(hexCodePoint, format.c_str(), c);
4281 
4282                     // Get the class name and character name for the character.
4283                     char cName[200];
4284                     UErrorCode status = U_ZERO_ERROR;
4285                     u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
4286 
4287                     char buffer[200];
4288                     auto ret = snprintf(buffer, UPRV_LENGTHOF(buffer),
4289                              "%4s %3i :  %1s  %1s  %10s  %-*s  %-40s  %-40s",
4290                              currentLineFlag.c_str(),
4291                              ci,
4292                              expectedBreaks[ci] == 0 ? "." : "|",  // Reference break
4293                              currentBreakData[ci] == 0 ? "." : "|",  // Actual break
4294                              hexCodePoint,
4295                              classNameSize,
4296                              mk.classNameFromCodepoint(c).c_str(),
4297                              mk.getAppliedRule(ci).c_str(), cName);
4298                     (void)ret;
4299                     U_ASSERT(0 <= ret && ret < UPRV_LENGTHOF(buffer));
4300 
4301                     // Output the error
4302                     if (ci == i) {
4303                         errln(buffer);
4304                     } else {
4305                         infoln(buffer);
4306                     }
4307 
4308                     if (ci >= endContext) { break; }
4309                 }
4310                 break;
4311             }
4312         }
4313 
4314         loopCount++;
4315     }
4316 #endif
4317 }
4318 
4319 
4320 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
4321 //             This test checks the initial patch,
4322 //             which is to just keep it from crashing.  Correct word boundaries
4323 //             await a proper fix to the dictionary code.
4324 //
TestBug5532(void)4325 void RBBITest::TestBug5532(void)  {
4326    // Text includes a mixture of Thai and Latin.
4327    const unsigned char utf8Data[] = {
4328            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4329            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4330            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4331            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4332            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4333            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4334            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4335            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4336            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4337            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4338            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4339 
4340     UErrorCode status = U_ZERO_ERROR;
4341     UText utext=UTEXT_INITIALIZER;
4342     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4343     TEST_ASSERT_SUCCESS(status);
4344 
4345     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4346     TEST_ASSERT_SUCCESS(status);
4347     if (U_SUCCESS(status)) {
4348         bi->setText(&utext, status);
4349         TEST_ASSERT_SUCCESS(status);
4350 
4351         int32_t breakCount = 0;
4352         int32_t previousBreak = -1;
4353         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4354             // For now, just make sure that the break iterator doesn't hang.
4355             TEST_ASSERT(previousBreak < bi->current());
4356             previousBreak = bi->current();
4357         }
4358         TEST_ASSERT(breakCount > 0);
4359     }
4360     delete bi;
4361     utext_close(&utext);
4362 }
4363 
4364 
TestBug9983(void)4365 void RBBITest::TestBug9983(void)  {
4366     UnicodeString text = UnicodeString("\\u002A"  // * Other
4367                                        "\\uFF65"  //   Other
4368                                        "\\u309C"  //   Katakana
4369                                        "\\uFF9F"  //   Extend
4370                                        "\\uFF65"  //   Other
4371                                        "\\u0020"  //   Other
4372                                        "\\u0000").unescape();
4373 
4374     UErrorCode status = U_ZERO_ERROR;
4375     LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4376         BreakIterator::createWordInstance(Locale::getRoot(), status)));
4377     TEST_ASSERT_SUCCESS(status);
4378     LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4379         BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4380     TEST_ASSERT_SUCCESS(status);
4381     if (U_FAILURE(status)) {
4382         return;
4383     }
4384     int32_t offset, rstatus, iterationCount;
4385 
4386     brkiter->setText(text);
4387     brkiter->last();
4388     iterationCount = 0;
4389     while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4390         iterationCount++;
4391         rstatus = brkiter->getRuleStatus();
4392         (void)rstatus;     // Suppress set but not used warning.
4393         if (iterationCount >= 10) {
4394            break;
4395         }
4396     }
4397     TEST_ASSERT(iterationCount == 6);
4398 
4399     brkiterPOSIX->setText(text);
4400     brkiterPOSIX->last();
4401     iterationCount = 0;
4402     while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4403         iterationCount++;
4404         rstatus = brkiterPOSIX->getRuleStatus();
4405         (void)rstatus;     // Suppress set but not used warning.
4406         if (iterationCount >= 10) {
4407            break;
4408         }
4409     }
4410     TEST_ASSERT(iterationCount == 6);
4411 }
4412 
// Bug 7547 - verify that building a break iterator from empty rules produces an error.
4414 //
TestBug7547()4415 void RBBITest::TestBug7547() {
4416     UnicodeString rules;
4417     UErrorCode status = U_ZERO_ERROR;
4418     UParseError parseError;
4419     RuleBasedBreakIterator breakIterator(rules, parseError, status);
4420     if (status != U_BRK_RULE_SYNTAX) {
4421         errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4422     }
4423     if (parseError.line != 1 || parseError.offset != 0) {
4424         errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4425     }
4426 }
4427 
4428 
TestBug12797()4429 void RBBITest::TestBug12797() {
4430     UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4431     UErrorCode status = U_ZERO_ERROR;
4432     UParseError parseError;
4433     RuleBasedBreakIterator bi(rules, parseError, status);
4434     if (U_FAILURE(status)) {
4435         errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4436         return;
4437     }
4438     UnicodeString text = "abc";
4439     bi.setText(text);
4440     bi.first();
4441     int32_t boundary = bi.next();
4442     if (boundary != 3) {
4443         errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4444     }
4445 }
4446 
TestBug12918()4447 void RBBITest::TestBug12918() {
4448     // This test triggers an assertion failure in dictbe.cpp
4449     const UChar *crasherString = u"\u3325\u4a16";
4450     UErrorCode status = U_ZERO_ERROR;
4451     UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
4452     if (U_FAILURE(status)) {
4453         dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4454         return;
4455     }
4456     ubrk_first(iter);
4457     int32_t pos = 0;
4458     int32_t lastPos = -1;
4459     while((pos = ubrk_next(iter)) != UBRK_DONE) {
4460         if (pos <= lastPos) {
4461             errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4462             break;
4463         }
4464     }
4465     ubrk_close(iter);
4466 }
4467 
TestBug12932()4468 void RBBITest::TestBug12932() {
4469     // Node Stack overflow in the RBBI rule parser caused a seg fault.
4470     UnicodeString ruleStr(
4471             "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4472             "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4473             "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4474             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4475             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4476             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4477 
4478     UErrorCode status = U_ZERO_ERROR;
4479     UParseError parseError;
4480     RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
4481     if (status != U_BRK_RULE_SYNTAX) {
4482         errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4483                 __FILE__, __LINE__, u_errorName(status));
4484     }
4485 }
4486 
4487 
// Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
//             remain undivided by ICU char, word and line break.
TestEmoji()4490 void RBBITest::TestEmoji() {
4491 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4492     UErrorCode  status = U_ZERO_ERROR;
4493 
4494     CharString testFileName;
4495     testFileName.append(IntlTest::getSourceTestData(status), status);
4496     testFileName.appendPathPart("emoji-test.txt", status);
4497     if (U_FAILURE(status)) {
4498         errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4499         return;
4500     }
4501     logln("Opening data file %s\n", testFileName.data());
4502 
4503     int    len;
4504     UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
4505     if (U_FAILURE(status) || testFile == NULL) {
4506         errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4507         return;
4508     }
4509     UnicodeString testFileAsString(testFile, len);
4510     delete [] testFile;
4511 
4512     RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
4513     RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
4514     //           hexMatcher group(1) is a hex number, or empty string if no hex number present.
4515     int32_t lineNumber = 0;
4516 
4517     LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4518     LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4519     LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
4520     if (U_FAILURE(status)) {
4521         dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
4522         return;
4523     }
4524 
4525     while (lineMatcher.find()) {
4526         ++lineNumber;
4527         UnicodeString line = lineMatcher.group(status);
4528         hexMatcher.reset(line);
4529         UnicodeString testString;   // accumulates the emoji sequence.
4530         while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
4531             UnicodeString hex = hexMatcher.group(1, status);
4532             if (hex.length() > 8) {
4533                 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
4534                 break;
4535             }
4536             CharString hex8;
4537             hex8.appendInvariantChars(hex, status);
4538             UChar32 c = (UChar32)strtol(hex8.data(), NULL, 16);
4539             if (c<=0x10ffff) {
4540                 testString.append(c);
4541             } else {
4542                 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4543                         __FILE__, __LINE__, lineNumber, hex8.data());
4544                 break;
4545             }
4546         }
4547 
4548         if (testString.length() > 1) {
4549             charBreaks->setText(testString);
4550             charBreaks->first();
4551             int32_t firstBreak = charBreaks->next();
4552             if (testString.length() != firstBreak) {
4553                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4554                         __FILE__, __LINE__, lineNumber, firstBreak);
4555             }
4556             wordBreaks->setText(testString);
4557             wordBreaks->first();
4558             firstBreak = wordBreaks->next();
4559             if (testString.length() != firstBreak) {
4560                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4561                         __FILE__, __LINE__, lineNumber, firstBreak);
4562             }
4563             lineBreaks->setText(testString);
4564             lineBreaks->first();
4565             firstBreak = lineBreaks->next();
4566             if (testString.length() != firstBreak) {
4567                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4568                         __FILE__, __LINE__, lineNumber, firstBreak);
4569             }
4570         }
4571     }
4572 #endif
4573 }
4574 
4575 
4576 // TestBug12519  -  Correct handling of Locales by assignment / copy / clone
4577 
TestBug12519()4578 void RBBITest::TestBug12519() {
4579     UErrorCode status = U_ZERO_ERROR;
4580     LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4581     LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status));
4582     if (!assertSuccess(WHERE, status)) {
4583         dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
4584         return;
4585     }
4586     assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
4587 
4588     assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
4589     assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
4590 
4591     LocalPointer<RuleBasedBreakIterator>cloneEn(biEn->clone());
4592     assertTrue(WHERE, *biEn == *cloneEn);
4593     assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
4594 
4595     LocalPointer<RuleBasedBreakIterator>cloneFr(biFr->clone());
4596     assertTrue(WHERE, *biFr == *cloneFr);
4597     assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
4598 
4599     LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status));
4600     UnicodeString text("Hallo Welt");
4601     biDe->setText(text);
4602     assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
4603     *biDe = *biFr;
4604     assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
4605 }
4606 
TestBug12677()4607 void RBBITest::TestBug12677() {
4608     // Check that stripping of comments from rules for getRules() is not confused by
4609     // the presence of '#' characters in the rules that do not introduce comments.
4610     UnicodeString rules(u"!!forward; \n"
4611                          "$x = [ab#];  # a set with a # literal. \n"
4612                          " # .;        # a comment that looks sort of like a rule.   \n"
4613                          " '#' '?';    # a rule with a quoted #   \n"
4614                        );
4615 
4616     UErrorCode status = U_ZERO_ERROR;
4617     UParseError pe;
4618     RuleBasedBreakIterator bi(rules, pe, status);
4619     assertSuccess(WHERE, status);
4620     UnicodeString rtRules = bi.getRules();
4621     assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "),  rtRules);
4622 }
4623 
4624 
TestTableRedundancies()4625 void RBBITest::TestTableRedundancies() {
4626     UErrorCode status = U_ZERO_ERROR;
4627 
4628     LocalPointer<RuleBasedBreakIterator> bi (
4629         (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
4630     assertSuccess(WHERE, status);
4631     if (U_FAILURE(status)) return;
4632 
4633     RBBIDataWrapper *dw = bi->fData;
4634     const RBBIStateTable *fwtbl = dw->fForwardTable;
4635     int32_t numCharClasses = dw->fHeader->fCatCount;
4636     // printf("Char Classes: %d     states: %d\n", numCharClasses, fwtbl->fNumStates);
4637 
4638     // Check for duplicate columns (character categories)
4639 
4640     std::vector<UnicodeString> columns;
4641     for (int32_t column = 0; column < numCharClasses; column++) {
4642         UnicodeString s;
4643         for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
4644             RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4645             s.append(row->fNextState[column]);
4646         }
4647         columns.push_back(s);
4648     }
4649     // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
4650     for (int c1=1; c1<numCharClasses; c1++) {
4651         for (int c2 = c1+1; c2 < numCharClasses; c2++) {
4652             if (columns.at(c1) == columns.at(c2)) {
4653                 errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
4654                 goto out;
4655             }
4656         }
4657     }
4658   out:
4659 
4660     // Check for duplicate states
4661     std::vector<UnicodeString> rows;
4662     for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
4663         UnicodeString s;
4664         RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4665         assertTrue(WHERE, row->fAccepting >= -1);
4666         s.append(row->fAccepting + 1);   // values of -1 are expected.
4667         s.append(row->fLookAhead);
4668         s.append(row->fTagIdx);
4669         for (int32_t column = 0; column < numCharClasses; column++) {
4670             s.append(row->fNextState[column]);
4671         }
4672         rows.push_back(s);
4673     }
4674     for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
4675         for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
4676             if (rows.at(r1) == rows.at(r2)) {
4677                 errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
4678                 return;
4679             }
4680         }
4681     }
4682 }
4683 
4684 // Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
4685 //            even after next() has returned DONE.
4686 
TestBug13447()4687 void RBBITest::TestBug13447() {
4688     UErrorCode status = U_ZERO_ERROR;
4689     LocalPointer<RuleBasedBreakIterator> bi(
4690         (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4691     assertSuccess(WHERE, status);
4692     if (U_FAILURE(status)) return;
4693     UnicodeString data(u"1234");
4694     bi->setText(data);
4695     assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus());
4696     assertEquals(WHERE, 4, bi->next());
4697     assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4698     assertEquals(WHERE, UBRK_DONE, bi->next());
4699     assertEquals(WHERE, 4, bi->current());
4700     assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4701 }
4702 
4703 //  TestReverse exercises both the synthesized safe reverse rules and the logic
4704 //  for filling the break iterator cache when starting from random positions
4705 //  in the text.
4706 //
4707 //  It's a monkey test, working on random data, with the expected data obtained
4708 //  from forward iteration (no safe rules involved), comparing with results
4709 //  when indexing into the interior of the string (safe rules needed).
4710 
TestReverse()4711 void RBBITest::TestReverse() {
4712     UErrorCode status = U_ZERO_ERROR;
4713 
4714     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4715             BreakIterator::createCharacterInstance(Locale::getEnglish(), status)));
4716     assertSuccess(WHERE, status, true);
4717     status = U_ZERO_ERROR;
4718     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4719             BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4720     assertSuccess(WHERE, status, true);
4721     status = U_ZERO_ERROR;
4722     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4723             BreakIterator::createLineInstance(Locale::getEnglish(), status)));
4724     assertSuccess(WHERE, status, true);
4725     status = U_ZERO_ERROR;
4726     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4727             BreakIterator::createSentenceInstance(Locale::getEnglish(), status)));
4728     assertSuccess(WHERE, status, true);
4729 }
4730 
TestReverse(std::unique_ptr<RuleBasedBreakIterator> bi)4731 void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
4732     if (!bi) {
4733         return;
4734     }
4735 
4736     // From the mapping trie in the break iterator's internal data, create a
4737     // vector of UnicodeStrings, one for each character category, containing
4738     // all of the code points that map to that category. Unicode planes 0 and 1 only,
4739     // to avoid an execess of unassigned code points.
4740 
4741     RBBIDataWrapper *data = bi->fData;
4742     int32_t categoryCount = data->fHeader->fCatCount;
4743     UTrie2  *trie = data->fTrie;
4744 
4745     std::vector<UnicodeString> strings(categoryCount, UnicodeString());
4746     for (int cp=0; cp<0x1fff0; ++cp) {
4747         int cat = utrie2_get32(trie, cp);
4748         cat &= ~0x4000;    // And off the dictionary bit from the category.
4749         assertTrue(WHERE, cat < categoryCount && cat >= 0);
4750         if (cat < 0 || cat >= categoryCount) return;
4751         strings[cat].append(cp);
4752     }
4753 
4754     icu_rand randomGen;
4755     const int testStringLength = 10000;
4756     UnicodeString testString;
4757 
4758     for (int i=0; i<testStringLength; ++i) {
4759         int charClass = randomGen() % categoryCount;
4760         if (strings[charClass].length() > 0) {
4761             int cp = strings[charClass].char32At(randomGen() % strings[charClass].length());
4762             testString.append(cp);
4763         }
4764     }
4765 
4766     typedef std::pair<UBool, int32_t> Result;
4767     std::vector<Result> expectedResults;
4768     bi->setText(testString);
4769     for (int i=0; i<testString.length(); ++i) {
4770         bool isboundary = bi->isBoundary(i);
4771         int  ruleStatus = bi->getRuleStatus();
4772         expectedResults.push_back(std::make_pair(isboundary, ruleStatus));
4773     }
4774 
4775     for (int i=testString.length()-1; i>=0; --i) {
4776         bi->setText(testString);   // clears the internal break cache
4777         Result expected = expectedResults[i];
4778         assertEquals(WHERE, expected.first, bi->isBoundary(i));
4779         assertEquals(WHERE, expected.second, bi->getRuleStatus());
4780     }
4781 }
4782 
4783 
// Ticket 13692 - finding word boundaries in very large numbers or words could
//                be very time consuming. When the problem was present, this test
//                would run for more than fifteen minutes, which is to say, the failure was noticeable.
4787 
TestBug13692()4788 void RBBITest::TestBug13692() {
4789     UErrorCode status = U_ZERO_ERROR;
4790     LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4791             BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4792     if (!assertSuccess(WHERE, status, true)) {
4793         return;
4794     }
4795     constexpr int32_t LENGTH = 1000000;
4796     UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH);
4797     for (int i=0; i<20; i+=2) {
4798         longNumber.setCharAt(i, u' ');
4799     }
4800     bi->setText(longNumber);
4801     assertFalse(WHERE, bi->isBoundary(LENGTH-5));
4802     assertSuccess(WHERE, status);
4803 }
4804 
4805 
TestProperties()4806 void RBBITest::TestProperties() {
4807     UErrorCode errorCode = U_ZERO_ERROR;
4808     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4809     if (!prependSet.isEmpty()) {
4810         errln(
4811             "[:GCB=Prepend:] is not empty any more. "
4812             "Uncomment relevant lines in source/data/brkitr/char.txt and "
4813             "change this test to the opposite condition.");
4814     }
4815 }
4816 
4817 
4818 //
4819 //  TestDebug    -  A place-holder test for debugging purposes.
4820 //                  For putting in fragments of other tests that can be invoked
4821 //                  for tracing  without a lot of unwanted extra stuff happening.
4822 //
TestDebug(void)4823 void RBBITest::TestDebug(void) {
4824     UErrorCode status = U_ZERO_ERROR;
4825     LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4826             BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4827     if (!assertSuccess(WHERE, status, true)) {
4828         return;
4829     }
4830     const UnicodeString &rules = bi->getRules();
4831     UParseError pe;
4832     LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
4833     assertSuccess(WHERE, status);
4834 }
4835 
4836 
4837 //
4838 //  TestDebugRules   A stub test for use in debugging rule compilation problems.
4839 //                   Can be freely altered as needed or convenient.
//                   Leave disabled - #ifdef'ed out - when not actively debugging. The rule source
//                   data files may not be available in all environments.
4842 //                   Any permanent test cases should be moved to rbbitst.txt
4843 //                   (see Bug 20303 in that file, for example), or to another test function in this file.
4844 //
TestDebugRules()4845 void RBBITest::TestDebugRules() {
4846 #if 0
4847     const char16_t *rules = u""
4848         "!!quoted_literals_only; \n"
4849         "!!chain; \n"
4850         "!!lookAheadHardBreak; \n"
4851         " \n"
4852         // "[a] / ; \n"
4853         "[a] [b] / [c] [d]; \n"
4854         "[a] [b] / [c] [d] {100}; \n"
4855         "[x] [a] [b] / [c] [d] {100}; \n"
4856         "[a] [b] [c] / [d] {100}; \n"
4857         //" [c] [d] / [e] [f]; \n"
4858         //"[a] [b] / [c]; \n"
4859         ;
4860 
4861     UErrorCode status = U_ZERO_ERROR;
4862     CharString path(pathToDataDirectory(), status);
4863     path.appendPathPart("brkitr", status);
4864     path.appendPathPart("rules", status);
4865     path.appendPathPart("line.txt", status);
4866     int    len;
4867     std::unique_ptr<UChar []> testFile(ReadAndConvertFile(path.data(), len, "UTF-8", status));
4868     if (!assertSuccess(WHERE, status)) {
4869         return;
4870     }
4871 
4872     UParseError pe;
4873     // rules = testFile.get();
4874     RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rules, pe, status);
4875 
4876     if (!assertSuccess(WHERE, status)) {
4877         delete bi;
4878         return;
4879     }
4880     // bi->dumpTables();
4881 
4882     delete bi;
4883 #endif
4884 }
4885 
4886 #if U_ENABLE_TRACING
// Records filled in by the trace callbacks below. Only calls whose function
// number lies in the UTRACE_UBRK_START..UTRACE_UBRK_LIMIT range are recorded.
static std::vector<int32_t> gEntryFn;     // function numbers seen by traceEntry
static std::vector<int32_t> gExitFn;      // function numbers seen by traceExit
static std::vector<int32_t> gDataFn;      // function numbers seen by traceData
static std::vector<std::string> gData;    // first string argument of each traceData call
4891 
traceData(const void *,int32_t fnNumber,int32_t,const char *,va_list args)4892 static void U_CALLCONV traceData(
4893         const void*,
4894         int32_t fnNumber,
4895         int32_t,
4896         const char *,
4897         va_list args) {
4898     if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
4899         const char* data = va_arg(args, const char*);
4900         gDataFn.push_back(fnNumber);
4901         gData.push_back(data);
4902     }
4903 }
4904 
traceEntry(const void *,int32_t fnNumber)4905 static void traceEntry(const void *, int32_t fnNumber) {
4906     if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
4907         gEntryFn.push_back(fnNumber);
4908     }
4909 }
4910 
traceExit(const void *,int32_t fnNumber,const char *,va_list)4911 static void traceExit(const void *, int32_t fnNumber, const char *, va_list) {
4912     if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
4913         gExitFn.push_back(fnNumber);
4914     }
4915 }
4916 
4917 
assertTestTraceResult(int32_t fnNumber,const char * expectedData)4918 void RBBITest::assertTestTraceResult(int32_t fnNumber, const char* expectedData) {
4919     assertEquals("utrace_entry should be called ", 1, gEntryFn.size());
4920     assertEquals("utrace_entry should be called with ", fnNumber, gEntryFn[0]);
4921     assertEquals("utrace_exit should be called ", 1, gExitFn.size());
4922     assertEquals("utrace_exit should be called with ", fnNumber, gExitFn[0]);
4923 
4924     if (expectedData == nullptr) {
4925       assertEquals("utrace_data should not be called ", 0, gDataFn.size());
4926       assertEquals("utrace_data should not be called ", 0, gData.size());
4927     } else {
4928       assertEquals("utrace_data should be called ", 1, gDataFn.size());
4929       assertEquals("utrace_data should be called with ", fnNumber, gDataFn[0]);
4930       assertEquals("utrace_data should be called ", 1, gData.size());
4931       assertEquals("utrace_data should pass in ", expectedData, gData[0].c_str());
4932     }
4933 }
4934 
SetupTestTrace()4935 void SetupTestTrace() {
4936     gEntryFn.clear();
4937     gExitFn.clear();
4938     gDataFn.clear();
4939     gData.clear();
4940 
4941     const void* context = nullptr;
4942     utrace_setFunctions(context, traceEntry, traceExit, traceData);
4943     utrace_setLevel(UTRACE_INFO);
4944 }
4945 
TestTraceCreateCharacter(void)4946 void RBBITest::TestTraceCreateCharacter(void) {
4947     SetupTestTrace();
4948     IcuTestErrorCode status(*this, "TestTraceCreateCharacter");
4949     LocalPointer<BreakIterator> brkitr(
4950         BreakIterator::createCharacterInstance("zh-CN", status));
4951     status.errIfFailureAndReset();
4952     assertTestTraceResult(UTRACE_UBRK_CREATE_CHARACTER, nullptr);
4953 }
4954 
TestTraceCreateTitle(void)4955 void RBBITest::TestTraceCreateTitle(void) {
4956     SetupTestTrace();
4957     IcuTestErrorCode status(*this, "TestTraceCreateTitle");
4958     LocalPointer<BreakIterator> brkitr(
4959         BreakIterator::createTitleInstance("zh-CN", status));
4960     status.errIfFailureAndReset();
4961     assertTestTraceResult(UTRACE_UBRK_CREATE_TITLE, nullptr);
4962 }
4963 
TestTraceCreateSentence(void)4964 void RBBITest::TestTraceCreateSentence(void) {
4965     SetupTestTrace();
4966     IcuTestErrorCode status(*this, "TestTraceCreateSentence");
4967     LocalPointer<BreakIterator> brkitr(
4968         BreakIterator::createSentenceInstance("zh-CN", status));
4969     status.errIfFailureAndReset();
4970     assertTestTraceResult(UTRACE_UBRK_CREATE_SENTENCE, nullptr);
4971 }
4972 
TestTraceCreateWord(void)4973 void RBBITest::TestTraceCreateWord(void) {
4974     SetupTestTrace();
4975     IcuTestErrorCode status(*this, "TestTraceCreateWord");
4976     LocalPointer<BreakIterator> brkitr(
4977         BreakIterator::createWordInstance("zh-CN", status));
4978     status.errIfFailureAndReset();
4979     assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
4980 }
4981 
TestTraceCreateLine(void)4982 void RBBITest::TestTraceCreateLine(void) {
4983     SetupTestTrace();
4984     IcuTestErrorCode status(*this, "TestTraceCreateLine");
4985     LocalPointer<BreakIterator> brkitr(
4986         BreakIterator::createLineInstance("zh-CN", status));
4987     status.errIfFailureAndReset();
4988     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "");
4989 }
4990 
TestTraceCreateLineStrict(void)4991 void RBBITest::TestTraceCreateLineStrict(void) {
4992     SetupTestTrace();
4993     IcuTestErrorCode status(*this, "TestTraceCreateLineStrict");
4994     LocalPointer<BreakIterator> brkitr(
4995         BreakIterator::createLineInstance("zh-CN-u-lb-strict", status));
4996     status.errIfFailureAndReset();
4997     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "strict");
4998 }
4999 
TestTraceCreateLineNormal(void)5000 void RBBITest::TestTraceCreateLineNormal(void) {
5001     SetupTestTrace();
5002     IcuTestErrorCode status(*this, "TestTraceCreateLineNormal");
5003     LocalPointer<BreakIterator> brkitr(
5004         BreakIterator::createLineInstance("zh-CN-u-lb-normal", status));
5005     status.errIfFailureAndReset();
5006     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "normal");
5007 }
5008 
TestTraceCreateLineLoose(void)5009 void RBBITest::TestTraceCreateLineLoose(void) {
5010     SetupTestTrace();
5011     IcuTestErrorCode status(*this, "TestTraceCreateLineLoose");
5012     LocalPointer<BreakIterator> brkitr(
5013         BreakIterator::createLineInstance("zh-CN-u-lb-loose", status));
5014     status.errIfFailureAndReset();
5015     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "loose");
5016 }
5017 
TestTraceCreateBreakEngine(void)5018 void RBBITest::TestTraceCreateBreakEngine(void) {
5019     rbbi_cleanup();
5020     SetupTestTrace();
5021     IcuTestErrorCode status(*this, "TestTraceCreateBreakEngine");
5022     LocalPointer<BreakIterator> brkitr(
5023         BreakIterator::createWordInstance("zh-CN", status));
5024     status.errIfFailureAndReset();
5025     assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5026 
5027     // To word break the following text, BreakIterator will create 5 dictionary
5028     // break engine internally.
5029     brkitr->setText(
5030         u"test "
5031         u"測試 " // Hani
5032         u"សាកល្បង " // Khmr
5033         u"ທົດສອບ " // Laoo
5034         u"စမ်းသပ်မှု " // Mymr
5035         u"ทดสอบ " // Thai
5036         u"test "
5037     );
5038 
5039     // Loop through all the text.
5040     while (brkitr->next() > 0) ;
5041 
5042     assertEquals("utrace_entry should be called ", 6, gEntryFn.size());
5043     assertEquals("utrace_exit should be called ", 6, gExitFn.size());
5044     assertEquals("utrace_data should be called ", 5, gDataFn.size());
5045 
5046     for (std::vector<int>::size_type i = 0; i < gDataFn.size(); i++) {
5047         assertEquals("utrace_entry should be called ",
5048                      UTRACE_UBRK_CREATE_BREAK_ENGINE, gEntryFn[i+1]);
5049         assertEquals("utrace_exit should be called ",
5050                      UTRACE_UBRK_CREATE_BREAK_ENGINE, gExitFn[i+1]);
5051         assertEquals("utrace_data should be called ",
5052                      UTRACE_UBRK_CREATE_BREAK_ENGINE, gDataFn[i]);
5053     }
5054 
5055     assertEquals("utrace_data should pass ", "Hani", gData[0].c_str());
5056     assertEquals("utrace_data should pass ", "Khmr", gData[1].c_str());
5057     assertEquals("utrace_data should pass ", "Laoo", gData[2].c_str());
5058     assertEquals("utrace_data should pass ", "Mymr", gData[3].c_str());
5059     assertEquals("utrace_data should pass ", "Thai", gData[4].c_str());
5060 
5061 }
5062 #endif
5063 
5064 #endif // #if !UCONFIG_NO_BREAK_ITERATION
5065