• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4  * COPYRIGHT:
5  * Copyright (c) 1999-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  ********************************************************************/
8 /************************************************************************
9 *   Date        Name        Description
10 *   12/15/99    Madhu        Creation.
11 *   01/12/2000  Madhu        Updated for changed API and added new tests
12 ************************************************************************/
13 
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_BREAK_ITERATION
16 
17 #include <stdio.h>
18 #include <stdlib.h>
19 #include <string.h>
20 #include <utility>
21 #include <vector>
22 
23 #include "unicode/brkiter.h"
24 #include "unicode/localpointer.h"
25 #include "unicode/numfmt.h"
26 #include "unicode/rbbi.h"
27 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
28 #include "unicode/regex.h"
29 #endif
30 #include "unicode/schriter.h"
31 #include "unicode/uchar.h"
32 #include "unicode/utf16.h"
33 #include "unicode/ucnv.h"
34 #include "unicode/uniset.h"
35 #include "unicode/uscript.h"
36 #include "unicode/ustring.h"
37 #include "unicode/utext.h"
38 
39 #include "charstr.h"
40 #include "cmemory.h"
41 #include "cstr.h"
42 #include "intltest.h"
43 #include "rbbitst.h"
44 #include "rbbidata.h"
45 #include "utypeinfo.h"  // for 'typeid' to work
46 #include "uvector.h"
47 #include "uvectr32.h"
48 
49 
50 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
51 #include "unicode/filteredbrk.h"
52 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
53 
// Log a failure tagged with the current source file and line when condition x is false.
#define TEST_ASSERT(x) { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
}

// Log a failure tagged with file, line and the error's name when errcode is a failing UErrorCode.
#define TEST_ASSERT_SUCCESS(errcode) { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", \
                   __FILE__, __LINE__, u_errorName(errcode)); \
    } \
}
59 
60 //---------------------------------------------
61 // runIndexedTest
62 //---------------------------------------------
63 
64 
//  Note:  Before adding new tests to this file, check whether the desired test data can
//         simply be added to the file testdata/rbbitst.txt.  In most cases it can;
//         it's much less work than writing a new test, diagnostic output in the event of failures
//         is good, and the test data file is shared with ICU4J, so eventually the test
//         will run there as well, without additional effort.
70 
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)71 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
72 {
73     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
74     fTestParams = params;
75 
76     TESTCASE_AUTO_BEGIN;
77 #if !UCONFIG_NO_FILE_IO
78     TESTCASE_AUTO(TestBug4153072);
79 #endif
80 #if !UCONFIG_NO_FILE_IO
81     TESTCASE_AUTO(TestUnicodeFiles);
82 #endif
83     TESTCASE_AUTO(TestGetAvailableLocales);
84     TESTCASE_AUTO(TestGetDisplayName);
85 #if !UCONFIG_NO_FILE_IO
86     TESTCASE_AUTO(TestEndBehaviour);
87     TESTCASE_AUTO(TestWordBreaks);
88     TESTCASE_AUTO(TestWordBoundary);
89     TESTCASE_AUTO(TestLineBreaks);
90     TESTCASE_AUTO(TestSentBreaks);
91     TESTCASE_AUTO(TestExtended);
92 #endif
93 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
94     TESTCASE_AUTO(TestMonkey);
95 #endif
96 #if !UCONFIG_NO_FILE_IO
97     TESTCASE_AUTO(TestBug3818);
98 #endif
99     TESTCASE_AUTO(TestDebug);
100 #if !UCONFIG_NO_FILE_IO
101     TESTCASE_AUTO(TestBug5775);
102 #endif
103     TESTCASE_AUTO(TestBug9983);
104     TESTCASE_AUTO(TestDictRules);
105     TESTCASE_AUTO(TestBug5532);
106     TESTCASE_AUTO(TestBug7547);
107     TESTCASE_AUTO(TestBug12797);
108     TESTCASE_AUTO(TestBug12918);
109     TESTCASE_AUTO(TestBug12932);
110     TESTCASE_AUTO(TestEmoji);
111     TESTCASE_AUTO(TestBug12519);
112     TESTCASE_AUTO(TestBug12677);
113     TESTCASE_AUTO(TestTableRedundancies);
114     TESTCASE_AUTO(TestBug13447);
115     TESTCASE_AUTO(TestReverse);
116     TESTCASE_AUTO(TestBug13692);
117     TESTCASE_AUTO_END;
118 }
119 
120 
121 //--------------------------------------------------------------------------------------
122 //
123 //    RBBITest    constructor and destructor
124 //
125 //--------------------------------------------------------------------------------------
126 
RBBITest()127 RBBITest::RBBITest() {
128     fTestParams = NULL;
129 }
130 
131 
~RBBITest()132 RBBITest::~RBBITest() {
133 }
134 
135 
printStringBreaks(UText * tstr,int expected[],int expectedCount)136 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
137     UErrorCode status = U_ZERO_ERROR;
138     char name[100];
139     printf("code    alpha extend alphanum type word sent line name\n");
140     int nextExpectedIndex = 0;
141     utext_setNativeIndex(tstr, 0);
142     for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
143         if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
144             printf("------------------------------------------------ %d\n", j);
145             ++nextExpectedIndex;
146         }
147 
148         UChar32 c = utext_next32(tstr);
149         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
150         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
151                            u_isUAlphabetic(c),
152                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
153                            u_isalnum(c),
154                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
155                                                   u_charType(c),
156                                                   U_SHORT_PROPERTY_NAME),
157                            u_getPropertyValueName(UCHAR_WORD_BREAK,
158                                                   u_getIntPropertyValue(c,
159                                                           UCHAR_WORD_BREAK),
160                                                   U_SHORT_PROPERTY_NAME),
161                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
162                                    u_getIntPropertyValue(c,
163                                            UCHAR_SENTENCE_BREAK),
164                                    U_SHORT_PROPERTY_NAME),
165                            u_getPropertyValueName(UCHAR_LINE_BREAK,
166                                    u_getIntPropertyValue(c,
167                                            UCHAR_LINE_BREAK),
168                                    U_SHORT_PROPERTY_NAME),
169                            name);
170     }
171 }
172 
173 
printStringBreaks(const UnicodeString & ustr,int expected[],int expectedCount)174 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
175    UErrorCode status = U_ZERO_ERROR;
176    UText *tstr = NULL;
177    tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
178    if (U_FAILURE(status)) {
179        printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
180        return;
181     }
182    printStringBreaks(tstr, expected, expectedCount);
183    utext_close(tstr);
184 }
185 
186 
TestBug3818()187 void RBBITest::TestBug3818() {
188     UErrorCode  status = U_ZERO_ERROR;
189 
190     // Four Thai words...
191     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
192                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
193     UnicodeString  thaiStr(thaiWordData);
194 
195     BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
196     if (U_FAILURE(status) || bi == NULL) {
197         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
198         return;
199     }
200     bi->setText(thaiStr);
201 
202     int32_t  startOfSecondWord = bi->following(1);
203     if (startOfSecondWord != 4) {
204         errln("Fail at file %s, line %d expected start of word at 4, got %d",
205             __FILE__, __LINE__, startOfSecondWord);
206     }
207     startOfSecondWord = bi->following(0);
208     if (startOfSecondWord != 4) {
209         errln("Fail at file %s, line %d expected start of word at 4, got %d",
210             __FILE__, __LINE__, startOfSecondWord);
211     }
212     delete bi;
213 }
214 
215 
216 //---------------------------------------------
217 //
218 //     other tests
219 //
220 //---------------------------------------------
221 
TestGetAvailableLocales()222 void RBBITest::TestGetAvailableLocales()
223 {
224     int32_t locCount = 0;
225     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
226 
227     if (locCount == 0)
228         dataerrln("getAvailableLocales() returned an empty list!");
229     // Just make sure that it's returning good memory.
230     int32_t i;
231     for (i = 0; i < locCount; ++i) {
232         logln(locList[i].getName());
233     }
234 }
235 
236 //Testing the BreakIterator::getDisplayName() function
TestGetDisplayName()237 void RBBITest::TestGetDisplayName()
238 {
239     UnicodeString   result;
240 
241     BreakIterator::getDisplayName(Locale::getUS(), result);
242     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
243         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
244                 + result);
245 
246     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
247     if (result != "French (France)")
248         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
249                 + result);
250 }
251 /**
252  * Test End Behaviour
253  * @bug 4068137
254  */
TestEndBehaviour()255 void RBBITest::TestEndBehaviour()
256 {
257     UErrorCode status = U_ZERO_ERROR;
258     UnicodeString testString("boo.");
259     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
260     if (U_FAILURE(status))
261     {
262         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
263         return;
264     }
265     wb->setText(testString);
266 
267     if (wb->first() != 0)
268         errln("Didn't get break at beginning of string.");
269     if (wb->next() != 3)
270         errln("Didn't get break before period in \"boo.\"");
271     if (wb->current() != 4 && wb->next() != 4)
272         errln("Didn't get break at end of string.");
273     delete wb;
274 }
275 /*
276  * @bug 4153072
277  */
TestBug4153072()278 void RBBITest::TestBug4153072() {
279     UErrorCode status = U_ZERO_ERROR;
280     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
281     if (U_FAILURE(status))
282     {
283         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
284         return;
285     }
286     UnicodeString str("...Hello, World!...");
287     int32_t begin = 3;
288     int32_t end = str.length() - 3;
289     UBool onBoundary;
290 
291     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
292     iter->adoptText(textIterator);
293     int index;
294     // Note: with the switch to UText, there is no way to restrict the
295     //       iteration range to begin at an index other than zero.
296     //       String character iterators created with a non-zero bound are
297     //         treated by RBBI as being empty.
298     for (index = -1; index < begin + 1; ++index) {
299         onBoundary = iter->isBoundary(index);
300         if (index == 0?  !onBoundary : onBoundary) {
301             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
302                             " and begin index = " + begin);
303         }
304     }
305     delete iter;
306 }
307 
308 
309 //
310 // Test for problem reported by Ashok Matoria on 9 July 2007
311 //    One.<kSoftHyphen><kSpace>Two.
312 //
313 //    Sentence break at start (0) and then on calling next() it breaks at
314 //   'T' of "Two". Now, at this point if I do next() and
315 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
316 //
TestBug5775()317 void RBBITest::TestBug5775() {
318     UErrorCode status = U_ZERO_ERROR;
319     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
320     TEST_ASSERT_SUCCESS(status);
321     if (U_FAILURE(status)) {
322         return;
323     }
324 // Check for status first for better handling of no data errors.
325     TEST_ASSERT(bi != NULL);
326     if (bi == NULL) {
327         return;
328     }
329 
330     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
331     //               01234      56789
332     s = s.unescape();
333     bi->setText(s);
334     int pos = bi->next();
335     TEST_ASSERT(pos == 6);
336     pos = bi->next();
337     TEST_ASSERT(pos == 10);
338     pos = bi->previous();
339     TEST_ASSERT(pos == 6);
340     delete bi;
341 }
342 
343 
344 
345 //------------------------------------------------------------------------------
346 //
347 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
348 //
349 //------------------------------------------------------------------------------
350 
351 struct TestParams {
352     BreakIterator   *bi;                   // Break iterator is set while parsing test source.
353                                            //   Changed out whenever test data changes break type.
354 
355     UnicodeString    dataToBreak;          // Data that is built up while parsing the test.
356     UVector32       *expectedBreaks;       // Expected break positions, matches dataToBreak UnicodeString.
357     UVector32       *srcLine;              // Positions in source file, indexed same as dataToBreak.
358     UVector32       *srcCol;
359 
360     UText           *textToBreak;          // UText, could be UTF8 or UTF16.
361     UVector32       *textMap;              // Map from UTF-16 dataToBreak offsets to UText offsets.
362     CharString       utf8String;           // UTF-8 form of text to break.
363 
TestParamsTestParams364     TestParams(UErrorCode &status) : dataToBreak() {
365         bi               = NULL;
366         expectedBreaks   = new UVector32(status);
367         srcLine          = new UVector32(status);
368         srcCol           = new UVector32(status);
369         textToBreak      = NULL;
370         textMap          = new UVector32(status);
371     }
372 
~TestParamsTestParams373     ~TestParams() {
374         delete bi;
375         delete expectedBreaks;
376         delete srcLine;
377         delete srcCol;
378         utext_close(textToBreak);
379         delete textMap;
380     }
381 
382     int32_t getSrcLine(int32_t bp);
383     int32_t getExpectedBreak(int32_t bp);
384     int32_t getSrcCol(int32_t bp);
385 
386     void setUTF16(UErrorCode &status);
387     void setUTF8(UErrorCode &status);
388 };
389 
390 // Append a UnicodeString to a CharString with UTF-8 encoding.
391 // Substitute any invalid chars.
392 //   Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
CharStringAppend(CharString & dest,const UnicodeString & src,UErrorCode & status)393 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
394     if (U_FAILURE(status)) {
395         return;
396     }
397     int32_t utf8Length;
398     u_strToUTF8WithSub(NULL, 0, &utf8Length,            // Output Buffer, NULL for preflight.
399                        src.getBuffer(), src.length(),   // UTF-16 data
400                        0xfffd, NULL,                    // Substitution char, number of subs.
401                        &status);
402     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
403         return;
404     }
405     status = U_ZERO_ERROR;
406     int32_t capacity;
407     char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
408     u_strToUTF8WithSub(buffer, utf8Length, NULL,
409                        src.getBuffer(), src.length(),
410                        0xfffd, NULL, &status);
411     dest.append(buffer, utf8Length, status);
412 }
413 
414 
setUTF16(UErrorCode & status)415 void TestParams::setUTF16(UErrorCode &status) {
416     textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
417     textMap->removeAllElements();
418     for (int32_t i=0; i<dataToBreak.length(); i++) {
419         if (i == dataToBreak.getChar32Start(i)) {
420             textMap->addElement(i, status);
421         } else {
422             textMap->addElement(-1, status);
423         }
424     }
425     textMap->addElement(dataToBreak.length(), status);
426     U_ASSERT(dataToBreak.length() + 1 == textMap->size());
427 }
428 
429 
setUTF8(UErrorCode & status)430 void TestParams::setUTF8(UErrorCode &status) {
431     if (U_FAILURE(status)) {
432         return;
433     }
434     utf8String.clear();
435     CharStringAppend(utf8String, dataToBreak, status);
436     textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
437     if (U_FAILURE(status)) {
438         return;
439     }
440 
441     textMap->removeAllElements();
442     int32_t utf16Index = 0;
443     for (;;) {
444         textMap->addElement(utf16Index, status);
445         UChar32 c32 = utext_current32(textToBreak);
446         if (c32 < 0) {
447             break;
448         }
449         utf16Index += U16_LENGTH(c32);
450         utext_next32(textToBreak);
451         while (textMap->size() < utext_getNativeIndex(textToBreak)) {
452             textMap->addElement(-1, status);
453         }
454     }
455     U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
456 }
457 
458 
getSrcLine(int32_t bp)459 int32_t TestParams::getSrcLine(int32_t bp) {
460     if (bp >= textMap->size()) {
461         bp = textMap->size() - 1;
462     }
463     int32_t i = 0;
464     for(; bp >= 0 ; --bp) {
465         // Move to a character boundary if we are not on one already.
466         i = textMap->elementAti(bp);
467         if (i >= 0) {
468             break;
469         }
470     }
471     return srcLine->elementAti(i);
472 }
473 
474 
getExpectedBreak(int32_t bp)475 int32_t TestParams::getExpectedBreak(int32_t bp) {
476     if (bp >= textMap->size()) {
477         return 0;
478     }
479     int32_t i = textMap->elementAti(bp);
480     int32_t retVal = 0;
481     if (i >= 0) {
482         retVal = expectedBreaks->elementAti(i);
483     }
484     return retVal;
485 }
486 
487 
getSrcCol(int32_t bp)488 int32_t TestParams::getSrcCol(int32_t bp) {
489     if (bp >= textMap->size()) {
490         bp = textMap->size() - 1;
491     }
492     int32_t i = 0;
493     for(; bp >= 0; --bp) {
494         // Move bp to a character boundary if we are not on one already.
495         i = textMap->elementAti(bp);
496         if (i >= 0) {
497             break;
498         }
499     }
500     return srcCol->elementAti(i);
501 }
502 
503 
executeTest(TestParams * t,UErrorCode & status)504 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
505     int32_t    bp;
506     int32_t    prevBP;
507     int32_t    i;
508 
509     TEST_ASSERT_SUCCESS(status);
510     if (U_FAILURE(status)) {
511         return;
512     }
513 
514     if (t->bi == NULL) {
515         return;
516     }
517 
518     t->bi->setText(t->textToBreak, status);
519     //
520     //  Run the iterator forward
521     //
522     prevBP = -1;
523     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
524         if (prevBP ==  bp) {
525             // Fail for lack of forward progress.
526             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
527                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
528             break;
529         }
530 
531         // Check that there we didn't miss an expected break between the last one
532         //  and this one.
533         for (i=prevBP+1; i<bp; i++) {
534             if (t->getExpectedBreak(i) != 0) {
535                 int expected[] = {0, i};
536                 printStringBreaks(t->dataToBreak, expected, 2);
537                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
538                       i, t->getSrcLine(i), t->getSrcCol(i));
539             }
540         }
541 
542         // Check that the break we did find was expected
543         if (t->getExpectedBreak(bp) == 0) {
544             int expected[] = {0, bp};
545             printStringBreaks(t->textToBreak, expected, 2);
546             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
547                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
548         } else {
549             // The break was expected.
550             //   Check that the {nnn} tag value is correct.
551             int32_t expectedTagVal = t->getExpectedBreak(bp);
552             if (expectedTagVal == -1) {
553                 expectedTagVal = 0;
554             }
555             int32_t line = t->getSrcLine(bp);
556             int32_t rs = t->bi->getRuleStatus();
557             if (rs != expectedTagVal) {
558                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
559                       "          Actual, Expected status = %4d, %4d",
560                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
561             }
562         }
563 
564         prevBP = bp;
565     }
566 
567     // Verify that there were no missed expected breaks after the last one found
568     for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
569         if (t->getExpectedBreak(i) != 0) {
570             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
571                       i, t->getSrcLine(i), t->getSrcCol(i));
572         }
573     }
574 
575     //
576     //  Run the iterator backwards, verify that the same breaks are found.
577     //
578     prevBP = utext_nativeLength(t->textToBreak)+2;  // start with a phony value for the last break pos seen.
579     bp = t->bi->last();
580     while (bp != BreakIterator::DONE) {
581         if (prevBP ==  bp) {
582             // Fail for lack of progress.
583             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
584                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
585             break;
586         }
587 
588         // Check that we didn't miss an expected break between the last one
589         //  and this one.  (UVector returns zeros for index out of bounds.)
590         for (i=prevBP-1; i>bp; i--) {
591             if (t->getExpectedBreak(i) != 0) {
592                 errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
593                       i, t->getSrcLine(i), t->getSrcCol(i));
594             }
595         }
596 
597         // Check that the break we did find was expected
598         if (t->getExpectedBreak(bp) == 0) {
599             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
600                    bp, t->getSrcLine(bp), t->getSrcCol(bp));
601         } else {
602             // The break was expected.
603             //   Check that the {nnn} tag value is correct.
604             int32_t expectedTagVal = t->getExpectedBreak(bp);
605             if (expectedTagVal == -1) {
606                 expectedTagVal = 0;
607             }
608             int line = t->getSrcLine(bp);
609             int32_t rs = t->bi->getRuleStatus();
610             if (rs != expectedTagVal) {
611                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
612                       "          Actual, Expected status = %4d, %4d",
613                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
614             }
615         }
616 
617         prevBP = bp;
618         bp = t->bi->previous();
619     }
620 
621     // Verify that there were no missed breaks prior to the last one found
622     for (i=prevBP-1; i>=0; i--) {
623         if (t->getExpectedBreak(i) != 0) {
624             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
625                       i, t->getSrcLine(i), t->getSrcCol(i));
626         }
627     }
628 
629     // Check isBoundary()
630     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
631         UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
632         UBool boundaryFound    = t->bi->isBoundary(i);
633         if (boundaryExpected != boundaryFound) {
634             errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
635                   "        Expected, Actual= %s, %s",
636                   i, t->getSrcLine(i), t->getSrcCol(i),
637                   boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
638         }
639     }
640 
641     // Check following()
642     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
643         int32_t actualBreak = t->bi->following(i);
644         int32_t expectedBreak = BreakIterator::DONE;
645         for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
646             if (t->getExpectedBreak(j) != 0) {
647                 expectedBreak = j;
648                 break;
649             }
650         }
651         if (expectedBreak != actualBreak) {
652             errln("following(%d) incorrect. File line,col= %4d,%4d\n"
653                   "        Expected, Actual= %d, %d",
654                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
655         }
656     }
657 
658     // Check preceding()
659     for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
660         int32_t actualBreak = t->bi->preceding(i);
661         int32_t expectedBreak = BreakIterator::DONE;
662 
663         // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
664         // preceding(trailing byte) will return the index of some preceding code point,
665         // not the lead byte of the current code point, even though that has a smaller index.
666         // Therefore, start looking at the expected break data not at i-1, but at
667         // the start of code point index - 1.
668         utext_setNativeIndex(t->textToBreak, i);
669         int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
670         for (; j >= 0; j--) {
671             if (t->getExpectedBreak(j) != 0) {
672                 expectedBreak = j;
673                 break;
674             }
675         }
676         if (expectedBreak != actualBreak) {
677             errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
678                   "        Expected, Actual= %d, %d",
679                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
680         }
681     }
682 }
683 
684 
TestExtended()685 void RBBITest::TestExtended() {
686   // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
687   // data driven test closely entangles filtered and regular data.
688 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
689     UErrorCode      status  = U_ZERO_ERROR;
690     Locale          locale("");
691 
692     TestParams          tp(status);
693 
694     RegexMatcher      localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
695     if (U_FAILURE(status)) {
696         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
697     }
698 
699     //
700     //  Open and read the test data file.
701     //
702     const char *testDataDirectory = IntlTest::getSourceTestData(status);
703     CharString testFileName(testDataDirectory, -1, status);
704     testFileName.append("rbbitst.txt", -1, status);
705 
706     int    len;
707     UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
708     if (U_FAILURE(status)) {
709         errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
710         return;
711     }
712 
713     bool skipTest = false; // Skip this test?
714 
715     //
716     //  Put the test data into a UnicodeString
717     //
718     UnicodeString testString(FALSE, testFile, len);
719 
720     enum EParseState{
721         PARSE_COMMENT,
722         PARSE_TAG,
723         PARSE_DATA,
724         PARSE_NUM,
725         PARSE_RULES
726     }
727     parseState = PARSE_TAG;
728 
729     EParseState savedState = PARSE_TAG;
730 
731     int32_t    lineNum  = 1;
732     int32_t    colStart = 0;
733     int32_t    column   = 0;
734     int32_t    charIdx  = 0;
735 
736     int32_t    tagValue = 0;             // The numeric value of a <nnn> tag.
737 
738     UnicodeString       rules;           // Holds rules from a <rules> ... </rules> block
739     int32_t             rulesFirstLine;  // Line number of the start of current <rules> block
740 
741     for (charIdx = 0; charIdx < len; ) {
742         status = U_ZERO_ERROR;
743         UChar  c = testString.charAt(charIdx);
744         charIdx++;
745         if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
746             // treat CRLF as a unit
747             c = u'\n';
748             charIdx++;
749         }
750         if (c == u'\n' || c == u'\r') {
751             lineNum++;
752             colStart = charIdx;
753         }
754         column = charIdx - colStart + 1;
755 
756         switch (parseState) {
757         case PARSE_COMMENT:
758             if (c == u'\n' || c == u'\r') {
759                 parseState = savedState;
760             }
761             break;
762 
763         case PARSE_TAG:
764             {
765             if (c == u'#') {
766                 parseState = PARSE_COMMENT;
767                 savedState = PARSE_TAG;
768                 break;
769             }
770             if (u_isUWhiteSpace(c)) {
771                 break;
772             }
773             if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
774                 delete tp.bi;
775                 tp.bi = BreakIterator::createWordInstance(locale,  status);
776                 skipTest = false;
777                 charIdx += 5;
778                 break;
779             }
780             if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
781                 delete tp.bi;
782                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
783                 skipTest = false;
784                 charIdx += 5;
785                 break;
786             }
787             if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
788                 delete tp.bi;
789                 tp.bi = BreakIterator::createLineInstance(locale,  status);
790                 skipTest = false;
791                 charIdx += 5;
792                 break;
793             }
794             if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
795                 delete tp.bi;
796                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
797                 skipTest = false;
798                 charIdx += 5;
799                 break;
800             }
801             if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
802                 delete tp.bi;
803                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
804                 charIdx += 6;
805                 break;
806             }
807 
808             if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
809                 testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
810                 charIdx = testString.indexOf(u'>', charIdx) + 1;
811                 parseState = PARSE_RULES;
812                 rules.remove();
813                 rulesFirstLine = lineNum;
814                 break;
815             }
816 
817             // <locale  loc_name>
818             localeMatcher.reset(testString);
819             if (localeMatcher.lookingAt(charIdx-1, status)) {
820                 UnicodeString localeName = localeMatcher.group(1, status);
821                 char localeName8[100];
822                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
823                 locale = Locale::createFromName(localeName8);
824                 charIdx += localeMatcher.group(0, status).length() - 1;
825                 TEST_ASSERT_SUCCESS(status);
826                 break;
827             }
828             if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
829                 parseState = PARSE_DATA;
830                 charIdx += 5;
831                 tp.dataToBreak = "";
832                 tp.expectedBreaks->removeAllElements();
833                 tp.srcCol ->removeAllElements();
834                 tp.srcLine->removeAllElements();
835                 break;
836             }
837 
838             errln("line %d: Tag expected in test file.", lineNum);
839             parseState = PARSE_COMMENT;
840             savedState = PARSE_DATA;
841             goto end_test; // Stop the test.
842             }
843             break;
844 
845         case PARSE_RULES:
846             if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
847                 charIdx += 7;
848                 parseState = PARSE_TAG;
849                 delete tp.bi;
850                 UParseError pe;
851                 tp.bi = new RuleBasedBreakIterator(rules, pe, status);
852                 skipTest = U_FAILURE(status);
853                 if (U_FAILURE(status)) {
854                     errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
855                         rulesFirstLine + pe.line - 1, u_errorName(status));
856                 }
857             } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
858                 charIdx += 10;
859                 parseState = PARSE_TAG;
860                 UErrorCode ec = U_ZERO_ERROR;
861                 UParseError pe;
862                 RuleBasedBreakIterator bi(rules, pe, ec);
863                 if (U_SUCCESS(ec)) {
864                     errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
865                         rulesFirstLine + pe.line - 1);
866                 }
867             } else {
868                 rules.append(c);
869             }
870             break;
871 
872         case PARSE_DATA:
873             if (c == u'•') {
874                 int32_t  breakIdx = tp.dataToBreak.length();
875                 tp.expectedBreaks->setSize(breakIdx+1);
876                 tp.expectedBreaks->setElementAt(-1, breakIdx);
877                 tp.srcLine->setSize(breakIdx+1);
878                 tp.srcLine->setElementAt(lineNum, breakIdx);
879                 tp.srcCol ->setSize(breakIdx+1);
880                 tp.srcCol ->setElementAt(column, breakIdx);
881                 break;
882             }
883 
884             if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
885                 // Add final entry to mappings from break location to source file position.
886                 //  Need one extra because last break position returned is after the
887                 //    last char in the data, not at the last char.
888                 tp.srcLine->addElement(lineNum, status);
889                 tp.srcCol ->addElement(column, status);
890 
891                 parseState = PARSE_TAG;
892                 charIdx += 6;
893 
894                 if (!skipTest) {
895                     // RUN THE TEST!
896                     status = U_ZERO_ERROR;
897                     tp.setUTF16(status);
898                     executeTest(&tp, status);
899                     TEST_ASSERT_SUCCESS(status);
900 
901                     // Run again, this time with UTF-8 text wrapped in a UText.
902                     status = U_ZERO_ERROR;
903                     tp.setUTF8(status);
904                     TEST_ASSERT_SUCCESS(status);
905                     executeTest(&tp, status);
906                 }
907                 break;
908             }
909 
910             if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
911                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
912                 // Get the code point from the name and insert it into the test data.
913                 //   (Damn, no API takes names in Unicode  !!!
914                 //    we've got to take it back to char *)
915                 int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
916                 int32_t nameLength = nameEndIdx - (charIdx+2);
917                 char charNameBuf[200];
918                 UChar32 theChar = -1;
919                 if (nameEndIdx != -1) {
920                     UErrorCode status = U_ZERO_ERROR;
921                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
922                     charNameBuf[sizeof(charNameBuf)-1] = 0;
923                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
924                     if (U_FAILURE(status)) {
925                         theChar = -1;
926                     }
927                 }
928                 if (theChar == -1) {
929                     errln("Error in named character in test file at line %d, col %d",
930                         lineNum, column);
931                 } else {
932                     // Named code point was recognized.  Insert it
933                     //   into the test data.
934                     tp.dataToBreak.append(theChar);
935                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
936                         tp.srcLine->addElement(lineNum, status);
937                         tp.srcCol ->addElement(column, status);
938                     }
939                 }
940                 if (nameEndIdx > charIdx) {
941                     charIdx = nameEndIdx+1;
942 
943                 }
944                 break;
945             }
946 
947 
948 
949             if (testString.compare(charIdx-1, 2, u"<>") == 0) {
950                 charIdx++;
951                 int32_t  breakIdx = tp.dataToBreak.length();
952                 tp.expectedBreaks->setSize(breakIdx+1);
953                 tp.expectedBreaks->setElementAt(-1, breakIdx);
954                 tp.srcLine->setSize(breakIdx+1);
955                 tp.srcLine->setElementAt(lineNum, breakIdx);
956                 tp.srcCol ->setSize(breakIdx+1);
957                 tp.srcCol ->setElementAt(column, breakIdx);
958                 break;
959             }
960 
961             if (c == u'<') {
962                 tagValue   = 0;
963                 parseState = PARSE_NUM;
964                 break;
965             }
966 
967             if (c == u'#' && column==3) {   // TODO:  why is column off so far?
968                 parseState = PARSE_COMMENT;
969                 savedState = PARSE_DATA;
970                 break;
971             }
972 
973             if (c == u'\\') {
974                 // Check for \ at end of line, a line continuation.
975                 //     Advance over (discard) the newline
976                 UChar32 cp = testString.char32At(charIdx);
977                 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
978                     // We have a CR LF
979                     //  Need an extra increment of the input ptr to move over both of them
980                     charIdx++;
981                 }
982                 if (cp == u'\n' || cp == u'\r') {
983                     lineNum++;
984                     colStart = charIdx;
985                     charIdx++;
986                     break;
987                 }
988 
989                 // Let unescape handle the back slash.
990                 cp = testString.unescapeAt(charIdx);
991                 if (cp != -1) {
992                     // Escape sequence was recognized.  Insert the char
993                     //   into the test data.
994                     tp.dataToBreak.append(cp);
995                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
996                         tp.srcLine->addElement(lineNum, status);
997                         tp.srcCol ->addElement(column, status);
998                     }
999                     break;
1000                 }
1001 
1002 
1003                 // Not a recognized backslash escape sequence.
1004                 // Take the next char as a literal.
1005                 //  TODO:  Should this be an error?
1006                 c = testString.charAt(charIdx);
1007                 charIdx = testString.moveIndex32(charIdx, 1);
1008             }
1009 
1010             // Normal, non-escaped data char.
1011             tp.dataToBreak.append(c);
1012 
1013             // Save the mapping from offset in the data to line/column numbers in
1014             //   the original input file.  Will be used for better error messages only.
1015             //   If there's an expected break before this char, the slot in the mapping
1016             //     vector will already be set for this char; don't overwrite it.
1017             if (tp.dataToBreak.length() > tp.srcLine->size()) {
1018                 tp.srcLine->addElement(lineNum, status);
1019                 tp.srcCol ->addElement(column, status);
1020             }
1021             break;
1022 
1023 
1024         case PARSE_NUM:
1025             // We are parsing an expected numeric tag value, like <1234>,
1026             //   within a chunk of data.
1027             if (u_isUWhiteSpace(c)) {
1028                 break;
1029             }
1030 
1031             if (c == u'>') {
1032                 // Finished the number.  Add the info to the expected break data,
1033                 //   and switch parse state back to doing plain data.
1034                 parseState = PARSE_DATA;
1035                 if (tagValue == 0) {
1036                     tagValue = -1;
1037                 }
1038                 int32_t  breakIdx = tp.dataToBreak.length();
1039                 tp.expectedBreaks->setSize(breakIdx+1);
1040                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1041                 tp.srcLine->setSize(breakIdx+1);
1042                 tp.srcLine->setElementAt(lineNum, breakIdx);
1043                 tp.srcCol ->setSize(breakIdx+1);
1044                 tp.srcCol ->setElementAt(column, breakIdx);
1045                 break;
1046             }
1047 
1048             if (u_isdigit(c)) {
1049                 tagValue = tagValue*10 + u_charDigitValue(c);
1050                 break;
1051             }
1052 
1053             errln("Syntax Error in test file at line %d, col %d",
1054                 lineNum, column);
1055             parseState = PARSE_COMMENT;
1056             goto end_test; // Stop the test
1057             break;
1058         }
1059 
1060 
1061         if (U_FAILURE(status)) {
1062             dataerrln("ICU Error %s while parsing test file at line %d.",
1063                 u_errorName(status), lineNum);
1064             status = U_ZERO_ERROR;
1065             goto end_test; // Stop the test
1066         }
1067 
1068     }
1069 
1070     // Reached end of test file. Raise an error if parseState indicates that we are
1071     //   within a block that should have been terminated.
1072 
1073     if (parseState == PARSE_RULES) {
1074         errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
1075             lineNum, rulesFirstLine);
1076     }
1077     if (parseState == PARSE_DATA) {
1078         errln("rbbitst.txt:%d <data> block not closed.", lineNum);
1079     }
1080 
1081 
1082 end_test:
1083     delete [] testFile;
1084 #endif
1085 }
1086 
1087 
1088 //-------------------------------------------------------------------------------
1089 //
1090 //  TestDictRules   create a break iterator from source rules that includes a
1091 //                  dictionary range.   Regression for bug #7130.  Source rules
1092 //                  do not declare a break iterator type (word, line, sentence, etc.
1093 //                  but the dictionary code, without a type, would loop.
1094 //
1095 //-------------------------------------------------------------------------------
TestDictRules()1096 void RBBITest::TestDictRules() {
1097     const char *rules =  "$dictionary = [a-z]; \n"
1098                          "!!forward; \n"
1099                          "$dictionary $dictionary; \n"
1100                          "!!reverse; \n"
1101                          "$dictionary $dictionary; \n";
1102     const char *text = "aa";
1103     UErrorCode status = U_ZERO_ERROR;
1104     UParseError parseError;
1105 
1106     RuleBasedBreakIterator bi(rules, parseError, status);
1107     if (U_SUCCESS(status)) {
1108         UnicodeString utext = text;
1109         bi.setText(utext);
1110         int32_t position;
1111         int32_t loops;
1112         for (loops = 0; loops<10; loops++) {
1113             position = bi.next();
1114             if (position == RuleBasedBreakIterator::DONE) {
1115                 break;
1116             }
1117         }
1118         TEST_ASSERT(loops == 1);
1119     } else {
1120         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1121     }
1122 }
1123 
1124 
1125 
1126 //-------------------------------------------------------------------------------
1127 //
1128 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
1129 //    return the data in one big UChar * buffer, which the caller must delete.
1130 //
1131 //    parameters:
1132 //          fileName:   the name of the file, with no directory part.  The test data directory
1133 //                      is assumed.
1134 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
1135 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
1136 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
1137 //                      Pass NULL for the system default encoding.
1138 //          status
1139 //    returns:
1140 //                      The file data, converted to UChar.
1141 //                      The caller must delete this when done with
1142 //                           delete [] theBuffer;
1143 //
1144 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
1145 //           Move this function to some common place.
1146 //
1147 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int & ulen,const char * encoding,UErrorCode & status)1148 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1149     UChar       *retPtr  = NULL;
1150     char        *fileBuf = NULL;
1151     UConverter* conv     = NULL;
1152     FILE        *f       = NULL;
1153 
1154     ulen = 0;
1155     if (U_FAILURE(status)) {
1156         return retPtr;
1157     }
1158 
1159     //
1160     //  Open the file.
1161     //
1162     f = fopen(fileName, "rb");
1163     if (f == 0) {
1164         dataerrln("Error opening test data file %s\n", fileName);
1165         status = U_FILE_ACCESS_ERROR;
1166         return NULL;
1167     }
1168     //
1169     //  Read it in
1170     //
1171     int   fileSize;
1172     int   amt_read;
1173 
1174     fseek( f, 0, SEEK_END);
1175     fileSize = ftell(f);
1176     fileBuf = new char[fileSize];
1177     fseek(f, 0, SEEK_SET);
1178     amt_read = fread(fileBuf, 1, fileSize, f);
1179     if (amt_read != fileSize || fileSize <= 0) {
1180         errln("Error reading test data file.");
1181         goto cleanUpAndReturn;
1182     }
1183 
1184     //
1185     // Look for a Unicode Signature (BOM) on the data just read
1186     //
1187     int32_t        signatureLength;
1188     const char *   fileBufC;
1189     const char*    bomEncoding;
1190 
1191     fileBufC = fileBuf;
1192     bomEncoding = ucnv_detectUnicodeSignature(
1193         fileBuf, fileSize, &signatureLength, &status);
1194     if(bomEncoding!=NULL ){
1195         fileBufC  += signatureLength;
1196         fileSize  -= signatureLength;
1197         encoding = bomEncoding;
1198     }
1199 
1200     //
1201     // Open a converter to take the rule file to UTF-16
1202     //
1203     conv = ucnv_open(encoding, &status);
1204     if (U_FAILURE(status)) {
1205         goto cleanUpAndReturn;
1206     }
1207 
1208     //
1209     // Convert the rules to UChar.
1210     //  Preflight first to determine required buffer size.
1211     //
1212     ulen = ucnv_toUChars(conv,
1213         NULL,           //  dest,
1214         0,              //  destCapacity,
1215         fileBufC,
1216         fileSize,
1217         &status);
1218     if (status == U_BUFFER_OVERFLOW_ERROR) {
1219         // Buffer Overflow is expected from the preflight operation.
1220         status = U_ZERO_ERROR;
1221 
1222         retPtr = new UChar[ulen+1];
1223         ucnv_toUChars(conv,
1224             retPtr,       //  dest,
1225             ulen+1,
1226             fileBufC,
1227             fileSize,
1228             &status);
1229     }
1230 
1231 cleanUpAndReturn:
1232     fclose(f);
1233     delete []fileBuf;
1234     ucnv_close(conv);
1235     if (U_FAILURE(status)) {
1236         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1237         delete []retPtr;
1238         retPtr = 0;
1239         ulen   = 0;
1240     };
1241     return retPtr;
1242 }
1243 
1244 
1245 
1246 //--------------------------------------------------------------------------------------------
1247 //
1248 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1249 //
1250 //-------------------------------------------------------------------------------------------
TestUnicodeFiles()1251 void RBBITest::TestUnicodeFiles() {
1252     RuleBasedBreakIterator  *bi;
1253     UErrorCode               status = U_ZERO_ERROR;
1254 
1255     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1256     TEST_ASSERT_SUCCESS(status);
1257     if (U_SUCCESS(status)) {
1258         runUnicodeTestData("GraphemeBreakTest.txt", bi);
1259     }
1260     delete bi;
1261 
1262     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1263     TEST_ASSERT_SUCCESS(status);
1264     if (U_SUCCESS(status)) {
1265         runUnicodeTestData("WordBreakTest.txt", bi);
1266     }
1267     delete bi;
1268 
1269     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1270     TEST_ASSERT_SUCCESS(status);
1271     if (U_SUCCESS(status)) {
1272         runUnicodeTestData("SentenceBreakTest.txt", bi);
1273     }
1274     delete bi;
1275 
1276     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1277     TEST_ASSERT_SUCCESS(status);
1278     if (U_SUCCESS(status)) {
1279         runUnicodeTestData("LineBreakTest.txt", bi);
1280     }
1281     delete bi;
1282 }
1283 
1284 
1285 // Check for test cases from the Unicode test data files that are known to fail
1286 // and should be skipped as known issues because ICU does not fully implement
1287 // the Unicode specifications, or because ICU includes tailorings that differ from
1288 // the Unicode standard.
1289 //
1290 // Test cases are identified by the test data sequence, which tends to be more stable
1291 // across Unicode versions than the test file line numbers.
1292 //
1293 // The test case with ticket "10666" is a dummy, included as an example.
1294 
testCaseIsKnownIssue(const UnicodeString & testCase,const char * fileName)1295 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1296     static struct TestCase {
1297         const char *fTicketNum;
1298         const char *fFileName;
1299         const UChar *fString;
1300     } badTestCases[] = {
1301         {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"},    // Fake example, for illustration.
1302         // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
1303         // This probably ultimately wants to be resolved by updating UAX-14, but in the mean time
1304         // ICU is out of sync with Unicode.
1305         {"8151",  "LineBreakTest.txt", u"-#"},
1306         {"8151",  "LineBreakTest.txt", u"\u002d\u0308\u0023"},
1307         {"8151",  "LineBreakTest.txt", u"\u002d\u00a7"},
1308         {"8151",  "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
1309         {"8151",  "LineBreakTest.txt", u"\u002d\U00050005"},
1310         {"8151",  "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
1311         {"8151",  "LineBreakTest.txt", u"\u002d\u0e01"},
1312         {"8151",  "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
1313     };
1314 
1315     for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
1316         const TestCase &badCase = badTestCases[n];
1317         if (!strcmp(fileName, badCase.fFileName) &&
1318                 testCase == UnicodeString(badCase.fString)) {
1319             return logKnownIssue(badCase.fTicketNum);
1320         }
1321     }
1322     return FALSE;
1323 }
1324 
1325 
1326 //--------------------------------------------------------------------------------------------
1327 //
1328 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1329 //
1330 //-------------------------------------------------------------------------------------------
runUnicodeTestData(const char * fileName,RuleBasedBreakIterator * bi)1331 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1332 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1333     UErrorCode  status = U_ZERO_ERROR;
1334 
1335     //
1336     //  Open and read the test data file, put it into a UnicodeString.
1337     //
1338     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1339     char testFileName[1000];
1340     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1341         dataerrln("Can't open test data.  Path too long.");
1342         return;
1343     }
1344     strcpy(testFileName, testDataDirectory);
1345     strcat(testFileName, fileName);
1346 
1347     logln("Opening data file %s\n", fileName);
1348 
1349     int    len;
1350     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1351     if (status != U_FILE_ACCESS_ERROR) {
1352         TEST_ASSERT_SUCCESS(status);
1353         TEST_ASSERT(testFile != NULL);
1354     }
1355     if (U_FAILURE(status) || testFile == NULL) {
1356         return; /* something went wrong, error already output */
1357     }
1358     UnicodeString testFileAsString(TRUE, testFile, len);
1359 
1360     //
1361     //  Parse the test data file using a regular expression.
1362     //  Each kind of token is recognized in its own capture group; what type of item was scanned
1363     //     is identified by which group had a match.
1364     //
1365     //    Caputure Group #                  1          2            3            4           5
1366     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1367     //
1368     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1369     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1370     UnicodeString   testString;
1371     UVector32       breakPositions(status);
1372     int             lineNumber = 1;
1373     TEST_ASSERT_SUCCESS(status);
1374     if (U_FAILURE(status)) {
1375         return;
1376     }
1377 
1378     //
1379     //  Scan through each test case, building up the string to be broken in testString,
1380     //   and the positions that should be boundaries in the breakPositions vector.
1381     //
1382     int spin = 0;
1383     while (tokenMatcher.find()) {
1384       	if(tokenMatcher.hitEnd()) {
1385           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
1386              This occurred when the text file was corrupt (wasn't marked as UTF-8)
1387              and caused an infinite loop here on EBCDIC systems!
1388           */
1389           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1390           //	   return;
1391       	}
1392         if (tokenMatcher.start(1, status) >= 0) {
1393             // Scanned a divide sign, indicating a break position in the test data.
1394             if (testString.length()>0) {
1395                 breakPositions.addElement(testString.length(), status);
1396             }
1397         }
1398         else if (tokenMatcher.start(2, status) >= 0) {
1399             // Scanned an 'x', meaning no break at this position in the test data
1400             //   Nothing to be done here.
1401             }
1402         else if (tokenMatcher.start(3, status) >= 0) {
1403             // Scanned Hex digits.  Convert them to binary, append to the character data string.
1404             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1405             int length = hexNumber.length();
1406             if (length<=8) {
1407                 char buf[10];
1408                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1409                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1410                 if (c<=0x10ffff) {
1411                     testString.append(c);
1412                 } else {
1413                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1414                        fileName, lineNumber);
1415                 }
1416             } else {
1417                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1418                        fileName, lineNumber);
1419              }
1420         }
1421         else if (tokenMatcher.start(4, status) >= 0) {
1422             // Scanned to end of a line, possibly skipping over a comment in the process.
1423             //   If the line from the file contained test data, run the test now.
1424             if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1425                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1426             }
1427 
1428             // Clear out this test case.
1429             //    The string and breakPositions vector will be refilled as the next
1430             //       test case is parsed.
1431             testString.remove();
1432             breakPositions.removeAllElements();
1433             lineNumber++;
1434         } else {
1435             // Scanner catchall.  Something unrecognized appeared on the line.
1436             char token[16];
1437             UnicodeString uToken = tokenMatcher.group(0, status);
1438             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1439             token[sizeof(token)-1] = 0;
1440             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1441 
1442             // Clean up, in preparation for continuing with the next line.
1443             testString.remove();
1444             breakPositions.removeAllElements();
1445             lineNumber++;
1446         }
1447         TEST_ASSERT_SUCCESS(status);
1448         if (U_FAILURE(status)) {
1449             break;
1450         }
1451     }
1452 
1453     delete [] testFile;
1454  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1455 }
1456 
1457 //--------------------------------------------------------------------------------------------
1458 //
1459 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1460 //                            test data files.  Do only a simple, forward-only check -
1461 //                            this test is mostly to check that ICU and the Unicode
1462 //                            data agree with each other.
1463 //
1464 //--------------------------------------------------------------------------------------------
checkUnicodeTestCase(const char * testFileName,int lineNumber,const UnicodeString & testString,UVector32 * breakPositions,RuleBasedBreakIterator * bi)1465 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1466                          const UnicodeString &testString,   // Text data to be broken
1467                          UVector32 *breakPositions,         // Positions where breaks should be found.
1468                          RuleBasedBreakIterator *bi) {
1469     int32_t pos;                 // Break Position in the test string
1470     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1471     int32_t expectedPos;         // Expected break position (index into test string)
1472 
1473     bi->setText(testString);
1474     pos = bi->first();
1475     pos = bi->next();
1476 
1477     while (pos != BreakIterator::DONE) {
1478         if (expectedI >= breakPositions->size()) {
1479             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1480                 testFileName, lineNumber, pos);
1481             break;
1482         }
1483         expectedPos = breakPositions->elementAti(expectedI);
1484         if (pos < expectedPos) {
1485             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1486                 testFileName, lineNumber, pos);
1487             break;
1488         }
1489         if (pos > expectedPos) {
1490             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1491                 testFileName, lineNumber, expectedPos);
1492             break;
1493         }
1494         pos = bi->next();
1495         expectedI++;
1496     }
1497 
1498     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1499         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1500             testFileName, lineNumber, breakPositions->elementAti(expectedI));
1501     }
1502 }
1503 
1504 
1505 
1506 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1507 //---------------------------------------------------------------------------------------
1508 //
1509 //   classs RBBIMonkeyKind
1510 //
1511 //      Monkey Test for Break Iteration
1512 //      Abstract interface class.   Concrete derived classes independently
1513 //      implement the break rules for different iterator types.
1514 //
1515 //      The Monkey Test itself uses doesn't know which type of break iterator it is
1516 //      testing, but works purely in terms of the interface defined here.
1517 //
1518 //---------------------------------------------------------------------------------------
1519 class RBBIMonkeyKind {
1520 public:
1521     // Return a UVector of UnicodeSets, representing the character classes used
1522     //   for this type of iterator.
1523     virtual  UVector  *charClasses() = 0;
1524 
1525     // Set the test text on which subsequent calls to next() will operate
1526     virtual  void      setText(const UnicodeString &s) = 0;
1527 
1528     // Find the next break postion, starting from the prev break position, or from zero.
1529     // Return -1 after reaching end of string.
1530     virtual  int32_t   next(int32_t i) = 0;
1531 
1532     virtual ~RBBIMonkeyKind();
1533     UErrorCode       deferredStatus;
1534 
1535 
1536 protected:
1537     RBBIMonkeyKind();
1538 
1539 private:
1540 };
1541 
RBBIMonkeyKind()1542 RBBIMonkeyKind::RBBIMonkeyKind() {
1543     deferredStatus = U_ZERO_ERROR;
1544 }
1545 
~RBBIMonkeyKind()1546 RBBIMonkeyKind::~RBBIMonkeyKind() {
1547 }
1548 
1549 
1550 //----------------------------------------------------------------------------------------
1551 //
1552 //   Random Numbers.  Similar to standard lib rand() and srand()
1553 //                    Not using library to
1554 //                      1.  Get same results on all platforms.
1555 //                      2.  Get access to current seed, to more easily reproduce failures.
1556 //
1557 //---------------------------------------------------------------------------------------
// Current generator state.  Assign to it to (re)seed; read it to record the
//   state needed to reproduce a run.
static uint32_t m_seed = 1;

// Step the linear congruential generator once and return a value in 0..32767,
//   taken from bits 16..30 of the new state.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    m_seed = m_seed * kMultiplier + kIncrement;    // unsigned arithmetic wraps modulo 2^32
    return (m_seed >> 16) & 0x7fff;
}
1565 
1566 
1567 //------------------------------------------------------------------------------------------
1568 //
1569 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
1570 //                             of RBBIMonkeyKind.
1571 //
1572 //------------------------------------------------------------------------------------------
1573 class RBBICharMonkey: public RBBIMonkeyKind {
1574 public:
1575     RBBICharMonkey();
1576     virtual          ~RBBICharMonkey();
1577     virtual  UVector *charClasses();
1578     virtual  void     setText(const UnicodeString &s);
1579     virtual  int32_t  next(int32_t i);
1580 private:
1581     UVector   *fSets;
1582 
1583     UnicodeSet  *fCRLFSet;
1584     UnicodeSet  *fControlSet;
1585     UnicodeSet  *fExtendSet;
1586     UnicodeSet  *fZWJSet;
1587     UnicodeSet  *fRegionalIndicatorSet;
1588     UnicodeSet  *fPrependSet;
1589     UnicodeSet  *fSpacingSet;
1590     UnicodeSet  *fLSet;
1591     UnicodeSet  *fVSet;
1592     UnicodeSet  *fTSet;
1593     UnicodeSet  *fLVSet;
1594     UnicodeSet  *fLVTSet;
1595     UnicodeSet  *fHangulSet;
1596     UnicodeSet  *fExtendedPictSet;
1597     UnicodeSet  *fAnySet;
1598 
1599     const UnicodeString *fText;
1600 };
1601 
1602 
RBBICharMonkey()1603 RBBICharMonkey::RBBICharMonkey() {
1604     UErrorCode  status = U_ZERO_ERROR;
1605 
1606     fText = NULL;
1607 
1608     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1609     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
1610     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
1611     fZWJSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
1612     fRegionalIndicatorSet =
1613                   new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1614     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1615     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1616     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1617     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1618     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1619     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1620     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1621     fHangulSet  = new UnicodeSet();
1622     fHangulSet->addAll(*fLSet);
1623     fHangulSet->addAll(*fVSet);
1624     fHangulSet->addAll(*fTSet);
1625     fHangulSet->addAll(*fLVSet);
1626     fHangulSet->addAll(*fLVTSet);
1627 
1628     fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1629     fAnySet           = new UnicodeSet(0, 0x10ffff);
1630 
1631     fSets             = new UVector(status);
1632     fSets->addElement(fCRLFSet,    status);
1633     fSets->addElement(fControlSet, status);
1634     fSets->addElement(fExtendSet,  status);
1635     fSets->addElement(fRegionalIndicatorSet, status);
1636     if (!fPrependSet->isEmpty()) {
1637         fSets->addElement(fPrependSet, status);
1638     }
1639     fSets->addElement(fSpacingSet, status);
1640     fSets->addElement(fHangulSet,  status);
1641     fSets->addElement(fAnySet,     status);
1642     fSets->addElement(fZWJSet,     status);
1643     fSets->addElement(fExtendedPictSet, status);
1644     if (U_FAILURE(status)) {
1645         deferredStatus = status;
1646     }
1647 }
1648 
1649 
setText(const UnicodeString & s)1650 void RBBICharMonkey::setText(const UnicodeString &s) {
1651     fText = &s;
1652 }
1653 
1654 
1655 
next(int32_t prevPos)1656 int32_t RBBICharMonkey::next(int32_t prevPos) {
1657     int    p0, p1, p2, p3;    // Indices of the significant code points around the
1658                               //   break position being tested.  The candidate break
1659                               //   location is before p2.
1660 
1661     int     breakPos = -1;
1662 
1663     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
1664     UChar32 cBase;            // for (X Extend*) patterns, the X character.
1665 
1666     if (U_FAILURE(deferredStatus)) {
1667         return -1;
1668     }
1669 
1670     // Previous break at end of string.  return DONE.
1671     if (prevPos >= fText->length()) {
1672         return -1;
1673     }
1674     p0 = p1 = p2 = p3 = prevPos;
1675     c3 =  fText->char32At(prevPos);
1676     c0 = c1 = c2 = cBase = 0;
1677     (void)p0;   // suppress set but not used warning.
1678     (void)c0;
1679 
1680     // Loop runs once per "significant" character position in the input text.
1681     for (;;) {
1682         // Move all of the positions forward in the input string.
1683         p0 = p1;  c0 = c1;
1684         p1 = p2;  c1 = c2;
1685         p2 = p3;  c2 = c3;
1686 
1687         // Advancd p3 by one codepoint
1688         p3 = fText->moveIndex32(p3, 1);
1689         c3 = fText->char32At(p3);
1690 
1691         if (p1 == p2) {
1692             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
1693             continue;
1694         }
1695         if (p2 == fText->length()) {
1696             // Reached end of string.  Always a break position.
1697             break;
1698         }
1699 
1700         // Rule  GB3   CR x LF
1701         //     No Extend or Format characters may appear between the CR and LF,
1702         //     which requires the additional check for p2 immediately following p1.
1703         //
1704         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
1705             continue;
1706         }
1707 
1708         // Rule (GB4).   ( Control | CR | LF ) <break>
1709         if (fControlSet->contains(c1) ||
1710             c1 == 0x0D ||
1711             c1 == 0x0A)  {
1712             break;
1713         }
1714 
1715         // Rule (GB5)    <break>  ( Control | CR | LF )
1716         //
1717         if (fControlSet->contains(c2) ||
1718             c2 == 0x0D ||
1719             c2 == 0x0A)  {
1720             break;
1721         }
1722 
1723 
1724         // Rule (GB6)  L x ( L | V | LV | LVT )
1725         if (fLSet->contains(c1) &&
1726                (fLSet->contains(c2)  ||
1727                 fVSet->contains(c2)  ||
1728                 fLVSet->contains(c2) ||
1729                 fLVTSet->contains(c2))) {
1730             continue;
1731         }
1732 
1733         // Rule (GB7)    ( LV | V )  x  ( V | T )
1734         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1735             (fVSet->contains(c2) || fTSet->contains(c2)))  {
1736             continue;
1737         }
1738 
1739         // Rule (GB8)    ( LVT | T)  x T
1740         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1741             fTSet->contains(c2))  {
1742             continue;
1743         }
1744 
1745         // Rule (GB9)    x (Extend | ZWJ)
1746         if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
1747             if (!fExtendSet->contains(c1)) {
1748                 cBase = c1;
1749             }
1750             continue;
1751         }
1752 
1753         // Rule (GB9a)   x  SpacingMark
1754         if (fSpacingSet->contains(c2)) {
1755             continue;
1756         }
1757 
1758         // Rule (GB9b)   Prepend x
1759         if (fPrependSet->contains(c1)) {
1760             continue;
1761         }
1762 
1763         // Rule (GB11)   Extended_Pictographic Extend * ZWJ x Extended_Pictographic
1764         if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
1765             continue;
1766         }
1767 
1768         // Rule (GB12-13)    Regional_Indicator x Regional_Indicator
1769         //                   Note: The first if condition is a little tricky. We only need to force
1770         //                      a break if there are three or more contiguous RIs. If there are
1771         //                      only two, a break following will occur via other rules, and will include
1772         //                      any trailing extend characters, which is needed behavior.
1773         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
1774                 && fRegionalIndicatorSet->contains(c2)) {
1775             break;
1776         }
1777         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
1778             continue;
1779         }
1780 
1781         // Rule (GB999)  Any  <break>  Any
1782         break;
1783     }
1784 
1785     breakPos = p2;
1786     return breakPos;
1787 }
1788 
1789 
1790 
charClasses()1791 UVector  *RBBICharMonkey::charClasses() {
1792     return fSets;
1793 }
1794 
1795 
~RBBICharMonkey()1796 RBBICharMonkey::~RBBICharMonkey() {
1797     delete fSets;
1798     delete fCRLFSet;
1799     delete fControlSet;
1800     delete fExtendSet;
1801     delete fRegionalIndicatorSet;
1802     delete fPrependSet;
1803     delete fSpacingSet;
1804     delete fLSet;
1805     delete fVSet;
1806     delete fTSet;
1807     delete fLVSet;
1808     delete fLVTSet;
1809     delete fHangulSet;
1810     delete fAnySet;
1811     delete fZWJSet;
1812     delete fExtendedPictSet;
1813 }
1814 
1815 //------------------------------------------------------------------------------------------
1816 //
1817 //   class RBBIWordMonkey      Word Break specific implementation
1818 //                             of RBBIMonkeyKind.
1819 //
1820 //------------------------------------------------------------------------------------------
1821 class RBBIWordMonkey: public RBBIMonkeyKind {
1822 public:
1823     RBBIWordMonkey();
1824     virtual          ~RBBIWordMonkey();
1825     virtual  UVector *charClasses();
1826     virtual  void     setText(const UnicodeString &s);
1827     virtual int32_t   next(int32_t i);
1828 private:
1829     UVector      *fSets;
1830 
1831     UnicodeSet  *fCRSet;
1832     UnicodeSet  *fLFSet;
1833     UnicodeSet  *fNewlineSet;
1834     UnicodeSet  *fRegionalIndicatorSet;
1835     UnicodeSet  *fKatakanaSet;
1836     UnicodeSet  *fHebrew_LetterSet;
1837     UnicodeSet  *fALetterSet;
1838     UnicodeSet  *fSingle_QuoteSet;
1839     UnicodeSet  *fDouble_QuoteSet;
1840     UnicodeSet  *fMidNumLetSet;
1841     UnicodeSet  *fMidLetterSet;
1842     UnicodeSet  *fMidNumSet;
1843     UnicodeSet  *fNumericSet;
1844     UnicodeSet  *fFormatSet;
1845     UnicodeSet  *fOtherSet;
1846     UnicodeSet  *fExtendSet;
1847     UnicodeSet  *fExtendNumLetSet;
1848     UnicodeSet  *fWSegSpaceSet;
1849     UnicodeSet  *fDictionarySet;
1850     UnicodeSet  *fZWJSet;
1851     UnicodeSet  *fExtendedPictSet;
1852 
1853     const UnicodeString  *fText;
1854 };
1855 
1856 
RBBIWordMonkey()1857 RBBIWordMonkey::RBBIWordMonkey()
1858 {
1859     UErrorCode  status = U_ZERO_ERROR;
1860 
1861     fSets            = new UVector(status);
1862 
1863     fCRSet            = new UnicodeSet(u"[\\p{Word_Break = CR}]",           status);
1864     fLFSet            = new UnicodeSet(u"[\\p{Word_Break = LF}]",           status);
1865     fNewlineSet       = new UnicodeSet(u"[\\p{Word_Break = Newline}]",      status);
1866     fKatakanaSet      = new UnicodeSet(u"[\\p{Word_Break = Katakana}]",     status);
1867     fRegionalIndicatorSet =  new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
1868     fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
1869     fALetterSet       = new UnicodeSet(u"[\\p{Word_Break = ALetter}]", status);
1870     fSingle_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]",    status);
1871     fDouble_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]",    status);
1872     fMidNumLetSet     = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]",    status);
1873     fMidLetterSet     = new UnicodeSet(u"[\\p{Word_Break = MidLetter}]",    status);
1874     fMidNumSet        = new UnicodeSet(u"[\\p{Word_Break = MidNum}]",       status);
1875     fNumericSet       = new UnicodeSet(u"[\\p{Word_Break = Numeric}]",      status);
1876     fFormatSet        = new UnicodeSet(u"[\\p{Word_Break = Format}]",       status);
1877     fExtendNumLetSet  = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
1878     fExtendSet        = new UnicodeSet(u"[\\p{Word_Break = Extend}]",       status);
1879     fWSegSpaceSet     = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]",    status);
1880 
1881     fZWJSet           = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]",          status);
1882     fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1883 
1884     fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
1885     fDictionarySet->addAll(*fKatakanaSet);
1886     fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
1887 
1888     fALetterSet->removeAll(*fDictionarySet);
1889 
1890     fOtherSet        = new UnicodeSet();
1891     if(U_FAILURE(status)) {
1892         IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
1893         deferredStatus = status;
1894         return;
1895     }
1896 
1897     fOtherSet->complement();
1898     fOtherSet->removeAll(*fCRSet);
1899     fOtherSet->removeAll(*fLFSet);
1900     fOtherSet->removeAll(*fNewlineSet);
1901     fOtherSet->removeAll(*fKatakanaSet);
1902     fOtherSet->removeAll(*fHebrew_LetterSet);
1903     fOtherSet->removeAll(*fALetterSet);
1904     fOtherSet->removeAll(*fSingle_QuoteSet);
1905     fOtherSet->removeAll(*fDouble_QuoteSet);
1906     fOtherSet->removeAll(*fMidLetterSet);
1907     fOtherSet->removeAll(*fMidNumSet);
1908     fOtherSet->removeAll(*fNumericSet);
1909     fOtherSet->removeAll(*fExtendNumLetSet);
1910     fOtherSet->removeAll(*fWSegSpaceSet);
1911     fOtherSet->removeAll(*fFormatSet);
1912     fOtherSet->removeAll(*fExtendSet);
1913     fOtherSet->removeAll(*fRegionalIndicatorSet);
1914     fOtherSet->removeAll(*fZWJSet);
1915     fOtherSet->removeAll(*fExtendedPictSet);
1916 
1917     // Inhibit dictionary characters from being tested at all.
1918     fOtherSet->removeAll(*fDictionarySet);
1919 
1920     fSets->addElement(fCRSet,                status);
1921     fSets->addElement(fLFSet,                status);
1922     fSets->addElement(fNewlineSet,           status);
1923     fSets->addElement(fRegionalIndicatorSet, status);
1924     fSets->addElement(fHebrew_LetterSet,     status);
1925     fSets->addElement(fALetterSet,           status);
1926     fSets->addElement(fSingle_QuoteSet,      status);
1927     fSets->addElement(fDouble_QuoteSet,      status);
1928     //fSets->addElement(fKatakanaSet,          status); // Omit Katakana from fSets, which omits Katakana characters
1929                                                         // from the test data. They are all in the dictionary set,
1930                                                         // which this (old, to be retired) monkey test cannot handle.
1931     fSets->addElement(fMidLetterSet,         status);
1932     fSets->addElement(fMidNumLetSet,         status);
1933     fSets->addElement(fMidNumSet,            status);
1934     fSets->addElement(fNumericSet,           status);
1935     fSets->addElement(fFormatSet,            status);
1936     fSets->addElement(fExtendSet,            status);
1937     fSets->addElement(fOtherSet,             status);
1938     fSets->addElement(fExtendNumLetSet,      status);
1939     fSets->addElement(fWSegSpaceSet,         status);
1940 
1941     fSets->addElement(fZWJSet,               status);
1942     fSets->addElement(fExtendedPictSet,      status);
1943 
1944     if (U_FAILURE(status)) {
1945         deferredStatus = status;
1946     }
1947 }
1948 
setText(const UnicodeString & s)1949 void RBBIWordMonkey::setText(const UnicodeString &s) {
1950     fText       = &s;
1951 }
1952 
1953 
next(int32_t prevPos)1954 int32_t RBBIWordMonkey::next(int32_t prevPos) {
1955     int    p0, p1, p2, p3;    // Indices of the significant code points around the
1956                               //   break position being tested.  The candidate break
1957                               //   location is before p2.
1958 
1959     int     breakPos = -1;
1960 
1961     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
1962 
1963     if (U_FAILURE(deferredStatus)) {
1964         return -1;
1965     }
1966 
1967     // Prev break at end of string.  return DONE.
1968     if (prevPos >= fText->length()) {
1969         return -1;
1970     }
1971     p0 = p1 = p2 = p3 = prevPos;
1972     c3 =  fText->char32At(prevPos);
1973     c0 = c1 = c2 = 0;
1974     (void)p0;       // Suppress set but not used warning.
1975 
1976     // Loop runs once per "significant" character position in the input text.
1977     for (;;) {
1978         // Move all of the positions forward in the input string.
1979         p0 = p1;  c0 = c1;
1980         p1 = p2;  c1 = c2;
1981         p2 = p3;  c2 = c3;
1982 
1983         // Advancd p3 by    X(Extend | Format)*   Rule 4
1984         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
1985         do {
1986             p3 = fText->moveIndex32(p3, 1);
1987             c3 = fText->char32At(p3);
1988             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
1989                break;
1990             };
1991         }
1992         while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
1993 
1994 
1995         if (p1 == p2) {
1996             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
1997             continue;
1998         }
1999         if (p2 == fText->length()) {
2000             // Reached end of string.  Always a break position.
2001             break;
2002         }
2003 
2004         // Rule  (3)   CR x LF
2005         //     No Extend or Format characters may appear between the CR and LF,
2006         //     which requires the additional check for p2 immediately following p1.
2007         //
2008         if (c1==0x0D && c2==0x0A) {
2009             continue;
2010         }
2011 
2012         // Rule (3a)  Break before and after newlines (including CR and LF)
2013         //
2014         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2015             break;
2016         };
2017         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2018             break;
2019         };
2020 
2021         // Rule (3c)    ZWJ x Extended_Pictographic
2022         //              Not ignoring extend chars, so peek into input text to
2023         //              get the potential ZWJ, the character immediately preceding c2.
2024         //              Sloppy UChar32 indexing: p2-1 may reference trail half
2025         //              but char32At will get the full code point.
2026         if (fZWJSet->contains(fText->char32At(p2-1)) && fExtendedPictSet->contains(c2)) {
2027             continue;
2028         }
2029 
2030         // Rule (3d)    Keep horizontal whitespace together.
2031         if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
2032             continue;
2033         }
2034 
2035         // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
2036         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2037             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2038             continue;
2039         }
2040 
2041         // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
2042         //
2043         if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
2044              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2045              (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2046             continue;
2047         }
2048 
2049         // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
2050         if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2051             (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2052             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2053             continue;
2054         }
2055 
2056         // Rule (7a)     Hebrew_Letter x Single_Quote
2057         if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2058             continue;
2059         }
2060 
2061         // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
2062         if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2063             continue;
2064         }
2065 
2066         // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
2067         if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2068             continue;
2069         }
2070 
2071         // Rule (8)    Numeric x Numeric
2072         if (fNumericSet->contains(c1) &&
2073             fNumericSet->contains(c2))  {
2074             continue;
2075         }
2076 
2077         // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
2078         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2079             fNumericSet->contains(c2))  {
2080             continue;
2081         }
2082 
2083         // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
2084         if (fNumericSet->contains(c1) &&
2085             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2086             continue;
2087         }
2088 
2089         // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
2090         if (fNumericSet->contains(c0) &&
2091             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
2092             fNumericSet->contains(c2)) {
2093             continue;
2094         }
2095 
2096         // Rule (12)  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
2097         if (fNumericSet->contains(c1) &&
2098             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
2099             fNumericSet->contains(c3)) {
2100             continue;
2101         }
2102 
2103         // Rule (13)  Katakana x Katakana
2104         //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
2105         //                  all Katakana are handled by the dictionary breaker.
2106         if (fKatakanaSet->contains(c1) &&
2107             fKatakanaSet->contains(c2))  {
2108             continue;
2109         }
2110 
2111         // Rule 13a    (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
2112         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2113              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2114              fExtendNumLetSet->contains(c2)) {
2115                 continue;
2116         }
2117 
2118         // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
2119         if (fExtendNumLetSet->contains(c1) &&
2120                 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2121                  fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
2122             continue;
2123         }
2124 
2125         // Rule 15 - 17   Group pairs of Regional Indicators.
2126         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2127             break;
2128         }
2129         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2130             continue;
2131         }
2132 
2133         // Rule 999.  Break found here.
2134         break;
2135     }
2136 
2137     breakPos = p2;
2138     return breakPos;
2139 }
2140 
2141 
charClasses()2142 UVector  *RBBIWordMonkey::charClasses() {
2143     return fSets;
2144 }
2145 
2146 
~RBBIWordMonkey()2147 RBBIWordMonkey::~RBBIWordMonkey() {
2148     delete fSets;
2149     delete fCRSet;
2150     delete fLFSet;
2151     delete fNewlineSet;
2152     delete fKatakanaSet;
2153     delete fHebrew_LetterSet;
2154     delete fALetterSet;
2155     delete fSingle_QuoteSet;
2156     delete fDouble_QuoteSet;
2157     delete fMidNumLetSet;
2158     delete fMidLetterSet;
2159     delete fMidNumSet;
2160     delete fNumericSet;
2161     delete fFormatSet;
2162     delete fExtendSet;
2163     delete fExtendNumLetSet;
2164     delete fWSegSpaceSet;
2165     delete fRegionalIndicatorSet;
2166     delete fDictionarySet;
2167     delete fOtherSet;
2168     delete fZWJSet;
2169     delete fExtendedPictSet;
2170 }
2171 
2172 
2173 
2174 
2175 //------------------------------------------------------------------------------------------
2176 //
2177 //   class RBBISentMonkey      Sentence Break specific implementation
2178 //                             of RBBIMonkeyKind.
2179 //
2180 //------------------------------------------------------------------------------------------
2181 class RBBISentMonkey: public RBBIMonkeyKind {
2182 public:
2183     RBBISentMonkey();
2184     virtual          ~RBBISentMonkey();
2185     virtual  UVector *charClasses();
2186     virtual  void     setText(const UnicodeString &s);
2187     virtual int32_t   next(int32_t i);
2188 private:
2189     int               moveBack(int posFrom);
2190     int               moveForward(int posFrom);
2191     UChar32           cAt(int pos);
2192 
2193     UVector      *fSets;
2194 
2195     UnicodeSet  *fSepSet;
2196     UnicodeSet  *fFormatSet;
2197     UnicodeSet  *fSpSet;
2198     UnicodeSet  *fLowerSet;
2199     UnicodeSet  *fUpperSet;
2200     UnicodeSet  *fOLetterSet;
2201     UnicodeSet  *fNumericSet;
2202     UnicodeSet  *fATermSet;
2203     UnicodeSet  *fSContinueSet;
2204     UnicodeSet  *fSTermSet;
2205     UnicodeSet  *fCloseSet;
2206     UnicodeSet  *fOtherSet;
2207     UnicodeSet  *fExtendSet;
2208 
2209     const UnicodeString  *fText;
2210 
2211 };
2212 
RBBISentMonkey()2213 RBBISentMonkey::RBBISentMonkey()
2214 {
2215     UErrorCode  status = U_ZERO_ERROR;
2216 
2217     fSets            = new UVector(status);
2218 
2219     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2220     //                       set and made into character classes of their own.  For the monkey impl,
2221     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2222     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2223     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2224     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2225     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2226     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2227     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2228     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2229     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2230     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2231     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2232     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2233     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2234     fOtherSet        = new UnicodeSet();
2235 
2236     if(U_FAILURE(status)) {
2237       deferredStatus = status;
2238       return;
2239     }
2240 
2241     fOtherSet->complement();
2242     fOtherSet->removeAll(*fSepSet);
2243     fOtherSet->removeAll(*fFormatSet);
2244     fOtherSet->removeAll(*fSpSet);
2245     fOtherSet->removeAll(*fLowerSet);
2246     fOtherSet->removeAll(*fUpperSet);
2247     fOtherSet->removeAll(*fOLetterSet);
2248     fOtherSet->removeAll(*fNumericSet);
2249     fOtherSet->removeAll(*fATermSet);
2250     fOtherSet->removeAll(*fSContinueSet);
2251     fOtherSet->removeAll(*fSTermSet);
2252     fOtherSet->removeAll(*fCloseSet);
2253     fOtherSet->removeAll(*fExtendSet);
2254 
2255     fSets->addElement(fSepSet,       status);
2256     fSets->addElement(fFormatSet,    status);
2257     fSets->addElement(fSpSet,        status);
2258     fSets->addElement(fLowerSet,     status);
2259     fSets->addElement(fUpperSet,     status);
2260     fSets->addElement(fOLetterSet,   status);
2261     fSets->addElement(fNumericSet,   status);
2262     fSets->addElement(fATermSet,     status);
2263     fSets->addElement(fSContinueSet, status);
2264     fSets->addElement(fSTermSet,     status);
2265     fSets->addElement(fCloseSet,     status);
2266     fSets->addElement(fOtherSet,     status);
2267     fSets->addElement(fExtendSet,    status);
2268 
2269     if (U_FAILURE(status)) {
2270         deferredStatus = status;
2271     }
2272 }
2273 
2274 
2275 
setText(const UnicodeString & s)2276 void RBBISentMonkey::setText(const UnicodeString &s) {
2277     fText       = &s;
2278 }
2279 
charClasses()2280 UVector  *RBBISentMonkey::charClasses() {
2281     return fSets;
2282 }
2283 
2284 
2285 //  moveBack()   Find the "significant" code point preceding the index i.
2286 //               Skips over ($Extend | $Format)* .
2287 //
moveBack(int i)2288 int RBBISentMonkey::moveBack(int i) {
2289     if (i <= 0) {
2290         return -1;
2291     }
2292     UChar32   c;
2293     int32_t   j = i;
2294     do {
2295         j = fText->moveIndex32(j, -1);
2296         c = fText->char32At(j);
2297     }
2298     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2299     return j;
2300 
2301  }
2302 
2303 
moveForward(int i)2304 int RBBISentMonkey::moveForward(int i) {
2305     if (i>=fText->length()) {
2306         return fText->length();
2307     }
2308     UChar32   c;
2309     int32_t   j = i;
2310     do {
2311         j = fText->moveIndex32(j, 1);
2312         c = cAt(j);
2313     }
2314     while (fFormatSet->contains(c) || fExtendSet->contains(c));
2315     return j;
2316 }
2317 
cAt(int pos)2318 UChar32 RBBISentMonkey::cAt(int pos) {
2319     if (pos<0 || pos>=fText->length()) {
2320         return -1;
2321     } else {
2322         return fText->char32At(pos);
2323     }
2324 }
2325 
next(int32_t prevPos)2326 int32_t RBBISentMonkey::next(int32_t prevPos) {
2327     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2328                               //   break position being tested.  The candidate break
2329                               //   location is before p2.
2330 
2331     int     breakPos = -1;
2332 
2333     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2334     UChar32 c;
2335 
2336     if (U_FAILURE(deferredStatus)) {
2337         return -1;
2338     }
2339 
2340     // Prev break at end of string.  return DONE.
2341     if (prevPos >= fText->length()) {
2342         return -1;
2343     }
2344     p0 = p1 = p2 = p3 = prevPos;
2345     c3 =  fText->char32At(prevPos);
2346     c0 = c1 = c2 = 0;
2347     (void)p0;     // Suppress set but not used warning.
2348 
2349     // Loop runs once per "significant" character position in the input text.
2350     for (;;) {
2351         // Move all of the positions forward in the input string.
2352         p0 = p1;  c0 = c1;
2353         p1 = p2;  c1 = c2;
2354         p2 = p3;  c2 = c3;
2355 
2356         // Advancd p3 by    X(Extend | Format)*   Rule 4
2357         p3 = moveForward(p3);
2358         c3 = cAt(p3);
2359 
2360         // Rule (3)  CR x LF
2361         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2362             continue;
2363         }
2364 
2365         // Rule (4).   Sep  <break>
2366         if (fSepSet->contains(c1)) {
2367             p2 = p1+1;   // Separators don't combine with Extend or Format.
2368             break;
2369         }
2370 
2371         if (p2 >= fText->length()) {
2372             // Reached end of string.  Always a break position.
2373             break;
2374         }
2375 
2376         if (p2 == prevPos) {
2377             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2378             continue;
2379         }
2380 
2381         // Rule (6).   ATerm x Numeric
2382         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
2383             continue;
2384         }
2385 
2386         // Rule (7).  (Upper | Lower) ATerm  x  Uppper
2387         if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2388                 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2389             continue;
2390         }
2391 
2392         // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
2393         //           Note:  STerm | ATerm are added to the negated part of the expression by a
2394         //                  note to the Unicode 5.0 documents.
2395         int p8 = p1;
2396         while (fSpSet->contains(cAt(p8))) {
2397             p8 = moveBack(p8);
2398         }
2399         while (fCloseSet->contains(cAt(p8))) {
2400             p8 = moveBack(p8);
2401         }
2402         if (fATermSet->contains(cAt(p8))) {
2403             p8=p2;
2404             for (;;) {
2405                 c = cAt(p8);
2406                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2407                     fLowerSet->contains(c) || fSepSet->contains(c) ||
2408                     fATermSet->contains(c) || fSTermSet->contains(c))  {
2409                     break;
2410                 }
2411                 p8 = moveForward(p8);
2412             }
2413             if (fLowerSet->contains(cAt(p8))) {
2414                 continue;
2415             }
2416         }
2417 
2418         // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2419         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2420             p8 = p1;
2421             while (fSpSet->contains(cAt(p8))) {
2422                 p8 = moveBack(p8);
2423             }
2424             while (fCloseSet->contains(cAt(p8))) {
2425                 p8 = moveBack(p8);
2426             }
2427             c = cAt(p8);
2428             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2429                 continue;
2430             }
2431         }
2432 
2433         // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
2434         int p9 = p1;
2435         while (fCloseSet->contains(cAt(p9))) {
2436             p9 = moveBack(p9);
2437         }
2438         c = cAt(p9);
2439         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2440             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2441                 continue;
2442             }
2443         }
2444 
2445         // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
2446         int p10 = p1;
2447         while (fSpSet->contains(cAt(p10))) {
2448             p10 = moveBack(p10);
2449         }
2450         while (fCloseSet->contains(cAt(p10))) {
2451             p10 = moveBack(p10);
2452         }
2453         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2454             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2455                 continue;
2456             }
2457         }
2458 
2459         // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
2460         int p11 = p1;
2461         if (fSepSet->contains(cAt(p11))) {
2462             p11 = moveBack(p11);
2463         }
2464         while (fSpSet->contains(cAt(p11))) {
2465             p11 = moveBack(p11);
2466         }
2467         while (fCloseSet->contains(cAt(p11))) {
2468             p11 = moveBack(p11);
2469         }
2470         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2471             break;
2472         }
2473 
2474         //  Rule (12)  Any x Any
2475         continue;
2476     }
2477     breakPos = p2;
2478     return breakPos;
2479 }
2480 
~RBBISentMonkey()2481 RBBISentMonkey::~RBBISentMonkey() {
2482     delete fSets;
2483     delete fSepSet;
2484     delete fFormatSet;
2485     delete fSpSet;
2486     delete fLowerSet;
2487     delete fUpperSet;
2488     delete fOLetterSet;
2489     delete fNumericSet;
2490     delete fATermSet;
2491     delete fSContinueSet;
2492     delete fSTermSet;
2493     delete fCloseSet;
2494     delete fOtherSet;
2495     delete fExtendSet;
2496 }
2497 
2498 
2499 
2500 //-------------------------------------------------------------------------------------------
2501 //
2502 //  RBBILineMonkey
2503 //
2504 //-------------------------------------------------------------------------------------------
2505 
2506 class RBBILineMonkey: public RBBIMonkeyKind {
2507 public:
2508     RBBILineMonkey();
2509     virtual          ~RBBILineMonkey();
2510     virtual  UVector *charClasses();
2511     virtual  void     setText(const UnicodeString &s);
2512     virtual  int32_t  next(int32_t i);
2513     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2514 private:
2515     UVector      *fSets;
2516 
2517     UnicodeSet  *fBK;
2518     UnicodeSet  *fCR;
2519     UnicodeSet  *fLF;
2520     UnicodeSet  *fCM;
2521     UnicodeSet  *fNL;
2522     UnicodeSet  *fSG;
2523     UnicodeSet  *fWJ;
2524     UnicodeSet  *fZW;
2525     UnicodeSet  *fGL;
2526     UnicodeSet  *fCB;
2527     UnicodeSet  *fSP;
2528     UnicodeSet  *fB2;
2529     UnicodeSet  *fBA;
2530     UnicodeSet  *fBB;
2531     UnicodeSet  *fHH;
2532     UnicodeSet  *fHY;
2533     UnicodeSet  *fH2;
2534     UnicodeSet  *fH3;
2535     UnicodeSet  *fCL;
2536     UnicodeSet  *fCP;
2537     UnicodeSet  *fEX;
2538     UnicodeSet  *fIN;
2539     UnicodeSet  *fJL;
2540     UnicodeSet  *fJV;
2541     UnicodeSet  *fJT;
2542     UnicodeSet  *fNS;
2543     UnicodeSet  *fOP;
2544     UnicodeSet  *fQU;
2545     UnicodeSet  *fIS;
2546     UnicodeSet  *fNU;
2547     UnicodeSet  *fPO;
2548     UnicodeSet  *fPR;
2549     UnicodeSet  *fSY;
2550     UnicodeSet  *fAI;
2551     UnicodeSet  *fAL;
2552     UnicodeSet  *fCJ;
2553     UnicodeSet  *fHL;
2554     UnicodeSet  *fID;
2555     UnicodeSet  *fRI;
2556     UnicodeSet  *fXX;
2557     UnicodeSet  *fEB;
2558     UnicodeSet  *fEM;
2559     UnicodeSet  *fZWJ;
2560 
2561     BreakIterator        *fCharBI;
2562     const UnicodeString  *fText;
2563     RegexMatcher         *fNumberMatcher;
2564 };
2565 
RBBILineMonkey()2566 RBBILineMonkey::RBBILineMonkey() :
2567     RBBIMonkeyKind(),
2568     fSets(NULL),
2569 
2570     fCharBI(NULL),
2571     fText(NULL),
2572     fNumberMatcher(NULL)
2573 
2574 {
2575     if (U_FAILURE(deferredStatus)) {
2576         return;
2577     }
2578 
2579     UErrorCode  status = U_ZERO_ERROR;
2580 
2581     fSets  = new UVector(status);
2582 
2583     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2584     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2585     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2586     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2587     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2588     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2589     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2590     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2591     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2592     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2593     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2594     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2595     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2596     fHH    = new UnicodeSet();
2597     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2598     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2599     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2600     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2601     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2602     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2603     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2604     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2605     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2606     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2607     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2608     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2609     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2610     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2611     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2612     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2613     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2614     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2615     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2616     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2617     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2618     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2619     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2620     fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2621     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2622     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2623     fEB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status);
2624     fEM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
2625     fZWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
2626 
2627     if (U_FAILURE(status)) {
2628         deferredStatus = status;
2629         return;
2630     }
2631 
2632     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
2633     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
2634     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
2635 
2636     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
2637     fCM->addAll(*fZWJ);    // ZWJ behaves as a CM.
2638 
2639     fHH->add(u'\u2010');   // Hyphen, '‐'
2640 
2641     fSets->addElement(fBK, status);
2642     fSets->addElement(fCR, status);
2643     fSets->addElement(fLF, status);
2644     fSets->addElement(fCM, status);
2645     fSets->addElement(fNL, status);
2646     fSets->addElement(fWJ, status);
2647     fSets->addElement(fZW, status);
2648     fSets->addElement(fGL, status);
2649     fSets->addElement(fCB, status);
2650     fSets->addElement(fSP, status);
2651     fSets->addElement(fB2, status);
2652     fSets->addElement(fBA, status);
2653     fSets->addElement(fBB, status);
2654     fSets->addElement(fHY, status);
2655     fSets->addElement(fH2, status);
2656     fSets->addElement(fH3, status);
2657     fSets->addElement(fCL, status);
2658     fSets->addElement(fCP, status);
2659     fSets->addElement(fEX, status);
2660     fSets->addElement(fIN, status);
2661     fSets->addElement(fJL, status);
2662     fSets->addElement(fJT, status);
2663     fSets->addElement(fJV, status);
2664     fSets->addElement(fNS, status);
2665     fSets->addElement(fOP, status);
2666     fSets->addElement(fQU, status);
2667     fSets->addElement(fIS, status);
2668     fSets->addElement(fNU, status);
2669     fSets->addElement(fPO, status);
2670     fSets->addElement(fPR, status);
2671     fSets->addElement(fSY, status);
2672     fSets->addElement(fAI, status);
2673     fSets->addElement(fAL, status);
2674     fSets->addElement(fHL, status);
2675     fSets->addElement(fID, status);
2676     fSets->addElement(fWJ, status);
2677     fSets->addElement(fRI, status);
2678     fSets->addElement(fSG, status);
2679     fSets->addElement(fEB, status);
2680     fSets->addElement(fEM, status);
2681     fSets->addElement(fZWJ, status);
2682 
2683 
2684     const char *rules =
2685             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
2686             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
2687             "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
2688             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
2689             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
2690             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
2691 
2692     fNumberMatcher = new RegexMatcher(
2693         UnicodeString(rules, -1, US_INV), 0, status);
2694 
2695     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2696 
2697     if (U_FAILURE(status)) {
2698         deferredStatus = status;
2699     }
2700 }
2701 
2702 
setText(const UnicodeString & s)2703 void RBBILineMonkey::setText(const UnicodeString &s) {
2704     fText       = &s;
2705     fCharBI->setText(s);
2706     fNumberMatcher->reset(s);
2707 }
2708 
2709 //
2710 //  rule9Adjust
2711 //     Line Break TR rules 9 and 10 implementation.
2712 //     This deals with combining marks and other sequences that
2713 //     that must be treated as if they were something other than what they actually are.
2714 //
2715 //     This is factored out into a separate function because it must be applied twice for
2716 //     each potential break, once to the chars before the position being checked, then
2717 //     again to the text following the possible break.
2718 //
rule9Adjust(int32_t pos,UChar32 * posChar,int32_t * nextPos,UChar32 * nextChar)2719 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2720     if (pos == -1) {
2721         // Invalid initial position.  Happens during the warmup iteration of the
2722         //   main loop in next().
2723         return;
2724     }
2725 
2726     int32_t  nPos = *nextPos;
2727 
2728     // LB 9  Keep combining sequences together.
2729     //  advance over any CM class chars.  Note that Line Break CM is different
2730     //  from the normal Grapheme Extend property.
2731     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2732           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2733         for (;;) {
2734             *nextChar = fText->char32At(nPos);
2735             if (!fCM->contains(*nextChar)) {
2736                 break;
2737             }
2738             nPos = fText->moveIndex32(nPos, 1);
2739         }
2740     }
2741 
2742 
2743     // LB 9 Treat X CM* as if it were x.
2744     //       No explicit action required.
2745 
2746     // LB 10  Treat any remaining combining mark as AL
2747     if (fCM->contains(*posChar)) {
2748         *posChar = u'A';
2749     }
2750 
2751     // Push the updated nextPos and nextChar back to our caller.
2752     // This only makes a difference if posChar got bigger by consuming a
2753     // combining sequence.
2754     *nextPos  = nPos;
2755     *nextChar = fText->char32At(nPos);
2756 }
2757 
2758 
2759 
next(int32_t startPos)2760 int32_t RBBILineMonkey::next(int32_t startPos) {
2761     UErrorCode status = U_ZERO_ERROR;
2762     int32_t    pos;       //  Index of the char following a potential break position
2763     UChar32    thisChar;  //  Character at above position "pos"
2764 
2765     int32_t    prevPos;   //  Index of the char preceding a potential break position
2766     UChar32    prevChar;  //  Character at above position.  Note that prevChar
2767                           //   and thisChar may not be adjacent because combining
2768                           //   characters between them will be ignored.
2769 
2770     int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
2771     UChar32    prevCharX2;
2772 
2773     int32_t    nextPos;   //  Index of the next character following pos.
2774                           //     Usually skips over combining marks.
2775     int32_t    nextCPPos; //  Index of the code point following "pos."
2776                           //     May point to a combining mark.
2777     int32_t    tPos;      //  temp value.
2778     UChar32    c;
2779 
2780     if (U_FAILURE(deferredStatus)) {
2781         return -1;
2782     }
2783 
2784     if (startPos >= fText->length()) {
2785         return -1;
2786     }
2787 
2788 
2789     // Initial values for loop.  Loop will run the first time without finding breaks,
2790     //                           while the invalid values shift out and the "this" and
2791     //                           "prev" positions are filled in with good values.
2792     pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
2793     thisChar = prevChar  = prevCharX2 = 0;
2794     nextPos  = nextCPPos = startPos;
2795 
2796 
2797     // Loop runs once per position in the test text, until a break position
2798     //  is found.
2799     for (;;) {
2800         prevPosX2 = prevPos;
2801         prevCharX2 = prevChar;
2802 
2803         prevPos   = pos;
2804         prevChar  = thisChar;
2805 
2806         pos       = nextPos;
2807         thisChar  = fText->char32At(pos);
2808 
2809         nextCPPos = fText->moveIndex32(pos, 1);
2810         nextPos   = nextCPPos;
2811 
2812         // Rule LB2 - Break at end of text.
2813         if (pos >= fText->length()) {
2814             break;
2815         }
2816 
2817         // Rule LB 9 - adjust for combining sequences.
2818         //             We do this one out-of-order because the adjustment does not change anything
2819         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
2820         //             be applied.
2821         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
2822         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
2823         c = fText->char32At(nextPos);
2824         rule9Adjust(pos,     &thisChar, &nextPos, &c);
2825 
2826         // If the loop is still warming up - if we haven't shifted the initial
2827         //   -1 positions out of prevPos yet - loop back to advance the
2828         //    position in the input without any further looking for breaks.
2829         if (prevPos == -1) {
2830             continue;
2831         }
2832 
2833         // LB 4  Always break after hard line breaks,
2834         if (fBK->contains(prevChar)) {
2835             break;
2836         }
2837 
2838         // LB 5  Break after CR, LF, NL, but not inside CR LF
2839         if (prevChar == 0x0d && thisChar == 0x0a) {
2840             continue;
2841         }
2842         if (prevChar == 0x0d ||
2843             prevChar == 0x0a ||
2844             prevChar == 0x85)  {
2845             break;
2846         }
2847 
2848         // LB 6  Don't break before hard line breaks
2849         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
2850             fBK->contains(thisChar)) {
2851                 continue;
2852         }
2853 
2854 
2855         // LB 7  Don't break before spaces or zero-width space.
2856         if (fSP->contains(thisChar)) {
2857             continue;
2858         }
2859 
2860         if (fZW->contains(thisChar)) {
2861             continue;
2862         }
2863 
2864         // LB 8  Break after zero width space
2865         //       ZW SP* ÷
2866         //       Scan backwards from prevChar for SP* ZW
2867         tPos = prevPos;
2868         while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
2869             tPos = fText->moveIndex32(tPos, -1);
2870         }
2871         if (fZW->contains(fText->char32At(tPos))) {
2872             break;
2873         }
2874 
2875         // LB 25    Numbers
2876         //          Move this test up, before LB8a, because numbers can match a longer sequence that would
2877         //          also match 8a.  e.g. NU ZWJ IS PO     (ZWJ acts like CM)
2878         if (fNumberMatcher->lookingAt(prevPos, status)) {
2879             if (U_FAILURE(status)) {
2880                 break;
2881             }
2882             // Matched a number.  But could have been just a single digit, which would
2883             //    not represent a "no break here" between prevChar and thisChar
2884             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
2885             if (numEndIdx > pos) {
2886                 // Number match includes at least our two chars being checked
2887                 if (numEndIdx > nextPos) {
2888                     // Number match includes additional chars.  Update pos and nextPos
2889                     //   so that next loop iteration will continue at the end of the number,
2890                     //   checking for breaks between last char in number & whatever follows.
2891                     pos = nextPos = numEndIdx;
2892                     do {
2893                         pos = fText->moveIndex32(pos, -1);
2894                         thisChar = fText->char32At(pos);
2895                     } while (fCM->contains(thisChar));
2896                 }
2897                 continue;
2898             }
2899         }
2900 
2901         // LB 8a ZWJ x
2902         //       The monkey test's way of ignoring combining characters doesn't work
2903         //       for this rule. ZJ is also a CM. Need to get the actual character
2904         //       preceding "thisChar", not ignoring combining marks, possibly ZJ.
2905         {
2906             int32_t prevIdx = fText->moveIndex32(pos, -1);
2907             UChar32 prevC = fText->char32At(prevIdx);
2908             if (fZWJ->contains(prevC)) {
2909                 continue;
2910             }
2911         }
2912 
2913         // LB 9, 10  Already done, at top of loop.
2914         //
2915 
2916 
2917         // LB 11  Do not break before or after WORD JOINER and related characters.
2918         //    x  WJ
2919         //    WJ  x
2920         //
2921         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
2922             continue;
2923         }
2924 
2925         // LB 12
2926         //    GL  x
2927         if (fGL->contains(prevChar)) {
2928             continue;
2929         }
2930 
2931         // LB 12a
2932         //    [^SP BA HY] x GL
2933         if (!(fSP->contains(prevChar) ||
2934               fBA->contains(prevChar) ||
2935               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
2936             continue;
2937         }
2938 
2939 
2940 
2941         // LB 13  Don't break before closings.
2942         //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
2943         //        fall into LB 17 and the more general number regular expression.
2944         //
2945         if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
2946             (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
2947                                          fEX->contains(thisChar)  ||
2948             (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
2949             (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
2950             continue;
2951         }
2952 
2953         // LB 14 Don't break after OP SP*
2954         //       Scan backwards, checking for this sequence.
2955         //       The OP char could include combining marks, so we actually check for
2956         //           OP CM* SP*
2957         //       Another Twist: The Rule 67 fixes may have changed a SP CM
2958         //       sequence into a ID char, so before scanning back through spaces,
2959         //       verify that prevChar is indeed a space.  The prevChar variable
2960         //       may differ from fText[prevPos]
2961         tPos = prevPos;
2962         if (fSP->contains(prevChar)) {
2963             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
2964                 tPos=fText->moveIndex32(tPos, -1);
2965             }
2966         }
2967         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
2968             tPos=fText->moveIndex32(tPos, -1);
2969         }
2970         if (fOP->contains(fText->char32At(tPos))) {
2971             continue;
2972         }
2973 
2974 
2975         // LB 15    QU SP* x OP
2976         if (fOP->contains(thisChar)) {
2977             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
2978             int tPos = prevPos;
2979             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
2980                 tPos = fText->moveIndex32(tPos, -1);
2981             }
2982             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
2983                 tPos = fText->moveIndex32(tPos, -1);
2984             }
2985             if (fQU->contains(fText->char32At(tPos))) {
2986                 continue;
2987             }
2988         }
2989 
2990 
2991 
2992         // LB 16   (CL | CP) SP* x NS
2993         //    Scan backwards for SP* CM* (CL | CP)
2994         if (fNS->contains(thisChar)) {
2995             int tPos = prevPos;
2996             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
2997                 tPos = fText->moveIndex32(tPos, -1);
2998             }
2999             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3000                 tPos = fText->moveIndex32(tPos, -1);
3001             }
3002             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3003                 continue;
3004             }
3005         }
3006 
3007 
3008         // LB 17        B2 SP* x B2
3009         if (fB2->contains(thisChar)) {
3010             //  Scan backwards, checking for the B2 CM* SP* sequence.
3011             tPos = prevPos;
3012             if (fSP->contains(prevChar)) {
3013                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3014                     tPos=fText->moveIndex32(tPos, -1);
3015                 }
3016             }
3017             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3018                 tPos=fText->moveIndex32(tPos, -1);
3019             }
3020             if (fB2->contains(fText->char32At(tPos))) {
3021                 continue;
3022             }
3023         }
3024 
3025 
3026         // LB 18    break after space
3027         if (fSP->contains(prevChar)) {
3028             break;
3029         }
3030 
3031         // LB 19
3032         //    x   QU
3033         //    QU  x
3034         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3035             continue;
3036         }
3037 
3038         // LB 20  Break around a CB
3039         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3040             break;
3041         }
3042 
3043         // LB 20.09  Don't break between Hyphens and letters if a break precedes the hyphen.
3044         //           Formerly this was a Finnish tailoring.
3045         //           Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
3046         //    ^($HY | $HH) $AL;
3047         if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
3048                 prevPosX2 == -1) {
3049             continue;
3050         }
3051 
3052         // LB 21
3053         if (fBA->contains(thisChar) ||
3054             fHY->contains(thisChar) ||
3055             fNS->contains(thisChar) ||
3056             fBB->contains(prevChar) )   {
3057             continue;
3058         }
3059 
3060         // LB 21a
3061         //   HL (HY | BA) x
3062         if (fHL->contains(prevCharX2) &&
3063                 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3064             continue;
3065         }
3066 
3067         // LB 21b
3068         //   SY x HL
3069         if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3070             continue;
3071         }
3072 
3073         // LB 22
3074         if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3075             (fEX->contains(prevChar) && fIN->contains(thisChar)) ||
3076             (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
3077             ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && fIN->contains(thisChar)) ||
3078             (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3079             (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
3080             continue;
3081         }
3082 
3083 
3084         // LB 23    (AL | HL) x NU
3085         //          NU x (AL | HL)
3086         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
3087             continue;
3088         }
3089         if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3090             continue;
3091         }
3092 
3093         // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3094         //      PR x (ID | EB | EM)
3095         //     (ID | EB | EM) x PO
3096         if (fPR->contains(prevChar) &&
3097                 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar)))  {
3098             continue;
3099         }
3100         if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
3101                 fPO->contains(thisChar)) {
3102             continue;
3103         }
3104 
3105         // LB 24  Do not break between prefix and letters or ideographs.
3106         //         (PR | PO) x (AL | HL)
3107         //         (AL | HL) x (PR | PO)
3108         if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
3109                 (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3110             continue;
3111         }
3112         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
3113                 (fPR->contains(thisChar) || fPO->contains(thisChar))) {
3114             continue;
3115         }
3116 
3117         // LB 25 numbers match, moved up, before LB 8a,
3118 
3119         // LB 26 Do not break a Korean syllable.
3120         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3121                                         fJV->contains(thisChar) ||
3122                                         fH2->contains(thisChar) ||
3123                                         fH3->contains(thisChar))) {
3124                                             continue;
3125                                         }
3126 
3127         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3128             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3129                 continue;
3130         }
3131 
3132         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3133             fJT->contains(thisChar)) {
3134                 continue;
3135         }
3136 
3137         // LB 27 Treat a Korean Syllable Block the same as ID.
3138         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3139             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3140             fIN->contains(thisChar)) {
3141                 continue;
3142             }
3143         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3144             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3145             fPO->contains(thisChar)) {
3146                 continue;
3147             }
3148         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3149             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3150                 continue;
3151             }
3152 
3153 
3154 
3155         // LB 28  Do not break between alphabetics ("at").
3156         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3157             continue;
3158         }
3159 
3160         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
3161         if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3162             continue;
3163         }
3164 
3165         // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3166         //          (AL | NU) x OP
3167         //          CP x (AL | NU)
3168         if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3169             continue;
3170         }
3171         if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3172             continue;
3173         }
3174 
3175         // LB30a    RI RI  ÷  RI
3176         //             RI  x  RI
3177         if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3178             break;
3179         }
3180         if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3181             // Two Regional Indicators have been paired.
3182             // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
3183             // following RI. This is a hack.
3184             thisChar = -1;
3185             continue;
3186         }
3187 
3188         // LB30b    Emoji Base x Emoji Modifier
3189         if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3190             continue;
3191         }
3192 
3193         // LB 31    Break everywhere else
3194         break;
3195 
3196     }
3197 
3198     return pos;
3199 }
3200 
3201 
// Return the vector of character classes held by this line-break monkey.
// The returned pointer is fSets itself, which the destructor deletes, so the
// caller must not delete it.
UVector  *RBBILineMonkey::charClasses() {
    return fSets;
}
3205 
3206 
// Destructor.  Deletes every heap object this monkey holds a pointer to:
// the fSets vector returned by charClasses(), each per-class member,
// and finally fCharBI and fNumberMatcher.
RBBILineMonkey::~RBBILineMonkey() {
    delete fSets;

    // One member per line break class; next() tests characters against these
    // with contains().  (Presumably UnicodeSet objects - declared outside this view.)
    delete fBK;
    delete fCR;
    delete fLF;
    delete fCM;
    delete fNL;
    delete fWJ;
    delete fZW;
    delete fGL;
    delete fCB;
    delete fSP;
    delete fB2;
    delete fBA;
    delete fBB;
    delete fHH;
    delete fHY;
    delete fH2;
    delete fH3;
    delete fCL;
    delete fCP;
    delete fEX;
    delete fIN;
    delete fJL;
    delete fJV;
    delete fJT;
    delete fNS;
    delete fOP;
    delete fQU;
    delete fIS;
    delete fNU;
    delete fPO;
    delete fPR;
    delete fSY;
    delete fAI;
    delete fAL;
    delete fCJ;
    delete fHL;
    delete fID;
    delete fRI;
    delete fSG;
    delete fXX;
    delete fEB;
    delete fEM;
    delete fZWJ;

    // Helper objects that are not per-class members.
    delete fCharBI;
    delete fNumberMatcher;
}
3257 
3258 
3259 //-------------------------------------------------------------------------------------------
3260 //
3261 //   TestMonkey
3262 //
3263 //     params
3264 //       seed=nnnnn        Random number starting seed.
3265 //                         Setting the seed allows errors to be reproduced.
3266 //       loop=nnn          Looping count.  Controls running time.
3267 //                         -1:  run forever.
3268 //                          0 or greater:  run length.
3269 //
3270 //       type = char | word | line | sent | title
3271 //
3272 //  Example:
3273 //     intltest  rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3274 //
3275 //-------------------------------------------------------------------------------------------
3276 
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)3277 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3278     int32_t val = defaultVal;
3279     name.append(" *= *(-?\\d+)");
3280     UErrorCode status = U_ZERO_ERROR;
3281     RegexMatcher m(name, params, 0, status);
3282     if (m.find()) {
3283         // The param exists.  Convert the string to an int.
3284         char valString[100];
3285         int32_t paramLength = m.end(1, status) - m.start(1, status);
3286         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3287             paramLength = (int32_t)(sizeof(valString)-2);
3288         }
3289         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3290         val = strtol(valString,  NULL, 10);
3291 
3292         // Delete this parameter from the params string.
3293         m.reset();
3294         params = m.replaceFirst("", status);
3295     }
3296     U_ASSERT(U_SUCCESS(status));
3297     return val;
3298 }
3299 #endif
3300 
3301 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
testBreakBoundPreceding(RBBITest * test,UnicodeString ustr,BreakIterator * bi,int expected[],int expectedcount)3302 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3303                                     BreakIterator *bi,
3304                                     int expected[],
3305                                     int expectedcount)
3306 {
3307     int count = 0;
3308     int i = 0;
3309     int forward[50];
3310     bi->setText(ustr);
3311     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3312         forward[count] = i;
3313         if (count < expectedcount && expected[count] != i) {
3314             test->errln("%s:%d break forward test failed: expected %d but got %d",
3315                         __FILE__, __LINE__, expected[count], i);
3316             break;
3317         }
3318         count ++;
3319     }
3320     if (count != expectedcount) {
3321         printStringBreaks(ustr, expected, expectedcount);
3322         test->errln("%s:%d break forward test failed: missed %d match",
3323                     __FILE__, __LINE__, expectedcount - count);
3324         return;
3325     }
3326     // testing boundaries
3327     for (i = 1; i < expectedcount; i ++) {
3328         int j = expected[i - 1];
3329         if (!bi->isBoundary(j)) {
3330             printStringBreaks(ustr, expected, expectedcount);
3331             test->errln("%s:%d isBoundary() failed.  Expected boundary at position %d",
3332                     __FILE__, __LINE__, j);
3333             return;
3334         }
3335         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3336             if (bi->isBoundary(j)) {
3337                 printStringBreaks(ustr, expected, expectedcount);
3338                 test->errln("%s:%d isBoundary() failed.  Not expecting boundary at position %d",
3339                     __FILE__, __LINE__, j);
3340                 return;
3341             }
3342         }
3343     }
3344 
3345     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3346         count --;
3347         if (forward[count] != i) {
3348             printStringBreaks(ustr, expected, expectedcount);
3349             test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
3350                         __FILE__, __LINE__, forward[count], i);
3351             break;
3352         }
3353     }
3354     if (count != 0) {
3355         printStringBreaks(ustr, expected, expectedcount);
3356         test->errln("break test previous() failed: missed a match");
3357         return;
3358     }
3359 
3360     // testing preceding
3361     for (i = 0; i < expectedcount - 1; i ++) {
3362         // int j = expected[i] + 1;
3363         int j = ustr.moveIndex32(expected[i], 1);
3364         for (; j <= expected[i + 1]; j ++) {
3365             int32_t expectedPreceding = expected[i];
3366             int32_t actualPreceding = bi->preceding(j);
3367             if (actualPreceding != expectedPreceding) {
3368                 printStringBreaks(ustr, expected, expectedcount);
3369                 test->errln("%s:%d preceding(%d): expected %d, got %d",
3370                         __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
3371                 return;
3372             }
3373         }
3374     }
3375 }
3376 #endif
3377 
TestWordBreaks(void)3378 void RBBITest::TestWordBreaks(void)
3379 {
3380 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3381 
3382     Locale        locale("en");
3383     UErrorCode    status = U_ZERO_ERROR;
3384     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3385     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3386     // Replaced any C+J characters in a row with a random sequence of characters
3387     // of the same length to make our C+J segmentation not get in the way.
3388     static const char *strlist[] =
3389     {
3390     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3391     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3392     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3393     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3394     "\\uac00\\u3588\\u009c\\u0953\\u194b",
3395     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3396     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3397     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3398     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3399     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3400     "\\u2027\\U000e0067\\u0a47\\u00b7",
3401     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3402     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3403     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3404     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3405     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3406     "\\u0027\\u11af\\U000e0057\\u0602",
3407     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3408     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3409     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3410     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3411     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3412     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3413     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3414     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3415     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3416     "\\u18f4\\U000e0049\\u20e7\\u2027",
3417     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3418     "\\ua183\\u102d\\u0bec\\u003a",
3419     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3420     "\\u003a\\u0e57\\u0fad\\u002e",
3421     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3422     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3423     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3424     "\\u003a\\u0664\\u00b7\\u1fba",
3425     "\\u003b\\u0027\\u00b7\\u47a3",
3426     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3427     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3428     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3429     };
3430     int loop;
3431     if (U_FAILURE(status)) {
3432         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3433         return;
3434     }
3435     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3436         // printf("looping %d\n", loop);
3437         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3438         // RBBICharMonkey monkey;
3439         RBBIWordMonkey monkey;
3440 
3441         int expected[50];
3442         int expectedcount = 0;
3443 
3444         monkey.setText(ustr);
3445         int i;
3446         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3447             expected[expectedcount ++] = i;
3448         }
3449 
3450         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3451     }
3452     delete bi;
3453 #endif
3454 }
3455 
TestWordBoundary(void)3456 void RBBITest::TestWordBoundary(void)
3457 {
3458     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3459     Locale        locale("en");
3460     UErrorCode    status = U_ZERO_ERROR;
3461     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3462     LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
3463     if (U_FAILURE(status)) {
3464         errcheckln(status, "%s:%d Creation of break iterator failed %s",
3465                 __FILE__, __LINE__, u_errorName(status));
3466         return;
3467     }
3468     UChar         str[50];
3469     static const char *strlist[] =
3470     {
3471     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3472     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3473     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3474     "\\u2027\\U000e0067\\u0a47\\u00b7",
3475     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3476     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3477     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3478     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3479     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3480     "\\u0027\\u11af\\U000e0057\\u0602",
3481     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3482     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3483     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3484     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3485     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3486     "\\U000e0065\\u302c\\u09ee\\U000e0068",
3487     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3488     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3489     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3490     "\\u58f4\\U000e0049\\u20e7\\u2027",
3491     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3492     "\\ua183\\u102d\\u0bec\\u003a",
3493     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3494     "\\u003a\\u0e57\\u0fad\\u002e",
3495     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3496     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3497     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3498     "\\u003a\\u0664\\u00b7\\u1fba",
3499     "\\u003b\\u0027\\u00b7\\u47a3",
3500     };
3501     int loop;
3502     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3503         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3504         UnicodeString ustr(str);
3505         int forward[50];
3506         int count = 0;
3507 
3508         bi->setText(ustr);
3509         int prev = -1;
3510         for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
3511             ++count;
3512             if (count >= UPRV_LENGTHOF(forward)) {
3513                 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
3514                         __FILE__, __LINE__, loop, count, boundary);
3515                 return;
3516             }
3517             forward[count] = boundary;
3518             if (boundary <= prev) {
3519                 errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
3520                         __FILE__, __LINE__, loop, prev, boundary);
3521                 break;
3522             }
3523             for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
3524                 if (bi->isBoundary(nonBoundary)) {
3525                     printStringBreaks(ustr, forward, count);
3526                     errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
3527                            __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
3528                     return;
3529                 }
3530             }
3531             if (!bi->isBoundary(boundary)) {
3532                 printStringBreaks(ustr, forward, count);
3533                 errln("%s:%d happy boundary test failed: expected %d a boundary",
3534                        __FILE__, __LINE__, boundary);
3535                 return;
3536             }
3537             prev = boundary;
3538         }
3539     }
3540 }
3541 
TestLineBreaks(void)3542 void RBBITest::TestLineBreaks(void)
3543 {
3544 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3545     Locale        locale("en");
3546     UErrorCode    status = U_ZERO_ERROR;
3547     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3548     const int32_t  STRSIZE = 50;
3549     UChar         str[STRSIZE];
3550     static const char *strlist[] =
3551     {
3552      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3553      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3554              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3555      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3556              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3557      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3558      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3559      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3560      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3561      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3562      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3563      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3564      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3565      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3566      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3567      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3568      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3569      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3570      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3571      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3572      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3573      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3574      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3575      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3576      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3577      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3578      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3579      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3580      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3581      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3582      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3583      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3584      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3585      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3586      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3587      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3588      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3589      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3590      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3591          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3592     };
3593     int loop;
3594     TEST_ASSERT_SUCCESS(status);
3595     if (U_FAILURE(status)) {
3596         return;
3597     }
3598     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3599         // printf("looping %d\n", loop);
3600         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3601         if (t >= STRSIZE) {
3602             TEST_ASSERT(FALSE);
3603             continue;
3604         }
3605 
3606 
3607         UnicodeString ustr(str);
3608         RBBILineMonkey monkey;
3609         if (U_FAILURE(monkey.deferredStatus)) {
3610             continue;
3611         }
3612 
3613         const int EXPECTEDSIZE = 50;
3614         int expected[EXPECTEDSIZE];
3615         int expectedcount = 0;
3616 
3617         monkey.setText(ustr);
3618         int i;
3619         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3620             if (expectedcount >= EXPECTEDSIZE) {
3621                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3622                 return;
3623             }
3624             expected[expectedcount ++] = i;
3625         }
3626 
3627         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3628     }
3629     delete bi;
3630 #endif
3631 }
3632 
TestSentBreaks(void)3633 void RBBITest::TestSentBreaks(void)
3634 {
3635 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3636     Locale        locale("en");
3637     UErrorCode    status = U_ZERO_ERROR;
3638     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3639     UChar         str[200];
3640     static const char *strlist[] =
3641     {
3642      "Now\ris\nthe\r\ntime\n\rfor\r\r",
3643      "This\n",
3644      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3645      "\"Sentence ending with a quote.\" Bye.",
3646      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
3647      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3648      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3649      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3650      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3651      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3652      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3653              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3654              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3655              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3656      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3657              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3658              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3659              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3660              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3661              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3662     };
3663     int loop;
3664     if (U_FAILURE(status)) {
3665         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3666         return;
3667     }
3668     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3669         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3670         UnicodeString ustr(str);
3671 
3672         RBBISentMonkey monkey;
3673         if (U_FAILURE(monkey.deferredStatus)) {
3674             continue;
3675         }
3676 
3677         const int EXPECTEDSIZE = 50;
3678         int expected[EXPECTEDSIZE];
3679         int expectedcount = 0;
3680 
3681         monkey.setText(ustr);
3682         int i;
3683         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3684             if (expectedcount >= EXPECTEDSIZE) {
3685                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3686                 return;
3687             }
3688             expected[expectedcount ++] = i;
3689         }
3690 
3691         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3692     }
3693     delete bi;
3694 #endif
3695 }
3696 
TestMonkey()3697 void RBBITest::TestMonkey() {
3698 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3699 
3700     UErrorCode     status    = U_ZERO_ERROR;
3701     int32_t        loopCount = 500;
3702     int32_t        seed      = 1;
3703     UnicodeString  breakType = "all";
3704     Locale         locale("en");
3705     UBool          useUText  = FALSE;
3706 
3707     if (quick == FALSE) {
3708         loopCount = 10000;
3709     }
3710 
3711     if (fTestParams) {
3712         UnicodeString p(fTestParams);
3713         loopCount = getIntParam("loop", p, loopCount);
3714         seed      = getIntParam("seed", p, seed);
3715 
3716         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3717         if (m.find()) {
3718             breakType = m.group(1, status);
3719             m.reset();
3720             p = m.replaceFirst("", status);
3721         }
3722 
3723         RegexMatcher u(" *utext", p, 0, status);
3724         if (u.find()) {
3725             useUText = TRUE;
3726             u.reset();
3727             p = u.replaceFirst("", status);
3728         }
3729 
3730 
3731         // m.reset(p);
3732         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3733             // Each option is stripped out of the option string as it is processed.
3734             // All options have been checked.  The option string should have been completely emptied..
3735             char buf[100];
3736             p.extract(buf, sizeof(buf), NULL, status);
3737             buf[sizeof(buf)-1] = 0;
3738             errln("Unrecognized or extra parameter:  %s\n", buf);
3739             return;
3740         }
3741 
3742     }
3743 
3744     if (breakType == "char" || breakType == "all") {
3745         RBBICharMonkey  m;
3746         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3747         if (U_SUCCESS(status)) {
3748             RunMonkey(bi, m, "char", seed, loopCount, useUText);
3749             if (breakType == "all" && useUText==FALSE) {
3750                 // Also run a quick test with UText when "all" is specified
3751                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3752             }
3753         }
3754         else {
3755             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3756         }
3757         delete bi;
3758     }
3759 
3760     if (breakType == "word" || breakType == "all") {
3761         logln("Word Break Monkey Test");
3762         RBBIWordMonkey  m;
3763         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
3764         if (U_SUCCESS(status)) {
3765             RunMonkey(bi, m, "word", seed, loopCount, useUText);
3766         }
3767         else {
3768             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3769         }
3770         delete bi;
3771     }
3772 
3773     if (breakType == "line" || breakType == "all") {
3774         logln("Line Break Monkey Test");
3775         RBBILineMonkey  m;
3776         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
3777         if (loopCount >= 10) {
3778             loopCount = loopCount / 5;   // Line break runs slower than the others.
3779         }
3780         if (U_SUCCESS(status)) {
3781             RunMonkey(bi, m, "line", seed, loopCount, useUText);
3782         }
3783         else {
3784             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3785         }
3786         delete bi;
3787     }
3788 
3789     if (breakType == "sent" || breakType == "all"  ) {
3790         logln("Sentence Break Monkey Test");
3791         RBBISentMonkey  m;
3792         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
3793         if (loopCount >= 10) {
3794             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
3795         }
3796         if (U_SUCCESS(status)) {
3797             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
3798         }
3799         else {
3800             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3801         }
3802         delete bi;
3803     }
3804 
3805 #endif
3806 }
3807 
3808 //
3809 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
3810 //    Parameters:
3811 //       bi      - the break iterator to use
3812 //       mk      - MonkeyKind, abstraction for obtaining expected results
3813 //       name    - Name of test (char, word, etc.) for use in error messages
3814 //       seed    - Seed for starting random number generator (parameter from user)
3815 //       numIterations
3816 //
RunMonkey(BreakIterator * bi,RBBIMonkeyKind & mk,const char * name,uint32_t seed,int32_t numIterations,UBool useUText)3817 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
3818                          int32_t numIterations, UBool useUText) {
3819 
3820 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3821 
3822     const int32_t    TESTSTRINGLEN = 500;
3823     UnicodeString    testText;
3824     int32_t          numCharClasses;
3825     UVector          *chClasses;
3826     int              expected[TESTSTRINGLEN*2 + 1];
3827     int              expectedCount = 0;
3828     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
3829     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
3830     char             reverseBreaks[TESTSTRINGLEN*2+1];
3831     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
3832     char             followingBreaks[TESTSTRINGLEN*2+1];
3833     char             precedingBreaks[TESTSTRINGLEN*2+1];
3834     int              i;
3835     int              loopCount = 0;
3836 
3837     m_seed = seed;
3838 
3839     numCharClasses = mk.charClasses()->size();
3840     chClasses      = mk.charClasses();
3841 
3842     // Check for errors that occured during the construction of the MonkeyKind object.
3843     //  Can't report them where they occured because errln() is a method coming from intlTest,
3844     //  and is not visible outside of RBBITest :-(
3845     if (U_FAILURE(mk.deferredStatus)) {
3846         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
3847         return;
3848     }
3849 
3850     // Verify that the character classes all have at least one member.
3851     for (i=0; i<numCharClasses; i++) {
3852         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
3853         if (s == NULL || s->size() == 0) {
3854             errln("Character Class #%d is null or of zero size.", i);
3855             return;
3856         }
3857     }
3858 
3859     while (loopCount < numIterations || numIterations == -1) {
3860         if (numIterations == -1 && loopCount % 10 == 0) {
3861             // If test is running in an infinite loop, display a periodic tic so
3862             //   we can tell that it is making progress.
3863             fprintf(stderr, ".");
3864         }
3865         // Save current random number seed, so that we can recreate the random numbers
3866         //   for this loop iteration in event of an error.
3867         seed = m_seed;
3868 
3869         // Populate a test string with data.
3870         testText.truncate(0);
3871         for (i=0; i<TESTSTRINGLEN; i++) {
3872             int32_t  aClassNum = m_rand() % numCharClasses;
3873             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
3874             int32_t   charIdx = m_rand() % classSet->size();
3875             UChar32   c = classSet->charAt(charIdx);
3876             if (c < 0) {   // TODO:  deal with sets containing strings.
3877                 errln("%s:%d c < 0", __FILE__, __LINE__);
3878                 break;
3879             }
3880             // Do not assemble a supplementary character from randomly generated separate surrogates.
3881             //   (It could be a dictionary character)
3882             if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
3883                 continue;
3884             }
3885 
3886             testText.append(c);
3887         }
3888 
3889         // Calculate the expected results for this test string.
3890         mk.setText(testText);
3891         memset(expectedBreaks, 0, sizeof(expectedBreaks));
3892         expectedBreaks[0] = 1;
3893         int32_t breakPos = 0;
3894         expectedCount = 0;
3895         for (;;) {
3896             breakPos = mk.next(breakPos);
3897             if (breakPos == -1) {
3898                 break;
3899             }
3900             if (breakPos > testText.length()) {
3901                 errln("breakPos > testText.length()");
3902             }
3903             expectedBreaks[breakPos] = 1;
3904             U_ASSERT(expectedCount<testText.length());
3905             expected[expectedCount ++] = breakPos;
3906             (void)expected;   // Set but not used warning.
3907                               // TODO (andy): check it out.
3908         }
3909 
3910         // Find the break positions using forward iteration
3911         memset(forwardBreaks, 0, sizeof(forwardBreaks));
3912         if (useUText) {
3913             UErrorCode status = U_ZERO_ERROR;
3914             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
3915             // testUText = utext_openUnicodeString(testUText, &testText, &status);
3916             bi->setText(testUText, status);
3917             TEST_ASSERT_SUCCESS(status);
3918             utext_close(testUText);   // The break iterator does a shallow clone of the UText
3919                                       //  This UText can be closed immediately, so long as the
3920                                       //  testText string continues to exist.
3921         } else {
3922             bi->setText(testText);
3923         }
3924 
3925         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
3926             if (i < 0 || i > testText.length()) {
3927                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
3928                 break;
3929             }
3930             forwardBreaks[i] = 1;
3931         }
3932 
3933         // Find the break positions using reverse iteration
3934         memset(reverseBreaks, 0, sizeof(reverseBreaks));
3935         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
3936             if (i < 0 || i > testText.length()) {
3937                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
3938                 break;
3939             }
3940             reverseBreaks[i] = 1;
3941         }
3942 
3943         // Find the break positions using isBoundary() tests.
3944         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
3945         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
3946         for (i=0; i<=testText.length(); i++) {
3947             isBoundaryBreaks[i] = bi->isBoundary(i);
3948         }
3949 
3950 
3951         // Find the break positions using the following() function.
3952         // printf(".");
3953         memset(followingBreaks, 0, sizeof(followingBreaks));
3954         int32_t   lastBreakPos = 0;
3955         followingBreaks[0] = 1;
3956         for (i=0; i<testText.length(); i++) {
3957             breakPos = bi->following(i);
3958             if (breakPos <= i ||
3959                 breakPos < lastBreakPos ||
3960                 breakPos > testText.length() ||
3961                 (breakPos > lastBreakPos && lastBreakPos > i)) {
3962                 errln("%s break monkey test: "
3963                     "Out of range value returned by BreakIterator::following().\n"
3964                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
3965                          name, seed, i, breakPos, lastBreakPos);
3966                 break;
3967             }
3968             followingBreaks[breakPos] = 1;
3969             lastBreakPos = breakPos;
3970         }
3971 
3972         // Find the break positions using the preceding() function.
3973         memset(precedingBreaks, 0, sizeof(precedingBreaks));
3974         lastBreakPos = testText.length();
3975         precedingBreaks[testText.length()] = 1;
3976         for (i=testText.length(); i>0; i--) {
3977             breakPos = bi->preceding(i);
3978             if (breakPos >= i ||
3979                 breakPos > lastBreakPos ||
3980                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
3981                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
3982                 errln("%s break monkey test: "
3983                     "Out of range value returned by BreakIterator::preceding().\n"
3984                     "index=%d;  prev returned %d; lastBreak=%d" ,
3985                     name,  i, breakPos, lastBreakPos);
3986                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
3987                     precedingBreaks[i] = 2;   // Forces an error.
3988                 }
3989             } else {
3990                 if (breakPos >= 0) {
3991                     precedingBreaks[breakPos] = 1;
3992                 }
3993                 lastBreakPos = breakPos;
3994             }
3995         }
3996 
3997         // Compare the expected and actual results.
3998         for (i=0; i<=testText.length(); i++) {
3999             const char *errorType = NULL;
4000             if  (forwardBreaks[i] != expectedBreaks[i]) {
4001                 errorType = "next()";
4002             } else if (reverseBreaks[i] != forwardBreaks[i]) {
4003                 errorType = "previous()";
4004             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4005                 errorType = "isBoundary()";
4006             } else if (followingBreaks[i] != expectedBreaks[i]) {
4007                 errorType = "following()";
4008             } else if (precedingBreaks[i] != expectedBreaks[i]) {
4009                 errorType = "preceding()";
4010             }
4011 
4012 
4013             if (errorType != NULL) {
4014                 // Format a range of the test text that includes the failure as
4015                 //  a data item that can be included in the rbbi test data file.
4016 
4017                 // Start of the range is the last point where expected and actual results
4018                 //   both agreed that there was a break position.
4019                 int startContext = i;
4020                 int32_t count = 0;
4021                 for (;;) {
4022                     if (startContext==0) { break; }
4023                     startContext --;
4024                     if (expectedBreaks[startContext] != 0) {
4025                         if (count == 2) break;
4026                         count ++;
4027                     }
4028                 }
4029 
4030                 // End of range is two expected breaks past the start position.
4031                 int endContext = i + 1;
4032                 int ci;
4033                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4034                     for (;;) {
4035                         if (endContext >= testText.length()) {break;}
4036                         if (expectedBreaks[endContext-1] != 0) {
4037                             if (count == 0) break;
4038                             count --;
4039                         }
4040                         endContext ++;
4041                     }
4042                 }
4043 
4044                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4045                 UnicodeString errorText = "<data>";
4046                 /***if (strcmp(errorType, "next()") == 0) {
4047                     startContext = 0;
4048                     endContext = testText.length();
4049 
4050                     printStringBreaks(testText, expected, expectedCount);
4051                 }***/
4052 
4053                 for (ci=startContext; ci<endContext;) {
4054                     UnicodeString hexChars("0123456789abcdef");
4055                     UChar32  c;
4056                     int      bn;
4057                     c = testText.char32At(ci);
4058                     if (ci == i) {
4059                         // This is the location of the error.
4060                         errorText.append("<?>");
4061                     } else if (expectedBreaks[ci] != 0) {
4062                         // This a non-error expected break position.
4063                         errorText.append("\\");
4064                     }
4065                     if (c < 0x10000) {
4066                         errorText.append("\\u");
4067                         for (bn=12; bn>=0; bn-=4) {
4068                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4069                         }
4070                     } else {
4071                         errorText.append("\\U");
4072                         for (bn=28; bn>=0; bn-=4) {
4073                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4074                         }
4075                     }
4076                     ci = testText.moveIndex32(ci, 1);
4077                 }
4078                 errorText.append("\\");
4079                 errorText.append("</data>\n");
4080 
4081                 // Output the error
4082                 char  charErrorTxt[500];
4083                 UErrorCode status = U_ZERO_ERROR;
4084                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4085                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4086                 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4087 
4088                 errln("%s break monkey test error [%s].  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
4089                     name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4090                     errorType, seed, i, charErrorTxt);
4091                 break;
4092             }
4093         }
4094 
4095         loopCount++;
4096     }
4097 #endif
4098 }
4099 
4100 
4101 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
4102 //             This test checks the initial patch,
4103 //             which is to just keep it from crashing.  Correct word boundaries
4104 //             await a proper fix to the dictionary code.
4105 //
TestBug5532(void)4106 void RBBITest::TestBug5532(void)  {
4107    // Text includes a mixture of Thai and Latin.
4108    const unsigned char utf8Data[] = {
4109            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4110            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4111            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4112            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4113            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4114            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4115            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4116            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4117            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4118            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4119            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4120 
4121     UErrorCode status = U_ZERO_ERROR;
4122     UText utext=UTEXT_INITIALIZER;
4123     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4124     TEST_ASSERT_SUCCESS(status);
4125 
4126     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4127     TEST_ASSERT_SUCCESS(status);
4128     if (U_SUCCESS(status)) {
4129         bi->setText(&utext, status);
4130         TEST_ASSERT_SUCCESS(status);
4131 
4132         int32_t breakCount = 0;
4133         int32_t previousBreak = -1;
4134         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4135             // For now, just make sure that the break iterator doesn't hang.
4136             TEST_ASSERT(previousBreak < bi->current());
4137             previousBreak = bi->current();
4138         }
4139         TEST_ASSERT(breakCount > 0);
4140     }
4141     delete bi;
4142     utext_close(&utext);
4143 }
4144 
4145 
TestBug9983(void)4146 void RBBITest::TestBug9983(void)  {
4147     UnicodeString text = UnicodeString("\\u002A"  // * Other
4148                                        "\\uFF65"  //   Other
4149                                        "\\u309C"  //   Katakana
4150                                        "\\uFF9F"  //   Extend
4151                                        "\\uFF65"  //   Other
4152                                        "\\u0020"  //   Other
4153                                        "\\u0000").unescape();
4154 
4155     UErrorCode status = U_ZERO_ERROR;
4156     LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4157         BreakIterator::createWordInstance(Locale::getRoot(), status)));
4158     TEST_ASSERT_SUCCESS(status);
4159     LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4160         BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4161     TEST_ASSERT_SUCCESS(status);
4162     if (U_FAILURE(status)) {
4163         return;
4164     }
4165     int32_t offset, rstatus, iterationCount;
4166 
4167     brkiter->setText(text);
4168     brkiter->last();
4169     iterationCount = 0;
4170     while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4171         iterationCount++;
4172         rstatus = brkiter->getRuleStatus();
4173         (void)rstatus;     // Suppress set but not used warning.
4174         if (iterationCount >= 10) {
4175            break;
4176         }
4177     }
4178     TEST_ASSERT(iterationCount == 6);
4179 
4180     brkiterPOSIX->setText(text);
4181     brkiterPOSIX->last();
4182     iterationCount = 0;
4183     while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4184         iterationCount++;
4185         rstatus = brkiterPOSIX->getRuleStatus();
4186         (void)rstatus;     // Suppress set but not used warning.
4187         if (iterationCount >= 10) {
4188            break;
4189         }
4190     }
4191     TEST_ASSERT(iterationCount == 6);
4192 }
4193 
// Bug 7547 - verify that building a break iterator from empty rules produces an error.
4195 //
TestBug7547()4196 void RBBITest::TestBug7547() {
4197     UnicodeString rules;
4198     UErrorCode status = U_ZERO_ERROR;
4199     UParseError parseError;
4200     RuleBasedBreakIterator breakIterator(rules, parseError, status);
4201     if (status != U_BRK_RULE_SYNTAX) {
4202         errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4203     }
4204     if (parseError.line != 1 || parseError.offset != 0) {
4205         errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4206     }
4207 }
4208 
4209 
TestBug12797()4210 void RBBITest::TestBug12797() {
4211     UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4212     UErrorCode status = U_ZERO_ERROR;
4213     UParseError parseError;
4214     RuleBasedBreakIterator bi(rules, parseError, status);
4215     if (U_FAILURE(status)) {
4216         errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4217         return;
4218     }
4219     UnicodeString text = "abc";
4220     bi.setText(text);
4221     bi.first();
4222     int32_t boundary = bi.next();
4223     if (boundary != 3) {
4224         errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4225     }
4226 }
4227 
TestBug12918()4228 void RBBITest::TestBug12918() {
4229     // This test triggers an assertion failure in dictbe.cpp
4230     const UChar *crasherString = u"\u3325\u4a16";
4231     UErrorCode status = U_ZERO_ERROR;
4232     UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
4233     if (U_FAILURE(status)) {
4234         dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4235         return;
4236     }
4237     ubrk_first(iter);
4238     int32_t pos = 0;
4239     int32_t lastPos = -1;
4240     while((pos = ubrk_next(iter)) != UBRK_DONE) {
4241         if (pos <= lastPos) {
4242             errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4243             break;
4244         }
4245     }
4246     ubrk_close(iter);
4247 }
4248 
TestBug12932()4249 void RBBITest::TestBug12932() {
4250     // Node Stack overflow in the RBBI rule parser caused a seg fault.
4251     UnicodeString ruleStr(
4252             "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4253             "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4254             "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4255             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4256             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4257             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4258 
4259     UErrorCode status = U_ZERO_ERROR;
4260     UParseError parseError;
4261     RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
4262     if (status != U_BRK_RULE_SYNTAX) {
4263         errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4264                 __FILE__, __LINE__, u_errorName(status));
4265     }
4266 }
4267 
4268 
// Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
//             remain undivided by ICU char, word and line break.
TestEmoji()4271 void RBBITest::TestEmoji() {
4272 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4273     UErrorCode  status = U_ZERO_ERROR;
4274 
4275     CharString testFileName;
4276     testFileName.append(IntlTest::getSourceTestData(status), status);
4277     testFileName.appendPathPart("emoji-test.txt", status);
4278     if (U_FAILURE(status)) {
4279         errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4280         return;
4281     }
4282     logln("Opening data file %s\n", testFileName.data());
4283 
4284     int    len;
4285     UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
4286     if (U_FAILURE(status) || testFile == NULL) {
4287         errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4288         return;
4289     }
4290     UnicodeString testFileAsString(testFile, len);
4291     delete [] testFile;
4292 
4293     RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
4294     RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
4295     //           hexMatcher group(1) is a hex number, or empty string if no hex number present.
4296     int32_t lineNumber = 0;
4297 
4298     LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4299     LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4300     LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
4301     if (U_FAILURE(status)) {
4302         dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
4303         return;
4304     }
4305 
4306     while (lineMatcher.find()) {
4307         ++lineNumber;
4308         UnicodeString line = lineMatcher.group(status);
4309         hexMatcher.reset(line);
4310         UnicodeString testString;   // accumulates the emoji sequence.
4311         while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
4312             UnicodeString hex = hexMatcher.group(1, status);
4313             if (hex.length() > 8) {
4314                 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
4315                 break;
4316             }
4317             CharString hex8;
4318             hex8.appendInvariantChars(hex, status);
4319             UChar32 c = (UChar32)strtol(hex8.data(), NULL, 16);
4320             if (c<=0x10ffff) {
4321                 testString.append(c);
4322             } else {
4323                 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4324                         __FILE__, __LINE__, lineNumber, hex8.data());
4325                 break;
4326             }
4327         }
4328 
4329         if (testString.length() > 1) {
4330             charBreaks->setText(testString);
4331             charBreaks->first();
4332             int32_t firstBreak = charBreaks->next();
4333             if (testString.length() != firstBreak) {
4334                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4335                         __FILE__, __LINE__, lineNumber, firstBreak);
4336             }
4337             wordBreaks->setText(testString);
4338             wordBreaks->first();
4339             firstBreak = wordBreaks->next();
4340             if (testString.length() != firstBreak) {
4341                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4342                         __FILE__, __LINE__, lineNumber, firstBreak);
4343             }
4344             lineBreaks->setText(testString);
4345             lineBreaks->first();
4346             firstBreak = lineBreaks->next();
4347             if (testString.length() != firstBreak) {
4348                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4349                         __FILE__, __LINE__, lineNumber, firstBreak);
4350             }
4351         }
4352     }
4353 #endif
4354 }
4355 
4356 
4357 // TestBug12519  -  Correct handling of Locales by assignment / copy / clone
4358 
4359 // WHERE Macro yields a literal string of the form "source_file_name:line number "
4360 // TODO: propose something equivalent as a test framework addition.
4361 
// LINE stringifies its argument as written; XLINE expands its argument first, so
// XLINE(__LINE__) yields the current line number as a string literal.
#define LINE(s) #s
#define XLINE(s) LINE(s)
// Adjacent string literals concatenate into "file:line ".
#define WHERE __FILE__ ":" XLINE(__LINE__) " "
4365 
TestBug12519()4366 void RBBITest::TestBug12519() {
4367     UErrorCode status = U_ZERO_ERROR;
4368     LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4369     LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status));
4370     if (!assertSuccess(WHERE, status)) {
4371         dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
4372         return;
4373     }
4374     assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
4375 
4376     assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
4377     assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
4378 
4379     LocalPointer<RuleBasedBreakIterator>cloneEn((RuleBasedBreakIterator *)biEn->clone());
4380     assertTrue(WHERE, *biEn == *cloneEn);
4381     assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
4382 
4383     LocalPointer<RuleBasedBreakIterator>cloneFr((RuleBasedBreakIterator *)biFr->clone());
4384     assertTrue(WHERE, *biFr == *cloneFr);
4385     assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
4386 
4387     LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status));
4388     UnicodeString text("Hallo Welt");
4389     biDe->setText(text);
4390     assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
4391     *biDe = *biFr;
4392     assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
4393 }
4394 
TestBug12677()4395 void RBBITest::TestBug12677() {
4396     // Check that stripping of comments from rules for getRules() is not confused by
4397     // the presence of '#' characters in the rules that do not introduce comments.
4398     UnicodeString rules(u"!!forward; \n"
4399                          "$x = [ab#];  # a set with a # literal. \n"
4400                          " # .;        # a comment that looks sort of like a rule.   \n"
4401                          " '#' '?';    # a rule with a quoted #   \n"
4402                        );
4403 
4404     UErrorCode status = U_ZERO_ERROR;
4405     UParseError pe;
4406     RuleBasedBreakIterator bi(rules, pe, status);
4407     assertSuccess(WHERE, status);
4408     UnicodeString rtRules = bi.getRules();
4409     assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "),  rtRules);
4410 }
4411 
4412 
TestTableRedundancies()4413 void RBBITest::TestTableRedundancies() {
4414     UErrorCode status = U_ZERO_ERROR;
4415 
4416     LocalPointer<RuleBasedBreakIterator> bi (
4417         (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
4418     assertSuccess(WHERE, status);
4419     if (U_FAILURE(status)) return;
4420 
4421     RBBIDataWrapper *dw = bi->fData;
4422     const RBBIStateTable *fwtbl = dw->fForwardTable;
4423     int32_t numCharClasses = dw->fHeader->fCatCount;
4424     // printf("Char Classes: %d     states: %d\n", numCharClasses, fwtbl->fNumStates);
4425 
4426     // Check for duplicate columns (character categories)
4427 
4428     std::vector<UnicodeString> columns;
4429     for (int32_t column = 0; column < numCharClasses; column++) {
4430         UnicodeString s;
4431         for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
4432             RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4433             s.append(row->fNextState[column]);
4434         }
4435         columns.push_back(s);
4436     }
4437     // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
4438     for (int c1=1; c1<numCharClasses; c1++) {
4439         for (int c2 = c1+1; c2 < numCharClasses; c2++) {
4440             if (columns.at(c1) == columns.at(c2)) {
4441                 errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
4442                 goto out;
4443             }
4444         }
4445     }
4446   out:
4447 
4448     // Check for duplicate states
4449     std::vector<UnicodeString> rows;
4450     for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
4451         UnicodeString s;
4452         RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4453         assertTrue(WHERE, row->fAccepting >= -1);
4454         s.append(row->fAccepting + 1);   // values of -1 are expected.
4455         s.append(row->fLookAhead);
4456         s.append(row->fTagIdx);
4457         for (int32_t column = 0; column < numCharClasses; column++) {
4458             s.append(row->fNextState[column]);
4459         }
4460         rows.push_back(s);
4461     }
4462     for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
4463         for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
4464             if (rows.at(r1) == rows.at(r2)) {
4465                 errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
4466                 return;
4467             }
4468         }
4469     }
4470 }
4471 
4472 // Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
4473 //            even after next() has returned DONE.
4474 
TestBug13447()4475 void RBBITest::TestBug13447() {
4476     UErrorCode status = U_ZERO_ERROR;
4477     LocalPointer<RuleBasedBreakIterator> bi(
4478         (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4479     assertSuccess(WHERE, status);
4480     if (U_FAILURE(status)) return;
4481     UnicodeString data(u"1234");
4482     bi->setText(data);
4483     assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus());
4484     assertEquals(WHERE, 4, bi->next());
4485     assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4486     assertEquals(WHERE, UBRK_DONE, bi->next());
4487     assertEquals(WHERE, 4, bi->current());
4488     assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4489 }
4490 
4491 //  TestReverse exercises both the synthesized safe reverse rules and the logic
4492 //  for filling the break iterator cache when starting from random positions
4493 //  in the text.
4494 //
4495 //  It's a monkey test, working on random data, with the expected data obtained
4496 //  from forward iteration (no safe rules involved), comparing with results
4497 //  when indexing into the interior of the string (safe rules needed).
4498 
TestReverse()4499 void RBBITest::TestReverse() {
4500     UErrorCode status = U_ZERO_ERROR;
4501 
4502     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4503             BreakIterator::createCharacterInstance(Locale::getEnglish(), status)));
4504     assertSuccess(WHERE, status, true);
4505     status = U_ZERO_ERROR;
4506     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4507             BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4508     assertSuccess(WHERE, status, true);
4509     status = U_ZERO_ERROR;
4510     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4511             BreakIterator::createLineInstance(Locale::getEnglish(), status)));
4512     assertSuccess(WHERE, status, true);
4513     status = U_ZERO_ERROR;
4514     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4515             BreakIterator::createSentenceInstance(Locale::getEnglish(), status)));
4516     assertSuccess(WHERE, status, true);
4517 }
4518 
TestReverse(std::unique_ptr<RuleBasedBreakIterator> bi)4519 void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
4520     if (!bi) {
4521         return;
4522     }
4523 
4524     // From the mapping trie in the break iterator's internal data, create a
4525     // vector of UnicodeStrings, one for each character category, containing
4526     // all of the code points that map to that category. Unicode planes 0 and 1 only,
4527     // to avoid an execess of unassigned code points.
4528 
4529     RBBIDataWrapper *data = bi->fData;
4530     int32_t categoryCount = data->fHeader->fCatCount;
4531     UTrie2  *trie = data->fTrie;
4532 
4533     std::vector<UnicodeString> strings(categoryCount, UnicodeString());
4534     for (int cp=0; cp<0x1fff0; ++cp) {
4535         int cat = utrie2_get32(trie, cp);
4536         cat &= ~0x4000;    // And off the dictionary bit from the category.
4537         assertTrue(WHERE, cat < categoryCount && cat >= 0);
4538         if (cat < 0 || cat >= categoryCount) return;
4539         strings[cat].append(cp);
4540     }
4541 
4542     icu_rand randomGen;
4543     const int testStringLength = 10000;
4544     UnicodeString testString;
4545 
4546     for (int i=0; i<testStringLength; ++i) {
4547         int charClass = randomGen() % categoryCount;
4548         if (strings[charClass].length() > 0) {
4549             int cp = strings[charClass].char32At(randomGen() % strings[charClass].length());
4550             testString.append(cp);
4551         }
4552     }
4553 
4554     typedef std::pair<UBool, int32_t> Result;
4555     std::vector<Result> expectedResults;
4556     bi->setText(testString);
4557     for (int i=0; i<testString.length(); ++i) {
4558         bool isboundary = bi->isBoundary(i);
4559         int  ruleStatus = bi->getRuleStatus();
4560         expectedResults.push_back(std::make_pair(isboundary, ruleStatus));
4561     }
4562 
4563     for (int i=testString.length()-1; i>=0; --i) {
4564         bi->setText(testString);   // clears the internal break cache
4565         Result expected = expectedResults[i];
4566         assertEquals(WHERE, expected.first, bi->isBoundary(i));
4567         assertEquals(WHERE, expected.second, bi->getRuleStatus());
4568     }
4569 }
4570 
4571 
// Ticket 13692 - finding word boundaries in very large numbers or words could
//                be very time consuming. When the problem was present, this test
//                would run for more than fifteen minutes, which is to say, the failure was noticeable.
4575 
TestBug13692()4576 void RBBITest::TestBug13692() {
4577     UErrorCode status = U_ZERO_ERROR;
4578     LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4579             BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4580     if (!assertSuccess(WHERE, status, true)) {
4581         return;
4582     }
4583     constexpr int32_t LENGTH = 1000000;
4584     UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH);
4585     for (int i=0; i<20; i+=2) {
4586         longNumber.setCharAt(i, u' ');
4587     }
4588     bi->setText(longNumber);
4589     assertFalse(WHERE, bi->isBoundary(LENGTH-5));
4590     assertSuccess(WHERE, status);
4591 }
4592 
4593 //
4594 //  TestDebug    -  A place-holder test for debugging purposes.
4595 //                  For putting in fragments of other tests that can be invoked
4596 //                  for tracing  without a lot of unwanted extra stuff happening.
4597 //
TestDebug(void)4598 void RBBITest::TestDebug(void) {
4599     UErrorCode status = U_ZERO_ERROR;
4600     LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4601             BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4602     if (!assertSuccess(WHERE, status, true)) {
4603         return;
4604     }
4605     const UnicodeString &rules = bi->getRules();
4606     UParseError pe;
4607     LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
4608     assertSuccess(WHERE, status);
4609 }
4610 
TestProperties()4611 void RBBITest::TestProperties() {
4612     UErrorCode errorCode = U_ZERO_ERROR;
4613     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4614     if (!prependSet.isEmpty()) {
4615         errln(
4616             "[:GCB=Prepend:] is not empty any more. "
4617             "Uncomment relevant lines in source/data/brkitr/char.txt and "
4618             "change this test to the opposite condition.");
4619     }
4620 }
4621 
4622 #endif // #if !UCONFIG_NO_BREAK_ITERATION
4623