• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /********************************************************************
2  * COPYRIGHT:
3  * Copyright (c) 1999-2012, International Business Machines Corporation and
4  * others. All Rights Reserved.
5  ********************************************************************/
6 /************************************************************************
7 *   Date        Name        Description
8 *   12/15/99    Madhu        Creation.
9 *   01/12/2000  Madhu        Updated for changed API and added new tests
10 ************************************************************************/
11 
12 #include "utypeinfo.h"  // for 'typeid' to work
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_BREAK_ITERATION
17 
18 #include "unicode/utypes.h"
19 #include "unicode/brkiter.h"
20 #include "unicode/rbbi.h"
21 #include "unicode/uchar.h"
22 #include "unicode/utf16.h"
23 #include "unicode/ucnv.h"
24 #include "unicode/schriter.h"
25 #include "unicode/uniset.h"
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
27 #include "unicode/regex.h"
28 #endif
29 #include "unicode/ustring.h"
30 #include "unicode/utext.h"
31 #include "intltest.h"
32 #include "rbbitst.h"
33 #include <string.h>
34 #include "uvector.h"
35 #include "uvectr32.h"
36 #include <string.h>
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include "unicode/numfmt.h"
40 #include "unicode/uscript.h"
41 
// TEST_ASSERT: log a test failure, tagged with the current file and line,
//              when the condition x evaluates to false.
#define TEST_ASSERT(x) { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
}

// TEST_ASSERT_SUCCESS: log a failure, tagged with file, line and the ICU error
//                      name, when errcode holds a failing UErrorCode.
#define TEST_ASSERT_SUCCESS(errcode) { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
    } \
}
47 
48 
49 //---------------------------------------------
50 // runIndexedTest
51 //---------------------------------------------
52 
53 
54 //  Note:  Before adding new tests to this file, check whether the desired test data can
55 //         simply be added to the file testdata/rbbitest.txt.  In most cases it can,
56 //         it's much less work than writing a new test, diagnostic output in the event of failures
57 //         is good, and the test data file is shared with ICU4J, so eventually the test
58 //         will run there as well, without additional effort.
59 
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)60 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
61 {
62     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
63 
64     switch (index) {
65 #if !UCONFIG_NO_FILE_IO
66         case 0: name = "TestBug4153072";
67             if(exec) TestBug4153072();                         break;
68 #else
69         case 0: name = "skip";
70             break;
71 #endif
72 
73         case 1: name = "skip";
74             break;
75         case 2: name = "TestStatusReturn";
76             if(exec) TestStatusReturn();                       break;
77 
78 #if !UCONFIG_NO_FILE_IO
79         case 3: name = "TestUnicodeFiles";
80             if(exec) TestUnicodeFiles();                       break;
81         case 4: name = "TestEmptyString";
82             if(exec) TestEmptyString();                        break;
83 #else
84         case 3: case 4: name = "skip";
85             break;
86 #endif
87 
88         case 5: name = "TestGetAvailableLocales";
89             if(exec) TestGetAvailableLocales();                break;
90 
91         case 6: name = "TestGetDisplayName";
92             if(exec) TestGetDisplayName();                     break;
93 
94 #if !UCONFIG_NO_FILE_IO
95         case 7: name = "TestEndBehaviour";
96             if(exec) TestEndBehaviour();                       break;
97         case 8: case 9: case 10: name = "skip";
98              break;
99         case 11: name = "TestWordBreaks";
100              if(exec) TestWordBreaks();                        break;
101         case 12: name = "TestWordBoundary";
102              if(exec) TestWordBoundary();                      break;
103         case 13: name = "TestLineBreaks";
104              if(exec) TestLineBreaks();                        break;
105         case 14: name = "TestSentBreaks";
106              if(exec) TestSentBreaks();                        break;
107         case 15: name = "TestExtended";
108              if(exec) TestExtended();                          break;
109 #else
110         case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
111              break;
112 #endif
113 
114 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
115         case 16:
116             name = "TestMonkey"; if(exec)  TestMonkey(params); break;
117 #else
118         case 16:
119              name = "skip";                                    break;
120 #endif
121 
122 #if !UCONFIG_NO_FILE_IO
123         case 17: name = "TestBug3818";
124             if(exec) TestBug3818();                            break;
125 #else
126         case 17: name = "skip";
127             break;
128 #endif
129 
130         case 18: name = "skip";
131             break;
132         case 19: name = "TestDebug";
133             if(exec) TestDebug();                              break;
134         case 20: name = "skip";
135             break;
136 
137 #if !UCONFIG_NO_FILE_IO
138         case 21: name = "TestBug5775";
139             if (exec) TestBug5775();                           break;
140 #else
141         case 21: name = "skip";
142             break;
143 #endif
144 
145         case 22: name = "skip";
146             break;
147         case 23: name = "TestDictRules";
148             if (exec) TestDictRules();                         break;
149         case 24: name = "TestBug5532";
150             if (exec) TestBug5532();                           break;
151         default: name = ""; break; //needed to end loop
152     }
153 }
154 
155 
156 //---------------------------------------------------------------------------
157 //
158 //   class BITestData   Holds a set of Break iterator test data and results
159 //                      Includes
160 //                         - the string data to be broken
161 //                         - a vector of the expected break positions.
162 //                         - a vector of source line numbers for the data,
163 //                               (to help see where errors occurred.)
164 //                         - The expected break tag values.
165 //                         - Vectors of actual break positions and tag values.
166 //                         - Functions for comparing actual with expected and
167 //                            reporting errors.
168 //
169 //----------------------------------------------------------------------------
170 class BITestData {
171 public:
172     UnicodeString    fDataToBreak;
173     UVector          fExpectedBreakPositions;
174     UVector          fExpectedTags;
175     UVector          fLineNum;
176     UVector          fActualBreakPositions;   // Test Results.
177     UVector          fActualTags;
178 
179     BITestData(UErrorCode &status);
180     void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
181     void             checkResults(const char *heading, RBBITest *test);
182     void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
183     void             clearResults();
184 };
185 
186 //
187 // Constructor.
188 //
BITestData(UErrorCode & status)189 BITestData::BITestData(UErrorCode &status)
190 : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
191   fActualTags(status)
192 {
193 }
194 
195 //
196 // addDataChunk.   Add a section (non-breaking) piece of data to the test data.
197 //                 The macro form collects the line number, which is helpful
198 //                 when tracking down failures.
199 //
200 //                 A null data item is inserted at the start of each test's data
201 //                  to put the starting zero into the data list.  The position saved for
202 //                  each non-null item is its ending position.
203 //
204 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
addDataChunk(const char * data,int32_t tag,int32_t lineNum,UErrorCode status)205 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
206     if (U_FAILURE(status)) {return;}
207     if (data != NULL) {
208         fDataToBreak.append(CharsToUnicodeString(data));
209     }
210     fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
211     fExpectedTags.addElement(tag, status);
212     fLineNum.addElement(lineNum, status);
213 }
214 
215 
216 //
217 //  checkResults.   Compare the actual and expected break positions, report any differences.
218 //
checkResults(const char * heading,RBBITest * test)219 void BITestData::checkResults(const char *heading, RBBITest *test) {
220     int32_t   expectedIndex = 0;
221     int32_t   actualIndex = 0;
222 
223     for (;;) {
224         // If we've run through both the expected and actual results vectors, we're done.
225         //   break out of the loop.
226         if (expectedIndex >= fExpectedBreakPositions.size() &&
227             actualIndex   >= fActualBreakPositions.size()) {
228             break;
229         }
230 
231 
232         if (expectedIndex >= fExpectedBreakPositions.size()) {
233             err(heading, test, expectedIndex-1, actualIndex);
234             actualIndex++;
235             continue;
236         }
237 
238         if (actualIndex >= fActualBreakPositions.size()) {
239             err(heading, test, expectedIndex, actualIndex-1);
240             expectedIndex++;
241             continue;
242         }
243 
244         if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
245             err(heading, test, expectedIndex, actualIndex);
246             // Try to resync the positions of the indices, to avoid a rash of spurious erros.
247             if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
248                 actualIndex++;
249             } else {
250                 expectedIndex++;
251             }
252             continue;
253         }
254 
255         if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
256             test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
257                 heading, fLineNum.elementAt(expectedIndex),
258                 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
259         }
260 
261         actualIndex++;
262         expectedIndex++;
263     }
264 }
265 
266 //
267 //  err   -  An error was found.  Report it, along with information about where the
268 //                                incorrectly broken test data appeared in the source file.
269 //
err(const char * heading,RBBITest * test,int32_t expectedIdx,int32_t actualIdx)270 void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
271 {
272     int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
273     int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
274     int32_t   o        = 0;
275     int32_t   line     = fLineNum.elementAti(expectedIdx);
276     if (expectedIdx > 0) {
277         // The line numbers are off by one because a premature break occurs somewhere
278         //    within the previous item, rather than at the start of the current (expected) item.
279         //    We want to report the offset of the unexpected break from the start of
280         //      this previous item.
281         o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
282     }
283     if (actual < expected) {
284         test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
285     } else {
286         test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
287     }
288 }
289 
290 
clearResults()291 void BITestData::clearResults() {
292     fActualBreakPositions.removeAllElements();
293     fActualTags.removeAllElements();
294 }
295 
296 
297 //--------------------------------------------------------------------------------------
298 //
299 //    RBBITest    constructor and destructor
300 //
301 //--------------------------------------------------------------------------------------
302 
RBBITest()303 RBBITest::RBBITest() {
304 }
305 
306 
~RBBITest()307 RBBITest::~RBBITest() {
308 }
309 
310 //-----------------------------------------------------------------------------------
311 //
312 //   Test for status {tag} return value from break rules.
313 //        TODO:  a more thorough test.
314 //
315 //-----------------------------------------------------------------------------------
TestStatusReturn()316 void RBBITest::TestStatusReturn() {
317      UnicodeString rulesString1("$Letters = [:L:];\n"
318                                   "$Numbers = [:N:];\n"
319                                   "$Letters+{1};\n"
320                                   "$Numbers+{2};\n"
321                                   "Help\\ {4}/me\\!;\n"
322                                   "[^$Letters $Numbers];\n"
323                                   "!.*;\n", -1, US_INV);
324      UnicodeString testString1  = "abc123..abc Help me Help me!";
325                                 // 01234567890123456789012345678
326      int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
327      int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
328 
329      UErrorCode status=U_ZERO_ERROR;
330      UParseError    parseError;
331 
332      RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
333      if(U_FAILURE(status)) {
334          dataerrln("FAIL : in construction - %s", u_errorName(status));
335      } else {
336          int32_t  pos;
337          int32_t  i = 0;
338          bi->setText(testString1);
339          for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
340              if (pos != bounds1[i]) {
341                  errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
342                  break;
343              }
344 
345              int tag = bi->getRuleStatus();
346              if (tag != brkStatus[i]) {
347                  errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
348                  break;
349              }
350              i++;
351          }
352      }
353      delete bi;
354 }
355 
356 
printStringBreaks(UnicodeString ustr,int expected[],int expectedcount)357 static void printStringBreaks(UnicodeString ustr, int expected[],
358                               int expectedcount)
359 {
360     UErrorCode status = U_ZERO_ERROR;
361     char name[100];
362     printf("code    alpha extend alphanum type word sent line name\n");
363     int j;
364     for (j = 0; j < ustr.length(); j ++) {
365         if (expectedcount > 0) {
366             int k;
367             for (k = 0; k < expectedcount; k ++) {
368                 if (j == expected[k]) {
369                     printf("------------------------------------------------ %d\n",
370                            j);
371                 }
372             }
373         }
374         UChar32 c = ustr.char32At(j);
375         if (c > 0xffff) {
376             j ++;
377         }
378         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
379         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
380                            u_isUAlphabetic(c),
381                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
382                            u_isalnum(c),
383                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
384                                                   u_charType(c),
385                                                   U_SHORT_PROPERTY_NAME),
386                            u_getPropertyValueName(UCHAR_WORD_BREAK,
387                                                   u_getIntPropertyValue(c,
388                                                           UCHAR_WORD_BREAK),
389                                                   U_SHORT_PROPERTY_NAME),
390                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
391                                    u_getIntPropertyValue(c,
392                                            UCHAR_SENTENCE_BREAK),
393                                    U_SHORT_PROPERTY_NAME),
394                            u_getPropertyValueName(UCHAR_LINE_BREAK,
395                                    u_getIntPropertyValue(c,
396                                            UCHAR_LINE_BREAK),
397                                    U_SHORT_PROPERTY_NAME),
398                            name);
399     }
400 }
401 
402 
TestBug3818()403 void RBBITest::TestBug3818() {
404     UErrorCode  status = U_ZERO_ERROR;
405 
406     // Four Thai words...
407     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
408                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
409     UnicodeString  thaiStr(thaiWordData);
410 
411     RuleBasedBreakIterator* bi =
412         (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
413     if (U_FAILURE(status) || bi == NULL) {
414         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
415         return;
416     }
417     bi->setText(thaiStr);
418 
419     int32_t  startOfSecondWord = bi->following(1);
420     if (startOfSecondWord != 4) {
421         errln("Fail at file %s, line %d expected start of word at 4, got %d",
422             __FILE__, __LINE__, startOfSecondWord);
423     }
424     startOfSecondWord = bi->following(0);
425     if (startOfSecondWord != 4) {
426         errln("Fail at file %s, line %d expected start of word at 4, got %d",
427             __FILE__, __LINE__, startOfSecondWord);
428     }
429     delete bi;
430 }
431 
432 //----------------------------------------------------------------------------
433 //
434 // generalIteratorTest      Given a break iterator and a set of test data,
435 //                          Run the tests and report the results.
436 //
437 //----------------------------------------------------------------------------
generalIteratorTest(RuleBasedBreakIterator & bi,BITestData & td)438 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
439 {
440 
441     bi.setText(td.fDataToBreak);
442 
443     testFirstAndNext(bi, td);
444 
445     testLastAndPrevious(bi, td);
446 
447     testFollowing(bi, td);
448     testPreceding(bi, td);
449     testIsBoundary(bi, td);
450     doMultipleSelectionTest(bi, td);
451 }
452 
453 
454 //
455 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
456 //                       kind of loop.
457 //
testFirstAndNext(RuleBasedBreakIterator & bi,BITestData & td)458 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
459 {
460     UErrorCode  status = U_ZERO_ERROR;
461     int32_t     p;
462     int32_t     lastP = -1;
463     int32_t     tag;
464 
465     logln("Test first and next");
466     bi.setText(td.fDataToBreak);
467     td.clearResults();
468 
469     for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
470         td.fActualBreakPositions.addElement(p, status);  // Save result.
471         tag = bi.getRuleStatus();
472         td.fActualTags.addElement(tag, status);
473         if (p <= lastP) {
474             // If the iterator is not making forward progress, stop.
475             //  No need to raise an error here, it'll be detected in the normal check of results.
476             break;
477         }
478         lastP = p;
479     }
480     td.checkResults("testFirstAndNext", this);
481 }
482 
483 
484 //
485 //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
486 //
testLastAndPrevious(RuleBasedBreakIterator & bi,BITestData & td)487 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
488 {
489     UErrorCode  status = U_ZERO_ERROR;
490     int32_t     p;
491     int32_t     lastP  = 0x7ffffffe;
492     int32_t     tag;
493 
494     logln("Test last and previous");
495     bi.setText(td.fDataToBreak);
496     td.clearResults();
497 
498     for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
499         // Save break position.  Insert it at start of vector of results, shoving
500         //    already-saved results further towards the end.
501         td.fActualBreakPositions.insertElementAt(p, 0, status);
502         // bi.previous();   // TODO:  Why does this fix things up????
503         // bi.next();
504         tag = bi.getRuleStatus();
505         td.fActualTags.insertElementAt(tag, 0, status);
506         if (p >= lastP) {
507             // If the iterator is not making progress, stop.
508             //  No need to raise an error here, it'll be detected in the normal check of results.
509             break;
510         }
511         lastP = p;
512     }
513     td.checkResults("testLastAndPrevious", this);
514 }
515 
516 
testFollowing(RuleBasedBreakIterator & bi,BITestData & td)517 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
518 {
519     UErrorCode  status = U_ZERO_ERROR;
520     int32_t     p;
521     int32_t     tag;
522     int32_t     lastP  = -2;     // A value that will never be returned as a break position.
523                                  //   cannot be -1; that is returned for DONE.
524     int         i;
525 
526     logln("testFollowing():");
527     bi.setText(td.fDataToBreak);
528     td.clearResults();
529 
530     // Save the starting point, since we won't get that out of following.
531     p = bi.first();
532     td.fActualBreakPositions.addElement(p, status);  // Save result.
533     tag = bi.getRuleStatus();
534     td.fActualTags.addElement(tag, status);
535 
536     for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
537         p = bi.following(i);
538         if (p != lastP) {
539             if (p == RuleBasedBreakIterator::DONE) {
540                 break;
541             }
542             // We've reached a new break position.  Save it.
543             td.fActualBreakPositions.addElement(p, status);  // Save result.
544             tag = bi.getRuleStatus();
545             td.fActualTags.addElement(tag, status);
546             lastP = p;
547         }
548     }
549     // The loop normally exits by means of the break in the middle.
550     // Make sure that the index was at the correct position for the break iterator to have
551     //   returned DONE.
552     if (i != td.fDataToBreak.length()) {
553         errln("testFollowing():  iterator returned DONE prematurely.");
554     }
555 
556     // Full check of all results.
557     td.checkResults("testFollowing", this);
558 }
559 
560 
561 
testPreceding(RuleBasedBreakIterator & bi,BITestData & td)562 void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
563     UErrorCode  status = U_ZERO_ERROR;
564     int32_t     p;
565     int32_t     tag;
566     int32_t     lastP  = 0x7ffffffe;
567     int         i;
568 
569     logln("testPreceding():");
570     bi.setText(td.fDataToBreak);
571     td.clearResults();
572 
573     p = bi.last();
574     td.fActualBreakPositions.addElement(p, status);
575     tag = bi.getRuleStatus();
576     td.fActualTags.addElement(tag, status);
577 
578     for (i = td.fDataToBreak.length(); i>=-1; i--) {
579         p = bi.preceding(i);
580         if (p != lastP) {
581             if (p == RuleBasedBreakIterator::DONE) {
582                 break;
583             }
584             // We've reached a new break position.  Save it.
585             td.fActualBreakPositions.insertElementAt(p, 0, status);
586             lastP = p;
587             tag = bi.getRuleStatus();
588             td.fActualTags.insertElementAt(tag, 0, status);
589         }
590     }
591     // The loop normally exits by means of the break in the middle.
592     // Make sure that the index was at the correct position for the break iterator to have
593     //   returned DONE.
594     if (i != 0) {
595         errln("testPreceding():  iterator returned DONE prematurely.");
596     }
597 
598     // Full check of all results.
599     td.checkResults("testPreceding", this);
600 }
601 
602 
603 
testIsBoundary(RuleBasedBreakIterator & bi,BITestData & td)604 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
605     UErrorCode  status = U_ZERO_ERROR;
606     int         i;
607     int32_t     tag;
608 
609     logln("testIsBoundary():");
610     bi.setText(td.fDataToBreak);
611     td.clearResults();
612 
613     for (i = 0; i <= td.fDataToBreak.length(); i++) {
614         if (bi.isBoundary(i)) {
615             td.fActualBreakPositions.addElement(i, status);  // Save result.
616             tag = bi.getRuleStatus();
617             td.fActualTags.addElement(tag, status);
618         }
619     }
620     td.checkResults("testIsBoundary: ", this);
621 }
622 
623 
624 
doMultipleSelectionTest(RuleBasedBreakIterator & iterator,BITestData & td)625 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
626 {
627     iterator.setText(td.fDataToBreak);
628 
629     RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
630     int32_t offset = iterator.first();
631     int32_t testOffset;
632     int32_t count = 0;
633 
634     logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
635 
636     if (*testIterator != iterator)
637         errln("clone() or operator!= failed: two clones compared unequal");
638 
639     do {
640         testOffset = testIterator->first();
641         testOffset = testIterator->next(count);
642         if (offset != testOffset)
643             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
644 
645         if (offset != RuleBasedBreakIterator::DONE) {
646             count++;
647             offset = iterator.next();
648 
649             if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
650                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
651                 if (count > 10000 || offset == -1) {
652                     errln("operator== failed too many times. Stopping test.");
653                     if (offset == -1) {
654                         errln("Does (RuleBasedBreakIterator::DONE == -1)?");
655                     }
656                     return;
657                 }
658             }
659         }
660     } while (offset != RuleBasedBreakIterator::DONE);
661 
662     // now do it backwards...
663     offset = iterator.last();
664     count = 0;
665 
666     do {
667         testOffset = testIterator->last();
668         testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
669         if (offset != testOffset)
670             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
671 
672         if (offset != RuleBasedBreakIterator::DONE) {
673             count--;
674             offset = iterator.previous();
675         }
676     } while (offset != RuleBasedBreakIterator::DONE);
677 
678     delete testIterator;
679 }
680 
681 
682 //---------------------------------------------
683 //
684 //     other tests
685 //
686 //---------------------------------------------
TestEmptyString()687 void RBBITest::TestEmptyString()
688 {
689     UnicodeString text = "";
690     UErrorCode status = U_ZERO_ERROR;
691 
692     BITestData x(status);
693     ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
694     RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
695     if (U_FAILURE(status))
696     {
697         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
698         return;
699     }
700     generalIteratorTest(*bi, x);
701     delete bi;
702 }
703 
TestGetAvailableLocales()704 void RBBITest::TestGetAvailableLocales()
705 {
706     int32_t locCount = 0;
707     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
708 
709     if (locCount == 0)
710         dataerrln("getAvailableLocales() returned an empty list!");
711     // Just make sure that it's returning good memory.
712     int32_t i;
713     for (i = 0; i < locCount; ++i) {
714         logln(locList[i].getName());
715     }
716 }
717 
718 //Testing the BreakIterator::getDisplayName() function
TestGetDisplayName()719 void RBBITest::TestGetDisplayName()
720 {
721     UnicodeString   result;
722 
723     BreakIterator::getDisplayName(Locale::getUS(), result);
724     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
725         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
726                 + result);
727 
728     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
729     if (result != "French (France)")
730         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
731                 + result);
732 }
733 /**
734  * Test End Behaviour
735  * @bug 4068137
736  */
TestEndBehaviour()737 void RBBITest::TestEndBehaviour()
738 {
739     UErrorCode status = U_ZERO_ERROR;
740     UnicodeString testString("boo.");
741     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
742     if (U_FAILURE(status))
743     {
744         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
745         return;
746     }
747     wb->setText(testString);
748 
749     if (wb->first() != 0)
750         errln("Didn't get break at beginning of string.");
751     if (wb->next() != 3)
752         errln("Didn't get break before period in \"boo.\"");
753     if (wb->current() != 4 && wb->next() != 4)
754         errln("Didn't get break at end of string.");
755     delete wb;
756 }
757 /*
758  * @bug 4153072
759  */
TestBug4153072()760 void RBBITest::TestBug4153072() {
761     UErrorCode status = U_ZERO_ERROR;
762     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
763     if (U_FAILURE(status))
764     {
765         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
766         return;
767     }
768     UnicodeString str("...Hello, World!...");
769     int32_t begin = 3;
770     int32_t end = str.length() - 3;
771     UBool onBoundary;
772 
773     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
774     iter->adoptText(textIterator);
775     int index;
776     // Note: with the switch to UText, there is no way to restrict the
777     //       iteration range to begin at an index other than zero.
778     //       String character iterators created with a non-zero bound are
779     //         treated by RBBI as being empty.
780     for (index = -1; index < begin + 1; ++index) {
781         onBoundary = iter->isBoundary(index);
782         if (index == 0?  !onBoundary : onBoundary) {
783             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
784                             " and begin index = " + begin);
785         }
786     }
787     delete iter;
788 }
789 
790 
791 //
792 // Test for problem reported by Ashok Matoria on 9 July 2007
793 //    One.<kSoftHyphen><kSpace>Two.
794 //
795 //    Sentence break at start (0) and then on calling next() it breaks at
796 //   'T' of "Two". Now, at this point if I do next() and
797 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
798 //
TestBug5775()799 void RBBITest::TestBug5775() {
800     UErrorCode status = U_ZERO_ERROR;
801     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
802     TEST_ASSERT_SUCCESS(status);
803     if (U_FAILURE(status)) {
804         return;
805     }
806 // Check for status first for better handling of no data errors.
807     TEST_ASSERT(bi != NULL);
808     if (bi == NULL) {
809         return;
810     }
811 
812     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
813     //               01234      56789
814     s = s.unescape();
815     bi->setText(s);
816     int pos = bi->next();
817     TEST_ASSERT(pos == 6);
818     pos = bi->next();
819     TEST_ASSERT(pos == 10);
820     pos = bi->previous();
821     TEST_ASSERT(pos == 6);
822     delete bi;
823 }
824 
825 
826 
//------------------------------------------------------------------------------
//
//   RBBITest::TestExtended    Run RBBI tests from an external test data file
//
//------------------------------------------------------------------------------
832 
833 struct TestParams {
834     BreakIterator   *bi;
835     UnicodeString    dataToBreak;
836     UVector32       *expectedBreaks;
837     UVector32       *srcLine;
838     UVector32       *srcCol;
839 };
840 
executeTest(TestParams * t)841 void RBBITest::executeTest(TestParams *t) {
842     int32_t    bp;
843     int32_t    prevBP;
844     int32_t    i;
845 
846     if (t->bi == NULL) {
847         return;
848     }
849 
850     t->bi->setText(t->dataToBreak);
851     //
852     //  Run the iterator forward
853     //
854     prevBP = -1;
855     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
856         if (prevBP ==  bp) {
857             // Fail for lack of forward progress.
858             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
859                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
860             break;
861         }
862 
863         // Check that there were we didn't miss an expected break between the last one
864         //  and this one.
865         for (i=prevBP+1; i<bp; i++) {
866             if (t->expectedBreaks->elementAti(i) != 0) {
867                 int expected[] = {0, i};
868                 printStringBreaks(t->dataToBreak, expected, 2);
869                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
870                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
871             }
872         }
873 
874         // Check that the break we did find was expected
875         if (t->expectedBreaks->elementAti(bp) == 0) {
876             int expected[] = {0, bp};
877             printStringBreaks(t->dataToBreak, expected, 2);
878             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
879                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
880         } else {
881             // The break was expected.
882             //   Check that the {nnn} tag value is correct.
883             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
884             if (expectedTagVal == -1) {
885                 expectedTagVal = 0;
886             }
887             int32_t line = t->srcLine->elementAti(bp);
888             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
889             if (rs != expectedTagVal) {
890                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
891                       "          Actual, Expected status = %4d, %4d",
892                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
893             }
894         }
895 
896 
897         prevBP = bp;
898     }
899 
900     // Verify that there were no missed expected breaks after the last one found
901     for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
902         if (t->expectedBreaks->elementAti(i) != 0) {
903             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
904                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
905         }
906     }
907 
908     //
909     //  Run the iterator backwards, verify that the same breaks are found.
910     //
911     prevBP = t->dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
912     for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
913         if (prevBP ==  bp) {
914             // Fail for lack of progress.
915             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
916                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
917             break;
918         }
919 
920         // Check that there were we didn't miss an expected break between the last one
921         //  and this one.  (UVector returns zeros for index out of bounds.)
922         for (i=prevBP-1; i>bp; i--) {
923             if (t->expectedBreaks->elementAti(i) != 0) {
924                 errln("Reverse Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
925                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
926             }
927         }
928 
929         // Check that the break we did find was expected
930         if (t->expectedBreaks->elementAti(bp) == 0) {
931             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
932                    bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
933         } else {
934             // The break was expected.
935             //   Check that the {nnn} tag value is correct.
936             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
937             if (expectedTagVal == -1) {
938                 expectedTagVal = 0;
939             }
940             int line = t->srcLine->elementAti(bp);
941             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
942             if (rs != expectedTagVal) {
943                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
944                       "          Actual, Expected status = %4d, %4d",
945                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
946             }
947         }
948 
949         prevBP = bp;
950     }
951 
952     // Verify that there were no missed breaks prior to the last one found
953     for (i=prevBP-1; i>=0; i--) {
954         if (t->expectedBreaks->elementAti(i) != 0) {
955             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
956                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
957         }
958     }
959 }
960 
961 
TestExtended()962 void RBBITest::TestExtended() {
963 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
964     UErrorCode      status  = U_ZERO_ERROR;
965     Locale          locale("");
966 
967     UnicodeString       rules;
968     TestParams          tp;
969     tp.bi             = NULL;
970     tp.expectedBreaks = new UVector32(status);
971     tp.srcLine        = new UVector32(status);
972     tp.srcCol         = new UVector32(status);
973 
974     RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
975     if (U_FAILURE(status)) {
976         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
977     }
978 
979 
980     //
981     //  Open and read the test data file.
982     //
983     const char *testDataDirectory = IntlTest::getSourceTestData(status);
984     char testFileName[1000];
985     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
986         errln("Can't open test data.  Path too long.");
987         return;
988     }
989     strcpy(testFileName, testDataDirectory);
990     strcat(testFileName, "rbbitst.txt");
991 
992     int    len;
993     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
994     if (U_FAILURE(status)) {
995         return; /* something went wrong, error already output */
996     }
997 
998 
999 
1000 
1001     //
1002     //  Put the test data into a UnicodeString
1003     //
1004     UnicodeString testString(FALSE, testFile, len);
1005 
1006     enum EParseState{
1007         PARSE_COMMENT,
1008         PARSE_TAG,
1009         PARSE_DATA,
1010         PARSE_NUM
1011     }
1012     parseState = PARSE_TAG;
1013 
1014     EParseState savedState = PARSE_TAG;
1015 
1016     static const UChar CH_LF        = 0x0a;
1017     static const UChar CH_CR        = 0x0d;
1018     static const UChar CH_HASH      = 0x23;
1019     /*static const UChar CH_PERIOD    = 0x2e;*/
1020     static const UChar CH_LT        = 0x3c;
1021     static const UChar CH_GT        = 0x3e;
1022     static const UChar CH_BACKSLASH = 0x5c;
1023     static const UChar CH_BULLET    = 0x2022;
1024 
1025     int32_t    lineNum  = 1;
1026     int32_t    colStart = 0;
1027     int32_t    column   = 0;
1028     int32_t    charIdx  = 0;
1029 
1030     int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
1031 
1032     for (charIdx = 0; charIdx < len; ) {
1033         status = U_ZERO_ERROR;
1034         UChar  c = testString.charAt(charIdx);
1035         charIdx++;
1036         if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1037             // treat CRLF as a unit
1038             c = CH_LF;
1039             charIdx++;
1040         }
1041         if (c == CH_LF || c == CH_CR) {
1042             lineNum++;
1043             colStart = charIdx;
1044         }
1045         column = charIdx - colStart + 1;
1046 
1047         switch (parseState) {
1048         case PARSE_COMMENT:
1049             if (c == 0x0a || c == 0x0d) {
1050                 parseState = savedState;
1051             }
1052             break;
1053 
1054         case PARSE_TAG:
1055             {
1056             if (c == CH_HASH) {
1057                 parseState = PARSE_COMMENT;
1058                 savedState = PARSE_TAG;
1059                 break;
1060             }
1061             if (u_isUWhiteSpace(c)) {
1062                 break;
1063             }
1064             if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1065                 delete tp.bi;
1066                 tp.bi = BreakIterator::createWordInstance(locale,  status);
1067                 charIdx += 5;
1068                 break;
1069             }
1070             if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1071                 delete tp.bi;
1072                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
1073                 charIdx += 5;
1074                 break;
1075             }
1076             if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1077                 delete tp.bi;
1078                 tp.bi = BreakIterator::createLineInstance(locale,  status);
1079                 charIdx += 5;
1080                 break;
1081             }
1082             if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1083                 delete tp.bi;
1084                 tp.bi = NULL;
1085                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
1086                 charIdx += 5;
1087                 break;
1088             }
1089             if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1090                 delete tp.bi;
1091                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
1092                 charIdx += 6;
1093                 break;
1094             }
1095 
1096             // <locale  loc_name>
1097             localeMatcher.reset(testString);
1098             if (localeMatcher.lookingAt(charIdx-1, status)) {
1099                 UnicodeString localeName = localeMatcher.group(1, status);
1100                 char localeName8[100];
1101                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1102                 locale = Locale::createFromName(localeName8);
1103                 charIdx += localeMatcher.group(0, status).length();
1104                 TEST_ASSERT_SUCCESS(status);
1105                 break;
1106             }
1107             if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1108                 parseState = PARSE_DATA;
1109                 charIdx += 5;
1110                 tp.dataToBreak = "";
1111                 tp.expectedBreaks->removeAllElements();
1112                 tp.srcCol ->removeAllElements();
1113                 tp.srcLine->removeAllElements();
1114                 break;
1115             }
1116 
1117             errln("line %d: Tag expected in test file.", lineNum);
1118             parseState = PARSE_COMMENT;
1119             savedState = PARSE_DATA;
1120             goto end_test; // Stop the test.
1121             }
1122             break;
1123 
1124         case PARSE_DATA:
1125             if (c == CH_BULLET) {
1126                 int32_t  breakIdx = tp.dataToBreak.length();
1127                 tp.expectedBreaks->setSize(breakIdx+1);
1128                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1129                 tp.srcLine->setSize(breakIdx+1);
1130                 tp.srcLine->setElementAt(lineNum, breakIdx);
1131                 tp.srcCol ->setSize(breakIdx+1);
1132                 tp.srcCol ->setElementAt(column, breakIdx);
1133                 break;
1134             }
1135 
1136             if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1137                 // Add final entry to mappings from break location to source file position.
1138                 //  Need one extra because last break position returned is after the
1139                 //    last char in the data, not at the last char.
1140                 tp.srcLine->addElement(lineNum, status);
1141                 tp.srcCol ->addElement(column, status);
1142 
1143                 parseState = PARSE_TAG;
1144                 charIdx += 6;
1145 
1146                 // RUN THE TEST!
1147                 executeTest(&tp);
1148                 break;
1149             }
1150 
1151             if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1152                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1153                 // Get the code point from the name and insert it into the test data.
1154                 //   (Damn, no API takes names in Unicode  !!!
1155                 //    we've got to take it back to char *)
1156                 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1157                 int32_t nameLength = nameEndIdx - (charIdx+2);
1158                 char charNameBuf[200];
1159                 UChar32 theChar = -1;
1160                 if (nameEndIdx != -1) {
1161                     UErrorCode status = U_ZERO_ERROR;
1162                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1163                     charNameBuf[sizeof(charNameBuf)-1] = 0;
1164                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1165                     if (U_FAILURE(status)) {
1166                         theChar = -1;
1167                     }
1168                 }
1169                 if (theChar == -1) {
1170                     errln("Error in named character in test file at line %d, col %d",
1171                         lineNum, column);
1172                 } else {
1173                     // Named code point was recognized.  Insert it
1174                     //   into the test data.
1175                     tp.dataToBreak.append(theChar);
1176                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1177                         tp.srcLine->addElement(lineNum, status);
1178                         tp.srcCol ->addElement(column, status);
1179                     }
1180                 }
1181                 if (nameEndIdx > charIdx) {
1182                     charIdx = nameEndIdx+1;
1183 
1184                 }
1185                 break;
1186             }
1187 
1188 
1189 
1190 
1191             if (testString.compare(charIdx-1, 2, "<>") == 0) {
1192                 charIdx++;
1193                 int32_t  breakIdx = tp.dataToBreak.length();
1194                 tp.expectedBreaks->setSize(breakIdx+1);
1195                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1196                 tp.srcLine->setSize(breakIdx+1);
1197                 tp.srcLine->setElementAt(lineNum, breakIdx);
1198                 tp.srcCol ->setSize(breakIdx+1);
1199                 tp.srcCol ->setElementAt(column, breakIdx);
1200                 break;
1201             }
1202 
1203             if (c == CH_LT) {
1204                 tagValue   = 0;
1205                 parseState = PARSE_NUM;
1206                 break;
1207             }
1208 
1209             if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
1210                 parseState = PARSE_COMMENT;
1211                 savedState = PARSE_DATA;
1212                 break;
1213             }
1214 
1215             if (c == CH_BACKSLASH) {
1216                 // Check for \ at end of line, a line continuation.
1217                 //     Advance over (discard) the newline
1218                 UChar32 cp = testString.char32At(charIdx);
1219                 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1220                     // We have a CR LF
1221                     //  Need an extra increment of the input ptr to move over both of them
1222                     charIdx++;
1223                 }
1224                 if (cp == CH_LF || cp == CH_CR) {
1225                     lineNum++;
1226                     colStart = charIdx;
1227                     charIdx++;
1228                     break;
1229                 }
1230 
1231                 // Let unescape handle the back slash.
1232                 cp = testString.unescapeAt(charIdx);
1233                 if (cp != -1) {
1234                     // Escape sequence was recognized.  Insert the char
1235                     //   into the test data.
1236                     tp.dataToBreak.append(cp);
1237                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1238                         tp.srcLine->addElement(lineNum, status);
1239                         tp.srcCol ->addElement(column, status);
1240                     }
1241                     break;
1242                 }
1243 
1244 
1245                 // Not a recognized backslash escape sequence.
1246                 // Take the next char as a literal.
1247                 //  TODO:  Should this be an error?
1248                 c = testString.charAt(charIdx);
1249                 charIdx = testString.moveIndex32(charIdx, 1);
1250             }
1251 
1252             // Normal, non-escaped data char.
1253             tp.dataToBreak.append(c);
1254 
1255             // Save the mapping from offset in the data to line/column numbers in
1256             //   the original input file.  Will be used for better error messages only.
1257             //   If there's an expected break before this char, the slot in the mapping
1258             //     vector will already be set for this char; don't overwrite it.
1259             if (tp.dataToBreak.length() > tp.srcLine->size()) {
1260                 tp.srcLine->addElement(lineNum, status);
1261                 tp.srcCol ->addElement(column, status);
1262             }
1263             break;
1264 
1265 
1266         case PARSE_NUM:
1267             // We are parsing an expected numeric tag value, like <1234>,
1268             //   within a chunk of data.
1269             if (u_isUWhiteSpace(c)) {
1270                 break;
1271             }
1272 
1273             if (c == CH_GT) {
1274                 // Finished the number.  Add the info to the expected break data,
1275                 //   and switch parse state back to doing plain data.
1276                 parseState = PARSE_DATA;
1277                 if (tagValue == 0) {
1278                     tagValue = -1;
1279                 }
1280                 int32_t  breakIdx = tp.dataToBreak.length();
1281                 tp.expectedBreaks->setSize(breakIdx+1);
1282                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1283                 tp.srcLine->setSize(breakIdx+1);
1284                 tp.srcLine->setElementAt(lineNum, breakIdx);
1285                 tp.srcCol ->setSize(breakIdx+1);
1286                 tp.srcCol ->setElementAt(column, breakIdx);
1287                 break;
1288             }
1289 
1290             if (u_isdigit(c)) {
1291                 tagValue = tagValue*10 + u_charDigitValue(c);
1292                 break;
1293             }
1294 
1295             errln("Syntax Error in test file at line %d, col %d",
1296                 lineNum, column);
1297             parseState = PARSE_COMMENT;
1298             goto end_test; // Stop the test
1299             break;
1300         }
1301 
1302 
1303         if (U_FAILURE(status)) {
1304             dataerrln("ICU Error %s while parsing test file at line %d.",
1305                 u_errorName(status), lineNum);
1306             status = U_ZERO_ERROR;
1307             goto end_test; // Stop the test
1308         }
1309 
1310     }
1311 
1312 end_test:
1313     delete tp.bi;
1314     delete tp.expectedBreaks;
1315     delete tp.srcLine;
1316     delete tp.srcCol;
1317     delete [] testFile;
1318 #endif
1319 }
1320 
1321 
1322 //-------------------------------------------------------------------------------
1323 //
1324 //  TestDictRules   create a break iterator from source rules that includes a
1325 //                  dictionary range.   Regression for bug #7130.  Source rules
1326 //                  do not declare a break iterator type (word, line, sentence, etc.
1327 //                  but the dictionary code, without a type, would loop.
1328 //
1329 //-------------------------------------------------------------------------------
TestDictRules()1330 void RBBITest::TestDictRules() {
1331     const char *rules =  "$dictionary = [a-z]; \n"
1332                          "!!forward; \n"
1333                          "$dictionary $dictionary; \n"
1334                          "!!reverse; \n"
1335                          "$dictionary $dictionary; \n";
1336     const char *text = "aa";
1337     UErrorCode status = U_ZERO_ERROR;
1338     UParseError parseError;
1339 
1340     RuleBasedBreakIterator bi(rules, parseError, status);
1341     if (U_SUCCESS(status)) {
1342         UnicodeString utext = text;
1343         bi.setText(utext);
1344         int32_t position;
1345         int32_t loops;
1346         for (loops = 0; loops<10; loops++) {
1347             position = bi.next();
1348             if (position == RuleBasedBreakIterator::DONE) {
1349                 break;
1350             }
1351         }
1352         TEST_ASSERT(loops == 1);
1353     } else {
1354         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1355     }
1356 }
1357 
1358 
1359 
1360 //-------------------------------------------------------------------------------
1361 //
1362 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
1363 //    return the datain one big UChar * buffer, which the caller must delete.
1364 //
1365 //    parameters:
1366 //          fileName:   the name of the file, with no directory part.  The test data directory
1367 //                      is assumed.
1368 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
1369 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
1370 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
1371 //                      Pass NULL for the system default encoding.
1372 //          status
1373 //    returns:
1374 //                      The file data, converted to UChar.
1375 //                      The caller must delete this when done with
1376 //                           delete [] theBuffer;
1377 //
1378 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
1379 //           Move this function to some common place.
1380 //
1381 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int & ulen,const char * encoding,UErrorCode & status)1382 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1383     UChar       *retPtr  = NULL;
1384     char        *fileBuf = NULL;
1385     UConverter* conv     = NULL;
1386     FILE        *f       = NULL;
1387 
1388     ulen = 0;
1389     if (U_FAILURE(status)) {
1390         return retPtr;
1391     }
1392 
1393     //
1394     //  Open the file.
1395     //
1396     f = fopen(fileName, "rb");
1397     if (f == 0) {
1398         dataerrln("Error opening test data file %s\n", fileName);
1399         status = U_FILE_ACCESS_ERROR;
1400         return NULL;
1401     }
1402     //
1403     //  Read it in
1404     //
1405     int   fileSize;
1406     int   amt_read;
1407 
1408     fseek( f, 0, SEEK_END);
1409     fileSize = ftell(f);
1410     fileBuf = new char[fileSize];
1411     fseek(f, 0, SEEK_SET);
1412     amt_read = fread(fileBuf, 1, fileSize, f);
1413     if (amt_read != fileSize || fileSize <= 0) {
1414         errln("Error reading test data file.");
1415         goto cleanUpAndReturn;
1416     }
1417 
1418     //
1419     // Look for a Unicode Signature (BOM) on the data just read
1420     //
1421     int32_t        signatureLength;
1422     const char *   fileBufC;
1423     const char*    bomEncoding;
1424 
1425     fileBufC = fileBuf;
1426     bomEncoding = ucnv_detectUnicodeSignature(
1427         fileBuf, fileSize, &signatureLength, &status);
1428     if(bomEncoding!=NULL ){
1429         fileBufC  += signatureLength;
1430         fileSize  -= signatureLength;
1431         encoding = bomEncoding;
1432     }
1433 
1434     //
1435     // Open a converter to take the rule file to UTF-16
1436     //
1437     conv = ucnv_open(encoding, &status);
1438     if (U_FAILURE(status)) {
1439         goto cleanUpAndReturn;
1440     }
1441 
1442     //
1443     // Convert the rules to UChar.
1444     //  Preflight first to determine required buffer size.
1445     //
1446     ulen = ucnv_toUChars(conv,
1447         NULL,           //  dest,
1448         0,              //  destCapacity,
1449         fileBufC,
1450         fileSize,
1451         &status);
1452     if (status == U_BUFFER_OVERFLOW_ERROR) {
1453         // Buffer Overflow is expected from the preflight operation.
1454         status = U_ZERO_ERROR;
1455 
1456         retPtr = new UChar[ulen+1];
1457         ucnv_toUChars(conv,
1458             retPtr,       //  dest,
1459             ulen+1,
1460             fileBufC,
1461             fileSize,
1462             &status);
1463     }
1464 
1465 cleanUpAndReturn:
1466     fclose(f);
1467     delete []fileBuf;
1468     ucnv_close(conv);
1469     if (U_FAILURE(status)) {
1470         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1471         delete []retPtr;
1472         retPtr = 0;
1473         ulen   = 0;
1474     };
1475     return retPtr;
1476 }
1477 
1478 
1479 
1480 //--------------------------------------------------------------------------------------------
1481 //
1482 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1483 //
1484 //-------------------------------------------------------------------------------------------
TestUnicodeFiles()1485 void RBBITest::TestUnicodeFiles() {
1486     RuleBasedBreakIterator  *bi;
1487     UErrorCode               status = U_ZERO_ERROR;
1488 
1489     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1490     TEST_ASSERT_SUCCESS(status);
1491     if (U_SUCCESS(status)) {
1492         runUnicodeTestData("GraphemeBreakTest.txt", bi);
1493     }
1494     delete bi;
1495 
1496     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1497     TEST_ASSERT_SUCCESS(status);
1498     if (U_SUCCESS(status)) {
1499         runUnicodeTestData("WordBreakTest.txt", bi);
1500     }
1501     delete bi;
1502 
1503     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1504     TEST_ASSERT_SUCCESS(status);
1505     if (U_SUCCESS(status)) {
1506         runUnicodeTestData("SentenceBreakTest.txt", bi);
1507     }
1508     delete bi;
1509 
1510     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1511     TEST_ASSERT_SUCCESS(status);
1512     if (U_SUCCESS(status)) {
1513         runUnicodeTestData("LineBreakTest.txt", bi);
1514     }
1515     delete bi;
1516 }
1517 
1518 
1519 //--------------------------------------------------------------------------------------------
1520 //
1521 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1522 //
1523 //-------------------------------------------------------------------------------------------
runUnicodeTestData(const char * fileName,RuleBasedBreakIterator * bi)1524 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1525 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1526     // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb. Ticket #7270
1527     UBool isTicket7270Fixed = isICUVersionAtLeast(52, 1);
1528     UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt");
1529     UErrorCode  status = U_ZERO_ERROR;
1530 
1531     //
1532     //  Open and read the test data file, put it into a UnicodeString.
1533     //
1534     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1535     char testFileName[1000];
1536     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1537         dataerrln("Can't open test data.  Path too long.");
1538         return;
1539     }
1540     strcpy(testFileName, testDataDirectory);
1541     strcat(testFileName, fileName);
1542 
1543     logln("Opening data file %s\n", fileName);
1544 
1545     int    len;
1546     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1547     if (status != U_FILE_ACCESS_ERROR) {
1548         TEST_ASSERT_SUCCESS(status);
1549         TEST_ASSERT(testFile != NULL);
1550     }
1551     if (U_FAILURE(status) || testFile == NULL) {
1552         return; /* something went wrong, error already output */
1553     }
1554     UnicodeString testFileAsString(TRUE, testFile, len);
1555 
1556     //
1557     //  Parse the test data file using a regular expression.
1558     //  Each kind of token is recognized in its own capture group; what type of item was scanned
1559     //     is identified by which group had a match.
1560     //
1561     //    Caputure Group #                  1          2            3            4           5
1562     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1563     //
1564     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1565     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1566     UnicodeString   testString;
1567     UVector32       breakPositions(status);
1568     int             lineNumber = 1;
1569     TEST_ASSERT_SUCCESS(status);
1570     if (U_FAILURE(status)) {
1571         return;
1572     }
1573 
1574     //
1575     //  Scan through each test case, building up the string to be broken in testString,
1576     //   and the positions that should be boundaries in the breakPositions vector.
1577     //
1578     int spin = 0;
1579     while (tokenMatcher.find()) {
1580       	if(tokenMatcher.hitEnd()) {
1581           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
1582              This occurred when the text file was corrupt (wasn't marked as UTF-8)
1583              and caused an infinite loop here on EBCDIC systems!
1584           */
1585           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1586           //	   return;
1587       	}
1588         if (tokenMatcher.start(1, status) >= 0) {
1589             // Scanned a divide sign, indicating a break position in the test data.
1590             if (testString.length()>0) {
1591                 breakPositions.addElement(testString.length(), status);
1592             }
1593         }
1594         else if (tokenMatcher.start(2, status) >= 0) {
1595             // Scanned an 'x', meaning no break at this position in the test data
1596             //   Nothing to be done here.
1597             }
1598         else if (tokenMatcher.start(3, status) >= 0) {
1599             // Scanned Hex digits.  Convert them to binary, append to the character data string.
1600             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1601             int length = hexNumber.length();
1602             if (length<=8) {
1603                 char buf[10];
1604                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1605                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1606                 if (c<=0x10ffff) {
1607                     testString.append(c);
1608                 } else {
1609                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1610                        fileName, lineNumber);
1611                 }
1612             } else {
1613                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1614                        fileName, lineNumber);
1615              }
1616         }
1617         else if (tokenMatcher.start(4, status) >= 0) {
1618             // Scanned to end of a line, possibly skipping over a comment in the process.
1619             //   If the line from the file contained test data, run the test now.
1620             //
1621             if (testString.length() > 0) {
1622 // TODO(andy): Remove this time bomb code. Note: Failing line numbers may change when updating to new Unicode data.
1623 //             Rule 8
1624 //                ZW SP* <break>
1625 //             is not yet implemented.
1626 if (!(isLineBreak && !isTicket7270Fixed && (5198 == lineNumber ||
1627                                             5202 == lineNumber ||
1628                                             5214 == lineNumber ||
1629                                             5246 == lineNumber ||
1630                                             5298 == lineNumber ||
1631                                             5302 == lineNumber ))) {
1632                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1633 }
1634             }
1635 
1636             // Clear out this test case.
1637             //    The string and breakPositions vector will be refilled as the next
1638             //       test case is parsed.
1639             testString.remove();
1640             breakPositions.removeAllElements();
1641             lineNumber++;
1642         } else {
1643             // Scanner catchall.  Something unrecognized appeared on the line.
1644             char token[16];
1645             UnicodeString uToken = tokenMatcher.group(0, status);
1646             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1647             token[sizeof(token)-1] = 0;
1648             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1649 
1650             // Clean up, in preparation for continuing with the next line.
1651             testString.remove();
1652             breakPositions.removeAllElements();
1653             lineNumber++;
1654         }
1655         TEST_ASSERT_SUCCESS(status);
1656         if (U_FAILURE(status)) {
1657             break;
1658         }
1659     }
1660 
1661     delete [] testFile;
1662  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1663 }
1664 
1665 //--------------------------------------------------------------------------------------------
1666 //
1667 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1668 //                            test data files.  Do only a simple, forward-only check -
1669 //                            this test is mostly to check that ICU and the Unicode
1670 //                            data agree with each other.
1671 //
1672 //--------------------------------------------------------------------------------------------
checkUnicodeTestCase(const char * testFileName,int lineNumber,const UnicodeString & testString,UVector32 * breakPositions,RuleBasedBreakIterator * bi)1673 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1674                          const UnicodeString &testString,   // Text data to be broken
1675                          UVector32 *breakPositions,         // Positions where breaks should be found.
1676                          RuleBasedBreakIterator *bi) {
1677     int32_t pos;                 // Break Position in the test string
1678     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1679     int32_t expectedPos;         // Expected break position (index into test string)
1680 
1681     bi->setText(testString);
1682     pos = bi->first();
1683     pos = bi->next();
1684 
1685     while (pos != BreakIterator::DONE) {
1686         if (expectedI >= breakPositions->size()) {
1687             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1688                 testFileName, lineNumber, pos);
1689             break;
1690         }
1691         expectedPos = breakPositions->elementAti(expectedI);
1692         if (pos < expectedPos) {
1693             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1694                 testFileName, lineNumber, pos);
1695             break;
1696         }
1697         if (pos > expectedPos) {
1698             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1699                 testFileName, lineNumber, expectedPos);
1700             break;
1701         }
1702         pos = bi->next();
1703         expectedI++;
1704     }
1705 
1706     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1707         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1708             testFileName, lineNumber, breakPositions->elementAti(expectedI));
1709     }
1710 }
1711 
1712 
1713 
1714 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1715 //---------------------------------------------------------------------------------------
1716 //
1717 //   class RBBIMonkeyKind
1718 //
1719 //      Monkey Test for Break Iteration
1720 //      Abstract interface class.   Concrete derived classes independently
1721 //      implement the break rules for different iterator types.
1722 //
1723 //      The Monkey Test itself doesn't know which type of break iterator it is
1724 //      testing, but works purely in terms of the interface defined here.
1725 //
1726 //---------------------------------------------------------------------------------------
1727 class RBBIMonkeyKind {
1728 public:
1729     // Return a UVector of UnicodeSets, representing the character classes used
1730     //   for this type of iterator.
1731     virtual  UVector  *charClasses() = 0;
1732 
1733     // Set the test text on which subsequent calls to next() will operate
1734     virtual  void      setText(const UnicodeString &s) = 0;
1735 
1736     // Find the next break postion, starting from the prev break position, or from zero.
1737     // Return -1 after reaching end of string.
1738     virtual  int32_t   next(int32_t i) = 0;
1739 
1740     virtual ~RBBIMonkeyKind();
1741     UErrorCode       deferredStatus;
1742 
1743 
1744 protected:
1745     RBBIMonkeyKind();
1746 
1747 private:
1748 };
1749 
RBBIMonkeyKind()1750 RBBIMonkeyKind::RBBIMonkeyKind() {
1751     deferredStatus = U_ZERO_ERROR;
1752 }
1753 
~RBBIMonkeyKind()1754 RBBIMonkeyKind::~RBBIMonkeyKind() {
1755 }
1756 
1757 
1758 //----------------------------------------------------------------------------------------
1759 //
1760 //   Random Numbers.  Similar to standard lib rand() and srand()
1761 //                    Not using library to
1762 //                      1.  Get same results on all platforms.
1763 //                      2.  Get access to current seed, to more easily reproduce failures.
1764 //
1765 //---------------------------------------------------------------------------------------
// Current state of the generator.  Assign to it to restart a known sequence.
static uint32_t m_seed = 1;

// Steps the linear congruential generator once and returns the next value,
// which is always in the range 0..32767.
static uint32_t m_rand()
{
    static const uint32_t kMultiplier = 1103515245;
    static const uint32_t kIncrement  = 12345;

    m_seed = m_seed * kMultiplier + kIncrement;

    // Bits 16..30 of the state: the same value as (m_seed / 65536) % 32768.
    return (m_seed >> 16) & 0x7fff;
}
1773 
1774 
1775 //------------------------------------------------------------------------------------------
1776 //
1777 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
1778 //                             of RBBIMonkeyKind.
1779 //
1780 //------------------------------------------------------------------------------------------
1781 class RBBICharMonkey: public RBBIMonkeyKind {
1782 public:
1783     RBBICharMonkey();
1784     virtual          ~RBBICharMonkey();
1785     virtual  UVector *charClasses();
1786     virtual  void     setText(const UnicodeString &s);
1787     virtual  int32_t  next(int32_t i);
1788 private:
1789     UVector   *fSets;
1790 
1791     UnicodeSet  *fCRLFSet;
1792     UnicodeSet  *fControlSet;
1793     UnicodeSet  *fExtendSet;
1794     UnicodeSet  *fRegionalIndicatorSet;
1795     UnicodeSet  *fPrependSet;
1796     UnicodeSet  *fSpacingSet;
1797     UnicodeSet  *fLSet;
1798     UnicodeSet  *fVSet;
1799     UnicodeSet  *fTSet;
1800     UnicodeSet  *fLVSet;
1801     UnicodeSet  *fLVTSet;
1802     UnicodeSet  *fHangulSet;
1803     UnicodeSet  *fAnySet;
1804 
1805     const UnicodeString *fText;
1806 };
1807 
1808 
RBBICharMonkey()1809 RBBICharMonkey::RBBICharMonkey() {
1810     UErrorCode  status = U_ZERO_ERROR;
1811 
1812     fText = NULL;
1813 
1814     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1815     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
1816     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
1817     fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1818     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1819     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1820     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1821     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1822     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1823     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1824     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1825     fHangulSet  = new UnicodeSet();
1826     fHangulSet->addAll(*fLSet);
1827     fHangulSet->addAll(*fVSet);
1828     fHangulSet->addAll(*fTSet);
1829     fHangulSet->addAll(*fLVSet);
1830     fHangulSet->addAll(*fLVTSet);
1831     fAnySet     = new UnicodeSet(0, 0x10ffff);
1832 
1833     fSets       = new UVector(status);
1834     fSets->addElement(fCRLFSet,    status);
1835     fSets->addElement(fControlSet, status);
1836     fSets->addElement(fExtendSet,  status);
1837     fSets->addElement(fRegionalIndicatorSet, status);
1838     if (!fPrependSet->isEmpty()) {
1839         fSets->addElement(fPrependSet, status);
1840     }
1841     fSets->addElement(fSpacingSet, status);
1842     fSets->addElement(fHangulSet,  status);
1843     fSets->addElement(fAnySet,     status);
1844     if (U_FAILURE(status)) {
1845         deferredStatus = status;
1846     }
1847 }
1848 
1849 
setText(const UnicodeString & s)1850 void RBBICharMonkey::setText(const UnicodeString &s) {
1851     fText = &s;
1852 }
1853 
1854 
1855 
next(int32_t prevPos)1856 int32_t RBBICharMonkey::next(int32_t prevPos) {
1857     int    p0, p1, p2, p3;    // Indices of the significant code points around the
1858                               //   break position being tested.  The candidate break
1859                               //   location is before p2.
1860 
1861     int     breakPos = -1;
1862 
1863     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
1864 
1865     if (U_FAILURE(deferredStatus)) {
1866         return -1;
1867     }
1868 
1869     // Previous break at end of string.  return DONE.
1870     if (prevPos >= fText->length()) {
1871         return -1;
1872     }
1873     p0 = p1 = p2 = p3 = prevPos;
1874     c3 =  fText->char32At(prevPos);
1875     c0 = c1 = c2 = 0;
1876 
1877     // Loop runs once per "significant" character position in the input text.
1878     for (;;) {
1879         // Move all of the positions forward in the input string.
1880         p0 = p1;  c0 = c1;
1881         p1 = p2;  c1 = c2;
1882         p2 = p3;  c2 = c3;
1883 
1884         // Advancd p3 by one codepoint
1885         p3 = fText->moveIndex32(p3, 1);
1886         c3 = fText->char32At(p3);
1887 
1888         if (p1 == p2) {
1889             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
1890             continue;
1891         }
1892         if (p2 == fText->length()) {
1893             // Reached end of string.  Always a break position.
1894             break;
1895         }
1896 
1897         // Rule  GB3   CR x LF
1898         //     No Extend or Format characters may appear between the CR and LF,
1899         //     which requires the additional check for p2 immediately following p1.
1900         //
1901         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
1902             continue;
1903         }
1904 
1905         // Rule (GB4).   ( Control | CR | LF ) <break>
1906         if (fControlSet->contains(c1) ||
1907             c1 == 0x0D ||
1908             c1 == 0x0A)  {
1909             break;
1910         }
1911 
1912         // Rule (GB5)    <break>  ( Control | CR | LF )
1913         //
1914         if (fControlSet->contains(c2) ||
1915             c2 == 0x0D ||
1916             c2 == 0x0A)  {
1917             break;
1918         }
1919 
1920 
1921         // Rule (GB6)  L x ( L | V | LV | LVT )
1922         if (fLSet->contains(c1) &&
1923                (fLSet->contains(c2)  ||
1924                 fVSet->contains(c2)  ||
1925                 fLVSet->contains(c2) ||
1926                 fLVTSet->contains(c2))) {
1927             continue;
1928         }
1929 
1930         // Rule (GB7)    ( LV | V )  x  ( V | T )
1931         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1932             (fVSet->contains(c2) || fTSet->contains(c2)))  {
1933             continue;
1934         }
1935 
1936         // Rule (GB8)    ( LVT | T)  x T
1937         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1938             fTSet->contains(c2))  {
1939             continue;
1940         }
1941 
1942         // Rule (GB8a)    Regional_Indicator x Regional_Indicator
1943         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
1944             continue;
1945         }
1946 
1947         // Rule (GB9)    Numeric x ALetter
1948         if (fExtendSet->contains(c2))  {
1949             continue;
1950         }
1951 
1952         // Rule (GB9a)   x  SpacingMark
1953         if (fSpacingSet->contains(c2)) {
1954             continue;
1955         }
1956 
1957         // Rule (GB9b)   Prepend x
1958         if (fPrependSet->contains(c1)) {
1959             continue;
1960         }
1961 
1962         // Rule (GB10)  Any  <break>  Any
1963         break;
1964     }
1965 
1966     breakPos = p2;
1967     return breakPos;
1968 }
1969 
1970 
1971 
charClasses()1972 UVector  *RBBICharMonkey::charClasses() {
1973     return fSets;
1974 }
1975 
1976 
~RBBICharMonkey()1977 RBBICharMonkey::~RBBICharMonkey() {
1978     delete fSets;
1979     delete fCRLFSet;
1980     delete fControlSet;
1981     delete fExtendSet;
1982     delete fRegionalIndicatorSet;
1983     delete fPrependSet;
1984     delete fSpacingSet;
1985     delete fLSet;
1986     delete fVSet;
1987     delete fTSet;
1988     delete fLVSet;
1989     delete fLVTSet;
1990     delete fHangulSet;
1991     delete fAnySet;
1992 }
1993 
1994 //------------------------------------------------------------------------------------------
1995 //
1996 //   class RBBIWordMonkey      Word Break specific implementation
1997 //                             of RBBIMonkeyKind.
1998 //
1999 //------------------------------------------------------------------------------------------
2000 class RBBIWordMonkey: public RBBIMonkeyKind {
2001 public:
2002     RBBIWordMonkey();
2003     virtual          ~RBBIWordMonkey();
2004     virtual  UVector *charClasses();
2005     virtual  void     setText(const UnicodeString &s);
2006     virtual int32_t   next(int32_t i);
2007 private:
2008     UVector      *fSets;
2009 
2010     UnicodeSet  *fCRSet;
2011     UnicodeSet  *fLFSet;
2012     UnicodeSet  *fNewlineSet;
2013     UnicodeSet  *fKatakanaSet;
2014     UnicodeSet  *fALetterSet;
2015     // TODO(jungshik): Do we still need this change?
2016     // UnicodeSet  *fALetterSet; // matches ALetterPlus in word.txt
2017     UnicodeSet  *fMidNumLetSet;
2018     UnicodeSet  *fMidLetterSet;
2019     UnicodeSet  *fMidNumSet;
2020     UnicodeSet  *fNumericSet;
2021     UnicodeSet  *fFormatSet;
2022     UnicodeSet  *fOtherSet;
2023     UnicodeSet  *fExtendSet;
2024     UnicodeSet  *fExtendNumLetSet;
2025     UnicodeSet  *fRegionalIndicatorSet;
2026     UnicodeSet  *fDictionaryCjkSet;
2027 
2028     RegexMatcher  *fMatcher;
2029 
2030     const UnicodeString  *fText;
2031 };
2032 
2033 
RBBIWordMonkey()2034 RBBIWordMonkey::RBBIWordMonkey()
2035 {
2036     UErrorCode  status = U_ZERO_ERROR;
2037 
2038     fSets            = new UVector(status);
2039 
2040     fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
2041     fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
2042     fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
2043     fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
2044     // Exclude Hangul syllables from ALetterSet during testing.
2045     // Leave CJK dictionary characters out from the monkey tests!
2046 #if 0
2047     fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}"
2048                                       "[\\p{Line_Break = Complex_Context}"
2049                                       "-\\p{Grapheme_Cluster_Break = Extend}"
2050                                       "-\\p{Grapheme_Cluster_Break = Control}"
2051                                       "]]",
2052                                       status);
2053 #endif
2054     fALetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
2055     fALetterSet->removeAll(*fDictionaryCjkSet);
2056     fKatakanaSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
2057     fMidNumLetSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
2058     fMidLetterSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
2059     fMidNumSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
2060     // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
2061     // we should figure out why
2062     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
2063     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
2064     fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2065     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
2066     fRegionalIndicatorSet =  new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
2067 
2068     fOtherSet        = new UnicodeSet();
2069     if(U_FAILURE(status)) {
2070       deferredStatus = status;
2071       return;
2072     }
2073 
2074     fOtherSet->complement();
2075     fOtherSet->removeAll(*fCRSet);
2076     fOtherSet->removeAll(*fLFSet);
2077     fOtherSet->removeAll(*fNewlineSet);
2078     fOtherSet->removeAll(*fKatakanaSet);
2079     fOtherSet->removeAll(*fALetterSet);
2080     fOtherSet->removeAll(*fMidLetterSet);
2081     fOtherSet->removeAll(*fMidNumSet);
2082     fOtherSet->removeAll(*fNumericSet);
2083     fOtherSet->removeAll(*fExtendNumLetSet);
2084     fOtherSet->removeAll(*fFormatSet);
2085     fOtherSet->removeAll(*fExtendSet);
2086     fOtherSet->removeAll(*fRegionalIndicatorSet);
2087     // Inhibit dictionary characters from being tested at all.
2088     fOtherSet->removeAll(*fDictionaryCjkSet);
2089     fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
2090 
2091     fSets->addElement(fCRSet,        status);
2092     fSets->addElement(fLFSet,        status);
2093     fSets->addElement(fNewlineSet,   status);
2094     fSets->addElement(fALetterSet,   status);
2095     //fSets->addElement(fKatakanaSet,  status); //TODO: work out how to test katakana
2096     fSets->addElement(fMidLetterSet, status);
2097     fSets->addElement(fMidNumLetSet, status);
2098     fSets->addElement(fMidNumSet,    status);
2099     fSets->addElement(fNumericSet,   status);
2100     fSets->addElement(fFormatSet,    status);
2101     fSets->addElement(fExtendSet,    status);
2102     fSets->addElement(fOtherSet,     status);
2103     fSets->addElement(fExtendNumLetSet, status);
2104     fSets->addElement(fRegionalIndicatorSet, status);
2105 
2106     if (U_FAILURE(status)) {
2107         deferredStatus = status;
2108     }
2109 }
2110 
setText(const UnicodeString & s)2111 void RBBIWordMonkey::setText(const UnicodeString &s) {
2112     fText       = &s;
2113 }
2114 
2115 
next(int32_t prevPos)2116 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2117     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2118                               //   break position being tested.  The candidate break
2119                               //   location is before p2.
2120 
2121     int     breakPos = -1;
2122 
2123     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2124 
2125     if (U_FAILURE(deferredStatus)) {
2126         return -1;
2127     }
2128 
2129     // Prev break at end of string.  return DONE.
2130     if (prevPos >= fText->length()) {
2131         return -1;
2132     }
2133     p0 = p1 = p2 = p3 = prevPos;
2134     c3 =  fText->char32At(prevPos);
2135     c0 = c1 = c2 = 0;
2136 
2137     // Loop runs once per "significant" character position in the input text.
2138     for (;;) {
2139         // Move all of the positions forward in the input string.
2140         p0 = p1;  c0 = c1;
2141         p1 = p2;  c1 = c2;
2142         p2 = p3;  c2 = c3;
2143 
2144         // Advancd p3 by    X(Extend | Format)*   Rule 4
2145         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2146         do {
2147             p3 = fText->moveIndex32(p3, 1);
2148             c3 = fText->char32At(p3);
2149             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2150                break;
2151             };
2152         }
2153         while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
2154 
2155 
2156         if (p1 == p2) {
2157             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2158             continue;
2159         }
2160         if (p2 == fText->length()) {
2161             // Reached end of string.  Always a break position.
2162             break;
2163         }
2164 
2165         // Rule  (3)   CR x LF
2166         //     No Extend or Format characters may appear between the CR and LF,
2167         //     which requires the additional check for p2 immediately following p1.
2168         //
2169         if (c1==0x0D && c2==0x0A) {
2170             continue;
2171         }
2172 
2173         // Rule (3a)  Break before and after newlines (including CR and LF)
2174         //
2175         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2176             break;
2177         };
2178         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2179             break;
2180         };
2181 
2182         // Rule (5).   ALetter x ALetter
2183         if (fALetterSet->contains(c1) &&
2184             fALetterSet->contains(c2))  {
2185             continue;
2186         }
2187 
2188         // Rule (6)  ALetter  x  (MidLetter | MidNumLet) ALetter
2189         //
2190         if ( fALetterSet->contains(c1)   &&
2191              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
2192              fALetterSet->contains(c3)) {
2193             continue;
2194         }
2195 
2196 
2197         // Rule (7)  ALetter (MidLetter | MidNumLet)  x  ALetter
2198         if (fALetterSet->contains(c0) &&
2199             (fMidLetterSet->contains(c1) ||  fMidNumLetSet->contains(c1)) &&
2200             fALetterSet->contains(c2)) {
2201             continue;
2202         }
2203 
2204         // Rule (8)    Numeric x Numeric
2205         if (fNumericSet->contains(c1) &&
2206             fNumericSet->contains(c2))  {
2207             continue;
2208         }
2209 
2210         // Rule (9)    ALetter x Numeric
2211         if (fALetterSet->contains(c1) &&
2212             fNumericSet->contains(c2))  {
2213             continue;
2214         }
2215 
2216         // Rule (10)    Numeric x ALetter
2217         if (fNumericSet->contains(c1) &&
2218             fALetterSet->contains(c2))  {
2219             continue;
2220         }
2221 
2222         // Rule (11)   Numeric (MidNum | MidNumLet)  x  Numeric
2223         if (fNumericSet->contains(c0) &&
2224             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1))  &&
2225             fNumericSet->contains(c2)) {
2226             continue;
2227         }
2228 
2229         // Rule (12)  Numeric x (MidNum | MidNumLet) Numeric
2230         if (fNumericSet->contains(c1) &&
2231             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2))  &&
2232             fNumericSet->contains(c3)) {
2233             continue;
2234         }
2235 
2236         // Rule (13)  Katakana x Katakana
2237         if (fKatakanaSet->contains(c1) &&
2238             fKatakanaSet->contains(c2))  {
2239             continue;
2240         }
2241 
2242         // Rule 13a
2243         if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
2244              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2245              fExtendNumLetSet->contains(c2)) {
2246                 continue;
2247         }
2248 
2249         // Rule 13b
2250         if (fExtendNumLetSet->contains(c1) &&
2251                 (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
2252                 fKatakanaSet->contains(c2)))  {
2253                 continue;
2254         }
2255 
2256         // Rule 13c
2257         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2258             continue;
2259         }
2260 
2261         // Rule 14.  Break found here.
2262         break;
2263     }
2264 
2265     breakPos = p2;
2266     return breakPos;
2267 }
2268 
2269 
charClasses()2270 UVector  *RBBIWordMonkey::charClasses() {
2271     return fSets;
2272 }
2273 
2274 
~RBBIWordMonkey()2275 RBBIWordMonkey::~RBBIWordMonkey() {
2276     delete fSets;
2277     delete fCRSet;
2278     delete fLFSet;
2279     delete fNewlineSet;
2280     delete fKatakanaSet;
2281     delete fALetterSet;
2282     delete fMidNumLetSet;
2283     delete fMidLetterSet;
2284     delete fMidNumSet;
2285     delete fNumericSet;
2286     delete fFormatSet;
2287     delete fExtendSet;
2288     delete fExtendNumLetSet;
2289     delete fRegionalIndicatorSet;
2290     delete fDictionaryCjkSet;
2291     delete fOtherSet;
2292 }
2293 
2294 
2295 
2296 
2297 //------------------------------------------------------------------------------------------
2298 //
2299 //   class RBBISentMonkey      Sentence Break specific implementation
2300 //                             of RBBIMonkeyKind.
2301 //
2302 //------------------------------------------------------------------------------------------
2303 class RBBISentMonkey: public RBBIMonkeyKind {
2304 public:
2305     RBBISentMonkey();
2306     virtual          ~RBBISentMonkey();
2307     virtual  UVector *charClasses();
2308     virtual  void     setText(const UnicodeString &s);
2309     virtual int32_t   next(int32_t i);
2310 private:
2311     int               moveBack(int posFrom);
2312     int               moveForward(int posFrom);
2313     UChar32           cAt(int pos);
2314 
2315     UVector      *fSets;
2316 
2317     UnicodeSet  *fSepSet;
2318     UnicodeSet  *fFormatSet;
2319     UnicodeSet  *fSpSet;
2320     UnicodeSet  *fLowerSet;
2321     UnicodeSet  *fUpperSet;
2322     UnicodeSet  *fOLetterSet;
2323     UnicodeSet  *fNumericSet;
2324     UnicodeSet  *fATermSet;
2325     UnicodeSet  *fSContinueSet;
2326     UnicodeSet  *fSTermSet;
2327     UnicodeSet  *fCloseSet;
2328     UnicodeSet  *fOtherSet;
2329     UnicodeSet  *fExtendSet;
2330 
2331     const UnicodeString  *fText;
2332 
2333 };
2334 
RBBISentMonkey()2335 RBBISentMonkey::RBBISentMonkey()
2336 {
2337     UErrorCode  status = U_ZERO_ERROR;
2338 
2339     fSets            = new UVector(status);
2340 
2341     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2342     //                       set and made into character classes of their own.  For the monkey impl,
2343     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2344     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2345     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2346     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2347     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2348     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2349     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2350     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2351     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2352     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2353     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2354     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2355     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2356     fOtherSet        = new UnicodeSet();
2357 
2358     if(U_FAILURE(status)) {
2359       deferredStatus = status;
2360       return;
2361     }
2362 
2363     fOtherSet->complement();
2364     fOtherSet->removeAll(*fSepSet);
2365     fOtherSet->removeAll(*fFormatSet);
2366     fOtherSet->removeAll(*fSpSet);
2367     fOtherSet->removeAll(*fLowerSet);
2368     fOtherSet->removeAll(*fUpperSet);
2369     fOtherSet->removeAll(*fOLetterSet);
2370     fOtherSet->removeAll(*fNumericSet);
2371     fOtherSet->removeAll(*fATermSet);
2372     fOtherSet->removeAll(*fSContinueSet);
2373     fOtherSet->removeAll(*fSTermSet);
2374     fOtherSet->removeAll(*fCloseSet);
2375     fOtherSet->removeAll(*fExtendSet);
2376 
2377     fSets->addElement(fSepSet,       status);
2378     fSets->addElement(fFormatSet,    status);
2379     fSets->addElement(fSpSet,        status);
2380     fSets->addElement(fLowerSet,     status);
2381     fSets->addElement(fUpperSet,     status);
2382     fSets->addElement(fOLetterSet,   status);
2383     fSets->addElement(fNumericSet,   status);
2384     fSets->addElement(fATermSet,     status);
2385     fSets->addElement(fSContinueSet, status);
2386     fSets->addElement(fSTermSet,     status);
2387     fSets->addElement(fCloseSet,     status);
2388     fSets->addElement(fOtherSet,     status);
2389     fSets->addElement(fExtendSet,    status);
2390 
2391     if (U_FAILURE(status)) {
2392         deferredStatus = status;
2393     }
2394 }
2395 
2396 
2397 
setText(const UnicodeString & s)2398 void RBBISentMonkey::setText(const UnicodeString &s) {
2399     fText       = &s;
2400 }
2401 
charClasses()2402 UVector  *RBBISentMonkey::charClasses() {
2403     return fSets;
2404 }
2405 
2406 
2407 //  moveBack()   Find the "significant" code point preceding the index i.
2408 //               Skips over ($Extend | $Format)* .
2409 //
moveBack(int i)2410 int RBBISentMonkey::moveBack(int i) {
2411     if (i <= 0) {
2412         return -1;
2413     }
2414     UChar32   c;
2415     int32_t   j = i;
2416     do {
2417         j = fText->moveIndex32(j, -1);
2418         c = fText->char32At(j);
2419     }
2420     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2421     return j;
2422 
2423  }
2424 
2425 
moveForward(int i)2426 int RBBISentMonkey::moveForward(int i) {
2427     if (i>=fText->length()) {
2428         return fText->length();
2429     }
2430     UChar32   c;
2431     int32_t   j = i;
2432     do {
2433         j = fText->moveIndex32(j, 1);
2434         c = cAt(j);
2435     }
2436     while (fFormatSet->contains(c) || fExtendSet->contains(c));
2437     return j;
2438 }
2439 
cAt(int pos)2440 UChar32 RBBISentMonkey::cAt(int pos) {
2441     if (pos<0 || pos>=fText->length()) {
2442         return -1;
2443     } else {
2444         return fText->char32At(pos);
2445     }
2446 }
2447 
next(int32_t prevPos)2448 int32_t RBBISentMonkey::next(int32_t prevPos) {
2449     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2450                               //   break position being tested.  The candidate break
2451                               //   location is before p2.
2452 
2453     int     breakPos = -1;
2454 
2455     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2456     UChar32 c;
2457 
2458     if (U_FAILURE(deferredStatus)) {
2459         return -1;
2460     }
2461 
2462     // Prev break at end of string.  return DONE.
2463     if (prevPos >= fText->length()) {
2464         return -1;
2465     }
2466     p0 = p1 = p2 = p3 = prevPos;
2467     c3 =  fText->char32At(prevPos);
2468     c0 = c1 = c2 = 0;
2469 
2470     // Loop runs once per "significant" character position in the input text.
2471     for (;;) {
2472         // Move all of the positions forward in the input string.
2473         p0 = p1;  c0 = c1;
2474         p1 = p2;  c1 = c2;
2475         p2 = p3;  c2 = c3;
2476 
2477         // Advancd p3 by    X(Extend | Format)*   Rule 4
2478         p3 = moveForward(p3);
2479         c3 = cAt(p3);
2480 
2481         // Rule (3)  CR x LF
2482         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2483             continue;
2484         }
2485 
2486         // Rule (4).   Sep  <break>
2487         if (fSepSet->contains(c1)) {
2488             p2 = p1+1;   // Separators don't combine with Extend or Format.
2489             break;
2490         }
2491 
2492         if (p2 >= fText->length()) {
2493             // Reached end of string.  Always a break position.
2494             break;
2495         }
2496 
2497         if (p2 == prevPos) {
2498             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2499             continue;
2500         }
2501 
2502         // Rule (6).   ATerm x Numeric
2503         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
2504             continue;
2505         }
2506 
2507         // Rule (7).  Upper ATerm  x  Uppper
2508         if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2509             continue;
2510         }
2511 
2512         // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
2513         //           Note:  STerm | ATerm are added to the negated part of the expression by a
2514         //                  note to the Unicode 5.0 documents.
2515         int p8 = p1;
2516         while (fSpSet->contains(cAt(p8))) {
2517             p8 = moveBack(p8);
2518         }
2519         while (fCloseSet->contains(cAt(p8))) {
2520             p8 = moveBack(p8);
2521         }
2522         if (fATermSet->contains(cAt(p8))) {
2523             p8=p2;
2524             for (;;) {
2525                 c = cAt(p8);
2526                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2527                     fLowerSet->contains(c) || fSepSet->contains(c) ||
2528                     fATermSet->contains(c) || fSTermSet->contains(c))  {
2529                     break;
2530                 }
2531                 p8 = moveForward(p8);
2532             }
2533             if (fLowerSet->contains(cAt(p8))) {
2534                 continue;
2535             }
2536         }
2537 
2538         // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2539         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2540             p8 = p1;
2541             while (fSpSet->contains(cAt(p8))) {
2542                 p8 = moveBack(p8);
2543             }
2544             while (fCloseSet->contains(cAt(p8))) {
2545                 p8 = moveBack(p8);
2546             }
2547             c = cAt(p8);
2548             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2549                 continue;
2550             }
2551         }
2552 
2553         // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
2554         int p9 = p1;
2555         while (fCloseSet->contains(cAt(p9))) {
2556             p9 = moveBack(p9);
2557         }
2558         c = cAt(p9);
2559         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2560             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2561                 continue;
2562             }
2563         }
2564 
2565         // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
2566         int p10 = p1;
2567         while (fSpSet->contains(cAt(p10))) {
2568             p10 = moveBack(p10);
2569         }
2570         while (fCloseSet->contains(cAt(p10))) {
2571             p10 = moveBack(p10);
2572         }
2573         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2574             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2575                 continue;
2576             }
2577         }
2578 
2579         // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
2580         int p11 = p1;
2581         if (fSepSet->contains(cAt(p11))) {
2582             p11 = moveBack(p11);
2583         }
2584         while (fSpSet->contains(cAt(p11))) {
2585             p11 = moveBack(p11);
2586         }
2587         while (fCloseSet->contains(cAt(p11))) {
2588             p11 = moveBack(p11);
2589         }
2590         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2591             break;
2592         }
2593 
2594         //  Rule (12)  Any x Any
2595         continue;
2596     }
2597     breakPos = p2;
2598     return breakPos;
2599 }
2600 
~RBBISentMonkey()2601 RBBISentMonkey::~RBBISentMonkey() {
2602     delete fSets;
2603     delete fSepSet;
2604     delete fFormatSet;
2605     delete fSpSet;
2606     delete fLowerSet;
2607     delete fUpperSet;
2608     delete fOLetterSet;
2609     delete fNumericSet;
2610     delete fATermSet;
2611     delete fSContinueSet;
2612     delete fSTermSet;
2613     delete fCloseSet;
2614     delete fOtherSet;
2615     delete fExtendSet;
2616 }
2617 
2618 
2619 
2620 //-------------------------------------------------------------------------------------------
2621 //
2622 //  RBBILineMonkey
2623 //
2624 //-------------------------------------------------------------------------------------------
2625 
2626 class RBBILineMonkey: public RBBIMonkeyKind {
2627 public:
2628     RBBILineMonkey();
2629     virtual          ~RBBILineMonkey();
2630     virtual  UVector *charClasses();
2631     virtual  void     setText(const UnicodeString &s);
2632     virtual  int32_t  next(int32_t i);
2633     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2634 private:
2635     UVector      *fSets;
2636 
2637     UnicodeSet  *fBK;
2638     UnicodeSet  *fCR;
2639     UnicodeSet  *fLF;
2640     UnicodeSet  *fCM;
2641     UnicodeSet  *fNL;
2642     UnicodeSet  *fSG;
2643     UnicodeSet  *fWJ;
2644     UnicodeSet  *fZW;
2645     UnicodeSet  *fGL;
2646     UnicodeSet  *fCB;
2647     UnicodeSet  *fSP;
2648     UnicodeSet  *fB2;
2649     UnicodeSet  *fBA;
2650     UnicodeSet  *fBB;
2651     UnicodeSet  *fHY;
2652     UnicodeSet  *fH2;
2653     UnicodeSet  *fH3;
2654     UnicodeSet  *fCL;
2655     UnicodeSet  *fCP;
2656     UnicodeSet  *fEX;
2657     UnicodeSet  *fIN;
2658     UnicodeSet  *fJL;
2659     UnicodeSet  *fJV;
2660     UnicodeSet  *fJT;
2661     UnicodeSet  *fNS;
2662     UnicodeSet  *fOP;
2663     UnicodeSet  *fQU;
2664     UnicodeSet  *fIS;
2665     UnicodeSet  *fNU;
2666     UnicodeSet  *fPO;
2667     UnicodeSet  *fPR;
2668     UnicodeSet  *fSY;
2669     UnicodeSet  *fAI;
2670     UnicodeSet  *fAL;
2671     UnicodeSet  *fCJ;
2672     UnicodeSet  *fHL;
2673     UnicodeSet  *fID;
2674     UnicodeSet  *fRI;
2675     UnicodeSet  *fSA;
2676     UnicodeSet  *fXX;
2677 
2678     BreakIterator  *fCharBI;
2679 
2680     const UnicodeString  *fText;
2681     int32_t              *fOrigPositions;
2682 
2683     RegexMatcher         *fNumberMatcher;
2684     RegexMatcher         *fLB11Matcher;
2685 };
2686 
2687 
RBBILineMonkey()2688 RBBILineMonkey::RBBILineMonkey()
2689 {
2690     UErrorCode  status = U_ZERO_ERROR;
2691 
2692     fSets  = new UVector(status);
2693 
2694     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2695     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2696     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2697     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2698     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2699     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2700     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2701     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2702     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2703     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2704     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2705     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2706     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2707     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2708     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2709     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2710     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2711     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2712     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2713     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2714     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2715     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2716     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2717     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2718     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2719     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2720     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2721     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2722     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2723     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2724     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2725     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2726     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2727     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2728     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2729     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2730     fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2731     fSA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
2732     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2733     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2734 
2735     if (U_FAILURE(status)) {
2736         deferredStatus = status;
2737         fCharBI = NULL;
2738         fNumberMatcher = NULL;
2739         return;
2740     }
2741 
2742     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
2743     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
2744     fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
2745     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
2746 
2747     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
2748 
2749     fSets->addElement(fBK, status);
2750     fSets->addElement(fCR, status);
2751     fSets->addElement(fLF, status);
2752     fSets->addElement(fCM, status);
2753     fSets->addElement(fNL, status);
2754     fSets->addElement(fWJ, status);
2755     fSets->addElement(fZW, status);
2756     fSets->addElement(fGL, status);
2757     fSets->addElement(fCB, status);
2758     fSets->addElement(fSP, status);
2759     fSets->addElement(fB2, status);
2760     fSets->addElement(fBA, status);
2761     fSets->addElement(fBB, status);
2762     fSets->addElement(fHY, status);
2763     fSets->addElement(fH2, status);
2764     fSets->addElement(fH3, status);
2765     fSets->addElement(fCL, status);
2766     fSets->addElement(fCP, status);
2767     fSets->addElement(fEX, status);
2768     fSets->addElement(fIN, status);
2769     fSets->addElement(fJL, status);
2770     fSets->addElement(fJT, status);
2771     fSets->addElement(fJV, status);
2772     fSets->addElement(fNS, status);
2773     fSets->addElement(fOP, status);
2774     fSets->addElement(fQU, status);
2775     fSets->addElement(fIS, status);
2776     fSets->addElement(fNU, status);
2777     fSets->addElement(fPO, status);
2778     fSets->addElement(fPR, status);
2779     fSets->addElement(fSY, status);
2780     fSets->addElement(fAI, status);
2781     fSets->addElement(fAL, status);
2782     fSets->addElement(fHL, status);
2783     fSets->addElement(fID, status);
2784     fSets->addElement(fWJ, status);
2785     fSets->addElement(fRI, status);
2786     fSets->addElement(fSA, status);
2787     fSets->addElement(fSG, status);
2788 
2789     const char *rules =
2790             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
2791             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
2792             "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
2793             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
2794             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
2795             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
2796 
2797     fNumberMatcher = new RegexMatcher(
2798         UnicodeString(rules, -1, US_INV), 0, status);
2799 
2800     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2801 
2802     if (U_FAILURE(status)) {
2803         deferredStatus = status;
2804     }
2805 }
2806 
2807 
setText(const UnicodeString & s)2808 void RBBILineMonkey::setText(const UnicodeString &s) {
2809     fText       = &s;
2810     fCharBI->setText(s);
2811     fNumberMatcher->reset(s);
2812 }
2813 
2814 //
2815 //  rule9Adjust
2816 //     Line Break TR rules 9 and 10 implementation.
2817 //     This deals with combining marks and other sequences that
2818 //     that must be treated as if they were something other than what they actually are.
2819 //
2820 //     This is factored out into a separate function because it must be applied twice for
2821 //     each potential break, once to the chars before the position being checked, then
2822 //     again to the text following the possible break.
2823 //
rule9Adjust(int32_t pos,UChar32 * posChar,int32_t * nextPos,UChar32 * nextChar)2824 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2825     if (pos == -1) {
2826         // Invalid initial position.  Happens during the warmup iteration of the
2827         //   main loop in next().
2828         return;
2829     }
2830 
2831     int32_t  nPos = *nextPos;
2832 
2833     // LB 9  Keep combining sequences together.
2834     //  advance over any CM class chars.  Note that Line Break CM is different
2835     //  from the normal Grapheme Extend property.
2836     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2837           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2838         for (;;) {
2839             *nextChar = fText->char32At(nPos);
2840             if (!fCM->contains(*nextChar)) {
2841                 break;
2842             }
2843             nPos = fText->moveIndex32(nPos, 1);
2844         }
2845     }
2846 
2847 
2848     // LB 9 Treat X CM* as if it were x.
2849     //       No explicit action required.
2850 
2851     // LB 10  Treat any remaining combining mark as AL
2852     if (fCM->contains(*posChar)) {
2853         *posChar = 0x41;   // thisChar = 'A';
2854     }
2855 
2856     // Push the updated nextPos and nextChar back to our caller.
2857     // This only makes a difference if posChar got bigger by consuming a
2858     // combining sequence.
2859     *nextPos  = nPos;
2860     *nextChar = fText->char32At(nPos);
2861 }
2862 
2863 
2864 
next(int32_t startPos)2865 int32_t RBBILineMonkey::next(int32_t startPos) {
2866     UErrorCode status = U_ZERO_ERROR;
2867     int32_t    pos;       //  Index of the char following a potential break position
2868     UChar32    thisChar;  //  Character at above position "pos"
2869 
2870     int32_t    prevPos;   //  Index of the char preceding a potential break position
2871     UChar32    prevChar;  //  Character at above position.  Note that prevChar
2872                           //   and thisChar may not be adjacent because combining
2873                           //   characters between them will be ignored.
2874 
2875     int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
2876     UChar32    prevCharX2;
2877 
2878     int32_t    nextPos;   //  Index of the next character following pos.
2879                           //     Usually skips over combining marks.
2880     int32_t    nextCPPos; //  Index of the code point following "pos."
2881                           //     May point to a combining mark.
2882     int32_t    tPos;      //  temp value.
2883     UChar32    c;
2884 
2885     if (U_FAILURE(deferredStatus)) {
2886         return -1;
2887     }
2888 
2889     if (startPos >= fText->length()) {
2890         return -1;
2891     }
2892 
2893 
2894     // Initial values for loop.  Loop will run the first time without finding breaks,
2895     //                           while the invalid values shift out and the "this" and
2896     //                           "prev" positions are filled in with good values.
2897     pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
2898     thisChar = prevChar  = prevCharX2 = 0;
2899     nextPos  = nextCPPos = startPos;
2900 
2901 
2902     // Loop runs once per position in the test text, until a break position
2903     //  is found.
2904     for (;;) {
2905         prevPosX2 = prevPos;
2906         prevCharX2 = prevChar;
2907 
2908         prevPos   = pos;
2909         prevChar  = thisChar;
2910 
2911         pos       = nextPos;
2912         thisChar  = fText->char32At(pos);
2913 
2914         nextCPPos = fText->moveIndex32(pos, 1);
2915         nextPos   = nextCPPos;
2916 
2917         // Rule LB2 - Break at end of text.
2918         if (pos >= fText->length()) {
2919             break;
2920         }
2921 
2922         // Rule LB 9 - adjust for combining sequences.
2923         //             We do this one out-of-order because the adjustment does not change anything
2924         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
2925         //             be applied.
2926         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
2927         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
2928         c = fText->char32At(nextPos);
2929         rule9Adjust(pos,     &thisChar, &nextPos, &c);
2930 
2931         // If the loop is still warming up - if we haven't shifted the initial
2932         //   -1 positions out of prevPos yet - loop back to advance the
2933         //    position in the input without any further looking for breaks.
2934         if (prevPos == -1) {
2935             continue;
2936         }
2937 
2938         // LB 4  Always break after hard line breaks,
2939         if (fBK->contains(prevChar)) {
2940             break;
2941         }
2942 
2943         // LB 5  Break after CR, LF, NL, but not inside CR LF
2944         if (prevChar == 0x0d && thisChar == 0x0a) {
2945             continue;
2946         }
2947         if (prevChar == 0x0d ||
2948             prevChar == 0x0a ||
2949             prevChar == 0x85)  {
2950             break;
2951         }
2952 
2953         // LB 6  Don't break before hard line breaks
2954         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
2955             fBK->contains(thisChar)) {
2956                 continue;
2957         }
2958 
2959 
2960         // LB 7  Don't break before spaces or zero-width space.
2961         if (fSP->contains(thisChar)) {
2962             continue;
2963         }
2964 
2965         if (fZW->contains(thisChar)) {
2966             continue;
2967         }
2968 
2969         // LB 8  Break after zero width space
2970         if (fZW->contains(prevChar)) {
2971             break;
2972         }
2973 
2974         // LB 9, 10  Already done, at top of loop.
2975         //
2976 
2977 
2978         // LB 11  Do not break before or after WORD JOINER and related characters.
2979         //    x  WJ
2980         //    WJ  x
2981         //
2982         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
2983             continue;
2984         }
2985 
2986         // LB 12
2987         //    GL  x
2988         if (fGL->contains(prevChar)) {
2989             continue;
2990         }
2991 
2992         // LB 12a
2993         //    [^SP BA HY] x GL
2994         if (!(fSP->contains(prevChar) ||
2995               fBA->contains(prevChar) ||
2996               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
2997             continue;
2998         }
2999 
3000 
3001 
3002         // LB 13  Don't break before closings.
3003         //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
3004         //        fall into LB 17 and the more general number regular expression.
3005         //
3006         if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
3007             (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
3008                                          fEX->contains(thisChar)  ||
3009             (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
3010             (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
3011             continue;
3012         }
3013 
3014         // LB 14 Don't break after OP SP*
3015         //       Scan backwards, checking for this sequence.
3016         //       The OP char could include combining marks, so we actually check for
3017         //           OP CM* SP*
3018         //       Another Twist: The Rule 67 fixes may have changed a SP CM
3019         //       sequence into a ID char, so before scanning back through spaces,
3020         //       verify that prevChar is indeed a space.  The prevChar variable
3021         //       may differ from fText[prevPos]
3022         tPos = prevPos;
3023         if (fSP->contains(prevChar)) {
3024             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3025                 tPos=fText->moveIndex32(tPos, -1);
3026             }
3027         }
3028         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3029             tPos=fText->moveIndex32(tPos, -1);
3030         }
3031         if (fOP->contains(fText->char32At(tPos))) {
3032             continue;
3033         }
3034 
3035 
3036         // LB 15    QU SP* x OP
3037         if (fOP->contains(thisChar)) {
3038             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3039             int tPos = prevPos;
3040             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3041                 tPos = fText->moveIndex32(tPos, -1);
3042             }
3043             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3044                 tPos = fText->moveIndex32(tPos, -1);
3045             }
3046             if (fQU->contains(fText->char32At(tPos))) {
3047                 continue;
3048             }
3049         }
3050 
3051 
3052 
3053         // LB 16   (CL | CP) SP* x NS
3054         //    Scan backwards for SP* CM* (CL | CP)
3055         if (fNS->contains(thisChar)) {
3056             int tPos = prevPos;
3057             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3058                 tPos = fText->moveIndex32(tPos, -1);
3059             }
3060             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3061                 tPos = fText->moveIndex32(tPos, -1);
3062             }
3063             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3064                 continue;
3065             }
3066         }
3067 
3068 
3069         // LB 17        B2 SP* x B2
3070         if (fB2->contains(thisChar)) {
3071             //  Scan backwards, checking for the B2 CM* SP* sequence.
3072             tPos = prevPos;
3073             if (fSP->contains(prevChar)) {
3074                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3075                     tPos=fText->moveIndex32(tPos, -1);
3076                 }
3077             }
3078             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3079                 tPos=fText->moveIndex32(tPos, -1);
3080             }
3081             if (fB2->contains(fText->char32At(tPos))) {
3082                 continue;
3083             }
3084         }
3085 
3086 
3087         // LB 18    break after space
3088         if (fSP->contains(prevChar)) {
3089             break;
3090         }
3091 
3092         // LB 19
3093         //    x   QU
3094         //    QU  x
3095         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3096             continue;
3097         }
3098 
3099         // LB 20  Break around a CB
3100         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3101             break;
3102         }
3103 
3104         // LB 21
3105         if (fBA->contains(thisChar) ||
3106             fHY->contains(thisChar) ||
3107             fNS->contains(thisChar) ||
3108             fBB->contains(prevChar) )   {
3109             continue;
3110         }
3111 
3112         // LB 21a
3113         //   HL (HY | BA) x
3114         if (fHL->contains(prevCharX2) &&
3115                 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3116             continue;
3117         }
3118 
3119         // LB 22
3120         if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3121             (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
3122             (fID->contains(prevChar) && fIN->contains(thisChar)) ||
3123             (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3124             (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
3125             continue;
3126         }
3127 
3128 
3129         // LB 23    ID x PO
3130         //          AL x NU
3131         //          HL x NU
3132         //          NU x AL
3133         if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
3134             (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
3135             (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
3136             (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
3137             (fNU->contains(prevChar) && fHL->contains(thisChar)) )   {
3138             continue;
3139         }
3140 
3141         // LB 24  Do not break between prefix and letters or ideographs.
3142         //        PR x ID
3143         //        PR x (AL | HL)
3144         //        PO x (AL | HL)
3145         if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
3146             (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
3147             (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))))  {
3148             continue;
3149         }
3150 
3151 
3152 
3153         // LB 25    Numbers
3154         if (fNumberMatcher->lookingAt(prevPos, status)) {
3155             if (U_FAILURE(status)) {
3156                 break;
3157             }
3158             // Matched a number.  But could have been just a single digit, which would
3159             //    not represent a "no break here" between prevChar and thisChar
3160             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
3161             if (numEndIdx > pos) {
3162                 // Number match includes at least our two chars being checked
3163                 if (numEndIdx > nextPos) {
3164                     // Number match includes additional chars.  Update pos and nextPos
3165                     //   so that next loop iteration will continue at the end of the number,
3166                     //   checking for breaks between last char in number & whatever follows.
3167                     pos = nextPos = numEndIdx;
3168                     do {
3169                         pos = fText->moveIndex32(pos, -1);
3170                         thisChar = fText->char32At(pos);
3171                     } while (fCM->contains(thisChar));
3172                 }
3173                 continue;
3174             }
3175         }
3176 
3177 
3178         // LB 26 Do not break a Korean syllable.
3179         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3180                                         fJV->contains(thisChar) ||
3181                                         fH2->contains(thisChar) ||
3182                                         fH3->contains(thisChar))) {
3183                                             continue;
3184                                         }
3185 
3186         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3187             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3188                 continue;
3189         }
3190 
3191         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3192             fJT->contains(thisChar)) {
3193                 continue;
3194         }
3195 
3196         // LB 27 Treat a Korean Syllable Block the same as ID.
3197         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3198             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3199             fIN->contains(thisChar)) {
3200                 continue;
3201             }
3202         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3203             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3204             fPO->contains(thisChar)) {
3205                 continue;
3206             }
3207         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3208             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3209                 continue;
3210             }
3211 
3212 
3213 
3214         // LB 28  Do not break between alphabetics ("at").
3215         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3216             continue;
3217         }
3218 
3219         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
3220         if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3221             continue;
3222         }
3223 
3224         // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3225         //          (AL | NU) x OP
3226         //          CP x (AL | NU)
3227         if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3228             continue;
3229         }
3230         if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3231             continue;
3232         }
3233 
3234         // LB30a  Do not break between regional indicators.
3235         //        RI x RI
3236         if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3237             continue;
3238         }
3239 
3240         // LB 31    Break everywhere else
3241         break;
3242 
3243     }
3244 
3245     return pos;
3246 }
3247 
3248 
charClasses()3249 UVector  *RBBILineMonkey::charClasses() {
3250     return fSets;
3251 }
3252 
3253 
~RBBILineMonkey()3254 RBBILineMonkey::~RBBILineMonkey() {
3255     delete fSets;
3256 
3257     delete fBK;
3258     delete fCR;
3259     delete fLF;
3260     delete fCM;
3261     delete fNL;
3262     delete fWJ;
3263     delete fZW;
3264     delete fGL;
3265     delete fCB;
3266     delete fSP;
3267     delete fB2;
3268     delete fBA;
3269     delete fBB;
3270     delete fHY;
3271     delete fH2;
3272     delete fH3;
3273     delete fCL;
3274     delete fCP;
3275     delete fEX;
3276     delete fIN;
3277     delete fJL;
3278     delete fJV;
3279     delete fJT;
3280     delete fNS;
3281     delete fOP;
3282     delete fQU;
3283     delete fIS;
3284     delete fNU;
3285     delete fPO;
3286     delete fPR;
3287     delete fSY;
3288     delete fAI;
3289     delete fAL;
3290     delete fCJ;
3291     delete fHL;
3292     delete fID;
3293     delete fRI;
3294     delete fSA;
3295     delete fSG;
3296     delete fXX;
3297 
3298     delete fCharBI;
3299     delete fNumberMatcher;
3300 }
3301 
3302 
//-------------------------------------------------------------------------------------------
//
//   TestMonkey
//
//     params
//       seed=nnnnn        Random number starting seed.
//                         Setting the seed allows errors to be reproduced.
//       loop=nnn          Looping count.  Controls running time.
//                         -1:  run forever.
//                          0 or greater:  run length.
//
//       type = char | word | line | sent | title
//                         Break iterator type to test.  When omitted, TestMonkey
//                         runs the char, word, line and sent tests ("all").
//
//       utext             Run the tests with the useUText flag set to TRUE.
//
//-------------------------------------------------------------------------------------------
3317 
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)3318 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3319     int32_t val = defaultVal;
3320     name.append(" *= *(-?\\d+)");
3321     UErrorCode status = U_ZERO_ERROR;
3322     RegexMatcher m(name, params, 0, status);
3323     if (m.find()) {
3324         // The param exists.  Convert the string to an int.
3325         char valString[100];
3326         int32_t paramLength = m.end(1, status) - m.start(1, status);
3327         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3328             paramLength = (int32_t)(sizeof(valString)-2);
3329         }
3330         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3331         val = strtol(valString,  NULL, 10);
3332 
3333         // Delete this parameter from the params string.
3334         m.reset();
3335         params = m.replaceFirst("", status);
3336     }
3337     U_ASSERT(U_SUCCESS(status));
3338     return val;
3339 }
3340 #endif
3341 
3342 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
testBreakBoundPreceding(RBBITest * test,UnicodeString ustr,BreakIterator * bi,int expected[],int expectedcount)3343 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3344                                     BreakIterator *bi,
3345                                     int expected[],
3346                                     int expectedcount)
3347 {
3348     int count = 0;
3349     int i = 0;
3350     int forward[50];
3351     bi->setText(ustr);
3352     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3353         forward[count] = i;
3354         if (count < expectedcount && expected[count] != i) {
3355             test->errln("break forward test failed: expected %d but got %d",
3356                         expected[count], i);
3357             break;
3358         }
3359         count ++;
3360     }
3361     if (count != expectedcount) {
3362         printStringBreaks(ustr, expected, expectedcount);
3363         test->errln("break forward test failed: missed %d match",
3364                     expectedcount - count);
3365         return;
3366     }
3367     // testing boundaries
3368     for (i = 1; i < expectedcount; i ++) {
3369         int j = expected[i - 1];
3370         if (!bi->isBoundary(j)) {
3371             printStringBreaks(ustr, expected, expectedcount);
3372             test->errln("isBoundary() failed.  Expected boundary at position %d", j);
3373             return;
3374         }
3375         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3376             if (bi->isBoundary(j)) {
3377                 printStringBreaks(ustr, expected, expectedcount);
3378                 test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
3379                 return;
3380             }
3381         }
3382     }
3383 
3384     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3385         count --;
3386         if (forward[count] != i) {
3387             printStringBreaks(ustr, expected, expectedcount);
3388             test->errln("happy break test previous() failed: expected %d but got %d",
3389                         forward[count], i);
3390             break;
3391         }
3392     }
3393     if (count != 0) {
3394         printStringBreaks(ustr, expected, expectedcount);
3395         test->errln("break test previous() failed: missed a match");
3396         return;
3397     }
3398 
3399     // testing preceding
3400     for (i = 0; i < expectedcount - 1; i ++) {
3401         // int j = expected[i] + 1;
3402         int j = ustr.moveIndex32(expected[i], 1);
3403         for (; j <= expected[i + 1]; j ++) {
3404             if (bi->preceding(j) != expected[i]) {
3405                 printStringBreaks(ustr, expected, expectedcount);
3406                 test->errln("preceding(): Not expecting boundary at position %d", j);
3407                 return;
3408             }
3409         }
3410     }
3411 }
3412 #endif
3413 
TestWordBreaks(void)3414 void RBBITest::TestWordBreaks(void)
3415 {
3416 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3417 
3418     Locale        locale("en");
3419     UErrorCode    status = U_ZERO_ERROR;
3420     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3421     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3422     // Replaced any C+J characters in a row with a random sequence of characters
3423     // of the same length to make our C+J segmentation not get in the way.
3424     static const char *strlist[] =
3425     {
3426     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3427     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3428     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3429     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3430     "\\uac00\\u3588\\u009c\\u0953\\u194b",
3431     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3432     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3433     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3434     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3435     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3436     "\\u2027\\U000e0067\\u0a47\\u00b7",
3437     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3438     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3439     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3440     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3441     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3442     "\\u0027\\u11af\\U000e0057\\u0602",
3443     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3444     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3445     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3446     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3447     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3448     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3449     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3450     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3451     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3452     "\\u18f4\\U000e0049\\u20e7\\u2027",
3453     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3454     "\\ua183\\u102d\\u0bec\\u003a",
3455     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3456     "\\u003a\\u0e57\\u0fad\\u002e",
3457     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3458     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3459     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3460     "\\u003a\\u0664\\u00b7\\u1fba",
3461     "\\u003b\\u0027\\u00b7\\u47a3",
3462     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3463     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3464     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3465     };
3466     int loop;
3467     if (U_FAILURE(status)) {
3468         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3469         return;
3470     }
3471     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3472         // printf("looping %d\n", loop);
3473         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3474         // RBBICharMonkey monkey;
3475         RBBIWordMonkey monkey;
3476 
3477         int expected[50];
3478         int expectedcount = 0;
3479 
3480         monkey.setText(ustr);
3481         int i;
3482         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3483             expected[expectedcount ++] = i;
3484         }
3485 
3486         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3487     }
3488     delete bi;
3489 #endif
3490 }
3491 
TestWordBoundary(void)3492 void RBBITest::TestWordBoundary(void)
3493 {
3494     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3495     Locale        locale("en");
3496     UErrorCode    status = U_ZERO_ERROR;
3497     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3498     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3499     UChar         str[50];
3500     static const char *strlist[] =
3501     {
3502     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3503     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3504     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3505     "\\u2027\\U000e0067\\u0a47\\u00b7",
3506     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3507     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3508     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3509     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3510     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3511     "\\u0027\\u11af\\U000e0057\\u0602",
3512     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3513     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3514     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3515     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3516     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3517     "\\U000e0065\\u302c\\u09ee\\U000e0068",
3518     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3519     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3520     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3521     "\\u58f4\\U000e0049\\u20e7\\u2027",
3522     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3523     "\\ua183\\u102d\\u0bec\\u003a",
3524     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3525     "\\u003a\\u0e57\\u0fad\\u002e",
3526     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3527     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3528     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3529     "\\u003a\\u0664\\u00b7\\u1fba",
3530     "\\u003b\\u0027\\u00b7\\u47a3",
3531     };
3532     int loop;
3533     if (U_FAILURE(status)) {
3534         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3535         return;
3536     }
3537     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3538         // printf("looping %d\n", loop);
3539         u_unescape(strlist[loop], str, 20);
3540         UnicodeString ustr(str);
3541         int forward[50];
3542         int count = 0;
3543 
3544         bi->setText(ustr);
3545         int prev = 0;
3546         int i;
3547         for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3548             forward[count ++] = i;
3549             if (i > prev) {
3550                 int j;
3551                 for (j = prev + 1; j < i; j ++) {
3552                     if (bi->isBoundary(j)) {
3553                         printStringBreaks(ustr, forward, count);
3554                         errln("happy boundary test failed: expected %d not a boundary",
3555                                j);
3556                         return;
3557                     }
3558                 }
3559             }
3560             if (!bi->isBoundary(i)) {
3561                 printStringBreaks(ustr, forward, count);
3562                 errln("happy boundary test failed: expected %d a boundary",
3563                        i);
3564                 return;
3565             }
3566             prev = i;
3567         }
3568     }
3569     delete bi;
3570 }
3571 
TestLineBreaks(void)3572 void RBBITest::TestLineBreaks(void)
3573 {
3574 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3575     Locale        locale("en");
3576     UErrorCode    status = U_ZERO_ERROR;
3577     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3578     const int32_t  STRSIZE = 50;
3579     UChar         str[STRSIZE];
3580     static const char *strlist[] =
3581     {
3582      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3583      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3584              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3585      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3586              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3587      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3588      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3589      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3590      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3591      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3592      "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
3593      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3594      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3595      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3596      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3597      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3598      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3599      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3600      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3601      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3602      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3603      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3604      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3605      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3606      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3607      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3608      "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
3609      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3610      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3611      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3612      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3613      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3614      "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
3615      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3616      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3617      "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
3618      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3619      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3620      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3621      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3622      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3623      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3624          "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
3625          "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
3626          "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
3627      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3628          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3629     };
3630     int loop;
3631     TEST_ASSERT_SUCCESS(status);
3632     if (U_FAILURE(status)) {
3633         return;
3634     }
3635     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3636         // printf("looping %d\n", loop);
3637         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3638         if (t >= STRSIZE) {
3639             TEST_ASSERT(FALSE);
3640             continue;
3641         }
3642 
3643 
3644         UnicodeString ustr(str);
3645         RBBILineMonkey monkey;
3646         if (U_FAILURE(monkey.deferredStatus)) {
3647             continue;
3648         }
3649 
3650         const int EXPECTEDSIZE = 50;
3651         int expected[EXPECTEDSIZE];
3652         int expectedcount = 0;
3653 
3654         monkey.setText(ustr);
3655         int i;
3656         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3657             if (expectedcount >= EXPECTEDSIZE) {
3658                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3659                 return;
3660             }
3661             expected[expectedcount ++] = i;
3662         }
3663 
3664         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3665     }
3666     delete bi;
3667 #endif
3668 }
3669 
TestSentBreaks(void)3670 void RBBITest::TestSentBreaks(void)
3671 {
3672 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3673     Locale        locale("en");
3674     UErrorCode    status = U_ZERO_ERROR;
3675     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3676     UChar         str[200];
3677     static const char *strlist[] =
3678     {
3679      "Now\ris\nthe\r\ntime\n\rfor\r\r",
3680      "This\n",
3681      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3682      "\"Sentence ending with a quote.\" Bye.",
3683      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
3684      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3685      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3686      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3687      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3688      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3689      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3690              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3691              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3692              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3693      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3694              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3695              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3696              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3697              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3698              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3699     };
3700     int loop;
3701     if (U_FAILURE(status)) {
3702         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3703         return;
3704     }
3705     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3706         u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
3707         UnicodeString ustr(str);
3708 
3709         RBBISentMonkey monkey;
3710         if (U_FAILURE(monkey.deferredStatus)) {
3711             continue;
3712         }
3713 
3714         const int EXPECTEDSIZE = 50;
3715         int expected[EXPECTEDSIZE];
3716         int expectedcount = 0;
3717 
3718         monkey.setText(ustr);
3719         int i;
3720         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3721             if (expectedcount >= EXPECTEDSIZE) {
3722                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3723                 return;
3724             }
3725             expected[expectedcount ++] = i;
3726         }
3727 
3728         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3729     }
3730     delete bi;
3731 #endif
3732 }
3733 
TestMonkey(char * params)3734 void RBBITest::TestMonkey(char *params) {
3735 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3736 
3737     UErrorCode     status    = U_ZERO_ERROR;
3738     int32_t        loopCount = 500;
3739     int32_t        seed      = 1;
3740     UnicodeString  breakType = "all";
3741     Locale         locale("en");
3742     UBool          useUText  = FALSE;
3743 
3744     if (quick == FALSE) {
3745         loopCount = 10000;
3746     }
3747 
3748     if (params) {
3749         UnicodeString p(params);
3750         loopCount = getIntParam("loop", p, loopCount);
3751         seed      = getIntParam("seed", p, seed);
3752 
3753         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3754         if (m.find()) {
3755             breakType = m.group(1, status);
3756             m.reset();
3757             p = m.replaceFirst("", status);
3758         }
3759 
3760         RegexMatcher u(" *utext", p, 0, status);
3761         if (u.find()) {
3762             useUText = TRUE;
3763             u.reset();
3764             p = u.replaceFirst("", status);
3765         }
3766 
3767 
3768         // m.reset(p);
3769         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3770             // Each option is stripped out of the option string as it is processed.
3771             // All options have been checked.  The option string should have been completely emptied..
3772             char buf[100];
3773             p.extract(buf, sizeof(buf), NULL, status);
3774             buf[sizeof(buf)-1] = 0;
3775             errln("Unrecognized or extra parameter:  %s\n", buf);
3776             return;
3777         }
3778 
3779     }
3780 
3781     if (breakType == "char" || breakType == "all") {
3782         RBBICharMonkey  m;
3783         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3784         if (U_SUCCESS(status)) {
3785             RunMonkey(bi, m, "char", seed, loopCount, useUText);
3786             if (breakType == "all" && useUText==FALSE) {
3787                 // Also run a quick test with UText when "all" is specified
3788                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3789             }
3790         }
3791         else {
3792             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3793         }
3794         delete bi;
3795     }
3796 
3797     if (breakType == "word" || breakType == "all") {
3798         logln("Word Break Monkey Test");
3799         RBBIWordMonkey  m;
3800         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
3801         if (U_SUCCESS(status)) {
3802             RunMonkey(bi, m, "word", seed, loopCount, useUText);
3803         }
3804         else {
3805             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3806         }
3807         delete bi;
3808     }
3809 
3810     if (breakType == "line" || breakType == "all") {
3811         logln("Line Break Monkey Test");
3812         RBBILineMonkey  m;
3813         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
3814         if (loopCount >= 10) {
3815             loopCount = loopCount / 5;   // Line break runs slower than the others.
3816         }
3817         if (U_SUCCESS(status)) {
3818             RunMonkey(bi, m, "line", seed, loopCount, useUText);
3819         }
3820         else {
3821             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3822         }
3823         delete bi;
3824     }
3825 
3826     if (breakType == "sent" || breakType == "all"  ) {
3827         logln("Sentence Break Monkey Test");
3828         RBBISentMonkey  m;
3829         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
3830         if (loopCount >= 10) {
3831             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
3832         }
3833         if (U_SUCCESS(status)) {
3834             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
3835         }
3836         else {
3837             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3838         }
3839         delete bi;
3840     }
3841 
3842 #endif
3843 }
3844 
3845 //
3846 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
3847 //    Parameters:
3848 //       bi      - the break iterator to use
3849 //       mk      - MonkeyKind, abstraction for obtaining expected results
3850 //       name    - Name of test (char, word, etc.) for use in error messages
3851 //       seed    - Seed for starting random number generator (parameter from user)
3852 //       numIterations
3853 //
RunMonkey(BreakIterator * bi,RBBIMonkeyKind & mk,const char * name,uint32_t seed,int32_t numIterations,UBool useUText)3854 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
3855                          int32_t numIterations, UBool useUText) {
3856 
3857 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3858 
3859     const int32_t    TESTSTRINGLEN = 500;
3860     UnicodeString    testText;
3861     int32_t          numCharClasses;
3862     UVector          *chClasses;
3863     int              expected[TESTSTRINGLEN*2 + 1];
3864     int              expectedCount = 0;
3865     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
3866     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
3867     char             reverseBreaks[TESTSTRINGLEN*2+1];
3868     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
3869     char             followingBreaks[TESTSTRINGLEN*2+1];
3870     char             precedingBreaks[TESTSTRINGLEN*2+1];
3871     int              i;
3872     int              loopCount = 0;
3873 
3874     m_seed = seed;
3875 
3876     numCharClasses = mk.charClasses()->size();
3877     chClasses      = mk.charClasses();
3878 
3879     // Check for errors that occured during the construction of the MonkeyKind object.
3880     //  Can't report them where they occured because errln() is a method coming from intlTest,
3881     //  and is not visible outside of RBBITest :-(
3882     if (U_FAILURE(mk.deferredStatus)) {
3883         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
3884         return;
3885     }
3886 
3887     // Verify that the character classes all have at least one member.
3888     for (i=0; i<numCharClasses; i++) {
3889         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
3890         if (s == NULL || s->size() == 0) {
3891             errln("Character Class #%d is null or of zero size.", i);
3892             return;
3893         }
3894     }
3895 
3896     while (loopCount < numIterations || numIterations == -1) {
3897         if (numIterations == -1 && loopCount % 10 == 0) {
3898             // If test is running in an infinite loop, display a periodic tic so
3899             //   we can tell that it is making progress.
3900             fprintf(stderr, ".");
3901         }
3902         // Save current random number seed, so that we can recreate the random numbers
3903         //   for this loop iteration in event of an error.
3904         seed = m_seed;
3905 
3906         // Populate a test string with data.
3907         testText.truncate(0);
3908         for (i=0; i<TESTSTRINGLEN; i++) {
3909             int32_t  aClassNum = m_rand() % numCharClasses;
3910             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
3911             int32_t   charIdx = m_rand() % classSet->size();
3912             UChar32   c = classSet->charAt(charIdx);
3913             if (c < 0) {   // TODO:  deal with sets containing strings.
3914                 errln("c < 0");
3915                 break;
3916             }
3917             testText.append(c);
3918         }
3919 
3920         // Calculate the expected results for this test string.
3921         mk.setText(testText);
3922         memset(expectedBreaks, 0, sizeof(expectedBreaks));
3923         expectedBreaks[0] = 1;
3924         int32_t breakPos = 0;
3925         expectedCount = 0;
3926         for (;;) {
3927             breakPos = mk.next(breakPos);
3928             if (breakPos == -1) {
3929                 break;
3930             }
3931             if (breakPos > testText.length()) {
3932                 errln("breakPos > testText.length()");
3933             }
3934             expectedBreaks[breakPos] = 1;
3935             U_ASSERT(expectedCount<testText.length());
3936             expected[expectedCount ++] = breakPos;
3937         }
3938 
3939         // Find the break positions using forward iteration
3940         memset(forwardBreaks, 0, sizeof(forwardBreaks));
3941         if (useUText) {
3942             UErrorCode status = U_ZERO_ERROR;
3943             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
3944             // testUText = utext_openUnicodeString(testUText, &testText, &status);
3945             bi->setText(testUText, status);
3946             TEST_ASSERT_SUCCESS(status);
3947             utext_close(testUText);   // The break iterator does a shallow clone of the UText
3948                                       //  This UText can be closed immediately, so long as the
3949                                       //  testText string continues to exist.
3950         } else {
3951             bi->setText(testText);
3952         }
3953 
3954         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
3955             if (i < 0 || i > testText.length()) {
3956                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
3957                 break;
3958             }
3959             forwardBreaks[i] = 1;
3960         }
3961 
3962         // Find the break positions using reverse iteration
3963         memset(reverseBreaks, 0, sizeof(reverseBreaks));
3964         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
3965             if (i < 0 || i > testText.length()) {
3966                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
3967                 break;
3968             }
3969             reverseBreaks[i] = 1;
3970         }
3971 
3972         // Find the break positions using isBoundary() tests.
3973         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
3974         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
3975         for (i=0; i<=testText.length(); i++) {
3976             isBoundaryBreaks[i] = bi->isBoundary(i);
3977         }
3978 
3979 
3980         // Find the break positions using the following() function.
3981         // printf(".");
3982         memset(followingBreaks, 0, sizeof(followingBreaks));
3983         int32_t   lastBreakPos = 0;
3984         followingBreaks[0] = 1;
3985         for (i=0; i<testText.length(); i++) {
3986             breakPos = bi->following(i);
3987             if (breakPos <= i ||
3988                 breakPos < lastBreakPos ||
3989                 breakPos > testText.length() ||
3990                 (breakPos > lastBreakPos && lastBreakPos > i)) {
3991                 errln("%s break monkey test: "
3992                     "Out of range value returned by BreakIterator::following().\n"
3993                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
3994                          name, seed, i, breakPos, lastBreakPos);
3995                 break;
3996             }
3997             followingBreaks[breakPos] = 1;
3998             lastBreakPos = breakPos;
3999         }
4000 
4001         // Find the break positions using the preceding() function.
4002         memset(precedingBreaks, 0, sizeof(precedingBreaks));
4003         lastBreakPos = testText.length();
4004         precedingBreaks[testText.length()] = 1;
4005         for (i=testText.length(); i>0; i--) {
4006             breakPos = bi->preceding(i);
4007             if (breakPos >= i ||
4008                 breakPos > lastBreakPos ||
4009                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4010                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4011                 errln("%s break monkey test: "
4012                     "Out of range value returned by BreakIterator::preceding().\n"
4013                     "index=%d;  prev returned %d; lastBreak=%d" ,
4014                     name,  i, breakPos, lastBreakPos);
4015                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4016                     precedingBreaks[i] = 2;   // Forces an error.
4017                 }
4018             } else {
4019                 if (breakPos >= 0) {
4020                     precedingBreaks[breakPos] = 1;
4021                 }
4022                 lastBreakPos = breakPos;
4023             }
4024         }
4025 
4026         // Compare the expected and actual results.
4027         for (i=0; i<=testText.length(); i++) {
4028             const char *errorType = NULL;
4029             if  (forwardBreaks[i] != expectedBreaks[i]) {
4030                 errorType = "next()";
4031             } else if (reverseBreaks[i] != forwardBreaks[i]) {
4032                 errorType = "previous()";
4033             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4034                 errorType = "isBoundary()";
4035             } else if (followingBreaks[i] != expectedBreaks[i]) {
4036                 errorType = "following()";
4037             } else if (precedingBreaks[i] != expectedBreaks[i]) {
4038                 errorType = "preceding()";
4039             }
4040 
4041 
4042             if (errorType != NULL) {
4043                 // Format a range of the test text that includes the failure as
4044                 //  a data item that can be included in the rbbi test data file.
4045 
4046                 // Start of the range is the last point where expected and actual results
4047                 //   both agreed that there was a break position.
4048                 int startContext = i;
4049                 int32_t count = 0;
4050                 for (;;) {
4051                     if (startContext==0) { break; }
4052                     startContext --;
4053                     if (expectedBreaks[startContext] != 0) {
4054                         if (count == 2) break;
4055                         count ++;
4056                     }
4057                 }
4058 
4059                 // End of range is two expected breaks past the start position.
4060                 int endContext = i + 1;
4061                 int ci;
4062                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4063                     for (;;) {
4064                         if (endContext >= testText.length()) {break;}
4065                         if (expectedBreaks[endContext-1] != 0) {
4066                             if (count == 0) break;
4067                             count --;
4068                         }
4069                         endContext ++;
4070                     }
4071                 }
4072 
4073                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4074                 UnicodeString errorText = "<data>";
4075                 /***if (strcmp(errorType, "next()") == 0) {
4076                     startContext = 0;
4077                     endContext = testText.length();
4078 
4079                     printStringBreaks(testText, expected, expectedCount);
4080                 }***/
4081 
4082                 for (ci=startContext; ci<endContext;) {
4083                     UnicodeString hexChars("0123456789abcdef");
4084                     UChar32  c;
4085                     int      bn;
4086                     c = testText.char32At(ci);
4087                     if (ci == i) {
4088                         // This is the location of the error.
4089                         errorText.append("<?>");
4090                     } else if (expectedBreaks[ci] != 0) {
4091                         // This a non-error expected break position.
4092                         errorText.append("\\");
4093                     }
4094                     if (c < 0x10000) {
4095                         errorText.append("\\u");
4096                         for (bn=12; bn>=0; bn-=4) {
4097                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4098                         }
4099                     } else {
4100                         errorText.append("\\U");
4101                         for (bn=28; bn>=0; bn-=4) {
4102                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4103                         }
4104                     }
4105                     ci = testText.moveIndex32(ci, 1);
4106                 }
4107                 errorText.append("\\");
4108                 errorText.append("</data>\n");
4109 
4110                 // Output the error
4111                 char  charErrorTxt[500];
4112                 UErrorCode status = U_ZERO_ERROR;
4113                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4114                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4115                 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4116 
4117                 errln("%s break monkey test error [%s].  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
4118                     name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4119                     errorType, seed, i, charErrorTxt);
4120                 break;
4121             }
4122         }
4123 
4124         loopCount++;
4125     }
4126 #endif
4127 }
4128 
4129 
4130 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
4131 //             This test checks the initial patch,
4132 //             which is to just keep it from crashing.  Correct word boundaries
4133 //             await a proper fix to the dictionary code.
4134 //
TestBug5532(void)4135 void RBBITest::TestBug5532(void)  {
4136    // Text includes a mixture of Thai and Latin.
4137    const unsigned char utf8Data[] = {
4138            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4139            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4140            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4141            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4142            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4143            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4144            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4145            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4146            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4147            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4148            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4149 
4150     UErrorCode status = U_ZERO_ERROR;
4151     UText utext=UTEXT_INITIALIZER;
4152     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4153     TEST_ASSERT_SUCCESS(status);
4154 
4155     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4156     TEST_ASSERT_SUCCESS(status);
4157     if (U_SUCCESS(status)) {
4158         bi->setText(&utext, status);
4159         TEST_ASSERT_SUCCESS(status);
4160 
4161         int32_t breakCount = 0;
4162         int32_t previousBreak = -1;
4163         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4164             // For now, just make sure that the break iterator doesn't hang.
4165             TEST_ASSERT(previousBreak < bi->current());
4166             previousBreak = bi->current();
4167         }
4168         TEST_ASSERT(breakCount > 0);
4169     }
4170     delete bi;
4171     utext_close(&utext);
4172 }
4173 
4174 
4175 //
4176 //  TestDebug    -  A place-holder test for debugging purposes.
4177 //                  For putting in fragments of other tests that can be invoked
4178 //                  for tracing  without a lot of unwanted extra stuff happening.
4179 //
TestDebug(void)4180 void RBBITest::TestDebug(void) {
4181 #if 0
4182     UErrorCode   status = U_ZERO_ERROR;
4183     int pos = 0;
4184     int ruleStatus = 0;
4185 
4186     RuleBasedBreakIterator* bi =
4187        // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4188        // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4189        (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
4190     UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4191     // UnicodeString s("Aaa.  Bcd");
4192     s = s.unescape();
4193     bi->setText(s);
4194     UBool r = bi->isBoundary(8);
4195     printf("%s", r?"true":"false");
4196     return;
4197     pos = bi->last();
4198     do {
4199         // ruleStatus = bi->getRuleStatus();
4200         printf("%d\t%d\n", pos, ruleStatus);
4201         pos = bi->previous();
4202     } while (pos != BreakIterator::DONE);
4203 #endif
4204 }
4205 
TestProperties()4206 void RBBITest::TestProperties() {
4207     UErrorCode errorCode = U_ZERO_ERROR;
4208     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4209     if (!prependSet.isEmpty()) {
4210         errln(
4211             "[:GCB=Prepend:] is not empty any more. "
4212             "Uncomment relevant lines in source/data/brkitr/char.txt and "
4213             "change this test to the opposite condition.");
4214     }
4215 }
4216 
4217 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
4218