• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1  // © 2016 and later: Unicode, Inc. and others.
2  // License & terms of use: http://www.unicode.org/copyright.html
3  /********************************************************************
4   * COPYRIGHT:
5   * Copyright (c) 1999-2016, International Business Machines Corporation and
6   * others. All Rights Reserved.
7   ********************************************************************/
8  /************************************************************************
9  *   Date        Name        Description
10  *   12/15/99    Madhu        Creation.
11  *   01/12/2000  Madhu        Updated for changed API and added new tests
12  ************************************************************************/
13  
14  #include "unicode/utypes.h"
15  #if !UCONFIG_NO_BREAK_ITERATION
16  
17  #include <algorithm>
18  #include <sstream>
19  #include <stdio.h>
20  #include <stdlib.h>
21  #include <string.h>
22  #include <utility>
23  #include <vector>
24  
25  #include "unicode/brkiter.h"
26  #include "unicode/localpointer.h"
27  #include "unicode/numfmt.h"
28  #include "unicode/rbbi.h"
29  #if !UCONFIG_NO_REGULAR_EXPRESSIONS
30  #include "unicode/regex.h"
31  #endif
32  #include "unicode/schriter.h"
33  #include "unicode/uchar.h"
34  #include "unicode/utf16.h"
35  #include "unicode/ucnv.h"
36  #include "unicode/uniset.h"
37  #include "unicode/uscript.h"
38  #include "unicode/ustring.h"
39  #include "unicode/utext.h"
40  #include "unicode/utrace.h"
41  
42  #include "charstr.h"
43  #include "cmemory.h"
44  #include "cstr.h"
45  #include "intltest.h"
46  #include "lstmbe.h"
47  #include "rbbitst.h"
48  #include "rbbidata.h"
49  #include "utypeinfo.h"  // for 'typeid' to work
50  #include "uvector.h"
51  #include "uvectr32.h"
52  
53  
54  #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
55  #include "unicode/filteredbrk.h"
56  #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
57  
// TEST_ASSERT(x)
//   Evaluates the expression x once; when it is false, reports a failure through
//   errln() together with the file name and line number of the expansion site.
//   errln() is called unqualified, so the macro must be expanded where that name
//   is in scope. Execution continues after a failed assertion (no return).
58  #define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
59      if (!(x)) { \
60          errln("Failure in file %s, line %d", __FILE__, __LINE__); \
61      } \
62  } UPRV_BLOCK_MACRO_END
63  
// TEST_ASSERT_SUCCESS(errcode)
//   When U_FAILURE(errcode) is true, reports the failure through errcheckln(),
//   including the file name, the line number and u_errorName(errcode).
//   Note that errcode appears three times in the expansion, so the argument
//   should be a plain variable without side effects. Execution continues after
//   a reported failure; callers that need to stop must test the status themselves.
64  #define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \
65      if (U_FAILURE(errcode)) { \
66          errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
67      } \
68  } UPRV_BLOCK_MACRO_END
69  
// MONKEY_ERROR(msg, fRuleFileName, index, seed)
//   Reports an error through IntlTest::gTest->errln(). The message contains the
//   file and line of the expansion site, msg, the index at which the problem was
//   seen, and a "type=... seed=... loop=1" parameter string built from
//   fRuleFileName and seed that is intended for reproducing the failure.
//   The arguments are passed to a printf-style format: msg and fRuleFileName are
//   formatted with %s, index with %d and seed with %u.
//   NOTE(review): unlike the two macros above, this one expands to a bare { }
//   block rather than UPRV_BLOCK_MACRO_BEGIN/END.
70  #define MONKEY_ERROR(msg, fRuleFileName, index, seed) { \
71      IntlTest::gTest->errln("\n%s:%d %s at index %d. Parameters to reproduce: @\"type=%s seed=%u loop=1\"", \
72                      __FILE__, __LINE__, msg, index, fRuleFileName, seed); \
73  }
74  
75  //---------------------------------------------
76  // runIndexedTest
77  //---------------------------------------------
78  
79  
80  //  Note:  Before adding new tests to this file, check whether the desired test data can
81  //         simply be added to the file testdata/rbbitest.txt.  In most cases it can,
82  //         it's much less work than writing a new test, diagnostic output in the event of failures
83  //         is good, and the test data file is shared with ICU4J, so eventually the test
84  //         will run there as well, without additional effort.
85  
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)86  void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
87  {
88      if (exec) logln("TestSuite RuleBasedBreakIterator: ");
89      fTestParams = params;
90  
91      TESTCASE_AUTO_BEGIN;
92  #if !UCONFIG_NO_FILE_IO
93      TESTCASE_AUTO(TestBug4153072);
94  #endif
95  #if !UCONFIG_NO_FILE_IO
96      TESTCASE_AUTO(TestUnicodeFiles);
97  #endif
98      TESTCASE_AUTO(TestGetAvailableLocales);
99      TESTCASE_AUTO(TestGetDisplayName);
100  #if !UCONFIG_NO_FILE_IO
101      TESTCASE_AUTO(TestEndBehaviour);
102      TESTCASE_AUTO(TestWordBreaks);
103      TESTCASE_AUTO(TestWordBoundary);
104      TESTCASE_AUTO(TestLineBreaks);
105      TESTCASE_AUTO(TestSentBreaks);
106      TESTCASE_AUTO(TestExtended);
107  #endif
108  #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
109      TESTCASE_AUTO(TestMonkey);
110  #endif
111  #if !UCONFIG_NO_FILE_IO
112      TESTCASE_AUTO(TestBug3818);
113  #endif
114      TESTCASE_AUTO(TestDebug);
115  #if !UCONFIG_NO_FILE_IO
116      TESTCASE_AUTO(TestBug5775);
117  #endif
118      TESTCASE_AUTO(TestBug9983);
119      TESTCASE_AUTO(TestDictRules);
120      TESTCASE_AUTO(TestBug5532);
121      TESTCASE_AUTO(TestBug7547);
122      TESTCASE_AUTO(TestBug12797);
123      TESTCASE_AUTO(TestBug12918);
124      TESTCASE_AUTO(TestBug12932);
125      TESTCASE_AUTO(TestEmoji);
126      TESTCASE_AUTO(TestBug12519);
127      TESTCASE_AUTO(TestBug12677);
128      TESTCASE_AUTO(TestTableRedundancies);
129      TESTCASE_AUTO(TestBug13447);
130      TESTCASE_AUTO(TestReverse);
131      TESTCASE_AUTO(TestBug13692);
132      TESTCASE_AUTO(TestDebugRules);
133      TESTCASE_AUTO(Test8BitsTrieWith8BitStateTable);
134      TESTCASE_AUTO(Test8BitsTrieWith16BitStateTable);
135      TESTCASE_AUTO(Test16BitsTrieWith8BitStateTable);
136      TESTCASE_AUTO(Test16BitsTrieWith16BitStateTable);
137      TESTCASE_AUTO(TestTable_8_16_Bits);
138      TESTCASE_AUTO(TestBug13590);
139      TESTCASE_AUTO(TestUnpairedSurrogate);
140      TESTCASE_AUTO(TestLSTMThai);
141      TESTCASE_AUTO(TestLSTMBurmese);
142      TESTCASE_AUTO(TestRandomAccess);
143  
144  #if U_ENABLE_TRACING
145      TESTCASE_AUTO(TestTraceCreateCharacter);
146      TESTCASE_AUTO(TestTraceCreateWord);
147      TESTCASE_AUTO(TestTraceCreateSentence);
148      TESTCASE_AUTO(TestTraceCreateTitle);
149      TESTCASE_AUTO(TestTraceCreateLine);
150      TESTCASE_AUTO(TestTraceCreateLineNormal);
151      TESTCASE_AUTO(TestTraceCreateLineLoose);
152      TESTCASE_AUTO(TestTraceCreateLineStrict);
153      TESTCASE_AUTO(TestTraceCreateLineNormalPhrase);
154      TESTCASE_AUTO(TestTraceCreateLineLoosePhrase);
155      TESTCASE_AUTO(TestTraceCreateLineStrictPhrase);
156      TESTCASE_AUTO(TestTraceCreateLinePhrase);
157      TESTCASE_AUTO(TestTraceCreateBreakEngine);
158  #endif
159  
160      TESTCASE_AUTO_END;
161  }
162  
163  
164  //--------------------------------------------------------------------------------------
165  //
166  //    RBBITest    constructor and destructor
167  //
168  //--------------------------------------------------------------------------------------
169  
RBBITest()170  RBBITest::RBBITest() {
171      fTestParams = NULL;
172  }
173  
174  
~RBBITest()175  RBBITest::~RBBITest() {
176  }
177  
178  
printStringBreaks(UText * tstr,int expected[],int expectedCount)179  static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
180      UErrorCode status = U_ZERO_ERROR;
181      char name[100];
182      printf("code    alpha extend alphanum type word sent line name\n");
183      int nextExpectedIndex = 0;
184      utext_setNativeIndex(tstr, 0);
185      for (int j = 0; j < static_cast<int>(utext_nativeLength(tstr)); j=static_cast<int>(utext_getNativeIndex(tstr))) {
186          if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
187              printf("------------------------------------------------ %d\n", j);
188              ++nextExpectedIndex;
189          }
190  
191          UChar32 c = utext_next32(tstr);
192          u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
193          printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
194                             u_isUAlphabetic(c),
195                             u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
196                             u_isalnum(c),
197                             u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
198                                                    u_charType(c),
199                                                    U_SHORT_PROPERTY_NAME),
200                             u_getPropertyValueName(UCHAR_WORD_BREAK,
201                                                    u_getIntPropertyValue(c,
202                                                            UCHAR_WORD_BREAK),
203                                                    U_SHORT_PROPERTY_NAME),
204                             u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
205                                     u_getIntPropertyValue(c,
206                                             UCHAR_SENTENCE_BREAK),
207                                     U_SHORT_PROPERTY_NAME),
208                             u_getPropertyValueName(UCHAR_LINE_BREAK,
209                                     u_getIntPropertyValue(c,
210                                             UCHAR_LINE_BREAK),
211                                     U_SHORT_PROPERTY_NAME),
212                             name);
213      }
214  }
215  
216  
printStringBreaks(const UnicodeString & ustr,int expected[],int expectedCount)217  static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
218     UErrorCode status = U_ZERO_ERROR;
219     UText *tstr = NULL;
220     tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
221     if (U_FAILURE(status)) {
222         printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
223         return;
224      }
225     printStringBreaks(tstr, expected, expectedCount);
226     utext_close(tstr);
227  }
228  
229  
TestBug3818()230  void RBBITest::TestBug3818() {
231      UErrorCode  status = U_ZERO_ERROR;
232  
233      // Four Thai words...
234      static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
235                                             0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
236      UnicodeString  thaiStr(thaiWordData);
237  
238      BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
239      if (U_FAILURE(status) || bi == NULL) {
240          errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
241          return;
242      }
243      bi->setText(thaiStr);
244  
245      int32_t  startOfSecondWord = bi->following(1);
246      if (startOfSecondWord != 4) {
247          errln("Fail at file %s, line %d expected start of word at 4, got %d",
248              __FILE__, __LINE__, startOfSecondWord);
249      }
250      startOfSecondWord = bi->following(0);
251      if (startOfSecondWord != 4) {
252          errln("Fail at file %s, line %d expected start of word at 4, got %d",
253              __FILE__, __LINE__, startOfSecondWord);
254      }
255      delete bi;
256  }
257  
258  
259  //---------------------------------------------
260  //
261  //     other tests
262  //
263  //---------------------------------------------
264  
TestGetAvailableLocales()265  void RBBITest::TestGetAvailableLocales()
266  {
267      int32_t locCount = 0;
268      const Locale* locList = BreakIterator::getAvailableLocales(locCount);
269  
270      if (locCount == 0)
271          dataerrln("getAvailableLocales() returned an empty list!");
272      // Just make sure that it's returning good memory.
273      int32_t i;
274      for (i = 0; i < locCount; ++i) {
275          logln(locList[i].getName());
276      }
277  }
278  
279  //Testing the BreakIterator::getDisplayName() function
TestGetDisplayName()280  void RBBITest::TestGetDisplayName()
281  {
282      UnicodeString   result;
283  
284      BreakIterator::getDisplayName(Locale::getUS(), result);
285      if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
286          dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
287                  + result);
288  
289      BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
290      if (result != "French (France)")
291          dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
292                  + result);
293  }
294  /**
295   * Test End Behaviour
296   * @bug 4068137
297   */
TestEndBehaviour()298  void RBBITest::TestEndBehaviour()
299  {
300      UErrorCode status = U_ZERO_ERROR;
301      UnicodeString testString("boo.");
302      BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
303      if (U_FAILURE(status))
304      {
305          errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
306          return;
307      }
308      wb->setText(testString);
309  
310      if (wb->first() != 0)
311          errln("Didn't get break at beginning of string.");
312      if (wb->next() != 3)
313          errln("Didn't get break before period in \"boo.\"");
314      if (wb->current() != 4 && wb->next() != 4)
315          errln("Didn't get break at end of string.");
316      delete wb;
317  }
318  /*
319   * @bug 4153072
320   */
TestBug4153072()321  void RBBITest::TestBug4153072() {
322      UErrorCode status = U_ZERO_ERROR;
323      BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
324      if (U_FAILURE(status))
325      {
326          errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
327          return;
328      }
329      UnicodeString str("...Hello, World!...");
330      int32_t begin = 3;
331      int32_t end = str.length() - 3;
332      UBool onBoundary;
333  
334      StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
335      iter->adoptText(textIterator);
336      int index;
337      // Note: with the switch to UText, there is no way to restrict the
338      //       iteration range to begin at an index other than zero.
339      //       String character iterators created with a non-zero bound are
340      //         treated by RBBI as being empty.
341      for (index = -1; index < begin + 1; ++index) {
342          onBoundary = iter->isBoundary(index);
343          if (index == 0?  !onBoundary : onBoundary) {
344              errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
345                              " and begin index = " + begin);
346          }
347      }
348      delete iter;
349  }
350  
351  
352  //
353  // Test for problem reported by Ashok Matoria on 9 July 2007
354  //    One.<kSoftHyphen><kSpace>Two.
355  //
356  //    Sentence break at start (0) and then on calling next() it breaks at
357  //   'T' of "Two". Now, at this point if I do next() and
358  //    then previous(), it breaks at <kSoftHyphen> instead of 'T' of "Two".
359  //
TestBug5775()360  void RBBITest::TestBug5775() {
361      UErrorCode status = U_ZERO_ERROR;
362      BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
363      TEST_ASSERT_SUCCESS(status);
364      if (U_FAILURE(status)) {
365          return;
366      }
367  // Check for status first for better handling of no data errors.
368      TEST_ASSERT(bi != NULL);
369      if (bi == NULL) {
370          return;
371      }
372  
373      UnicodeString s("One.\\u00ad Two.", -1, US_INV);
374      //               01234      56789
375      s = s.unescape();
376      bi->setText(s);
377      int pos = bi->next();
378      TEST_ASSERT(pos == 6);
379      pos = bi->next();
380      TEST_ASSERT(pos == 10);
381      pos = bi->previous();
382      TEST_ASSERT(pos == 6);
383      delete bi;
384  }
385  
386  
387  
388  //------------------------------------------------------------------------------
389  //
390  //   RBBITest::Extended    Run  RBBI Tests from an external test data file
391  //
392  //------------------------------------------------------------------------------
393  
394  struct TestParams {
395      BreakIterator   *bi;                   // Break iterator is set while parsing test source.
396                                             //   Changed out whenever test data changes break type.
397  
398      UnicodeString    dataToBreak;          // Data that is built up while parsing the test.
399      UVector32       *expectedBreaks;       // Expected break positions, matches dataToBreak UnicodeString.
400      UVector32       *srcLine;              // Positions in source file, indexed same as dataToBreak.
401      UVector32       *srcCol;
402  
403      UText           *textToBreak;          // UText, could be UTF8 or UTF16.
404      UVector32       *textMap;              // Map from UTF-16 dataToBreak offsets to UText offsets.
405      CharString       utf8String;           // UTF-8 form of text to break.
406  
TestParamsTestParams407      TestParams(UErrorCode &status) : dataToBreak() {
408          bi               = NULL;
409          expectedBreaks   = new UVector32(status);
410          srcLine          = new UVector32(status);
411          srcCol           = new UVector32(status);
412          textToBreak      = NULL;
413          textMap          = new UVector32(status);
414      }
415  
~TestParamsTestParams416      ~TestParams() {
417          delete bi;
418          delete expectedBreaks;
419          delete srcLine;
420          delete srcCol;
421          utext_close(textToBreak);
422          delete textMap;
423      }
424  
425      int32_t getSrcLine(int32_t bp);
426      int32_t getExpectedBreak(int32_t bp);
427      int32_t getSrcCol(int32_t bp);
428  
429      void setUTF16(UErrorCode &status);
430      void setUTF8(UErrorCode &status);
431  };
432  
433  // Append a UnicodeString to a CharString with UTF-8 encoding.
434  // Substitute any invalid chars.
435  //   Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
CharStringAppend(CharString & dest,const UnicodeString & src,UErrorCode & status)436  static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
437      if (U_FAILURE(status)) {
438          return;
439      }
440      int32_t utf8Length;
441      u_strToUTF8WithSub(NULL, 0, &utf8Length,            // Output Buffer, NULL for preflight.
442                         src.getBuffer(), src.length(),   // UTF-16 data
443                         0xfffd, NULL,                    // Substitution char, number of subs.
444                         &status);
445      if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
446          return;
447      }
448      status = U_ZERO_ERROR;
449      int32_t capacity;
450      char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
451      u_strToUTF8WithSub(buffer, utf8Length, NULL,
452                         src.getBuffer(), src.length(),
453                         0xfffd, NULL, &status);
454      dest.append(buffer, utf8Length, status);
455  }
456  
457  
setUTF16(UErrorCode & status)458  void TestParams::setUTF16(UErrorCode &status) {
459      textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
460      textMap->removeAllElements();
461      for (int32_t i=0; i<dataToBreak.length(); i++) {
462          if (i == dataToBreak.getChar32Start(i)) {
463              textMap->addElement(i, status);
464          } else {
465              textMap->addElement(-1, status);
466          }
467      }
468      textMap->addElement(dataToBreak.length(), status);
469      U_ASSERT(dataToBreak.length() + 1 == textMap->size());
470  }
471  
472  
setUTF8(UErrorCode & status)473  void TestParams::setUTF8(UErrorCode &status) {
474      if (U_FAILURE(status)) {
475          return;
476      }
477      utf8String.clear();
478      CharStringAppend(utf8String, dataToBreak, status);
479      textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
480      if (U_FAILURE(status)) {
481          return;
482      }
483  
484      textMap->removeAllElements();
485      int32_t utf16Index = 0;
486      for (;;) {
487          textMap->addElement(utf16Index, status);
488          UChar32 c32 = utext_current32(textToBreak);
489          if (c32 < 0) {
490              break;
491          }
492          utf16Index += U16_LENGTH(c32);
493          utext_next32(textToBreak);
494          while (textMap->size() < utext_getNativeIndex(textToBreak)) {
495              textMap->addElement(-1, status);
496          }
497      }
498      U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
499  }
500  
501  
getSrcLine(int32_t bp)502  int32_t TestParams::getSrcLine(int32_t bp) {
503      if (bp >= textMap->size()) {
504          bp = textMap->size() - 1;
505      }
506      int32_t i = 0;
507      for(; bp >= 0 ; --bp) {
508          // Move to a character boundary if we are not on one already.
509          i = textMap->elementAti(bp);
510          if (i >= 0) {
511              break;
512          }
513      }
514      return srcLine->elementAti(i);
515  }
516  
517  
getExpectedBreak(int32_t bp)518  int32_t TestParams::getExpectedBreak(int32_t bp) {
519      if (bp >= textMap->size()) {
520          return 0;
521      }
522      int32_t i = textMap->elementAti(bp);
523      int32_t retVal = 0;
524      if (i >= 0) {
525          retVal = expectedBreaks->elementAti(i);
526      }
527      return retVal;
528  }
529  
530  
getSrcCol(int32_t bp)531  int32_t TestParams::getSrcCol(int32_t bp) {
532      if (bp >= textMap->size()) {
533          bp = textMap->size() - 1;
534      }
535      int32_t i = 0;
536      for(; bp >= 0; --bp) {
537          // Move bp to a character boundary if we are not on one already.
538          i = textMap->elementAti(bp);
539          if (i >= 0) {
540              break;
541          }
542      }
543      return srcCol->elementAti(i);
544  }
545  
546  
executeTest(TestParams * t,UErrorCode & status)547  void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
548      int32_t    bp;
549      int32_t    prevBP;
550      int32_t    i;
551  
552      TEST_ASSERT_SUCCESS(status);
553      if (U_FAILURE(status)) {
554          return;
555      }
556  
557      if (t->bi == NULL) {
558          return;
559      }
560  
561      t->bi->setText(t->textToBreak, status);
562      //
563      //  Run the iterator forward
564      //
565      prevBP = -1;
566      for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
567          if (prevBP ==  bp) {
568              // Fail for lack of forward progress.
569              errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
570                  bp, t->getSrcLine(bp), t->getSrcCol(bp));
571              break;
572          }
573  
574          // Check that there we didn't miss an expected break between the last one
575          //  and this one.
576          for (i=prevBP+1; i<bp; i++) {
577              if (t->getExpectedBreak(i) != 0) {
578                  int expected[] = {0, i};
579                  printStringBreaks(t->dataToBreak, expected, 2);
580                  errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
581                        i, t->getSrcLine(i), t->getSrcCol(i));
582              }
583          }
584  
585          // Check that the break we did find was expected
586          if (t->getExpectedBreak(bp) == 0) {
587              int expected[] = {0, bp};
588              printStringBreaks(t->textToBreak, expected, 2);
589              errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
590                  bp, t->getSrcLine(bp), t->getSrcCol(bp));
591          } else {
592              // The break was expected.
593              //   Check that the {nnn} tag value is correct.
594              int32_t expectedTagVal = t->getExpectedBreak(bp);
595              if (expectedTagVal == -1) {
596                  expectedTagVal = 0;
597              }
598              int32_t line = t->getSrcLine(bp);
599              int32_t rs = t->bi->getRuleStatus();
600              if (rs != expectedTagVal) {
601                  errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
602                        "          Actual, Expected status = %4d, %4d",
603                      bp, line, t->getSrcCol(bp), rs, expectedTagVal);
604              }
605          }
606  
607          prevBP = bp;
608      }
609  
610      // Verify that there were no missed expected breaks after the last one found
611      for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
612          if (t->getExpectedBreak(i) != 0) {
613              errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
614                        i, t->getSrcLine(i), t->getSrcCol(i));
615          }
616      }
617  
618      //
619      //  Run the iterator backwards, verify that the same breaks are found.
620      //
621      prevBP = static_cast<int32_t>(utext_nativeLength(t->textToBreak) + 2); // start with a phony value for the last break pos seen.
622      bp = t->bi->last();
623      while (bp != BreakIterator::DONE) {
624          if (prevBP ==  bp) {
625              // Fail for lack of progress.
626              errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
627                  bp, t->getSrcLine(bp), t->getSrcCol(bp));
628              break;
629          }
630  
631          // Check that we didn't miss an expected break between the last one
632          //  and this one.  (UVector returns zeros for index out of bounds.)
633          for (i=prevBP-1; i>bp; i--) {
634              if (t->getExpectedBreak(i) != 0) {
635                  errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
636                        i, t->getSrcLine(i), t->getSrcCol(i));
637              }
638          }
639  
640          // Check that the break we did find was expected
641          if (t->getExpectedBreak(bp) == 0) {
642              errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
643                     bp, t->getSrcLine(bp), t->getSrcCol(bp));
644          } else {
645              // The break was expected.
646              //   Check that the {nnn} tag value is correct.
647              int32_t expectedTagVal = t->getExpectedBreak(bp);
648              if (expectedTagVal == -1) {
649                  expectedTagVal = 0;
650              }
651              int line = t->getSrcLine(bp);
652              int32_t rs = t->bi->getRuleStatus();
653              if (rs != expectedTagVal) {
654                  errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
655                        "          Actual, Expected status = %4d, %4d",
656                      bp, line, t->getSrcCol(bp), rs, expectedTagVal);
657              }
658          }
659  
660          prevBP = bp;
661          bp = t->bi->previous();
662      }
663  
664      // Verify that there were no missed breaks prior to the last one found
665      for (i=prevBP-1; i>=0; i--) {
666          if (t->getExpectedBreak(i) != 0) {
667              errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
668                        i, t->getSrcLine(i), t->getSrcCol(i));
669          }
670      }
671  
672      // Check isBoundary()
673      for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
674          UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
675          UBool boundaryFound    = t->bi->isBoundary(i);
676          if (boundaryExpected != boundaryFound) {
677              errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
678                    "        Expected, Actual= %s, %s",
679                    i, t->getSrcLine(i), t->getSrcCol(i),
680                    boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
681          }
682      }
683  
684      // Check following()
685      for (i=0; i < static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i++) {
686          int32_t actualBreak = t->bi->following(i);
687          int32_t expectedBreak = BreakIterator::DONE;
688          for (int32_t j=i+1; j <= static_cast<int32_t>(utext_nativeLength(t->textToBreak)); j++) {
689              if (t->getExpectedBreak(j) != 0) {
690                  expectedBreak = j;
691                  break;
692              }
693          }
694          if (expectedBreak != actualBreak) {
695              errln("following(%d) incorrect. File line,col= %4d,%4d\n"
696                    "        Expected, Actual= %d, %d",
697                    i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
698          }
699      }
700  
701      // Check preceding()
702      for (i=static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i>=0; i--) {
703          int32_t actualBreak = t->bi->preceding(i);
704          int32_t expectedBreak = BreakIterator::DONE;
705  
706          // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
707          // preceding(trailing byte) will return the index of some preceding code point,
708          // not the lead byte of the current code point, even though that has a smaller index.
709          // Therefore, start looking at the expected break data not at i-1, but at
710          // the start of code point index - 1.
711          utext_setNativeIndex(t->textToBreak, i);
712          int32_t j = static_cast<int32_t>(utext_getNativeIndex(t->textToBreak) - 1);
713          for (; j >= 0; j--) {
714              if (t->getExpectedBreak(j) != 0) {
715                  expectedBreak = j;
716                  break;
717              }
718          }
719          if (expectedBreak != actualBreak) {
720              errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
721                    "        Expected, Actual= %d, %d",
722                    i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
723          }
724      }
725  }
726  
TestExtended()727  void RBBITest::TestExtended() {
728       // The expectations in this test heavily depends on the Thai dictionary.
729       // Therefore, we skip this test under the LSTM configuration.
730       if (skipDictionaryTest()) {
731           return;
732       }
733    // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
734    // data driven test closely entangles filtered and regular data.
735  #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
736      UErrorCode      status  = U_ZERO_ERROR;
737      Locale          locale("");
738  
739      TestParams          tp(status);
740  
741      RegexMatcher      localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
742      if (U_FAILURE(status)) {
743          dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
744      }
745  
746      //
747      //  Open and read the test data file.
748      //
749      const char *testDataDirectory = IntlTest::getSourceTestData(status);
750      CharString testFileName(testDataDirectory, -1, status);
751      testFileName.append("rbbitst.txt", -1, status);
752  
753      int    len;
754      UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
755      if (U_FAILURE(status)) {
756          errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
757          return;
758      }
759  
760      bool skipTest = false; // Skip this test?
761  
762      //
763      //  Put the test data into a UnicodeString
764      //
765      UnicodeString testString(false, testFile, len);
766  
767      enum EParseState{
768          PARSE_COMMENT,
769          PARSE_TAG,
770          PARSE_DATA,
771          PARSE_NUM,
772          PARSE_RULES
773      }
774      parseState = PARSE_TAG;
775  
776      EParseState savedState = PARSE_TAG;
777  
778      int32_t    lineNum  = 1;
779      int32_t    colStart = 0;
780      int32_t    column   = 0;
781      int32_t    charIdx  = 0;
782  
783      int32_t    tagValue = 0;             // The numeric value of a <nnn> tag.
784  
785      UnicodeString       rules;           // Holds rules from a <rules> ... </rules> block
786      int32_t             rulesFirstLine = 0;  // Line number of the start of current <rules> block
787  
788      for (charIdx = 0; charIdx < len; ) {
789          status = U_ZERO_ERROR;
790          UChar  c = testString.charAt(charIdx);
791          charIdx++;
792          if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
793              // treat CRLF as a unit
794              c = u'\n';
795              charIdx++;
796          }
797          if (c == u'\n' || c == u'\r') {
798              lineNum++;
799              colStart = charIdx;
800          }
801          column = charIdx - colStart + 1;
802  
803          switch (parseState) {
804          case PARSE_COMMENT:
805              if (c == u'\n' || c == u'\r') {
806                  parseState = savedState;
807              }
808              break;
809  
810          case PARSE_TAG:
811              {
812              if (c == u'#') {
813                  parseState = PARSE_COMMENT;
814                  savedState = PARSE_TAG;
815                  break;
816              }
817              if (u_isUWhiteSpace(c)) {
818                  break;
819              }
820              if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
821                  delete tp.bi;
822                  tp.bi = BreakIterator::createWordInstance(locale,  status);
823                  skipTest = false;
824                  charIdx += 5;
825                  break;
826              }
827              if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
828                  delete tp.bi;
829                  tp.bi = BreakIterator::createCharacterInstance(locale,  status);
830                  skipTest = false;
831                  charIdx += 5;
832                  break;
833              }
834              if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
835                  delete tp.bi;
836                  tp.bi = BreakIterator::createLineInstance(locale,  status);
837                  skipTest = false;
838                  charIdx += 5;
839                  break;
840              }
841              if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
842                  delete tp.bi;
843                  tp.bi = BreakIterator::createSentenceInstance(locale,  status);
844                  skipTest = false;
845                  charIdx += 5;
846                  break;
847              }
848              if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
849                  delete tp.bi;
850                  tp.bi = BreakIterator::createTitleInstance(locale,  status);
851                  charIdx += 6;
852                  break;
853              }
854  
855              if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
856                  testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
857                  charIdx = testString.indexOf(u'>', charIdx) + 1;
858                  parseState = PARSE_RULES;
859                  rules.remove();
860                  rulesFirstLine = lineNum;
861                  break;
862              }
863  
864              // <locale  loc_name>
865              localeMatcher.reset(testString);
866              if (localeMatcher.lookingAt(charIdx-1, status)) {
867                  UnicodeString localeName = localeMatcher.group(1, status);
868                  char localeName8[100];
869                  localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
870                  locale = Locale::createFromName(localeName8);
871                  charIdx += localeMatcher.group(0, status).length() - 1;
872                  TEST_ASSERT_SUCCESS(status);
873                  break;
874              }
875              if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
876                  parseState = PARSE_DATA;
877                  charIdx += 5;
878                  tp.dataToBreak = "";
879                  tp.expectedBreaks->removeAllElements();
880                  tp.srcCol ->removeAllElements();
881                  tp.srcLine->removeAllElements();
882                  break;
883              }
884  
885              errln("line %d: Tag expected in test file.", lineNum);
886              parseState = PARSE_COMMENT;
887              savedState = PARSE_DATA;
888              goto end_test; // Stop the test.
889              }
890              break;
891  
892          case PARSE_RULES:
893              if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
894                  charIdx += 7;
895                  parseState = PARSE_TAG;
896                  delete tp.bi;
897                  UParseError pe;
898                  tp.bi = new RuleBasedBreakIterator(rules, pe, status);
899                  skipTest = U_FAILURE(status);
900                  if (U_FAILURE(status)) {
901                      errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
902                          rulesFirstLine + pe.line - 1, u_errorName(status));
903                  }
904              } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
905                  charIdx += 10;
906                  parseState = PARSE_TAG;
907                  UErrorCode ec = U_ZERO_ERROR;
908                  UParseError pe;
909                  RuleBasedBreakIterator bi(rules, pe, ec);
910                  if (U_SUCCESS(ec)) {
911                      errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
912                          rulesFirstLine + pe.line - 1);
913                  }
914              } else {
915                  rules.append(c);
916              }
917              break;
918  
919          case PARSE_DATA:
920              if (c == u'•') {
921                  int32_t  breakIdx = tp.dataToBreak.length();
922                  if (tp.expectedBreaks->size() > breakIdx) {
923                      errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
924                            lineNum, column);
925                  }
926                  tp.expectedBreaks->setSize(breakIdx+1);
927                  tp.expectedBreaks->setElementAt(-1, breakIdx);
928                  tp.srcLine->setSize(breakIdx+1);
929                  tp.srcLine->setElementAt(lineNum, breakIdx);
930                  tp.srcCol ->setSize(breakIdx+1);
931                  tp.srcCol ->setElementAt(column, breakIdx);
932                  break;
933              }
934  
935              if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
936                  // Add final entry to mappings from break location to source file position.
937                  //  Need one extra because last break position returned is after the
938                  //    last char in the data, not at the last char.
939                  tp.srcLine->addElement(lineNum, status);
940                  tp.srcCol ->addElement(column, status);
941  
942                  parseState = PARSE_TAG;
943                  charIdx += 6;
944  
945                  if (!skipTest) {
946                      // RUN THE TEST!
947                      status = U_ZERO_ERROR;
948                      tp.setUTF16(status);
949                      executeTest(&tp, status);
950                      TEST_ASSERT_SUCCESS(status);
951  
952                      // Run again, this time with UTF-8 text wrapped in a UText.
953                      status = U_ZERO_ERROR;
954                      tp.setUTF8(status);
955                      TEST_ASSERT_SUCCESS(status);
956                      executeTest(&tp, status);
957                  }
958                  break;
959              }
960  
961              if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
962                  // Named character, e.g. \N{COMBINING GRAVE ACCENT}
963                  // Get the code point from the name and insert it into the test data.
964                  //   (Damn, no API takes names in Unicode  !!!
965                  //    we've got to take it back to char *)
966                  int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
967                  int32_t nameLength = nameEndIdx - (charIdx+2);
968                  char charNameBuf[200];
969                  UChar32 theChar = -1;
970                  if (nameEndIdx != -1) {
971                      UErrorCode status = U_ZERO_ERROR;
972                      testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
973                      charNameBuf[sizeof(charNameBuf)-1] = 0;
974                      theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
975                      if (U_FAILURE(status)) {
976                          theChar = -1;
977                      }
978                  }
979                  if (theChar == -1) {
980                      errln("Error in named character in test file at line %d, col %d",
981                          lineNum, column);
982                  } else {
983                      // Named code point was recognized.  Insert it
984                      //   into the test data.
985                      tp.dataToBreak.append(theChar);
986                      while (tp.dataToBreak.length() > tp.srcLine->size()) {
987                          tp.srcLine->addElement(lineNum, status);
988                          tp.srcCol ->addElement(column, status);
989                      }
990                  }
991                  if (nameEndIdx > charIdx) {
992                      charIdx = nameEndIdx+1;
993  
994                  }
995                  break;
996              }
997  
998  
999  
1000              if (testString.compare(charIdx-1, 2, u"<>") == 0) {
1001                  charIdx++;
1002                  int32_t  breakIdx = tp.dataToBreak.length();
1003                  tp.expectedBreaks->setSize(breakIdx+1);
1004                  tp.expectedBreaks->setElementAt(-1, breakIdx);
1005                  tp.srcLine->setSize(breakIdx+1);
1006                  tp.srcLine->setElementAt(lineNum, breakIdx);
1007                  tp.srcCol ->setSize(breakIdx+1);
1008                  tp.srcCol ->setElementAt(column, breakIdx);
1009                  break;
1010              }
1011  
1012              if (c == u'<') {
1013                  tagValue   = 0;
1014                  parseState = PARSE_NUM;
1015                  break;
1016              }
1017  
1018              if (c == u'#' && column==3) {   // TODO:  why is column off so far?
1019                  parseState = PARSE_COMMENT;
1020                  savedState = PARSE_DATA;
1021                  break;
1022              }
1023  
1024              if (c == u'\\') {
1025                  // Check for \ at end of line, a line continuation.
1026                  //     Advance over (discard) the newline
1027                  UChar32 cp = testString.char32At(charIdx);
1028                  if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
1029                      // We have a CR LF
1030                      //  Need an extra increment of the input ptr to move over both of them
1031                      charIdx++;
1032                  }
1033                  if (cp == u'\n' || cp == u'\r') {
1034                      lineNum++;
1035                      colStart = charIdx;
1036                      charIdx++;
1037                      break;
1038                  }
1039  
1040                  // Let unescape handle the back slash.
1041                  cp = testString.unescapeAt(charIdx);
1042                  if (cp != -1) {
1043                      // Escape sequence was recognized.  Insert the char
1044                      //   into the test data.
1045                      tp.dataToBreak.append(cp);
1046                      while (tp.dataToBreak.length() > tp.srcLine->size()) {
1047                          tp.srcLine->addElement(lineNum, status);
1048                          tp.srcCol ->addElement(column, status);
1049                      }
1050                      break;
1051                  }
1052  
1053  
1054                  // Not a recognized backslash escape sequence.
1055                  // Take the next char as a literal.
1056                  //  TODO:  Should this be an error?
1057                  c = testString.charAt(charIdx);
1058                  charIdx = testString.moveIndex32(charIdx, 1);
1059              }
1060  
1061              // Normal, non-escaped data char.
1062              tp.dataToBreak.append(c);
1063  
1064              // Save the mapping from offset in the data to line/column numbers in
1065              //   the original input file.  Will be used for better error messages only.
1066              //   If there's an expected break before this char, the slot in the mapping
1067              //     vector will already be set for this char; don't overwrite it.
1068              if (tp.dataToBreak.length() > tp.srcLine->size()) {
1069                  tp.srcLine->addElement(lineNum, status);
1070                  tp.srcCol ->addElement(column, status);
1071              }
1072              break;
1073  
1074  
1075          case PARSE_NUM:
1076              // We are parsing an expected numeric tag value, like <1234>,
1077              //   within a chunk of data.
1078              if (u_isUWhiteSpace(c)) {
1079                  break;
1080              }
1081  
1082              if (c == u'>') {
1083                  // Finished the number.  Add the info to the expected break data,
1084                  //   and switch parse state back to doing plain data.
1085                  parseState = PARSE_DATA;
1086                  if (tagValue == 0) {
1087                      tagValue = -1;
1088                  }
1089                  int32_t  breakIdx = tp.dataToBreak.length();
1090                  if (tp.expectedBreaks->size() > breakIdx) {
1091                      errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
1092                            lineNum, column);
1093                  }
1094                  tp.expectedBreaks->setSize(breakIdx+1);
1095                  tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1096                  tp.srcLine->setSize(breakIdx+1);
1097                  tp.srcLine->setElementAt(lineNum, breakIdx);
1098                  tp.srcCol ->setSize(breakIdx+1);
1099                  tp.srcCol ->setElementAt(column, breakIdx);
1100                  break;
1101              }
1102  
1103              if (u_isdigit(c)) {
1104                  tagValue = tagValue*10 + u_charDigitValue(c);
1105                  break;
1106              }
1107  
1108              errln("Syntax Error in test file at line %d, col %d",
1109                  lineNum, column);
1110              parseState = PARSE_COMMENT;
1111              goto end_test; // Stop the test
1112              break;
1113          }
1114  
1115  
1116          if (U_FAILURE(status)) {
1117              dataerrln("ICU Error %s while parsing test file at line %d.",
1118                  u_errorName(status), lineNum);
1119              status = U_ZERO_ERROR;
1120              goto end_test; // Stop the test
1121          }
1122  
1123      }
1124  
1125      // Reached end of test file. Raise an error if parseState indicates that we are
1126      //   within a block that should have been terminated.
1127  
1128      if (parseState == PARSE_RULES) {
1129          errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
1130              lineNum, rulesFirstLine);
1131      }
1132      if (parseState == PARSE_DATA) {
1133          errln("rbbitst.txt:%d <data> block not closed.", lineNum);
1134      }
1135  
1136  
1137  end_test:
1138      delete [] testFile;
1139  #endif
1140  }
1141  
1142  //-------------------------------------------------------------------------------
1143  //
1144  //  TestDictRules   create a break iterator from source rules that includes a
1145  //                  dictionary range.   Regression for bug #7130.  Source rules
//                  do not declare a break iterator type (word, line, sentence, etc.),
//                  but the dictionary code, without a type, would loop.
1148  //
1149  //-------------------------------------------------------------------------------
TestDictRules()1150  void RBBITest::TestDictRules() {
1151      const char *rules =  "$dictionary = [a-z]; \n"
1152                           "!!forward; \n"
1153                           "$dictionary $dictionary; \n"
1154                           "!!reverse; \n"
1155                           "$dictionary $dictionary; \n";
1156      const char *text = "aa";
1157      UErrorCode status = U_ZERO_ERROR;
1158      UParseError parseError;
1159  
1160      RuleBasedBreakIterator bi(rules, parseError, status);
1161      if (U_SUCCESS(status)) {
1162          UnicodeString utext = text;
1163          bi.setText(utext);
1164          int32_t position;
1165          int32_t loops;
1166          for (loops = 0; loops<10; loops++) {
1167              position = bi.next();
1168              if (position == RuleBasedBreakIterator::DONE) {
1169                  break;
1170              }
1171          }
1172          TEST_ASSERT(loops == 1);
1173      } else {
1174          dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1175      }
1176  }
1177  
1178  
1179  
1180  //--------------------------------------------------------------------------------------------
1181  //
1182  //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1183  //
1184  //-------------------------------------------------------------------------------------------
TestUnicodeFiles()1185  void RBBITest::TestUnicodeFiles() {
1186      RuleBasedBreakIterator  *bi;
1187      UErrorCode               status = U_ZERO_ERROR;
1188  
1189      bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1190      TEST_ASSERT_SUCCESS(status);
1191      if (U_SUCCESS(status)) {
1192          runUnicodeTestData("GraphemeBreakTest.txt", bi);
1193      }
1194      delete bi;
1195  
1196      bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1197      TEST_ASSERT_SUCCESS(status);
1198      if (U_SUCCESS(status)) {
1199          runUnicodeTestData("WordBreakTest.txt", bi);
1200      }
1201      delete bi;
1202  
1203      bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1204      TEST_ASSERT_SUCCESS(status);
1205      if (U_SUCCESS(status)) {
1206          runUnicodeTestData("SentenceBreakTest.txt", bi);
1207      }
1208      delete bi;
1209  
1210      bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1211      TEST_ASSERT_SUCCESS(status);
1212      if (U_SUCCESS(status)) {
1213          runUnicodeTestData("LineBreakTest.txt", bi);
1214      }
1215      delete bi;
1216  }
1217  
1218  
1219  // Check for test cases from the Unicode test data files that are known to fail
1220  // and should be skipped as known issues because ICU does not fully implement
1221  // the Unicode specifications, or because ICU includes tailorings that differ from
1222  // the Unicode standard.
1223  //
1224  // Test cases are identified by the test data sequence, which tends to be more stable
1225  // across Unicode versions than the test file line numbers.
1226  //
1227  // The test case with ticket "10666" is a dummy, included as an example.
1228  
testCaseIsKnownIssue(const UnicodeString & testCase,const char * fileName)1229  UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1230      static struct TestCase {
1231          const char *fTicketNum;
1232          const char *fFileName;
1233          const UChar *fString;
1234      } badTestCases[] = {
1235          {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"},    // Fake example, for illustration.
1236          // The following tests were originally for
1237          // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
1238          // However, that ticket has been closed as fixed but these tests still fail, so
1239          // ICU-21097 has been created to investigate and address these remaining issues.
1240          {"21097",  "LineBreakTest.txt", u"-#"},
1241          {"21097",  "LineBreakTest.txt", u"\u002d\u0308\u0023"},
1242          {"21097",  "LineBreakTest.txt", u"\u002d\u00a7"},
1243          {"21097",  "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
1244          {"21097",  "LineBreakTest.txt", u"\u002d\U00050005"},
1245          {"21097",  "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
1246          {"21097",  "LineBreakTest.txt", u"\u002d\u0e01"},
1247          {"21097",  "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
1248  
1249          // The following tests were originally for
1250          // Issue ICU-12017 Improve line break around numbers.
1251          // However, that ticket has been closed as fixed but these tests still fail, so
1252          // ICU-21097 has been created to investigate and address these remaining issues.
1253          {"21097", "LineBreakTest.txt", u"\u002C\u0030"},   // ",0"
1254          {"21097", "LineBreakTest.txt", u"\u002C\u0308\u0030"},
1255          {"21097", "LineBreakTest.txt", u"equals .35 cents"},
1256          {"21097", "LineBreakTest.txt", u"a.2 "},
1257          {"21097", "LineBreakTest.txt", u"a.2 \u0915"},
1258          {"21097", "LineBreakTest.txt", u"a.2 \u672C"},
1259          {"21097", "LineBreakTest.txt", u"a.2\u3000\u672C"},
1260          {"21097", "LineBreakTest.txt", u"a.2\u3000\u307E"},
1261          {"21097", "LineBreakTest.txt", u"a.2\u3000\u0033"},
1262          {"21097", "LineBreakTest.txt", u"A.1 \uBABB"},
1263          {"21097", "LineBreakTest.txt", u"\uBD24\uC5B4\u002E\u0020\u0041\u002E\u0032\u0020\uBCFC"},
1264          {"21097", "LineBreakTest.txt", u"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"},
1265          {"21097", "LineBreakTest.txt", u"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"},
1266          {"21097", "LineBreakTest.txt", u"a.2\u3000\u300C"},
1267  
1268          // ICU-22127 until UAX #29 wordbreak is update for the colon changes in ICU-22112,
1269          // need to skip some tests in WordBreakTest.txt
1270          {"22127", "WordBreakTest.txt", u"a:"},
1271          {"22127", "WordBreakTest.txt", u"A:"},
1272      };
1273  
1274      for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
1275          const TestCase &badCase = badTestCases[n];
1276          if (!strcmp(fileName, badCase.fFileName) &&
1277                  testCase.startsWith(UnicodeString(badCase.fString))) {
1278              return logKnownIssue(badCase.fTicketNum);
1279          }
1280      }
1281      return false;
1282  }
1283  
1284  
1285  //--------------------------------------------------------------------------------------------
1286  //
1287  //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1288  //
1289  //-------------------------------------------------------------------------------------------
runUnicodeTestData(const char * fileName,RuleBasedBreakIterator * bi)1290  void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1291  #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1292      UErrorCode  status = U_ZERO_ERROR;
1293  
1294      //
1295      //  Open and read the test data file, put it into a UnicodeString.
1296      //
1297      const char *testDataDirectory = IntlTest::getSourceTestData(status);
1298      char testFileName[1000];
1299      if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1300          dataerrln("Can't open test data.  Path too long.");
1301          return;
1302      }
1303      strcpy(testFileName, testDataDirectory);
1304      strcat(testFileName, fileName);
1305  
1306      logln("Opening data file %s\n", fileName);
1307  
1308      int    len;
1309      UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1310      if (status != U_FILE_ACCESS_ERROR) {
1311          TEST_ASSERT_SUCCESS(status);
1312          TEST_ASSERT(testFile != NULL);
1313      }
1314      if (U_FAILURE(status) || testFile == NULL) {
1315          return; /* something went wrong, error already output */
1316      }
1317      UnicodeString testFileAsString(true, testFile, len);
1318  
1319      //
1320      //  Parse the test data file using a regular expression.
1321      //  Each kind of token is recognized in its own capture group; what type of item was scanned
1322      //     is identified by which group had a match.
1323      //
1324      //    Capture Group  #                  1          2            3            4           5
1325      //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1326      //
1327      UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1328      RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1329      UnicodeString   testString;
1330      UVector32       breakPositions(status);
1331      int             lineNumber = 1;
1332      TEST_ASSERT_SUCCESS(status);
1333      if (U_FAILURE(status)) {
1334          return;
1335      }
1336  
1337      //
1338      //  Scan through each test case, building up the string to be broken in testString,
1339      //   and the positions that should be boundaries in the breakPositions vector.
1340      //
1341      int spin = 0;
1342      while (tokenMatcher.find()) {
1343          if(tokenMatcher.hitEnd()) {
1344            /* Shouldn't Happen(TM).  This means we didn't find the symbols we were looking for.
1345               This occurred when the text file was corrupt (wasn't marked as UTF-8)
1346               and caused an infinite loop here on EBCDIC systems!
1347            */
1348            fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1349            //       return;
1350          }
1351          if (tokenMatcher.start(1, status) >= 0) {
1352              // Scanned a divide sign, indicating a break position in the test data.
1353              if (testString.length()>0) {
1354                  breakPositions.addElement(testString.length(), status);
1355              }
1356          }
1357          else if (tokenMatcher.start(2, status) >= 0) {
1358              // Scanned an 'x', meaning no break at this position in the test data
1359              //   Nothing to be done here.
1360              }
1361          else if (tokenMatcher.start(3, status) >= 0) {
1362              // Scanned Hex digits.  Convert them to binary, append to the character data string.
1363              const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1364              int length = hexNumber.length();
1365              if (length<=8) {
1366                  char buf[10];
1367                  hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1368                  UChar32 c = (UChar32)strtol(buf, NULL, 16);
1369                  if (c<=0x10ffff) {
1370                      testString.append(c);
1371                  } else {
1372                      errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1373                         fileName, lineNumber);
1374                  }
1375              } else {
1376                  errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1377                         fileName, lineNumber);
1378               }
1379          }
1380          else if (tokenMatcher.start(4, status) >= 0) {
1381              // Scanned to end of a line, possibly skipping over a comment in the process.
1382              //   If the line from the file contained test data, run the test now.
1383              if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1384                  checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1385              }
1386  
1387              // Clear out this test case.
1388              //    The string and breakPositions vector will be refilled as the next
1389              //       test case is parsed.
1390              testString.remove();
1391              breakPositions.removeAllElements();
1392              lineNumber++;
1393          } else {
1394              // Scanner catchall.  Something unrecognized appeared on the line.
1395              char token[16];
1396              UnicodeString uToken = tokenMatcher.group(0, status);
1397              uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1398              token[sizeof(token)-1] = 0;
1399              errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1400  
1401              // Clean up, in preparation for continuing with the next line.
1402              testString.remove();
1403              breakPositions.removeAllElements();
1404              lineNumber++;
1405          }
1406          TEST_ASSERT_SUCCESS(status);
1407          if (U_FAILURE(status)) {
1408              break;
1409          }
1410      }
1411  
1412      delete [] testFile;
1413   #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1414  }
1415  
1416  //--------------------------------------------------------------------------------------------
1417  //
1418  //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1419  //                            test data files.  Do only a simple, forward-only check -
1420  //                            this test is mostly to check that ICU and the Unicode
1421  //                            data agree with each other.
1422  //
1423  //--------------------------------------------------------------------------------------------
checkUnicodeTestCase(const char * testFileName,int lineNumber,const UnicodeString & testString,UVector32 * breakPositions,RuleBasedBreakIterator * bi)1424  void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1425                           const UnicodeString &testString,   // Text data to be broken
1426                           UVector32 *breakPositions,         // Positions where breaks should be found.
1427                           RuleBasedBreakIterator *bi) {
1428      int32_t pos;                 // Break Position in the test string
1429      int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1430      int32_t expectedPos;         // Expected break position (index into test string)
1431  
1432      bi->setText(testString);
1433      pos = bi->first();
1434      pos = bi->next();
1435  
1436      while (pos != BreakIterator::DONE) {
1437          if (expectedI >= breakPositions->size()) {
1438              errln("Test file \"%s\", line %d, unexpected break found at position %d",
1439                  testFileName, lineNumber, pos);
1440              break;
1441          }
1442          expectedPos = breakPositions->elementAti(expectedI);
1443          if (pos < expectedPos) {
1444              errln("Test file \"%s\", line %d, unexpected break found at position %d",
1445                  testFileName, lineNumber, pos);
1446              break;
1447          }
1448          if (pos > expectedPos) {
1449              errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1450                  testFileName, lineNumber, expectedPos);
1451              break;
1452          }
1453          pos = bi->next();
1454          expectedI++;
1455      }
1456  
1457      if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1458          errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1459              testFileName, lineNumber, breakPositions->elementAti(expectedI));
1460      }
1461  }
1462  
1463  
1464  
1465  #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1466  //---------------------------------------------------------------------------------------
1467  //
1468  //   class RBBIMonkeyKind
1469  //
1470  //      Monkey Test for Break Iteration
1471  //      Abstract interface class.   Concrete derived classes independently
1472  //      implement the break rules for different iterator types.
1473  //
//      The Monkey Test itself doesn't know which type of break iterator it is
1475  //      testing, but works purely in terms of the interface defined here.
1476  //
1477  //---------------------------------------------------------------------------------------
1478  class RBBIMonkeyKind {
1479  public:
1480      // Return a UVector of UnicodeSets, representing the character classes used
1481      //   for this type of iterator.
1482      virtual  UVector  *charClasses() = 0;
1483  
1484      // Set the test text on which subsequent calls to next() will operate
1485      virtual  void      setText(const UnicodeString &s) = 0;
1486  
1487      // Find the next break position, starting from the prev break position, or from zero.
1488      // Return -1 after reaching end of string.
1489      virtual  int32_t   next(int32_t i) = 0;
1490  
1491      // Name of each character class, parallel with charClasses. Used for debugging output
1492      // of characters.
1493      virtual  std::vector<std::string>&     characterClassNames();
1494  
1495      void setAppliedRule(int32_t position, const char* value);
1496  
1497      std::string getAppliedRule(int32_t position);
1498  
1499      virtual ~RBBIMonkeyKind();
1500      UErrorCode deferredStatus;
1501  
1502      std::string classNameFromCodepoint(const UChar32 c);
1503      unsigned int maxClassNameSize();
1504  
1505   protected:
1506       RBBIMonkeyKind();
1507       std::vector<std::string> classNames;
1508       std::vector<std::string> appliedRules;
1509  
1510      // Clear `appliedRules` and fill it with empty strings in the size of test text.
1511      void prepareAppliedRules(int32_t size );
1512  
1513   private:
1514  
1515  };
1516  
RBBIMonkeyKind()1517  RBBIMonkeyKind::RBBIMonkeyKind() {
1518      deferredStatus = U_ZERO_ERROR;
1519  }
1520  
~RBBIMonkeyKind()1521  RBBIMonkeyKind::~RBBIMonkeyKind() {
1522  }
1523  
characterClassNames()1524  std::vector<std::string>& RBBIMonkeyKind::characterClassNames() {
1525      return classNames;
1526  }
1527  
prepareAppliedRules(int32_t size)1528  void RBBIMonkeyKind::prepareAppliedRules(int32_t size) {
1529      // Remove all the information in the `appliedRules`.
1530      appliedRules.clear();
1531      appliedRules.resize(size + 1);
1532  }
1533  
setAppliedRule(int32_t position,const char * value)1534  void RBBIMonkeyKind::setAppliedRule(int32_t position, const char* value) {
1535      appliedRules[position] = value;
1536  }
1537  
getAppliedRule(int32_t position)1538  std::string RBBIMonkeyKind::getAppliedRule(int32_t position){
1539      return appliedRules[position];
1540  }
1541  
classNameFromCodepoint(const UChar32 c)1542  std::string RBBIMonkeyKind::classNameFromCodepoint(const UChar32 c) {
1543      // Simply iterate through charClasses to find character's class
1544      for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1545          UnicodeSet *classSet = (UnicodeSet *)charClasses()->elementAt(aClassNum);
1546          if (classSet->contains(c)) {
1547              return classNames[aClassNum];
1548          }
1549      }
1550      U_ASSERT(false);  // This should not happen.
1551      return "bad class name";
1552  }
1553  
maxClassNameSize()1554  unsigned int RBBIMonkeyKind::maxClassNameSize() {
1555      unsigned int maxSize = 0;
1556      for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1557          auto aClassNumSize = static_cast<unsigned int>(classNames[aClassNum].size());
1558          if (aClassNumSize > maxSize) {
1559              maxSize = aClassNumSize;
1560          }
1561      }
1562      return maxSize;
1563  }
1564  
1565  //----------------------------------------------------------------------------------------
1566  //
1567  //   Random Numbers.  Similar to standard lib rand() and srand()
1568  //                    Not using library to
1569  //                      1.  Get same results on all platforms.
1570  //                      2.  Get access to current seed, to more easily reproduce failures.
1571  //
1572  //---------------------------------------------------------------------------------------
// Current generator state; reading it allows a failing run to be reproduced.
static uint32_t m_seed = 1;

// Advance the generator one step and return a value in the range 0..32767.
static uint32_t m_rand()
{
    // Linear congruential step; uint32_t arithmetic wraps modulo 2^32.
    m_seed = m_seed * 1103515245u + 12345u;
    // Bits 16..30 of the new state: same as (m_seed / 65536) % 32768.
    return (m_seed >> 16) & 0x7FFFu;
}
1580  
1581  
1582  //------------------------------------------------------------------------------------------
1583  //
1584  //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
1585  //                             of RBBIMonkeyKind.
1586  //
1587  //------------------------------------------------------------------------------------------
1588  class RBBICharMonkey: public RBBIMonkeyKind {
1589  public:
1590      RBBICharMonkey();
1591      virtual          ~RBBICharMonkey();
1592      virtual  UVector *charClasses() override;
1593      virtual  void     setText(const UnicodeString &s) override;
1594      virtual  int32_t  next(int32_t i) override;
1595  private:
1596      UVector   *fSets;
1597  
1598      UnicodeSet  *fCRLFSet;
1599      UnicodeSet  *fControlSet;
1600      UnicodeSet  *fExtendSet;
1601      UnicodeSet  *fZWJSet;
1602      UnicodeSet  *fRegionalIndicatorSet;
1603      UnicodeSet  *fPrependSet;
1604      UnicodeSet  *fSpacingSet;
1605      UnicodeSet  *fLSet;
1606      UnicodeSet  *fVSet;
1607      UnicodeSet  *fTSet;
1608      UnicodeSet  *fLVSet;
1609      UnicodeSet  *fLVTSet;
1610      UnicodeSet  *fHangulSet;
1611      UnicodeSet  *fExtendedPictSet;
1612      UnicodeSet  *fViramaSet;
1613      UnicodeSet  *fLinkingConsonantSet;
1614      UnicodeSet  *fExtCccZwjSet;
1615      UnicodeSet  *fAnySet;
1616  
1617      const UnicodeString *fText;
1618  };
1619  
1620  
RBBICharMonkey()1621  RBBICharMonkey::RBBICharMonkey() {
1622      UErrorCode  status = U_ZERO_ERROR;
1623  
1624      fText = NULL;
1625  
1626      fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1627      fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
1628      fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
1629      fZWJSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
1630      fRegionalIndicatorSet =
1631                    new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1632      fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1633      fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1634      fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1635      fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1636      fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1637      fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1638      fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1639      fHangulSet  = new UnicodeSet();
1640      fHangulSet->addAll(*fLSet);
1641      fHangulSet->addAll(*fVSet);
1642      fHangulSet->addAll(*fTSet);
1643      fHangulSet->addAll(*fLVSet);
1644      fHangulSet->addAll(*fLVTSet);
1645  
1646      fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1647      fViramaSet        = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1648                                          "\\p{Indic_Syllabic_Category=Virama}]", status);
1649      fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1650                                          "\\p{Indic_Syllabic_Category=Consonant}]", status);
1651      fExtCccZwjSet     = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status);
1652      fAnySet           = new UnicodeSet(0, 0x10ffff);
1653  
1654      // Create sets of characters, and add the names of the above character sets.
1655      // In each new ICU release, add new names corresponding to the sets above.
1656      fSets             = new UVector(status);
1657  
1658      // Important: Keep class names the same as the class contents.
1659      fSets->addElement(fCRLFSet, status); classNames.push_back("CRLF");
1660      fSets->addElement(fControlSet, status); classNames.push_back("Control");
1661      fSets->addElement(fExtendSet, status); classNames.push_back("Extended");
1662      fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
1663      if (!fPrependSet->isEmpty()) {
1664          fSets->addElement(fPrependSet, status); classNames.push_back("Prepend");
1665      }
1666      fSets->addElement(fSpacingSet, status); classNames.push_back("Spacing");
1667      fSets->addElement(fHangulSet, status); classNames.push_back("Hangul");
1668      fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
1669      fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
1670      fSets->addElement(fViramaSet, status); classNames.push_back("Virama");
1671      fSets->addElement(fLinkingConsonantSet, status); classNames.push_back("LinkingConsonant");
1672      fSets->addElement(fExtCccZwjSet, status); classNames.push_back("ExtCcccZwj");
1673      fSets->addElement(fAnySet, status); classNames.push_back("Any");
1674  
1675      if (U_FAILURE(status)) {
1676          deferredStatus = status;
1677      }
1678  }
1679  
1680  
setText(const UnicodeString & s)1681  void RBBICharMonkey::setText(const UnicodeString &s) {
1682      fText = &s;
1683      prepareAppliedRules(s.length());
1684  }
1685  
1686  
1687  
next(int32_t prevPos)1688  int32_t RBBICharMonkey::next(int32_t prevPos) {
1689      int    p0, p1, p2, p3;    // Indices of the significant code points around the
1690                                //   break position being tested.  The candidate break
1691                                //   location is before p2.
1692  
1693      int     breakPos = -1;
1694  
1695      UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
1696      UChar32 cBase;            // for (X Extend*) patterns, the X character.
1697  
1698      if (U_FAILURE(deferredStatus)) {
1699          return -1;
1700      }
1701  
1702      // Previous break at end of string.  return DONE.
1703      if (prevPos >= fText->length()) {
1704          return -1;
1705      }
1706  
1707      p0 = p1 = p2 = p3 = prevPos;
1708      c3 =  fText->char32At(prevPos);
1709      c0 = c1 = c2 = cBase = 0;
1710      (void)p0;   // suppress set but not used warning.
1711      (void)c0;
1712  
1713      // Loop runs once per "significant" character position in the input text.
1714      for (;;) {
1715          // Move all of the positions forward in the input string.
1716          p0 = p1;  c0 = c1;
1717          p1 = p2;  c1 = c2;
1718          p2 = p3;  c2 = c3;
1719  
1720          // Advance p3 by one codepoint
1721          p3 = fText->moveIndex32(p3, 1);
1722          c3 = fText->char32At(p3);
1723  
1724          if (p1 == p2) {
1725              // Still warming up the loop.  (won't work with zero length strings, but we don't care)
1726              continue;
1727          }
1728  
1729          if (p2 == fText->length()) {
1730              setAppliedRule(p2, "End of String");
1731              break;
1732          }
1733  
1734          //     No Extend or Format characters may appear between the CR and LF,
1735          //     which requires the additional check for p2 immediately following p1.
1736          //
1737          if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
1738            setAppliedRule(p2, "GB3   CR x LF");
1739            continue;
1740          }
1741  
1742          if (fControlSet->contains(c1) ||
1743              c1 == 0x0D ||
1744              c1 == 0x0A)  {
1745            setAppliedRule(p2, "GB4   ( Control | CR | LF ) <break>");
1746            break;
1747          }
1748  
1749          if (fControlSet->contains(c2) ||
1750              c2 == 0x0D ||
1751              c2 == 0x0A)  {
1752              setAppliedRule(p2, "GB5   <break>  ( Control | CR | LF )");
1753              break;
1754          }
1755  
1756          if (fLSet->contains(c1) &&
1757                 (fLSet->contains(c2)  ||
1758                  fVSet->contains(c2)  ||
1759                  fLVSet->contains(c2) ||
1760                  fLVTSet->contains(c2))) {
1761              setAppliedRule(p2, "GB6   L x ( L | V | LV | LVT )");
1762              continue;
1763          }
1764  
1765          if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1766              (fVSet->contains(c2) || fTSet->contains(c2)))  {
1767              setAppliedRule(p2, "GB7    ( LV | V )  x  ( V | T )");
1768              continue;
1769          }
1770  
1771          if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1772              fTSet->contains(c2))  {
1773              setAppliedRule(p2, "GB8   ( LVT | T)  x T");
1774              continue;
1775          }
1776  
1777          if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
1778              if (!fExtendSet->contains(c1)) {
1779                  cBase = c1;
1780              }
1781              setAppliedRule(p2, "GB9   x (Extend | ZWJ)");
1782              continue;
1783          }
1784  
1785          if (fSpacingSet->contains(c2)) {
1786              setAppliedRule(p2, "GB9a  x  SpacingMark");
1787              continue;
1788          }
1789  
1790          if (fPrependSet->contains(c1)) {
1791              setAppliedRule(p2, "GB9b  Prepend x");
1792              continue;
1793          }
1794  
1795          //   Note: Viramas are also included in the ExtCccZwj class.
1796          if (fLinkingConsonantSet->contains(c2)) {
1797              int pi = p1;
1798              bool sawVirama = false;
1799              while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) {
1800                  if (fViramaSet->contains(fText->char32At(pi))) {
1801                      sawVirama = true;
1802                  }
1803                  pi = fText->moveIndex32(pi, -1);
1804              }
1805              if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) {
1806                setAppliedRule(p2, "GB9.3  LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant");
1807                continue;
1808              }
1809          }
1810  
1811          if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
1812            setAppliedRule(p2, "GB11  Extended_Pictographic Extend * ZWJ x Extended_Pictographic");
1813            continue;
1814          }
1815  
1816          //                   Note: The first if condition is a little tricky. We only need to force
1817          //                      a break if there are three or more contiguous RIs. If there are
1818          //                      only two, a break following will occur via other rules, and will include
1819          //                      any trailing extend characters, which is needed behavior.
1820          if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
1821                  && fRegionalIndicatorSet->contains(c2)) {
1822            setAppliedRule(p2, "GB12-13  Regional_Indicator x Regional_Indicator");
1823            break;
1824          }
1825          if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
1826            setAppliedRule(p2, "GB12-13  Regional_Indicator x Regional_Indicator");
1827            continue;
1828          }
1829  
1830          setAppliedRule(p2, "GB999 Any <break> Any");
1831          break;
1832      }
1833  
1834      breakPos = p2;
1835      return breakPos;
1836  }
1837  
1838  
1839  
charClasses()1840  UVector  *RBBICharMonkey::charClasses() {
1841      return fSets;
1842  }
1843  
~RBBICharMonkey()1844  RBBICharMonkey::~RBBICharMonkey() {
1845      delete fSets;
1846      delete fCRLFSet;
1847      delete fControlSet;
1848      delete fExtendSet;
1849      delete fRegionalIndicatorSet;
1850      delete fPrependSet;
1851      delete fSpacingSet;
1852      delete fLSet;
1853      delete fVSet;
1854      delete fTSet;
1855      delete fLVSet;
1856      delete fLVTSet;
1857      delete fHangulSet;
1858      delete fAnySet;
1859      delete fZWJSet;
1860      delete fExtendedPictSet;
1861      delete fViramaSet;
1862      delete fLinkingConsonantSet;
1863      delete fExtCccZwjSet;
1864  }
1865  
1866  //------------------------------------------------------------------------------------------
1867  //
1868  //   class RBBIWordMonkey      Word Break specific implementation
1869  //                             of RBBIMonkeyKind.
1870  //
1871  //------------------------------------------------------------------------------------------
1872  class RBBIWordMonkey: public RBBIMonkeyKind {
1873  public:
1874      RBBIWordMonkey();
1875      virtual          ~RBBIWordMonkey();
1876      virtual  UVector *charClasses() override;
1877      virtual  void     setText(const UnicodeString &s) override;
1878      virtual int32_t   next(int32_t i) override;
1879  private:
1880      UVector      *fSets;
1881  
1882      UnicodeSet  *fCRSet;
1883      UnicodeSet  *fLFSet;
1884      UnicodeSet  *fNewlineSet;
1885      UnicodeSet  *fRegionalIndicatorSet;
1886      UnicodeSet  *fKatakanaSet;
1887      UnicodeSet  *fHebrew_LetterSet;
1888      UnicodeSet  *fALetterSet;
1889      UnicodeSet  *fSingle_QuoteSet;
1890      UnicodeSet  *fDouble_QuoteSet;
1891      UnicodeSet  *fMidNumLetSet;
1892      UnicodeSet  *fMidLetterSet;
1893      UnicodeSet  *fMidNumSet;
1894      UnicodeSet  *fNumericSet;
1895      UnicodeSet  *fFormatSet;
1896      UnicodeSet  *fOtherSet = nullptr;
1897      UnicodeSet  *fExtendSet;
1898      UnicodeSet  *fExtendNumLetSet;
1899      UnicodeSet  *fWSegSpaceSet;
1900      UnicodeSet  *fDictionarySet = nullptr;
1901      UnicodeSet  *fZWJSet;
1902      UnicodeSet  *fExtendedPictSet;
1903  
1904      const UnicodeString  *fText;
1905  };
1906  
1907  
RBBIWordMonkey()1908  RBBIWordMonkey::RBBIWordMonkey()
1909  {
1910      UErrorCode  status = U_ZERO_ERROR;
1911  
1912      fSets            = new UVector(status);
1913  
1914      fCRSet            = new UnicodeSet(u"[\\p{Word_Break = CR}]",           status);
1915      fLFSet            = new UnicodeSet(u"[\\p{Word_Break = LF}]",           status);
1916      fNewlineSet       = new UnicodeSet(u"[\\p{Word_Break = Newline}]",      status);
1917      fKatakanaSet      = new UnicodeSet(u"[\\p{Word_Break = Katakana}]",     status);
1918      fRegionalIndicatorSet =  new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
1919      fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
1920      fALetterSet       = new UnicodeSet(u"[\\p{Word_Break = ALetter} @]", status);
1921      fSingle_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]",    status);
1922      fDouble_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]",    status);
1923      fMidNumLetSet     = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]",    status);
1924      fMidLetterSet     = new UnicodeSet(u"[\\p{Word_Break = MidLetter} - [\\: \\uFE55 \\uFF1A]]",    status);
1925      fMidNumSet        = new UnicodeSet(u"[\\p{Word_Break = MidNum}]",       status);
1926      fNumericSet       = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
1927      fFormatSet        = new UnicodeSet(u"[\\p{Word_Break = Format}]",       status);
1928      fExtendNumLetSet  = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
1929      // There are some sc=Hani characters with WB=Extend.
1930      // The break rules need to pick one or the other because
1931      // Extend overlapping with something else is messy.
1932      // For Unicode 13, we chose to keep U+16FF0 & U+16FF1
1933      // in $Han (for $dictionary) and out of $Extend.
1934      fExtendSet        = new UnicodeSet(u"[\\p{Word_Break = Extend}-[:Hani:]]", status);
1935      fWSegSpaceSet     = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]",    status);
1936  
1937      fZWJSet           = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]",          status);
1938      fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1939      if(U_FAILURE(status)) {
1940          IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
1941          deferredStatus = status;
1942          return;
1943      }
1944  
1945      fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
1946      fDictionarySet->addAll(*fKatakanaSet);
1947      fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
1948  
1949      fALetterSet->removeAll(*fDictionarySet);
1950  
1951      fOtherSet        = new UnicodeSet();
1952      if(U_FAILURE(status)) {
1953          IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
1954          deferredStatus = status;
1955          return;
1956      }
1957  
1958      fOtherSet->complement();
1959      fOtherSet->removeAll(*fCRSet);
1960      fOtherSet->removeAll(*fLFSet);
1961      fOtherSet->removeAll(*fNewlineSet);
1962      fOtherSet->removeAll(*fKatakanaSet);
1963      fOtherSet->removeAll(*fHebrew_LetterSet);
1964      fOtherSet->removeAll(*fALetterSet);
1965      fOtherSet->removeAll(*fSingle_QuoteSet);
1966      fOtherSet->removeAll(*fDouble_QuoteSet);
1967      fOtherSet->removeAll(*fMidLetterSet);
1968      fOtherSet->removeAll(*fMidNumSet);
1969      fOtherSet->removeAll(*fNumericSet);
1970      fOtherSet->removeAll(*fExtendNumLetSet);
1971      fOtherSet->removeAll(*fWSegSpaceSet);
1972      fOtherSet->removeAll(*fFormatSet);
1973      fOtherSet->removeAll(*fExtendSet);
1974      fOtherSet->removeAll(*fRegionalIndicatorSet);
1975      fOtherSet->removeAll(*fZWJSet);
1976      fOtherSet->removeAll(*fExtendedPictSet);
1977  
1978      // Inhibit dictionary characters from being tested at all.
1979      fOtherSet->removeAll(*fDictionarySet);
1980  
1981      // Add classes and their names
1982      fSets->addElement(fCRSet, status); classNames.push_back("CR");
1983      fSets->addElement(fLFSet, status); classNames.push_back("LF");
1984      fSets->addElement(fNewlineSet, status); classNames.push_back("Newline");
1985      fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
1986      fSets->addElement(fHebrew_LetterSet, status); classNames.push_back("Hebrew");
1987      fSets->addElement(fALetterSet, status); classNames.push_back("ALetter");
1988      fSets->addElement(fSingle_QuoteSet, status); classNames.push_back("Single Quote");
1989      fSets->addElement(fDouble_QuoteSet, status); classNames.push_back("Double Quote");
1990      // Omit Katakana from fSets, which omits Katakana characters
1991      // from the test data. They are all in the dictionary set,
1992      // which this (old, to be retired) monkey test cannot handle.
1993      //fSets->addElement(fKatakanaSet, status);
1994  
1995      fSets->addElement(fMidLetterSet, status); classNames.push_back("MidLetter");
1996      fSets->addElement(fMidNumLetSet, status); classNames.push_back("MidNumLet");
1997      fSets->addElement(fMidNumSet, status); classNames.push_back("MidNum");
1998      fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
1999      fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2000      fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2001      fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2002      fSets->addElement(fExtendNumLetSet, status); classNames.push_back("ExtendNumLet");
2003      fSets->addElement(fWSegSpaceSet, status); classNames.push_back("WSegSpace");
2004  
2005      fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
2006      fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
2007  
2008      if (U_FAILURE(status)) {
2009          deferredStatus = status;
2010      }
2011  }
2012  
setText(const UnicodeString & s)2013  void RBBIWordMonkey::setText(const UnicodeString &s) {
2014      fText       = &s;
2015      prepareAppliedRules(s.length());
2016  }
2017  
2018  
next(int32_t prevPos)2019  int32_t RBBIWordMonkey::next(int32_t prevPos) {
2020      int    p0, p1, p2, p3;    // Indices of the significant code points around the
2021                                //   break position being tested.  The candidate break
2022                                //   location is before p2.
2023  
2024      int     breakPos = -1;
2025  
2026      UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2027  
2028      if (U_FAILURE(deferredStatus)) {
2029          return -1;
2030      }
2031  
2032      // Prev break at end of string.  return DONE.
2033      if (prevPos >= fText->length()) {
2034          return -1;
2035      }
2036      p0 = p1 = p2 = p3 = prevPos;
2037      c3 =  fText->char32At(prevPos);
2038      c0 = c1 = c2 = 0;
2039      (void)p0;       // Suppress set but not used warning.
2040  
2041      // Loop runs once per "significant" character position in the input text.
2042      for (;;) {
2043          // Move all of the positions forward in the input string.
2044          p0 = p1;  c0 = c1;
2045          p1 = p2;  c1 = c2;
2046          p2 = p3;  c2 = c3;
2047  
2048          // Advance p3 by    X(Extend | Format)*   Rule 4
2049          //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2050          do {
2051              p3 = fText->moveIndex32(p3, 1);
2052              c3 = fText->char32At(p3);
2053              if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2054                 break;
2055              }
2056          }
2057          while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
2058  
2059  
2060          if (p1 == p2) {
2061              // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2062              continue;
2063          }
2064  
2065          if (p2 == fText->length()) {
2066              // Reached end of string.  Always a break position.
2067              break;
2068          }
2069  
2070          //     No Extend or Format characters may appear between the CR and LF,
2071          //     which requires the additional check for p2 immediately following p1.
2072          //
2073          if (c1==0x0D && c2==0x0A) {
2074            setAppliedRule(p2, "WB3   CR x LF");
2075            continue;
2076          }
2077  
2078          if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2079              setAppliedRule(p2, "WB3a  Break before and after newlines (including CR and LF)");
2080              break;
2081          }
2082          if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2083              setAppliedRule(p2, "WB3a  Break before and after newlines (including CR and LF)");
2084              break;
2085          }
2086  
2087          //              Not ignoring extend chars, so peek into input text to
2088          //              get the potential ZWJ, the character immediately preceding c2.
2089          //              Sloppy UChar32 indexing: p2-1 may reference trail half
2090          //              but char32At will get the full code point.
2091          if (fZWJSet->contains(fText->char32At(p2 - 1)) && fExtendedPictSet->contains(c2)){
2092              setAppliedRule(p2, "WB3c  ZWJ x Extended_Pictographic");
2093              continue;
2094          }
2095  
2096          if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
2097              setAppliedRule(p2, "WB3d  Keep horizontal whitespace together.");
2098              continue;
2099          }
2100  
2101          if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2102              (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2103              setAppliedRule(p2, "WB4   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)");
2104              continue;
2105          }
2106  
2107          if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
2108               (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2109               (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2110              setAppliedRule(p2,
2111                             "WB6   (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter _Letter)");
2112              continue;
2113          }
2114  
2115          if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2116              (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2117              (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2118              setAppliedRule(p2,
2119                             "WB7   (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)");
2120              continue;
2121          }
2122  
2123          if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2124              setAppliedRule(p2, "WB7a  Hebrew_Letter x Single_Quote");
2125              continue;
2126          }
2127  
2128            if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2129              setAppliedRule(p2, "WB7b  Hebrew_Letter x Double_Quote Hebrew_Letter");
2130              continue;
2131          }
2132  
2133          if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2134              setAppliedRule(p2, "WB7c  Hebrew_Letter Double_Quote x Hebrew_Letter");
2135              continue;
2136          }
2137  
2138          if (fNumericSet->contains(c1) &&
2139              fNumericSet->contains(c2)) {
2140              setAppliedRule(p2, "WB8   Numeric x Numeric");
2141              continue;
2142          }
2143  
2144          if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2145              fNumericSet->contains(c2)) {
2146              setAppliedRule(p2, "WB9   (ALetter | Hebrew_Letter) x Numeric");
2147              continue;
2148          }
2149  
2150          if (fNumericSet->contains(c1) &&
2151              (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2152              setAppliedRule(p2, "WB10   Numeric x (ALetter | Hebrew_Letter)");
2153              continue;
2154          }
2155  
2156            if (fNumericSet->contains(c0) &&
2157              (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
2158              fNumericSet->contains(c2)) {
2159              setAppliedRule(p2, "WB11  Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric");
2160              continue;
2161          }
2162  
2163          if (fNumericSet->contains(c1) &&
2164              (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
2165              fNumericSet->contains(c3)) {
2166              setAppliedRule(p2, "WB12  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric");
2167              continue;
2168          }
2169  
2170          //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
2171          //                  all Katakana are handled by the dictionary breaker.
2172          if (fKatakanaSet->contains(c1) &&
2173              fKatakanaSet->contains(c2))  {
2174              setAppliedRule(p2, "WB13  Katakana x Katakana");
2175              continue;
2176          }
2177  
2178          if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2179               fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2180               fExtendNumLetSet->contains(c2)) {
2181              setAppliedRule(p2,
2182                             "WB13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet");
2183              continue;
2184          }
2185  
2186          if (fExtendNumLetSet->contains(c1) &&
2187                  (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2188                   fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
2189              setAppliedRule(p2, "WB13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)");
2190              continue;
2191          }
2192  
2193          if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2194              setAppliedRule(p2, "WB15 - WB17   Group pairs of Regional Indicators.");
2195              break;
2196          }
2197          if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2198              setAppliedRule(p2, "WB15 - WB17   Group pairs of Regional Indicators.");
2199              continue;
2200          }
2201  
2202          setAppliedRule(p2, "WB999");
2203          break;
2204      }
2205  
2206      breakPos = p2;
2207      return breakPos;
2208  }
2209  
2210  
charClasses()2211  UVector  *RBBIWordMonkey::charClasses() {
2212      return fSets;
2213  }
2214  
~RBBIWordMonkey()2215  RBBIWordMonkey::~RBBIWordMonkey() {
2216      delete fSets;
2217      delete fCRSet;
2218      delete fLFSet;
2219      delete fNewlineSet;
2220      delete fKatakanaSet;
2221      delete fHebrew_LetterSet;
2222      delete fALetterSet;
2223      delete fSingle_QuoteSet;
2224      delete fDouble_QuoteSet;
2225      delete fMidNumLetSet;
2226      delete fMidLetterSet;
2227      delete fMidNumSet;
2228      delete fNumericSet;
2229      delete fFormatSet;
2230      delete fExtendSet;
2231      delete fExtendNumLetSet;
2232      delete fWSegSpaceSet;
2233      delete fRegionalIndicatorSet;
2234      delete fDictionarySet;
2235      delete fOtherSet;
2236      delete fZWJSet;
2237      delete fExtendedPictSet;
2238  }
2239  
2240  
2241  
2242  
2243  //------------------------------------------------------------------------------------------
2244  //
2245  //   class RBBISentMonkey      Sentence Break specific implementation
2246  //                             of RBBIMonkeyKind.
2247  //
2248  //------------------------------------------------------------------------------------------
2249  class RBBISentMonkey: public RBBIMonkeyKind {
2250  public:
2251      RBBISentMonkey();
2252      virtual          ~RBBISentMonkey();
2253      virtual  UVector *charClasses() override;
2254      virtual  void     setText(const UnicodeString &s) override;
2255      virtual int32_t   next(int32_t i) override;
2256  private:
2257      int               moveBack(int posFrom);
2258      int               moveForward(int posFrom);
2259      UChar32           cAt(int pos);
2260  
2261      UVector      *fSets;
2262  
2263      UnicodeSet  *fSepSet;
2264      UnicodeSet  *fFormatSet;
2265      UnicodeSet  *fSpSet;
2266      UnicodeSet  *fLowerSet;
2267      UnicodeSet  *fUpperSet;
2268      UnicodeSet  *fOLetterSet;
2269      UnicodeSet  *fNumericSet;
2270      UnicodeSet  *fATermSet;
2271      UnicodeSet  *fSContinueSet;
2272      UnicodeSet  *fSTermSet;
2273      UnicodeSet  *fCloseSet;
2274      UnicodeSet  *fOtherSet;
2275      UnicodeSet  *fExtendSet;
2276  
2277      const UnicodeString  *fText;
2278  };
2279  
RBBISentMonkey()2280  RBBISentMonkey::RBBISentMonkey()
2281  {
2282      UErrorCode  status = U_ZERO_ERROR;
2283  
2284      fSets            = new UVector(status);
2285  
2286      //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2287      //                       set and made into character classes of their own.  For the monkey impl,
2288      //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2289      fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2290      fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2291      fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2292      fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2293      fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2294      fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2295      fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2296      fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2297      fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2298      fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2299      fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2300      fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2301      fOtherSet        = new UnicodeSet();
2302  
2303      if(U_FAILURE(status)) {
2304        deferredStatus = status;
2305        return;
2306      }
2307  
2308      fOtherSet->complement();
2309      fOtherSet->removeAll(*fSepSet);
2310      fOtherSet->removeAll(*fFormatSet);
2311      fOtherSet->removeAll(*fSpSet);
2312      fOtherSet->removeAll(*fLowerSet);
2313      fOtherSet->removeAll(*fUpperSet);
2314      fOtherSet->removeAll(*fOLetterSet);
2315      fOtherSet->removeAll(*fNumericSet);
2316      fOtherSet->removeAll(*fATermSet);
2317      fOtherSet->removeAll(*fSContinueSet);
2318      fOtherSet->removeAll(*fSTermSet);
2319      fOtherSet->removeAll(*fCloseSet);
2320      fOtherSet->removeAll(*fExtendSet);
2321  
2322      fSets->addElement(fSepSet, status); classNames.push_back("Sep");
2323      fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2324      fSets->addElement(fSpSet, status); classNames.push_back("Sp");
2325      fSets->addElement(fLowerSet, status); classNames.push_back("Lower");
2326      fSets->addElement(fUpperSet, status); classNames.push_back("Upper");
2327      fSets->addElement(fOLetterSet, status); classNames.push_back("OLetter");
2328      fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
2329      fSets->addElement(fATermSet, status); classNames.push_back("ATerm");
2330      fSets->addElement(fSContinueSet, status); classNames.push_back("SContinue");
2331      fSets->addElement(fSTermSet, status); classNames.push_back("STerm");
2332      fSets->addElement(fCloseSet, status); classNames.push_back("Close");
2333      fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2334      fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2335  
2336      if (U_FAILURE(status)) {
2337          deferredStatus = status;
2338      }
2339  }
2340  
2341  
2342  
setText(const UnicodeString & s)2343  void RBBISentMonkey::setText(const UnicodeString &s) {
2344      fText       = &s;
2345      prepareAppliedRules(s.length());
2346  }
2347  
charClasses()2348  UVector  *RBBISentMonkey::charClasses() {
2349      return fSets;
2350  }
2351  
2352  //  moveBack()   Find the "significant" code point preceding the index i.
2353  //               Skips over ($Extend | $Format)* .
2354  //
moveBack(int i)2355  int RBBISentMonkey::moveBack(int i) {
2356      if (i <= 0) {
2357          return -1;
2358      }
2359      UChar32   c;
2360      int32_t   j = i;
2361      do {
2362          j = fText->moveIndex32(j, -1);
2363          c = fText->char32At(j);
2364      }
2365      while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2366      return j;
2367  
2368   }
2369  
2370  
moveForward(int i)2371  int RBBISentMonkey::moveForward(int i) {
2372      if (i>=fText->length()) {
2373          return fText->length();
2374      }
2375      UChar32   c;
2376      int32_t   j = i;
2377      do {
2378          j = fText->moveIndex32(j, 1);
2379          c = cAt(j);
2380      }
2381      while (fFormatSet->contains(c) || fExtendSet->contains(c));
2382      return j;
2383  }
2384  
cAt(int pos)2385  UChar32 RBBISentMonkey::cAt(int pos) {
2386      if (pos<0 || pos>=fText->length()) {
2387          return -1;
2388      } else {
2389          return fText->char32At(pos);
2390      }
2391  }
2392  
//  next()   Reference implementation of sentence-boundary finding.
//           Starting at prevPos, the loop walks forward one "significant" code point at a
//           time (moveForward() skips trailing Extend / Format characters) and tests the
//           candidate position p2 against the rule checks in the order written below.
//           Every decision is recorded with setAppliedRule().  A check that forbids a
//           break at p2 does `continue`; one that requires a break does `break`, after
//           which p2 is returned.
//           Returns -1 when deferredStatus holds a failure, or when prevPos is at or
//           beyond the end of the text.
//
next(int32_t prevPos)2393  int32_t RBBISentMonkey::next(int32_t prevPos) {
2394      int    p0, p1, p2, p3;    // Indices of the significant code points around the
2395                                //   break position being tested.  The candidate break
2396                                //   location is before p2.
2397  
2398      int     breakPos = -1;
2399  
2400      UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2401      UChar32 c;
2402  
2403      if (U_FAILURE(deferredStatus)) {
2404          return -1;
2405      }
2406  
2407      // Prev break at end of string.  return DONE.
2408      if (prevPos >= fText->length()) {
2409          return -1;
2410      }
2411      p0 = p1 = p2 = p3 = prevPos;
2412      c3 =  fText->char32At(prevPos);
2413      c0 = c1 = c2 = 0;
2414      (void)p0;     // Suppress set but not used warning.
2415  
2416      // Loop runs once per "significant" character position in the input text.
2417      for (;;) {
2418          // Move all of the positions forward in the input string.
2419          p0 = p1;  c0 = c1;
2420          p1 = p2;  c1 = c2;
2421          p2 = p3;  c2 = c3;
2422  
2423          // Advance p3 by    X(Extend | Format)*   Rule 4
2424          p3 = moveForward(p3);
2425          c3 = cAt(p3);
2426  
              // CR LF only pairs up when the two are directly adjacent (no skipped characters between).
2427          if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2428              setAppliedRule(p2, "SB3   CR x LF");
2429              continue;
2430          }
2431  
2432          if (fSepSet->contains(c1)) {
2433              p2 = p1+1;   // Separators don't combine with Extend or Format.
2434  
2435              setAppliedRule(p2, "SB4   Sep  <break>");
2436              break;
2437          }
2438  
2439          if (p2 >= fText->length()) {
2440              // Reached end of string.  Always a break position.
2441              setAppliedRule(p2, "SB4   Sep  <break>");
2442              break;
2443          }
2444  
2445          if (p2 == prevPos) {
2446              // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2447              setAppliedRule(p2, "SB4   Sep  <break>");
2448              continue;
2449          }
2450  
2451          if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
2452              setAppliedRule(p2, "SB6   ATerm x Numeric");
2453              continue;
2454          }
2455  
2456            if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2457                  fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2458              setAppliedRule(p2, "SB7   (Upper | Lower) ATerm  x  Uppper");
2459              continue;
2460          }
2461  
2462          //           Note:  STerm | ATerm are added to the negated part of the expression by a
2463          //                  note to the Unicode 5.0 documents.
              // Scan backwards from p1 over Sp* and then Close*; SB8 is only considered when
              // that scan lands on an ATerm.
2464          int p8 = p1;
2465          while (fSpSet->contains(cAt(p8))) {
2466              p8 = moveBack(p8);
2467          }
2468          while (fCloseSet->contains(cAt(p8))) {
2469              p8 = moveBack(p8);
2470          }
2471          if (fATermSet->contains(cAt(p8))) {
2472              p8=p2;
                  // Scan forward from p2 to the first code point in the stop set, or to the
                  // end of the text (where cAt() yields -1).
2473              for (;;) {
2474                  c = cAt(p8);
2475                  if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2476                      fLowerSet->contains(c) || fSepSet->contains(c) ||
2477                      fATermSet->contains(c) || fSTermSet->contains(c))  {
2478  
2479                      setAppliedRule(p2,
2480                                     "SB8   ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2481                      break;
2482                  }
2483                  p8 = moveForward(p8);
2484              }
                  // The break is suppressed only when the forward scan stopped on a Lower;
                  // otherwise evaluation falls through to the checks below.
2485              if (fLowerSet->contains(cAt(p8))) {
2486  
2487                  setAppliedRule(p2,
2488                                 "SB8   ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2489                  continue;
2490              }
2491          }
2492  
2493          if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
                  // Same backward scan as above (Sp* then Close*), reusing p8.
2494              p8 = p1;
2495              while (fSpSet->contains(cAt(p8))) {
2496                  p8 = moveBack(p8);
2497              }
2498              while (fCloseSet->contains(cAt(p8))) {
2499                  p8 = moveBack(p8);
2500              }
2501              c = cAt(p8);
2502              if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2503                  setAppliedRule(p2, "SB8a  (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm)");
2504                  continue;
2505              }
2506          }
2507  
              // SB9: scan backwards from p1 over Close* only.
2508          int p9 = p1;
2509          while (fCloseSet->contains(cAt(p9))) {
2510              p9 = moveBack(p9);
2511          }
2512          c = cAt(p9);
2513          if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2514              if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2515  
2516                  setAppliedRule(p2, "SB9  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)");
2517                  continue;
2518              }
2519          }
2520  
              // SB10: scan backwards from p1 over Sp* and then Close*.
2521          int p10 = p1;
2522          while (fSpSet->contains(cAt(p10))) {
2523              p10 = moveBack(p10);
2524          }
2525          while (fCloseSet->contains(cAt(p10))) {
2526              p10 = moveBack(p10);
2527          }
2528          if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2529              if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2530                  setAppliedRule(p2, "SB10  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)");
2531                  continue;
2532              }
2533          }
2534  
              // SB11: step back over at most one Sep, then Sp*, then Close*.
2535          int p11 = p1;
2536          if (fSepSet->contains(cAt(p11))) {
2537              p11 = moveBack(p11);
2538          }
2539          while (fSpSet->contains(cAt(p11))) {
2540              p11 = moveBack(p11);
2541          }
2542          while (fCloseSet->contains(cAt(p11))) {
2543              p11 = moveBack(p11);
2544          }
2545          if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2546            setAppliedRule(p2, "SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>");
2547              break;
2548          }
2549  
              // Nothing above matched: no break here, move on to the next position.
2550          setAppliedRule(p2, "SB12  Any x Any");
2551          continue;
2552      }
2553  
2554      breakPos = p2;
2555      return breakPos;
2556  }
2557  
~RBBISentMonkey()2558  RBBISentMonkey::~RBBISentMonkey() {
2559      delete fSets;
2560      delete fSepSet;
2561      delete fFormatSet;
2562      delete fSpSet;
2563      delete fLowerSet;
2564      delete fUpperSet;
2565      delete fOLetterSet;
2566      delete fNumericSet;
2567      delete fATermSet;
2568      delete fSContinueSet;
2569      delete fSTermSet;
2570      delete fCloseSet;
2571      delete fOtherSet;
2572      delete fExtendSet;
2573  }
2574  
2575  
2576  
2577  //-------------------------------------------------------------------------------------------
2578  //
2579  //  RBBILineMonkey
2580  //
2581  //-------------------------------------------------------------------------------------------
2582  
2583  class RBBILineMonkey: public RBBIMonkeyKind {
2584  public:
2585      RBBILineMonkey();
2586      virtual          ~RBBILineMonkey();
2587      virtual  UVector *charClasses() override;
2588      virtual  void     setText(const UnicodeString &s) override;
2589      virtual  int32_t  next(int32_t i) override;
2590      virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2591  private:
2592      UVector      *fSets;
2593  
2594      UnicodeSet  *fBK;
2595      UnicodeSet  *fCR;
2596      UnicodeSet  *fLF;
2597      UnicodeSet  *fCM;
2598      UnicodeSet  *fNL;
2599      UnicodeSet  *fSG;
2600      UnicodeSet  *fWJ;
2601      UnicodeSet  *fZW;
2602      UnicodeSet  *fGL;
2603      UnicodeSet  *fCB;
2604      UnicodeSet  *fSP;
2605      UnicodeSet  *fB2;
2606      UnicodeSet  *fBA;
2607      UnicodeSet  *fBB;
2608      UnicodeSet  *fHH;
2609      UnicodeSet  *fHY;
2610      UnicodeSet  *fH2;
2611      UnicodeSet  *fH3;
2612      UnicodeSet  *fCL;
2613      UnicodeSet  *fCP;
2614      UnicodeSet  *fEX;
2615      UnicodeSet  *fIN;
2616      UnicodeSet  *fJL;
2617      UnicodeSet  *fJV;
2618      UnicodeSet  *fJT;
2619      UnicodeSet  *fNS;
2620      UnicodeSet  *fOP;
2621      UnicodeSet  *fQU;
2622      UnicodeSet  *fIS;
2623      UnicodeSet  *fNU;
2624      UnicodeSet  *fPO;
2625      UnicodeSet  *fPR;
2626      UnicodeSet  *fSY;
2627      UnicodeSet  *fAI;
2628      UnicodeSet  *fAL;
2629      UnicodeSet  *fCJ;
2630      UnicodeSet  *fHL;
2631      UnicodeSet  *fID;
2632      UnicodeSet  *fRI;
2633      UnicodeSet  *fXX;
2634      UnicodeSet  *fEB;
2635      UnicodeSet  *fEM;
2636      UnicodeSet  *fZWJ;
2637      UnicodeSet  *fOP30;
2638      UnicodeSet  *fCP30;
2639      UnicodeSet  *fExtPictUnassigned;
2640  
2641      BreakIterator        *fCharBI;
2642      const UnicodeString  *fText;
2643      RegexMatcher         *fNumberMatcher;
2644  };
2645  
RBBILineMonkey()2646  RBBILineMonkey::RBBILineMonkey() :
2647      RBBIMonkeyKind(),
2648      fSets(NULL),
2649  
2650      fCharBI(NULL),
2651      fText(NULL),
2652      fNumberMatcher(NULL)
2653  
2654  {
2655      if (U_FAILURE(deferredStatus)) {
2656          return;
2657      }
2658  
2659      UErrorCode  status = U_ZERO_ERROR;
2660  
2661      fSets  = new UVector(status);
2662  
2663      fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2664      fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2665      fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2666      fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2667      fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2668      fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2669      fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2670      fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2671      fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2672      fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2673      fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2674      fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2675      fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2676      fHH    = new UnicodeSet();
2677      fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2678      fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2679      fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2680      fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2681      fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2682      fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2683      fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2684      fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2685      fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2686      fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2687      fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2688      fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2689      fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2690      fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2691      fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2692      fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2693      fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2694      fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2695      fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2696      fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2697      fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2698      fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2699      fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2700      fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2701      fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2702      fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2703      fEB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status);
2704      fEM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
2705      fZWJ   = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
2706      fOP30  = new UnicodeSet(u"[\\p{Line_break=OP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2707      fCP30  = new UnicodeSet(u"[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2708      fExtPictUnassigned = new UnicodeSet(u"[\\p{Extended_Pictographic}&\\p{Cn}]", status);
2709  
2710      if (U_FAILURE(status)) {
2711          deferredStatus = status;
2712          return;
2713      }
2714  
2715      fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
2716      fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
2717      fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
2718  
2719      fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
2720      fCM->addAll(*fZWJ);    // ZWJ behaves as a CM.
2721  
2722      fHH->add(u'\u2010');   // Hyphen, '‐'
2723  
2724      // Sets and names.
2725      fSets->addElement(fBK, status); classNames.push_back("fBK");
2726      fSets->addElement(fCR, status); classNames.push_back("fCR");
2727      fSets->addElement(fLF, status); classNames.push_back("fLF");
2728      fSets->addElement(fCM, status); classNames.push_back("fCM");
2729      fSets->addElement(fNL, status); classNames.push_back("fNL");
2730      fSets->addElement(fWJ, status); classNames.push_back("fWJ");
2731      fSets->addElement(fZW, status); classNames.push_back("fZW");
2732      fSets->addElement(fGL, status); classNames.push_back("fGL");
2733      fSets->addElement(fCB, status); classNames.push_back("fCB");
2734      fSets->addElement(fSP, status); classNames.push_back("fSP");
2735      fSets->addElement(fB2, status); classNames.push_back("fB2");
2736      fSets->addElement(fBA, status); classNames.push_back("fBA");
2737      fSets->addElement(fBB, status); classNames.push_back("fBB");
2738      fSets->addElement(fHY, status); classNames.push_back("fHY");
2739      fSets->addElement(fH2, status); classNames.push_back("fH2");
2740      fSets->addElement(fH3, status); classNames.push_back("fH3");
2741      fSets->addElement(fCL, status); classNames.push_back("fCL");
2742      fSets->addElement(fCP, status); classNames.push_back("fCP");
2743      fSets->addElement(fEX, status); classNames.push_back("fEX");
2744      fSets->addElement(fIN, status); classNames.push_back("fIN");
2745      fSets->addElement(fJL, status); classNames.push_back("fJL");
2746      fSets->addElement(fJT, status); classNames.push_back("fJT");
2747      fSets->addElement(fJV, status); classNames.push_back("fJV");
2748      fSets->addElement(fNS, status); classNames.push_back("fNS");
2749      fSets->addElement(fOP, status); classNames.push_back("fOP");
2750      fSets->addElement(fQU, status); classNames.push_back("fQU");
2751      fSets->addElement(fIS, status); classNames.push_back("fIS");
2752      fSets->addElement(fNU, status); classNames.push_back("fNU");
2753      fSets->addElement(fPO, status); classNames.push_back("fPO");
2754      fSets->addElement(fPR, status); classNames.push_back("fPR");
2755      fSets->addElement(fSY, status); classNames.push_back("fSY");
2756      fSets->addElement(fAI, status); classNames.push_back("fAI");
2757      fSets->addElement(fAL, status); classNames.push_back("fAL");
2758      fSets->addElement(fHL, status); classNames.push_back("fHL");
2759      fSets->addElement(fID, status); classNames.push_back("fID");
2760      fSets->addElement(fRI, status); classNames.push_back("fRI");
2761      fSets->addElement(fSG, status); classNames.push_back("fSG");
2762      fSets->addElement(fEB, status); classNames.push_back("fEB");
2763      fSets->addElement(fEM, status); classNames.push_back("fEM");
2764      fSets->addElement(fZWJ, status); classNames.push_back("fZWJ");
2765      // TODO: fOP30 & fCP30 overlap with plain fOP. Probably OK, but fOP/CP chars will be over-represented.
2766      fSets->addElement(fOP30, status); classNames.push_back("fOP30");
2767      fSets->addElement(fCP30, status); classNames.push_back("fCP30");
2768      fSets->addElement(fExtPictUnassigned, status); classNames.push_back("fExtPictUnassigned");
2769  
2770      const char *rules =
2771              "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
2772              "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
2773              "((\\p{Line_Break=IS})(\\p{Line_Break=CM}|\\u200d)*)?"
2774              "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
2775              "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
2776              "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
2777              "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
2778  
2779      fNumberMatcher = new RegexMatcher(
2780          UnicodeString(rules, -1, US_INV), 0, status);
2781  
2782      fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2783  
2784      if (U_FAILURE(status)) {
2785          deferredStatus = status;
2786      }
2787  
2788  }
2789  
2790  
setText(const UnicodeString & s)2791  void RBBILineMonkey::setText(const UnicodeString &s) {
2792      fText       = &s;
2793      fCharBI->setText(s);
2794      prepareAppliedRules(s.length());
2795      fNumberMatcher->reset(s);
2796  }
2797  
2798  //
2799  //  rule9Adjust
2800  //     Line Break TR rules 9 and 10 implementation.
2801  //     This deals with combining marks and other sequences that
2802  //     that must be treated as if they were something other than what they actually are.
2803  //
2804  //     This is factored out into a separate function because it must be applied twice for
2805  //     each potential break, once to the chars before the position being checked, then
2806  //     again to the text following the possible break.
2807  //
rule9Adjust(int32_t pos,UChar32 * posChar,int32_t * nextPos,UChar32 * nextChar)2808  void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2809      if (pos == -1) {
2810          // Invalid initial position.  Happens during the warmup iteration of the
2811          //   main loop in next().
2812          return;
2813      }
2814  
2815      int32_t  nPos = *nextPos;
2816  
2817      // LB 9  Keep combining sequences together.
2818      // advance over any CM class chars.  Note that Line Break CM is different
2819      // from the normal Grapheme Extend property.
2820      if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2821            *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2822          for (;;) {
2823              *nextChar = fText->char32At(nPos);
2824              if (!fCM->contains(*nextChar)) {
2825                  break;
2826              }
2827              nPos = fText->moveIndex32(nPos, 1);
2828          }
2829      }
2830  
2831  
2832      // LB 9 Treat X CM* as if it were x.
2833      //       No explicit action required.
2834  
2835      // LB 10  Treat any remaining combining mark as AL
2836      if (fCM->contains(*posChar)) {
2837          *posChar = u'A';
2838      }
2839  
2840      // Push the updated nextPos and nextChar back to our caller.
2841      // This only makes a difference if posChar got bigger by consuming a
2842      // combining sequence.
2843      *nextPos  = nPos;
2844      *nextChar = fText->char32At(nPos);
2845  }
2846  
2847  
2848  
next(int32_t startPos)2849  int32_t RBBILineMonkey::next(int32_t startPos) {
2850      UErrorCode status = U_ZERO_ERROR;
2851      int32_t    pos;       //  Index of the char following a potential break position
2852      UChar32    thisChar;  //  Character at above position "pos"
2853  
2854      int32_t    prevPos;   //  Index of the char preceding a potential break position
2855      UChar32    prevChar;  //  Character at above position.  Note that prevChar
2856                            //   and thisChar may not be adjacent because combining
2857                            //   characters between them will be ignored.
2858  
2859      int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
2860      UChar32    prevCharX2;
2861  
2862      int32_t    nextPos;   //  Index of the next character following pos.
2863                            //     Usually skips over combining marks.
2864      int32_t    nextCPPos; //  Index of the code point following "pos."
2865                            //     May point to a combining mark.
2866      int32_t    tPos;      //  temp value.
2867      UChar32    c;
2868  
2869      if (U_FAILURE(deferredStatus)) {
2870          return -1;
2871      }
2872  
2873      if (startPos >= fText->length()) {
2874          return -1;
2875      }
2876  
2877  
2878      // Initial values for loop.  Loop will run the first time without finding breaks,
2879      //                           while the invalid values shift out and the "this" and
2880      //                           "prev" positions are filled in with good values.
2881      pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
2882      thisChar = prevChar  = prevCharX2 = 0;
2883      nextPos  = nextCPPos = startPos;
2884  
2885  
2886      // Loop runs once per position in the test text, until a break position
2887      //  is found.
2888      for (;;) {
2889          prevPosX2 = prevPos;
2890          prevCharX2 = prevChar;
2891  
2892          prevPos   = pos;
2893          prevChar  = thisChar;
2894  
2895          pos       = nextPos;
2896          thisChar  = fText->char32At(pos);
2897  
2898          nextCPPos = fText->moveIndex32(pos, 1);
2899          nextPos   = nextCPPos;
2900  
2901  
2902          if (pos >= fText->length()) {
2903              setAppliedRule(pos, "LB2 - Break at end of text.");
2904              break;
2905          }
2906  
2907  
2908          //             We do this one out-of-order because the adjustment does not change anything
2909          //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
2910          //             be applied.
2911          rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
2912          nextCPPos = nextPos = fText->moveIndex32(pos, 1);
2913          c = fText->char32At(nextPos);
2914          rule9Adjust(pos, &thisChar, &nextPos, &c);
2915  
2916          // If the loop is still warming up - if we haven't shifted the initial
2917          //   -1 positions out of prevPos yet - loop back to advance the
2918          //    position in the input without any further looking for breaks.
2919          if (prevPos == -1) {
2920            setAppliedRule(pos, "LB 9 - adjust for combining sequences.");
2921              continue;
2922          }
2923  
2924  
2925          if (fBK->contains(prevChar)) {
2926              setAppliedRule(pos, "LB 4  Always break after hard line breaks");
2927              break;
2928          }
2929  
2930  
2931          if (prevChar == 0x0d && thisChar == 0x0a) {
2932              setAppliedRule(pos, "LB 5  Break after CR, LF, NL, but not inside CR LF");
2933              continue;
2934          }
2935          if (prevChar == 0x0d ||
2936              prevChar == 0x0a ||
2937              prevChar == 0x85)  {
2938              setAppliedRule(pos, "LB 5  Break after CR, LF, NL, but not inside CR LF");
2939              break;
2940          }
2941  
2942  
2943          if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
2944              fBK->contains(thisChar)) {
2945              setAppliedRule(pos, "LB 6  Don't break before hard line breaks");
2946              continue;
2947          }
2948  
2949  
2950          if (fSP->contains(thisChar)) {
2951              setAppliedRule(pos, "LB 7  Don't break before spaces or zero-width space.");
2952              continue;
2953          }
2954  
2955          // !!! ??? Is this the right text for the applied rule?
2956          if (fZW->contains(thisChar)) {
2957              setAppliedRule(pos, "LB 7  Don't break before spaces or zero-width space.");
2958              continue;
2959          }
2960  
2961  
2962          //       ZW SP* ÷
2963          //       Scan backwards from prevChar for SP* ZW
2964          tPos = prevPos;
2965          while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
2966              tPos = fText->moveIndex32(tPos, -1);
2967          }
2968          if (fZW->contains(fText->char32At(tPos))) {
2969              setAppliedRule(pos, "LB 8  Break after zero width space");
2970              break;
2971          }
2972  
2973  
2974          //          Move this test up, before LB8a, because numbers can match a longer sequence that would
2975          //          also match 8a.  e.g. NU ZWJ IS PO     (ZWJ acts like CM)
2976          if (fNumberMatcher->lookingAt(prevPos, status)) {
2977              if (U_FAILURE(status)) {
2978                  setAppliedRule(pos, "LB 25 Numbers");
2979                  break;
2980              }
2981              // Matched a number.  But could have been just a single digit, which would
2982              //    not represent a "no break here" between prevChar and thisChar
2983              int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
2984              if (numEndIdx > pos) {
2985                  // Number match includes at least our two chars being checked
2986                  if (numEndIdx > nextPos) {
2987                      // Number match includes additional chars.  Update pos and nextPos
2988                      //   so that next loop iteration will continue at the end of the number,
2989                      //   checking for breaks between last char in number & whatever follows.
2990                      pos = nextPos = numEndIdx;
2991                      do {
2992                          pos = fText->moveIndex32(pos, -1);
2993                          thisChar = fText->char32At(pos);
2994                      } while (fCM->contains(thisChar));
2995                  }
2996                  setAppliedRule(pos, "LB 25 Numbers");
2997                  continue;
2998              }
2999          }
3000  
3001  
3002          //       The monkey test's way of ignoring combining characters doesn't work
3003          //       for this rule. ZJ is also a CM. Need to get the actual character
3004          //       preceding "thisChar", not ignoring combining marks, possibly ZJ.
3005          {
3006              int32_t prevIdx = fText->moveIndex32(pos, -1);
3007              UChar32 prevC = fText->char32At(prevIdx);
3008              if (fZWJ->contains(prevC)) {
3009                  setAppliedRule(pos, "LB 8a ZWJ x");
3010                  continue;
3011              }
3012          }
3013  
3014  
3015          // appliedRule: "LB 9, 10"; //  Already done, at top of loop.";
3016          //
3017  
3018  
3019          //    x  WJ
3020          //    WJ  x
3021          //
3022          if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3023              setAppliedRule(pos, "LB 11  Do not break before or after WORD JOINER and related characters.");
3024              continue;
3025          }
3026  
3027  
3028          if (fGL->contains(prevChar)) {
3029              setAppliedRule(pos, "LB 12  GL  x");
3030              continue;
3031          }
3032  
3033  
3034            if (!(fSP->contains(prevChar) ||
3035                fBA->contains(prevChar) ||
3036                fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
3037                setAppliedRule(pos, "LB 12a  [^SP BA HY] x GL");
3038                continue;
3039          }
3040  
3041  
3042          if (fCL->contains(thisChar) ||
3043                  fCP->contains(thisChar) ||
3044                  fEX->contains(thisChar) ||
3045                  fSY->contains(thisChar)) {
3046              setAppliedRule(pos, "LB 13  Don't break before closings.");
3047              continue;
3048          }
3049  
3050  
3051          //       Scan backwards, checking for this sequence.
3052          //       The OP char could include combining marks, so we actually check for
3053          //           OP CM* SP*
3054          //       Another Twist: The Rule 9 fixes may have changed a SP CM
3055          //       sequence into a ID char, so before scanning back through spaces,
3056          //       verify that prevChar is indeed a space.  The prevChar variable
3057          //       may differ from fText[prevPos]
3058          tPos = prevPos;
3059          if (fSP->contains(prevChar)) {
3060              while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3061                  tPos=fText->moveIndex32(tPos, -1);
3062              }
3063          }
3064          while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3065              tPos=fText->moveIndex32(tPos, -1);
3066          }
3067          if (fOP->contains(fText->char32At(tPos))) {
3068              setAppliedRule(pos, "LB 14 Don't break after OP SP*");
3069              continue;
3070          }
3071  
3072  
3073          if (nextPos < fText->length()) {
3074              // note: UnicodeString::char32At(length) returns ffff, not distinguishable
3075              //       from a legit ffff character. So test length separately.
3076              UChar32 nextChar = fText->char32At(nextPos);
3077              if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) {
3078                  setAppliedRule(pos, "LB 14a Break before an IS that begins a number and follows a space");
3079                  break;
3080              }
3081          }
3082  
3083  
3084            if (fIS->contains(thisChar)) {
3085                setAppliedRule(pos, "LB 14b  Do not break before numeric separators, even after spaces.");
3086                continue;
3087          }
3088  
3089  
3090          if (fOP->contains(thisChar)) {
3091              // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3092              int tPos = prevPos;
3093              while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3094                  tPos = fText->moveIndex32(tPos, -1);
3095              }
3096              while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3097                  tPos = fText->moveIndex32(tPos, -1);
3098              }
3099              if (fQU->contains(fText->char32At(tPos))) {
3100                  setAppliedRule(pos, "LB 15    QU SP* x OP");
3101                  continue;
3102              }
3103          }
3104  
3105  
3106          //    Scan backwards for SP* CM* (CL | CP)
3107          if (fNS->contains(thisChar)) {
3108              int tPos = prevPos;
3109              while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3110                  tPos = fText->moveIndex32(tPos, -1);
3111              }
3112              while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3113                  tPos = fText->moveIndex32(tPos, -1);
3114              }
3115              if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3116                  setAppliedRule(pos, "LB 16   (CL | CP) SP* x NS");
3117                  continue;
3118              }
3119          }
3120  
3121  
3122          if (fB2->contains(thisChar)) {
3123              //  Scan backwards, checking for the B2 CM* SP* sequence.
3124              tPos = prevPos;
3125              if (fSP->contains(prevChar)) {
3126                  while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3127                      tPos=fText->moveIndex32(tPos, -1);
3128                  }
3129              }
3130              while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3131                  tPos=fText->moveIndex32(tPos, -1);
3132              }
3133              if (fB2->contains(fText->char32At(tPos))) {
3134                  setAppliedRule(pos, "LB 17   B2 SP* x B2");
3135                  continue;
3136              }
3137          }
3138  
3139  
3140          if (fSP->contains(prevChar)) {
3141              setAppliedRule(pos, "LB 18    break after space");
3142              break;
3143          }
3144  
3145          //    x   QU
3146          //    QU  x
3147          if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3148              setAppliedRule(pos, "LB 19");
3149              continue;
3150          }
3151  
3152          if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3153              setAppliedRule(pos, "LB 20  Break around a CB");
3154              break;
3155          }
3156  
3157          //           Don't break between Hyphens and letters if a break precedes the hyphen.
3158          //           Formerly this was a Finnish tailoring.
3159          //           Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
3160          //           ^($HY | $HH) $AL;
3161          if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
3162                  prevPosX2 == -1) {
3163              setAppliedRule(pos, "LB 20.09");
3164              continue;
3165          }
3166  
3167          if (fBA->contains(thisChar) ||
3168              fHY->contains(thisChar) ||
3169              fNS->contains(thisChar) ||
3170              fBB->contains(prevChar) )   {
3171              setAppliedRule(pos, "LB 21");
3172              continue;
3173          }
3174  
3175          if (fHL->contains(prevCharX2) &&
3176                  (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3177              setAppliedRule(pos, "LB 21a   HL (HY | BA) x");
3178              continue;
3179          }
3180  
3181          if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3182              setAppliedRule(pos, "LB 21b SY x HL");
3183              continue;
3184          }
3185  
3186          if (fIN->contains(thisChar))   {
3187              setAppliedRule(pos, "LB 22");
3188              continue;
3189          }
3190  
3191  
3192          //          (AL | HL) x NU
3193          //          NU x (AL | HL)
3194          if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
3195              setAppliedRule(pos, "LB 23");
3196              continue;
3197          }
3198          if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3199              setAppliedRule(pos, "LB 23");
3200              continue;
3201          }
3202  
3203          // Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3204          //      PR x (ID | EB | EM)
3205          //     (ID | EB | EM) x PO
3206          if (fPR->contains(prevChar) &&
3207                  (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar)))  {
3208              setAppliedRule(pos, "LB 23a");
3209              continue;
3210          }
3211          if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
3212                  fPO->contains(thisChar)) {
3213              setAppliedRule(pos, "LB 23a");
3214              continue;
3215          }
3216  
3217          //   Do not break between prefix and letters or ideographs.
3218          //         (PR | PO) x (AL | HL)
3219          //         (AL | HL) x (PR | PO)
3220          if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
3221                  (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3222              setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3223              continue;
3224          }
3225          if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
3226                  (fPR->contains(thisChar) || fPO->contains(thisChar))) {
3227              setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3228              continue;
3229          }
3230  
3231          // appliedRule: "LB 25 numbers match"; // moved up, before LB 8a,
3232  
3233          if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3234                                          fJV->contains(thisChar) ||
3235                                          fH2->contains(thisChar) ||
3236                                          fH3->contains(thisChar))) {
3237              setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3238              continue;
3239                                          }
3240  
3241          if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3242              (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3243              setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3244              continue;
3245          }
3246  
3247          if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3248              fJT->contains(thisChar)) {
3249              setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3250              continue;
3251          }
3252  
3253          if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3254              fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3255              fPO->contains(thisChar)) {
3256              setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3257              continue;
3258          }
3259          if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3260              fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3261              setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3262              continue;
3263          }
3264  
3265  
3266  
3267          if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3268              setAppliedRule(pos, "LB 28  Do not break between alphabetics (\"at\").");
3269              continue;
3270          }
3271  
3272            if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3273                setAppliedRule(pos, "LB 29  Do not break between numeric punctuation and alphabetics (\"e.g.\").");
3274                continue;
3275          }
3276  
3277          //          (AL | NU) x OP
3278          //          CP x (AL | NU)
3279          if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP30->contains(thisChar)) {
3280              setAppliedRule(pos,  "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3281              continue;
3282          }
3283          if (fCP30->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3284              setAppliedRule(pos,  "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3285              continue;
3286          }
3287  
3288          //             RI  x  RI
3289          if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3290              setAppliedRule(pos, "LB30a    RI RI  ÷  RI");
3291              break;
3292          }
3293          if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3294              // Two Regional Indicators have been paired.
3295              // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
3296              // following RI. This is a hack.
3297              thisChar = -1;
3298              setAppliedRule(pos, "LB30a    RI RI  ÷  RI");
3299              continue;
3300          }
3301  
3302          // LB30b Do not break between an emoji base (or potential emoji) and an emoji modifier.
3303          if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3304              setAppliedRule(pos, "LB30b    Emoji Base x Emoji Modifier");
3305              continue;
3306          }
3307  
3308          if (fExtPictUnassigned->contains(prevChar) && fEM->contains(thisChar)) {
3309              setAppliedRule(pos, "LB30b    [\\p{Extended_Pictographic}&\\p{Cn}] × EM");
3310              continue;
3311          }
3312  
3313          setAppliedRule(pos, "LB 31    Break everywhere else");
3314          break;
3315      }
3316  
3317      return pos;
3318  }
3319  
3320  
charClasses()3321  UVector  *RBBILineMonkey::charClasses() {
3322      return fSets;
3323  }
3324  
3325  
~RBBILineMonkey()3326  RBBILineMonkey::~RBBILineMonkey() {
3327      delete fSets;
3328  
3329      delete fBK;
3330      delete fCR;
3331      delete fLF;
3332      delete fCM;
3333      delete fNL;
3334      delete fWJ;
3335      delete fZW;
3336      delete fGL;
3337      delete fCB;
3338      delete fSP;
3339      delete fB2;
3340      delete fBA;
3341      delete fBB;
3342      delete fHH;
3343      delete fHY;
3344      delete fH2;
3345      delete fH3;
3346      delete fCL;
3347      delete fCP;
3348      delete fEX;
3349      delete fIN;
3350      delete fJL;
3351      delete fJV;
3352      delete fJT;
3353      delete fNS;
3354      delete fOP;
3355      delete fQU;
3356      delete fIS;
3357      delete fNU;
3358      delete fPO;
3359      delete fPR;
3360      delete fSY;
3361      delete fAI;
3362      delete fAL;
3363      delete fCJ;
3364      delete fHL;
3365      delete fID;
3366      delete fRI;
3367      delete fSG;
3368      delete fXX;
3369      delete fEB;
3370      delete fEM;
3371      delete fZWJ;
3372      delete fOP30;
3373      delete fCP30;
3374      delete fExtPictUnassigned;
3375  
3376      delete fCharBI;
3377      delete fNumberMatcher;
3378  }
3379  
3380  
3381  //-------------------------------------------------------------------------------------------
3382  //
3383  //   TestMonkey
3384  //
3385  //     params
3386  //       seed=nnnnn        Random number starting seed.
3387  //                         Setting the seed allows errors to be reproduced.
3388  //       loop=nnn          Looping count.  Controls running time.
3389  //                         -1:  run forever.
3390  //                          0 or greater:  run length.
3391  //
3392  //       type = char | word | line | sent | title
3393  //
3394  //  Example:
3395  //     intltest  rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3396  //
3397  //-------------------------------------------------------------------------------------------
3398  
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)3399  static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3400      int32_t val = defaultVal;
3401      name.append(" *= *(-?\\d+)");
3402      UErrorCode status = U_ZERO_ERROR;
3403      RegexMatcher m(name, params, 0, status);
3404      if (m.find()) {
3405          // The param exists.  Convert the string to an int.
3406          char valString[100];
3407          int32_t paramLength = m.end(1, status) - m.start(1, status);
3408          if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3409              paramLength = (int32_t)(sizeof(valString)-2);
3410          }
3411          params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3412          val = strtol(valString, NULL, 10);
3413  
3414          // Delete this parameter from the params string.
3415          m.reset();
3416          params = m.replaceFirst("", status);
3417      }
3418      U_ASSERT(U_SUCCESS(status));
3419      return val;
3420  }
3421  #endif
3422  
3423  #if !UCONFIG_NO_REGULAR_EXPRESSIONS
testBreakBoundPreceding(RBBITest * test,UnicodeString ustr,BreakIterator * bi,int expected[],int expectedcount)3424  static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3425                                      BreakIterator *bi,
3426                                      int expected[],
3427                                      int expectedcount)
3428  {
3429      int count = 0;
3430      int i = 0;
3431      int forward[50];
3432      bi->setText(ustr);
3433      for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3434          forward[count] = i;
3435          if (count < expectedcount && expected[count] != i) {
3436              test->errln("%s:%d break forward test failed: expected %d but got %d",
3437                          __FILE__, __LINE__, expected[count], i);
3438              break;
3439          }
3440          count ++;
3441      }
3442      if (count != expectedcount) {
3443          printStringBreaks(ustr, expected, expectedcount);
3444          test->errln("%s:%d break forward test failed: missed %d match",
3445                      __FILE__, __LINE__, expectedcount - count);
3446          return;
3447      }
3448      // testing boundaries
3449      for (i = 1; i < expectedcount; i ++) {
3450          int j = expected[i - 1];
3451          if (!bi->isBoundary(j)) {
3452              printStringBreaks(ustr, expected, expectedcount);
3453              test->errln("%s:%d isBoundary() failed.  Expected boundary at position %d",
3454                      __FILE__, __LINE__, j);
3455              return;
3456          }
3457          for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3458              if (bi->isBoundary(j)) {
3459                  printStringBreaks(ustr, expected, expectedcount);
3460                  test->errln("%s:%d isBoundary() failed.  Not expecting boundary at position %d",
3461                      __FILE__, __LINE__, j);
3462                  return;
3463              }
3464          }
3465      }
3466  
3467      for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3468          count --;
3469          if (forward[count] != i) {
3470              printStringBreaks(ustr, expected, expectedcount);
3471              test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
3472                          __FILE__, __LINE__, forward[count], i);
3473              break;
3474          }
3475      }
3476      if (count != 0) {
3477          printStringBreaks(ustr, expected, expectedcount);
3478          test->errln("break test previous() failed: missed a match");
3479          return;
3480      }
3481  
3482      // testing preceding
3483      for (i = 0; i < expectedcount - 1; i ++) {
3484          // int j = expected[i] + 1;
3485          int j = ustr.moveIndex32(expected[i], 1);
3486          for (; j <= expected[i + 1]; j ++) {
3487              int32_t expectedPreceding = expected[i];
3488              int32_t actualPreceding = bi->preceding(j);
3489              if (actualPreceding != expectedPreceding) {
3490                  printStringBreaks(ustr, expected, expectedcount);
3491                  test->errln("%s:%d preceding(%d): expected %d, got %d",
3492                          __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
3493                  return;
3494              }
3495          }
3496      }
3497  }
3498  #endif
3499  
TestWordBreaks(void)3500  void RBBITest::TestWordBreaks(void)
3501  {
3502  #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3503  
3504      Locale        locale("en");
3505      UErrorCode    status = U_ZERO_ERROR;
3506      // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3507      BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3508      // Replaced any C+J characters in a row with a random sequence of characters
3509      // of the same length to make our C+J segmentation not get in the way.
3510      static const char *strlist[] =
3511      {
3512      "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3513      "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3514      "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3515      "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3516      "\\uac00\\u3588\\u009c\\u0953\\u194b",
3517      "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3518      "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3519      "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3520      "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3521      "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3522      "\\u2027\\U000e0067\\u0a47\\u00b7",
3523      "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3524      "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3525      "\\u0589\\U000e006e\\u0a42\\U000104a5",
3526      "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3527      "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3528      "\\u0027\\u11af\\U000e0057\\u0602",
3529      "\\U0001d7f2\\U000e007\\u0004\\u0589",
3530      "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3531      "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3532      "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3533      "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3534      "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3535      "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3536      "\\u0233\\U000e0020\\u0a69\\u0d6a",
3537      "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3538      "\\u18f4\\U000e0049\\u20e7\\u2027",
3539      "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3540      "\\ua183\\u102d\\u0bec\\u003a",
3541      "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3542      "\\u003a\\u0e57\\u0fad\\u002e",
3543      "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3544      "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3545      "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3546      "\\u003a\\u0664\\u00b7\\u1fba",
3547      "\\u003b\\u0027\\u00b7\\u47a3",
3548      "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3549      "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3550      "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3551      };
3552      int loop;
3553      if (U_FAILURE(status)) {
3554          errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3555          return;
3556      }
3557      for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3558          // printf("looping %d\n", loop);
3559          UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3560          // RBBICharMonkey monkey;
3561          RBBIWordMonkey monkey;
3562  
3563          int expected[50];
3564          int expectedcount = 0;
3565  
3566          monkey.setText(ustr);
3567          int i;
3568          for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3569              expected[expectedcount ++] = i;
3570          }
3571  
3572          testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3573      }
3574      delete bi;
3575  #endif
3576  }
3577  
TestWordBoundary(void)3578  void RBBITest::TestWordBoundary(void)
3579  {
3580      // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3581      Locale        locale("en");
3582      UErrorCode    status = U_ZERO_ERROR;
3583      // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3584      LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
3585      if (U_FAILURE(status)) {
3586          errcheckln(status, "%s:%d Creation of break iterator failed %s",
3587                  __FILE__, __LINE__, u_errorName(status));
3588          return;
3589      }
3590      UChar         str[50];
3591      static const char *strlist[] =
3592      {
3593      "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3594      "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3595      "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3596      "\\u2027\\U000e0067\\u0a47\\u00b7",
3597      "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3598      "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3599      "\\u0589\\U000e006e\\u0a42\\U000104a5",
3600      "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3601      "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3602      "\\u0027\\u11af\\U000e0057\\u0602",
3603      "\\U0001d7f2\\U000e007\\u0004\\u0589",
3604      "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3605      "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3606      "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3607      "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3608      "\\U000e0065\\u302c\\u09ee\\U000e0068",
3609      "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3610      "\\u0233\\U000e0020\\u0a69\\u0d6a",
3611      "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3612      "\\u58f4\\U000e0049\\u20e7\\u2027",
3613      "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3614      "\\ua183\\u102d\\u0bec\\u003a",
3615      "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3616      "\\u003a\\u0e57\\u0fad\\u002e",
3617      "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3618      "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3619      "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3620      "\\u003a\\u0664\\u00b7\\u1fba",
3621      "\\u003b\\u0027\\u00b7\\u47a3",
3622      };
3623      int loop;
3624      for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3625          u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3626          UnicodeString ustr(str);
3627          int forward[50];
3628          int count = 0;
3629  
3630          bi->setText(ustr);
3631          int prev = -1;
3632          for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
3633              ++count;
3634              if (count >= UPRV_LENGTHOF(forward)) {
3635                  errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
3636                          __FILE__, __LINE__, loop, count, boundary);
3637                  return;
3638              }
3639              forward[count] = boundary;
3640              if (boundary <= prev) {
3641                  errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
3642                          __FILE__, __LINE__, loop, prev, boundary);
3643                  break;
3644              }
3645              for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
3646                  if (bi->isBoundary(nonBoundary)) {
3647                      printStringBreaks(ustr, forward, count);
3648                      errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
3649                             __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
3650                      return;
3651                  }
3652              }
3653              if (!bi->isBoundary(boundary)) {
3654                  printStringBreaks(ustr, forward, count);
3655                  errln("%s:%d happy boundary test failed: expected %d a boundary",
3656                         __FILE__, __LINE__, boundary);
3657                  return;
3658              }
3659              prev = boundary;
3660          }
3661      }
3662  }
3663  
TestLineBreaks(void)3664  void RBBITest::TestLineBreaks(void)
3665  {
3666  #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3667      Locale        locale("en");
3668      UErrorCode    status = U_ZERO_ERROR;
3669      BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3670      const int32_t  STRSIZE = 50;
3671      UChar         str[STRSIZE];
3672      static const char *strlist[] =
3673      {
3674       "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3675       "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3676               "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3677       "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3678               "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3679       "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3680       "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3681       "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3682       "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3683       "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3684       "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3685       "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3686       "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3687       "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3688       "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3689       "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3690       "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3691       "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3692       "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3693       "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3694       "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3695       "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3696       "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3697       "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3698       "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3699       "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3700       "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3701       "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3702       "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3703       "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3704       "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3705       "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3706       "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3707       "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3708       "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3709       "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3710       "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3711       "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3712       "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3713           "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3714      };
3715      int loop;
3716      TEST_ASSERT_SUCCESS(status);
3717      if (U_FAILURE(status)) {
3718          return;
3719      }
3720      for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3721          // printf("looping %d\n", loop);
3722          int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3723          if (t >= STRSIZE) {
3724              TEST_ASSERT(false);
3725              continue;
3726          }
3727  
3728  
3729          UnicodeString ustr(str);
3730          RBBILineMonkey monkey;
3731          if (U_FAILURE(monkey.deferredStatus)) {
3732              continue;
3733          }
3734  
3735          const int EXPECTEDSIZE = 50;
3736          int expected[EXPECTEDSIZE];
3737          int expectedcount = 0;
3738  
3739          monkey.setText(ustr);
3740  
3741          int i;
3742          for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3743              if (expectedcount >= EXPECTEDSIZE) {
3744                  TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3745                  return;
3746              }
3747              expected[expectedcount ++] = i;
3748          }
3749  
3750          testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3751      }
3752      delete bi;
3753  #endif
3754  }
3755  
TestSentBreaks(void)3756  void RBBITest::TestSentBreaks(void)
3757  {
3758  #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3759      Locale        locale("en");
3760      UErrorCode    status = U_ZERO_ERROR;
3761      BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3762      UChar         str[200];
3763      static const char *strlist[] =
3764      {
3765       "Now\ris\nthe\r\ntime\n\rfor\r\r",
3766       "This\n",
3767       "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3768       "\"Sentence ending with a quote.\" Bye.",
3769       "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
3770       "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3771       "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3772       "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3773       "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3774       "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3775       "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3776               "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3777               "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3778               "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3779       "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3780               "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3781               "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3782               "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3783               "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3784               "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3785      };
3786      int loop;
3787      if (U_FAILURE(status)) {
3788          errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3789          return;
3790      }
3791      for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3792          u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3793          UnicodeString ustr(str);
3794  
3795          RBBISentMonkey monkey;
3796          if (U_FAILURE(monkey.deferredStatus)) {
3797              continue;
3798          }
3799  
3800          const int EXPECTEDSIZE = 50;
3801          int expected[EXPECTEDSIZE];
3802          int expectedcount = 0;
3803  
3804          monkey.setText(ustr);
3805  
3806          int i;
3807          for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3808              if (expectedcount >= EXPECTEDSIZE) {
3809                  TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3810                  return;
3811              }
3812              expected[expectedcount ++] = i;
3813          }
3814  
3815          testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3816      }
3817      delete bi;
3818  #endif
3819  }
3820  
TestMonkey()3821  void RBBITest::TestMonkey() {
3822  #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3823  
3824      UErrorCode     status    = U_ZERO_ERROR;
3825      int32_t        loopCount = 500;
3826      int32_t        seed      = 1;
3827      UnicodeString  breakType = "all";
3828      Locale         locale("en");
3829      UBool          useUText  = false;
3830  
3831      if (quick == false) {
3832          loopCount = 10000;
3833      }
3834  
3835      if (fTestParams) {
3836          UnicodeString p(fTestParams);
3837          loopCount = getIntParam("loop", p, loopCount);
3838          seed      = getIntParam("seed", p, seed);
3839  
3840          RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3841          if (m.find()) {
3842              breakType = m.group(1, status);
3843              m.reset();
3844              p = m.replaceFirst("", status);
3845          }
3846  
3847          RegexMatcher u(" *utext", p, 0, status);
3848          if (u.find()) {
3849              useUText = true;
3850              u.reset();
3851              p = u.replaceFirst("", status);
3852          }
3853  
3854  
3855          // m.reset(p);
3856          if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3857              // Each option is stripped out of the option string as it is processed.
3858              // All options have been checked.  The option string should have been completely emptied..
3859              char buf[100];
3860              p.extract(buf, sizeof(buf), NULL, status);
3861              buf[sizeof(buf)-1] = 0;
3862              errln("Unrecognized or extra parameter:  %s\n", buf);
3863              return;
3864          }
3865  
3866      }
3867  
3868      if (breakType == "char" || breakType == "all") {
3869          RBBICharMonkey  m;
3870          BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3871          if (U_SUCCESS(status)) {
3872              RunMonkey(bi, m, "char", seed, loopCount, useUText);
3873              if (breakType == "all" && useUText==false) {
3874                  // Also run a quick test with UText when "all" is specified
3875                  RunMonkey(bi, m, "char", seed, loopCount, true);
3876              }
3877          }
3878          else {
3879              errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3880          }
3881          delete bi;
3882      }
3883  
3884      if (breakType == "word" || breakType == "all") {
3885          logln("Word Break Monkey Test");
3886          RBBIWordMonkey  m;
3887          BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
3888          if (U_SUCCESS(status)) {
3889              RunMonkey(bi, m, "word", seed, loopCount, useUText);
3890          }
3891          else {
3892              errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3893          }
3894          delete bi;
3895      }
3896  
3897      if (breakType == "line" || breakType == "all") {
3898          logln("Line Break Monkey Test");
3899          RBBILineMonkey  m;
3900          BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
3901          if (loopCount >= 10) {
3902              loopCount = loopCount / 5;   // Line break runs slower than the others.
3903          }
3904          if (U_SUCCESS(status)) {
3905              RunMonkey(bi, m, "line", seed, loopCount, useUText);
3906          }
3907          else {
3908              errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3909          }
3910          delete bi;
3911      }
3912  
3913      if (breakType == "sent" || breakType == "all"  ) {
3914          logln("Sentence Break Monkey Test");
3915          RBBISentMonkey  m;
3916          BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
3917          if (loopCount >= 10) {
3918              loopCount = loopCount / 10;   // Sentence runs slower than the other break types
3919          }
3920          if (U_SUCCESS(status)) {
3921              RunMonkey(bi, m, "sent", seed, loopCount, useUText);
3922          }
3923          else {
3924              errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3925          }
3926          delete bi;
3927      }
3928  
3929  #endif
3930  }
3931  
3932  //
3933  //  Run a RBBI monkey test.  Common routine, for all break iterator types.
3934  //    Parameters:
3935  //       bi      - the break iterator to use
3936  //       mk      - MonkeyKind, abstraction for obtaining expected results
3937  //       name    - Name of test (char, word, etc.) for use in error messages
3938  //       seed    - Seed for starting random number generator (parameter from user)
3939  //       numIterations
3940  //
RunMonkey(BreakIterator * bi,RBBIMonkeyKind & mk,const char * name,uint32_t seed,int32_t numIterations,UBool useUText)3941  void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
3942                           int32_t numIterations, UBool useUText) {
3943  
3944  #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3945  
3946      const int32_t    TESTSTRINGLEN = 500;
3947      UnicodeString    testText;
3948      int32_t          numCharClasses;
3949      UVector          *chClasses;
3950      int              expectedCount = 0;
3951      char             expectedBreaks[TESTSTRINGLEN*2 + 1];
3952      char             forwardBreaks[TESTSTRINGLEN*2 + 1];
3953      char             reverseBreaks[TESTSTRINGLEN*2+1];
3954      char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
3955      char             followingBreaks[TESTSTRINGLEN*2+1];
3956      char             precedingBreaks[TESTSTRINGLEN*2+1];
3957      int              i;
3958      int              loopCount = 0;
3959  
3960  
3961      m_seed = seed;
3962  
3963      numCharClasses = mk.charClasses()->size();
3964      chClasses      = mk.charClasses();
3965  
3966      // Check for errors that occurred during the construction of the MonkeyKind object.
3967      //  Can't report them where they occurred because errln() is a method coming from intlTest,
3968      //  and is not visible outside of RBBITest :-(
3969      if (U_FAILURE(mk.deferredStatus)) {
3970          errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
3971          return;
3972      }
3973  
3974      // Verify that the character classes all have at least one member.
3975      for (i=0; i<numCharClasses; i++) {
3976          UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
3977          if (s == NULL || s->size() == 0) {
3978              errln("Character Class #%d is null or of zero size.", i);
3979              return;
3980          }
3981      }
3982  
3983      // For minimizing width of class name output.
3984      int classNameSize = mk.maxClassNameSize();
3985  
3986      while (loopCount < numIterations || numIterations == -1) {
3987          if (numIterations == -1 && loopCount % 10 == 0) {
3988              // If test is running in an infinite loop, display a periodic tic so
3989              //   we can tell that it is making progress.
3990              fprintf(stderr, ".");
3991          }
3992          // Save current random number seed, so that we can recreate the random numbers
3993          //   for this loop iteration in event of an error.
3994          seed = m_seed;
3995  
3996          // Populate a test string with data.
3997          testText.truncate(0);
3998          for (i=0; i<TESTSTRINGLEN; i++) {
3999              int32_t  aClassNum = m_rand() % numCharClasses;
4000              UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4001              int32_t   charIdx = m_rand() % classSet->size();
4002              UChar32   c = classSet->charAt(charIdx);
4003              if (c < 0) {   // TODO:  deal with sets containing strings.
4004                  errln("%s:%d c < 0", __FILE__, __LINE__);
4005                  break;
4006              }
4007              // Do not assemble a supplementary character from randomly generated separate surrogates.
4008              //   (It could be a dictionary character)
4009              if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4010                  continue;
4011              }
4012  
4013              testText.append(c);
4014          }
4015  
4016          // Calculate the expected results for this test string and reset applied rules.
4017          mk.setText(testText);
4018  
4019          memset(expectedBreaks, 0, sizeof(expectedBreaks));
4020          expectedBreaks[0] = 1;
4021          int32_t breakPos = 0;
4022          expectedCount = 0;
4023          for (;;) {
4024              breakPos = mk.next(breakPos);
4025              if (breakPos == -1) {
4026                  break;
4027              }
4028              if (breakPos > testText.length()) {
4029                  errln("breakPos > testText.length()");
4030              }
4031              expectedBreaks[breakPos] = 1;
4032              expectedCount++;
4033              U_ASSERT(expectedCount<testText.length());
4034          }
4035  
4036          // Find the break positions using forward iteration
4037          memset(forwardBreaks, 0, sizeof(forwardBreaks));
4038          if (useUText) {
4039              UErrorCode status = U_ZERO_ERROR;
4040              UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4041              // testUText = utext_openUnicodeString(testUText, &testText, &status);
4042              bi->setText(testUText, status);
4043              TEST_ASSERT_SUCCESS(status);
4044              utext_close(testUText);   // The break iterator does a shallow clone of the UText
4045                                        //  This UText can be closed immediately, so long as the
4046                                        //  testText string continues to exist.
4047          } else {
4048              bi->setText(testText);
4049          }
4050  
4051          for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4052              if (i < 0 || i > testText.length()) {
4053                  errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4054                  break;
4055              }
4056              forwardBreaks[i] = 1;
4057          }
4058  
4059          // Find the break positions using reverse iteration
4060          memset(reverseBreaks, 0, sizeof(reverseBreaks));
4061          for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4062              if (i < 0 || i > testText.length()) {
4063                  errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4064                  break;
4065              }
4066              reverseBreaks[i] = 1;
4067          }
4068  
4069          // Find the break positions using isBoundary() tests.
4070          memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4071          U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4072          for (i=0; i<=testText.length(); i++) {
4073              isBoundaryBreaks[i] = bi->isBoundary(i);
4074          }
4075  
4076  
4077          // Find the break positions using the following() function.
4078          // printf(".");
4079          memset(followingBreaks, 0, sizeof(followingBreaks));
4080          int32_t   lastBreakPos = 0;
4081          followingBreaks[0] = 1;
4082          for (i=0; i<testText.length(); i++) {
4083              breakPos = bi->following(i);
4084              if (breakPos <= i ||
4085                  breakPos < lastBreakPos ||
4086                  breakPos > testText.length() ||
4087                  (breakPos > lastBreakPos && lastBreakPos > i)) {
4088                  errln("%s break monkey test: "
4089                      "Out of range value returned by BreakIterator::following().\n"
4090                          "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4091                           name, seed, i, breakPos, lastBreakPos);
4092                  break;
4093              }
4094              followingBreaks[breakPos] = 1;
4095              lastBreakPos = breakPos;
4096          }
4097  
4098          // Find the break positions using the preceding() function.
4099          memset(precedingBreaks, 0, sizeof(precedingBreaks));
4100          lastBreakPos = testText.length();
4101          precedingBreaks[testText.length()] = 1;
4102          for (i=testText.length(); i>0; i--) {
4103              breakPos = bi->preceding(i);
4104              if (breakPos >= i ||
4105                  breakPos > lastBreakPos ||
4106                  (breakPos < 0 && testText.getChar32Start(i)>0) ||
4107                  (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4108                  errln("%s break monkey test: "
4109                      "Out of range value returned by BreakIterator::preceding().\n"
4110                      "index=%d;  prev returned %d; lastBreak=%d" ,
4111                      name,  i, breakPos, lastBreakPos);
4112                  if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4113                      precedingBreaks[i] = 2;   // Forces an error.
4114                  }
4115              } else {
4116                  if (breakPos >= 0) {
4117                      precedingBreaks[breakPos] = 1;
4118                  }
4119                  lastBreakPos = breakPos;
4120              }
4121          }
4122  
4123          // Compare the expected and actual results.
4124          for (i=0; i<=testText.length(); i++) {
4125              const char *errorType = NULL;
4126              const char* currentBreakData = NULL;
4127              if  (forwardBreaks[i] != expectedBreaks[i]) {
4128                  errorType = "next()";
4129                  currentBreakData = forwardBreaks;
4130              } else if (reverseBreaks[i] != forwardBreaks[i]) {
4131                  errorType = "previous()";
4132                  currentBreakData = reverseBreaks;
4133             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4134                  errorType = "isBoundary()";
4135                  currentBreakData = isBoundaryBreaks;
4136              } else if (followingBreaks[i] != expectedBreaks[i]) {
4137                  errorType = "following()";
4138                  currentBreakData = followingBreaks;
4139              } else if (precedingBreaks[i] != expectedBreaks[i]) {
4140                  errorType = "preceding()";
4141                  currentBreakData = precedingBreaks;
4142              }
4143  
4144              if (errorType != NULL) {
4145                  // Format a range of the test text that includes the failure as
4146                  //  a data item that can be included in the rbbi test data file.
4147  
4148                  // Start of the range is the last point where expected and actual results
4149                  //  both agreed that there was a break position.
4150  
4151                  int startContext = i;
4152                  int32_t count = 0;
4153                  for (;;) {
4154                      if (startContext==0) { break; }
4155                      startContext --;
4156                      if (expectedBreaks[startContext] != 0) {
4157                          if (count == 2) break;
4158                          count ++;
4159                      }
4160                  }
4161  
4162                  // End of range is two expected breaks past the start position.
4163                  int endContext = i + 1;
4164                  int ci;
4165                  for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4166                      for (;;) {
4167                          if (endContext >= testText.length()) {break;}
4168                          if (expectedBreaks[endContext-1] != 0) {
4169                              if (count == 0) break;
4170                              count --;
4171                          }
4172                          endContext ++;
4173                      }
4174                  }
4175  
4176                  // Formatting of each line includes:
4177                  //   character code
4178                  //   reference break: '|' -> a break, '.' -> no break
4179                  //   actual break:    '|' -> a break, '.' -> no break
4180                  //   (name of character clase)
4181                  //   Unicode name of character
4182                  //   '-->' indicates location of the difference.
4183  
4184                  MONKEY_ERROR(
4185                      (expectedBreaks[i] ? "Break expected but not found" :
4186                         "Break found but not expected"),
4187                      name, i, seed);
4188  
4189                  for (ci=startContext; (ci = testText.moveIndex32(ci, 1));) {
4190                      UChar32  c;
4191                      c = testText.char32At(ci);
4192  
4193                      std::string currentLineFlag = "   ";
4194                      if (ci == i) {
4195                          currentLineFlag = "-->";  // Error position
4196                      }
4197  
4198                      // BMP or SMP character in hex
4199                      char hexCodePoint[12];
4200                      std::string format = "    \\u%04x";
4201                      if (c >= 0x10000) {
4202                          format = "\\U%08x";
4203                      }
4204                      sprintf(hexCodePoint, format.c_str(), c);
4205  
4206                      // Get the class name and character name for the character.
4207                      char cName[200];
4208                      UErrorCode status = U_ZERO_ERROR;
4209                      u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
4210  
4211                      char buffer[200];
4212                      auto ret = snprintf(buffer, UPRV_LENGTHOF(buffer),
4213                               "%4s %3i :  %1s  %1s  %10s  %-*s  %-40s  %-40s",
4214                               currentLineFlag.c_str(),
4215                               ci,
4216                               expectedBreaks[ci] == 0 ? "." : "|",  // Reference break
4217                               currentBreakData[ci] == 0 ? "." : "|",  // Actual break
4218                               hexCodePoint,
4219                               classNameSize,
4220                               mk.classNameFromCodepoint(c).c_str(),
4221                               mk.getAppliedRule(ci).c_str(), cName);
4222                      (void)ret;
4223                      U_ASSERT(0 <= ret && ret < UPRV_LENGTHOF(buffer));
4224  
4225                      // Output the error
4226                      if (ci == i) {
4227                          errln(buffer);
4228                      } else {
4229                          infoln(buffer);
4230                      }
4231  
4232                      if (ci >= endContext) { break; }
4233                  }
4234                  break;
4235              }
4236          }
4237  
4238          loopCount++;
4239      }
4240  #endif
4241  }
4242  
4243  
4244  //  Bug 5532.  UTF-8 based UText fails in dictionary code.
4245  //             This test checks the initial patch,
4246  //             which is to just keep it from crashing.  Correct word boundaries
4247  //             await a proper fix to the dictionary code.
4248  //
TestBug5532(void)4249  void RBBITest::TestBug5532(void)  {
4250     // Text includes a mixture of Thai and Latin.
4251     const unsigned char utf8Data[] = {
4252             0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4253             0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4254             0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4255             0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4256             0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4257             0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4258             0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4259             0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4260             0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4261             0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4262             0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4263  
4264      UErrorCode status = U_ZERO_ERROR;
4265      UText utext=UTEXT_INITIALIZER;
4266      utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4267      TEST_ASSERT_SUCCESS(status);
4268  
4269      BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4270      TEST_ASSERT_SUCCESS(status);
4271      if (U_SUCCESS(status)) {
4272          bi->setText(&utext, status);
4273          TEST_ASSERT_SUCCESS(status);
4274  
4275          int32_t breakCount = 0;
4276          int32_t previousBreak = -1;
4277          for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4278              // For now, just make sure that the break iterator doesn't hang.
4279              TEST_ASSERT(previousBreak < bi->current());
4280              previousBreak = bi->current();
4281          }
4282          TEST_ASSERT(breakCount > 0);
4283      }
4284      delete bi;
4285      utext_close(&utext);
4286  }
4287  
4288  
TestBug9983(void)4289  void RBBITest::TestBug9983(void)  {
4290      UnicodeString text = UnicodeString("\\u002A"  // * Other
4291                                         "\\uFF65"  //   Other
4292                                         "\\u309C"  //   Katakana
4293                                         "\\uFF9F"  //   Extend
4294                                         "\\uFF65"  //   Other
4295                                         "\\u0020"  //   Other
4296                                         "\\u0000").unescape();
4297  
4298      UErrorCode status = U_ZERO_ERROR;
4299      LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4300          BreakIterator::createWordInstance(Locale::getRoot(), status)));
4301      TEST_ASSERT_SUCCESS(status);
4302      LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4303          BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4304      TEST_ASSERT_SUCCESS(status);
4305      if (U_FAILURE(status)) {
4306          return;
4307      }
4308      int32_t offset, rstatus, iterationCount;
4309  
4310      brkiter->setText(text);
4311      brkiter->last();
4312      iterationCount = 0;
4313      while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4314          iterationCount++;
4315          rstatus = brkiter->getRuleStatus();
4316          (void)rstatus;     // Suppress set but not used warning.
4317          if (iterationCount >= 10) {
4318             break;
4319          }
4320      }
4321      TEST_ASSERT(iterationCount == 6);
4322  
4323      brkiterPOSIX->setText(text);
4324      brkiterPOSIX->last();
4325      iterationCount = 0;
4326      while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4327          iterationCount++;
4328          rstatus = brkiterPOSIX->getRuleStatus();
4329          (void)rstatus;     // Suppress set but not used warning.
4330          if (iterationCount >= 10) {
4331             break;
4332          }
4333      }
4334      TEST_ASSERT(iterationCount == 6);
4335  }
4336  
// Bug 7547 - verify that building a break iterator from empty rules produces an error.
4338  //
TestBug7547()4339  void RBBITest::TestBug7547() {
4340      UnicodeString rules;
4341      UErrorCode status = U_ZERO_ERROR;
4342      UParseError parseError;
4343      RuleBasedBreakIterator breakIterator(rules, parseError, status);
4344      if (status != U_BRK_RULE_SYNTAX) {
4345          errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4346      }
4347      if (parseError.line != 1 || parseError.offset != 0) {
4348          errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4349      }
4350  }
4351  
4352  
TestBug12797()4353  void RBBITest::TestBug12797() {
4354      UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4355      UErrorCode status = U_ZERO_ERROR;
4356      UParseError parseError;
4357      RuleBasedBreakIterator bi(rules, parseError, status);
4358      if (U_FAILURE(status)) {
4359          errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4360          return;
4361      }
4362      UnicodeString text = "abc";
4363      bi.setText(text);
4364      bi.first();
4365      int32_t boundary = bi.next();
4366      if (boundary != 3) {
4367          errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4368      }
4369  }
4370  
TestBug12918()4371  void RBBITest::TestBug12918() {
4372      // This test triggers an assertion failure in dictbe.cpp
4373      const UChar *crasherString = u"\u3325\u4a16";
4374      UErrorCode status = U_ZERO_ERROR;
4375      UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
4376      if (U_FAILURE(status)) {
4377          dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4378          return;
4379      }
4380      ubrk_first(iter);
4381      int32_t pos = 0;
4382      int32_t lastPos = -1;
4383      while((pos = ubrk_next(iter)) != UBRK_DONE) {
4384          if (pos <= lastPos) {
4385              errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4386              break;
4387          }
4388      }
4389      ubrk_close(iter);
4390  }
4391  
TestBug12932()4392  void RBBITest::TestBug12932() {
4393      // Node Stack overflow in the RBBI rule parser caused a seg fault.
4394      UnicodeString ruleStr(
4395              "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4396              "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4397              "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4398              ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4399              ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4400              ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4401  
4402      UErrorCode status = U_ZERO_ERROR;
4403      UParseError parseError;
4404      RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
4405      if (status != U_BRK_RULE_SYNTAX) {
4406          errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4407                  __FILE__, __LINE__, u_errorName(status));
4408      }
4409  }
4410  
4411  
// Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
//             remain undivided by the ICU character, word and line break iterators.
TestEmoji()4414  void RBBITest::TestEmoji() {
4415  #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4416      UErrorCode  status = U_ZERO_ERROR;
4417  
4418      CharString testFileName;
4419      testFileName.append(IntlTest::getSourceTestData(status), status);
4420      testFileName.appendPathPart("emoji-test.txt", status);
4421      if (U_FAILURE(status)) {
4422          errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4423          return;
4424      }
4425      logln("Opening data file %s\n", testFileName.data());
4426  
4427      int    len;
4428      UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
4429      if (U_FAILURE(status) || testFile == NULL) {
4430          errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4431          return;
4432      }
4433      UnicodeString testFileAsString(testFile, len);
4434      delete [] testFile;
4435  
4436      RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
4437      RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
4438      //           hexMatcher group(1) is a hex number, or empty string if no hex number present.
4439      int32_t lineNumber = 0;
4440  
4441      LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4442      LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4443      LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
4444      if (U_FAILURE(status)) {
4445          dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
4446          return;
4447      }
4448  
4449      while (lineMatcher.find()) {
4450          ++lineNumber;
4451          UnicodeString line = lineMatcher.group(status);
4452          hexMatcher.reset(line);
4453          UnicodeString testString;   // accumulates the emoji sequence.
4454          while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
4455              UnicodeString hex = hexMatcher.group(1, status);
4456              if (hex.length() > 8) {
4457                  errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
4458                  break;
4459              }
4460              CharString hex8;
4461              hex8.appendInvariantChars(hex, status);
4462              UChar32 c = (UChar32)strtol(hex8.data(), NULL, 16);
4463              if (c<=0x10ffff) {
4464                  testString.append(c);
4465              } else {
4466                  errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4467                          __FILE__, __LINE__, lineNumber, hex8.data());
4468                  break;
4469              }
4470          }
4471  
4472          if (testString.length() > 1) {
4473              charBreaks->setText(testString);
4474              charBreaks->first();
4475              int32_t firstBreak = charBreaks->next();
4476              if (testString.length() != firstBreak) {
4477                  errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4478                          __FILE__, __LINE__, lineNumber, firstBreak);
4479              }
4480              wordBreaks->setText(testString);
4481              wordBreaks->first();
4482              firstBreak = wordBreaks->next();
4483              if (testString.length() != firstBreak) {
4484                  errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4485                          __FILE__, __LINE__, lineNumber, firstBreak);
4486              }
4487              lineBreaks->setText(testString);
4488              lineBreaks->first();
4489              firstBreak = lineBreaks->next();
4490              if (testString.length() != firstBreak) {
4491                  errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4492                          __FILE__, __LINE__, lineNumber, firstBreak);
4493              }
4494          }
4495      }
4496  #endif
4497  }
4498  
4499  
4500  // TestBug12519  -  Correct handling of Locales by assignment / copy / clone
4501  
TestBug12519()4502  void RBBITest::TestBug12519() {
4503      UErrorCode status = U_ZERO_ERROR;
4504      LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4505      LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status));
4506      if (!assertSuccess(WHERE, status)) {
4507          dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
4508          return;
4509      }
4510      assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
4511  
4512      assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
4513      assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
4514  
4515      LocalPointer<RuleBasedBreakIterator>cloneEn(biEn->clone());
4516      assertTrue(WHERE, *biEn == *cloneEn);
4517      assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
4518  
4519      LocalPointer<RuleBasedBreakIterator>cloneFr(biFr->clone());
4520      assertTrue(WHERE, *biFr == *cloneFr);
4521      assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
4522  
4523      LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status));
4524      UnicodeString text("Hallo Welt");
4525      biDe->setText(text);
4526      assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
4527      *biDe = *biFr;
4528      assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
4529  }
4530  
TestBug12677()4531  void RBBITest::TestBug12677() {
4532      // Check that stripping of comments from rules for getRules() is not confused by
4533      // the presence of '#' characters in the rules that do not introduce comments.
4534      UnicodeString rules(u"!!forward; \n"
4535                           "$x = [ab#];  # a set with a # literal. \n"
4536                           " # .;        # a comment that looks sort of like a rule.   \n"
4537                           " '#' '?';    # a rule with a quoted #   \n"
4538                         );
4539  
4540      UErrorCode status = U_ZERO_ERROR;
4541      UParseError pe;
4542      RuleBasedBreakIterator bi(rules, pe, status);
4543      assertSuccess(WHERE, status);
4544      UnicodeString rtRules = bi.getRules();
4545      assertEquals(WHERE, UnicodeString(u"!!forward;$x=[ab#];'#''?';"),  rtRules);
4546  }
4547  
4548  
TestTableRedundancies()4549  void RBBITest::TestTableRedundancies() {
4550      UErrorCode status = U_ZERO_ERROR;
4551  
4552      LocalPointer<RuleBasedBreakIterator> bi (
4553          (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
4554      assertSuccess(WHERE, status);
4555      if (U_FAILURE(status)) return;
4556  
4557      RBBIDataWrapper *dw = bi->fData;
4558      const RBBIStateTable *fwtbl = dw->fForwardTable;
4559      UBool in8Bits = fwtbl->fFlags & RBBI_8BITS_ROWS;
4560      int32_t numCharClasses = dw->fHeader->fCatCount;
4561      // printf("Char Classes: %d     states: %d\n", numCharClasses, fwtbl->fNumStates);
4562  
4563      // Check for duplicate columns (character categories)
4564  
4565      std::vector<UnicodeString> columns;
4566      for (int32_t column = 0; column < numCharClasses; column++) {
4567          UnicodeString s;
4568          for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
4569              RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4570              s.append(in8Bits ? row->r8.fNextState[column] : row->r16.fNextState[column]);
4571          }
4572          columns.push_back(s);
4573      }
4574      // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
4575      for (int c1=1; c1<numCharClasses; c1++) {
4576          int limit = c1 < (int)fwtbl->fDictCategoriesStart ? fwtbl->fDictCategoriesStart : numCharClasses;
4577          for (int c2 = c1+1; c2 < limit; c2++) {
4578              if (columns.at(c1) == columns.at(c2)) {
4579                  errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
4580                  goto out;
4581              }
4582          }
4583      }
4584    out:
4585  
4586      // Check for duplicate states
4587      std::vector<UnicodeString> rows;
4588      for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
4589          UnicodeString s;
4590          RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4591          if (in8Bits) {
4592              s.append(row->r8.fAccepting);
4593              s.append(row->r8.fLookAhead);
4594              s.append(row->r8.fTagsIdx);
4595              for (int32_t column = 0; column < numCharClasses; column++) {
4596                  s.append(row->r8.fNextState[column]);
4597              }
4598          } else {
4599              s.append(row->r16.fAccepting);
4600              s.append(row->r16.fLookAhead);
4601              s.append(row->r16.fTagsIdx);
4602              for (int32_t column = 0; column < numCharClasses; column++) {
4603                  s.append(row->r16.fNextState[column]);
4604              }
4605          }
4606          rows.push_back(s);
4607      }
4608      for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
4609          for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
4610              if (rows.at(r1) == rows.at(r2)) {
4611                  errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
4612                  return;
4613              }
4614          }
4615      }
4616  }
4617  
4618  // Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
4619  //            even after next() has returned DONE.
4620  
TestBug13447()4621  void RBBITest::TestBug13447() {
4622      UErrorCode status = U_ZERO_ERROR;
4623      LocalPointer<RuleBasedBreakIterator> bi(
4624          (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4625      assertSuccess(WHERE, status);
4626      if (U_FAILURE(status)) return;
4627      UnicodeString data(u"1234");
4628      bi->setText(data);
4629      assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus());
4630      assertEquals(WHERE, 4, bi->next());
4631      assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4632      assertEquals(WHERE, UBRK_DONE, bi->next());
4633      assertEquals(WHERE, 4, bi->current());
4634      assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4635  }
4636  
4637  //  TestReverse exercises both the synthesized safe reverse rules and the logic
4638  //  for filling the break iterator cache when starting from random positions
4639  //  in the text.
4640  //
4641  //  It's a monkey test, working on random data, with the expected data obtained
4642  //  from forward iteration (no safe rules involved), comparing with results
4643  //  when indexing into the interior of the string (safe rules needed).
4644  
TestReverse()4645  void RBBITest::TestReverse() {
4646      UErrorCode status = U_ZERO_ERROR;
4647  
4648      TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4649              BreakIterator::createCharacterInstance(Locale::getEnglish(), status)));
4650      assertSuccess(WHERE, status, true);
4651      status = U_ZERO_ERROR;
4652      TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4653              BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4654      assertSuccess(WHERE, status, true);
4655      status = U_ZERO_ERROR;
4656      TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4657              BreakIterator::createLineInstance(Locale::getEnglish(), status)));
4658      assertSuccess(WHERE, status, true);
4659      status = U_ZERO_ERROR;
4660      TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4661              BreakIterator::createSentenceInstance(Locale::getEnglish(), status)));
4662      assertSuccess(WHERE, status, true);
4663  }
4664  
TestReverse(std::unique_ptr<RuleBasedBreakIterator> bi)4665  void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
4666      if (!bi) {
4667          return;
4668      }
4669  
4670      // From the mapping trie in the break iterator's internal data, create a
4671      // vector of UnicodeStrings, one for each character category, containing
4672      // all of the code points that map to that category. Unicode planes 0 and 1 only,
4673      // to avoid an execess of unassigned code points.
4674  
4675      RBBIDataWrapper *data = bi->fData;
4676      int32_t categoryCount = data->fHeader->fCatCount;
4677      UCPTrie *trie = data->fTrie;
4678      bool use8BitsTrie = ucptrie_getValueWidth(trie) == UCPTRIE_VALUE_BITS_8;
4679      uint32_t dictBit = use8BitsTrie ? 0x0080 : 0x4000;
4680  
4681      std::vector<UnicodeString> strings(categoryCount, UnicodeString());
4682      for (int cp=0; cp<0x1fff0; ++cp) {
4683          int cat = ucptrie_get(trie, cp);
4684          cat &= ~dictBit;    // And off the dictionary bit from the category.
4685          assertTrue(WHERE, cat < categoryCount && cat >= 0);
4686          if (cat < 0 || cat >= categoryCount) return;
4687          strings[cat].append(cp);
4688      }
4689  
4690      icu_rand randomGen;
4691      const int testStringLength = 10000;
4692      UnicodeString testString;
4693  
4694      for (int i=0; i<testStringLength; ++i) {
4695          int charClass = randomGen() % categoryCount;
4696          if (strings[charClass].length() > 0) {
4697              int cp = strings[charClass].char32At(randomGen() % strings[charClass].length());
4698              testString.append(cp);
4699          }
4700      }
4701  
4702      typedef std::pair<UBool, int32_t> Result;
4703      std::vector<Result> expectedResults;
4704      bi->setText(testString);
4705      for (int i=0; i<testString.length(); ++i) {
4706          bool isboundary = bi->isBoundary(i);
4707          int  ruleStatus = bi->getRuleStatus();
4708          expectedResults.push_back(std::make_pair(isboundary, ruleStatus));
4709      }
4710  
4711      for (int i=testString.length()-1; i>=0; --i) {
4712          bi->setText(testString);   // clears the internal break cache
4713          Result expected = expectedResults[i];
4714          assertEquals(WHERE, expected.first, bi->isBoundary(i));
4715          assertEquals(WHERE, expected.second, bi->getRuleStatus());
4716      }
4717  }
4718  
4719  
// Ticket 13692 - finding word boundaries in very large numbers or words could
//                be very time consuming. When the problem was present, this test
//                would run for more than fifteen minutes, which is to say, the failure was noticeable.
4723  
TestBug13692()4724  void RBBITest::TestBug13692() {
4725      UErrorCode status = U_ZERO_ERROR;
4726      LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4727              BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4728      if (!assertSuccess(WHERE, status, true)) {
4729          return;
4730      }
4731      constexpr int32_t LENGTH = 1000000;
4732      UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH);
4733      for (int i=0; i<20; i+=2) {
4734          longNumber.setCharAt(i, u' ');
4735      }
4736      bi->setText(longNumber);
4737      assertFalse(WHERE, bi->isBoundary(LENGTH-5));
4738      assertSuccess(WHERE, status);
4739  }
4740  
4741  
TestProperties()4742  void RBBITest::TestProperties() {
4743      UErrorCode errorCode = U_ZERO_ERROR;
4744      UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4745      if (!prependSet.isEmpty()) {
4746          errln(
4747              "[:GCB=Prepend:] is not empty any more. "
4748              "Uncomment relevant lines in source/data/brkitr/char.txt and "
4749              "change this test to the opposite condition.");
4750      }
4751  }
4752  
4753  
4754  //
4755  //  TestDebug    -  A place-holder test for debugging purposes.
4756  //                  For putting in fragments of other tests that can be invoked
4757  //                  for tracing  without a lot of unwanted extra stuff happening.
4758  //
TestDebug(void)4759  void RBBITest::TestDebug(void) {
4760      UErrorCode status = U_ZERO_ERROR;
4761      LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4762              BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4763      if (!assertSuccess(WHERE, status, true)) {
4764          return;
4765      }
4766      const UnicodeString &rules = bi->getRules();
4767      UParseError pe;
4768      LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
4769      assertSuccess(WHERE, status);
4770  }
4771  
4772  
4773  //
4774  //  TestDebugRules   A stub test for use in debugging rule compilation problems.
4775  //                   Can be freely altered as needed or convenient.
//                   Leave disabled - #ifdef'ed out - when not actively debugging. The rule source
//                   data files may not be available in all environments.
4778  //                   Any permanent test cases should be moved to rbbitst.txt
4779  //                   (see Bug 20303 in that file, for example), or to another test function in this file.
4780  //
TestDebugRules()4781  void RBBITest::TestDebugRules() {
4782  #if 0
4783      const char16_t *rules = u""
4784          "!!quoted_literals_only; \n"
4785          "!!chain; \n"
4786          "!!lookAheadHardBreak; \n"
4787          " \n"
4788          // "[a] / ; \n"
4789          "[a] [b] / [c] [d]; \n"
4790          "[a] [b] / [c] [d] {100}; \n"
4791          "[x] [a] [b] / [c] [d] {100}; \n"
4792          "[a] [b] [c] / [d] {100}; \n"
4793          //" [c] [d] / [e] [f]; \n"
4794          //"[a] [b] / [c]; \n"
4795          ;
4796  
4797      UErrorCode status = U_ZERO_ERROR;
4798      CharString path(pathToDataDirectory(), status);
4799      path.appendPathPart("brkitr", status);
4800      path.appendPathPart("rules", status);
4801      path.appendPathPart("line.txt", status);
4802      int    len;
4803      std::unique_ptr<UChar []> testFile(ReadAndConvertFile(path.data(), len, "UTF-8", status));
4804      if (!assertSuccess(WHERE, status)) {
4805          return;
4806      }
4807  
4808      UParseError pe;
4809      // rules = testFile.get();
4810      RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rules, pe, status);
4811  
4812      if (!assertSuccess(WHERE, status)) {
4813          delete bi;
4814          return;
4815      }
4816      // bi->dumpTables();
4817  
4818      delete bi;
4819  #endif
4820  }
4821  
testTrieStateTable(int32_t numChar,bool expectedTrieWidthIn8Bits,bool expectedStateRowIn8Bits)4822  void RBBITest::testTrieStateTable(int32_t numChar, bool expectedTrieWidthIn8Bits, bool expectedStateRowIn8Bits) {
4823      UCPTrieValueWidth expectedTrieWidth = expectedTrieWidthIn8Bits ? UCPTRIE_VALUE_BITS_8 : UCPTRIE_VALUE_BITS_16;
4824      int32_t expectedStateRowBits = expectedStateRowIn8Bits ? RBBI_8BITS_ROWS : 0;
4825      // Text are duplicate characters from U+4E00 to U+4FFF
4826      UnicodeString text;
4827      for (UChar c = 0x4e00; c < 0x5000; c++) {
4828          text.append(c).append(c);
4829      }
4830      // Generate rule which will caused length+4 character classes and
4831      // length+3 states
4832      UnicodeString rules(u"!!quoted_literals_only;");
4833      for (UChar c = 0x4e00; c < 0x4e00 + numChar; c++) {
4834          rules.append(u'\'').append(c).append(c).append(u"';");
4835      }
4836      rules.append(u".;");
4837      UErrorCode status = U_ZERO_ERROR;
4838      UParseError parseError;
4839      RuleBasedBreakIterator bi(rules, parseError, status);
4840  
4841      assertEquals(WHERE, numChar + 4, bi.fData->fHeader->fCatCount);
4842      assertEquals(WHERE, numChar + 3, bi.fData->fForwardTable->fNumStates);
4843      assertEquals(WHERE, expectedTrieWidth, ucptrie_getValueWidth(bi.fData->fTrie));
4844      assertEquals(WHERE, expectedStateRowBits, bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS);
4845      assertEquals(WHERE, expectedStateRowBits, bi.fData->fReverseTable->fFlags & RBBI_8BITS_ROWS);
4846  
4847      bi.setText(text);
4848  
4849      int32_t pos;
4850      int32_t i = 0;
4851      while ((pos = bi.next()) > 0) {
4852          // The first numChar should not break between the pair
4853          if (i++ < numChar) {
4854              assertEquals(WHERE, i * 2, pos);
4855          } else {
4856              // After the first numChar next(), break on each character.
4857              assertEquals(WHERE, i + numChar, pos);
4858          }
4859      }
4860      while ((pos = bi.previous()) > 0) {
4861          // The first numChar should not break between the pair
4862          if (--i < numChar) {
4863              assertEquals(WHERE, i * 2, pos);
4864          } else {
4865              // After the first numChar next(), break on each character.
4866              assertEquals(WHERE, i + numChar, pos);
4867          }
4868      }
4869  }
4870  
Test8BitsTrieWith8BitStateTable()4871  void RBBITest::Test8BitsTrieWith8BitStateTable() {
4872      testTrieStateTable(251, true /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
4873  }
4874  
Test16BitsTrieWith8BitStateTable()4875  void RBBITest::Test16BitsTrieWith8BitStateTable() {
4876      testTrieStateTable(252, false /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
4877  }
4878  
Test16BitsTrieWith16BitStateTable()4879  void RBBITest::Test16BitsTrieWith16BitStateTable() {
4880      testTrieStateTable(253, false /* expectedTrieWidthIn8Bits */, false /* expectedStateRowIn8Bits */);
4881  }
4882  
Test8BitsTrieWith16BitStateTable()4883  void RBBITest::Test8BitsTrieWith16BitStateTable() {
4884      // Test UCPTRIE_VALUE_BITS_8 with 16 bits rows. Use a different approach to
4885      // create state table in 16 bits.
4886  
4887      // Generate 510 'a' as text
4888      UnicodeString text;
4889      for (int32_t i = 0; i < 510; i++) {
4890          text.append(u'a');
4891      }
4892  
4893      UnicodeString rules(u"!!quoted_literals_only;'");
4894      // 254 'a' in the rule will cause 256 states
4895      for (int32_t i = 0; i < 254; i++) {
4896          rules.append(u'a');
4897      }
4898      rules.append(u"';.;");
4899  
4900      UErrorCode status = U_ZERO_ERROR;
4901      UParseError parseError;
4902      LocalPointer<RuleBasedBreakIterator> bi(new RuleBasedBreakIterator(rules, parseError, status));
4903  
4904      assertEquals(WHERE, 256, bi->fData->fForwardTable->fNumStates);
4905      assertEquals(WHERE, UCPTRIE_VALUE_BITS_8, ucptrie_getValueWidth(bi->fData->fTrie));
4906      assertEquals(WHERE,
4907                   false, RBBI_8BITS_ROWS == (bi->fData->fForwardTable->fFlags & RBBI_8BITS_ROWS));
4908      bi->setText(text);
4909  
4910      // break positions:
4911      // 254, 508, 509, ... 510
4912      assertEquals("next()", 254, bi->next());
4913      int32_t i = 0;
4914      int32_t pos;
4915      while ((pos = bi->next()) > 0) {
4916          assertEquals(WHERE, 508 + i , pos);
4917          i++;
4918      }
4919      i = 0;
4920      while ((pos = bi->previous()) > 0) {
4921          i++;
4922          if (pos >= 508) {
4923              assertEquals(WHERE, 510 - i , pos);
4924          } else {
4925              assertEquals(WHERE, 254 , pos);
4926          }
4927      }
4928  }
4929  
4930  // Test that both compact (8 bit) and full sized (16 bit) rbbi tables work, and
4931  // that there are no problems with rules at the size that transitions between the two.
4932  //
4933  // A rule that matches a literal string, like 'abcdefghij', will require one state and
4934  // one character class per character in the string. So we can make a rule to tickle the
4935  // boundaries by using literal strings of various lengths.
4936  //
4937  // For both the number of states and the number of character classes, the eight bit format
4938  // only has 7 bits available, allowing for 128 values. For both, a few values are reserved,
4939  // leaving 120 something available. This test runs the string over the range of 120 - 130,
4940  // which allows some margin for changes to the number of values reserved by the rule builder
4941  // without breaking the test.
4942  
TestTable_8_16_Bits()4943  void RBBITest::TestTable_8_16_Bits() {
4944  
4945      // testStr serves as both the source of the rule string (truncated to the desired length)
4946      // and as test data to check matching behavior. A break rule consisting of the first 120
4947      // characters of testStr will match the first 120 chars of the full-length testStr.
4948      UnicodeString testStr;
4949      for (UChar c=0x3000; c<0x3200; ++c) {
4950          testStr.append(c);
4951      }
4952  
4953      const int32_t startLength = 120;   // The shortest rule string to test.
4954      const int32_t endLength = 260;     // The longest rule string to test
4955      const int32_t increment = this->quick ? endLength - startLength : 1;
4956  
4957      for (int32_t ruleLen=startLength; ruleLen <= endLength; ruleLen += increment) {
4958          UParseError parseError;
4959          UErrorCode status = U_ZERO_ERROR;
4960  
4961          UnicodeString ruleString{u"!!quoted_literals_only; '#';"};
4962          ruleString.findAndReplace(UnicodeString(u"#"), UnicodeString(testStr, 0, ruleLen));
4963          RuleBasedBreakIterator bi(ruleString, parseError, status);
4964          if (!assertSuccess(WHERE, status)) {
4965              errln(ruleString);
4966              break;
4967          }
4968          // bi.dumpTables();
4969  
4970          // Verify that the break iterator is functioning - that the first boundary found
4971          // in testStr is at the length of the rule string.
4972          bi.setText(testStr);
4973          assertEquals(WHERE, ruleLen, bi.next());
4974  
4975          // Reverse iteration. Do a setText() first, to flush the break iterator's internal cache
4976          // of previously detected boundaries, thus forcing the engine to run the safe reverse rules.
4977          bi.setText(testStr);
4978          int32_t result = bi.preceding(ruleLen);
4979          assertEquals(WHERE, 0, result);
4980  
4981          // Verify that the range of rule lengths being tested cover the translations
4982          // from 8 to 16 bit data.
4983          bool has8BitRowData = bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS;
4984          bool has8BitsTrie = ucptrie_getValueWidth(bi.fData->fTrie) == UCPTRIE_VALUE_BITS_8;
4985  
4986          if (ruleLen == startLength) {
4987              assertEquals(WHERE, true, has8BitRowData);
4988              assertEquals(WHERE, true, has8BitsTrie);
4989          }
4990          if (ruleLen == endLength) {
4991              assertEquals(WHERE, false, has8BitRowData);
4992              assertEquals(WHERE, false, has8BitsTrie);
4993          }
4994      }
4995  }
4996  
4997  /* Test handling of a large number of look-ahead rules.
4998   * The number of rules in the test exceeds the implementation limits prior to the
4999   * improvements introduced with #13590.
5000   *
5001   * The test look-ahead rules have the form "AB / CE"; "CD / EG"; ...
5002   * The text being matched is sequential, "ABCDEFGHI..."
5003   *
5004   * The upshot is that the look-ahead rules all match on their preceding context,
5005   * and consequently must save a potential result, but then fail to match on their
5006   * trailing context, so that they don't actually cause a boundary.
5007   *
5008   * Additionally, add a ".*" rule, so there are no boundaries unless a
5009   * look-ahead hard-break rule forces one.
5010   */
TestBug13590()5011  void RBBITest::TestBug13590() {
5012      UnicodeString rules {u"!!quoted_literals_only; !!chain; .*;\n"};
5013  
5014      const int NUM_LOOKAHEAD_RULES = 50;
5015      const char16_t STARTING_CHAR = u'\u5000';
5016      char16_t firstChar;
5017      for (int ruleNum = 0; ruleNum < NUM_LOOKAHEAD_RULES; ++ruleNum) {
5018          firstChar = STARTING_CHAR + ruleNum*2;
5019          rules.append(u'\'') .append(firstChar) .append(firstChar+1) .append(u'\'')
5020               .append(u' ') .append(u'/') .append(u' ')
5021               .append(u'\'') .append(firstChar+2) .append(firstChar+4) .append(u'\'')
5022               .append(u';') .append(u'\n');
5023      }
5024  
5025      // Change the last rule added from the form "UV / WY" to "UV / WX".
5026      // Changes the rule so that it will match - all 4 chars are in ascending sequence.
5027      rules.findAndReplace(UnicodeString(firstChar+4), UnicodeString(firstChar+3));
5028  
5029      UErrorCode status = U_ZERO_ERROR;
5030      UParseError parseError;
5031      RuleBasedBreakIterator bi(rules, parseError, status);
5032      if (!assertSuccess(WHERE, status)) {
5033          errln(rules);
5034          return;
5035      }
5036      // bi.dumpTables();
5037  
5038      UnicodeString testString;
5039      for (char16_t c = STARTING_CHAR-200; c < STARTING_CHAR + NUM_LOOKAHEAD_RULES*4; ++c) {
5040          testString.append(c);
5041      }
5042      bi.setText(testString);
5043  
5044      int breaksFound = 0;
5045      while (bi.next() != UBRK_DONE) {
5046          ++breaksFound;
5047      }
5048  
5049      // Two matches are expected, one from the last rule that was explicitly modified,
5050      // and one at the end of the text.
5051      assertEquals(WHERE, 2, breaksFound);
5052  }
5053  
5054  
5055  #if U_ENABLE_TRACING
// File-local records filled in by the utrace callbacks traceData, traceEntry
// and traceExit, and emptied by SetupTestTrace().
namespace {
std::vector<std::string> gData;     // string argument seen by each traceData call
std::vector<int32_t> gEntryFn;      // function number of each traceEntry call
std::vector<int32_t> gExitFn;       // function number of each traceExit call
std::vector<int32_t> gDataFn;       // function number of each traceData call
}  // namespace
5060  
traceData(const void *,int32_t fnNumber,int32_t,const char *,va_list args)5061  static void U_CALLCONV traceData(
5062          const void*,
5063          int32_t fnNumber,
5064          int32_t,
5065          const char *,
5066          va_list args) {
5067      if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5068          const char* data = va_arg(args, const char*);
5069          gDataFn.push_back(fnNumber);
5070          gData.push_back(data);
5071      }
5072  }
5073  
traceEntry(const void *,int32_t fnNumber)5074  static void traceEntry(const void *, int32_t fnNumber) {
5075      if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5076          gEntryFn.push_back(fnNumber);
5077      }
5078  }
5079  
traceExit(const void *,int32_t fnNumber,const char *,va_list)5080  static void traceExit(const void *, int32_t fnNumber, const char *, va_list) {
5081      if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5082          gExitFn.push_back(fnNumber);
5083      }
5084  }
5085  
5086  
assertTestTraceResult(int32_t fnNumber,const char * expectedData)5087  void RBBITest::assertTestTraceResult(int32_t fnNumber, const char* expectedData) {
5088      assertEquals("utrace_entry should be called ", 1, gEntryFn.size());
5089      assertEquals("utrace_entry should be called with ", fnNumber, gEntryFn[0]);
5090      assertEquals("utrace_exit should be called ", 1, gExitFn.size());
5091      assertEquals("utrace_exit should be called with ", fnNumber, gExitFn[0]);
5092  
5093      if (expectedData == nullptr) {
5094        assertEquals("utrace_data should not be called ", 0, gDataFn.size());
5095        assertEquals("utrace_data should not be called ", 0, gData.size());
5096      } else {
5097        assertEquals("utrace_data should be called ", 1, gDataFn.size());
5098        assertEquals("utrace_data should be called with ", fnNumber, gDataFn[0]);
5099        assertEquals("utrace_data should be called ", 1, gData.size());
5100        assertEquals("utrace_data should pass in ", expectedData, gData[0].c_str());
5101      }
5102  }
5103  
SetupTestTrace()5104  void SetupTestTrace() {
5105      gEntryFn.clear();
5106      gExitFn.clear();
5107      gDataFn.clear();
5108      gData.clear();
5109  
5110      const void* context = nullptr;
5111      utrace_setFunctions(context, traceEntry, traceExit, traceData);
5112      utrace_setLevel(UTRACE_INFO);
5113  }
5114  
TestTraceCreateCharacter(void)5115  void RBBITest::TestTraceCreateCharacter(void) {
5116      SetupTestTrace();
5117      IcuTestErrorCode status(*this, "TestTraceCreateCharacter");
5118      LocalPointer<BreakIterator> brkitr(
5119          BreakIterator::createCharacterInstance("zh-CN", status));
5120      status.errIfFailureAndReset();
5121      assertTestTraceResult(UTRACE_UBRK_CREATE_CHARACTER, nullptr);
5122  }
5123  
TestTraceCreateTitle(void)5124  void RBBITest::TestTraceCreateTitle(void) {
5125      SetupTestTrace();
5126      IcuTestErrorCode status(*this, "TestTraceCreateTitle");
5127      LocalPointer<BreakIterator> brkitr(
5128          BreakIterator::createTitleInstance("zh-CN", status));
5129      status.errIfFailureAndReset();
5130      assertTestTraceResult(UTRACE_UBRK_CREATE_TITLE, nullptr);
5131  }
5132  
TestTraceCreateSentence(void)5133  void RBBITest::TestTraceCreateSentence(void) {
5134      SetupTestTrace();
5135      IcuTestErrorCode status(*this, "TestTraceCreateSentence");
5136      LocalPointer<BreakIterator> brkitr(
5137          BreakIterator::createSentenceInstance("zh-CN", status));
5138      status.errIfFailureAndReset();
5139      assertTestTraceResult(UTRACE_UBRK_CREATE_SENTENCE, nullptr);
5140  }
5141  
TestTraceCreateWord(void)5142  void RBBITest::TestTraceCreateWord(void) {
5143      SetupTestTrace();
5144      IcuTestErrorCode status(*this, "TestTraceCreateWord");
5145      LocalPointer<BreakIterator> brkitr(
5146          BreakIterator::createWordInstance("zh-CN", status));
5147      status.errIfFailureAndReset();
5148      assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5149  }
5150  
TestTraceCreateLine(void)5151  void RBBITest::TestTraceCreateLine(void) {
5152      SetupTestTrace();
5153      IcuTestErrorCode status(*this, "TestTraceCreateLine");
5154      LocalPointer<BreakIterator> brkitr(
5155          BreakIterator::createLineInstance("zh-CN", status));
5156      status.errIfFailureAndReset();
5157      assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line");
5158  }
5159  
TestTraceCreateLineStrict(void)5160  void RBBITest::TestTraceCreateLineStrict(void) {
5161      SetupTestTrace();
5162      IcuTestErrorCode status(*this, "TestTraceCreateLineStrict");
5163      LocalPointer<BreakIterator> brkitr(
5164          BreakIterator::createLineInstance("zh-CN-u-lb-strict", status));
5165      status.errIfFailureAndReset();
5166      assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_strict");
5167  }
5168  
TestTraceCreateLineNormal(void)5169  void RBBITest::TestTraceCreateLineNormal(void) {
5170      SetupTestTrace();
5171      IcuTestErrorCode status(*this, "TestTraceCreateLineNormal");
5172      LocalPointer<BreakIterator> brkitr(
5173          BreakIterator::createLineInstance("zh-CN-u-lb-normal", status));
5174      status.errIfFailureAndReset();
5175      assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_normal");
5176  }
5177  
TestTraceCreateLineLoose(void)5178  void RBBITest::TestTraceCreateLineLoose(void) {
5179      SetupTestTrace();
5180      IcuTestErrorCode status(*this, "TestTraceCreateLineLoose");
5181      LocalPointer<BreakIterator> brkitr(
5182          BreakIterator::createLineInstance("zh-CN-u-lb-loose", status));
5183      status.errIfFailureAndReset();
5184      assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_loose");
5185  }
5186  
TestTraceCreateLineLoosePhrase(void)5187  void RBBITest::TestTraceCreateLineLoosePhrase(void) {
5188      SetupTestTrace();
5189      IcuTestErrorCode status(*this, "TestTraceCreateLineLoosePhrase");
5190      LocalPointer<BreakIterator> brkitr(
5191          BreakIterator::createLineInstance("ja-u-lb-loose-lw-phrase", status));
5192      status.errIfFailureAndReset();
5193      assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_loose_phrase");
5194  }
5195  
TestTraceCreateLineNormalPhrase(void)5196  void RBBITest::TestTraceCreateLineNormalPhrase(void) {
5197      SetupTestTrace();
5198      IcuTestErrorCode status(*this, "TestTraceCreateLineNormalPhrase");
5199      LocalPointer<BreakIterator> brkitr(
5200          BreakIterator::createLineInstance("ja-u-lb-normal-lw-phrase", status));
5201      status.errIfFailureAndReset();
5202      assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_normal_phrase");
5203  }
5204  
TestTraceCreateLineStrictPhrase(void)5205  void RBBITest::TestTraceCreateLineStrictPhrase(void) {
5206      SetupTestTrace();
5207      IcuTestErrorCode status(*this, "TestTraceCreateLineStrictPhrase");
5208      LocalPointer<BreakIterator> brkitr(
5209          BreakIterator::createLineInstance("ja-u-lb-strict-lw-phrase", status));
5210      status.errIfFailureAndReset();
5211      assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_strict_phrase");
5212  }
5213  
TestTraceCreateLinePhrase(void)5214  void RBBITest::TestTraceCreateLinePhrase(void) {
5215      SetupTestTrace();
5216      IcuTestErrorCode status(*this, "TestTraceCreateLinePhrase");
5217      LocalPointer<BreakIterator> brkitr(
5218          BreakIterator::createLineInstance("ja-u-lw-phrase", status));
5219      status.errIfFailureAndReset();
5220      assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_phrase");
5221  }
5222  
TestTraceCreateBreakEngine(void)5223  void RBBITest::TestTraceCreateBreakEngine(void) {
5224      rbbi_cleanup();
5225      SetupTestTrace();
5226      IcuTestErrorCode status(*this, "TestTraceCreateBreakEngine");
5227      LocalPointer<BreakIterator> brkitr(
5228          BreakIterator::createWordInstance("zh-CN", status));
5229      status.errIfFailureAndReset();
5230      assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5231  
5232      // To word break the following text, BreakIterator will create 5 dictionary
5233      // break engine internally.
5234      brkitr->setText(
5235          u"test "
5236          u"測試 " // Hani
5237          u"សាកល្បង " // Khmr
5238          u"ທົດສອບ " // Laoo
5239          u"စမ်းသပ်မှု " // Mymr
5240          u"ทดสอบ " // Thai
5241          u"test "
5242      );
5243  
5244      // Loop through all the text.
5245      while (brkitr->next() > 0) ;
5246  
5247      assertEquals("utrace_entry should be called ", 6, gEntryFn.size());
5248      assertEquals("utrace_exit should be called ", 6, gExitFn.size());
5249      assertEquals("utrace_data should be called ", 5, gDataFn.size());
5250  
5251      for (std::vector<int>::size_type i = 0; i < gDataFn.size(); i++) {
5252          assertEquals("utrace_entry should be called ",
5253                       UTRACE_UBRK_CREATE_BREAK_ENGINE, gEntryFn[i+1]);
5254          assertEquals("utrace_exit should be called ",
5255                       UTRACE_UBRK_CREATE_BREAK_ENGINE, gExitFn[i+1]);
5256          assertEquals("utrace_data should be called ",
5257                       UTRACE_UBRK_CREATE_BREAK_ENGINE, gDataFn[i]);
5258      }
5259  
5260      assertEquals("utrace_data should pass ", "Hani", gData[0].c_str());
5261      assertEquals("utrace_data should pass ", "Khmr", gData[1].c_str());
5262      assertEquals("utrace_data should pass ", "Laoo", gData[2].c_str());
5263      assertEquals("utrace_data should pass ", "Mymr", gData[3].c_str());
5264      assertEquals("utrace_data should pass ", "Thai", gData[4].c_str());
5265  
5266  }
5267  #endif
5268  
TestUnpairedSurrogate()5269  void RBBITest::TestUnpairedSurrogate() {
5270      UnicodeString rules(u"ab;");
5271  
5272      UErrorCode status = U_ZERO_ERROR;
5273      UParseError pe;
5274      RuleBasedBreakIterator bi1(rules, pe, status);
5275      assertSuccess(WHERE, status);
5276      UnicodeString rtRules = bi1.getRules();
5277      // make sure the simple one work first.
5278      assertEquals(WHERE, rules,  rtRules);
5279  
5280  
5281      rules = UnicodeString(u"a\\ud800b;").unescape();
5282      pe.line = 0;
5283      pe.offset = 0;
5284      RuleBasedBreakIterator bi2(rules, pe, status);
5285      assertEquals(WHERE "unpaired lead surrogate", U_ILLEGAL_CHAR_FOUND , status);
5286      if (pe.line != 1 || pe.offset != 1) {
5287          errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
5288      }
5289  
5290      status = U_ZERO_ERROR;
5291      rules = UnicodeString(u"a\\ude00b;").unescape();
5292      pe.line = 0;
5293      pe.offset = 0;
5294      RuleBasedBreakIterator bi3(rules, pe, status);
5295      assertEquals(WHERE "unpaired tail surrogate", U_ILLEGAL_CHAR_FOUND , status);
5296      if (pe.line != 1 || pe.offset != 1) {
5297          errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
5298      }
5299  
5300      // make sure the surrogate one work too.
5301      status = U_ZERO_ERROR;
5302      rules = UnicodeString(u"a��b;");
5303      RuleBasedBreakIterator bi4(rules, pe, status);
5304      rtRules = bi4.getRules();
5305      assertEquals(WHERE, rules, rtRules);
5306  }
5307  
5308  // Read file generated by
5309  // https://github.com/unicode-org/lstm_word_segmentation/blob/master/segment_text.py
5310  // as test cases and compare the Output.
5311  // Format of the file
5312  //   Model:\t[Model Name (such as 'Thai_graphclust_model4_heavy')]
5313  //   Embedding:\t[Embedding type (such as 'grapheme_clusters_tf')]
5314  //   Input:\t[source text]
5315  //   Output:\t[expected output separated by | ]
5316  //   Input: ...
5317  //   Output: ...
5318  
runLSTMTestFromFile(const char * filename,UScriptCode script)5319  void RBBITest::runLSTMTestFromFile(const char* filename, UScriptCode script) {
5320      // The expectation in this test depends on LSTM, skip the test if the
5321      // configuration is not build with LSTM data.
5322      if (skipLSTMTest()) {
5323          return;
5324      }
5325      UErrorCode   status = U_ZERO_ERROR;
5326      LocalPointer<BreakIterator> iterator(BreakIterator::createWordInstance(Locale(), status));
5327      if (U_FAILURE(status)) {
5328          errln("%s:%d Error %s Cannot create Word BreakIterator", __FILE__, __LINE__, u_errorName(status));
5329          return;
5330      }
5331      //  Open and read the test data file.
5332      const char *testDataDirectory = IntlTest::getSourceTestData(status);
5333      CharString testFileName(testDataDirectory, -1, status);
5334      testFileName.append(filename, -1, status);
5335  
5336      int len;
5337      UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
5338      if (U_FAILURE(status)) {
5339          errln("%s:%d Error %s opening test file %s", __FILE__, __LINE__, u_errorName(status), filename);
5340          return;
5341      }
5342  
5343      //  Put the test data into a UnicodeString
5344      UnicodeString testString(false, testFile, len);
5345  
5346      int32_t start = 0;
5347  
5348      UnicodeString line;
5349      int32_t end;
5350      std::string actual_sep_str;
5351      int32_t caseNum = 0;
5352      // Iterate through all the lines in the test file.
5353      do {
5354          int32_t cr = testString.indexOf(u'\r', start);
5355          int32_t lf = testString.indexOf(u'\n', start);
5356          end = cr >= 0 ? (lf >= 0 ? std::min(cr, lf) : cr) : lf;
5357          line = testString.tempSubString(start, end < 0 ? INT32_MAX : end - start);
5358          if (line.length() > 0) {
5359              // Separate each line to key and value by TAB.
5360              int32_t tab = line.indexOf(u'\t');
5361              UnicodeString key = line.tempSubString(0, tab);
5362              const UnicodeString value = line.tempSubString(tab+1);
5363  
5364              if (key == "Model:") {
5365                  // Verify the expectation in the test file match the LSTM model
5366                  // we are using now.
5367                  const LSTMData* data = CreateLSTMDataForScript(script, status);
5368                  if (U_FAILURE(status)) {
5369                      dataerrln("%s:%d Error %s Cannot create LSTM data for script %s",
5370                                __FILE__, __LINE__, u_errorName(status), uscript_getName(script));
5371                      return;
5372                  }
5373                  UnicodeString name(LSTMDataName(data));
5374                  DeleteLSTMData(data);
5375                  if (value != name) {
5376                      std::string utf8Name, utf8Value;
5377                      dataerrln("%s:%d Error %s The LSTM data for script %s is %s instead of %s",
5378                                __FILE__, __LINE__, u_errorName(status), uscript_getName(script),
5379                                name.toUTF8String<std::string>(utf8Name).c_str(),
5380                                value.toUTF8String<std::string>(utf8Value).c_str());
5381                      return;
5382                  }
5383              } else if (key == "Input:") {
5384                  UnicodeString input("prefix ");
5385                  input += value + " suffix";
5386                  std::stringstream ss;
5387  
5388                  // Construct the UText which is expected by the the engine as
5389                  // input from the UnicodeString.
5390                  UText ut = UTEXT_INITIALIZER;
5391                  utext_openConstUnicodeString(&ut, &input, &status);
5392                  if (U_FAILURE(status)) {
5393                      dataerrln("Could not utext_openConstUnicodeString for " + value + UnicodeString(u_errorName(status)));
5394                      return;
5395                  }
5396  
5397                  iterator->setText(&ut, status);
5398                  if (U_FAILURE(status)) {
5399                      errln("%s:%d Error %s Could not setText to BreakIterator", __FILE__, __LINE__, u_errorName(status));
5400                      return;
5401                  }
5402  
5403                  int32_t bp;
5404                  for (bp = iterator->first(); bp != BreakIterator::DONE; bp = iterator->next()) {
5405                      ss << bp;
5406                      if (bp != input.length()) {
5407                          ss << ", ";
5408                      }
5409                  }
5410  
5411                  utext_close(&ut);
5412                  // Turn the break points into a string for easy comparison
5413                  // output.
5414                  actual_sep_str = "{" + ss.str() + "}";
5415              } else if (key == "Output:" && !actual_sep_str.empty()) {
5416                  UnicodeString input("prefix| |");
5417                  input += value + "| |suffix";
5418                  std::string d;
5419                  int32_t sep;
5420                  int32_t start = 0;
5421                  int32_t curr = 0;
5422                  std::stringstream ss;
5423                  // Include 0 as the break point.
5424                  ss << "0, ";
5425                  while ((sep = input.indexOf(u'|', start)) >= 0) {
5426                      int32_t len = sep - start;
5427                      if (len > 0) {
5428                          if (curr > 0) {
5429                              ss << ", ";
5430                          }
5431                          curr += len;
5432                          ss << curr;
5433                      }
5434                      start = sep + 1;
5435                  }
5436                  // Include end of the string as break point.
5437                  ss << ", " << curr + input.length() - start;
5438                  // Turn the break points into a string for easy comparison
5439                  // output.
5440                  std::string expected = "{" + ss.str() + "}";
5441                  std::string utf8;
5442  
5443                  assertEquals((input + " Test Case#" + caseNum).toUTF8String<std::string>(utf8).c_str(),
5444                               expected.c_str(), actual_sep_str.c_str());
5445                  actual_sep_str.clear();
5446              }
5447          }
5448          start = std::max(cr, lf) + 1;
5449      } while (end >= 0);
5450  
5451      delete [] testFile;
5452  }
5453  
TestLSTMThai()5454  void RBBITest::TestLSTMThai() {
5455      runLSTMTestFromFile("Thai_graphclust_model4_heavy_Test.txt", USCRIPT_THAI);
5456  }
5457  
TestLSTMBurmese()5458  void RBBITest::TestLSTMBurmese() {
5459      runLSTMTestFromFile("Burmese_graphclust_model5_heavy_Test.txt", USCRIPT_MYANMAR);
5460  }
5461  
5462  
5463  // Test preceding(index) and following(index), with semi-random indexes.
5464  // The random indexes are produced in clusters that are relatively closely spaced,
5465  // to increase the occurrences of hits to the internal break cache.
5466  
TestRandomAccess()5467  void RBBITest::TestRandomAccess() {
5468      static constexpr int32_t CACHE_SIZE = 128;
5469  
5470      UnicodeString testData;
5471      for (int i=0; i<CACHE_SIZE*2; ++i) {
5472          testData.append(u"aaaa\n");
5473      }
5474  
5475      UErrorCode status = U_ZERO_ERROR;
5476      LocalPointer<RuleBasedBreakIterator> bi(
5477              (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status),
5478              status);
5479      if (!assertSuccess(WHERE, status)) { return; };
5480  
5481      bi->setText(testData);
5482  
5483      auto expectedPreceding = [](int from) {
5484          if (from == 0) {return UBRK_DONE;}
5485          if (from % 5 == 0) {return from - 5;}
5486          return from - (from % 5);
5487      };
5488  
5489      auto expectedFollow = [testData](int from) {
5490          if (from >= testData.length()) {return UBRK_DONE;}
5491          if (from % 5 == 0) {return from + 5;}
5492          return from + (5 - (from % 5));
5493      };
5494  
5495      auto randomStringIndex = [testData]() {
5496          static icu_rand randomGenerator;  // produces random uint32_t values.
5497          static int lastNum;
5498          static int clusterCount;
5499          static constexpr int CLUSTER_SIZE = 100;
5500          static constexpr int CLUSTER_LENGTH = 10;
5501  
5502          if (clusterCount < CLUSTER_LENGTH) {
5503              ++clusterCount;
5504              lastNum += (randomGenerator() % CLUSTER_SIZE);
5505              lastNum -= CLUSTER_SIZE / 2;
5506              lastNum = std::max(0, lastNum);
5507              // Deliberately test indexes > testData.length.
5508              lastNum = std::min(testData.length() + 5, lastNum);
5509          } else {
5510              clusterCount = 0;
5511              lastNum = randomGenerator() % testData.length();
5512          }
5513          return lastNum;
5514      };
5515  
5516      for (int i=0; i<5000; ++i) {
5517          int idx = randomStringIndex();
5518          assertEquals(WHERE, expectedFollow(idx), bi->following(idx));
5519          idx = randomStringIndex();
5520          assertEquals(WHERE, expectedPreceding(idx), bi->preceding(idx));
5521      }
5522  }
5523  
5524  #endif // #if !UCONFIG_NO_BREAK_ITERATION
5525