• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /********************************************************************
2  * COPYRIGHT:
3  * Copyright (c) 1999-2011, International Business Machines Corporation and
4  * others. All Rights Reserved.
5  ********************************************************************/
6 /************************************************************************
7 *   Date        Name        Description
8 *   12/15/99    Madhu        Creation.
9 *   01/12/2000  Madhu        Updated for changed API and added new tests
10 ************************************************************************/
11 
12 #include <typeinfo>  // for 'typeid' to work
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_BREAK_ITERATION
17 
18 #include "unicode/utypes.h"
19 #include "unicode/brkiter.h"
20 #include "unicode/rbbi.h"
21 #include "unicode/uchar.h"
22 #include "unicode/utf16.h"
23 #include "unicode/ucnv.h"
24 #include "unicode/schriter.h"
25 #include "unicode/uniset.h"
26 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
27 #include "unicode/ustring.h"
28 #include "unicode/utext.h"
29 #include "intltest.h"
30 #include "rbbitst.h"
31 #include <string.h>
32 #include "uvector.h"
33 #include "uvectr32.h"
34 #include "triedict.h"
35 #include <string.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38 
// TEST_ASSERT(x)   If the expression x is false, report a failure through errln(),
//                  giving the source file and line number of the failing assertion.
#define TEST_ASSERT(x) { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
}
41 
// TEST_ASSERT_SUCCESS(errcode)   If errcode is a failing UErrorCode, report it through
//                                errcheckln(), with the source file, line number and
//                                the name of the error code.
#define TEST_ASSERT_SUCCESS(errcode) { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
    } \
}
44 
45 
46 //---------------------------------------------
47 // runIndexedTest
48 //---------------------------------------------
49 
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)50 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
51 {
52     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
53 
54     switch (index) {
55 #if !UCONFIG_NO_FILE_IO
56         case 0: name = "TestBug4153072";
57             if(exec) TestBug4153072();                         break;
58 #else
59         case 0: name = "skip";
60             break;
61 #endif
62 
63         case 1: name = "TestJapaneseLineBreak";
64             if(exec) TestJapaneseLineBreak();                  break;
65         case 2: name = "TestStatusReturn";
66             if(exec) TestStatusReturn();                       break;
67 
68 #if !UCONFIG_NO_FILE_IO
69         case 3: name = "TestUnicodeFiles";
70             if(exec) TestUnicodeFiles();                       break;
71         case 4: name = "TestEmptyString";
72             if(exec) TestEmptyString();                        break;
73 #else
74         case 3: case 4: name = "skip";
75             break;
76 #endif
77 
78         case 5: name = "TestGetAvailableLocales";
79             if(exec) TestGetAvailableLocales();                break;
80 
81         case 6: name = "TestGetDisplayName";
82             if(exec) TestGetDisplayName();                     break;
83 
84 #if !UCONFIG_NO_FILE_IO
85         case 7: name = "TestEndBehaviour";
86             if(exec) TestEndBehaviour();                       break;
87         case 8: name = "TestMixedThaiLineBreak";
88              if(exec) TestMixedThaiLineBreak();                break;
89         case 9: name = "TestThaiLineBreak";
90              if(exec) TestThaiLineBreak();                     break;
91         case 10: name = "TestMaiyamok";
92              if(exec) TestMaiyamok();                          break;
93         case 11: name = "TestWordBreaks";
94              if(exec) TestWordBreaks();                        break;
95         case 12: name = "TestWordBoundary";
96              if(exec) TestWordBoundary();                      break;
97         case 13: name = "TestLineBreaks";
98              if(exec) TestLineBreaks();                        break;
99         case 14: name = "TestSentBreaks";
100              if(exec) TestSentBreaks();                        break;
101         case 15: name = "TestExtended";
102              if(exec) TestExtended();                          break;
103 #else
104         case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
105              break;
106 #endif
107 
108         case 16:
109              if(exec) {
110  #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
111                name = "TestMonkey";
112                TestMonkey(params);
113  #else
114                name = "skip";
115  #endif
116              }
117                                                                break;
118 
119 #if !UCONFIG_NO_FILE_IO
120         case 17: name = "TestBug3818";
121             if(exec) TestBug3818();                            break;
122         case 18: name = "TestJapaneseWordBreak";
123             if(exec) TestJapaneseWordBreak();                  break;
124 #else
125         case 17: case 18: name = "skip";
126             break;
127 #endif
128 
129         case 19: name = "TestDebug";
130             if(exec) TestDebug();                              break;
131         case 20: name = "TestTrieDict";
132             if(exec) TestTrieDict();                           break;
133 
134 #if !UCONFIG_NO_FILE_IO
135         case 21: name = "TestBug5775";
136             if (exec) TestBug5775();                           break;
137         case 22: name = "TestTailoredBreaks";
138             if (exec) TestTailoredBreaks();                    break;
139 #else
140         case 21: case 22: name = "skip";
141             break;
142 #endif
143         case 23: name = "TestDictRules";
144             if (exec) TestDictRules();                         break;
145         case 24: name = "TestBug5532";
146             if (exec) TestBug5532();                           break;
147         default: name = ""; break; //needed to end loop
148     }
149 }
150 
151 
152 //---------------------------------------------------------------------------
153 //
154 //   class BITestData   Holds a set of Break iterator test data and results
155 //                      Includes
156 //                         - the string data to be broken
157 //                         - a vector of the expected break positions.
158 //                         - a vector of source line numbers for the data,
159 //                               (to help see where errors occurred.)
160 //                         - The expected break tag values.
161 //                         - Vectors of actual break positions and tag values.
162 //                         - Functions for comparing actual with expected and
163 //                            reporting errors.
164 //
165 //----------------------------------------------------------------------------
166 class BITestData {
167 public:
168     UnicodeString    fDataToBreak;
169     UVector          fExpectedBreakPositions;
170     UVector          fExpectedTags;
171     UVector          fLineNum;
172     UVector          fActualBreakPositions;   // Test Results.
173     UVector          fActualTags;
174 
175     BITestData(UErrorCode &status);
176     void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
177     void             checkResults(const char *heading, RBBITest *test);
178     void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
179     void             clearResults();
180 };
181 
182 //
183 // Constructor.
184 //
BITestData(UErrorCode & status)185 BITestData::BITestData(UErrorCode &status)
186 : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
187   fActualTags(status)
188 {
189 }
190 
191 //
192 // addDataChunk.   Add a section (non-breaking) piece of data to the test data.
193 //                 The macro form collects the line number, which is helpful
194 //                 when tracking down failures.
195 //
196 //                 A null data item is inserted at the start of each test's data
197 //                  to put the starting zero into the data list.  The position saved for
198 //                  each non-null item is its ending position.
199 //
200 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
addDataChunk(const char * data,int32_t tag,int32_t lineNum,UErrorCode status)201 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
202     if (U_FAILURE(status)) {return;}
203     if (data != NULL) {
204         fDataToBreak.append(CharsToUnicodeString(data));
205     }
206     fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
207     fExpectedTags.addElement(tag, status);
208     fLineNum.addElement(lineNum, status);
209 }
210 
211 
212 //
213 //  checkResults.   Compare the actual and expected break positions, report any differences.
214 //
checkResults(const char * heading,RBBITest * test)215 void BITestData::checkResults(const char *heading, RBBITest *test) {
216     int32_t   expectedIndex = 0;
217     int32_t   actualIndex = 0;
218 
219     for (;;) {
220         // If we've run through both the expected and actual results vectors, we're done.
221         //   break out of the loop.
222         if (expectedIndex >= fExpectedBreakPositions.size() &&
223             actualIndex   >= fActualBreakPositions.size()) {
224             break;
225         }
226 
227 
228         if (expectedIndex >= fExpectedBreakPositions.size()) {
229             err(heading, test, expectedIndex-1, actualIndex);
230             actualIndex++;
231             continue;
232         }
233 
234         if (actualIndex >= fActualBreakPositions.size()) {
235             err(heading, test, expectedIndex, actualIndex-1);
236             expectedIndex++;
237             continue;
238         }
239 
240         if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
241             err(heading, test, expectedIndex, actualIndex);
242             // Try to resync the positions of the indices, to avoid a rash of spurious erros.
243             if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
244                 actualIndex++;
245             } else {
246                 expectedIndex++;
247             }
248             continue;
249         }
250 
251         if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
252             test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
253                 heading, fLineNum.elementAt(expectedIndex),
254                 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
255         }
256 
257         actualIndex++;
258         expectedIndex++;
259     }
260 }
261 
262 //
263 //  err   -  An error was found.  Report it, along with information about where the
264 //                                incorrectly broken test data appeared in the source file.
265 //
err(const char * heading,RBBITest * test,int32_t expectedIdx,int32_t actualIdx)266 void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
267 {
268     int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
269     int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
270     int32_t   o        = 0;
271     int32_t   line     = fLineNum.elementAti(expectedIdx);
272     if (expectedIdx > 0) {
273         // The line numbers are off by one because a premature break occurs somewhere
274         //    within the previous item, rather than at the start of the current (expected) item.
275         //    We want to report the offset of the unexpected break from the start of
276         //      this previous item.
277         o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
278     }
279     if (actual < expected) {
280         test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
281     } else {
282         test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
283     }
284 }
285 
286 
clearResults()287 void BITestData::clearResults() {
288     fActualBreakPositions.removeAllElements();
289     fActualTags.removeAllElements();
290 }
291 
292 
293 //-----------------------------------------------------------------------------------
294 //
295 //    Canned Test Characters
296 //
297 //-----------------------------------------------------------------------------------
298 
299 static const UChar cannedTestArray[] = {
300     0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,
301     0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
302     0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,
303     0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,
304     0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,
305     0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,
306     0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,
307     0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
308 };
309 
310 static UnicodeString* cannedTestChars = 0;
311 
312 #define  halfNA     "\\u0928\\u094d\\u200d"
313 #define  halfSA     "\\u0938\\u094d\\u200d"
314 #define  halfCHA    "\\u091a\\u094d\\u200d"
315 #define  halfKA     "\\u0915\\u094d\\u200d"
316 #define  deadTA     "\\u0924\\u094d"
317 
318 //--------------------------------------------------------------------------------------
319 //
320 //    RBBITest    constructor and destructor
321 //
322 //--------------------------------------------------------------------------------------
323 
RBBITest()324 RBBITest::RBBITest() {
325     UnicodeString temp(cannedTestArray);
326     cannedTestChars = new UnicodeString();
327     *cannedTestChars += (UChar)0x0000;
328     *cannedTestChars += temp;
329 }
330 
331 
~RBBITest()332 RBBITest::~RBBITest() {
333     delete cannedTestChars;
334 }
335 
336 
337 static const int T_NUMBER = 100;
338 static const int T_LETTER = 200;
339 static const int T_H_OR_K = 300;
340 static const int T_IDEO   = 400;
341 
342 
343 
344 
345 
346 
347 //--------------------------------------------------------------------
348 //Testing the BreakIterator for devanagari script
349 //--------------------------------------------------------------------
350 
351 #define deadRA   "\\u0930\\u094d"         /*deadform RA = devanagari RA + virama*/
352 #define deadPHA  "\\u092b\\u094d"         /*deadform PHA = devanagari PHA + virama*/
353 #define deadTTHA "\\u0920\\u094d"
354 #define deadPA   "\\u092a\\u094d"
355 #define deadSA   "\\u0938\\u094d"
356 #define visarga  "\\u0903"                /*devanagari visarga looks like a english colon*/
357 
358 
359 
360 
361 
362 
363 //-----------------------------------------------------------------------------------
364 //
365 //   Test for status {tag} return value from break rules.
366 //        TODO:  a more thorough test.
367 //
368 //-----------------------------------------------------------------------------------
TestStatusReturn()369 void RBBITest::TestStatusReturn() {
370      UnicodeString rulesString1("$Letters = [:L:];\n"
371                                   "$Numbers = [:N:];\n"
372                                   "$Letters+{1};\n"
373                                   "$Numbers+{2};\n"
374                                   "Help\\ {4}/me\\!;\n"
375                                   "[^$Letters $Numbers];\n"
376                                   "!.*;\n", -1, US_INV);
377      UnicodeString testString1  = "abc123..abc Help me Help me!";
378                                 // 01234567890123456789012345678
379      int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
380      int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
381 
382      UErrorCode status=U_ZERO_ERROR;
383      UParseError    parseError;
384 
385      RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
386      if(U_FAILURE(status)) {
387          dataerrln("FAIL : in construction - %s", u_errorName(status));
388      } else {
389          int32_t  pos;
390          int32_t  i = 0;
391          bi->setText(testString1);
392          for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
393              if (pos != bounds1[i]) {
394                  errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
395                  break;
396              }
397 
398              int tag = bi->getRuleStatus();
399              if (tag != brkStatus[i]) {
400                  errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
401                  break;
402              }
403              i++;
404          }
405      }
406      delete bi;
407 }
408 
409 
printStringBreaks(UnicodeString ustr,int expected[],int expectedcount)410 static void printStringBreaks(UnicodeString ustr, int expected[],
411                               int expectedcount)
412 {
413     UErrorCode status = U_ZERO_ERROR;
414     char name[100];
415     printf("code    alpha extend alphanum type word sent line name\n");
416     int j;
417     for (j = 0; j < ustr.length(); j ++) {
418         if (expectedcount > 0) {
419             int k;
420             for (k = 0; k < expectedcount; k ++) {
421                 if (j == expected[k]) {
422                     printf("------------------------------------------------ %d\n",
423                            j);
424                 }
425             }
426         }
427         UChar32 c = ustr.char32At(j);
428         if (c > 0xffff) {
429             j ++;
430         }
431         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
432         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
433                            u_isUAlphabetic(c),
434                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
435                            u_isalnum(c),
436                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
437                                                   u_charType(c),
438                                                   U_SHORT_PROPERTY_NAME),
439                            u_getPropertyValueName(UCHAR_WORD_BREAK,
440                                                   u_getIntPropertyValue(c,
441                                                           UCHAR_WORD_BREAK),
442                                                   U_SHORT_PROPERTY_NAME),
443                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
444                                    u_getIntPropertyValue(c,
445                                            UCHAR_SENTENCE_BREAK),
446                                    U_SHORT_PROPERTY_NAME),
447                            u_getPropertyValueName(UCHAR_LINE_BREAK,
448                                    u_getIntPropertyValue(c,
449                                            UCHAR_LINE_BREAK),
450                                    U_SHORT_PROPERTY_NAME),
451                            name);
452     }
453 }
454 
TestThaiLineBreak()455 void RBBITest::TestThaiLineBreak() {
456     UErrorCode status = U_ZERO_ERROR;
457     BITestData thaiLineSelection(status);
458 
459     // \u0e2f-- the Thai paiyannoi character-- isn't a letter.  It's a symbol that
460     // represents elided letters at the end of a long word.  It should be bound to
461     // the end of the word and not treated as an independent punctuation mark.
462 
463 
464     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
465     ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status);
466     ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status);
467     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status);
468     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status);
469 //        ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status);
470 //        ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
471     ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status);
472     // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us
473     ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status);
474     ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status);
475     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status);
476     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status);
477     ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status);
478     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status);
479 
480     // the one time where the paiyannoi occurs somewhere other than at the end
481     // of a word is in the Thai abbrevation for "etc.", which both begins and
482     // ends with a paiyannoi
483     ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status);
484     ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
485     ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status);
486 
487     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
488         Locale("th"), status);
489     if (U_FAILURE(status))
490     {
491         errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestThaiLineBreak. - %s", u_errorName(status));
492         return;
493     }
494 
495     generalIteratorTest(*e, thaiLineSelection);
496     delete e;
497 }
498 
499 
500 
TestMixedThaiLineBreak()501 void RBBITest::TestMixedThaiLineBreak()
502 {
503     UErrorCode   status = U_ZERO_ERROR;
504     BITestData   thaiLineSelection(status);
505 
506     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
507 
508 
509     // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters
510     // start
511 
512     ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
513     ADD_DATACHUNK(thaiLineSelection, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status);
514     ADD_DATACHUNK(thaiLineSelection, "2545 ", 0, status);
515     ADD_DATACHUNK(thaiLineSelection, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status);
516     ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
517     ADD_DATACHUNK(thaiLineSelection, "\\u0E09\\u0E25\\u0E2D\\u0E07", 0, status);
518     ADD_DATACHUNK(thaiLineSelection, "\\u0E04\\u0E23\\u0E1A", 0, status);
519     ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E2D\\u0E1A ", 0, status);
520     ADD_DATACHUNK(thaiLineSelection, "\"\\u0E52\\u0E52\\u0E50 ", 0, status);
521     ADD_DATACHUNK(thaiLineSelection, "\\u0E1b\\u0E35\" ", 0, status);
522     ADD_DATACHUNK(thaiLineSelection, "\\u0E02\\u0E2d\\u0E07", 0, status);
523     ADD_DATACHUNK(thaiLineSelection, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status);
524     ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status);
525     ADD_DATACHUNK(thaiLineSelection, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status);
526     ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status);
527     ADD_DATACHUNK(thaiLineSelection, "Bangkok)", 0, status);
528 
529     // @suwit - end of changes
530 
531 
532     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status);
533     if (U_FAILURE(status))
534     {
535         errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak. - %s", u_errorName(status));
536         return;
537     }
538 
539 
540     generalIteratorTest(*e, thaiLineSelection);
541     delete e;
542 }
543 
544 
TestMaiyamok()545 void RBBITest::TestMaiyamok()
546 {
547     UErrorCode status = U_ZERO_ERROR;
548     BITestData   thaiLineSelection(status);
549     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
550     // the Thai maiyamok character is a shorthand symbol that means "repeat the previous
551     // word".  Instead of appearing as a word unto itself, however, it's kept together
552     // with the word before it
553     ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status);
554     ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status);
555     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status);
556     ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status);
557     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e1e", 0, status);
558     ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status);
559     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35", 0, status);
560     ADD_DATACHUNK(thaiLineSelection, "\\u0e22\\u0e07", 0, status);
561     ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
562 
563     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
564         Locale("th"), status);
565 
566     if (U_FAILURE(status))
567     {
568         errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMaiyamok. - %s", u_errorName(status));
569         return;
570     }
571     generalIteratorTest(*e, thaiLineSelection);
572     delete e;
573 }
574 
575 
576 
TestBug3818()577 void RBBITest::TestBug3818() {
578     UErrorCode  status = U_ZERO_ERROR;
579 
580     // Four Thai words...
581     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
582                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
583     UnicodeString  thaiStr(thaiWordData);
584 
585     RuleBasedBreakIterator* bi =
586         (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
587     if (U_FAILURE(status) || bi == NULL) {
588         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
589         return;
590     }
591     bi->setText(thaiStr);
592 
593     int32_t  startOfSecondWord = bi->following(1);
594     if (startOfSecondWord != 4) {
595         errln("Fail at file %s, line %d expected start of word at 4, got %d",
596             __FILE__, __LINE__, startOfSecondWord);
597     }
598     startOfSecondWord = bi->following(0);
599     if (startOfSecondWord != 4) {
600         errln("Fail at file %s, line %d expected start of word at 4, got %d",
601             __FILE__, __LINE__, startOfSecondWord);
602     }
603     delete bi;
604 }
605 
606 
TestJapaneseWordBreak()607 void RBBITest::TestJapaneseWordBreak() {
608     UErrorCode status = U_ZERO_ERROR;
609     BITestData   japaneseWordSelection(status);
610 
611     ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status);           // Break at start of data
612     ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2
613     ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5
614     ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7
615     ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10
616     ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11
617     ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12
618 
619     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(
620         Locale("ja"), status);
621     if (U_FAILURE(status))
622     {
623         errcheckln(status, "Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
624         return;
625     }
626 
627     generalIteratorTest(*e, japaneseWordSelection);
628     delete e;
629 }
630 
TestTrieDict()631 void RBBITest::TestTrieDict() {
632     UErrorCode      status  = U_ZERO_ERROR;
633 
634     //
635     //  Open and read the test data file.
636     //
637     const char *testDataDirectory = IntlTest::getSourceTestData(status);
638     char testFileName[1000];
639     if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) {
640         errln("Can't open test data.  Path too long.");
641         return;
642     }
643     strcpy(testFileName, testDataDirectory);
644     strcat(testFileName, "riwords.txt");
645 
646     // Items needing deleting at the end
647     MutableTrieDictionary *mutableDict = NULL;
648     CompactTrieDictionary *compactDict = NULL;
649     UnicodeSet            *breaks      = NULL;
650     UChar                 *testFile    = NULL;
651     StringEnumeration     *enumer1     = NULL;
652     StringEnumeration     *enumer2     = NULL;
653     MutableTrieDictionary *mutable2    = NULL;
654     StringEnumeration     *cloneEnum   = NULL;
655     CompactTrieDictionary *compact2    = NULL;
656 
657 
658     const UnicodeString *originalWord = NULL;
659     const UnicodeString *cloneWord    = NULL;
660     UChar *current;
661     UChar *word;
662     UChar uc;
663     int32_t wordLen;
664     int32_t wordCount;
665     int32_t testCount;
666 
667     int    len;
668     testFile = ReadAndConvertFile(testFileName, len, NULL, status);
669     if (U_FAILURE(status)) {
670         goto cleanup; /* something went wrong, error already output */
671     }
672 
673     mutableDict = new MutableTrieDictionary(0x0E1C, status);
674     if (U_FAILURE(status)) {
675         errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
676         goto cleanup;
677     }
678 
679     breaks = new UnicodeSet;
680     breaks->add(0x000A);     // Line Feed
681     breaks->add(0x000D);     // Carriage Return
682     breaks->add(0x2028);     // Line Separator
683     breaks->add(0x2029);     // Paragraph Separator
684 
685     // Now add each non-comment line of the file as a word.
686     current = testFile;
687     word = current;
688     uc = *current++;
689     wordLen = 0;
690     wordCount = 0;
691 
692     while (uc) {
693         if (uc == 0x0023) {     // #comment line, skip
694             while (uc && !breaks->contains(uc)) {
695                 uc = *current++;
696             }
697         }
698         else while (uc && !breaks->contains(uc)) {
699             ++wordLen;
700             uc = *current++;
701         }
702         if (wordLen > 0) {
703             mutableDict->addWord(word, wordLen, status);
704             if (U_FAILURE(status)) {
705                 errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
706                 goto cleanup;
707             }
708             wordCount += 1;
709         }
710 
711         // Find beginning of next line
712         while (uc && breaks->contains(uc)) {
713             uc = *current++;
714         }
715         word = current-1;
716         wordLen = 0;
717     }
718 
719     if (wordCount < 50) {
720         errln("Word count (%d) unreasonably small\n", wordCount);
721         goto cleanup;
722     }
723 
724     enumer1 = mutableDict->openWords(status);
725     if (U_FAILURE(status)) {
726         errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
727         goto cleanup;
728     }
729 
730     testCount = 0;
731     if (wordCount != (testCount = enumer1->count(status))) {
732         errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
733             testCount, wordCount, u_errorName(status));
734         goto cleanup;
735     }
736 
737     // Now compact it
738     compactDict = new CompactTrieDictionary(*mutableDict, status);
739     if (U_FAILURE(status)) {
740         errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
741         goto cleanup;
742     }
743 
744     enumer2 = compactDict->openWords(status);
745     if (U_FAILURE(status)) {
746         errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
747         goto cleanup;
748     }
749 
750     if (wordCount != (testCount = enumer2->count(status))) {
751         errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
752             testCount, wordCount, u_errorName(status));
753         goto cleanup;
754     }
755 
756     if (typeid(*enumer1) == typeid(*enumer2)) {
757         errln("CompactTrieEnumeration and MutableTrieEnumeration typeids are the same");
758     }
759     delete enumer1;
760     enumer1 = NULL;
761     delete enumer2;
762     enumer2 = NULL;
763 
764     // Now un-compact it
765     mutable2 = compactDict->cloneMutable(status);
766     if (U_FAILURE(status)) {
767         errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
768         goto cleanup;
769     }
770 
771     cloneEnum = mutable2->openWords(status);
772     if (U_FAILURE(status)) {
773         errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
774         goto cleanup;
775     }
776 
777     if (wordCount != (testCount = cloneEnum->count(status))) {
778         errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
779             testCount, wordCount, u_errorName(status));
780         goto cleanup;
781     }
782 
783     // Compact original dictionary to clone. Note that we can only compare the same kind of
784     // dictionary as the order of the enumerators is not guaranteed to be the same between
785     // different kinds
786     enumer1 = mutableDict->openWords(status);
787     if (U_FAILURE(status)) {
788         errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
789         goto cleanup;
790      }
791 
792     originalWord = enumer1->snext(status);
793     cloneWord = cloneEnum->snext(status);
794     while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
795         if (*originalWord != *cloneWord) {
796             errln("Original and cloned MutableTrieDictionary word mismatch\n");
797             goto cleanup;
798         }
799         originalWord = enumer1->snext(status);
800         cloneWord = cloneEnum->snext(status);
801     }
802 
803     if (U_FAILURE(status)) {
804         errln("Enumeration failed: %s\n", u_errorName(status));
805         goto cleanup;
806     }
807 
808     if (originalWord != cloneWord) {
809         errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
810         goto cleanup;
811     }
812 
813     // Test the data copying constructor for CompactTrieDict, and the data access APIs.
814     compact2 = new CompactTrieDictionary(compactDict->data(), status);
815     if (U_FAILURE(status)) {
816         errln("CompactTrieDictionary(const void *,...) failed\n");
817         goto cleanup;
818     }
819 
820     if (compact2->dataSize() == 0) {
821         errln("CompactTrieDictionary->dataSize() == 0\n");
822         goto cleanup;
823     }
824 
825     // Now count the words via the second dictionary
826     delete enumer1;
827     enumer1 = compact2->openWords(status);
828     if (U_FAILURE(status)) {
829         errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
830         goto cleanup;
831     }
832 
833     if (wordCount != (testCount = enumer1->count(status))) {
834         errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
835             testCount, wordCount, u_errorName(status));
836         goto cleanup;
837     }
838 
839 cleanup:
840     delete compactDict;
841     delete mutableDict;
842     delete breaks;
843     delete[] testFile;
844     delete enumer1;
845     delete mutable2;
846     delete cloneEnum;
847     delete compact2;
848 }
849 
850 
851 //----------------------------------------------------------------------------
852 //
853 // generalIteratorTest      Given a break iterator and a set of test data,
854 //                          Run the tests and report the results.
855 //
856 //----------------------------------------------------------------------------
generalIteratorTest(RuleBasedBreakIterator & bi,BITestData & td)857 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
858 {
859 
860     bi.setText(td.fDataToBreak);
861 
862     testFirstAndNext(bi, td);
863 
864     testLastAndPrevious(bi, td);
865 
866     testFollowing(bi, td);
867     testPreceding(bi, td);
868     testIsBoundary(bi, td);
869     doMultipleSelectionTest(bi, td);
870 }
871 
872 
873 //
874 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
875 //                       kind of loop.
876 //
testFirstAndNext(RuleBasedBreakIterator & bi,BITestData & td)877 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
878 {
879     UErrorCode  status = U_ZERO_ERROR;
880     int32_t     p;
881     int32_t     lastP = -1;
882     int32_t     tag;
883 
884     logln("Test first and next");
885     bi.setText(td.fDataToBreak);
886     td.clearResults();
887 
888     for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
889         td.fActualBreakPositions.addElement(p, status);  // Save result.
890         tag = bi.getRuleStatus();
891         td.fActualTags.addElement(tag, status);
892         if (p <= lastP) {
893             // If the iterator is not making forward progress, stop.
894             //  No need to raise an error here, it'll be detected in the normal check of results.
895             break;
896         }
897         lastP = p;
898     }
899     td.checkResults("testFirstAndNext", this);
900 }
901 
902 
903 //
904 //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
905 //
testLastAndPrevious(RuleBasedBreakIterator & bi,BITestData & td)906 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
907 {
908     UErrorCode  status = U_ZERO_ERROR;
909     int32_t     p;
910     int32_t     lastP  = 0x7ffffffe;
911     int32_t     tag;
912 
913     logln("Test last and previous");
914     bi.setText(td.fDataToBreak);
915     td.clearResults();
916 
917     for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
918         // Save break position.  Insert it at start of vector of results, shoving
919         //    already-saved results further towards the end.
920         td.fActualBreakPositions.insertElementAt(p, 0, status);
921         // bi.previous();   // TODO:  Why does this fix things up????
922         // bi.next();
923         tag = bi.getRuleStatus();
924         td.fActualTags.insertElementAt(tag, 0, status);
925         if (p >= lastP) {
926             // If the iterator is not making progress, stop.
927             //  No need to raise an error here, it'll be detected in the normal check of results.
928             break;
929         }
930         lastP = p;
931     }
932     td.checkResults("testLastAndPrevious", this);
933 }
934 
935 
testFollowing(RuleBasedBreakIterator & bi,BITestData & td)936 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
937 {
938     UErrorCode  status = U_ZERO_ERROR;
939     int32_t     p;
940     int32_t     tag;
941     int32_t     lastP  = -2;     // A value that will never be returned as a break position.
942                                  //   cannot be -1; that is returned for DONE.
943     int         i;
944 
945     logln("testFollowing():");
946     bi.setText(td.fDataToBreak);
947     td.clearResults();
948 
949     // Save the starting point, since we won't get that out of following.
950     p = bi.first();
951     td.fActualBreakPositions.addElement(p, status);  // Save result.
952     tag = bi.getRuleStatus();
953     td.fActualTags.addElement(tag, status);
954 
955     for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
956         p = bi.following(i);
957         if (p != lastP) {
958             if (p == RuleBasedBreakIterator::DONE) {
959                 break;
960             }
961             // We've reached a new break position.  Save it.
962             td.fActualBreakPositions.addElement(p, status);  // Save result.
963             tag = bi.getRuleStatus();
964             td.fActualTags.addElement(tag, status);
965             lastP = p;
966         }
967     }
968     // The loop normally exits by means of the break in the middle.
969     // Make sure that the index was at the correct position for the break iterator to have
970     //   returned DONE.
971     if (i != td.fDataToBreak.length()) {
972         errln("testFollowing():  iterator returned DONE prematurely.");
973     }
974 
975     // Full check of all results.
976     td.checkResults("testFollowing", this);
977 }
978 
979 
980 
testPreceding(RuleBasedBreakIterator & bi,BITestData & td)981 void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
982     UErrorCode  status = U_ZERO_ERROR;
983     int32_t     p;
984     int32_t     tag;
985     int32_t     lastP  = 0x7ffffffe;
986     int         i;
987 
988     logln("testPreceding():");
989     bi.setText(td.fDataToBreak);
990     td.clearResults();
991 
992     p = bi.last();
993     td.fActualBreakPositions.addElement(p, status);
994     tag = bi.getRuleStatus();
995     td.fActualTags.addElement(tag, status);
996 
997     for (i = td.fDataToBreak.length(); i>=-1; i--) {
998         p = bi.preceding(i);
999         if (p != lastP) {
1000             if (p == RuleBasedBreakIterator::DONE) {
1001                 break;
1002             }
1003             // We've reached a new break position.  Save it.
1004             td.fActualBreakPositions.insertElementAt(p, 0, status);
1005             lastP = p;
1006             tag = bi.getRuleStatus();
1007             td.fActualTags.insertElementAt(tag, 0, status);
1008         }
1009     }
1010     // The loop normally exits by means of the break in the middle.
1011     // Make sure that the index was at the correct position for the break iterator to have
1012     //   returned DONE.
1013     if (i != 0) {
1014         errln("testPreceding():  iterator returned DONE prematurely.");
1015     }
1016 
1017     // Full check of all results.
1018     td.checkResults("testPreceding", this);
1019 }
1020 
1021 
1022 
testIsBoundary(RuleBasedBreakIterator & bi,BITestData & td)1023 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
1024     UErrorCode  status = U_ZERO_ERROR;
1025     int         i;
1026     int32_t     tag;
1027 
1028     logln("testIsBoundary():");
1029     bi.setText(td.fDataToBreak);
1030     td.clearResults();
1031 
1032     for (i = 0; i <= td.fDataToBreak.length(); i++) {
1033         if (bi.isBoundary(i)) {
1034             td.fActualBreakPositions.addElement(i, status);  // Save result.
1035             tag = bi.getRuleStatus();
1036             td.fActualTags.addElement(tag, status);
1037         }
1038     }
1039     td.checkResults("testIsBoundary: ", this);
1040 }
1041 
1042 
1043 
doMultipleSelectionTest(RuleBasedBreakIterator & iterator,BITestData & td)1044 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
1045 {
1046     iterator.setText(td.fDataToBreak);
1047 
1048     RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
1049     int32_t offset = iterator.first();
1050     int32_t testOffset;
1051     int32_t count = 0;
1052 
1053     logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
1054 
1055     if (*testIterator != iterator)
1056         errln("clone() or operator!= failed: two clones compared unequal");
1057 
1058     do {
1059         testOffset = testIterator->first();
1060         testOffset = testIterator->next(count);
1061         if (offset != testOffset)
1062             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
1063 
1064         if (offset != RuleBasedBreakIterator::DONE) {
1065             count++;
1066             offset = iterator.next();
1067 
1068             if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
1069                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
1070                 if (count > 10000 || offset == -1) {
1071                     errln("operator== failed too many times. Stopping test.");
1072                     if (offset == -1) {
1073                         errln("Does (RuleBasedBreakIterator::DONE == -1)?");
1074                     }
1075                     return;
1076                 }
1077             }
1078         }
1079     } while (offset != RuleBasedBreakIterator::DONE);
1080 
1081     // now do it backwards...
1082     offset = iterator.last();
1083     count = 0;
1084 
1085     do {
1086         testOffset = testIterator->last();
1087         testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
1088         if (offset != testOffset)
1089             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
1090 
1091         if (offset != RuleBasedBreakIterator::DONE) {
1092             count--;
1093             offset = iterator.previous();
1094         }
1095     } while (offset != RuleBasedBreakIterator::DONE);
1096 
1097     delete testIterator;
1098 }
1099 
1100 
1101 //---------------------------------------------
1102 //
1103 //     other tests
1104 //
1105 //---------------------------------------------
TestEmptyString()1106 void RBBITest::TestEmptyString()
1107 {
1108     UnicodeString text = "";
1109     UErrorCode status = U_ZERO_ERROR;
1110 
1111     BITestData x(status);
1112     ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
1113     RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
1114     if (U_FAILURE(status))
1115     {
1116         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
1117         return;
1118     }
1119     generalIteratorTest(*bi, x);
1120     delete bi;
1121 }
1122 
TestGetAvailableLocales()1123 void RBBITest::TestGetAvailableLocales()
1124 {
1125     int32_t locCount = 0;
1126     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
1127 
1128     if (locCount == 0)
1129         dataerrln("getAvailableLocales() returned an empty list!");
1130     // Just make sure that it's returning good memory.
1131     int32_t i;
1132     for (i = 0; i < locCount; ++i) {
1133         logln(locList[i].getName());
1134     }
1135 }
1136 
1137 //Testing the BreakIterator::getDisplayName() function
TestGetDisplayName()1138 void RBBITest::TestGetDisplayName()
1139 {
1140     UnicodeString   result;
1141 
1142     BreakIterator::getDisplayName(Locale::getUS(), result);
1143     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
1144         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
1145                 + result);
1146 
1147     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
1148     if (result != "French (France)")
1149         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
1150                 + result);
1151 }
1152 /**
1153  * Test End Behaviour
1154  * @bug 4068137
1155  */
TestEndBehaviour()1156 void RBBITest::TestEndBehaviour()
1157 {
1158     UErrorCode status = U_ZERO_ERROR;
1159     UnicodeString testString("boo.");
1160     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
1161     if (U_FAILURE(status))
1162     {
1163         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
1164         return;
1165     }
1166     wb->setText(testString);
1167 
1168     if (wb->first() != 0)
1169         errln("Didn't get break at beginning of string.");
1170     if (wb->next() != 3)
1171         errln("Didn't get break before period in \"boo.\"");
1172     if (wb->current() != 4 && wb->next() != 4)
1173         errln("Didn't get break at end of string.");
1174     delete wb;
1175 }
1176 /*
1177  * @bug 4153072
1178  */
TestBug4153072()1179 void RBBITest::TestBug4153072() {
1180     UErrorCode status = U_ZERO_ERROR;
1181     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
1182     if (U_FAILURE(status))
1183     {
1184         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
1185         return;
1186     }
1187     UnicodeString str("...Hello, World!...");
1188     int32_t begin = 3;
1189     int32_t end = str.length() - 3;
1190     UBool onBoundary;
1191 
1192     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
1193     iter->adoptText(textIterator);
1194     int index;
1195     // Note: with the switch to UText, there is no way to restrict the
1196     //       iteration range to begin at an index other than zero.
1197     //       String character iterators created with a non-zero bound are
1198     //         treated by RBBI as being empty.
1199     for (index = -1; index < begin + 1; ++index) {
1200         onBoundary = iter->isBoundary(index);
1201         if (index == 0?  !onBoundary : onBoundary) {
1202             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
1203                             " and begin index = " + begin);
1204         }
1205     }
1206     delete iter;
1207 }
1208 
1209 
1210 //
1211 // Test for problem reported by Ashok Matoria on 9 July 2007
1212 //    One.<kSoftHyphen><kSpace>Two.
1213 //
1214 //    Sentence break at start (0) and then on calling next() it breaks at
1215 //   'T' of "Two". Now, at this point if I do next() and
1216 //    then previous(), it breaks at <kSoftHyphen> instead of 'T' of "Two".
1217 //
TestBug5775()1218 void RBBITest::TestBug5775() {
1219     UErrorCode status = U_ZERO_ERROR;
1220     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1221     TEST_ASSERT_SUCCESS(status);
1222     if (U_FAILURE(status)) {
1223         return;
1224     }
1225 // Check for status first for better handling of no data errors.
1226     TEST_ASSERT(bi != NULL);
1227     if (bi == NULL) {
1228         return;
1229     }
1230 
1231     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
1232     //               01234      56789
1233     s = s.unescape();
1234     bi->setText(s);
1235     int pos = bi->next();
1236     TEST_ASSERT(pos == 6);
1237     pos = bi->next();
1238     TEST_ASSERT(pos == 10);
1239     pos = bi->previous();
1240     TEST_ASSERT(pos == 6);
1241     delete bi;
1242 }
1243 
1244 
1245 
1246 /**
1247  * Test Japanese Line Break
1248  * @bug 4095322
1249  */
TestJapaneseLineBreak()1250 void RBBITest::TestJapaneseLineBreak()
1251 {
1252 #if 0
1253     // Test needs updating some more...   Dump it for now.
1254 
1255 
1256     // Change for Unicode TR 14:  Punctuation characters with categories Pi and Pf do not count
1257     //        as opening and closing punctuation for line breaking.
1258     //        Also, \u30fc and \u30fe are not counted as hyphens.   Remove these chars
1259     //        from these tests.    6-13-2002
1260     //
1261     UErrorCode status = U_ZERO_ERROR;
1262     UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c");
1263     UnicodeString precedingChars = CharsToUnicodeString(
1264         //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
1265         "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");
1266     UnicodeString followingChars = CharsToUnicodeString(
1267         // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"
1268         ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"
1269         // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"
1270         ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"
1271         "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
1272     BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status);
1273 
1274     int32_t i;
1275     if (U_FAILURE(status))
1276     {
1277         errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");
1278         return;
1279     }
1280 
1281     for (i = 0; i < precedingChars.length(); i++) {
1282         testString.setCharAt(1, precedingChars[i]);
1283         iter->setText(testString);
1284         int32_t j = iter->first();
1285         if (j != 0)
1286             errln("ja line break failure: failed to start at 0");
1287         j = iter->next();
1288         if (j != 1)
1289             errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i])
1290                         + "' (" + ((int)(precedingChars[i])) + ")");
1291         j = iter->next();
1292         if (j != 3)
1293             errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i])
1294                         + "' (" + ((int)(precedingChars[i])) + ")");
1295     }
1296 
1297     for (i = 0; i < followingChars.length(); i++) {
1298         testString.setCharAt(1, followingChars[i]);
1299         iter->setText(testString);
1300         int j = iter->first();
1301         if (j != 0)
1302             errln("ja line break failure: failed to start at 0");
1303         j = iter->next();
1304         if (j != 2)
1305             errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i])
1306                         + "' (" + ((int)(followingChars[i])) + ")");
1307         j = iter->next();
1308         if (j != 3)
1309             errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i])
1310                         + "' (" + ((int)(followingChars[i])) + ")");
1311     }
1312     delete iter;
1313 #endif
1314 }
1315 
1316 
1317 //------------------------------------------------------------------------------
1318 //
1319 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
1320 //
1321 //------------------------------------------------------------------------------
1322 
1323 struct TestParams {
1324     BreakIterator   *bi;
1325     UnicodeString    dataToBreak;
1326     UVector32       *expectedBreaks;
1327     UVector32       *srcLine;
1328     UVector32       *srcCol;
1329 };
1330 
executeTest(TestParams * t)1331 void RBBITest::executeTest(TestParams *t) {
1332     int32_t    bp;
1333     int32_t    prevBP;
1334     int32_t    i;
1335 
1336     if (t->bi == NULL) {
1337         return;
1338     }
1339 
1340     t->bi->setText(t->dataToBreak);
1341     //
1342     //  Run the iterator forward
1343     //
1344     prevBP = -1;
1345     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
1346         if (prevBP ==  bp) {
1347             // Fail for lack of forward progress.
1348             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
1349                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1350             break;
1351         }
1352 
1353         // Check that there were we didn't miss an expected break between the last one
1354         //  and this one.
1355         for (i=prevBP+1; i<bp; i++) {
1356             if (t->expectedBreaks->elementAti(i) != 0) {
1357                 int expected[] = {0, i};
1358                 printStringBreaks(t->dataToBreak, expected, 2);
1359                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1360                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1361             }
1362         }
1363 
1364         // Check that the break we did find was expected
1365         if (t->expectedBreaks->elementAti(bp) == 0) {
1366             int expected[] = {0, bp};
1367             printStringBreaks(t->dataToBreak, expected, 2);
1368             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1369                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1370         } else {
1371             // The break was expected.
1372             //   Check that the {nnn} tag value is correct.
1373             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
1374             if (expectedTagVal == -1) {
1375                 expectedTagVal = 0;
1376             }
1377             int32_t line = t->srcLine->elementAti(bp);
1378             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1379             if (rs != expectedTagVal) {
1380                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
1381                       "          Actual, Expected status = %4d, %4d",
1382                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
1383             }
1384         }
1385 
1386 
1387         prevBP = bp;
1388     }
1389 
1390     // Verify that there were no missed expected breaks after the last one found
1391     for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
1392         if (t->expectedBreaks->elementAti(i) != 0) {
1393             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1394                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1395         }
1396     }
1397 
1398     //
1399     //  Run the iterator backwards, verify that the same breaks are found.
1400     //
1401     prevBP = t->dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
1402     for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1403         if (prevBP ==  bp) {
1404             // Fail for lack of progress.
1405             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
1406                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1407             break;
1408         }
1409 
1410         // Check that there were we didn't miss an expected break between the last one
1411         //  and this one.  (UVector returns zeros for index out of bounds.)
1412         for (i=prevBP-1; i>bp; i--) {
1413             if (t->expectedBreaks->elementAti(i) != 0) {
1414                 errln("Reverse Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1415                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1416             }
1417         }
1418 
1419         // Check that the break we did find was expected
1420         if (t->expectedBreaks->elementAti(bp) == 0) {
1421             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1422                    bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1423         } else {
1424             // The break was expected.
1425             //   Check that the {nnn} tag value is correct.
1426             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
1427             if (expectedTagVal == -1) {
1428                 expectedTagVal = 0;
1429             }
1430             int line = t->srcLine->elementAti(bp);
1431             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1432             if (rs != expectedTagVal) {
1433                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
1434                       "          Actual, Expected status = %4d, %4d",
1435                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
1436             }
1437         }
1438 
1439         prevBP = bp;
1440     }
1441 
1442     // Verify that there were no missed breaks prior to the last one found
1443     for (i=prevBP-1; i>=0; i--) {
1444         if (t->expectedBreaks->elementAti(i) != 0) {
1445             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1446                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1447         }
1448     }
1449 }
1450 
1451 
TestExtended()1452 void RBBITest::TestExtended() {
1453 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1454     UErrorCode      status  = U_ZERO_ERROR;
1455     Locale          locale("");
1456 
1457     UnicodeString       rules;
1458     TestParams          tp;
1459     tp.bi             = NULL;
1460     tp.expectedBreaks = new UVector32(status);
1461     tp.srcLine        = new UVector32(status);
1462     tp.srcCol         = new UVector32(status);
1463 
1464     RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
1465     if (U_FAILURE(status)) {
1466         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1467     }
1468 
1469 
1470     //
1471     //  Open and read the test data file.
1472     //
1473     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1474     char testFileName[1000];
1475     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1476         errln("Can't open test data.  Path too long.");
1477         return;
1478     }
1479     strcpy(testFileName, testDataDirectory);
1480     strcat(testFileName, "rbbitst.txt");
1481 
1482     int    len;
1483     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1484     if (U_FAILURE(status)) {
1485         return; /* something went wrong, error already output */
1486     }
1487 
1488 
1489 
1490 
1491     //
1492     //  Put the test data into a UnicodeString
1493     //
1494     UnicodeString testString(FALSE, testFile, len);
1495 
1496     enum EParseState{
1497         PARSE_COMMENT,
1498         PARSE_TAG,
1499         PARSE_DATA,
1500         PARSE_NUM
1501     }
1502     parseState = PARSE_TAG;
1503 
1504     EParseState savedState = PARSE_TAG;
1505 
1506     static const UChar CH_LF        = 0x0a;
1507     static const UChar CH_CR        = 0x0d;
1508     static const UChar CH_HASH      = 0x23;
1509     /*static const UChar CH_PERIOD    = 0x2e;*/
1510     static const UChar CH_LT        = 0x3c;
1511     static const UChar CH_GT        = 0x3e;
1512     static const UChar CH_BACKSLASH = 0x5c;
1513     static const UChar CH_BULLET    = 0x2022;
1514 
1515     int32_t    lineNum  = 1;
1516     int32_t    colStart = 0;
1517     int32_t    column   = 0;
1518     int32_t    charIdx  = 0;
1519 
1520     int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
1521 
1522     for (charIdx = 0; charIdx < len; ) {
1523         status = U_ZERO_ERROR;
1524         UChar  c = testString.charAt(charIdx);
1525         charIdx++;
1526         if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1527             // treat CRLF as a unit
1528             c = CH_LF;
1529             charIdx++;
1530         }
1531         if (c == CH_LF || c == CH_CR) {
1532             lineNum++;
1533             colStart = charIdx;
1534         }
1535         column = charIdx - colStart + 1;
1536 
1537         switch (parseState) {
1538         case PARSE_COMMENT:
1539             if (c == 0x0a || c == 0x0d) {
1540                 parseState = savedState;
1541             }
1542             break;
1543 
1544         case PARSE_TAG:
1545             {
1546             if (c == CH_HASH) {
1547                 parseState = PARSE_COMMENT;
1548                 savedState = PARSE_TAG;
1549                 break;
1550             }
1551             if (u_isUWhiteSpace(c)) {
1552                 break;
1553             }
1554             if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1555                 delete tp.bi;
1556                 tp.bi = BreakIterator::createWordInstance(locale,  status);
1557                 charIdx += 5;
1558                 break;
1559             }
1560             if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1561                 delete tp.bi;
1562                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
1563                 charIdx += 5;
1564                 break;
1565             }
1566             if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1567                 delete tp.bi;
1568                 tp.bi = BreakIterator::createLineInstance(locale,  status);
1569                 charIdx += 5;
1570                 break;
1571             }
1572             if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1573                 delete tp.bi;
1574                 tp.bi = NULL;
1575                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
1576                 charIdx += 5;
1577                 break;
1578             }
1579             if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1580                 delete tp.bi;
1581                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
1582                 charIdx += 6;
1583                 break;
1584             }
1585 
1586             // <locale  loc_name>
1587             localeMatcher.reset(testString);
1588             if (localeMatcher.lookingAt(charIdx-1, status)) {
1589                 UnicodeString localeName = localeMatcher.group(1, status);
1590                 char localeName8[100];
1591                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1592                 locale = Locale::createFromName(localeName8);
1593                 charIdx += localeMatcher.group(0, status).length();
1594                 TEST_ASSERT_SUCCESS(status);
1595                 break;
1596             }
1597             if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1598                 parseState = PARSE_DATA;
1599                 charIdx += 5;
1600                 tp.dataToBreak = "";
1601                 tp.expectedBreaks->removeAllElements();
1602                 tp.srcCol ->removeAllElements();
1603                 tp.srcLine->removeAllElements();
1604                 break;
1605             }
1606 
1607             errln("line %d: Tag expected in test file.", lineNum);
1608             parseState = PARSE_COMMENT;
1609             savedState = PARSE_DATA;
1610             goto end_test; // Stop the test.
1611             }
1612             break;
1613 
1614         case PARSE_DATA:
1615             if (c == CH_BULLET) {
1616                 int32_t  breakIdx = tp.dataToBreak.length();
1617                 tp.expectedBreaks->setSize(breakIdx+1);
1618                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1619                 tp.srcLine->setSize(breakIdx+1);
1620                 tp.srcLine->setElementAt(lineNum, breakIdx);
1621                 tp.srcCol ->setSize(breakIdx+1);
1622                 tp.srcCol ->setElementAt(column, breakIdx);
1623                 break;
1624             }
1625 
1626             if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1627                 // Add final entry to mappings from break location to source file position.
1628                 //  Need one extra because last break position returned is after the
1629                 //    last char in the data, not at the last char.
1630                 tp.srcLine->addElement(lineNum, status);
1631                 tp.srcCol ->addElement(column, status);
1632 
1633                 parseState = PARSE_TAG;
1634                 charIdx += 6;
1635 
1636                 // RUN THE TEST!
1637                 executeTest(&tp);
1638                 break;
1639             }
1640 
1641             if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1642                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1643                 // Get the code point from the name and insert it into the test data.
1644                 //   (Damn, no API takes names in Unicode  !!!
1645                 //    we've got to take it back to char *)
1646                 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1647                 int32_t nameLength = nameEndIdx - (charIdx+2);
1648                 char charNameBuf[200];
1649                 UChar32 theChar = -1;
1650                 if (nameEndIdx != -1) {
1651                     UErrorCode status = U_ZERO_ERROR;
1652                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1653                     charNameBuf[sizeof(charNameBuf)-1] = 0;
1654                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1655                     if (U_FAILURE(status)) {
1656                         theChar = -1;
1657                     }
1658                 }
1659                 if (theChar == -1) {
1660                     errln("Error in named character in test file at line %d, col %d",
1661                         lineNum, column);
1662                 } else {
1663                     // Named code point was recognized.  Insert it
1664                     //   into the test data.
1665                     tp.dataToBreak.append(theChar);
1666                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1667                         tp.srcLine->addElement(lineNum, status);
1668                         tp.srcCol ->addElement(column, status);
1669                     }
1670                 }
1671                 if (nameEndIdx > charIdx) {
1672                     charIdx = nameEndIdx+1;
1673 
1674                 }
1675                 break;
1676             }
1677 
1678 
1679 
1680 
1681             if (testString.compare(charIdx-1, 2, "<>") == 0) {
1682                 charIdx++;
1683                 int32_t  breakIdx = tp.dataToBreak.length();
1684                 tp.expectedBreaks->setSize(breakIdx+1);
1685                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1686                 tp.srcLine->setSize(breakIdx+1);
1687                 tp.srcLine->setElementAt(lineNum, breakIdx);
1688                 tp.srcCol ->setSize(breakIdx+1);
1689                 tp.srcCol ->setElementAt(column, breakIdx);
1690                 break;
1691             }
1692 
1693             if (c == CH_LT) {
1694                 tagValue   = 0;
1695                 parseState = PARSE_NUM;
1696                 break;
1697             }
1698 
1699             if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
1700                 parseState = PARSE_COMMENT;
1701                 savedState = PARSE_DATA;
1702                 break;
1703             }
1704 
1705             if (c == CH_BACKSLASH) {
1706                 // Check for \ at end of line, a line continuation.
1707                 //     Advance over (discard) the newline
1708                 UChar32 cp = testString.char32At(charIdx);
1709                 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1710                     // We have a CR LF
1711                     //  Need an extra increment of the input ptr to move over both of them
1712                     charIdx++;
1713                 }
1714                 if (cp == CH_LF || cp == CH_CR) {
1715                     lineNum++;
1716                     colStart = charIdx;
1717                     charIdx++;
1718                     break;
1719                 }
1720 
1721                 // Let unescape handle the back slash.
1722                 cp = testString.unescapeAt(charIdx);
1723                 if (cp != -1) {
1724                     // Escape sequence was recognized.  Insert the char
1725                     //   into the test data.
1726                     tp.dataToBreak.append(cp);
1727                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1728                         tp.srcLine->addElement(lineNum, status);
1729                         tp.srcCol ->addElement(column, status);
1730                     }
1731                     break;
1732                 }
1733 
1734 
1735                 // Not a recognized backslash escape sequence.
1736                 // Take the next char as a literal.
1737                 //  TODO:  Should this be an error?
1738                 c = testString.charAt(charIdx);
1739                 charIdx = testString.moveIndex32(charIdx, 1);
1740             }
1741 
1742             // Normal, non-escaped data char.
1743             tp.dataToBreak.append(c);
1744 
1745             // Save the mapping from offset in the data to line/column numbers in
1746             //   the original input file.  Will be used for better error messages only.
1747             //   If there's an expected break before this char, the slot in the mapping
1748             //     vector will already be set for this char; don't overwrite it.
1749             if (tp.dataToBreak.length() > tp.srcLine->size()) {
1750                 tp.srcLine->addElement(lineNum, status);
1751                 tp.srcCol ->addElement(column, status);
1752             }
1753             break;
1754 
1755 
1756         case PARSE_NUM:
1757             // We are parsing an expected numeric tag value, like <1234>,
1758             //   within a chunk of data.
1759             if (u_isUWhiteSpace(c)) {
1760                 break;
1761             }
1762 
1763             if (c == CH_GT) {
1764                 // Finished the number.  Add the info to the expected break data,
1765                 //   and switch parse state back to doing plain data.
1766                 parseState = PARSE_DATA;
1767                 if (tagValue == 0) {
1768                     tagValue = -1;
1769                 }
1770                 int32_t  breakIdx = tp.dataToBreak.length();
1771                 tp.expectedBreaks->setSize(breakIdx+1);
1772                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1773                 tp.srcLine->setSize(breakIdx+1);
1774                 tp.srcLine->setElementAt(lineNum, breakIdx);
1775                 tp.srcCol ->setSize(breakIdx+1);
1776                 tp.srcCol ->setElementAt(column, breakIdx);
1777                 break;
1778             }
1779 
1780             if (u_isdigit(c)) {
1781                 tagValue = tagValue*10 + u_charDigitValue(c);
1782                 break;
1783             }
1784 
1785             errln("Syntax Error in test file at line %d, col %d",
1786                 lineNum, column);
1787             parseState = PARSE_COMMENT;
1788             goto end_test; // Stop the test
1789             break;
1790         }
1791 
1792 
1793         if (U_FAILURE(status)) {
1794             dataerrln("ICU Error %s while parsing test file at line %d.",
1795                 u_errorName(status), lineNum);
1796             status = U_ZERO_ERROR;
1797             goto end_test; // Stop the test
1798         }
1799 
1800     }
1801 
1802 end_test:
1803     delete tp.bi;
1804     delete tp.expectedBreaks;
1805     delete tp.srcLine;
1806     delete tp.srcCol;
1807     delete [] testFile;
1808 #endif
1809 }
1810 
//
//  Data for the tailored break tests.  Every group supplies the test text
//  (with \uXXXX escapes still in escaped form) followed by two offset lists:
//  the ...TOffsets list is used with the tailored locale, the ...ROffsets
//  list with the root locale.
//

// Word breaks, locale "en_US_POSIX":
//   words don't include colon or period (cldrbug #1969).
static const char    posxWordText[]     = "Can't have breaks in xx:yy or struct.field for CS-types.";
static const int32_t posxWordTOffsets[] = {
    5, 6, 10, 11, 17, 18, 20, 21, 23, 24, 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56
};
static const int32_t posxWordROffsets[] = {
    5, 6, 10, 11, 17, 18, 20, 21, 26, 27, 29, 30, 42, 43, 46, 47, 49, 50, 55, 56
};

// Word breaks, locale "ja":
//   don't break in runs of hiragana or runs of ideograph, where the latter
//   includes \u3005 \u3007 \u303B (cldrbug #2009).
static const char    jaWordText[]     = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"
                                        "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";
static const int32_t jaWordTOffsets[] = {
    2, 3, 7, 8, 14, 17, 18, 20, 21, 24, 27, 28
};
static const int32_t jaWordROffsets[] = {
    1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28
};

// Sentence breaks, locale "el":
//   add break after Greek question mark (cldrbug #2069).
static const char    elSentText[]     = "\\u0391\\u03B2, \\u03B3\\u03B4; \\u0395 \\u03B6\\u03B7\\u037E \\u0398 \\u03B9\\u03BA. "
                                        "\\u039B\\u03BC \\u03BD\\u03BE! \\u039F\\u03C0, \\u03A1\\u03C2? \\u03A3";
static const int32_t elSentTOffsets[] = {
    8, 14, 20, 27, 35, 36
};
static const int32_t elSentROffsets[] = {
    20, 27, 35, 36
};

// Character breaks, locale "th":
//   clusters should not include spacing Thai/Lao vowels (prefix or postfix),
//   except for [SARA] AM (cldrbug #2161).
static const char    thCharText[]     = "\\u0E01\\u0E23\\u0E30\\u0E17\\u0E48\\u0E2D\\u0E21\\u0E23\\u0E08\\u0E19\\u0E32 "
                                        "(\\u0E2A\\u0E38\\u0E0A\\u0E32\\u0E15\\u0E34-\\u0E08\\u0E38\\u0E11\\u0E32\\u0E21\\u0E32\\u0E28) "
                                        "\\u0E40\\u0E14\\u0E47\\u0E01\\u0E21\\u0E35\\u0E1B\\u0E31\\u0E0D\\u0E2B\\u0E32 ";
static const int32_t thCharTOffsets[] = {
    1, 2, 3, 5, 6, 7, 8, 9, 10, 11,
    12, 13, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 28,
    29, 30, 32, 33, 35, 37, 38, 39, 40, 41
};
static const int32_t thCharROffsets[] = {
    1, 3, 5, 6, 7, 8, 9, 11,
    12, 13, 15, 17, 19, 20, 22, 24, 26, 27, 28,
    29, 32, 33, 35, 37, 38, 40, 41
};
1842 
1843 typedef struct {
1844     UBreakIteratorType  type;
1845     const char *        locale;
1846     const char *        escapedText;
1847     const int32_t *     tailoredOffsets;
1848     int32_t             tailoredOffsetsCount;
1849     const int32_t *     rootOffsets;
1850     int32_t             rootOffsetsCount;
1851 } TailoredBreakItem;
1852 
1853 #define ARRAY_PTR_LEN(array) (array),(sizeof(array)/sizeof(array[0]))
1854 
1855 static const TailoredBreakItem tbItems[] = {
1856     { UBRK_WORD,      "en_US_POSIX", posxWordText, ARRAY_PTR_LEN(posxWordTOffsets), ARRAY_PTR_LEN(posxWordROffsets) },
1857     { UBRK_WORD,      "ja",          jaWordText,   ARRAY_PTR_LEN(jaWordTOffsets),   ARRAY_PTR_LEN(jaWordROffsets)   },
1858     { UBRK_SENTENCE,  "el",          elSentText,   ARRAY_PTR_LEN(elSentTOffsets),   ARRAY_PTR_LEN(elSentROffsets)   },
1859     { UBRK_CHARACTER, "th",          thCharText,   ARRAY_PTR_LEN(thCharTOffsets),   ARRAY_PTR_LEN(thCharROffsets)   },
1860     { UBRK_CHARACTER, NULL,          NULL,         NULL,0,                          NULL,0                          } // terminator
1861 };
1862 
//
//  formatOffsets   Format a list of offsets as text, each one written as a
//                  space followed by the decimal value, e.g. " 5 6 10".
//
//    parameters:
//          buffer    destination for the NUL terminated result.
//          buflen    capacity of buffer in chars, including the terminator.
//          count     number of entries in offsets.
//          offsets   the values to format.
//
//    The result is always NUL terminated when buflen > 0, including when count
//    is zero.  If the buffer is too small, output stops after the last offset
//    that fits completely; a partially written number is never left behind.
//
static void formatOffsets(char* buffer, int32_t buflen, int32_t count, const int32_t* offsets) {
    if (buffer == NULL || buflen <= 0) {
        return;
    }
    buffer[0] = 0;    // callers print the buffer with %s even when count is 0
    while (count-- > 0) {
        int writeCount = snprintf(buffer, (size_t)buflen, " %d", (int)*offsets++);
        if (writeCount < 0 || writeCount >= buflen) {
            // Out of room (or an output error).  Drop the partial item.
            *buffer = 0;
            break;
        }
        buffer += writeCount;
        buflen -= writeCount;
    }
}
1871 
1872 enum { kMaxOffsetCount = 128 };
1873 
TBTest(BreakIterator * brkitr,int type,const char * locale,const char * escapedText,const int32_t * expectOffsets,int32_t expectOffsetsCount)1874 void RBBITest::TBTest(BreakIterator* brkitr, int type, const char *locale, const char* escapedText, const int32_t *expectOffsets, int32_t expectOffsetsCount) {
1875     brkitr->setText( CharsToUnicodeString(escapedText) );
1876     int32_t foundOffsets[kMaxOffsetCount];
1877     int32_t offset, foundOffsetsCount = 0;
1878     // do forwards iteration test
1879     while ( foundOffsetsCount < kMaxOffsetCount && (offset = brkitr->next()) != BreakIterator::DONE ) {
1880         foundOffsets[foundOffsetsCount++] = offset;
1881     }
1882     if ( foundOffsetsCount != expectOffsetsCount || memcmp(expectOffsets, foundOffsets, foundOffsetsCount*sizeof(foundOffsets[0])) != 0 ) {
1883         // log error for forwards test
1884         char formatExpect[512], formatFound[512];
1885         formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets);
1886         formatOffsets(formatFound, sizeof(formatFound), foundOffsetsCount, foundOffsets);
1887         errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found %d offsets fwd:%s\n",
1888                 type, locale, escapedText, expectOffsetsCount, formatExpect, foundOffsetsCount, formatFound);
1889     } else {
1890         // do backwards iteration test
1891         --foundOffsetsCount; // back off one from the end offset
1892         while ( foundOffsetsCount > 0 ) {
1893             offset = brkitr->previous();
1894             if ( offset != foundOffsets[--foundOffsetsCount] ) {
1895                 // log error for backwards test
1896                 char formatExpect[512];
1897                 formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets);
1898                 errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found rev offset %d where expect %d\n",
1899                         type, locale, escapedText, expectOffsetsCount, formatExpect, offset, foundOffsets[foundOffsetsCount]);
1900                 break;
1901             }
1902         }
1903     }
1904 }
1905 
TestTailoredBreaks()1906 void RBBITest::TestTailoredBreaks() {
1907     const TailoredBreakItem * tbItemPtr;
1908     Locale rootLocale = Locale("root");
1909     for (tbItemPtr = tbItems; tbItemPtr->escapedText != NULL; ++tbItemPtr) {
1910         Locale testLocale = Locale(tbItemPtr->locale);
1911         BreakIterator * tailoredBrkiter = NULL;
1912         BreakIterator * rootBrkiter = NULL;
1913         UErrorCode status = U_ZERO_ERROR;
1914         switch (tbItemPtr->type) {
1915             case UBRK_CHARACTER:
1916                 tailoredBrkiter = BreakIterator::createCharacterInstance(testLocale, status);
1917                 rootBrkiter = BreakIterator::createCharacterInstance(rootLocale, status);
1918                 break;
1919             case UBRK_WORD:
1920                 tailoredBrkiter = BreakIterator::createWordInstance(testLocale, status);
1921                 rootBrkiter = BreakIterator::createWordInstance(rootLocale, status);
1922                 break;
1923             case UBRK_LINE:
1924                 tailoredBrkiter = BreakIterator::createLineInstance(testLocale, status);
1925                 rootBrkiter = BreakIterator::createLineInstance(rootLocale, status);
1926                 break;
1927             case UBRK_SENTENCE:
1928                 tailoredBrkiter = BreakIterator::createSentenceInstance(testLocale, status);
1929                 rootBrkiter = BreakIterator::createSentenceInstance(rootLocale, status);
1930                 break;
1931             default:
1932                 status = U_UNSUPPORTED_ERROR;
1933                 break;
1934         }
1935         if (U_FAILURE(status)) {
1936             errcheckln(status, "BreakIterator create failed for type %d, locales root or %s - Error: %s", (int)(tbItemPtr->type), tbItemPtr->locale, u_errorName(status));
1937             continue;
1938         }
1939         TBTest(tailoredBrkiter, (int)(tbItemPtr->type), tbItemPtr->locale, tbItemPtr->escapedText, tbItemPtr->tailoredOffsets, tbItemPtr->tailoredOffsetsCount);
1940         TBTest(rootBrkiter,     (int)(tbItemPtr->type), "root",            tbItemPtr->escapedText, tbItemPtr->rootOffsets,     tbItemPtr->rootOffsetsCount);
1941 
1942         delete rootBrkiter;
1943         delete tailoredBrkiter;
1944     }
1945 }
1946 
1947 
1948 //-------------------------------------------------------------------------------
1949 //
1950 //  TestDictRules   create a break iterator from source rules that includes a
1951 //                  dictionary range.   Regression for bug #7130.  Source rules
1952 //                  do not declare a break iterator type (word, line, sentence, etc.
1953 //                  but the dictionary code, without a type, would loop.
1954 //
1955 //-------------------------------------------------------------------------------
TestDictRules()1956 void RBBITest::TestDictRules() {
1957     const char *rules =  "$dictionary = [a-z]; \n"
1958                          "!!forward; \n"
1959                          "$dictionary $dictionary; \n"
1960                          "!!reverse; \n"
1961                          "$dictionary $dictionary; \n";
1962     const char *text = "aa";
1963     UErrorCode status = U_ZERO_ERROR;
1964     UParseError parseError;
1965 
1966     RuleBasedBreakIterator bi(rules, parseError, status);
1967     if (U_SUCCESS(status)) {
1968         UnicodeString utext = text;
1969         bi.setText(utext);
1970         int32_t position;
1971         int32_t loops;
1972         for (loops = 0; loops<10; loops++) {
1973             position = bi.next();
1974             if (position == RuleBasedBreakIterator::DONE) {
1975                 break;
1976             }
1977         }
1978         TEST_ASSERT(loops == 1);
1979     } else {
1980         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1981     }
1982 }
1983 
1984 
1985 
1986 //-------------------------------------------------------------------------------
1987 //
1988 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
1989 //    return the datain one big UChar * buffer, which the caller must delete.
1990 //
1991 //    parameters:
1992 //          fileName:   the name of the file, with no directory part.  The test data directory
1993 //                      is assumed.
1994 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
1995 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
1996 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
1997 //                      Pass NULL for the system default encoding.
1998 //          status
1999 //    returns:
2000 //                      The file data, converted to UChar.
2001 //                      The caller must delete this when done with
2002 //                           delete [] theBuffer;
2003 //
2004 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
2005 //           Move this function to some common place.
2006 //
2007 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int & ulen,const char * encoding,UErrorCode & status)2008 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
2009     UChar       *retPtr  = NULL;
2010     char        *fileBuf = NULL;
2011     UConverter* conv     = NULL;
2012     FILE        *f       = NULL;
2013 
2014     ulen = 0;
2015     if (U_FAILURE(status)) {
2016         return retPtr;
2017     }
2018 
2019     //
2020     //  Open the file.
2021     //
2022     f = fopen(fileName, "rb");
2023     if (f == 0) {
2024         dataerrln("Error opening test data file %s\n", fileName);
2025         status = U_FILE_ACCESS_ERROR;
2026         return NULL;
2027     }
2028     //
2029     //  Read it in
2030     //
2031     int   fileSize;
2032     int   amt_read;
2033 
2034     fseek( f, 0, SEEK_END);
2035     fileSize = ftell(f);
2036     fileBuf = new char[fileSize];
2037     fseek(f, 0, SEEK_SET);
2038     amt_read = fread(fileBuf, 1, fileSize, f);
2039     if (amt_read != fileSize || fileSize <= 0) {
2040         errln("Error reading test data file.");
2041         goto cleanUpAndReturn;
2042     }
2043 
2044     //
2045     // Look for a Unicode Signature (BOM) on the data just read
2046     //
2047     int32_t        signatureLength;
2048     const char *   fileBufC;
2049     const char*    bomEncoding;
2050 
2051     fileBufC = fileBuf;
2052     bomEncoding = ucnv_detectUnicodeSignature(
2053         fileBuf, fileSize, &signatureLength, &status);
2054     if(bomEncoding!=NULL ){
2055         fileBufC  += signatureLength;
2056         fileSize  -= signatureLength;
2057         encoding = bomEncoding;
2058     }
2059 
2060     //
2061     // Open a converter to take the rule file to UTF-16
2062     //
2063     conv = ucnv_open(encoding, &status);
2064     if (U_FAILURE(status)) {
2065         goto cleanUpAndReturn;
2066     }
2067 
2068     //
2069     // Convert the rules to UChar.
2070     //  Preflight first to determine required buffer size.
2071     //
2072     ulen = ucnv_toUChars(conv,
2073         NULL,           //  dest,
2074         0,              //  destCapacity,
2075         fileBufC,
2076         fileSize,
2077         &status);
2078     if (status == U_BUFFER_OVERFLOW_ERROR) {
2079         // Buffer Overflow is expected from the preflight operation.
2080         status = U_ZERO_ERROR;
2081 
2082         retPtr = new UChar[ulen+1];
2083         ucnv_toUChars(conv,
2084             retPtr,       //  dest,
2085             ulen+1,
2086             fileBufC,
2087             fileSize,
2088             &status);
2089     }
2090 
2091 cleanUpAndReturn:
2092     fclose(f);
2093     delete []fileBuf;
2094     ucnv_close(conv);
2095     if (U_FAILURE(status)) {
2096         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
2097         delete []retPtr;
2098         retPtr = 0;
2099         ulen   = 0;
2100     };
2101     return retPtr;
2102 }
2103 
2104 
2105 
2106 //--------------------------------------------------------------------------------------------
2107 //
2108 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
2109 //
2110 //-------------------------------------------------------------------------------------------
TestUnicodeFiles()2111 void RBBITest::TestUnicodeFiles() {
2112     RuleBasedBreakIterator  *bi;
2113     UErrorCode               status = U_ZERO_ERROR;
2114 
2115     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2116     TEST_ASSERT_SUCCESS(status);
2117     if (U_SUCCESS(status)) {
2118         runUnicodeTestData("GraphemeBreakTest.txt", bi);
2119     }
2120     delete bi;
2121 
2122     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
2123     TEST_ASSERT_SUCCESS(status);
2124     if (U_SUCCESS(status)) {
2125         runUnicodeTestData("WordBreakTest.txt", bi);
2126     }
2127     delete bi;
2128 
2129     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
2130     TEST_ASSERT_SUCCESS(status);
2131     if (U_SUCCESS(status)) {
2132         runUnicodeTestData("SentenceBreakTest.txt", bi);
2133     }
2134     delete bi;
2135 
2136     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
2137     TEST_ASSERT_SUCCESS(status);
2138     if (U_SUCCESS(status)) {
2139         runUnicodeTestData("LineBreakTest.txt", bi);
2140     }
2141     delete bi;
2142 }
2143 
2144 
2145 //--------------------------------------------------------------------------------------------
2146 //
2147 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
2148 //
2149 //-------------------------------------------------------------------------------------------
runUnicodeTestData(const char * fileName,RuleBasedBreakIterator * bi)2150 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
2151 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2152 // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb.
2153   UVersionInfo icu49 = { 4, 9, 0, 0 };
2154 UBool isICUVersionPast48 = isICUVersionAtLeast(icu49);
2155 UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt");
2156     UErrorCode  status = U_ZERO_ERROR;
2157 
2158     //
2159     //  Open and read the test data file, put it into a UnicodeString.
2160     //
2161     const char *testDataDirectory = IntlTest::getSourceTestData(status);
2162     char testFileName[1000];
2163     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
2164         dataerrln("Can't open test data.  Path too long.");
2165         return;
2166     }
2167     strcpy(testFileName, testDataDirectory);
2168     strcat(testFileName, fileName);
2169 
2170     logln("Opening data file %s\n", fileName);
2171 
2172     int    len;
2173     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
2174     if (status != U_FILE_ACCESS_ERROR) {
2175         TEST_ASSERT_SUCCESS(status);
2176         TEST_ASSERT(testFile != NULL);
2177     }
2178     if (U_FAILURE(status) || testFile == NULL) {
2179         return; /* something went wrong, error already output */
2180     }
2181     UnicodeString testFileAsString(TRUE, testFile, len);
2182 
2183     //
2184     //  Parse the test data file using a regular expression.
2185     //  Each kind of token is recognized in its own capture group; what type of item was scanned
2186     //     is identified by which group had a match.
2187     //
2188     //    Caputure Group #                  1          2            3            4           5
2189     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
2190     //
2191     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
2192     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
2193     UnicodeString   testString;
2194     UVector32       breakPositions(status);
2195     int             lineNumber = 1;
2196     TEST_ASSERT_SUCCESS(status);
2197     if (U_FAILURE(status)) {
2198         return;
2199     }
2200 
2201     //
2202     //  Scan through each test case, building up the string to be broken in testString,
2203     //   and the positions that should be boundaries in the breakPositions vector.
2204     //
2205     int spin = 0;
2206     while (tokenMatcher.find()) {
2207       	if(tokenMatcher.hitEnd()) {
2208           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
2209              This occurred when the text file was corrupt (wasn't marked as UTF-8)
2210              and caused an infinite loop here on EBCDIC systems!
2211           */
2212           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
2213           //	   return;
2214       	}
2215         if (tokenMatcher.start(1, status) >= 0) {
2216             // Scanned a divide sign, indicating a break position in the test data.
2217             if (testString.length()>0) {
2218                 breakPositions.addElement(testString.length(), status);
2219             }
2220         }
2221         else if (tokenMatcher.start(2, status) >= 0) {
2222             // Scanned an 'x', meaning no break at this position in the test data
2223             //   Nothing to be done here.
2224             }
2225         else if (tokenMatcher.start(3, status) >= 0) {
2226             // Scanned Hex digits.  Convert them to binary, append to the character data string.
2227             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
2228             int length = hexNumber.length();
2229             if (length<=8) {
2230                 char buf[10];
2231                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
2232                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
2233                 if (c<=0x10ffff) {
2234                     testString.append(c);
2235                 } else {
2236                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
2237                        fileName, lineNumber);
2238                 }
2239             } else {
2240                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
2241                        fileName, lineNumber);
2242              }
2243         }
2244         else if (tokenMatcher.start(4, status) >= 0) {
2245             // Scanned to end of a line, possibly skipping over a comment in the process.
2246             //   If the line from the file contained test data, run the test now.
2247             //
2248             if (testString.length() > 0) {
2249 // TODO(andy): Remove this time bomb code.
2250 if (!isLineBreak || isICUVersionPast48 || !(4658 <= lineNumber && lineNumber <= 4758)) {
2251                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
2252 }
2253             }
2254 
2255             // Clear out this test case.
2256             //    The string and breakPositions vector will be refilled as the next
2257             //       test case is parsed.
2258             testString.remove();
2259             breakPositions.removeAllElements();
2260             lineNumber++;
2261         } else {
2262             // Scanner catchall.  Something unrecognized appeared on the line.
2263             char token[16];
2264             UnicodeString uToken = tokenMatcher.group(0, status);
2265             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
2266             token[sizeof(token)-1] = 0;
2267             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
2268 
2269             // Clean up, in preparation for continuing with the next line.
2270             testString.remove();
2271             breakPositions.removeAllElements();
2272             lineNumber++;
2273         }
2274         TEST_ASSERT_SUCCESS(status);
2275         if (U_FAILURE(status)) {
2276             break;
2277         }
2278     }
2279 
2280     delete [] testFile;
2281  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
2282 }
2283 
2284 //--------------------------------------------------------------------------------------------
2285 //
2286 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
2287 //                            test data files.  Do only a simple, forward-only check -
2288 //                            this test is mostly to check that ICU and the Unicode
2289 //                            data agree with each other.
2290 //
2291 //--------------------------------------------------------------------------------------------
checkUnicodeTestCase(const char * testFileName,int lineNumber,const UnicodeString & testString,UVector32 * breakPositions,RuleBasedBreakIterator * bi)2292 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
2293                          const UnicodeString &testString,   // Text data to be broken
2294                          UVector32 *breakPositions,         // Positions where breaks should be found.
2295                          RuleBasedBreakIterator *bi) {
2296     int32_t pos;                 // Break Position in the test string
2297     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
2298     int32_t expectedPos;         // Expected break position (index into test string)
2299 
2300     bi->setText(testString);
2301     pos = bi->first();
2302     pos = bi->next();
2303 
2304     while (pos != BreakIterator::DONE) {
2305         if (expectedI >= breakPositions->size()) {
2306             errln("Test file \"%s\", line %d, unexpected break found at position %d",
2307                 testFileName, lineNumber, pos);
2308             break;
2309         }
2310         expectedPos = breakPositions->elementAti(expectedI);
2311         if (pos < expectedPos) {
2312             errln("Test file \"%s\", line %d, unexpected break found at position %d",
2313                 testFileName, lineNumber, pos);
2314             break;
2315         }
2316         if (pos > expectedPos) {
2317             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
2318                 testFileName, lineNumber, expectedPos);
2319             break;
2320         }
2321         pos = bi->next();
2322         expectedI++;
2323     }
2324 
2325     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
2326         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
2327             testFileName, lineNumber, breakPositions->elementAti(expectedI));
2328     }
2329 }
2330 
2331 
2332 
2333 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2334 //---------------------------------------------------------------------------------------
2335 //
2336 //   classs RBBIMonkeyKind
2337 //
2338 //      Monkey Test for Break Iteration
2339 //      Abstract interface class.   Concrete derived classes independently
2340 //      implement the break rules for different iterator types.
2341 //
2342 //      The Monkey Test itself uses doesn't know which type of break iterator it is
2343 //      testing, but works purely in terms of the interface defined here.
2344 //
2345 //---------------------------------------------------------------------------------------
2346 class RBBIMonkeyKind {
2347 public:
2348     // Return a UVector of UnicodeSets, representing the character classes used
2349     //   for this type of iterator.
2350     virtual  UVector  *charClasses() = 0;
2351 
2352     // Set the test text on which subsequent calls to next() will operate
2353     virtual  void      setText(const UnicodeString &s) = 0;
2354 
2355     // Find the next break postion, starting from the prev break position, or from zero.
2356     // Return -1 after reaching end of string.
2357     virtual  int32_t   next(int32_t i) = 0;
2358 
2359     virtual ~RBBIMonkeyKind();
2360     UErrorCode       deferredStatus;
2361 
2362 
2363 protected:
2364     RBBIMonkeyKind();
2365 
2366 private:
2367 };
2368 
RBBIMonkeyKind()2369 RBBIMonkeyKind::RBBIMonkeyKind() {
2370     deferredStatus = U_ZERO_ERROR;
2371 }
2372 
~RBBIMonkeyKind()2373 RBBIMonkeyKind::~RBBIMonkeyKind() {
2374 }
2375 
2376 
//----------------------------------------------------------------------------------------
//
//   Random Numbers.  Similar to standard lib rand() and srand().
//                    The library functions are not used, in order to
//                      1.  Get the same results on all platforms.
//                      2.  Have access to the current seed, which makes it easier
//                          to reproduce failures.
//
//---------------------------------------------------------------------------------------
static uint32_t m_seed = 1;

//  Step the generator once and return a value in the range 0 .. 32767.
//  The new state is kept in m_seed; the arithmetic wraps modulo 2^32.
static uint32_t m_rand()
{
    m_seed = 1103515245u * m_seed + 12345u;
    // The result is bits 16..30 of the new state.
    return (m_seed >> 16) & 0x7FFFu;
}
2392 
2393 
2394 //------------------------------------------------------------------------------------------
2395 //
2396 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
2397 //                             of RBBIMonkeyKind.
2398 //
2399 //------------------------------------------------------------------------------------------
2400 class RBBICharMonkey: public RBBIMonkeyKind {
2401 public:
2402     RBBICharMonkey();
2403     virtual          ~RBBICharMonkey();
2404     virtual  UVector *charClasses();
2405     virtual  void     setText(const UnicodeString &s);
2406     virtual  int32_t  next(int32_t i);
2407 private:
2408     UVector   *fSets;
2409 
2410     UnicodeSet  *fCRLFSet;
2411     UnicodeSet  *fControlSet;
2412     UnicodeSet  *fExtendSet;
2413     UnicodeSet  *fPrependSet;
2414     UnicodeSet  *fSpacingSet;
2415     UnicodeSet  *fLSet;
2416     UnicodeSet  *fVSet;
2417     UnicodeSet  *fTSet;
2418     UnicodeSet  *fLVSet;
2419     UnicodeSet  *fLVTSet;
2420     UnicodeSet  *fHangulSet;
2421     UnicodeSet  *fAnySet;
2422 
2423     const UnicodeString *fText;
2424 };
2425 
2426 
RBBICharMonkey()2427 RBBICharMonkey::RBBICharMonkey() {
2428     UErrorCode  status = U_ZERO_ERROR;
2429 
2430     fText = NULL;
2431 
2432     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
2433     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
2434     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
2435     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
2436     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
2437     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
2438     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
2439     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
2440     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
2441     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
2442     fHangulSet  = new UnicodeSet();
2443     fHangulSet->addAll(*fLSet);
2444     fHangulSet->addAll(*fVSet);
2445     fHangulSet->addAll(*fTSet);
2446     fHangulSet->addAll(*fLVSet);
2447     fHangulSet->addAll(*fLVTSet);
2448     fAnySet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0000-\\U0010ffff]"), status);
2449 
2450     fSets       = new UVector(status);
2451     fSets->addElement(fCRLFSet,    status);
2452     fSets->addElement(fControlSet, status);
2453     fSets->addElement(fExtendSet,  status);
2454     fSets->addElement(fPrependSet, status);
2455     fSets->addElement(fSpacingSet, status);
2456     fSets->addElement(fHangulSet,  status);
2457     fSets->addElement(fAnySet,     status);
2458     if (U_FAILURE(status)) {
2459         deferredStatus = status;
2460     }
2461 }
2462 
2463 
setText(const UnicodeString & s)2464 void RBBICharMonkey::setText(const UnicodeString &s) {
2465     fText = &s;
2466 }
2467 
2468 
2469 
next(int32_t prevPos)2470 int32_t RBBICharMonkey::next(int32_t prevPos) {
2471     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2472                               //   break position being tested.  The candidate break
2473                               //   location is before p2.
2474 
2475     int     breakPos = -1;
2476 
2477     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2478 
2479     if (U_FAILURE(deferredStatus)) {
2480         return -1;
2481     }
2482 
2483     // Previous break at end of string.  return DONE.
2484     if (prevPos >= fText->length()) {
2485         return -1;
2486     }
2487     p0 = p1 = p2 = p3 = prevPos;
2488     c3 =  fText->char32At(prevPos);
2489     c0 = c1 = c2 = 0;
2490 
2491     // Loop runs once per "significant" character position in the input text.
2492     for (;;) {
2493         // Move all of the positions forward in the input string.
2494         p0 = p1;  c0 = c1;
2495         p1 = p2;  c1 = c2;
2496         p2 = p3;  c2 = c3;
2497 
2498         // Advancd p3 by one codepoint
2499         p3 = fText->moveIndex32(p3, 1);
2500         c3 = fText->char32At(p3);
2501 
2502         if (p1 == p2) {
2503             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2504             continue;
2505         }
2506         if (p2 == fText->length()) {
2507             // Reached end of string.  Always a break position.
2508             break;
2509         }
2510 
2511         // Rule  GB3   CR x LF
2512         //     No Extend or Format characters may appear between the CR and LF,
2513         //     which requires the additional check for p2 immediately following p1.
2514         //
2515         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
2516             continue;
2517         }
2518 
2519         // Rule (GB4).   ( Control | CR | LF ) <break>
2520         if (fControlSet->contains(c1) ||
2521             c1 == 0x0D ||
2522             c1 == 0x0A)  {
2523             break;
2524         }
2525 
2526         // Rule (GB5)    <break>  ( Control | CR | LF )
2527         //
2528         if (fControlSet->contains(c2) ||
2529             c2 == 0x0D ||
2530             c2 == 0x0A)  {
2531             break;
2532         }
2533 
2534 
2535         // Rule (GB6)  L x ( L | V | LV | LVT )
2536         if (fLSet->contains(c1) &&
2537                (fLSet->contains(c2)  ||
2538                 fVSet->contains(c2)  ||
2539                 fLVSet->contains(c2) ||
2540                 fLVTSet->contains(c2))) {
2541             continue;
2542         }
2543 
2544         // Rule (GB7)    ( LV | V )  x  ( V | T )
2545         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
2546             (fVSet->contains(c2) || fTSet->contains(c2)))  {
2547             continue;
2548         }
2549 
2550         // Rule (GB8)    ( LVT | T)  x T
2551         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
2552             fTSet->contains(c2))  {
2553             continue;
2554         }
2555 
2556         // Rule (GB9)    Numeric x ALetter
2557         if (fExtendSet->contains(c2))  {
2558             continue;
2559         }
2560 
2561         // Rule (GB9a)   x  SpacingMark
2562         if (fSpacingSet->contains(c2)) {
2563             continue;
2564         }
2565 
2566         // Rule (GB9b)   Prepend x
2567         if (fPrependSet->contains(c1)) {
2568             continue;
2569         }
2570 
2571         // Rule (GB10)  Any  <break>  Any
2572         break;
2573     }
2574 
2575     breakPos = p2;
2576     return breakPos;
2577 }
2578 
2579 
2580 
charClasses()2581 UVector  *RBBICharMonkey::charClasses() {
2582     return fSets;
2583 }
2584 
2585 
~RBBICharMonkey()2586 RBBICharMonkey::~RBBICharMonkey() {
2587     delete fSets;
2588     delete fCRLFSet;
2589     delete fControlSet;
2590     delete fExtendSet;
2591     delete fPrependSet;
2592     delete fSpacingSet;
2593     delete fLSet;
2594     delete fVSet;
2595     delete fTSet;
2596     delete fLVSet;
2597     delete fLVTSet;
2598     delete fHangulSet;
2599     delete fAnySet;
2600 }
2601 
2602 //------------------------------------------------------------------------------------------
2603 //
2604 //   class RBBIWordMonkey      Word Break specific implementation
2605 //                             of RBBIMonkeyKind.
2606 //
2607 //------------------------------------------------------------------------------------------
2608 class RBBIWordMonkey: public RBBIMonkeyKind {
2609 public:
2610     RBBIWordMonkey();
2611     virtual          ~RBBIWordMonkey();
2612     virtual  UVector *charClasses();
2613     virtual  void     setText(const UnicodeString &s);
2614     virtual int32_t   next(int32_t i);
2615 private:
2616     UVector      *fSets;
2617 
2618     UnicodeSet  *fCRSet;
2619     UnicodeSet  *fLFSet;
2620     UnicodeSet  *fNewlineSet;
2621     UnicodeSet  *fKatakanaSet;
2622     UnicodeSet  *fALetterSet;
2623     UnicodeSet  *fMidNumLetSet;
2624     UnicodeSet  *fMidLetterSet;
2625     UnicodeSet  *fMidNumSet;
2626     UnicodeSet  *fNumericSet;
2627     UnicodeSet  *fFormatSet;
2628     UnicodeSet  *fOtherSet;
2629     UnicodeSet  *fExtendSet;
2630     UnicodeSet  *fExtendNumLetSet;
2631 
2632     RegexMatcher  *fMatcher;
2633 
2634     const UnicodeString  *fText;
2635 };
2636 
2637 
RBBIWordMonkey()2638 RBBIWordMonkey::RBBIWordMonkey()
2639 {
2640     UErrorCode  status = U_ZERO_ERROR;
2641 
2642     fSets            = new UVector(status);
2643 
2644     fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
2645     fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
2646     fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
2647     fALetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"),      status);
2648     fKatakanaSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
2649     fMidNumLetSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
2650     fMidLetterSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
2651     fMidNumSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
2652     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
2653     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
2654     fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2655     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
2656 
2657     fOtherSet        = new UnicodeSet();
2658     if(U_FAILURE(status)) {
2659       deferredStatus = status;
2660       return;
2661     }
2662 
2663     fOtherSet->complement();
2664     fOtherSet->removeAll(*fCRSet);
2665     fOtherSet->removeAll(*fLFSet);
2666     fOtherSet->removeAll(*fNewlineSet);
2667     fOtherSet->removeAll(*fKatakanaSet);
2668     fOtherSet->removeAll(*fALetterSet);
2669     fOtherSet->removeAll(*fMidLetterSet);
2670     fOtherSet->removeAll(*fMidNumSet);
2671     fOtherSet->removeAll(*fNumericSet);
2672     fOtherSet->removeAll(*fExtendNumLetSet);
2673     fOtherSet->removeAll(*fFormatSet);
2674     fOtherSet->removeAll(*fExtendSet);
2675     // Inhibit dictionary characters from being tested at all.
2676     fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
2677 
2678     fSets->addElement(fCRSet,        status);
2679     fSets->addElement(fLFSet,        status);
2680     fSets->addElement(fNewlineSet,   status);
2681     fSets->addElement(fALetterSet,   status);
2682     fSets->addElement(fKatakanaSet,  status);
2683     fSets->addElement(fMidLetterSet, status);
2684     fSets->addElement(fMidNumLetSet, status);
2685     fSets->addElement(fMidNumSet,    status);
2686     fSets->addElement(fNumericSet,   status);
2687     fSets->addElement(fFormatSet,    status);
2688     fSets->addElement(fExtendSet,    status);
2689     fSets->addElement(fOtherSet,     status);
2690     fSets->addElement(fExtendNumLetSet, status);
2691 
2692     if (U_FAILURE(status)) {
2693         deferredStatus = status;
2694     }
2695 }
2696 
setText(const UnicodeString & s)2697 void RBBIWordMonkey::setText(const UnicodeString &s) {
2698     fText       = &s;
2699 }
2700 
2701 
next(int32_t prevPos)2702 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2703     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2704                               //   break position being tested.  The candidate break
2705                               //   location is before p2.
2706 
2707     int     breakPos = -1;
2708 
2709     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2710 
2711     if (U_FAILURE(deferredStatus)) {
2712         return -1;
2713     }
2714 
2715     // Prev break at end of string.  return DONE.
2716     if (prevPos >= fText->length()) {
2717         return -1;
2718     }
2719     p0 = p1 = p2 = p3 = prevPos;
2720     c3 =  fText->char32At(prevPos);
2721     c0 = c1 = c2 = 0;
2722 
2723     // Loop runs once per "significant" character position in the input text.
2724     for (;;) {
2725         // Move all of the positions forward in the input string.
2726         p0 = p1;  c0 = c1;
2727         p1 = p2;  c1 = c2;
2728         p2 = p3;  c2 = c3;
2729 
2730         // Advancd p3 by    X(Extend | Format)*   Rule 4
2731         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2732         do {
2733             p3 = fText->moveIndex32(p3, 1);
2734             c3 = fText->char32At(p3);
2735             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2736                break;
2737             };
2738         }
2739         while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
2740 
2741 
2742         if (p1 == p2) {
2743             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2744             continue;
2745         }
2746         if (p2 == fText->length()) {
2747             // Reached end of string.  Always a break position.
2748             break;
2749         }
2750 
2751         // Rule  (3)   CR x LF
2752         //     No Extend or Format characters may appear between the CR and LF,
2753         //     which requires the additional check for p2 immediately following p1.
2754         //
2755         if (c1==0x0D && c2==0x0A) {
2756             continue;
2757         }
2758 
2759         // Rule (3a)  Break before and after newlines (including CR and LF)
2760         //
2761         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2762             break;
2763         };
2764         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2765             break;
2766         };
2767 
2768         // Rule (5).   ALetter x ALetter
2769         if (fALetterSet->contains(c1) &&
2770             fALetterSet->contains(c2))  {
2771             continue;
2772         }
2773 
2774         // Rule (6)  ALetter  x  (MidLetter | MidNumLet) ALetter
2775         //
2776         if ( fALetterSet->contains(c1)   &&
2777              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
2778              fALetterSet->contains(c3)) {
2779             continue;
2780         }
2781 
2782 
2783         // Rule (7)  ALetter (MidLetter | MidNumLet)  x  ALetter
2784         if (fALetterSet->contains(c0) &&
2785             (fMidLetterSet->contains(c1) ||  fMidNumLetSet->contains(c1)) &&
2786             fALetterSet->contains(c2)) {
2787             continue;
2788         }
2789 
2790         // Rule (8)    Numeric x Numeric
2791         if (fNumericSet->contains(c1) &&
2792             fNumericSet->contains(c2))  {
2793             continue;
2794         }
2795 
2796         // Rule (9)    ALetter x Numeric
2797         if (fALetterSet->contains(c1) &&
2798             fNumericSet->contains(c2))  {
2799             continue;
2800         }
2801 
2802         // Rule (10)    Numeric x ALetter
2803         if (fNumericSet->contains(c1) &&
2804             fALetterSet->contains(c2))  {
2805             continue;
2806         }
2807 
2808         // Rule (11)   Numeric (MidNum | MidNumLet)  x  Numeric
2809         if (fNumericSet->contains(c0) &&
2810             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1))  &&
2811             fNumericSet->contains(c2)) {
2812             continue;
2813         }
2814 
2815         // Rule (12)  Numeric x (MidNum | MidNumLet) Numeric
2816         if (fNumericSet->contains(c1) &&
2817             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2))  &&
2818             fNumericSet->contains(c3)) {
2819             continue;
2820         }
2821 
2822         // Rule (13)  Katakana x Katakana
2823         if (fKatakanaSet->contains(c1) &&
2824             fKatakanaSet->contains(c2))  {
2825             continue;
2826         }
2827 
2828         // Rule 13a
2829         if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
2830              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2831              fExtendNumLetSet->contains(c2)) {
2832                 continue;
2833              }
2834 
2835         // Rule 13b
2836         if (fExtendNumLetSet->contains(c1) &&
2837                 (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
2838                 fKatakanaSet->contains(c2)))  {
2839                 continue;
2840              }
2841 
2842         // Rule 14.  Break found here.
2843         break;
2844     }
2845 
2846     breakPos = p2;
2847     return breakPos;
2848 }
2849 
2850 
charClasses()2851 UVector  *RBBIWordMonkey::charClasses() {
2852     return fSets;
2853 }
2854 
2855 
~RBBIWordMonkey()2856 RBBIWordMonkey::~RBBIWordMonkey() {
2857     delete fSets;
2858     delete fCRSet;
2859     delete fLFSet;
2860     delete fNewlineSet;
2861     delete fKatakanaSet;
2862     delete fALetterSet;
2863     delete fMidNumLetSet;
2864     delete fMidLetterSet;
2865     delete fMidNumSet;
2866     delete fNumericSet;
2867     delete fFormatSet;
2868     delete fExtendSet;
2869     delete fExtendNumLetSet;
2870     delete fOtherSet;
2871 }
2872 
2873 
2874 
2875 
2876 //------------------------------------------------------------------------------------------
2877 //
2878 //   class RBBISentMonkey      Sentence Break specific implementation
2879 //                             of RBBIMonkeyKind.
2880 //
2881 //------------------------------------------------------------------------------------------
2882 class RBBISentMonkey: public RBBIMonkeyKind {
2883 public:
2884     RBBISentMonkey();
2885     virtual          ~RBBISentMonkey();
2886     virtual  UVector *charClasses();
2887     virtual  void     setText(const UnicodeString &s);
2888     virtual int32_t   next(int32_t i);
2889 private:
2890     int               moveBack(int posFrom);
2891     int               moveForward(int posFrom);
2892     UChar32           cAt(int pos);
2893 
2894     UVector      *fSets;
2895 
2896     UnicodeSet  *fSepSet;
2897     UnicodeSet  *fFormatSet;
2898     UnicodeSet  *fSpSet;
2899     UnicodeSet  *fLowerSet;
2900     UnicodeSet  *fUpperSet;
2901     UnicodeSet  *fOLetterSet;
2902     UnicodeSet  *fNumericSet;
2903     UnicodeSet  *fATermSet;
2904     UnicodeSet  *fSContinueSet;
2905     UnicodeSet  *fSTermSet;
2906     UnicodeSet  *fCloseSet;
2907     UnicodeSet  *fOtherSet;
2908     UnicodeSet  *fExtendSet;
2909 
2910     const UnicodeString  *fText;
2911 
2912 };
2913 
RBBISentMonkey()2914 RBBISentMonkey::RBBISentMonkey()
2915 {
2916     UErrorCode  status = U_ZERO_ERROR;
2917 
2918     fSets            = new UVector(status);
2919 
2920     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2921     //                       set and made into character classes of their own.  For the monkey impl,
2922     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2923     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2924     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2925     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2926     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2927     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2928     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2929     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2930     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2931     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2932     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2933     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2934     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2935     fOtherSet        = new UnicodeSet();
2936 
2937     if(U_FAILURE(status)) {
2938       deferredStatus = status;
2939       return;
2940     }
2941 
2942     fOtherSet->complement();
2943     fOtherSet->removeAll(*fSepSet);
2944     fOtherSet->removeAll(*fFormatSet);
2945     fOtherSet->removeAll(*fSpSet);
2946     fOtherSet->removeAll(*fLowerSet);
2947     fOtherSet->removeAll(*fUpperSet);
2948     fOtherSet->removeAll(*fOLetterSet);
2949     fOtherSet->removeAll(*fNumericSet);
2950     fOtherSet->removeAll(*fATermSet);
2951     fOtherSet->removeAll(*fSContinueSet);
2952     fOtherSet->removeAll(*fSTermSet);
2953     fOtherSet->removeAll(*fCloseSet);
2954     fOtherSet->removeAll(*fExtendSet);
2955 
2956     fSets->addElement(fSepSet,       status);
2957     fSets->addElement(fFormatSet,    status);
2958     fSets->addElement(fSpSet,        status);
2959     fSets->addElement(fLowerSet,     status);
2960     fSets->addElement(fUpperSet,     status);
2961     fSets->addElement(fOLetterSet,   status);
2962     fSets->addElement(fNumericSet,   status);
2963     fSets->addElement(fATermSet,     status);
2964     fSets->addElement(fSContinueSet, status);
2965     fSets->addElement(fSTermSet,     status);
2966     fSets->addElement(fCloseSet,     status);
2967     fSets->addElement(fOtherSet,     status);
2968     fSets->addElement(fExtendSet,    status);
2969 
2970     if (U_FAILURE(status)) {
2971         deferredStatus = status;
2972     }
2973 }
2974 
2975 
2976 
setText(const UnicodeString & s)2977 void RBBISentMonkey::setText(const UnicodeString &s) {
2978     fText       = &s;
2979 }
2980 
charClasses()2981 UVector  *RBBISentMonkey::charClasses() {
2982     return fSets;
2983 }
2984 
2985 
2986 //  moveBack()   Find the "significant" code point preceding the index i.
2987 //               Skips over ($Extend | $Format)* .
2988 //
moveBack(int i)2989 int RBBISentMonkey::moveBack(int i) {
2990     if (i <= 0) {
2991         return -1;
2992     }
2993     UChar32   c;
2994     int32_t   j = i;
2995     do {
2996         j = fText->moveIndex32(j, -1);
2997         c = fText->char32At(j);
2998     }
2999     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
3000     return j;
3001 
3002  }
3003 
3004 
moveForward(int i)3005 int RBBISentMonkey::moveForward(int i) {
3006     if (i>=fText->length()) {
3007         return fText->length();
3008     }
3009     UChar32   c;
3010     int32_t   j = i;
3011     do {
3012         j = fText->moveIndex32(j, 1);
3013         c = cAt(j);
3014     }
3015     while (fFormatSet->contains(c) || fExtendSet->contains(c));
3016     return j;
3017 }
3018 
cAt(int pos)3019 UChar32 RBBISentMonkey::cAt(int pos) {
3020     if (pos<0 || pos>=fText->length()) {
3021         return -1;
3022     } else {
3023         return fText->char32At(pos);
3024     }
3025 }
3026 
//  next()   Reference implementation of the sentence break rules, applied directly
//           to the text supplied through setText().
//
//           prevPos   Index of the previous break; the search starts from here.
//           returns   Index of the next break position, or -1 when construction
//                     failed (deferredStatus) or prevPos is at/after the end of the text.
//
//           The rules are tried in order for each candidate position (before p2).
//           "continue" means "no break here, try the next position"; "break" leaves
//           the loop with p2 as the break position.
//
next(int32_t prevPos)3027 int32_t RBBISentMonkey::next(int32_t prevPos) {
3028     int    p0, p1, p2, p3;    // Indices of the significant code points around the
3029                               //   break position being tested.  The candidate break
3030                               //   location is before p2.
3031 
3032     int     breakPos = -1;
3033 
3034     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
3035     UChar32 c;
3036 
3037     if (U_FAILURE(deferredStatus)) {
3038         return -1;
3039     }
3040 
3041     // Prev break at end of string.  return DONE.
3042     if (prevPos >= fText->length()) {
3043         return -1;
3044     }
3045     p0 = p1 = p2 = p3 = prevPos;
3046     c3 =  fText->char32At(prevPos);
3047     c0 = c1 = c2 = 0;
3048 
3049     // Loop runs once per "significant" character position in the input text.
3050     for (;;) {
3051         // Move all of the positions forward in the input string.
3052         p0 = p1;  c0 = c1;
3053         p1 = p2;  c1 = c2;
3054         p2 = p3;  c2 = c3;
3055 
3056         // Advance p3 by    X(Extend | Format)*   Rule 4
3057         p3 = moveForward(p3);
3058         c3 = cAt(p3);
3059 
3060         // Rule (3)  CR x LF
3061         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
3062             continue;
3063         }
3064 
3065         // Rule (4).   Sep  <break>
3066         if (fSepSet->contains(c1)) {
3067             p2 = p1+1;   // Separators don't combine with Extend or Format.
3068             break;
3069         }
3070 
3071         if (p2 >= fText->length()) {
3072             // Reached end of string.  Always a break position.
3073             break;
3074         }
3075 
3076         if (p2 == prevPos) {
3077             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
3078             continue;
3079         }
3080 
3081         // Rule (6).   ATerm x Numeric
3082         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
3083             continue;
3084         }
3085 
3086         // Rule (7).  Upper ATerm  x  Upper
3087         if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
3088             continue;
3089         }
3090 
3091         // Rule (8)  ATerm Close* Sp*  x  (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
3092         //           Note:  STerm | ATerm are added to the negated part of the expression by a
3093         //                  note to the Unicode 5.0 documents.
3094         int p8 = p1;
3095         while (fSpSet->contains(cAt(p8))) {
3096             p8 = moveBack(p8);
3097         }
3098         while (fCloseSet->contains(cAt(p8))) {
3099             p8 = moveBack(p8);
3100         }
3101         if (fATermSet->contains(cAt(p8))) {
         // Left context matched.  Now scan forward from p2 for the first code point
         //   that stops the negated set (or the end of text, where cAt() gives -1).
3102             p8=p2;
3103             for (;;) {
3104                 c = cAt(p8);
3105                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
3106                     fLowerSet->contains(c) || fSepSet->contains(c) ||
3107                     fATermSet->contains(c) || fSTermSet->contains(c))  {
3108                     break;
3109                 }
3110                 p8 = moveForward(p8);
3111             }
3112             if (fLowerSet->contains(cAt(p8))) {
3113                 continue;
3114             }
3115         }
3116 
3117         // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
3118         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
3119             p8 = p1;
3120             while (fSpSet->contains(cAt(p8))) {
3121                 p8 = moveBack(p8);
3122             }
3123             while (fCloseSet->contains(cAt(p8))) {
3124                 p8 = moveBack(p8);
3125             }
3126             c = cAt(p8);
3127             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
3128                 continue;
3129             }
3130         }
3131 
3132         // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
3133         int p9 = p1;
3134         while (fCloseSet->contains(cAt(p9))) {
3135             p9 = moveBack(p9);
3136         }
3137         c = cAt(p9);
3138         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
3139             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
3140                 continue;
3141             }
3142         }
3143 
3144         // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
3145         int p10 = p1;
3146         while (fSpSet->contains(cAt(p10))) {
3147             p10 = moveBack(p10);
3148         }
3149         while (fCloseSet->contains(cAt(p10))) {
3150             p10 = moveBack(p10);
3151         }
3152         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
3153             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
3154                 continue;
3155             }
3156         }
3157 
3158         // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
3159         int p11 = p1;
3160         if (fSepSet->contains(cAt(p11))) {
3161             p11 = moveBack(p11);
3162         }
3163         while (fSpSet->contains(cAt(p11))) {
3164             p11 = moveBack(p11);
3165         }
3166         while (fCloseSet->contains(cAt(p11))) {
3167             p11 = moveBack(p11);
3168         }
3169         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
3170             break;
3171         }
3172 
3173         //  Rule (12)  Any x Any
3174         continue;
3175     }
3176     breakPos = p2;
3177     return breakPos;
3178 }
3179 
~RBBISentMonkey()3180 RBBISentMonkey::~RBBISentMonkey() {
3181     delete fSets;
3182     delete fSepSet;
3183     delete fFormatSet;
3184     delete fSpSet;
3185     delete fLowerSet;
3186     delete fUpperSet;
3187     delete fOLetterSet;
3188     delete fNumericSet;
3189     delete fATermSet;
3190     delete fSContinueSet;
3191     delete fSTermSet;
3192     delete fCloseSet;
3193     delete fOtherSet;
3194     delete fExtendSet;
3195 }
3196 
3197 
3198 
3199 //-------------------------------------------------------------------------------------------
3200 //
3201 //  RBBILineMonkey
3202 //
3203 //-------------------------------------------------------------------------------------------
3204 
3205 class RBBILineMonkey: public RBBIMonkeyKind {
3206 public:
3207     RBBILineMonkey();
3208     virtual          ~RBBILineMonkey();
3209     virtual  UVector *charClasses();
3210     virtual  void     setText(const UnicodeString &s);
3211     virtual  int32_t  next(int32_t i);
3212     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
3213 private:
3214     UVector      *fSets;
3215 
3216     UnicodeSet  *fBK;
3217     UnicodeSet  *fCR;
3218     UnicodeSet  *fLF;
3219     UnicodeSet  *fCM;
3220     UnicodeSet  *fNL;
3221     UnicodeSet  *fSG;
3222     UnicodeSet  *fWJ;
3223     UnicodeSet  *fZW;
3224     UnicodeSet  *fGL;
3225     UnicodeSet  *fCB;
3226     UnicodeSet  *fSP;
3227     UnicodeSet  *fB2;
3228     UnicodeSet  *fBA;
3229     UnicodeSet  *fBB;
3230     UnicodeSet  *fHY;
3231     UnicodeSet  *fH2;
3232     UnicodeSet  *fH3;
3233     UnicodeSet  *fCL;
3234     UnicodeSet  *fCP;
3235     UnicodeSet  *fEX;
3236     UnicodeSet  *fIN;
3237     UnicodeSet  *fJL;
3238     UnicodeSet  *fJV;
3239     UnicodeSet  *fJT;
3240     UnicodeSet  *fNS;
3241     UnicodeSet  *fOP;
3242     UnicodeSet  *fQU;
3243     UnicodeSet  *fIS;
3244     UnicodeSet  *fNU;
3245     UnicodeSet  *fPO;
3246     UnicodeSet  *fPR;
3247     UnicodeSet  *fSY;
3248     UnicodeSet  *fAI;
3249     UnicodeSet  *fAL;
3250     UnicodeSet  *fID;
3251     UnicodeSet  *fSA;
3252     UnicodeSet  *fXX;
3253 
3254     BreakIterator  *fCharBI;
3255 
3256     const UnicodeString  *fText;
3257     int32_t              *fOrigPositions;
3258 
3259     RegexMatcher         *fNumberMatcher;
3260     RegexMatcher         *fLB11Matcher;
3261 };
3262 
3263 
RBBILineMonkey()3264 RBBILineMonkey::RBBILineMonkey()
3265 {
3266     UErrorCode  status = U_ZERO_ERROR;
3267 
3268     fSets  = new UVector(status);
3269 
3270     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
3271     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
3272     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
3273     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
3274     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
3275     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
3276     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
3277     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
3278     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
3279     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
3280     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
3281     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
3282     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
3283     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
3284     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
3285     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
3286     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
3287     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
3288     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
3289     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
3290     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
3291     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
3292     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
3293     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
3294     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
3295     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
3296     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
3297     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
3298     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
3299     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
3300     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
3301     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
3302     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
3303     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
3304     fSA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
3305     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
3306     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
3307 
3308     if (U_FAILURE(status)) {
3309         deferredStatus = status;
3310         fCharBI = NULL;
3311         fNumberMatcher = NULL;
3312         return;
3313     }
3314 
3315     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
3316     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
3317     fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
3318     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
3319 
3320     fSets->addElement(fBK, status);
3321     fSets->addElement(fCR, status);
3322     fSets->addElement(fLF, status);
3323     fSets->addElement(fCM, status);
3324     fSets->addElement(fNL, status);
3325     fSets->addElement(fWJ, status);
3326     fSets->addElement(fZW, status);
3327     fSets->addElement(fGL, status);
3328     fSets->addElement(fCB, status);
3329     fSets->addElement(fSP, status);
3330     fSets->addElement(fB2, status);
3331     fSets->addElement(fBA, status);
3332     fSets->addElement(fBB, status);
3333     fSets->addElement(fHY, status);
3334     fSets->addElement(fH2, status);
3335     fSets->addElement(fH3, status);
3336     fSets->addElement(fCL, status);
3337     fSets->addElement(fCP, status);
3338     fSets->addElement(fEX, status);
3339     fSets->addElement(fIN, status);
3340     fSets->addElement(fJL, status);
3341     fSets->addElement(fJT, status);
3342     fSets->addElement(fJV, status);
3343     fSets->addElement(fNS, status);
3344     fSets->addElement(fOP, status);
3345     fSets->addElement(fQU, status);
3346     fSets->addElement(fIS, status);
3347     fSets->addElement(fNU, status);
3348     fSets->addElement(fPO, status);
3349     fSets->addElement(fPR, status);
3350     fSets->addElement(fSY, status);
3351     fSets->addElement(fAI, status);
3352     fSets->addElement(fAL, status);
3353     fSets->addElement(fID, status);
3354     fSets->addElement(fWJ, status);
3355     fSets->addElement(fSA, status);
3356     fSets->addElement(fSG, status);
3357 
3358     const char *rules =
3359             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
3360             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
3361             "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
3362             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
3363             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
3364             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
3365 
3366     fNumberMatcher = new RegexMatcher(
3367         UnicodeString(rules, -1, US_INV), 0, status);
3368 
3369     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
3370 
3371     if (U_FAILURE(status)) {
3372         deferredStatus = status;
3373     }
3374 }
3375 
3376 
setText(const UnicodeString & s)3377 void RBBILineMonkey::setText(const UnicodeString &s) {
3378     fText       = &s;
3379     fCharBI->setText(s);
3380     fNumberMatcher->reset(s);
3381 }
3382 
3383 //
3384 //  rule9Adjust
3385 //     Line Break TR rules 9 and 10 implementation.
3386 //     This deals with combining marks and other sequences that
3387 //     that must be treated as if they were something other than what they actually are.
3388 //
3389 //     This is factored out into a separate function because it must be applied twice for
3390 //     each potential break, once to the chars before the position being checked, then
3391 //     again to the text following the possible break.
3392 //
rule9Adjust(int32_t pos,UChar32 * posChar,int32_t * nextPos,UChar32 * nextChar)3393 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
3394     if (pos == -1) {
3395         // Invalid initial position.  Happens during the warmup iteration of the
3396         //   main loop in next().
3397         return;
3398     }
3399 
3400     int32_t  nPos = *nextPos;
3401 
3402     // LB 9  Keep combining sequences together.
3403     //  advance over any CM class chars.  Note that Line Break CM is different
3404     //  from the normal Grapheme Extend property.
3405     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
3406           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
3407         for (;;) {
3408             *nextChar = fText->char32At(nPos);
3409             if (!fCM->contains(*nextChar)) {
3410                 break;
3411             }
3412             nPos = fText->moveIndex32(nPos, 1);
3413         }
3414     }
3415 
3416 
3417     // LB 9 Treat X CM* as if it were x.
3418     //       No explicit action required.
3419 
3420     // LB 10  Treat any remaining combining mark as AL
3421     if (fCM->contains(*posChar)) {
3422         *posChar = 0x41;   // thisChar = 'A';
3423     }
3424 
3425     // Push the updated nextPos and nextChar back to our caller.
3426     // This only makes a difference if posChar got bigger by consuming a
3427     // combining sequence.
3428     *nextPos  = nPos;
3429     *nextChar = fText->char32At(nPos);
3430 }
3431 
3432 
3433 
//  next()   Reference implementation of the line break rules, applied directly to
//           the text supplied through setText().
//
//           startPos  Index from which to start looking for the next break.
//           returns   Index of the next break position, or -1 when construction
//                     failed (deferredStatus) or startPos is at/after the end of the text.
//
//           The rules are tried in order for each candidate position (between
//           prevChar and thisChar).  "continue" means "no break here, advance";
//           "break" leaves the loop and returns pos as the break position.
//
next(int32_t startPos)3434 int32_t RBBILineMonkey::next(int32_t startPos) {
3435     UErrorCode status = U_ZERO_ERROR;
3436     int32_t    pos;       //  Index of the char following a potential break position
3437     UChar32    thisChar;  //  Character at above position "pos"
3438 
3439     int32_t    prevPos;   //  Index of the char preceding a potential break position
3440     UChar32    prevChar;  //  Character at above position.  Note that prevChar
3441                           //   and thisChar may not be adjacent because combining
3442                           //   characters between them will be ignored.
3443 
3444     int32_t    nextPos;   //  Index of the next character following pos.
3445                           //     Usually skips over combining marks.
3446     int32_t    nextCPPos; //  Index of the code point following "pos."
3447                           //     May point to a combining mark.
3448     int32_t    tPos;      //  temp value.
3449     UChar32    c;
3450 
3451     if (U_FAILURE(deferredStatus)) {
3452         return -1;
3453     }
3454 
3455     if (startPos >= fText->length()) {
3456         return -1;
3457     }
3458 
3459 
3460     // Initial values for loop.  Loop will run the first time without finding breaks,
3461     //                           while the invalid values shift out and the "this" and
3462     //                           "prev" positions are filled in with good values.
3463     pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.
3464     thisChar = prevChar  = 0;
3465     nextPos  = nextCPPos = startPos;
3466 
3467 
3468     // Loop runs once per position in the test text, until a break position
3469     //  is found.
3470     for (;;) {
3471         prevPos   = pos;
3472         prevChar  = thisChar;
3473 
3474         pos       = nextPos;
3475         thisChar  = fText->char32At(pos);
3476 
3477         nextCPPos = fText->moveIndex32(pos, 1);
3478         nextPos   = nextCPPos;
3479 
3480         // Rule LB2 - Break at end of text.
3481         if (pos >= fText->length()) {
3482             break;
3483         }
3484 
3485         // Rule LB 9 - adjust for combining sequences.
3486         //             We do this one out-of-order because the adjustment does not change anything
3487         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3488         //             be applied.
3489         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
3490         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3491         c = fText->char32At(nextPos);
3492         rule9Adjust(pos,     &thisChar, &nextPos, &c);
3493 
3494         // If the loop is still warming up - if we haven't shifted the initial
3495         //   -1 positions out of prevPos yet - loop back to advance the
3496         //    position in the input without any further looking for breaks.
3497         if (prevPos == -1) {
3498             continue;
3499         }
3500 
3501         // LB 4  Always break after hard line breaks,
3502         if (fBK->contains(prevChar)) {
3503             break;
3504         }
3505 
3506         // LB 5  Break after CR, LF, NL, but not inside CR LF
3507         if (prevChar == 0x0d && thisChar == 0x0a) {
3508             continue;
3509         }
3510         if (prevChar == 0x0d ||
3511             prevChar == 0x0a ||
3512             prevChar == 0x85)  {
3513             break;
3514         }
3515 
3516         // LB 6  Don't break before hard line breaks
3517         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3518             fBK->contains(thisChar)) {
3519                 continue;
3520         }
3521 
3522 
3523         // LB 7  Don't break before spaces or zero-width space.
3524         if (fSP->contains(thisChar)) {
3525             continue;
3526         }
3527 
3528         if (fZW->contains(thisChar)) {
3529             continue;
3530         }
3531 
3532         // LB 8  Break after zero width space
3533         if (fZW->contains(prevChar)) {
3534             break;
3535         }
3536 
3537         // LB 9, 10  Already done, at top of loop.
3538         //
3539 
3540 
3541         // LB 11  Do not break before or after WORD JOINER and related characters.
3542         //    x  WJ
3543         //    WJ  x
3544         //
3545         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3546             continue;
3547         }
3548 
3549         // LB 12
3550         //    GL  x
3551         if (fGL->contains(prevChar)) {
3552             continue;
3553         }
3554 
3555         // LB 12a
3556         //    [^SP BA HY] x GL
3557         if (!(fSP->contains(prevChar) ||
3558               fBA->contains(prevChar) ||
3559               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
3560             continue;
3561         }
3562 
3563 
3564 
3565         // LB 13  Don't break before closings.
3566         //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
3567         //        fall into LB 25 and the more general number regular expression.
3568         //
3569         if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
3570             (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
3571                                          fEX->contains(thisChar)  ||
3572             (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
3573             (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
3574             continue;
3575         }
3576 
3577         // LB 14 Don't break after OP SP*
3578         //       Scan backwards, checking for this sequence.
3579         //       The OP char could include combining marks, so we actually check for
3580         //           OP CM* SP*
3581         //       Another Twist: The Rule 67 fixes may have changed a SP CM
3582         //       sequence into a ID char, so before scanning back through spaces,
3583         //       verify that prevChar is indeed a space.  The prevChar variable
3584         //       may differ from fText[prevPos]
3585         tPos = prevPos;
3586         if (fSP->contains(prevChar)) {
3587             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3588                 tPos=fText->moveIndex32(tPos, -1);
3589             }
3590         }
3591         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3592             tPos=fText->moveIndex32(tPos, -1);
3593         }
3594         if (fOP->contains(fText->char32At(tPos))) {
3595             continue;
3596         }
3597 
3598 
3599         // LB 15    QU SP* x OP
3600         if (fOP->contains(thisChar)) {
3601             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
             // Note: this block-local tPos hides the function-level tPos declared above.
3602             int tPos = prevPos;
3603             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3604                 tPos = fText->moveIndex32(tPos, -1);
3605             }
3606             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3607                 tPos = fText->moveIndex32(tPos, -1);
3608             }
3609             if (fQU->contains(fText->char32At(tPos))) {
3610                 continue;
3611             }
3612         }
3613 
3614 
3615 
3616         // LB 16   (CL | CP) SP* x NS
3617         //    Scan backwards for SP* CM* (CL | CP)
3618         if (fNS->contains(thisChar)) {
             // Note: this block-local tPos hides the function-level tPos declared above.
3619             int tPos = prevPos;
3620             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3621                 tPos = fText->moveIndex32(tPos, -1);
3622             }
3623             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3624                 tPos = fText->moveIndex32(tPos, -1);
3625             }
3626             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3627                 continue;
3628             }
3629         }
3630 
3631 
3632         // LB 17        B2 SP* x B2
3633         if (fB2->contains(thisChar)) {
3634             //  Scan backwards, checking for the B2 CM* SP* sequence.
3635             tPos = prevPos;
3636             if (fSP->contains(prevChar)) {
3637                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3638                     tPos=fText->moveIndex32(tPos, -1);
3639                 }
3640             }
3641             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3642                 tPos=fText->moveIndex32(tPos, -1);
3643             }
3644             if (fB2->contains(fText->char32At(tPos))) {
3645                 continue;
3646             }
3647         }
3648 
3649 
3650         // LB 18    break after space
3651         if (fSP->contains(prevChar)) {
3652             break;
3653         }
3654 
3655         // LB 19
3656         //    x   QU
3657         //    QU  x
3658         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3659             continue;
3660         }
3661 
3662         // LB 20  Break around a CB
3663         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3664             break;
3665         }
3666 
3667         // LB 21
3668         if (fBA->contains(thisChar) ||
3669             fHY->contains(thisChar) ||
3670             fNS->contains(thisChar) ||
3671             fBB->contains(prevChar) )   {
3672             continue;
3673         }
3674 
3675         // LB 22
3676         if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3677             (fID->contains(prevChar) && fIN->contains(thisChar)) ||
3678             (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3679             (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
3680             continue;
3681         }
3682 
3683 
3684         // LB 23    ID x PO
3685         //          AL x NU
3686         //          NU x AL
3687         if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
3688             (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
3689             (fNU->contains(prevChar) && fAL->contains(thisChar)) )   {
3690             continue;
3691         }
3692 
3693         // LB 24  Do not break between prefix and letters or ideographs.
3694         //        PR x ID
3695         //        PR x AL
3696         //        PO x AL
3697         if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
3698             (fPR->contains(prevChar) && fAL->contains(thisChar)) ||
3699             (fPO->contains(prevChar) && fAL->contains(thisChar)) )   {
3700             continue;
3701         }
3702 
3703 
3704 
3705         // LB 25    Numbers
3706         if (fNumberMatcher->lookingAt(prevPos, status)) {
3707             if (U_FAILURE(status)) {
3708                 break;
3709             }
3710             // Matched a number.  But could have been just a single digit, which would
3711             //    not represent a "no break here" between prevChar and thisChar
3712             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
3713             if (numEndIdx > pos) {
3714                 // Number match includes at least our two chars being checked
3715                 if (numEndIdx > nextPos) {
3716                     // Number match includes additional chars.  Update pos and nextPos
3717                     //   so that next loop iteration will continue at the end of the number,
3718                     //   checking for breaks between last char in number & whatever follows.
3719                     pos = nextPos = numEndIdx;
3720                     do {
3721                         pos = fText->moveIndex32(pos, -1);
3722                         thisChar = fText->char32At(pos);
3723                     } while (fCM->contains(thisChar));
3724                 }
3725                 continue;
3726             }
3727         }
3728 
3729 
3730         // LB 26 Do not break a Korean syllable.
3731         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3732                                         fJV->contains(thisChar) ||
3733                                         fH2->contains(thisChar) ||
3734                                         fH3->contains(thisChar))) {
3735                                             continue;
3736                                         }
3737 
3738         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3739             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3740                 continue;
3741         }
3742 
3743         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3744             fJT->contains(thisChar)) {
3745                 continue;
3746         }
3747 
3748         // LB 27 Treat a Korean Syllable Block the same as ID.
3749         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3750             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3751             fIN->contains(thisChar)) {
3752                 continue;
3753             }
3754         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3755             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3756             fPO->contains(thisChar)) {
3757                 continue;
3758             }
3759         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3760             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3761                 continue;
3762             }
3763 
3764 
3765 
3766         // LB 28  Do not break between alphabetics ("at").
3767         if (fAL->contains(prevChar) && fAL->contains(thisChar)) {
3768             continue;
3769         }
3770 
3771         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
3772         if (fIS->contains(prevChar) && fAL->contains(thisChar)) {
3773             continue;
3774         }
3775 
3776         // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3777         //          (AL | NU) x OP
3778         //          CP x (AL | NU)
3779         if ((fAL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3780             continue;
3781         }
3782         if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fNU->contains(thisChar))) {
3783             continue;
3784         }
3785 
3786         // LB 31    Break everywhere else
3787         break;
3788 
3789     }
3790 
3791     return pos;
3792 }
3793 
3794 
charClasses()3795 UVector  *RBBILineMonkey::charClasses() {
3796     return fSets;
3797 }
3798 
3799 
~RBBILineMonkey()3800 RBBILineMonkey::~RBBILineMonkey() {
3801     delete fSets;
3802 
3803     delete fBK;
3804     delete fCR;
3805     delete fLF;
3806     delete fCM;
3807     delete fNL;
3808     delete fWJ;
3809     delete fZW;
3810     delete fGL;
3811     delete fCB;
3812     delete fSP;
3813     delete fB2;
3814     delete fBA;
3815     delete fBB;
3816     delete fHY;
3817     delete fH2;
3818     delete fH3;
3819     delete fCL;
3820     delete fCP;
3821     delete fEX;
3822     delete fIN;
3823     delete fJL;
3824     delete fJV;
3825     delete fJT;
3826     delete fNS;
3827     delete fOP;
3828     delete fQU;
3829     delete fIS;
3830     delete fNU;
3831     delete fPO;
3832     delete fPR;
3833     delete fSY;
3834     delete fAI;
3835     delete fAL;
3836     delete fID;
3837     delete fSA;
3838     delete fSG;
3839     delete fXX;
3840 
3841     delete fCharBI;
3842     delete fNumberMatcher;
3843 }
3844 
3845 
//-------------------------------------------------------------------------------------------
//
//   TestMonkey
//
//     params
//       seed=nnnnn        Random number starting seed.
//                         Setting the seed allows errors to be reproduced.
//       loop=nnn          Looping count.  Controls running time.
//                         -1:  run forever.
//                          0 or greater:  run length.
//
//       type = char | word | line | sent | title
//                         Break iterator type to test.  When omitted, the
//                         char, word, line and sent tests are all run.
//
//       utext             Hand the test text to the break iterator through a UText.
//
//-------------------------------------------------------------------------------------------
3860 
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)3861 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3862     int32_t val = defaultVal;
3863     name.append(" *= *(-?\\d+)");
3864     UErrorCode status = U_ZERO_ERROR;
3865     RegexMatcher m(name, params, 0, status);
3866     if (m.find()) {
3867         // The param exists.  Convert the string to an int.
3868         char valString[100];
3869         int32_t paramLength = m.end(1, status) - m.start(1, status);
3870         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3871             paramLength = (int32_t)(sizeof(valString)-2);
3872         }
3873         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3874         val = strtol(valString,  NULL, 10);
3875 
3876         // Delete this parameter from the params string.
3877         m.reset();
3878         params = m.replaceFirst("", status);
3879     }
3880     U_ASSERT(U_SUCCESS(status));
3881     return val;
3882 }
3883 #endif
3884 
testBreakBoundPreceding(RBBITest * test,UnicodeString ustr,BreakIterator * bi,int expected[],int expectedcount)3885 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3886                                     BreakIterator *bi,
3887                                     int expected[],
3888                                     int expectedcount)
3889 {
3890     int count = 0;
3891     int i = 0;
3892     int forward[50];
3893     bi->setText(ustr);
3894     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3895         forward[count] = i;
3896         if (count < expectedcount && expected[count] != i) {
3897             test->errln("break forward test failed: expected %d but got %d",
3898                         expected[count], i);
3899             break;
3900         }
3901         count ++;
3902     }
3903     if (count != expectedcount) {
3904         printStringBreaks(ustr, expected, expectedcount);
3905         test->errln("break forward test failed: missed %d match",
3906                     expectedcount - count);
3907         return;
3908     }
3909     // testing boundaries
3910     for (i = 1; i < expectedcount; i ++) {
3911         int j = expected[i - 1];
3912         if (!bi->isBoundary(j)) {
3913             printStringBreaks(ustr, expected, expectedcount);
3914             test->errln("isBoundary() failed.  Expected boundary at position %d", j);
3915             return;
3916         }
3917         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3918             if (bi->isBoundary(j)) {
3919                 printStringBreaks(ustr, expected, expectedcount);
3920                 test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
3921                 return;
3922             }
3923         }
3924     }
3925 
3926     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3927         count --;
3928         if (forward[count] != i) {
3929             test->errln("happy break test previous() failed: expected %d but got %d",
3930                         forward[count], i);
3931             break;
3932         }
3933     }
3934     if (count != 0) {
3935         printStringBreaks(ustr, expected, expectedcount);
3936         test->errln("break test previous() failed: missed a match");
3937         return;
3938     }
3939 
3940     // testing preceding
3941     for (i = 0; i < expectedcount - 1; i ++) {
3942         // int j = expected[i] + 1;
3943         int j = ustr.moveIndex32(expected[i], 1);
3944         for (; j <= expected[i + 1]; j ++) {
3945             if (bi->preceding(j) != expected[i]) {
3946                 printStringBreaks(ustr, expected, expectedcount);
3947                 test->errln("preceding(): Not expecting boundary at position %d", j);
3948                 return;
3949             }
3950         }
3951     }
3952 }
3953 
TestWordBreaks(void)3954 void RBBITest::TestWordBreaks(void)
3955 {
3956 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3957 
3958     Locale        locale("en");
3959     UErrorCode    status = U_ZERO_ERROR;
3960     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3961     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3962     static const char *strlist[] =
3963     {
3964     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3965     "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
3966     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3967     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3968     "\\u90ca\\u3588\\u009c\\u0953\\u194b",
3969     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3970     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3971     "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
3972     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3973     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3974     "\\u2027\\U000e0067\\u0a47\\u00b7",
3975     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3976     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3977     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3978     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3979     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3980     "\\u0027\\u11af\\U000e0057\\u0602",
3981     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3982     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3983     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3984     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3985     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3986     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3987     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3988     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3989     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3990     "\\u58f4\\U000e0049\\u20e7\\u2027",
3991     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3992     "\\ua183\\u102d\\u0bec\\u003a",
3993     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3994     "\\u003a\\u0e57\\u0fad\\u002e",
3995     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3996     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3997     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3998     "\\u003a\\u0664\\u00b7\\u1fba",
3999     "\\u003b\\u0027\\u00b7\\u47a3",
4000     "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
4001     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
4002     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
4003     };
4004     int loop;
4005     if (U_FAILURE(status)) {
4006         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
4007         return;
4008     }
4009     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
4010         // printf("looping %d\n", loop);
4011         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
4012         // RBBICharMonkey monkey;
4013         RBBIWordMonkey monkey;
4014 
4015         int expected[50];
4016         int expectedcount = 0;
4017 
4018         monkey.setText(ustr);
4019         int i;
4020         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4021             expected[expectedcount ++] = i;
4022         }
4023 
4024         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4025     }
4026     delete bi;
4027 #endif
4028 }
4029 
TestWordBoundary(void)4030 void RBBITest::TestWordBoundary(void)
4031 {
4032     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
4033     Locale        locale("en");
4034     UErrorCode    status = U_ZERO_ERROR;
4035     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
4036     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
4037     UChar         str[50];
4038     static const char *strlist[] =
4039     {
4040     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
4041     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
4042     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
4043     "\\u2027\\U000e0067\\u0a47\\u00b7",
4044     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
4045     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
4046     "\\u0589\\U000e006e\\u0a42\\U000104a5",
4047     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
4048     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
4049     "\\u0027\\u11af\\U000e0057\\u0602",
4050     "\\U0001d7f2\\U000e007\\u0004\\u0589",
4051     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
4052     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
4053     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
4054     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
4055     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
4056     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
4057     "\\u0233\\U000e0020\\u0a69\\u0d6a",
4058     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
4059     "\\u58f4\\U000e0049\\u20e7\\u2027",
4060     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
4061     "\\ua183\\u102d\\u0bec\\u003a",
4062     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
4063     "\\u003a\\u0e57\\u0fad\\u002e",
4064     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
4065     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
4066     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
4067     "\\u003a\\u0664\\u00b7\\u1fba",
4068     "\\u003b\\u0027\\u00b7\\u47a3",
4069     };
4070     int loop;
4071     if (U_FAILURE(status)) {
4072         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
4073         return;
4074     }
4075     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
4076         // printf("looping %d\n", loop);
4077         u_unescape(strlist[loop], str, 20);
4078         UnicodeString ustr(str);
4079         int forward[50];
4080         int count = 0;
4081 
4082         bi->setText(ustr);
4083         int prev = 0;
4084         int i;
4085         for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
4086             forward[count ++] = i;
4087             if (i > prev) {
4088                 int j;
4089                 for (j = prev + 1; j < i; j ++) {
4090                     if (bi->isBoundary(j)) {
4091                         printStringBreaks(ustr, forward, count);
4092                         errln("happy boundary test failed: expected %d not a boundary",
4093                                j);
4094                         return;
4095                     }
4096                 }
4097             }
4098             if (!bi->isBoundary(i)) {
4099                 printStringBreaks(ustr, forward, count);
4100                 errln("happy boundary test failed: expected %d a boundary",
4101                        i);
4102                 return;
4103             }
4104             prev = i;
4105         }
4106     }
4107     delete bi;
4108 }
4109 
TestLineBreaks(void)4110 void RBBITest::TestLineBreaks(void)
4111 {
4112 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4113     Locale        locale("en");
4114     UErrorCode    status = U_ZERO_ERROR;
4115     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
4116     const int32_t  STRSIZE = 50;
4117     UChar         str[STRSIZE];
4118     static const char *strlist[] =
4119     {
4120      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
4121      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
4122              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
4123      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
4124              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
4125      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
4126      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
4127      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
4128      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
4129      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
4130      "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
4131      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
4132      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
4133      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
4134      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
4135      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
4136      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
4137      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
4138      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
4139      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
4140      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
4141      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
4142      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
4143      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
4144      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
4145      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
4146      "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
4147      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
4148      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
4149      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
4150      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
4151      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
4152      "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
4153      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
4154      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
4155      "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
4156      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
4157      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
4158      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
4159      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
4160      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
4161      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
4162          "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
4163          "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
4164          "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
4165      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
4166          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
4167     };
4168     int loop;
4169     TEST_ASSERT_SUCCESS(status);
4170     if (U_FAILURE(status)) {
4171         return;
4172     }
4173     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
4174         // printf("looping %d\n", loop);
4175         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
4176         if (t >= STRSIZE) {
4177             TEST_ASSERT(FALSE);
4178             continue;
4179         }
4180 
4181 
4182         UnicodeString ustr(str);
4183         RBBILineMonkey monkey;
4184         if (U_FAILURE(monkey.deferredStatus)) {
4185             continue;
4186         }
4187 
4188         const int EXPECTEDSIZE = 50;
4189         int expected[EXPECTEDSIZE];
4190         int expectedcount = 0;
4191 
4192         monkey.setText(ustr);
4193         int i;
4194         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4195             if (expectedcount >= EXPECTEDSIZE) {
4196                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4197                 return;
4198             }
4199             expected[expectedcount ++] = i;
4200         }
4201 
4202         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4203     }
4204     delete bi;
4205 #endif
4206 }
4207 
TestSentBreaks(void)4208 void RBBITest::TestSentBreaks(void)
4209 {
4210 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4211     Locale        locale("en");
4212     UErrorCode    status = U_ZERO_ERROR;
4213     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4214     UChar         str[200];
4215     static const char *strlist[] =
4216     {
4217      "Now\ris\nthe\r\ntime\n\rfor\r\r",
4218      "This\n",
4219      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
4220      "\"Sentence ending with a quote.\" Bye.",
4221      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
4222      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
4223      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
4224      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
4225      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
4226      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
4227      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
4228              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
4229              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
4230              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
4231      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
4232              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
4233              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
4234              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
4235              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
4236              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
4237     };
4238     int loop;
4239     if (U_FAILURE(status)) {
4240         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
4241         return;
4242     }
4243     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
4244         u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
4245         UnicodeString ustr(str);
4246 
4247         RBBISentMonkey monkey;
4248         if (U_FAILURE(monkey.deferredStatus)) {
4249             continue;
4250         }
4251 
4252         const int EXPECTEDSIZE = 50;
4253         int expected[EXPECTEDSIZE];
4254         int expectedcount = 0;
4255 
4256         monkey.setText(ustr);
4257         int i;
4258         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4259             if (expectedcount >= EXPECTEDSIZE) {
4260                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4261                 return;
4262             }
4263             expected[expectedcount ++] = i;
4264         }
4265 
4266         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4267     }
4268     delete bi;
4269 #endif
4270 }
4271 
TestMonkey(char * params)4272 void RBBITest::TestMonkey(char *params) {
4273 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4274 
4275     UErrorCode     status    = U_ZERO_ERROR;
4276     int32_t        loopCount = 500;
4277     int32_t        seed      = 1;
4278     UnicodeString  breakType = "all";
4279     Locale         locale("en");
4280     UBool          useUText  = FALSE;
4281 
4282     if (quick == FALSE) {
4283         loopCount = 10000;
4284     }
4285 
4286     if (params) {
4287         UnicodeString p(params);
4288         loopCount = getIntParam("loop", p, loopCount);
4289         seed      = getIntParam("seed", p, seed);
4290 
4291         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
4292         if (m.find()) {
4293             breakType = m.group(1, status);
4294             m.reset();
4295             p = m.replaceFirst("", status);
4296         }
4297 
4298         RegexMatcher u(" *utext", p, 0, status);
4299         if (u.find()) {
4300             useUText = TRUE;
4301             u.reset();
4302             p = u.replaceFirst("", status);
4303         }
4304 
4305 
4306         // m.reset(p);
4307         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
4308             // Each option is stripped out of the option string as it is processed.
4309             // All options have been checked.  The option string should have been completely emptied..
4310             char buf[100];
4311             p.extract(buf, sizeof(buf), NULL, status);
4312             buf[sizeof(buf)-1] = 0;
4313             errln("Unrecognized or extra parameter:  %s\n", buf);
4314             return;
4315         }
4316 
4317     }
4318 
4319     if (breakType == "char" || breakType == "all") {
4320         RBBICharMonkey  m;
4321         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
4322         if (U_SUCCESS(status)) {
4323             RunMonkey(bi, m, "char", seed, loopCount, useUText);
4324             if (breakType == "all" && useUText==FALSE) {
4325                 // Also run a quick test with UText when "all" is specified
4326                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
4327             }
4328         }
4329         else {
4330             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
4331         }
4332         delete bi;
4333     }
4334 
4335     if (breakType == "word" || breakType == "all") {
4336         logln("Word Break Monkey Test");
4337         RBBIWordMonkey  m;
4338         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
4339         if (U_SUCCESS(status)) {
4340             RunMonkey(bi, m, "word", seed, loopCount, useUText);
4341         }
4342         else {
4343             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
4344         }
4345         delete bi;
4346     }
4347 
4348     if (breakType == "line" || breakType == "all") {
4349         logln("Line Break Monkey Test");
4350         RBBILineMonkey  m;
4351         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
4352         if (loopCount >= 10) {
4353             loopCount = loopCount / 5;   // Line break runs slower than the others.
4354         }
4355         if (U_SUCCESS(status)) {
4356             RunMonkey(bi, m, "line", seed, loopCount, useUText);
4357         }
4358         else {
4359             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4360         }
4361         delete bi;
4362     }
4363 
4364     if (breakType == "sent" || breakType == "all"  ) {
4365         logln("Sentence Break Monkey Test");
4366         RBBISentMonkey  m;
4367         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
4368         if (loopCount >= 10) {
4369             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
4370         }
4371         if (U_SUCCESS(status)) {
4372             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
4373         }
4374         else {
4375             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4376         }
4377         delete bi;
4378     }
4379 
4380 #endif
4381 }
4382 
//
//  Run a RBBI monkey test.  Common routine, for all break iterator types.
//    Parameters:
//       bi      - the break iterator to use
//       mk      - MonkeyKind, abstraction for obtaining expected results
//       name    - Name of test (char, word, etc.) for use in error messages
//       seed    - Seed for starting random number generator (parameter from user)
//       numIterations - Number of random test strings to run; -1 means run forever
//       useUText - If TRUE, the test text is given to the break iterator through a UText
//
RunMonkey(BreakIterator * bi,RBBIMonkeyKind & mk,const char * name,uint32_t seed,int32_t numIterations,UBool useUText)4392 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
4393                          int32_t numIterations, UBool useUText) {
4394 
4395 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4396 
4397     const int32_t    TESTSTRINGLEN = 500;
4398     UnicodeString    testText;
4399     int32_t          numCharClasses;
4400     UVector          *chClasses;
4401     int              expected[TESTSTRINGLEN*2 + 1];
4402     int              expectedCount = 0;
4403     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
4404     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
4405     char             reverseBreaks[TESTSTRINGLEN*2+1];
4406     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
4407     char             followingBreaks[TESTSTRINGLEN*2+1];
4408     char             precedingBreaks[TESTSTRINGLEN*2+1];
4409     int              i;
4410     int              loopCount = 0;
4411 
4412     m_seed = seed;
4413 
4414     numCharClasses = mk.charClasses()->size();
4415     chClasses      = mk.charClasses();
4416 
    // Check for errors that occurred during the construction of the MonkeyKind object.
    //  Can't report them where they occurred because errln() is a method coming from intlTest,
    //  and is not visible outside of RBBITest :-(
4420     if (U_FAILURE(mk.deferredStatus)) {
4421         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4422         return;
4423     }
4424 
4425     // Verify that the character classes all have at least one member.
4426     for (i=0; i<numCharClasses; i++) {
4427         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4428         if (s == NULL || s->size() == 0) {
4429             errln("Character Class #%d is null or of zero size.", i);
4430             return;
4431         }
4432     }
4433 
4434     while (loopCount < numIterations || numIterations == -1) {
4435         if (numIterations == -1 && loopCount % 10 == 0) {
4436             // If test is running in an infinite loop, display a periodic tic so
4437             //   we can tell that it is making progress.
4438             fprintf(stderr, ".");
4439         }
4440         // Save current random number seed, so that we can recreate the random numbers
4441         //   for this loop iteration in event of an error.
4442         seed = m_seed;
4443 
4444         // Populate a test string with data.
4445         testText.truncate(0);
4446         for (i=0; i<TESTSTRINGLEN; i++) {
4447             int32_t  aClassNum = m_rand() % numCharClasses;
4448             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4449             int32_t   charIdx = m_rand() % classSet->size();
4450             UChar32   c = classSet->charAt(charIdx);
4451             if (c < 0) {   // TODO:  deal with sets containing strings.
4452                 errln("c < 0");
4453                 break;
4454             }
4455             testText.append(c);
4456         }
4457 
4458         // Calculate the expected results for this test string.
4459         mk.setText(testText);
4460         memset(expectedBreaks, 0, sizeof(expectedBreaks));
4461         expectedBreaks[0] = 1;
4462         int32_t breakPos = 0;
4463         expectedCount = 0;
4464         for (;;) {
4465             breakPos = mk.next(breakPos);
4466             if (breakPos == -1) {
4467                 break;
4468             }
4469             if (breakPos > testText.length()) {
4470                 errln("breakPos > testText.length()");
4471             }
4472             expectedBreaks[breakPos] = 1;
4473             U_ASSERT(expectedCount<testText.length());
4474             expected[expectedCount ++] = breakPos;
4475         }
4476 
4477         // Find the break positions using forward iteration
4478         memset(forwardBreaks, 0, sizeof(forwardBreaks));
4479         if (useUText) {
4480             UErrorCode status = U_ZERO_ERROR;
4481             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4482             // testUText = utext_openUnicodeString(testUText, &testText, &status);
4483             bi->setText(testUText, status);
4484             TEST_ASSERT_SUCCESS(status);
4485             utext_close(testUText);   // The break iterator does a shallow clone of the UText
4486                                       //  This UText can be closed immediately, so long as the
4487                                       //  testText string continues to exist.
4488         } else {
4489             bi->setText(testText);
4490         }
4491 
4492         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4493             if (i < 0 || i > testText.length()) {
4494                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4495                 break;
4496             }
4497             forwardBreaks[i] = 1;
4498         }
4499 
4500         // Find the break positions using reverse iteration
4501         memset(reverseBreaks, 0, sizeof(reverseBreaks));
4502         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4503             if (i < 0 || i > testText.length()) {
4504                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4505                 break;
4506             }
4507             reverseBreaks[i] = 1;
4508         }
4509 
4510         // Find the break positions using isBoundary() tests.
4511         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4512         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4513         for (i=0; i<=testText.length(); i++) {
4514             isBoundaryBreaks[i] = bi->isBoundary(i);
4515         }
4516 
4517 
4518         // Find the break positions using the following() function.
4519         // printf(".");
4520         memset(followingBreaks, 0, sizeof(followingBreaks));
4521         int32_t   lastBreakPos = 0;
4522         followingBreaks[0] = 1;
4523         for (i=0; i<testText.length(); i++) {
4524             breakPos = bi->following(i);
4525             if (breakPos <= i ||
4526                 breakPos < lastBreakPos ||
4527                 breakPos > testText.length() ||
4528                 (breakPos > lastBreakPos && lastBreakPos > i)) {
4529                 errln("%s break monkey test: "
4530                     "Out of range value returned by BreakIterator::following().\n"
4531                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4532                          name, seed, i, breakPos, lastBreakPos);
4533                 break;
4534             }
4535             followingBreaks[breakPos] = 1;
4536             lastBreakPos = breakPos;
4537         }
4538 
4539         // Find the break positions using the preceding() function.
4540         memset(precedingBreaks, 0, sizeof(precedingBreaks));
4541         lastBreakPos = testText.length();
4542         precedingBreaks[testText.length()] = 1;
4543         for (i=testText.length(); i>0; i--) {
4544             breakPos = bi->preceding(i);
4545             if (breakPos >= i ||
4546                 breakPos > lastBreakPos ||
4547                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4548                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4549                 errln("%s break monkey test: "
4550                     "Out of range value returned by BreakIterator::preceding().\n"
4551                     "index=%d;  prev returned %d; lastBreak=%d" ,
4552                     name,  i, breakPos, lastBreakPos);
4553                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4554                     precedingBreaks[i] = 2;   // Forces an error.
4555                 }
4556             } else {
4557                 if (breakPos >= 0) {
4558                     precedingBreaks[breakPos] = 1;
4559                 }
4560                 lastBreakPos = breakPos;
4561             }
4562         }
4563 
4564         // Compare the expected and actual results.
4565         for (i=0; i<=testText.length(); i++) {
4566             const char *errorType = NULL;
4567             if  (forwardBreaks[i] != expectedBreaks[i]) {
4568                 errorType = "next()";
4569             } else if (reverseBreaks[i] != forwardBreaks[i]) {
4570                 errorType = "previous()";
4571             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4572                 errorType = "isBoundary()";
4573             } else if (followingBreaks[i] != expectedBreaks[i]) {
4574                 errorType = "following()";
4575             } else if (precedingBreaks[i] != expectedBreaks[i]) {
4576                 errorType = "preceding()";
4577             }
4578 
4579 
4580             if (errorType != NULL) {
4581                 // Format a range of the test text that includes the failure as
4582                 //  a data item that can be included in the rbbi test data file.
4583 
4584                 // Start of the range is the last point where expected and actual results
4585                 //   both agreed that there was a break position.
4586                 int startContext = i;
4587                 int32_t count = 0;
4588                 for (;;) {
4589                     if (startContext==0) { break; }
4590                     startContext --;
4591                     if (expectedBreaks[startContext] != 0) {
4592                         if (count == 2) break;
4593                         count ++;
4594                     }
4595                 }
4596 
4597                 // End of range is two expected breaks past the start position.
4598                 int endContext = i + 1;
4599                 int ci;
4600                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4601                     for (;;) {
4602                         if (endContext >= testText.length()) {break;}
4603                         if (expectedBreaks[endContext-1] != 0) {
4604                             if (count == 0) break;
4605                             count --;
4606                         }
4607                         endContext ++;
4608                     }
4609                 }
4610 
4611                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4612                 UnicodeString errorText = "<data>";
4613                 /***if (strcmp(errorType, "next()") == 0) {
4614                     startContext = 0;
4615                     endContext = testText.length();
4616 
4617                     printStringBreaks(testText, expected, expectedCount);
4618                 }***/
4619 
4620                 for (ci=startContext; ci<endContext;) {
4621                     UnicodeString hexChars("0123456789abcdef");
4622                     UChar32  c;
4623                     int      bn;
4624                     c = testText.char32At(ci);
4625                     if (ci == i) {
4626                         // This is the location of the error.
4627                         errorText.append("<?>");
4628                     } else if (expectedBreaks[ci] != 0) {
4629                         // This a non-error expected break position.
4630                         errorText.append("\\");
4631                     }
4632                     if (c < 0x10000) {
4633                         errorText.append("\\u");
4634                         for (bn=12; bn>=0; bn-=4) {
4635                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4636                         }
4637                     } else {
4638                         errorText.append("\\U");
4639                         for (bn=28; bn>=0; bn-=4) {
4640                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4641                         }
4642                     }
4643                     ci = testText.moveIndex32(ci, 1);
4644                 }
4645                 errorText.append("\\");
4646                 errorText.append("</data>\n");
4647 
4648                 // Output the error
4649                 char  charErrorTxt[500];
4650                 UErrorCode status = U_ZERO_ERROR;
4651                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4652                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4653                 errln("%s break monkey test error.  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
4654                     name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4655                     errorType, seed, i, charErrorTxt);
4656                 break;
4657             }
4658         }
4659 
4660         loopCount++;
4661     }
4662 #endif
4663 }
4664 
4665 
4666 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
4667 //             This test checks the initial patch,
4668 //             which is to just keep it from crashing.  Correct word boundaries
4669 //             await a proper fix to the dictionary code.
4670 //
TestBug5532(void)4671 void RBBITest::TestBug5532(void)  {
4672    // Text includes a mixture of Thai and Latin.
4673    const unsigned char utf8Data[] = {
4674            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4675            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4676            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4677            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4678            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4679            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4680            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4681            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4682            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4683            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4684            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4685 
4686     UErrorCode status = U_ZERO_ERROR;
4687     UText utext=UTEXT_INITIALIZER;
4688     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4689     TEST_ASSERT_SUCCESS(status);
4690 
4691     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4692     TEST_ASSERT_SUCCESS(status);
4693     if (U_SUCCESS(status)) {
4694         bi->setText(&utext, status);
4695         TEST_ASSERT_SUCCESS(status);
4696 
4697         int32_t breakCount = 0;
4698         int32_t previousBreak = -1;
4699         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4700             // For now, just make sure that the break iterator doesn't hang.
4701             TEST_ASSERT(previousBreak < bi->current());
4702             previousBreak = bi->current();
4703         }
4704         TEST_ASSERT(breakCount > 0);
4705     }
4706     delete bi;
4707     utext_close(&utext);
4708 }
4709 
4710 
4711 //
4712 //  TestDebug    -  A place-holder test for debugging purposes.
4713 //                  For putting in fragments of other tests that can be invoked
4714 //                  for tracing  without a lot of unwanted extra stuff happening.
4715 //
TestDebug(void)4716 void RBBITest::TestDebug(void) {
4717 #if 0
4718     UErrorCode   status = U_ZERO_ERROR;
4719     int pos = 0;
4720     int ruleStatus = 0;
4721 
4722     RuleBasedBreakIterator* bi =
4723        // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4724        // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4725        (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
4726     UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4727     // UnicodeString s("Aaa.  Bcd");
4728     s = s.unescape();
4729     bi->setText(s);
4730     UBool r = bi->isBoundary(8);
4731     printf("%s", r?"true":"false");
4732     return;
4733     pos = bi->last();
4734     do {
4735         // ruleStatus = bi->getRuleStatus();
4736         printf("%d\t%d\n", pos, ruleStatus);
4737         pos = bi->previous();
4738     } while (pos != BreakIterator::DONE);
4739 #endif
4740 }
4741 
4742 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
4743