1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 1999-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8 /************************************************************************
9 * Date Name Description
10 * 12/15/99 Madhu Creation.
11 * 01/12/2000 Madhu Updated for changed API and added new tests
12 ************************************************************************/
13
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_BREAK_ITERATION
16
17 #include <sstream>
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #include <utility>
22 #include <vector>
23
24 #include "unicode/brkiter.h"
25 #include "unicode/localpointer.h"
26 #include "unicode/numfmt.h"
27 #include "unicode/rbbi.h"
28 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
29 #include "unicode/regex.h"
30 #endif
31 #include "unicode/schriter.h"
32 #include "unicode/uchar.h"
33 #include "unicode/utf16.h"
34 #include "unicode/ucnv.h"
35 #include "unicode/uniset.h"
36 #include "unicode/uscript.h"
37 #include "unicode/ustring.h"
38 #include "unicode/utext.h"
39 #include "unicode/utrace.h"
40
41 #include "charstr.h"
42 #include "cmemory.h"
43 #include "cstr.h"
44 #include "intltest.h"
45 #include "rbbitst.h"
46 #include "rbbidata.h"
47 #include "utypeinfo.h" // for 'typeid' to work
48 #include "uvector.h"
49 #include "uvectr32.h"
50
51
52 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
53 #include "unicode/filteredbrk.h"
54 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
55
// Report a test failure, tagged with the current source file and line, when
// `condition` evaluates to false. The report is made through an unqualified
// errln(), so the macro can only be expanded where errln() is in scope.
#define TEST_ASSERT(condition) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(condition)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
} UPRV_BLOCK_MACRO_END
61
// Report a test failure, tagged with the current source file, line and ICU error
// name, when `errorCode` satisfies U_FAILURE(). The report is made through an
// unqualified errcheckln(), which is also handed the error code itself.
// Note that `errorCode` is expanded more than once.
#define TEST_ASSERT_SUCCESS(errorCode) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(errorCode)) { \
        errcheckln(errorCode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errorCode)); \
    } \
} UPRV_BLOCK_MACRO_END
67
// Report a monkey-test failure through the global IntlTest instance. The message
// carries the source location, `message`, the failing `position`, and the
// "type=... seed=... loop=1" parameters needed to reproduce the run.
// NOTE: this expands to a bare brace block (not UPRV_BLOCK_MACRO_BEGIN/END), so a
// trailing ';' at the call site forms a separate empty statement; keep uses inside
// braces when they appear in an if/else.
#define MONKEY_ERROR(message, ruleFileName, position, seedValue) { \
    IntlTest::gTest->errln("\n%s:%d %s at index %d. Parameters to reproduce: @\"type=%s seed=%u loop=1\"", \
                           __FILE__, __LINE__, message, position, ruleFileName, seedValue); \
}
72
73 //---------------------------------------------
74 // runIndexedTest
75 //---------------------------------------------
76
77
// Note: Before adding new tests to this file, check whether the desired test data can
//       simply be added to the file testdata/rbbitst.txt. In most cases it can;
//       that is much less work than writing a new test, the diagnostic output in the
//       event of failures is good, and the test data file is shared with ICU4J, so
//       eventually the test will run there as well, without additional effort.
83
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)84 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
85 {
86 if (exec) logln("TestSuite RuleBasedBreakIterator: ");
87 fTestParams = params;
88
89 TESTCASE_AUTO_BEGIN;
90 #if !UCONFIG_NO_FILE_IO
91 TESTCASE_AUTO(TestBug4153072);
92 #endif
93 #if !UCONFIG_NO_FILE_IO
94 TESTCASE_AUTO(TestUnicodeFiles);
95 #endif
96 TESTCASE_AUTO(TestGetAvailableLocales);
97 TESTCASE_AUTO(TestGetDisplayName);
98 #if !UCONFIG_NO_FILE_IO
99 TESTCASE_AUTO(TestEndBehaviour);
100 TESTCASE_AUTO(TestWordBreaks);
101 TESTCASE_AUTO(TestWordBoundary);
102 TESTCASE_AUTO(TestLineBreaks);
103 TESTCASE_AUTO(TestSentBreaks);
104 TESTCASE_AUTO(TestExtended);
105 #endif
106 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
107 TESTCASE_AUTO(TestMonkey);
108 #endif
109 #if !UCONFIG_NO_FILE_IO
110 TESTCASE_AUTO(TestBug3818);
111 #endif
112 TESTCASE_AUTO(TestDebug);
113 #if !UCONFIG_NO_FILE_IO
114 TESTCASE_AUTO(TestBug5775);
115 #endif
116 TESTCASE_AUTO(TestBug9983);
117 TESTCASE_AUTO(TestDictRules);
118 TESTCASE_AUTO(TestBug5532);
119 TESTCASE_AUTO(TestBug7547);
120 TESTCASE_AUTO(TestBug12797);
121 TESTCASE_AUTO(TestBug12918);
122 TESTCASE_AUTO(TestBug12932);
123 TESTCASE_AUTO(TestEmoji);
124 TESTCASE_AUTO(TestBug12519);
125 TESTCASE_AUTO(TestBug12677);
126 TESTCASE_AUTO(TestTableRedundancies);
127 TESTCASE_AUTO(TestBug13447);
128 TESTCASE_AUTO(TestReverse);
129 TESTCASE_AUTO(TestBug13692);
130 TESTCASE_AUTO(TestDebugRules);
131 TESTCASE_AUTO(Test8BitsTrieWith8BitStateTable);
132 TESTCASE_AUTO(Test8BitsTrieWith16BitStateTable);
133 TESTCASE_AUTO(Test16BitsTrieWith8BitStateTable);
134 TESTCASE_AUTO(Test16BitsTrieWith16BitStateTable);
135 TESTCASE_AUTO(TestTable_8_16_Bits);
136 TESTCASE_AUTO(TestBug13590);
137
138 #if U_ENABLE_TRACING
139 TESTCASE_AUTO(TestTraceCreateCharacter);
140 TESTCASE_AUTO(TestTraceCreateWord);
141 TESTCASE_AUTO(TestTraceCreateSentence);
142 TESTCASE_AUTO(TestTraceCreateTitle);
143 TESTCASE_AUTO(TestTraceCreateLine);
144 TESTCASE_AUTO(TestTraceCreateLineNormal);
145 TESTCASE_AUTO(TestTraceCreateLineLoose);
146 TESTCASE_AUTO(TestTraceCreateLineStrict);
147 TESTCASE_AUTO(TestTraceCreateBreakEngine);
148 #endif
149
150 TESTCASE_AUTO_END;
151 }
152
153
154 //--------------------------------------------------------------------------------------
155 //
156 // RBBITest constructor and destructor
157 //
158 //--------------------------------------------------------------------------------------
159
RBBITest()160 RBBITest::RBBITest() {
161 fTestParams = NULL;
162 }
163
164
~RBBITest()165 RBBITest::~RBBITest() {
166 }
167
168
printStringBreaks(UText * tstr,int expected[],int expectedCount)169 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
170 UErrorCode status = U_ZERO_ERROR;
171 char name[100];
172 printf("code alpha extend alphanum type word sent line name\n");
173 int nextExpectedIndex = 0;
174 utext_setNativeIndex(tstr, 0);
175 for (int j = 0; j < static_cast<int>(utext_nativeLength(tstr)); j=static_cast<int>(utext_getNativeIndex(tstr))) {
176 if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
177 printf("------------------------------------------------ %d\n", j);
178 ++nextExpectedIndex;
179 }
180
181 UChar32 c = utext_next32(tstr);
182 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
183 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
184 u_isUAlphabetic(c),
185 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
186 u_isalnum(c),
187 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
188 u_charType(c),
189 U_SHORT_PROPERTY_NAME),
190 u_getPropertyValueName(UCHAR_WORD_BREAK,
191 u_getIntPropertyValue(c,
192 UCHAR_WORD_BREAK),
193 U_SHORT_PROPERTY_NAME),
194 u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
195 u_getIntPropertyValue(c,
196 UCHAR_SENTENCE_BREAK),
197 U_SHORT_PROPERTY_NAME),
198 u_getPropertyValueName(UCHAR_LINE_BREAK,
199 u_getIntPropertyValue(c,
200 UCHAR_LINE_BREAK),
201 U_SHORT_PROPERTY_NAME),
202 name);
203 }
204 }
205
206
printStringBreaks(const UnicodeString & ustr,int expected[],int expectedCount)207 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
208 UErrorCode status = U_ZERO_ERROR;
209 UText *tstr = NULL;
210 tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
211 if (U_FAILURE(status)) {
212 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
213 return;
214 }
215 printStringBreaks(tstr, expected, expectedCount);
216 utext_close(tstr);
217 }
218
219
TestBug3818()220 void RBBITest::TestBug3818() {
221 UErrorCode status = U_ZERO_ERROR;
222
223 // Four Thai words...
224 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
225 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
226 UnicodeString thaiStr(thaiWordData);
227
228 BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
229 if (U_FAILURE(status) || bi == NULL) {
230 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
231 return;
232 }
233 bi->setText(thaiStr);
234
235 int32_t startOfSecondWord = bi->following(1);
236 if (startOfSecondWord != 4) {
237 errln("Fail at file %s, line %d expected start of word at 4, got %d",
238 __FILE__, __LINE__, startOfSecondWord);
239 }
240 startOfSecondWord = bi->following(0);
241 if (startOfSecondWord != 4) {
242 errln("Fail at file %s, line %d expected start of word at 4, got %d",
243 __FILE__, __LINE__, startOfSecondWord);
244 }
245 delete bi;
246 }
247
248
249 //---------------------------------------------
250 //
251 // other tests
252 //
253 //---------------------------------------------
254
TestGetAvailableLocales()255 void RBBITest::TestGetAvailableLocales()
256 {
257 int32_t locCount = 0;
258 const Locale* locList = BreakIterator::getAvailableLocales(locCount);
259
260 if (locCount == 0)
261 dataerrln("getAvailableLocales() returned an empty list!");
262 // Just make sure that it's returning good memory.
263 int32_t i;
264 for (i = 0; i < locCount; ++i) {
265 logln(locList[i].getName());
266 }
267 }
268
269 //Testing the BreakIterator::getDisplayName() function
TestGetDisplayName()270 void RBBITest::TestGetDisplayName()
271 {
272 UnicodeString result;
273
274 BreakIterator::getDisplayName(Locale::getUS(), result);
275 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
276 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
277 + result);
278
279 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
280 if (result != "French (France)")
281 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
282 + result);
283 }
284 /**
285 * Test End Behaviour
286 * @bug 4068137
287 */
TestEndBehaviour()288 void RBBITest::TestEndBehaviour()
289 {
290 UErrorCode status = U_ZERO_ERROR;
291 UnicodeString testString("boo.");
292 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
293 if (U_FAILURE(status))
294 {
295 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
296 return;
297 }
298 wb->setText(testString);
299
300 if (wb->first() != 0)
301 errln("Didn't get break at beginning of string.");
302 if (wb->next() != 3)
303 errln("Didn't get break before period in \"boo.\"");
304 if (wb->current() != 4 && wb->next() != 4)
305 errln("Didn't get break at end of string.");
306 delete wb;
307 }
308 /*
309 * @bug 4153072
310 */
TestBug4153072()311 void RBBITest::TestBug4153072() {
312 UErrorCode status = U_ZERO_ERROR;
313 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
314 if (U_FAILURE(status))
315 {
316 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
317 return;
318 }
319 UnicodeString str("...Hello, World!...");
320 int32_t begin = 3;
321 int32_t end = str.length() - 3;
322 UBool onBoundary;
323
324 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
325 iter->adoptText(textIterator);
326 int index;
327 // Note: with the switch to UText, there is no way to restrict the
328 // iteration range to begin at an index other than zero.
329 // String character iterators created with a non-zero bound are
330 // treated by RBBI as being empty.
331 for (index = -1; index < begin + 1; ++index) {
332 onBoundary = iter->isBoundary(index);
333 if (index == 0? !onBoundary : onBoundary) {
334 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
335 " and begin index = " + begin);
336 }
337 }
338 delete iter;
339 }
340
341
342 //
343 // Test for problem reported by Ashok Matoria on 9 July 2007
344 // One.<kSoftHyphen><kSpace>Two.
345 //
346 // Sentence break at start (0) and then on calling next() it breaks at
347 // 'T' of "Two". Now, at this point if I do next() and
348 // then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
349 //
TestBug5775()350 void RBBITest::TestBug5775() {
351 UErrorCode status = U_ZERO_ERROR;
352 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
353 TEST_ASSERT_SUCCESS(status);
354 if (U_FAILURE(status)) {
355 return;
356 }
357 // Check for status first for better handling of no data errors.
358 TEST_ASSERT(bi != NULL);
359 if (bi == NULL) {
360 return;
361 }
362
363 UnicodeString s("One.\\u00ad Two.", -1, US_INV);
364 // 01234 56789
365 s = s.unescape();
366 bi->setText(s);
367 int pos = bi->next();
368 TEST_ASSERT(pos == 6);
369 pos = bi->next();
370 TEST_ASSERT(pos == 10);
371 pos = bi->previous();
372 TEST_ASSERT(pos == 6);
373 delete bi;
374 }
375
376
377
378 //------------------------------------------------------------------------------
379 //
380 // RBBITest::Extended Run RBBI Tests from an external test data file
381 //
382 //------------------------------------------------------------------------------
383
384 struct TestParams {
385 BreakIterator *bi; // Break iterator is set while parsing test source.
386 // Changed out whenever test data changes break type.
387
388 UnicodeString dataToBreak; // Data that is built up while parsing the test.
389 UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString.
390 UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak.
391 UVector32 *srcCol;
392
393 UText *textToBreak; // UText, could be UTF8 or UTF16.
394 UVector32 *textMap; // Map from UTF-16 dataToBreak offsets to UText offsets.
395 CharString utf8String; // UTF-8 form of text to break.
396
TestParamsTestParams397 TestParams(UErrorCode &status) : dataToBreak() {
398 bi = NULL;
399 expectedBreaks = new UVector32(status);
400 srcLine = new UVector32(status);
401 srcCol = new UVector32(status);
402 textToBreak = NULL;
403 textMap = new UVector32(status);
404 }
405
~TestParamsTestParams406 ~TestParams() {
407 delete bi;
408 delete expectedBreaks;
409 delete srcLine;
410 delete srcCol;
411 utext_close(textToBreak);
412 delete textMap;
413 }
414
415 int32_t getSrcLine(int32_t bp);
416 int32_t getExpectedBreak(int32_t bp);
417 int32_t getSrcCol(int32_t bp);
418
419 void setUTF16(UErrorCode &status);
420 void setUTF8(UErrorCode &status);
421 };
422
423 // Append a UnicodeString to a CharString with UTF-8 encoding.
424 // Substitute any invalid chars.
425 // Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
CharStringAppend(CharString & dest,const UnicodeString & src,UErrorCode & status)426 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
427 if (U_FAILURE(status)) {
428 return;
429 }
430 int32_t utf8Length;
431 u_strToUTF8WithSub(NULL, 0, &utf8Length, // Output Buffer, NULL for preflight.
432 src.getBuffer(), src.length(), // UTF-16 data
433 0xfffd, NULL, // Substitution char, number of subs.
434 &status);
435 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
436 return;
437 }
438 status = U_ZERO_ERROR;
439 int32_t capacity;
440 char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
441 u_strToUTF8WithSub(buffer, utf8Length, NULL,
442 src.getBuffer(), src.length(),
443 0xfffd, NULL, &status);
444 dest.append(buffer, utf8Length, status);
445 }
446
447
setUTF16(UErrorCode & status)448 void TestParams::setUTF16(UErrorCode &status) {
449 textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
450 textMap->removeAllElements();
451 for (int32_t i=0; i<dataToBreak.length(); i++) {
452 if (i == dataToBreak.getChar32Start(i)) {
453 textMap->addElement(i, status);
454 } else {
455 textMap->addElement(-1, status);
456 }
457 }
458 textMap->addElement(dataToBreak.length(), status);
459 U_ASSERT(dataToBreak.length() + 1 == textMap->size());
460 }
461
462
setUTF8(UErrorCode & status)463 void TestParams::setUTF8(UErrorCode &status) {
464 if (U_FAILURE(status)) {
465 return;
466 }
467 utf8String.clear();
468 CharStringAppend(utf8String, dataToBreak, status);
469 textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
470 if (U_FAILURE(status)) {
471 return;
472 }
473
474 textMap->removeAllElements();
475 int32_t utf16Index = 0;
476 for (;;) {
477 textMap->addElement(utf16Index, status);
478 UChar32 c32 = utext_current32(textToBreak);
479 if (c32 < 0) {
480 break;
481 }
482 utf16Index += U16_LENGTH(c32);
483 utext_next32(textToBreak);
484 while (textMap->size() < utext_getNativeIndex(textToBreak)) {
485 textMap->addElement(-1, status);
486 }
487 }
488 U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
489 }
490
491
getSrcLine(int32_t bp)492 int32_t TestParams::getSrcLine(int32_t bp) {
493 if (bp >= textMap->size()) {
494 bp = textMap->size() - 1;
495 }
496 int32_t i = 0;
497 for(; bp >= 0 ; --bp) {
498 // Move to a character boundary if we are not on one already.
499 i = textMap->elementAti(bp);
500 if (i >= 0) {
501 break;
502 }
503 }
504 return srcLine->elementAti(i);
505 }
506
507
getExpectedBreak(int32_t bp)508 int32_t TestParams::getExpectedBreak(int32_t bp) {
509 if (bp >= textMap->size()) {
510 return 0;
511 }
512 int32_t i = textMap->elementAti(bp);
513 int32_t retVal = 0;
514 if (i >= 0) {
515 retVal = expectedBreaks->elementAti(i);
516 }
517 return retVal;
518 }
519
520
getSrcCol(int32_t bp)521 int32_t TestParams::getSrcCol(int32_t bp) {
522 if (bp >= textMap->size()) {
523 bp = textMap->size() - 1;
524 }
525 int32_t i = 0;
526 for(; bp >= 0; --bp) {
527 // Move bp to a character boundary if we are not on one already.
528 i = textMap->elementAti(bp);
529 if (i >= 0) {
530 break;
531 }
532 }
533 return srcCol->elementAti(i);
534 }
535
536
executeTest(TestParams * t,UErrorCode & status)537 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
538 int32_t bp;
539 int32_t prevBP;
540 int32_t i;
541
542 TEST_ASSERT_SUCCESS(status);
543 if (U_FAILURE(status)) {
544 return;
545 }
546
547 if (t->bi == NULL) {
548 return;
549 }
550
551 t->bi->setText(t->textToBreak, status);
552 //
553 // Run the iterator forward
554 //
555 prevBP = -1;
556 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
557 if (prevBP == bp) {
558 // Fail for lack of forward progress.
559 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
560 bp, t->getSrcLine(bp), t->getSrcCol(bp));
561 break;
562 }
563
564 // Check that there we didn't miss an expected break between the last one
565 // and this one.
566 for (i=prevBP+1; i<bp; i++) {
567 if (t->getExpectedBreak(i) != 0) {
568 int expected[] = {0, i};
569 printStringBreaks(t->dataToBreak, expected, 2);
570 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
571 i, t->getSrcLine(i), t->getSrcCol(i));
572 }
573 }
574
575 // Check that the break we did find was expected
576 if (t->getExpectedBreak(bp) == 0) {
577 int expected[] = {0, bp};
578 printStringBreaks(t->textToBreak, expected, 2);
579 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
580 bp, t->getSrcLine(bp), t->getSrcCol(bp));
581 } else {
582 // The break was expected.
583 // Check that the {nnn} tag value is correct.
584 int32_t expectedTagVal = t->getExpectedBreak(bp);
585 if (expectedTagVal == -1) {
586 expectedTagVal = 0;
587 }
588 int32_t line = t->getSrcLine(bp);
589 int32_t rs = t->bi->getRuleStatus();
590 if (rs != expectedTagVal) {
591 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
592 " Actual, Expected status = %4d, %4d",
593 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
594 }
595 }
596
597 prevBP = bp;
598 }
599
600 // Verify that there were no missed expected breaks after the last one found
601 for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
602 if (t->getExpectedBreak(i) != 0) {
603 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
604 i, t->getSrcLine(i), t->getSrcCol(i));
605 }
606 }
607
608 //
609 // Run the iterator backwards, verify that the same breaks are found.
610 //
611 prevBP = static_cast<int32_t>(utext_nativeLength(t->textToBreak) + 2); // start with a phony value for the last break pos seen.
612 bp = t->bi->last();
613 while (bp != BreakIterator::DONE) {
614 if (prevBP == bp) {
615 // Fail for lack of progress.
616 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
617 bp, t->getSrcLine(bp), t->getSrcCol(bp));
618 break;
619 }
620
621 // Check that we didn't miss an expected break between the last one
622 // and this one. (UVector returns zeros for index out of bounds.)
623 for (i=prevBP-1; i>bp; i--) {
624 if (t->getExpectedBreak(i) != 0) {
625 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
626 i, t->getSrcLine(i), t->getSrcCol(i));
627 }
628 }
629
630 // Check that the break we did find was expected
631 if (t->getExpectedBreak(bp) == 0) {
632 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
633 bp, t->getSrcLine(bp), t->getSrcCol(bp));
634 } else {
635 // The break was expected.
636 // Check that the {nnn} tag value is correct.
637 int32_t expectedTagVal = t->getExpectedBreak(bp);
638 if (expectedTagVal == -1) {
639 expectedTagVal = 0;
640 }
641 int line = t->getSrcLine(bp);
642 int32_t rs = t->bi->getRuleStatus();
643 if (rs != expectedTagVal) {
644 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
645 " Actual, Expected status = %4d, %4d",
646 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
647 }
648 }
649
650 prevBP = bp;
651 bp = t->bi->previous();
652 }
653
654 // Verify that there were no missed breaks prior to the last one found
655 for (i=prevBP-1; i>=0; i--) {
656 if (t->getExpectedBreak(i) != 0) {
657 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
658 i, t->getSrcLine(i), t->getSrcCol(i));
659 }
660 }
661
662 // Check isBoundary()
663 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
664 UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
665 UBool boundaryFound = t->bi->isBoundary(i);
666 if (boundaryExpected != boundaryFound) {
667 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
668 " Expected, Actual= %s, %s",
669 i, t->getSrcLine(i), t->getSrcCol(i),
670 boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
671 }
672 }
673
674 // Check following()
675 for (i=0; i < static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i++) {
676 int32_t actualBreak = t->bi->following(i);
677 int32_t expectedBreak = BreakIterator::DONE;
678 for (int32_t j=i+1; j <= static_cast<int32_t>(utext_nativeLength(t->textToBreak)); j++) {
679 if (t->getExpectedBreak(j) != 0) {
680 expectedBreak = j;
681 break;
682 }
683 }
684 if (expectedBreak != actualBreak) {
685 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
686 " Expected, Actual= %d, %d",
687 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
688 }
689 }
690
691 // Check preceding()
692 for (i=static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i>=0; i--) {
693 int32_t actualBreak = t->bi->preceding(i);
694 int32_t expectedBreak = BreakIterator::DONE;
695
696 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
697 // preceding(trailing byte) will return the index of some preceding code point,
698 // not the lead byte of the current code point, even though that has a smaller index.
699 // Therefore, start looking at the expected break data not at i-1, but at
700 // the start of code point index - 1.
701 utext_setNativeIndex(t->textToBreak, i);
702 int32_t j = static_cast<int32_t>(utext_getNativeIndex(t->textToBreak) - 1);
703 for (; j >= 0; j--) {
704 if (t->getExpectedBreak(j) != 0) {
705 expectedBreak = j;
706 break;
707 }
708 }
709 if (expectedBreak != actualBreak) {
710 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
711 " Expected, Actual= %d, %d",
712 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
713 }
714 }
715 }
716
717
TestExtended()718 void RBBITest::TestExtended() {
719 // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
720 // data driven test closely entangles filtered and regular data.
721 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
722 UErrorCode status = U_ZERO_ERROR;
723 Locale locale("");
724
725 TestParams tp(status);
726
727 RegexMatcher localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
728 if (U_FAILURE(status)) {
729 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
730 }
731
732 //
733 // Open and read the test data file.
734 //
735 const char *testDataDirectory = IntlTest::getSourceTestData(status);
736 CharString testFileName(testDataDirectory, -1, status);
737 testFileName.append("rbbitst.txt", -1, status);
738
739 int len;
740 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
741 if (U_FAILURE(status)) {
742 errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
743 return;
744 }
745
746 bool skipTest = false; // Skip this test?
747
748 //
749 // Put the test data into a UnicodeString
750 //
751 UnicodeString testString(FALSE, testFile, len);
752
753 enum EParseState{
754 PARSE_COMMENT,
755 PARSE_TAG,
756 PARSE_DATA,
757 PARSE_NUM,
758 PARSE_RULES
759 }
760 parseState = PARSE_TAG;
761
762 EParseState savedState = PARSE_TAG;
763
764 int32_t lineNum = 1;
765 int32_t colStart = 0;
766 int32_t column = 0;
767 int32_t charIdx = 0;
768
769 int32_t tagValue = 0; // The numeric value of a <nnn> tag.
770
771 UnicodeString rules; // Holds rules from a <rules> ... </rules> block
772 int32_t rulesFirstLine = 0; // Line number of the start of current <rules> block
773
774 for (charIdx = 0; charIdx < len; ) {
775 status = U_ZERO_ERROR;
776 UChar c = testString.charAt(charIdx);
777 charIdx++;
778 if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
779 // treat CRLF as a unit
780 c = u'\n';
781 charIdx++;
782 }
783 if (c == u'\n' || c == u'\r') {
784 lineNum++;
785 colStart = charIdx;
786 }
787 column = charIdx - colStart + 1;
788
789 switch (parseState) {
790 case PARSE_COMMENT:
791 if (c == u'\n' || c == u'\r') {
792 parseState = savedState;
793 }
794 break;
795
796 case PARSE_TAG:
797 {
798 if (c == u'#') {
799 parseState = PARSE_COMMENT;
800 savedState = PARSE_TAG;
801 break;
802 }
803 if (u_isUWhiteSpace(c)) {
804 break;
805 }
806 if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
807 delete tp.bi;
808 tp.bi = BreakIterator::createWordInstance(locale, status);
809 skipTest = false;
810 charIdx += 5;
811 break;
812 }
813 if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
814 delete tp.bi;
815 tp.bi = BreakIterator::createCharacterInstance(locale, status);
816 skipTest = false;
817 charIdx += 5;
818 break;
819 }
820 if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
821 delete tp.bi;
822 tp.bi = BreakIterator::createLineInstance(locale, status);
823 skipTest = false;
824 charIdx += 5;
825 break;
826 }
827 if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
828 delete tp.bi;
829 tp.bi = BreakIterator::createSentenceInstance(locale, status);
830 skipTest = false;
831 charIdx += 5;
832 break;
833 }
834 if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
835 delete tp.bi;
836 tp.bi = BreakIterator::createTitleInstance(locale, status);
837 charIdx += 6;
838 break;
839 }
840
841 if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
842 testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
843 charIdx = testString.indexOf(u'>', charIdx) + 1;
844 parseState = PARSE_RULES;
845 rules.remove();
846 rulesFirstLine = lineNum;
847 break;
848 }
849
850 // <locale loc_name>
851 localeMatcher.reset(testString);
852 if (localeMatcher.lookingAt(charIdx-1, status)) {
853 UnicodeString localeName = localeMatcher.group(1, status);
854 char localeName8[100];
855 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
856 locale = Locale::createFromName(localeName8);
857 charIdx += localeMatcher.group(0, status).length() - 1;
858 TEST_ASSERT_SUCCESS(status);
859 break;
860 }
861 if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
862 parseState = PARSE_DATA;
863 charIdx += 5;
864 tp.dataToBreak = "";
865 tp.expectedBreaks->removeAllElements();
866 tp.srcCol ->removeAllElements();
867 tp.srcLine->removeAllElements();
868 break;
869 }
870
871 errln("line %d: Tag expected in test file.", lineNum);
872 parseState = PARSE_COMMENT;
873 savedState = PARSE_DATA;
874 goto end_test; // Stop the test.
875 }
876 break;
877
878 case PARSE_RULES:
879 if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
880 charIdx += 7;
881 parseState = PARSE_TAG;
882 delete tp.bi;
883 UParseError pe;
884 tp.bi = new RuleBasedBreakIterator(rules, pe, status);
885 skipTest = U_FAILURE(status);
886 if (U_FAILURE(status)) {
887 errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
888 rulesFirstLine + pe.line - 1, u_errorName(status));
889 }
890 } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
891 charIdx += 10;
892 parseState = PARSE_TAG;
893 UErrorCode ec = U_ZERO_ERROR;
894 UParseError pe;
895 RuleBasedBreakIterator bi(rules, pe, ec);
896 if (U_SUCCESS(ec)) {
897 errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
898 rulesFirstLine + pe.line - 1);
899 }
900 } else {
901 rules.append(c);
902 }
903 break;
904
905 case PARSE_DATA:
906 if (c == u'•') {
907 int32_t breakIdx = tp.dataToBreak.length();
908 if (tp.expectedBreaks->size() > breakIdx) {
909 errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
910 lineNum, column);
911 }
912 tp.expectedBreaks->setSize(breakIdx+1);
913 tp.expectedBreaks->setElementAt(-1, breakIdx);
914 tp.srcLine->setSize(breakIdx+1);
915 tp.srcLine->setElementAt(lineNum, breakIdx);
916 tp.srcCol ->setSize(breakIdx+1);
917 tp.srcCol ->setElementAt(column, breakIdx);
918 break;
919 }
920
921 if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
922 // Add final entry to mappings from break location to source file position.
923 // Need one extra because last break position returned is after the
924 // last char in the data, not at the last char.
925 tp.srcLine->addElement(lineNum, status);
926 tp.srcCol ->addElement(column, status);
927
928 parseState = PARSE_TAG;
929 charIdx += 6;
930
931 if (!skipTest) {
932 // RUN THE TEST!
933 status = U_ZERO_ERROR;
934 tp.setUTF16(status);
935 executeTest(&tp, status);
936 TEST_ASSERT_SUCCESS(status);
937
938 // Run again, this time with UTF-8 text wrapped in a UText.
939 status = U_ZERO_ERROR;
940 tp.setUTF8(status);
941 TEST_ASSERT_SUCCESS(status);
942 executeTest(&tp, status);
943 }
944 break;
945 }
946
947 if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
948 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
949 // Get the code point from the name and insert it into the test data.
950 // (Damn, no API takes names in Unicode !!!
951 // we've got to take it back to char *)
952 int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
953 int32_t nameLength = nameEndIdx - (charIdx+2);
954 char charNameBuf[200];
955 UChar32 theChar = -1;
956 if (nameEndIdx != -1) {
957 UErrorCode status = U_ZERO_ERROR;
958 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
959 charNameBuf[sizeof(charNameBuf)-1] = 0;
960 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
961 if (U_FAILURE(status)) {
962 theChar = -1;
963 }
964 }
965 if (theChar == -1) {
966 errln("Error in named character in test file at line %d, col %d",
967 lineNum, column);
968 } else {
969 // Named code point was recognized. Insert it
970 // into the test data.
971 tp.dataToBreak.append(theChar);
972 while (tp.dataToBreak.length() > tp.srcLine->size()) {
973 tp.srcLine->addElement(lineNum, status);
974 tp.srcCol ->addElement(column, status);
975 }
976 }
977 if (nameEndIdx > charIdx) {
978 charIdx = nameEndIdx+1;
979
980 }
981 break;
982 }
983
984
985
986 if (testString.compare(charIdx-1, 2, u"<>") == 0) {
987 charIdx++;
988 int32_t breakIdx = tp.dataToBreak.length();
989 tp.expectedBreaks->setSize(breakIdx+1);
990 tp.expectedBreaks->setElementAt(-1, breakIdx);
991 tp.srcLine->setSize(breakIdx+1);
992 tp.srcLine->setElementAt(lineNum, breakIdx);
993 tp.srcCol ->setSize(breakIdx+1);
994 tp.srcCol ->setElementAt(column, breakIdx);
995 break;
996 }
997
998 if (c == u'<') {
999 tagValue = 0;
1000 parseState = PARSE_NUM;
1001 break;
1002 }
1003
1004 if (c == u'#' && column==3) { // TODO: why is column off so far?
1005 parseState = PARSE_COMMENT;
1006 savedState = PARSE_DATA;
1007 break;
1008 }
1009
1010 if (c == u'\\') {
1011 // Check for \ at end of line, a line continuation.
1012 // Advance over (discard) the newline
1013 UChar32 cp = testString.char32At(charIdx);
1014 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
1015 // We have a CR LF
1016 // Need an extra increment of the input ptr to move over both of them
1017 charIdx++;
1018 }
1019 if (cp == u'\n' || cp == u'\r') {
1020 lineNum++;
1021 colStart = charIdx;
1022 charIdx++;
1023 break;
1024 }
1025
1026 // Let unescape handle the back slash.
1027 cp = testString.unescapeAt(charIdx);
1028 if (cp != -1) {
1029 // Escape sequence was recognized. Insert the char
1030 // into the test data.
1031 tp.dataToBreak.append(cp);
1032 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1033 tp.srcLine->addElement(lineNum, status);
1034 tp.srcCol ->addElement(column, status);
1035 }
1036 break;
1037 }
1038
1039
1040 // Not a recognized backslash escape sequence.
1041 // Take the next char as a literal.
1042 // TODO: Should this be an error?
1043 c = testString.charAt(charIdx);
1044 charIdx = testString.moveIndex32(charIdx, 1);
1045 }
1046
1047 // Normal, non-escaped data char.
1048 tp.dataToBreak.append(c);
1049
1050 // Save the mapping from offset in the data to line/column numbers in
1051 // the original input file. Will be used for better error messages only.
1052 // If there's an expected break before this char, the slot in the mapping
1053 // vector will already be set for this char; don't overwrite it.
1054 if (tp.dataToBreak.length() > tp.srcLine->size()) {
1055 tp.srcLine->addElement(lineNum, status);
1056 tp.srcCol ->addElement(column, status);
1057 }
1058 break;
1059
1060
1061 case PARSE_NUM:
1062 // We are parsing an expected numeric tag value, like <1234>,
1063 // within a chunk of data.
1064 if (u_isUWhiteSpace(c)) {
1065 break;
1066 }
1067
1068 if (c == u'>') {
1069 // Finished the number. Add the info to the expected break data,
1070 // and switch parse state back to doing plain data.
1071 parseState = PARSE_DATA;
1072 if (tagValue == 0) {
1073 tagValue = -1;
1074 }
1075 int32_t breakIdx = tp.dataToBreak.length();
1076 if (tp.expectedBreaks->size() > breakIdx) {
1077 errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
1078 lineNum, column);
1079 }
1080 tp.expectedBreaks->setSize(breakIdx+1);
1081 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1082 tp.srcLine->setSize(breakIdx+1);
1083 tp.srcLine->setElementAt(lineNum, breakIdx);
1084 tp.srcCol ->setSize(breakIdx+1);
1085 tp.srcCol ->setElementAt(column, breakIdx);
1086 break;
1087 }
1088
1089 if (u_isdigit(c)) {
1090 tagValue = tagValue*10 + u_charDigitValue(c);
1091 break;
1092 }
1093
1094 errln("Syntax Error in test file at line %d, col %d",
1095 lineNum, column);
1096 parseState = PARSE_COMMENT;
1097 goto end_test; // Stop the test
1098 break;
1099 }
1100
1101
1102 if (U_FAILURE(status)) {
1103 dataerrln("ICU Error %s while parsing test file at line %d.",
1104 u_errorName(status), lineNum);
1105 status = U_ZERO_ERROR;
1106 goto end_test; // Stop the test
1107 }
1108
1109 }
1110
1111 // Reached end of test file. Raise an error if parseState indicates that we are
1112 // within a block that should have been terminated.
1113
1114 if (parseState == PARSE_RULES) {
1115 errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
1116 lineNum, rulesFirstLine);
1117 }
1118 if (parseState == PARSE_DATA) {
1119 errln("rbbitst.txt:%d <data> block not closed.", lineNum);
1120 }
1121
1122
1123 end_test:
1124 delete [] testFile;
1125 #endif
1126 }
1127
1128
1129 //-------------------------------------------------------------------------------
1130 //
1131 // TestDictRules create a break iterator from source rules that includes a
1132 // dictionary range. Regression for bug #7130. Source rules
1133 // do not declare a break iterator type (word, line, sentence, etc.
1134 // but the dictionary code, without a type, would loop.
1135 //
1136 //-------------------------------------------------------------------------------
TestDictRules()1137 void RBBITest::TestDictRules() {
1138 const char *rules = "$dictionary = [a-z]; \n"
1139 "!!forward; \n"
1140 "$dictionary $dictionary; \n"
1141 "!!reverse; \n"
1142 "$dictionary $dictionary; \n";
1143 const char *text = "aa";
1144 UErrorCode status = U_ZERO_ERROR;
1145 UParseError parseError;
1146
1147 RuleBasedBreakIterator bi(rules, parseError, status);
1148 if (U_SUCCESS(status)) {
1149 UnicodeString utext = text;
1150 bi.setText(utext);
1151 int32_t position;
1152 int32_t loops;
1153 for (loops = 0; loops<10; loops++) {
1154 position = bi.next();
1155 if (position == RuleBasedBreakIterator::DONE) {
1156 break;
1157 }
1158 }
1159 TEST_ASSERT(loops == 1);
1160 } else {
1161 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1162 }
1163 }
1164
1165
1166
1167 //-------------------------------------------------------------------------------
1168 //
1169 // ReadAndConvertFile Read a text data file, convert it to UChars, and
1170 // return the data in one big UChar * buffer, which the caller must delete.
1171 //
1172 // parameters:
1173 // fileName: the name of the file, with no directory part. The test data directory
1174 // is assumed.
1175 // ulen an out parameter, receives the actual length (in UChars) of the file data.
1176 // encoding The file encoding. If the file contains a BOM, that will override the encoding
1177 // specified here. The BOM, if it exists, will be stripped from the returned data.
1178 // Pass NULL for the system default encoding.
1179 // status
1180 // returns:
1181 // The file data, converted to UChar.
1182 // The caller must delete this when done with
1183 // delete [] theBuffer;
1184 //
1185 // TODO: This is a clone of RegexTest::ReadAndConvertFile.
1186 // Move this function to some common place.
1187 //
1188 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int & ulen,const char * encoding,UErrorCode & status)1189 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1190 UChar *retPtr = NULL;
1191 char *fileBuf = NULL;
1192 UConverter* conv = NULL;
1193 FILE *f = NULL;
1194
1195 ulen = 0;
1196 if (U_FAILURE(status)) {
1197 return retPtr;
1198 }
1199
1200 //
1201 // Open the file.
1202 //
1203 f = fopen(fileName, "rb");
1204 if (f == 0) {
1205 dataerrln("Error opening test data file %s\n", fileName);
1206 status = U_FILE_ACCESS_ERROR;
1207 return NULL;
1208 }
1209 //
1210 // Read it in
1211 //
1212 int fileSize;
1213 int amt_read;
1214
1215 fseek( f, 0, SEEK_END);
1216 fileSize = ftell(f);
1217 fileBuf = new char[fileSize];
1218 fseek(f, 0, SEEK_SET);
1219 amt_read = static_cast<int>(fread(fileBuf, 1, fileSize, f));
1220 if (amt_read != fileSize || fileSize <= 0) {
1221 errln("Error reading test data file.");
1222 goto cleanUpAndReturn;
1223 }
1224
1225 //
1226 // Look for a Unicode Signature (BOM) on the data just read
1227 //
1228 int32_t signatureLength;
1229 const char * fileBufC;
1230 const char* bomEncoding;
1231
1232 fileBufC = fileBuf;
1233 bomEncoding = ucnv_detectUnicodeSignature(
1234 fileBuf, fileSize, &signatureLength, &status);
1235 if(bomEncoding!=NULL ){
1236 fileBufC += signatureLength;
1237 fileSize -= signatureLength;
1238 encoding = bomEncoding;
1239 }
1240
1241 //
1242 // Open a converter to take the rule file to UTF-16
1243 //
1244 conv = ucnv_open(encoding, &status);
1245 if (U_FAILURE(status)) {
1246 goto cleanUpAndReturn;
1247 }
1248
1249 //
1250 // Convert the rules to UChar.
1251 // Preflight first to determine required buffer size.
1252 //
1253 ulen = ucnv_toUChars(conv,
1254 NULL, // dest,
1255 0, // destCapacity,
1256 fileBufC,
1257 fileSize,
1258 &status);
1259 if (status == U_BUFFER_OVERFLOW_ERROR) {
1260 // Buffer Overflow is expected from the preflight operation.
1261 status = U_ZERO_ERROR;
1262
1263 retPtr = new UChar[ulen+1];
1264 ucnv_toUChars(conv,
1265 retPtr, // dest,
1266 ulen+1,
1267 fileBufC,
1268 fileSize,
1269 &status);
1270 }
1271
1272 cleanUpAndReturn:
1273 fclose(f);
1274 delete []fileBuf;
1275 ucnv_close(conv);
1276 if (U_FAILURE(status)) {
1277 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1278 delete []retPtr;
1279 retPtr = 0;
1280 ulen = 0;
1281 }
1282 return retPtr;
1283 }
1284
1285
1286
1287 //--------------------------------------------------------------------------------------------
1288 //
1289 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
1290 //
1291 //-------------------------------------------------------------------------------------------
TestUnicodeFiles()1292 void RBBITest::TestUnicodeFiles() {
1293 RuleBasedBreakIterator *bi;
1294 UErrorCode status = U_ZERO_ERROR;
1295
1296 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1297 TEST_ASSERT_SUCCESS(status);
1298 if (U_SUCCESS(status)) {
1299 runUnicodeTestData("GraphemeBreakTest.txt", bi);
1300 }
1301 delete bi;
1302
1303 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1304 TEST_ASSERT_SUCCESS(status);
1305 if (U_SUCCESS(status)) {
1306 runUnicodeTestData("WordBreakTest.txt", bi);
1307 }
1308 delete bi;
1309
1310 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1311 TEST_ASSERT_SUCCESS(status);
1312 if (U_SUCCESS(status)) {
1313 runUnicodeTestData("SentenceBreakTest.txt", bi);
1314 }
1315 delete bi;
1316
1317 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1318 TEST_ASSERT_SUCCESS(status);
1319 if (U_SUCCESS(status)) {
1320 runUnicodeTestData("LineBreakTest.txt", bi);
1321 }
1322 delete bi;
1323 }
1324
1325
1326 // Check for test cases from the Unicode test data files that are known to fail
1327 // and should be skipped as known issues because ICU does not fully implement
1328 // the Unicode specifications, or because ICU includes tailorings that differ from
1329 // the Unicode standard.
1330 //
1331 // Test cases are identified by the test data sequence, which tends to be more stable
1332 // across Unicode versions than the test file line numbers.
1333 //
1334 // The test case with ticket "10666" is a dummy, included as an example.
1335
testCaseIsKnownIssue(const UnicodeString & testCase,const char * fileName)1336 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1337 static struct TestCase {
1338 const char *fTicketNum;
1339 const char *fFileName;
1340 const UChar *fString;
1341 } badTestCases[] = {
1342 {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"}, // Fake example, for illustration.
1343 // The following tests were originally for
1344 // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
1345 // However, that ticket has been closed as fixed but these tests still fail, so
1346 // ICU-21097 has been created to investigate and address these remaining issues.
1347 {"21097", "LineBreakTest.txt", u"-#"},
1348 {"21097", "LineBreakTest.txt", u"\u002d\u0308\u0023"},
1349 {"21097", "LineBreakTest.txt", u"\u002d\u00a7"},
1350 {"21097", "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
1351 {"21097", "LineBreakTest.txt", u"\u002d\U00050005"},
1352 {"21097", "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
1353 {"21097", "LineBreakTest.txt", u"\u002d\u0e01"},
1354 {"21097", "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
1355
1356 // The following tests were originally for
1357 // Issue ICU-12017 Improve line break around numbers.
1358 // However, that ticket has been closed as fixed but these tests still fail, so
1359 // ICU-21097 has been created to investigate and address these remaining issues.
1360 {"21097", "LineBreakTest.txt", u"\u002C\u0030"}, // ",0"
1361 {"21097", "LineBreakTest.txt", u"\u002C\u0308\u0030"},
1362 {"21097", "LineBreakTest.txt", u"equals .35 cents"},
1363 {"21097", "LineBreakTest.txt", u"a.2 "},
1364 {"21097", "LineBreakTest.txt", u"a.2 \u0915"},
1365 {"21097", "LineBreakTest.txt", u"a.2 \u672C"},
1366 {"21097", "LineBreakTest.txt", u"a.2\u3000\u672C"},
1367 {"21097", "LineBreakTest.txt", u"a.2\u3000\u307E"},
1368 {"21097", "LineBreakTest.txt", u"a.2\u3000\u0033"},
1369 {"21097", "LineBreakTest.txt", u"A.1 \uBABB"},
1370 {"21097", "LineBreakTest.txt", u"\uBD24\uC5B4\u002E\u0020\u0041\u002E\u0032\u0020\uBCFC"},
1371 {"21097", "LineBreakTest.txt", u"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"},
1372 {"21097", "LineBreakTest.txt", u"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"},
1373 {"21097", "LineBreakTest.txt", u"a.2\u3000\u300C"},
1374 };
1375
1376 for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
1377 const TestCase &badCase = badTestCases[n];
1378 if (!strcmp(fileName, badCase.fFileName) &&
1379 testCase == UnicodeString(badCase.fString)) {
1380 return logKnownIssue(badCase.fTicketNum);
1381 }
1382 }
1383 return FALSE;
1384 }
1385
1386
1387 //--------------------------------------------------------------------------------------------
1388 //
1389 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
1390 //
1391 //-------------------------------------------------------------------------------------------
runUnicodeTestData(const char * fileName,RuleBasedBreakIterator * bi)1392 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1393 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1394 UErrorCode status = U_ZERO_ERROR;
1395
1396 //
1397 // Open and read the test data file, put it into a UnicodeString.
1398 //
1399 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1400 char testFileName[1000];
1401 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1402 dataerrln("Can't open test data. Path too long.");
1403 return;
1404 }
1405 strcpy(testFileName, testDataDirectory);
1406 strcat(testFileName, fileName);
1407
1408 logln("Opening data file %s\n", fileName);
1409
1410 int len;
1411 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1412 if (status != U_FILE_ACCESS_ERROR) {
1413 TEST_ASSERT_SUCCESS(status);
1414 TEST_ASSERT(testFile != NULL);
1415 }
1416 if (U_FAILURE(status) || testFile == NULL) {
1417 return; /* something went wrong, error already output */
1418 }
1419 UnicodeString testFileAsString(TRUE, testFile, len);
1420
1421 //
1422 // Parse the test data file using a regular expression.
1423 // Each kind of token is recognized in its own capture group; what type of item was scanned
1424 // is identified by which group had a match.
1425 //
1426 // Caputure Group # 1 2 3 4 5
1427 // Parses this item: divide x hex digits comment \n unrecognized \n
1428 //
1429 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1430 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1431 UnicodeString testString;
1432 UVector32 breakPositions(status);
1433 int lineNumber = 1;
1434 TEST_ASSERT_SUCCESS(status);
1435 if (U_FAILURE(status)) {
1436 return;
1437 }
1438
1439 //
1440 // Scan through each test case, building up the string to be broken in testString,
1441 // and the positions that should be boundaries in the breakPositions vector.
1442 //
1443 int spin = 0;
1444 while (tokenMatcher.find()) {
1445 if(tokenMatcher.hitEnd()) {
1446 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.
1447 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1448 and caused an infinite loop here on EBCDIC systems!
1449 */
1450 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1451 // return;
1452 }
1453 if (tokenMatcher.start(1, status) >= 0) {
1454 // Scanned a divide sign, indicating a break position in the test data.
1455 if (testString.length()>0) {
1456 breakPositions.addElement(testString.length(), status);
1457 }
1458 }
1459 else if (tokenMatcher.start(2, status) >= 0) {
1460 // Scanned an 'x', meaning no break at this position in the test data
1461 // Nothing to be done here.
1462 }
1463 else if (tokenMatcher.start(3, status) >= 0) {
1464 // Scanned Hex digits. Convert them to binary, append to the character data string.
1465 const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1466 int length = hexNumber.length();
1467 if (length<=8) {
1468 char buf[10];
1469 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1470 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1471 if (c<=0x10ffff) {
1472 testString.append(c);
1473 } else {
1474 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1475 fileName, lineNumber);
1476 }
1477 } else {
1478 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1479 fileName, lineNumber);
1480 }
1481 }
1482 else if (tokenMatcher.start(4, status) >= 0) {
1483 // Scanned to end of a line, possibly skipping over a comment in the process.
1484 // If the line from the file contained test data, run the test now.
1485 if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1486 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1487 }
1488
1489 // Clear out this test case.
1490 // The string and breakPositions vector will be refilled as the next
1491 // test case is parsed.
1492 testString.remove();
1493 breakPositions.removeAllElements();
1494 lineNumber++;
1495 } else {
1496 // Scanner catchall. Something unrecognized appeared on the line.
1497 char token[16];
1498 UnicodeString uToken = tokenMatcher.group(0, status);
1499 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1500 token[sizeof(token)-1] = 0;
1501 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token);
1502
1503 // Clean up, in preparation for continuing with the next line.
1504 testString.remove();
1505 breakPositions.removeAllElements();
1506 lineNumber++;
1507 }
1508 TEST_ASSERT_SUCCESS(status);
1509 if (U_FAILURE(status)) {
1510 break;
1511 }
1512 }
1513
1514 delete [] testFile;
1515 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1516 }
1517
1518 //--------------------------------------------------------------------------------------------
1519 //
1520 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1521 // test data files. Do only a simple, forward-only check -
1522 // this test is mostly to check that ICU and the Unicode
1523 // data agree with each other.
1524 //
1525 //--------------------------------------------------------------------------------------------
checkUnicodeTestCase(const char * testFileName,int lineNumber,const UnicodeString & testString,UVector32 * breakPositions,RuleBasedBreakIterator * bi)1526 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1527 const UnicodeString &testString, // Text data to be broken
1528 UVector32 *breakPositions, // Positions where breaks should be found.
1529 RuleBasedBreakIterator *bi) {
1530 int32_t pos; // Break Position in the test string
1531 int32_t expectedI = 0; // Index of expected break position in the vector of expected results.
1532 int32_t expectedPos; // Expected break position (index into test string)
1533
1534 bi->setText(testString);
1535 pos = bi->first();
1536 pos = bi->next();
1537
1538 while (pos != BreakIterator::DONE) {
1539 if (expectedI >= breakPositions->size()) {
1540 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1541 testFileName, lineNumber, pos);
1542 break;
1543 }
1544 expectedPos = breakPositions->elementAti(expectedI);
1545 if (pos < expectedPos) {
1546 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1547 testFileName, lineNumber, pos);
1548 break;
1549 }
1550 if (pos > expectedPos) {
1551 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1552 testFileName, lineNumber, expectedPos);
1553 break;
1554 }
1555 pos = bi->next();
1556 expectedI++;
1557 }
1558
1559 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1560 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1561 testFileName, lineNumber, breakPositions->elementAti(expectedI));
1562 }
1563 }
1564
1565
1566
1567 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
//---------------------------------------------------------------------------------------
//
//   class RBBIMonkeyKind
//
//      Monkey Test for Break Iteration
//      Abstract interface class.   Concrete derived classes independently
//      implement the break rules for different iterator types.
//
//      The Monkey Test itself doesn't know which type of break iterator it is
//      testing, but works purely in terms of the interface defined here.
//
//---------------------------------------------------------------------------------------
class RBBIMonkeyKind {
public:
    // Return a UVector of UnicodeSets, representing the character classes used
    //   for this type of iterator.
    virtual  UVector  *charClasses() = 0;

    // Set the test text on which subsequent calls to next() will operate
    virtual  void      setText(const UnicodeString &s) = 0;

    // Find the next break position, starting from the prev break position, or from zero.
    // Return -1 after reaching end of string.
    virtual  int32_t   next(int32_t i) = 0;

    // Name of each character class, parallel with charClasses. Used for debugging output
    // of characters.
    virtual  std::vector<std::string>&     characterClassNames();

    // Record `value` as the rule applied at text index `position`.
    // The index is not range checked; see prepareAppliedRules().
    void setAppliedRule(int32_t position, const char* value);

    // Return the rule recorded for text index `position` (not range checked).
    std::string getAppliedRule(int32_t position);

    virtual ~RBBIMonkeyKind();

    // Error status of this object. Initialized to U_ZERO_ERROR by the constructor
    // of this base class.
    UErrorCode       deferredStatus;

    // Return the name of the first character class, in charClasses() order,
    // whose set contains `c`.
    std::string classNameFromCodepoint(const UChar32 c);

    // Length of the longest class name, over the classes in charClasses().
    unsigned int maxClassNameSize();

 protected:
    RBBIMonkeyKind();

    // Class names, indexed in parallel with the sets returned by charClasses().
    std::vector<std::string> classNames;

    // Rule descriptions, one slot per index of the test text.
    std::vector<std::string> appliedRules;

    // Clear `appliedRules` and fill it with empty strings in the size of test text.
    void prepareAppliedRules(int32_t size );

 private:

};
1618
RBBIMonkeyKind()1619 RBBIMonkeyKind::RBBIMonkeyKind() {
1620 deferredStatus = U_ZERO_ERROR;
1621 }
1622
~RBBIMonkeyKind()1623 RBBIMonkeyKind::~RBBIMonkeyKind() {
1624 }
1625
characterClassNames()1626 std::vector<std::string>& RBBIMonkeyKind::characterClassNames() {
1627 return classNames;
1628 }
1629
prepareAppliedRules(int32_t size)1630 void RBBIMonkeyKind::prepareAppliedRules(int32_t size) {
1631 // Remove all the information in the `appliedRules`.
1632 appliedRules.clear();
1633 appliedRules.resize(size + 1);
1634 }
1635
setAppliedRule(int32_t position,const char * value)1636 void RBBIMonkeyKind::setAppliedRule(int32_t position, const char* value) {
1637 appliedRules[position] = value;
1638 }
1639
getAppliedRule(int32_t position)1640 std::string RBBIMonkeyKind::getAppliedRule(int32_t position){
1641 return appliedRules[position];
1642 }
1643
classNameFromCodepoint(const UChar32 c)1644 std::string RBBIMonkeyKind::classNameFromCodepoint(const UChar32 c) {
1645 // Simply iterate through charClasses to find character's class
1646 for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1647 UnicodeSet *classSet = (UnicodeSet *)charClasses()->elementAt(aClassNum);
1648 if (classSet->contains(c)) {
1649 return classNames[aClassNum];
1650 }
1651 }
1652 U_ASSERT(FALSE); // This should not happen.
1653 return "bad class name";
1654 }
1655
maxClassNameSize()1656 unsigned int RBBIMonkeyKind::maxClassNameSize() {
1657 unsigned int maxSize = 0;
1658 for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1659 auto aClassNumSize = static_cast<unsigned int>(classNames[aClassNum].size());
1660 if (aClassNumSize > maxSize) {
1661 maxSize = aClassNumSize;
1662 }
1663 }
1664 return maxSize;
1665 }
1666
//----------------------------------------------------------------------------------------
//
//   Random Numbers.  Similar to standard lib rand() and srand()
//                    Not using library to
//                      1.  Get same results on all platforms.
//                      2.  Get access to current seed, to more easily reproduce failures.
//
//---------------------------------------------------------------------------------------
static uint32_t m_seed = 1;

// Advance the generator one step and return the next value, in the range 0..32767.
// The seed update is a linear congruential step in 32-bit unsigned arithmetic,
// wrapping modulo 2^32.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    m_seed = m_seed * kMultiplier + kIncrement;

    // Bits 16..30 of the seed: the same as (m_seed / 65536) % 32768.
    return (m_seed >> 16) & 0x7FFF;
}
1682
1683
//------------------------------------------------------------------------------------------
//
//   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
//                             of RBBIMonkeyKind.
//
//------------------------------------------------------------------------------------------
class RBBICharMonkey: public RBBIMonkeyKind {
public:
    RBBICharMonkey();
    virtual          ~RBBICharMonkey();
    virtual  UVector *charClasses();
    virtual  void     setText(const UnicodeString &s);
    virtual  int32_t  next(int32_t i);
private:
    // The character class sets registered by the constructor, in the same
    // order as the inherited classNames.
    UVector   *fSets;

    // Character sets consulted by the break rules. All are allocated with new
    // in the constructor.
    // NOTE(review): the destructor is defined elsewhere; confirm there which of
    // these it releases.
    UnicodeSet  *fCRLFSet;
    UnicodeSet  *fControlSet;
    UnicodeSet  *fExtendSet;
    UnicodeSet  *fZWJSet;
    UnicodeSet  *fRegionalIndicatorSet;
    UnicodeSet  *fPrependSet;
    UnicodeSet  *fSpacingSet;
    UnicodeSet  *fLSet;
    UnicodeSet  *fVSet;
    UnicodeSet  *fTSet;
    UnicodeSet  *fLVSet;
    UnicodeSet  *fLVTSet;
    UnicodeSet  *fHangulSet;            // union of the L, V, T, LV and LVT sets
    UnicodeSet  *fExtendedPictSet;
    UnicodeSet  *fViramaSet;
    UnicodeSet  *fLinkingConsonantSet;
    UnicodeSet  *fExtCccZwjSet;
    UnicodeSet  *fAnySet;               // every code point, 0 through 0x10ffff

    // The text being tested. Points at the string passed to setText(); NULL
    // until setText() has been called. The string is not copied.
    const UnicodeString *fText;
};
1721
1722
RBBICharMonkey()1723 RBBICharMonkey::RBBICharMonkey() {
1724 UErrorCode status = U_ZERO_ERROR;
1725
1726 fText = NULL;
1727
1728 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1729 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
1730 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
1731 fZWJSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
1732 fRegionalIndicatorSet =
1733 new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1734 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1735 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1736 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1737 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1738 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1739 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1740 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1741 fHangulSet = new UnicodeSet();
1742 fHangulSet->addAll(*fLSet);
1743 fHangulSet->addAll(*fVSet);
1744 fHangulSet->addAll(*fTSet);
1745 fHangulSet->addAll(*fLVSet);
1746 fHangulSet->addAll(*fLVTSet);
1747
1748 fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1749 fViramaSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1750 "\\p{Indic_Syllabic_Category=Virama}]", status);
1751 fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1752 "\\p{Indic_Syllabic_Category=Consonant}]", status);
1753 fExtCccZwjSet = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status);
1754 fAnySet = new UnicodeSet(0, 0x10ffff);
1755
1756 // Create sets of characters, and add the names of the above character sets.
1757 // In each new ICU release, add new names corresponding to the sets above.
1758 fSets = new UVector(status);
1759
1760 // Important: Keep class names the same as the class contents.
1761 fSets->addElement(fCRLFSet, status); classNames.push_back("CRLF");
1762 fSets->addElement(fControlSet, status); classNames.push_back("Control");
1763 fSets->addElement(fExtendSet, status); classNames.push_back("Extended");
1764 fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
1765 if (!fPrependSet->isEmpty()) {
1766 fSets->addElement(fPrependSet, status); classNames.push_back("Prepend");
1767 }
1768 fSets->addElement(fSpacingSet, status); classNames.push_back("Spacing");
1769 fSets->addElement(fHangulSet, status); classNames.push_back("Hangul");
1770 fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
1771 fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
1772 fSets->addElement(fViramaSet, status); classNames.push_back("Virama");
1773 fSets->addElement(fLinkingConsonantSet, status); classNames.push_back("LinkingConsonant");
1774 fSets->addElement(fExtCccZwjSet, status); classNames.push_back("ExtCcccZwj");
1775 fSets->addElement(fAnySet, status); classNames.push_back("Any");
1776
1777 if (U_FAILURE(status)) {
1778 deferredStatus = status;
1779 }
1780 }
1781
1782
setText(const UnicodeString & s)1783 void RBBICharMonkey::setText(const UnicodeString &s) {
1784 fText = &s;
1785 prepareAppliedRules(s.length());
1786 }
1787
1788
1789
next(int32_t prevPos)1790 int32_t RBBICharMonkey::next(int32_t prevPos) {
1791 int p0, p1, p2, p3; // Indices of the significant code points around the
1792 // break position being tested. The candidate break
1793 // location is before p2.
1794
1795 int breakPos = -1;
1796
1797 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
1798 UChar32 cBase; // for (X Extend*) patterns, the X character.
1799
1800 if (U_FAILURE(deferredStatus)) {
1801 return -1;
1802 }
1803
1804 // Previous break at end of string. return DONE.
1805 if (prevPos >= fText->length()) {
1806 return -1;
1807 }
1808
1809 p0 = p1 = p2 = p3 = prevPos;
1810 c3 = fText->char32At(prevPos);
1811 c0 = c1 = c2 = cBase = 0;
1812 (void)p0; // suppress set but not used warning.
1813 (void)c0;
1814
1815 // Loop runs once per "significant" character position in the input text.
1816 for (;;) {
1817 // Move all of the positions forward in the input string.
1818 p0 = p1; c0 = c1;
1819 p1 = p2; c1 = c2;
1820 p2 = p3; c2 = c3;
1821
1822 // Advance p3 by one codepoint
1823 p3 = fText->moveIndex32(p3, 1);
1824 c3 = fText->char32At(p3);
1825
1826 if (p1 == p2) {
1827 // Still warming up the loop. (won't work with zero length strings, but we don't care)
1828 continue;
1829 }
1830
1831 if (p2 == fText->length()) {
1832 setAppliedRule(p2, "End of String");
1833 break;
1834 }
1835
1836 // No Extend or Format characters may appear between the CR and LF,
1837 // which requires the additional check for p2 immediately following p1.
1838 //
1839 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
1840 setAppliedRule(p2, "GB3 CR x LF");
1841 continue;
1842 }
1843
1844 if (fControlSet->contains(c1) ||
1845 c1 == 0x0D ||
1846 c1 == 0x0A) {
1847 setAppliedRule(p2, "GB4 ( Control | CR | LF ) <break>");
1848 break;
1849 }
1850
1851 if (fControlSet->contains(c2) ||
1852 c2 == 0x0D ||
1853 c2 == 0x0A) {
1854 setAppliedRule(p2, "GB5 <break> ( Control | CR | LF )");
1855 break;
1856 }
1857
1858 if (fLSet->contains(c1) &&
1859 (fLSet->contains(c2) ||
1860 fVSet->contains(c2) ||
1861 fLVSet->contains(c2) ||
1862 fLVTSet->contains(c2))) {
1863 setAppliedRule(p2, "GB6 L x ( L | V | LV | LVT )");
1864 continue;
1865 }
1866
1867 if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1868 (fVSet->contains(c2) || fTSet->contains(c2))) {
1869 setAppliedRule(p2, "GB7 ( LV | V ) x ( V | T )");
1870 continue;
1871 }
1872
1873 if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1874 fTSet->contains(c2)) {
1875 setAppliedRule(p2, "GB8 ( LVT | T) x T");
1876 continue;
1877 }
1878
1879 if (fExtendSet->contains(c2) || fZWJSet->contains(c2)) {
1880 if (!fExtendSet->contains(c1)) {
1881 cBase = c1;
1882 }
1883 setAppliedRule(p2, "GB9 x (Extend | ZWJ)");
1884 continue;
1885 }
1886
1887 if (fSpacingSet->contains(c2)) {
1888 setAppliedRule(p2, "GB9a x SpacingMark");
1889 continue;
1890 }
1891
1892 if (fPrependSet->contains(c1)) {
1893 setAppliedRule(p2, "GB9b Prepend x");
1894 continue;
1895 }
1896
1897 // Note: Viramas are also included in the ExtCccZwj class.
1898 if (fLinkingConsonantSet->contains(c2)) {
1899 int pi = p1;
1900 bool sawVirama = false;
1901 while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) {
1902 if (fViramaSet->contains(fText->char32At(pi))) {
1903 sawVirama = true;
1904 }
1905 pi = fText->moveIndex32(pi, -1);
1906 }
1907 if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) {
1908 setAppliedRule(p2, "GB9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant");
1909 continue;
1910 }
1911 }
1912
1913 if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
1914 setAppliedRule(p2, "GB11 Extended_Pictographic Extend * ZWJ x Extended_Pictographic");
1915 continue;
1916 }
1917
1918 // Note: The first if condition is a little tricky. We only need to force
1919 // a break if there are three or more contiguous RIs. If there are
1920 // only two, a break following will occur via other rules, and will include
1921 // any trailing extend characters, which is needed behavior.
1922 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
1923 && fRegionalIndicatorSet->contains(c2)) {
1924 setAppliedRule(p2, "GB12-13 Regional_Indicator x Regional_Indicator");
1925 break;
1926 }
1927 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
1928 setAppliedRule(p2, "GB12-13 Regional_Indicator x Regional_Indicator");
1929 continue;
1930 }
1931
1932 setAppliedRule(p2, "GB999 Any <break> Any");
1933 break;
1934 }
1935
1936 breakPos = p2;
1937 return breakPos;
1938 }
1939
1940
1941
charClasses()1942 UVector *RBBICharMonkey::charClasses() {
1943 return fSets;
1944 }
1945
~RBBICharMonkey()1946 RBBICharMonkey::~RBBICharMonkey() {
1947 delete fSets;
1948 delete fCRLFSet;
1949 delete fControlSet;
1950 delete fExtendSet;
1951 delete fRegionalIndicatorSet;
1952 delete fPrependSet;
1953 delete fSpacingSet;
1954 delete fLSet;
1955 delete fVSet;
1956 delete fTSet;
1957 delete fLVSet;
1958 delete fLVTSet;
1959 delete fHangulSet;
1960 delete fAnySet;
1961 delete fZWJSet;
1962 delete fExtendedPictSet;
1963 delete fViramaSet;
1964 delete fLinkingConsonantSet;
1965 delete fExtCccZwjSet;
1966 }
1967
1968 //------------------------------------------------------------------------------------------
1969 //
1970 // class RBBIWordMonkey Word Break specific implementation
1971 // of RBBIMonkeyKind.
1972 //
1973 //------------------------------------------------------------------------------------------
1974 class RBBIWordMonkey: public RBBIMonkeyKind {
1975 public:
1976 RBBIWordMonkey();
1977 virtual ~RBBIWordMonkey();
1978 virtual UVector *charClasses();
1979 virtual void setText(const UnicodeString &s);
1980 virtual int32_t next(int32_t i);
1981 private:
1982 UVector *fSets;
1983
1984 UnicodeSet *fCRSet;
1985 UnicodeSet *fLFSet;
1986 UnicodeSet *fNewlineSet;
1987 UnicodeSet *fRegionalIndicatorSet;
1988 UnicodeSet *fKatakanaSet;
1989 UnicodeSet *fHebrew_LetterSet;
1990 UnicodeSet *fALetterSet;
1991 UnicodeSet *fSingle_QuoteSet;
1992 UnicodeSet *fDouble_QuoteSet;
1993 UnicodeSet *fMidNumLetSet;
1994 UnicodeSet *fMidLetterSet;
1995 UnicodeSet *fMidNumSet;
1996 UnicodeSet *fNumericSet;
1997 UnicodeSet *fFormatSet;
1998 UnicodeSet *fOtherSet;
1999 UnicodeSet *fExtendSet;
2000 UnicodeSet *fExtendNumLetSet;
2001 UnicodeSet *fWSegSpaceSet;
2002 UnicodeSet *fDictionarySet;
2003 UnicodeSet *fZWJSet;
2004 UnicodeSet *fExtendedPictSet;
2005
2006 const UnicodeString *fText;
2007 };
2008
2009
RBBIWordMonkey()2010 RBBIWordMonkey::RBBIWordMonkey()
2011 {
2012 UErrorCode status = U_ZERO_ERROR;
2013
2014 fSets = new UVector(status);
2015
2016 fCRSet = new UnicodeSet(u"[\\p{Word_Break = CR}]", status);
2017 fLFSet = new UnicodeSet(u"[\\p{Word_Break = LF}]", status);
2018 fNewlineSet = new UnicodeSet(u"[\\p{Word_Break = Newline}]", status);
2019 fKatakanaSet = new UnicodeSet(u"[\\p{Word_Break = Katakana}]", status);
2020 fRegionalIndicatorSet = new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
2021 fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
2022 fALetterSet = new UnicodeSet(u"[\\p{Word_Break = ALetter}]", status);
2023 fSingle_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]", status);
2024 fDouble_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]", status);
2025 fMidNumLetSet = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]", status);
2026 fMidLetterSet = new UnicodeSet(u"[\\p{Word_Break = MidLetter}]", status);
2027 fMidNumSet = new UnicodeSet(u"[\\p{Word_Break = MidNum}]", status);
2028 fNumericSet = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
2029 fFormatSet = new UnicodeSet(u"[\\p{Word_Break = Format}]", status);
2030 fExtendNumLetSet = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
2031 // There are some sc=Hani characters with WB=Extend.
2032 // The break rules need to pick one or the other because
2033 // Extend overlapping with something else is messy.
2034 // For Unicode 13, we chose to keep U+16FF0 & U+16FF1
2035 // in $Han (for $dictionary) and out of $Extend.
2036 fExtendSet = new UnicodeSet(u"[\\p{Word_Break = Extend}-[:Hani:]]", status);
2037 fWSegSpaceSet = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]", status);
2038
2039 fZWJSet = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]", status);
2040 fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
2041
2042 fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
2043 fDictionarySet->addAll(*fKatakanaSet);
2044 fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
2045
2046 fALetterSet->removeAll(*fDictionarySet);
2047
2048 fOtherSet = new UnicodeSet();
2049 if(U_FAILURE(status)) {
2050 IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
2051 deferredStatus = status;
2052 return;
2053 }
2054
2055 fOtherSet->complement();
2056 fOtherSet->removeAll(*fCRSet);
2057 fOtherSet->removeAll(*fLFSet);
2058 fOtherSet->removeAll(*fNewlineSet);
2059 fOtherSet->removeAll(*fKatakanaSet);
2060 fOtherSet->removeAll(*fHebrew_LetterSet);
2061 fOtherSet->removeAll(*fALetterSet);
2062 fOtherSet->removeAll(*fSingle_QuoteSet);
2063 fOtherSet->removeAll(*fDouble_QuoteSet);
2064 fOtherSet->removeAll(*fMidLetterSet);
2065 fOtherSet->removeAll(*fMidNumSet);
2066 fOtherSet->removeAll(*fNumericSet);
2067 fOtherSet->removeAll(*fExtendNumLetSet);
2068 fOtherSet->removeAll(*fWSegSpaceSet);
2069 fOtherSet->removeAll(*fFormatSet);
2070 fOtherSet->removeAll(*fExtendSet);
2071 fOtherSet->removeAll(*fRegionalIndicatorSet);
2072 fOtherSet->removeAll(*fZWJSet);
2073 fOtherSet->removeAll(*fExtendedPictSet);
2074
2075 // Inhibit dictionary characters from being tested at all.
2076 fOtherSet->removeAll(*fDictionarySet);
2077
2078 // Add classes and their names
2079 fSets->addElement(fCRSet, status); classNames.push_back("CR");
2080 fSets->addElement(fLFSet, status); classNames.push_back("LF");
2081 fSets->addElement(fNewlineSet, status); classNames.push_back("Newline");
2082 fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
2083 fSets->addElement(fHebrew_LetterSet, status); classNames.push_back("Hebrew");
2084 fSets->addElement(fALetterSet, status); classNames.push_back("ALetter");
2085 fSets->addElement(fSingle_QuoteSet, status); classNames.push_back("Single Quote");
2086 fSets->addElement(fDouble_QuoteSet, status); classNames.push_back("Double Quote");
2087 // Omit Katakana from fSets, which omits Katakana characters
2088 // from the test data. They are all in the dictionary set,
2089 // which this (old, to be retired) monkey test cannot handle.
2090 //fSets->addElement(fKatakanaSet, status);
2091
2092 fSets->addElement(fMidLetterSet, status); classNames.push_back("MidLetter");
2093 fSets->addElement(fMidNumLetSet, status); classNames.push_back("MidNumLet");
2094 fSets->addElement(fMidNumSet, status); classNames.push_back("MidNum");
2095 fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
2096 fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2097 fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2098 fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2099 fSets->addElement(fExtendNumLetSet, status); classNames.push_back("ExtendNumLet");
2100 fSets->addElement(fWSegSpaceSet, status); classNames.push_back("WSegSpace");
2101
2102 fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
2103 fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
2104
2105 if (U_FAILURE(status)) {
2106 deferredStatus = status;
2107 }
2108 }
2109
setText(const UnicodeString & s)2110 void RBBIWordMonkey::setText(const UnicodeString &s) {
2111 fText = &s;
2112 prepareAppliedRules(s.length());
2113 }
2114
2115
next(int32_t prevPos)2116 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2117 int p0, p1, p2, p3; // Indices of the significant code points around the
2118 // break position being tested. The candidate break
2119 // location is before p2.
2120
2121 int breakPos = -1;
2122
2123 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2124
2125 if (U_FAILURE(deferredStatus)) {
2126 return -1;
2127 }
2128
2129 // Prev break at end of string. return DONE.
2130 if (prevPos >= fText->length()) {
2131 return -1;
2132 }
2133 p0 = p1 = p2 = p3 = prevPos;
2134 c3 = fText->char32At(prevPos);
2135 c0 = c1 = c2 = 0;
2136 (void)p0; // Suppress set but not used warning.
2137
2138 // Loop runs once per "significant" character position in the input text.
2139 for (;;) {
2140 // Move all of the positions forward in the input string.
2141 p0 = p1; c0 = c1;
2142 p1 = p2; c1 = c2;
2143 p2 = p3; c2 = c3;
2144
2145 // Advance p3 by X(Extend | Format)* Rule 4
2146 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2147 do {
2148 p3 = fText->moveIndex32(p3, 1);
2149 c3 = fText->char32At(p3);
2150 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2151 break;
2152 }
2153 }
2154 while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
2155
2156
2157 if (p1 == p2) {
2158 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2159 continue;
2160 }
2161
2162 if (p2 == fText->length()) {
2163 // Reached end of string. Always a break position.
2164 break;
2165 }
2166
2167 // No Extend or Format characters may appear between the CR and LF,
2168 // which requires the additional check for p2 immediately following p1.
2169 //
2170 if (c1==0x0D && c2==0x0A) {
2171 setAppliedRule(p2, "WB3 CR x LF");
2172 continue;
2173 }
2174
2175 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2176 setAppliedRule(p2, "WB3a Break before and after newlines (including CR and LF)");
2177 break;
2178 }
2179 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2180 setAppliedRule(p2, "WB3a Break before and after newlines (including CR and LF)");
2181 break;
2182 }
2183
2184 // Not ignoring extend chars, so peek into input text to
2185 // get the potential ZWJ, the character immediately preceding c2.
2186 // Sloppy UChar32 indexing: p2-1 may reference trail half
2187 // but char32At will get the full code point.
2188 if (fZWJSet->contains(fText->char32At(p2 - 1)) && fExtendedPictSet->contains(c2)){
2189 setAppliedRule(p2, "WB3c ZWJ x Extended_Pictographic");
2190 continue;
2191 }
2192
2193 if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
2194 setAppliedRule(p2, "WB3d Keep horizontal whitespace together.");
2195 continue;
2196 }
2197
2198 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2199 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2200 setAppliedRule(p2, "WB4 (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)");
2201 continue;
2202 }
2203
2204 if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2205 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2206 (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2207 setAppliedRule(p2,
2208 "WB6 (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter _Letter)");
2209 continue;
2210 }
2211
2212 if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2213 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2214 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2215 setAppliedRule(p2,
2216 "WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)");
2217 continue;
2218 }
2219
2220 if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2221 setAppliedRule(p2, "WB7a Hebrew_Letter x Single_Quote");
2222 continue;
2223 }
2224
2225 if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2226 setAppliedRule(p2, "WB7b Hebrew_Letter x Double_Quote Hebrew_Letter");
2227 continue;
2228 }
2229
2230 if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2231 setAppliedRule(p2, "WB7c Hebrew_Letter Double_Quote x Hebrew_Letter");
2232 continue;
2233 }
2234
2235 if (fNumericSet->contains(c1) &&
2236 fNumericSet->contains(c2)) {
2237 setAppliedRule(p2, "WB8 Numeric x Numeric");
2238 continue;
2239 }
2240
2241 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2242 fNumericSet->contains(c2)) {
2243 setAppliedRule(p2, "WB9 (ALetter | Hebrew_Letter) x Numeric");
2244 continue;
2245 }
2246
2247 if (fNumericSet->contains(c1) &&
2248 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2249 setAppliedRule(p2, "WB10 Numeric x (ALetter | Hebrew_Letter)");
2250 continue;
2251 }
2252
2253 if (fNumericSet->contains(c0) &&
2254 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2255 fNumericSet->contains(c2)) {
2256 setAppliedRule(p2, "WB11 Numeric (MidNum | MidNumLet | Single_Quote) x Numeric");
2257 continue;
2258 }
2259
2260 if (fNumericSet->contains(c1) &&
2261 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2262 fNumericSet->contains(c3)) {
2263 setAppliedRule(p2, "WB12 Numeric x (MidNum | MidNumLet | SingleQuote) Numeric");
2264 continue;
2265 }
2266
2267 // Note: matches UAX 29 rules, but doesn't come into play for ICU because
2268 // all Katakana are handled by the dictionary breaker.
2269 if (fKatakanaSet->contains(c1) &&
2270 fKatakanaSet->contains(c2)) {
2271 setAppliedRule(p2, "WB13 Katakana x Katakana");
2272 continue;
2273 }
2274
2275 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2276 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2277 fExtendNumLetSet->contains(c2)) {
2278 setAppliedRule(p2,
2279 "WB13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet");
2280 continue;
2281 }
2282
2283 if (fExtendNumLetSet->contains(c1) &&
2284 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2285 fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) {
2286 setAppliedRule(p2, "WB13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)");
2287 continue;
2288 }
2289
2290 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2291 setAppliedRule(p2, "WB15 - WB17 Group pairs of Regional Indicators.");
2292 break;
2293 }
2294 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2295 setAppliedRule(p2, "WB15 - WB17 Group pairs of Regional Indicators.");
2296 continue;
2297 }
2298
2299 setAppliedRule(p2, "WB999");
2300 break;
2301 }
2302
2303 breakPos = p2;
2304 return breakPos;
2305 }
2306
2307
charClasses()2308 UVector *RBBIWordMonkey::charClasses() {
2309 return fSets;
2310 }
2311
~RBBIWordMonkey()2312 RBBIWordMonkey::~RBBIWordMonkey() {
2313 delete fSets;
2314 delete fCRSet;
2315 delete fLFSet;
2316 delete fNewlineSet;
2317 delete fKatakanaSet;
2318 delete fHebrew_LetterSet;
2319 delete fALetterSet;
2320 delete fSingle_QuoteSet;
2321 delete fDouble_QuoteSet;
2322 delete fMidNumLetSet;
2323 delete fMidLetterSet;
2324 delete fMidNumSet;
2325 delete fNumericSet;
2326 delete fFormatSet;
2327 delete fExtendSet;
2328 delete fExtendNumLetSet;
2329 delete fWSegSpaceSet;
2330 delete fRegionalIndicatorSet;
2331 delete fDictionarySet;
2332 delete fOtherSet;
2333 delete fZWJSet;
2334 delete fExtendedPictSet;
2335 }
2336
2337
2338
2339
2340 //------------------------------------------------------------------------------------------
2341 //
2342 // class RBBISentMonkey Sentence Break specific implementation
2343 // of RBBIMonkeyKind.
2344 //
2345 //------------------------------------------------------------------------------------------
2346 class RBBISentMonkey: public RBBIMonkeyKind {
2347 public:
2348 RBBISentMonkey();
2349 virtual ~RBBISentMonkey();
2350 virtual UVector *charClasses();
2351 virtual void setText(const UnicodeString &s);
2352 virtual int32_t next(int32_t i);
2353 private:
2354 int moveBack(int posFrom);
2355 int moveForward(int posFrom);
2356 UChar32 cAt(int pos);
2357
2358 UVector *fSets;
2359
2360 UnicodeSet *fSepSet;
2361 UnicodeSet *fFormatSet;
2362 UnicodeSet *fSpSet;
2363 UnicodeSet *fLowerSet;
2364 UnicodeSet *fUpperSet;
2365 UnicodeSet *fOLetterSet;
2366 UnicodeSet *fNumericSet;
2367 UnicodeSet *fATermSet;
2368 UnicodeSet *fSContinueSet;
2369 UnicodeSet *fSTermSet;
2370 UnicodeSet *fCloseSet;
2371 UnicodeSet *fOtherSet;
2372 UnicodeSet *fExtendSet;
2373
2374 const UnicodeString *fText;
2375 };
2376
RBBISentMonkey()2377 RBBISentMonkey::RBBISentMonkey()
2378 {
2379 UErrorCode status = U_ZERO_ERROR;
2380
2381 fSets = new UVector(status);
2382
2383 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2384 // set and made into character classes of their own. For the monkey impl,
2385 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2386 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status);
2387 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status);
2388 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status);
2389 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status);
2390 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status);
2391 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status);
2392 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status);
2393 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status);
2394 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2395 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status);
2396 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status);
2397 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status);
2398 fOtherSet = new UnicodeSet();
2399
2400 if(U_FAILURE(status)) {
2401 deferredStatus = status;
2402 return;
2403 }
2404
2405 fOtherSet->complement();
2406 fOtherSet->removeAll(*fSepSet);
2407 fOtherSet->removeAll(*fFormatSet);
2408 fOtherSet->removeAll(*fSpSet);
2409 fOtherSet->removeAll(*fLowerSet);
2410 fOtherSet->removeAll(*fUpperSet);
2411 fOtherSet->removeAll(*fOLetterSet);
2412 fOtherSet->removeAll(*fNumericSet);
2413 fOtherSet->removeAll(*fATermSet);
2414 fOtherSet->removeAll(*fSContinueSet);
2415 fOtherSet->removeAll(*fSTermSet);
2416 fOtherSet->removeAll(*fCloseSet);
2417 fOtherSet->removeAll(*fExtendSet);
2418
2419 fSets->addElement(fSepSet, status); classNames.push_back("Sep");
2420 fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2421 fSets->addElement(fSpSet, status); classNames.push_back("Sp");
2422 fSets->addElement(fLowerSet, status); classNames.push_back("Lower");
2423 fSets->addElement(fUpperSet, status); classNames.push_back("Upper");
2424 fSets->addElement(fOLetterSet, status); classNames.push_back("OLetter");
2425 fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
2426 fSets->addElement(fATermSet, status); classNames.push_back("ATerm");
2427 fSets->addElement(fSContinueSet, status); classNames.push_back("SContinue");
2428 fSets->addElement(fSTermSet, status); classNames.push_back("STerm");
2429 fSets->addElement(fCloseSet, status); classNames.push_back("Close");
2430 fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2431 fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2432
2433 if (U_FAILURE(status)) {
2434 deferredStatus = status;
2435 }
2436 }
2437
2438
2439
setText(const UnicodeString & s)2440 void RBBISentMonkey::setText(const UnicodeString &s) {
2441 fText = &s;
2442 prepareAppliedRules(s.length());
2443 }
2444
charClasses()2445 UVector *RBBISentMonkey::charClasses() {
2446 return fSets;
2447 }
2448
2449 // moveBack() Find the "significant" code point preceding the index i.
2450 // Skips over ($Extend | $Format)* .
2451 //
moveBack(int i)2452 int RBBISentMonkey::moveBack(int i) {
2453 if (i <= 0) {
2454 return -1;
2455 }
2456 UChar32 c;
2457 int32_t j = i;
2458 do {
2459 j = fText->moveIndex32(j, -1);
2460 c = fText->char32At(j);
2461 }
2462 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2463 return j;
2464
2465 }
2466
2467
moveForward(int i)2468 int RBBISentMonkey::moveForward(int i) {
2469 if (i>=fText->length()) {
2470 return fText->length();
2471 }
2472 UChar32 c;
2473 int32_t j = i;
2474 do {
2475 j = fText->moveIndex32(j, 1);
2476 c = cAt(j);
2477 }
2478 while (fFormatSet->contains(c) || fExtendSet->contains(c));
2479 return j;
2480 }
2481
cAt(int pos)2482 UChar32 RBBISentMonkey::cAt(int pos) {
2483 if (pos<0 || pos>=fText->length()) {
2484 return -1;
2485 } else {
2486 return fText->char32At(pos);
2487 }
2488 }
2489
next(int32_t prevPos)2490 int32_t RBBISentMonkey::next(int32_t prevPos) {
2491 int p0, p1, p2, p3; // Indices of the significant code points around the
2492 // break position being tested. The candidate break
2493 // location is before p2.
2494
2495 int breakPos = -1;
2496
2497 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2498 UChar32 c;
2499
2500 if (U_FAILURE(deferredStatus)) {
2501 return -1;
2502 }
2503
2504 // Prev break at end of string. return DONE.
2505 if (prevPos >= fText->length()) {
2506 return -1;
2507 }
2508 p0 = p1 = p2 = p3 = prevPos;
2509 c3 = fText->char32At(prevPos);
2510 c0 = c1 = c2 = 0;
2511 (void)p0; // Suppress set but not used warning.
2512
2513 // Loop runs once per "significant" character position in the input text.
2514 for (;;) {
2515 // Move all of the positions forward in the input string.
2516 p0 = p1; c0 = c1;
2517 p1 = p2; c1 = c2;
2518 p2 = p3; c2 = c3;
2519
2520 // Advance p3 by X(Extend | Format)* Rule 4
2521 p3 = moveForward(p3);
2522 c3 = cAt(p3);
2523
2524 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2525 setAppliedRule(p2, "SB3 CR x LF");
2526 continue;
2527 }
2528
2529 if (fSepSet->contains(c1)) {
2530 p2 = p1+1; // Separators don't combine with Extend or Format.
2531
2532 setAppliedRule(p2, "SB4 Sep <break>");
2533 break;
2534 }
2535
2536 if (p2 >= fText->length()) {
2537 // Reached end of string. Always a break position.
2538 setAppliedRule(p2, "SB4 Sep <break>");
2539 break;
2540 }
2541
2542 if (p2 == prevPos) {
2543 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2544 setAppliedRule(p2, "SB4 Sep <break>");
2545 continue;
2546 }
2547
2548 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) {
2549 setAppliedRule(p2, "SB6 ATerm x Numeric");
2550 continue;
2551 }
2552
2553 if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2554 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2555 setAppliedRule(p2, "SB7 (Upper | Lower) ATerm x Uppper");
2556 continue;
2557 }
2558
2559 // Note: STerm | ATerm are added to the negated part of the expression by a
2560 // note to the Unicode 5.0 documents.
2561 int p8 = p1;
2562 while (fSpSet->contains(cAt(p8))) {
2563 p8 = moveBack(p8);
2564 }
2565 while (fCloseSet->contains(cAt(p8))) {
2566 p8 = moveBack(p8);
2567 }
2568 if (fATermSet->contains(cAt(p8))) {
2569 p8=p2;
2570 for (;;) {
2571 c = cAt(p8);
2572 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2573 fLowerSet->contains(c) || fSepSet->contains(c) ||
2574 fATermSet->contains(c) || fSTermSet->contains(c)) {
2575
2576 setAppliedRule(p2,
2577 "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2578 break;
2579 }
2580 p8 = moveForward(p8);
2581 }
2582 if (fLowerSet->contains(cAt(p8))) {
2583
2584 setAppliedRule(p2,
2585 "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2586 continue;
2587 }
2588 }
2589
2590 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2591 p8 = p1;
2592 while (fSpSet->contains(cAt(p8))) {
2593 p8 = moveBack(p8);
2594 }
2595 while (fCloseSet->contains(cAt(p8))) {
2596 p8 = moveBack(p8);
2597 }
2598 c = cAt(p8);
2599 if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2600 setAppliedRule(p2, "SB8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm)");
2601 continue;
2602 }
2603 }
2604
2605 int p9 = p1;
2606 while (fCloseSet->contains(cAt(p9))) {
2607 p9 = moveBack(p9);
2608 }
2609 c = cAt(p9);
2610 if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2611 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2612
2613 setAppliedRule(p2, "SB9 (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)");
2614 continue;
2615 }
2616 }
2617
2618 int p10 = p1;
2619 while (fSpSet->contains(cAt(p10))) {
2620 p10 = moveBack(p10);
2621 }
2622 while (fCloseSet->contains(cAt(p10))) {
2623 p10 = moveBack(p10);
2624 }
2625 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2626 if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2627 setAppliedRule(p2, "SB10 (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)");
2628 continue;
2629 }
2630 }
2631
2632 int p11 = p1;
2633 if (fSepSet->contains(cAt(p11))) {
2634 p11 = moveBack(p11);
2635 }
2636 while (fSpSet->contains(cAt(p11))) {
2637 p11 = moveBack(p11);
2638 }
2639 while (fCloseSet->contains(cAt(p11))) {
2640 p11 = moveBack(p11);
2641 }
2642 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2643 setAppliedRule(p2, "SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>");
2644 break;
2645 }
2646
2647 setAppliedRule(p2, "SB12 Any x Any");
2648 continue;
2649 }
2650
2651 breakPos = p2;
2652 return breakPos;
2653 }
2654
~RBBISentMonkey()2655 RBBISentMonkey::~RBBISentMonkey() {
2656 delete fSets;
2657 delete fSepSet;
2658 delete fFormatSet;
2659 delete fSpSet;
2660 delete fLowerSet;
2661 delete fUpperSet;
2662 delete fOLetterSet;
2663 delete fNumericSet;
2664 delete fATermSet;
2665 delete fSContinueSet;
2666 delete fSTermSet;
2667 delete fCloseSet;
2668 delete fOtherSet;
2669 delete fExtendSet;
2670 }
2671
2672
2673
2674 //-------------------------------------------------------------------------------------------
2675 //
2676 // RBBILineMonkey
2677 //
2678 //-------------------------------------------------------------------------------------------
2679
2680 class RBBILineMonkey: public RBBIMonkeyKind {
2681 public:
2682 RBBILineMonkey();
2683 virtual ~RBBILineMonkey();
2684 virtual UVector *charClasses();
2685 virtual void setText(const UnicodeString &s);
2686 virtual int32_t next(int32_t i);
2687 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2688 private:
2689 UVector *fSets;
2690
2691 UnicodeSet *fBK;
2692 UnicodeSet *fCR;
2693 UnicodeSet *fLF;
2694 UnicodeSet *fCM;
2695 UnicodeSet *fNL;
2696 UnicodeSet *fSG;
2697 UnicodeSet *fWJ;
2698 UnicodeSet *fZW;
2699 UnicodeSet *fGL;
2700 UnicodeSet *fCB;
2701 UnicodeSet *fSP;
2702 UnicodeSet *fB2;
2703 UnicodeSet *fBA;
2704 UnicodeSet *fBB;
2705 UnicodeSet *fHH;
2706 UnicodeSet *fHY;
2707 UnicodeSet *fH2;
2708 UnicodeSet *fH3;
2709 UnicodeSet *fCL;
2710 UnicodeSet *fCP;
2711 UnicodeSet *fEX;
2712 UnicodeSet *fIN;
2713 UnicodeSet *fJL;
2714 UnicodeSet *fJV;
2715 UnicodeSet *fJT;
2716 UnicodeSet *fNS;
2717 UnicodeSet *fOP;
2718 UnicodeSet *fQU;
2719 UnicodeSet *fIS;
2720 UnicodeSet *fNU;
2721 UnicodeSet *fPO;
2722 UnicodeSet *fPR;
2723 UnicodeSet *fSY;
2724 UnicodeSet *fAI;
2725 UnicodeSet *fAL;
2726 UnicodeSet *fCJ;
2727 UnicodeSet *fHL;
2728 UnicodeSet *fID;
2729 UnicodeSet *fRI;
2730 UnicodeSet *fXX;
2731 UnicodeSet *fEB;
2732 UnicodeSet *fEM;
2733 UnicodeSet *fZWJ;
2734 UnicodeSet *fOP30;
2735 UnicodeSet *fCP30;
2736
2737 BreakIterator *fCharBI;
2738 const UnicodeString *fText;
2739 RegexMatcher *fNumberMatcher;
2740 };
2741
RBBILineMonkey()2742 RBBILineMonkey::RBBILineMonkey() :
2743 RBBIMonkeyKind(),
2744 fSets(NULL),
2745
2746 fCharBI(NULL),
2747 fText(NULL),
2748 fNumberMatcher(NULL)
2749
2750 {
2751 if (U_FAILURE(deferredStatus)) {
2752 return;
2753 }
2754
2755 UErrorCode status = U_ZERO_ERROR;
2756
2757 fSets = new UVector(status);
2758
2759 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2760 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2761 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2762 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2763 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2764 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2765 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2766 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2767 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2768 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2769 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2770 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2771 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2772 fHH = new UnicodeSet();
2773 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2774 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2775 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2776 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2777 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2778 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2779 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2780 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2781 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2782 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2783 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2784 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2785 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2786 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2787 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2788 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2789 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2790 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2791 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2792 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2793 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2794 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2795 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2796 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2797 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2798 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2799 fEB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status);
2800 fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
2801 fZWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
2802 fOP30 = new UnicodeSet(u"[\\p{Line_break=OP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2803 fCP30 = new UnicodeSet(u"[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2804
2805 if (U_FAILURE(status)) {
2806 deferredStatus = status;
2807 return;
2808 }
2809
2810 fAL->addAll(*fXX); // Default behavior for XX is identical to AL
2811 fAL->addAll(*fAI); // Default behavior for AI is identical to AL
2812 fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
2813
2814 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
2815 fCM->addAll(*fZWJ); // ZWJ behaves as a CM.
2816
2817 fHH->add(u'\u2010'); // Hyphen, '‐'
2818
2819 // Sets and names.
2820 fSets->addElement(fBK, status); classNames.push_back("fBK");
2821 fSets->addElement(fCR, status); classNames.push_back("fCR");
2822 fSets->addElement(fLF, status); classNames.push_back("fLF");
2823 fSets->addElement(fCM, status); classNames.push_back("fCM");
2824 fSets->addElement(fNL, status); classNames.push_back("fNL");
2825 fSets->addElement(fWJ, status); classNames.push_back("fWJ");
2826 fSets->addElement(fZW, status); classNames.push_back("fZW");
2827 fSets->addElement(fGL, status); classNames.push_back("fGL");
2828 fSets->addElement(fCB, status); classNames.push_back("fCB");
2829 fSets->addElement(fSP, status); classNames.push_back("fSP");
2830 fSets->addElement(fB2, status); classNames.push_back("fB2");
2831 fSets->addElement(fBA, status); classNames.push_back("fBA");
2832 fSets->addElement(fBB, status); classNames.push_back("fBB");
2833 fSets->addElement(fHY, status); classNames.push_back("fHY");
2834 fSets->addElement(fH2, status); classNames.push_back("fH2");
2835 fSets->addElement(fH3, status); classNames.push_back("fH3");
2836 fSets->addElement(fCL, status); classNames.push_back("fCL");
2837 fSets->addElement(fCP, status); classNames.push_back("fCP");
2838 fSets->addElement(fEX, status); classNames.push_back("fEX");
2839 fSets->addElement(fIN, status); classNames.push_back("fIN");
2840 fSets->addElement(fJL, status); classNames.push_back("fJL");
2841 fSets->addElement(fJT, status); classNames.push_back("fJT");
2842 fSets->addElement(fJV, status); classNames.push_back("fJV");
2843 fSets->addElement(fNS, status); classNames.push_back("fNS");
2844 fSets->addElement(fOP, status); classNames.push_back("fOP");
2845 fSets->addElement(fQU, status); classNames.push_back("fQU");
2846 fSets->addElement(fIS, status); classNames.push_back("fIS");
2847 fSets->addElement(fNU, status); classNames.push_back("fNU");
2848 fSets->addElement(fPO, status); classNames.push_back("fPO");
2849 fSets->addElement(fPR, status); classNames.push_back("fPR");
2850 fSets->addElement(fSY, status); classNames.push_back("fSY");
2851 fSets->addElement(fAI, status); classNames.push_back("fAI");
2852 fSets->addElement(fAL, status); classNames.push_back("fAL");
2853 fSets->addElement(fHL, status); classNames.push_back("fHL");
2854 fSets->addElement(fID, status); classNames.push_back("fID");
2855 fSets->addElement(fWJ, status); classNames.push_back("fWJ");
2856 fSets->addElement(fRI, status); classNames.push_back("fRI");
2857 fSets->addElement(fSG, status); classNames.push_back("fSG");
2858 fSets->addElement(fEB, status); classNames.push_back("fEB");
2859 fSets->addElement(fEM, status); classNames.push_back("fEM");
2860 fSets->addElement(fZWJ, status); classNames.push_back("fZWJ");
2861 // TODO: fOP30 & fCP30 overlap with plain fOP. Probably OK, but fOP/CP chars will be over-represented.
2862 fSets->addElement(fOP30, status); classNames.push_back("fOP30");
2863 fSets->addElement(fCP30, status); classNames.push_back("fCP30");
2864
2865 const char *rules =
2866 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
2867 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
2868 "((\\p{Line_Break=IS})(\\p{Line_Break=CM}|\\u200d)*)?"
2869 "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
2870 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
2871 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
2872 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
2873
2874 fNumberMatcher = new RegexMatcher(
2875 UnicodeString(rules, -1, US_INV), 0, status);
2876
2877 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2878
2879 if (U_FAILURE(status)) {
2880 deferredStatus = status;
2881 }
2882
2883 }
2884
2885
setText(const UnicodeString & s)2886 void RBBILineMonkey::setText(const UnicodeString &s) {
2887 fText = &s;
2888 fCharBI->setText(s);
2889 prepareAppliedRules(s.length());
2890 fNumberMatcher->reset(s);
2891 }
2892
2893 //
2894 // rule9Adjust
2895 // Line Break TR rules 9 and 10 implementation.
// This deals with combining marks and other sequences
// that must be treated as if they were something other than what they actually are.
2898 //
2899 // This is factored out into a separate function because it must be applied twice for
2900 // each potential break, once to the chars before the position being checked, then
2901 // again to the text following the possible break.
2902 //
rule9Adjust(int32_t pos,UChar32 * posChar,int32_t * nextPos,UChar32 * nextChar)2903 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2904 if (pos == -1) {
2905 // Invalid initial position. Happens during the warmup iteration of the
2906 // main loop in next().
2907 return;
2908 }
2909
2910 int32_t nPos = *nextPos;
2911
2912 // LB 9 Keep combining sequences together.
2913 // advance over any CM class chars. Note that Line Break CM is different
2914 // from the normal Grapheme Extend property.
2915 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2916 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2917 for (;;) {
2918 *nextChar = fText->char32At(nPos);
2919 if (!fCM->contains(*nextChar)) {
2920 break;
2921 }
2922 nPos = fText->moveIndex32(nPos, 1);
2923 }
2924 }
2925
2926
2927 // LB 9 Treat X CM* as if it were x.
2928 // No explicit action required.
2929
2930 // LB 10 Treat any remaining combining mark as AL
2931 if (fCM->contains(*posChar)) {
2932 *posChar = u'A';
2933 }
2934
2935 // Push the updated nextPos and nextChar back to our caller.
2936 // This only makes a difference if posChar got bigger by consuming a
2937 // combining sequence.
2938 *nextPos = nPos;
2939 *nextChar = fText->char32At(nPos);
2940 }
2941
2942
2943
next(int32_t startPos)2944 int32_t RBBILineMonkey::next(int32_t startPos) {
2945 UErrorCode status = U_ZERO_ERROR;
2946 int32_t pos; // Index of the char following a potential break position
2947 UChar32 thisChar; // Character at above position "pos"
2948
2949 int32_t prevPos; // Index of the char preceding a potential break position
2950 UChar32 prevChar; // Character at above position. Note that prevChar
2951 // and thisChar may not be adjacent because combining
2952 // characters between them will be ignored.
2953
2954 int32_t prevPosX2; // Second previous character. Wider context for LB21a.
2955 UChar32 prevCharX2;
2956
2957 int32_t nextPos; // Index of the next character following pos.
2958 // Usually skips over combining marks.
2959 int32_t nextCPPos; // Index of the code point following "pos."
2960 // May point to a combining mark.
2961 int32_t tPos; // temp value.
2962 UChar32 c;
2963
2964 if (U_FAILURE(deferredStatus)) {
2965 return -1;
2966 }
2967
2968 if (startPos >= fText->length()) {
2969 return -1;
2970 }
2971
2972
2973 // Initial values for loop. Loop will run the first time without finding breaks,
2974 // while the invalid values shift out and the "this" and
2975 // "prev" positions are filled in with good values.
2976 pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration.
2977 thisChar = prevChar = prevCharX2 = 0;
2978 nextPos = nextCPPos = startPos;
2979
2980
2981 // Loop runs once per position in the test text, until a break position
2982 // is found.
2983 for (;;) {
2984 prevPosX2 = prevPos;
2985 prevCharX2 = prevChar;
2986
2987 prevPos = pos;
2988 prevChar = thisChar;
2989
2990 pos = nextPos;
2991 thisChar = fText->char32At(pos);
2992
2993 nextCPPos = fText->moveIndex32(pos, 1);
2994 nextPos = nextCPPos;
2995
2996
2997 if (pos >= fText->length()) {
2998 setAppliedRule(pos, "LB2 - Break at end of text.");
2999 break;
3000 }
3001
3002
3003 // We do this one out-of-order because the adjustment does not change anything
3004 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3005 // be applied.
3006 rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
3007 nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3008 c = fText->char32At(nextPos);
3009 rule9Adjust(pos, &thisChar, &nextPos, &c);
3010
3011 // If the loop is still warming up - if we haven't shifted the initial
3012 // -1 positions out of prevPos yet - loop back to advance the
3013 // position in the input without any further looking for breaks.
3014 if (prevPos == -1) {
3015 setAppliedRule(pos, "LB 9 - adjust for combining sequences.");
3016 continue;
3017 }
3018
3019
3020 if (fBK->contains(prevChar)) {
3021 setAppliedRule(pos, "LB 4 Always break after hard line breaks");
3022 break;
3023 }
3024
3025
3026 if (prevChar == 0x0d && thisChar == 0x0a) {
3027 setAppliedRule(pos, "LB 5 Break after CR, LF, NL, but not inside CR LF");
3028 continue;
3029 }
3030 if (prevChar == 0x0d ||
3031 prevChar == 0x0a ||
3032 prevChar == 0x85) {
3033 setAppliedRule(pos, "LB 5 Break after CR, LF, NL, but not inside CR LF");
3034 break;
3035 }
3036
3037
3038 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3039 fBK->contains(thisChar)) {
3040 setAppliedRule(pos, "LB 6 Don't break before hard line breaks");
3041 continue;
3042 }
3043
3044
3045 if (fSP->contains(thisChar)) {
3046 setAppliedRule(pos, "LB 7 Don't break before spaces or zero-width space.");
3047 continue;
3048 }
3049
3050 // !!! ??? Is this the right text for the applied rule?
3051 if (fZW->contains(thisChar)) {
3052 setAppliedRule(pos, "LB 7 Don't break before spaces or zero-width space.");
3053 continue;
3054 }
3055
3056
3057 // ZW SP* ÷
3058 // Scan backwards from prevChar for SP* ZW
3059 tPos = prevPos;
3060 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3061 tPos = fText->moveIndex32(tPos, -1);
3062 }
3063 if (fZW->contains(fText->char32At(tPos))) {
3064 setAppliedRule(pos, "LB 8 Break after zero width space");
3065 break;
3066 }
3067
3068
3069 // Move this test up, before LB8a, because numbers can match a longer sequence that would
3070 // also match 8a. e.g. NU ZWJ IS PO (ZWJ acts like CM)
3071 if (fNumberMatcher->lookingAt(prevPos, status)) {
3072 if (U_FAILURE(status)) {
3073 setAppliedRule(pos, "LB 25 Numbers");
3074 break;
3075 }
3076 // Matched a number. But could have been just a single digit, which would
3077 // not represent a "no break here" between prevChar and thisChar
3078 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
3079 if (numEndIdx > pos) {
3080 // Number match includes at least our two chars being checked
3081 if (numEndIdx > nextPos) {
3082 // Number match includes additional chars. Update pos and nextPos
3083 // so that next loop iteration will continue at the end of the number,
3084 // checking for breaks between last char in number & whatever follows.
3085 pos = nextPos = numEndIdx;
3086 do {
3087 pos = fText->moveIndex32(pos, -1);
3088 thisChar = fText->char32At(pos);
3089 } while (fCM->contains(thisChar));
3090 }
3091 setAppliedRule(pos, "LB 25 Numbers");
3092 continue;
3093 }
3094 }
3095
3096
3097 // The monkey test's way of ignoring combining characters doesn't work
3098 // for this rule. ZJ is also a CM. Need to get the actual character
3099 // preceding "thisChar", not ignoring combining marks, possibly ZJ.
3100 {
3101 int32_t prevIdx = fText->moveIndex32(pos, -1);
3102 UChar32 prevC = fText->char32At(prevIdx);
3103 if (fZWJ->contains(prevC)) {
3104 setAppliedRule(pos, "LB 8a ZWJ x");
3105 continue;
3106 }
3107 }
3108
3109
3110 // appliedRule: "LB 9, 10"; // Already done, at top of loop.";
3111 //
3112
3113
3114 // x WJ
3115 // WJ x
3116 //
3117 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3118 setAppliedRule(pos, "LB 11 Do not break before or after WORD JOINER and related characters.");
3119 continue;
3120 }
3121
3122
3123 if (fGL->contains(prevChar)) {
3124 setAppliedRule(pos, "LB 12 GL x");
3125 continue;
3126 }
3127
3128
3129 if (!(fSP->contains(prevChar) ||
3130 fBA->contains(prevChar) ||
3131 fHY->contains(prevChar) ) && fGL->contains(thisChar)) {
3132 setAppliedRule(pos, "LB 12a [^SP BA HY] x GL");
3133 continue;
3134 }
3135
3136
3137 if (fCL->contains(thisChar) ||
3138 fCP->contains(thisChar) ||
3139 fEX->contains(thisChar) ||
3140 fSY->contains(thisChar)) {
3141 setAppliedRule(pos, "LB 13 Don't break before closings.");
3142 continue;
3143 }
3144
3145
3146 // Scan backwards, checking for this sequence.
3147 // The OP char could include combining marks, so we actually check for
3148 // OP CM* SP*
3149 // Another Twist: The Rule 9 fixes may have changed a SP CM
3150 // sequence into a ID char, so before scanning back through spaces,
3151 // verify that prevChar is indeed a space. The prevChar variable
3152 // may differ from fText[prevPos]
3153 tPos = prevPos;
3154 if (fSP->contains(prevChar)) {
3155 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3156 tPos=fText->moveIndex32(tPos, -1);
3157 }
3158 }
3159 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3160 tPos=fText->moveIndex32(tPos, -1);
3161 }
3162 if (fOP->contains(fText->char32At(tPos))) {
3163 setAppliedRule(pos, "LB 14 Don't break after OP SP*");
3164 continue;
3165 }
3166
3167
3168 if (nextPos < fText->length()) {
3169 // note: UnicodeString::char32At(length) returns ffff, not distinguishable
3170 // from a legit ffff character. So test length separately.
3171 UChar32 nextChar = fText->char32At(nextPos);
3172 if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) {
3173 setAppliedRule(pos, "LB 14a Break before an IS that begins a number and follows a space");
3174 break;
3175 }
3176 }
3177
3178
3179 if (fIS->contains(thisChar)) {
3180 setAppliedRule(pos, "LB 14b Do not break before numeric separators, even after spaces.");
3181 continue;
3182 }
3183
3184
3185 if (fOP->contains(thisChar)) {
3186 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3187 int tPos = prevPos;
3188 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3189 tPos = fText->moveIndex32(tPos, -1);
3190 }
3191 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3192 tPos = fText->moveIndex32(tPos, -1);
3193 }
3194 if (fQU->contains(fText->char32At(tPos))) {
3195 setAppliedRule(pos, "LB 15 QU SP* x OP");
3196 continue;
3197 }
3198 }
3199
3200
3201 // Scan backwards for SP* CM* (CL | CP)
3202 if (fNS->contains(thisChar)) {
3203 int tPos = prevPos;
3204 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3205 tPos = fText->moveIndex32(tPos, -1);
3206 }
3207 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3208 tPos = fText->moveIndex32(tPos, -1);
3209 }
3210 if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3211 setAppliedRule(pos, "LB 16 (CL | CP) SP* x NS");
3212 continue;
3213 }
3214 }
3215
3216
3217 if (fB2->contains(thisChar)) {
3218 // Scan backwards, checking for the B2 CM* SP* sequence.
3219 tPos = prevPos;
3220 if (fSP->contains(prevChar)) {
3221 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3222 tPos=fText->moveIndex32(tPos, -1);
3223 }
3224 }
3225 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3226 tPos=fText->moveIndex32(tPos, -1);
3227 }
3228 if (fB2->contains(fText->char32At(tPos))) {
3229 setAppliedRule(pos, "LB 17 B2 SP* x B2");
3230 continue;
3231 }
3232 }
3233
3234
3235 if (fSP->contains(prevChar)) {
3236 setAppliedRule(pos, "LB 18 break after space");
3237 break;
3238 }
3239
3240 // x QU
3241 // QU x
3242 if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3243 setAppliedRule(pos, "LB 19");
3244 continue;
3245 }
3246
3247 if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3248 setAppliedRule(pos, "LB 20 Break around a CB");
3249 break;
3250 }
3251
3252 // Don't break between Hyphens and letters if a break precedes the hyphen.
3253 // Formerly this was a Finnish tailoring.
3254 // Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
3255 // ^($HY | $HH) $AL;
3256 if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
3257 prevPosX2 == -1) {
3258 setAppliedRule(pos, "LB 20.09");
3259 continue;
3260 }
3261
3262 if (fBA->contains(thisChar) ||
3263 fHY->contains(thisChar) ||
3264 fNS->contains(thisChar) ||
3265 fBB->contains(prevChar) ) {
3266 setAppliedRule(pos, "LB 21");
3267 continue;
3268 }
3269
3270 if (fHL->contains(prevCharX2) &&
3271 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3272 setAppliedRule(pos, "LB 21a HL (HY | BA) x");
3273 continue;
3274 }
3275
3276 if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3277 setAppliedRule(pos, "LB 21b SY x HL");
3278 continue;
3279 }
3280
3281 if (fIN->contains(thisChar)) {
3282 setAppliedRule(pos, "LB 22");
3283 continue;
3284 }
3285
3286
3287 // (AL | HL) x NU
3288 // NU x (AL | HL)
3289 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
3290 setAppliedRule(pos, "LB 23");
3291 continue;
3292 }
3293 if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3294 setAppliedRule(pos, "LB 23");
3295 continue;
3296 }
3297
3298 // Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3299 // PR x (ID | EB | EM)
3300 // (ID | EB | EM) x PO
3301 if (fPR->contains(prevChar) &&
3302 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar))) {
3303 setAppliedRule(pos, "LB 23a");
3304 continue;
3305 }
3306 if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
3307 fPO->contains(thisChar)) {
3308 setAppliedRule(pos, "LB 23a");
3309 continue;
3310 }
3311
3312 // Do not break between prefix and letters or ideographs.
3313 // (PR | PO) x (AL | HL)
3314 // (AL | HL) x (PR | PO)
3315 if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
3316 (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3317 setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3318 continue;
3319 }
3320 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
3321 (fPR->contains(thisChar) || fPO->contains(thisChar))) {
3322 setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3323 continue;
3324 }
3325
3326 // appliedRule: "LB 25 numbers match"; // moved up, before LB 8a,
3327
3328 if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3329 fJV->contains(thisChar) ||
3330 fH2->contains(thisChar) ||
3331 fH3->contains(thisChar))) {
3332 setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3333 continue;
3334 }
3335
3336 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) &&
3337 (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3338 setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3339 continue;
3340 }
3341
3342 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3343 fJT->contains(thisChar)) {
3344 setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3345 continue;
3346 }
3347
3348 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3349 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3350 fIN->contains(thisChar)) {
3351 setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3352 continue;
3353 }
3354 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3355 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3356 fPO->contains(thisChar)) {
3357 setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3358 continue;
3359 }
3360 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3361 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3362 setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3363 continue;
3364 }
3365
3366
3367
3368 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3369 setAppliedRule(pos, "LB 28 Do not break between alphabetics (\"at\").");
3370 continue;
3371 }
3372
3373 if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3374 setAppliedRule(pos, "LB 29 Do not break between numeric punctuation and alphabetics (\"e.g.\").");
3375 continue;
3376 }
3377
3378 // (AL | NU) x OP
3379 // CP x (AL | NU)
3380 if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP30->contains(thisChar)) {
3381 setAppliedRule(pos, "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3382 continue;
3383 }
3384 if (fCP30->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3385 setAppliedRule(pos, "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3386 continue;
3387 }
3388
3389 // RI x RI
3390 if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3391 setAppliedRule(pos, "LB30a RI RI ÷ RI");
3392 break;
3393 }
3394 if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3395 // Two Regional Indicators have been paired.
3396 // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
3397 // following RI. This is a hack.
3398 thisChar = -1;
3399 setAppliedRule(pos, "LB30a RI RI ÷ RI");
3400 continue;
3401 }
3402
3403 if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3404 setAppliedRule(pos, "LB30b Emoji Base x Emoji Modifier");
3405 continue;
3406 }
3407
3408 setAppliedRule(pos, "LB 31 Break everywhere else");
3409 break;
3410 }
3411
3412 return pos;
3413 }
3414
3415
charClasses()3416 UVector *RBBILineMonkey::charClasses() {
3417 return fSets;
3418 }
3419
3420
~RBBILineMonkey()3421 RBBILineMonkey::~RBBILineMonkey() {
3422 delete fSets;
3423
3424 delete fBK;
3425 delete fCR;
3426 delete fLF;
3427 delete fCM;
3428 delete fNL;
3429 delete fWJ;
3430 delete fZW;
3431 delete fGL;
3432 delete fCB;
3433 delete fSP;
3434 delete fB2;
3435 delete fBA;
3436 delete fBB;
3437 delete fHH;
3438 delete fHY;
3439 delete fH2;
3440 delete fH3;
3441 delete fCL;
3442 delete fCP;
3443 delete fEX;
3444 delete fIN;
3445 delete fJL;
3446 delete fJV;
3447 delete fJT;
3448 delete fNS;
3449 delete fOP;
3450 delete fQU;
3451 delete fIS;
3452 delete fNU;
3453 delete fPO;
3454 delete fPR;
3455 delete fSY;
3456 delete fAI;
3457 delete fAL;
3458 delete fCJ;
3459 delete fHL;
3460 delete fID;
3461 delete fRI;
3462 delete fSG;
3463 delete fXX;
3464 delete fEB;
3465 delete fEM;
3466 delete fZWJ;
3467 delete fOP30;
3468 delete fCP30;
3469
3470 delete fCharBI;
3471 delete fNumberMatcher;
3472 }
3473
3474
3475 //-------------------------------------------------------------------------------------------
3476 //
3477 // TestMonkey
3478 //
3479 // params
3480 // seed=nnnnn Random number starting seed.
3481 // Setting the seed allows errors to be reproduced.
3482 // loop=nnn Looping count. Controls running time.
3483 // -1: run forever.
3484 // 0 or greater: run length.
3485 //
3486 // type = char | word | line | sent | title
3487 //
3488 // Example:
3489 // intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3490 //
3491 //-------------------------------------------------------------------------------------------
3492
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)3493 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) {
3494 int32_t val = defaultVal;
3495 name.append(" *= *(-?\\d+)");
3496 UErrorCode status = U_ZERO_ERROR;
3497 RegexMatcher m(name, params, 0, status);
3498 if (m.find()) {
3499 // The param exists. Convert the string to an int.
3500 char valString[100];
3501 int32_t paramLength = m.end(1, status) - m.start(1, status);
3502 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3503 paramLength = (int32_t)(sizeof(valString)-2);
3504 }
3505 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3506 val = strtol(valString, NULL, 10);
3507
3508 // Delete this parameter from the params string.
3509 m.reset();
3510 params = m.replaceFirst("", status);
3511 }
3512 U_ASSERT(U_SUCCESS(status));
3513 return val;
3514 }
3515 #endif
3516
3517 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
testBreakBoundPreceding(RBBITest * test,UnicodeString ustr,BreakIterator * bi,int expected[],int expectedcount)3518 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3519 BreakIterator *bi,
3520 int expected[],
3521 int expectedcount)
3522 {
3523 int count = 0;
3524 int i = 0;
3525 int forward[50];
3526 bi->setText(ustr);
3527 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3528 forward[count] = i;
3529 if (count < expectedcount && expected[count] != i) {
3530 test->errln("%s:%d break forward test failed: expected %d but got %d",
3531 __FILE__, __LINE__, expected[count], i);
3532 break;
3533 }
3534 count ++;
3535 }
3536 if (count != expectedcount) {
3537 printStringBreaks(ustr, expected, expectedcount);
3538 test->errln("%s:%d break forward test failed: missed %d match",
3539 __FILE__, __LINE__, expectedcount - count);
3540 return;
3541 }
3542 // testing boundaries
3543 for (i = 1; i < expectedcount; i ++) {
3544 int j = expected[i - 1];
3545 if (!bi->isBoundary(j)) {
3546 printStringBreaks(ustr, expected, expectedcount);
3547 test->errln("%s:%d isBoundary() failed. Expected boundary at position %d",
3548 __FILE__, __LINE__, j);
3549 return;
3550 }
3551 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3552 if (bi->isBoundary(j)) {
3553 printStringBreaks(ustr, expected, expectedcount);
3554 test->errln("%s:%d isBoundary() failed. Not expecting boundary at position %d",
3555 __FILE__, __LINE__, j);
3556 return;
3557 }
3558 }
3559 }
3560
3561 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3562 count --;
3563 if (forward[count] != i) {
3564 printStringBreaks(ustr, expected, expectedcount);
3565 test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
3566 __FILE__, __LINE__, forward[count], i);
3567 break;
3568 }
3569 }
3570 if (count != 0) {
3571 printStringBreaks(ustr, expected, expectedcount);
3572 test->errln("break test previous() failed: missed a match");
3573 return;
3574 }
3575
3576 // testing preceding
3577 for (i = 0; i < expectedcount - 1; i ++) {
3578 // int j = expected[i] + 1;
3579 int j = ustr.moveIndex32(expected[i], 1);
3580 for (; j <= expected[i + 1]; j ++) {
3581 int32_t expectedPreceding = expected[i];
3582 int32_t actualPreceding = bi->preceding(j);
3583 if (actualPreceding != expectedPreceding) {
3584 printStringBreaks(ustr, expected, expectedcount);
3585 test->errln("%s:%d preceding(%d): expected %d, got %d",
3586 __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
3587 return;
3588 }
3589 }
3590 }
3591 }
3592 #endif
3593
TestWordBreaks(void)3594 void RBBITest::TestWordBreaks(void)
3595 {
3596 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3597
3598 Locale locale("en");
3599 UErrorCode status = U_ZERO_ERROR;
3600 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3601 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3602 // Replaced any C+J characters in a row with a random sequence of characters
3603 // of the same length to make our C+J segmentation not get in the way.
3604 static const char *strlist[] =
3605 {
3606 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3607 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3608 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3609 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3610 "\\uac00\\u3588\\u009c\\u0953\\u194b",
3611 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3612 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3613 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3614 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3615 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3616 "\\u2027\\U000e0067\\u0a47\\u00b7",
3617 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3618 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3619 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3620 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3621 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3622 "\\u0027\\u11af\\U000e0057\\u0602",
3623 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3624 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3625 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3626 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3627 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3628 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3629 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3630 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3631 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3632 "\\u18f4\\U000e0049\\u20e7\\u2027",
3633 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3634 "\\ua183\\u102d\\u0bec\\u003a",
3635 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3636 "\\u003a\\u0e57\\u0fad\\u002e",
3637 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3638 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3639 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3640 "\\u003a\\u0664\\u00b7\\u1fba",
3641 "\\u003b\\u0027\\u00b7\\u47a3",
3642 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3643 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3644 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3645 };
3646 int loop;
3647 if (U_FAILURE(status)) {
3648 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3649 return;
3650 }
3651 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3652 // printf("looping %d\n", loop);
3653 UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3654 // RBBICharMonkey monkey;
3655 RBBIWordMonkey monkey;
3656
3657 int expected[50];
3658 int expectedcount = 0;
3659
3660 monkey.setText(ustr);
3661 int i;
3662 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3663 expected[expectedcount ++] = i;
3664 }
3665
3666 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3667 }
3668 delete bi;
3669 #endif
3670 }
3671
TestWordBoundary(void)3672 void RBBITest::TestWordBoundary(void)
3673 {
3674 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3675 Locale locale("en");
3676 UErrorCode status = U_ZERO_ERROR;
3677 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3678 LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
3679 if (U_FAILURE(status)) {
3680 errcheckln(status, "%s:%d Creation of break iterator failed %s",
3681 __FILE__, __LINE__, u_errorName(status));
3682 return;
3683 }
3684 UChar str[50];
3685 static const char *strlist[] =
3686 {
3687 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3688 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3689 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3690 "\\u2027\\U000e0067\\u0a47\\u00b7",
3691 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3692 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3693 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3694 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3695 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3696 "\\u0027\\u11af\\U000e0057\\u0602",
3697 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3698 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3699 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3700 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3701 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3702 "\\U000e0065\\u302c\\u09ee\\U000e0068",
3703 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3704 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3705 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3706 "\\u58f4\\U000e0049\\u20e7\\u2027",
3707 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3708 "\\ua183\\u102d\\u0bec\\u003a",
3709 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3710 "\\u003a\\u0e57\\u0fad\\u002e",
3711 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3712 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3713 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3714 "\\u003a\\u0664\\u00b7\\u1fba",
3715 "\\u003b\\u0027\\u00b7\\u47a3",
3716 };
3717 int loop;
3718 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3719 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3720 UnicodeString ustr(str);
3721 int forward[50];
3722 int count = 0;
3723
3724 bi->setText(ustr);
3725 int prev = -1;
3726 for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
3727 ++count;
3728 if (count >= UPRV_LENGTHOF(forward)) {
3729 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
3730 __FILE__, __LINE__, loop, count, boundary);
3731 return;
3732 }
3733 forward[count] = boundary;
3734 if (boundary <= prev) {
3735 errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
3736 __FILE__, __LINE__, loop, prev, boundary);
3737 break;
3738 }
3739 for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
3740 if (bi->isBoundary(nonBoundary)) {
3741 printStringBreaks(ustr, forward, count);
3742 errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
3743 __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
3744 return;
3745 }
3746 }
3747 if (!bi->isBoundary(boundary)) {
3748 printStringBreaks(ustr, forward, count);
3749 errln("%s:%d happy boundary test failed: expected %d a boundary",
3750 __FILE__, __LINE__, boundary);
3751 return;
3752 }
3753 prev = boundary;
3754 }
3755 }
3756 }
3757
TestLineBreaks(void)3758 void RBBITest::TestLineBreaks(void)
3759 {
3760 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3761 Locale locale("en");
3762 UErrorCode status = U_ZERO_ERROR;
3763 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3764 const int32_t STRSIZE = 50;
3765 UChar str[STRSIZE];
3766 static const char *strlist[] =
3767 {
3768 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3769 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3770 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3771 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3772 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3773 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3774 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3775 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3776 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3777 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3778 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3779 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3780 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3781 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3782 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3783 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3784 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3785 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3786 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3787 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3788 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3789 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3790 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3791 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3792 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3793 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3794 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3795 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3796 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3797 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3798 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3799 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3800 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3801 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3802 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3803 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3804 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3805 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3806 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3807 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3808 };
3809 int loop;
3810 TEST_ASSERT_SUCCESS(status);
3811 if (U_FAILURE(status)) {
3812 return;
3813 }
3814 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3815 // printf("looping %d\n", loop);
3816 int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3817 if (t >= STRSIZE) {
3818 TEST_ASSERT(FALSE);
3819 continue;
3820 }
3821
3822
3823 UnicodeString ustr(str);
3824 RBBILineMonkey monkey;
3825 if (U_FAILURE(monkey.deferredStatus)) {
3826 continue;
3827 }
3828
3829 const int EXPECTEDSIZE = 50;
3830 int expected[EXPECTEDSIZE];
3831 int expectedcount = 0;
3832
3833 monkey.setText(ustr);
3834
3835 int i;
3836 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3837 if (expectedcount >= EXPECTEDSIZE) {
3838 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3839 return;
3840 }
3841 expected[expectedcount ++] = i;
3842 }
3843
3844 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3845 }
3846 delete bi;
3847 #endif
3848 }
3849
TestSentBreaks(void)3850 void RBBITest::TestSentBreaks(void)
3851 {
3852 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3853 Locale locale("en");
3854 UErrorCode status = U_ZERO_ERROR;
3855 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3856 UChar str[200];
3857 static const char *strlist[] =
3858 {
3859 "Now\ris\nthe\r\ntime\n\rfor\r\r",
3860 "This\n",
3861 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3862 "\"Sentence ending with a quote.\" Bye.",
3863 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
3864 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3865 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3866 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3867 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3868 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3869 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3870 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3871 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3872 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3873 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3874 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3875 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3876 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3877 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3878 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3879 };
3880 int loop;
3881 if (U_FAILURE(status)) {
3882 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3883 return;
3884 }
3885 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3886 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3887 UnicodeString ustr(str);
3888
3889 RBBISentMonkey monkey;
3890 if (U_FAILURE(monkey.deferredStatus)) {
3891 continue;
3892 }
3893
3894 const int EXPECTEDSIZE = 50;
3895 int expected[EXPECTEDSIZE];
3896 int expectedcount = 0;
3897
3898 monkey.setText(ustr);
3899
3900 int i;
3901 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3902 if (expectedcount >= EXPECTEDSIZE) {
3903 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3904 return;
3905 }
3906 expected[expectedcount ++] = i;
3907 }
3908
3909 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3910 }
3911 delete bi;
3912 #endif
3913 }
3914
TestMonkey()3915 void RBBITest::TestMonkey() {
3916 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3917
3918 UErrorCode status = U_ZERO_ERROR;
3919 int32_t loopCount = 500;
3920 int32_t seed = 1;
3921 UnicodeString breakType = "all";
3922 Locale locale("en");
3923 UBool useUText = FALSE;
3924
3925 if (quick == FALSE) {
3926 loopCount = 10000;
3927 }
3928
3929 if (fTestParams) {
3930 UnicodeString p(fTestParams);
3931 loopCount = getIntParam("loop", p, loopCount);
3932 seed = getIntParam("seed", p, seed);
3933
3934 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3935 if (m.find()) {
3936 breakType = m.group(1, status);
3937 m.reset();
3938 p = m.replaceFirst("", status);
3939 }
3940
3941 RegexMatcher u(" *utext", p, 0, status);
3942 if (u.find()) {
3943 useUText = TRUE;
3944 u.reset();
3945 p = u.replaceFirst("", status);
3946 }
3947
3948
3949 // m.reset(p);
3950 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3951 // Each option is stripped out of the option string as it is processed.
3952 // All options have been checked. The option string should have been completely emptied..
3953 char buf[100];
3954 p.extract(buf, sizeof(buf), NULL, status);
3955 buf[sizeof(buf)-1] = 0;
3956 errln("Unrecognized or extra parameter: %s\n", buf);
3957 return;
3958 }
3959
3960 }
3961
3962 if (breakType == "char" || breakType == "all") {
3963 RBBICharMonkey m;
3964 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3965 if (U_SUCCESS(status)) {
3966 RunMonkey(bi, m, "char", seed, loopCount, useUText);
3967 if (breakType == "all" && useUText==FALSE) {
3968 // Also run a quick test with UText when "all" is specified
3969 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3970 }
3971 }
3972 else {
3973 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3974 }
3975 delete bi;
3976 }
3977
3978 if (breakType == "word" || breakType == "all") {
3979 logln("Word Break Monkey Test");
3980 RBBIWordMonkey m;
3981 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3982 if (U_SUCCESS(status)) {
3983 RunMonkey(bi, m, "word", seed, loopCount, useUText);
3984 }
3985 else {
3986 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3987 }
3988 delete bi;
3989 }
3990
3991 if (breakType == "line" || breakType == "all") {
3992 logln("Line Break Monkey Test");
3993 RBBILineMonkey m;
3994 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3995 if (loopCount >= 10) {
3996 loopCount = loopCount / 5; // Line break runs slower than the others.
3997 }
3998 if (U_SUCCESS(status)) {
3999 RunMonkey(bi, m, "line", seed, loopCount, useUText);
4000 }
4001 else {
4002 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4003 }
4004 delete bi;
4005 }
4006
4007 if (breakType == "sent" || breakType == "all" ) {
4008 logln("Sentence Break Monkey Test");
4009 RBBISentMonkey m;
4010 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4011 if (loopCount >= 10) {
4012 loopCount = loopCount / 10; // Sentence runs slower than the other break types
4013 }
4014 if (U_SUCCESS(status)) {
4015 RunMonkey(bi, m, "sent", seed, loopCount, useUText);
4016 }
4017 else {
4018 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4019 }
4020 delete bi;
4021 }
4022
4023 #endif
4024 }
4025
4026 //
4027 // Run a RBBI monkey test. Common routine, for all break iterator types.
4028 // Parameters:
4029 // bi - the break iterator to use
4030 // mk - MonkeyKind, abstraction for obtaining expected results
4031 // name - Name of test (char, word, etc.) for use in error messages
4032 // seed - Seed for starting random number generator (parameter from user)
4033 // numIterations
4034 //
RunMonkey(BreakIterator * bi,RBBIMonkeyKind & mk,const char * name,uint32_t seed,int32_t numIterations,UBool useUText)4035 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
4036 int32_t numIterations, UBool useUText) {
4037
4038 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4039
4040 const int32_t TESTSTRINGLEN = 500;
4041 UnicodeString testText;
4042 int32_t numCharClasses;
4043 UVector *chClasses;
4044 int expectedCount = 0;
4045 char expectedBreaks[TESTSTRINGLEN*2 + 1];
4046 char forwardBreaks[TESTSTRINGLEN*2 + 1];
4047 char reverseBreaks[TESTSTRINGLEN*2+1];
4048 char isBoundaryBreaks[TESTSTRINGLEN*2+1];
4049 char followingBreaks[TESTSTRINGLEN*2+1];
4050 char precedingBreaks[TESTSTRINGLEN*2+1];
4051 int i;
4052 int loopCount = 0;
4053
4054
4055 m_seed = seed;
4056
4057 numCharClasses = mk.charClasses()->size();
4058 chClasses = mk.charClasses();
4059
4060 // Check for errors that occured during the construction of the MonkeyKind object.
4061 // Can't report them where they occured because errln() is a method coming from intlTest,
4062 // and is not visible outside of RBBITest :-(
4063 if (U_FAILURE(mk.deferredStatus)) {
4064 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4065 return;
4066 }
4067
4068 // Verify that the character classes all have at least one member.
4069 for (i=0; i<numCharClasses; i++) {
4070 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4071 if (s == NULL || s->size() == 0) {
4072 errln("Character Class #%d is null or of zero size.", i);
4073 return;
4074 }
4075 }
4076
4077 // For minimizing width of class name output.
4078 int classNameSize = mk.maxClassNameSize();
4079
4080 while (loopCount < numIterations || numIterations == -1) {
4081 if (numIterations == -1 && loopCount % 10 == 0) {
4082 // If test is running in an infinite loop, display a periodic tic so
4083 // we can tell that it is making progress.
4084 fprintf(stderr, ".");
4085 }
4086 // Save current random number seed, so that we can recreate the random numbers
4087 // for this loop iteration in event of an error.
4088 seed = m_seed;
4089
4090 // Populate a test string with data.
4091 testText.truncate(0);
4092 for (i=0; i<TESTSTRINGLEN; i++) {
4093 int32_t aClassNum = m_rand() % numCharClasses;
4094 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4095 int32_t charIdx = m_rand() % classSet->size();
4096 UChar32 c = classSet->charAt(charIdx);
4097 if (c < 0) { // TODO: deal with sets containing strings.
4098 errln("%s:%d c < 0", __FILE__, __LINE__);
4099 break;
4100 }
4101 // Do not assemble a supplementary character from randomly generated separate surrogates.
4102 // (It could be a dictionary character)
4103 if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4104 continue;
4105 }
4106
4107 testText.append(c);
4108 }
4109
4110 // Calculate the expected results for this test string and reset applied rules.
4111 mk.setText(testText);
4112
4113 memset(expectedBreaks, 0, sizeof(expectedBreaks));
4114 expectedBreaks[0] = 1;
4115 int32_t breakPos = 0;
4116 expectedCount = 0;
4117 for (;;) {
4118 breakPos = mk.next(breakPos);
4119 if (breakPos == -1) {
4120 break;
4121 }
4122 if (breakPos > testText.length()) {
4123 errln("breakPos > testText.length()");
4124 }
4125 expectedBreaks[breakPos] = 1;
4126 U_ASSERT(expectedCount<testText.length());
4127 }
4128
4129 // Find the break positions using forward iteration
4130 memset(forwardBreaks, 0, sizeof(forwardBreaks));
4131 if (useUText) {
4132 UErrorCode status = U_ZERO_ERROR;
4133 UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4134 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4135 bi->setText(testUText, status);
4136 TEST_ASSERT_SUCCESS(status);
4137 utext_close(testUText); // The break iterator does a shallow clone of the UText
4138 // This UText can be closed immediately, so long as the
4139 // testText string continues to exist.
4140 } else {
4141 bi->setText(testText);
4142 }
4143
4144 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4145 if (i < 0 || i > testText.length()) {
4146 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4147 break;
4148 }
4149 forwardBreaks[i] = 1;
4150 }
4151
4152 // Find the break positions using reverse iteration
4153 memset(reverseBreaks, 0, sizeof(reverseBreaks));
4154 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4155 if (i < 0 || i > testText.length()) {
4156 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4157 break;
4158 }
4159 reverseBreaks[i] = 1;
4160 }
4161
4162 // Find the break positions using isBoundary() tests.
4163 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4164 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4165 for (i=0; i<=testText.length(); i++) {
4166 isBoundaryBreaks[i] = bi->isBoundary(i);
4167 }
4168
4169
4170 // Find the break positions using the following() function.
4171 // printf(".");
4172 memset(followingBreaks, 0, sizeof(followingBreaks));
4173 int32_t lastBreakPos = 0;
4174 followingBreaks[0] = 1;
4175 for (i=0; i<testText.length(); i++) {
4176 breakPos = bi->following(i);
4177 if (breakPos <= i ||
4178 breakPos < lastBreakPos ||
4179 breakPos > testText.length() ||
4180 (breakPos > lastBreakPos && lastBreakPos > i)) {
4181 errln("%s break monkey test: "
4182 "Out of range value returned by BreakIterator::following().\n"
4183 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4184 name, seed, i, breakPos, lastBreakPos);
4185 break;
4186 }
4187 followingBreaks[breakPos] = 1;
4188 lastBreakPos = breakPos;
4189 }
4190
4191 // Find the break positions using the preceding() function.
4192 memset(precedingBreaks, 0, sizeof(precedingBreaks));
4193 lastBreakPos = testText.length();
4194 precedingBreaks[testText.length()] = 1;
4195 for (i=testText.length(); i>0; i--) {
4196 breakPos = bi->preceding(i);
4197 if (breakPos >= i ||
4198 breakPos > lastBreakPos ||
4199 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4200 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4201 errln("%s break monkey test: "
4202 "Out of range value returned by BreakIterator::preceding().\n"
4203 "index=%d; prev returned %d; lastBreak=%d" ,
4204 name, i, breakPos, lastBreakPos);
4205 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4206 precedingBreaks[i] = 2; // Forces an error.
4207 }
4208 } else {
4209 if (breakPos >= 0) {
4210 precedingBreaks[breakPos] = 1;
4211 }
4212 lastBreakPos = breakPos;
4213 }
4214 }
4215
4216 // Compare the expected and actual results.
4217 for (i=0; i<=testText.length(); i++) {
4218 const char *errorType = NULL;
4219 const char* currentBreakData = NULL;
4220 if (forwardBreaks[i] != expectedBreaks[i]) {
4221 errorType = "next()";
4222 currentBreakData = forwardBreaks;
4223 } else if (reverseBreaks[i] != forwardBreaks[i]) {
4224 errorType = "previous()";
4225 currentBreakData = reverseBreaks;
4226 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4227 errorType = "isBoundary()";
4228 currentBreakData = isBoundaryBreaks;
4229 } else if (followingBreaks[i] != expectedBreaks[i]) {
4230 errorType = "following()";
4231 currentBreakData = followingBreaks;
4232 } else if (precedingBreaks[i] != expectedBreaks[i]) {
4233 errorType = "preceding()";
4234 currentBreakData = precedingBreaks;
4235 }
4236
4237 if (errorType != NULL) {
4238 // Format a range of the test text that includes the failure as
4239 // a data item that can be included in the rbbi test data file.
4240
4241 // Start of the range is the last point where expected and actual results
4242 // both agreed that there was a break position.
4243
4244 int startContext = i;
4245 int32_t count = 0;
4246 for (;;) {
4247 if (startContext==0) { break; }
4248 startContext --;
4249 if (expectedBreaks[startContext] != 0) {
4250 if (count == 2) break;
4251 count ++;
4252 }
4253 }
4254
4255 // End of range is two expected breaks past the start position.
4256 int endContext = i + 1;
4257 int ci;
4258 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
4259 for (;;) {
4260 if (endContext >= testText.length()) {break;}
4261 if (expectedBreaks[endContext-1] != 0) {
4262 if (count == 0) break;
4263 count --;
4264 }
4265 endContext ++;
4266 }
4267 }
4268
4269 // Formatting of each line includes:
4270 // character code
4271 // reference break: '|' -> a break, '.' -> no break
4272 // actual break: '|' -> a break, '.' -> no break
4273 // (name of character clase)
4274 // Unicode name of character
4275 // '-->' indicates location of the difference.
4276
4277 MONKEY_ERROR(
4278 (expectedBreaks[i] ? "Break expected but not found" :
4279 "Break found but not expected"),
4280 name, i, seed);
4281
4282 for (ci=startContext; (ci = testText.moveIndex32(ci, 1));) {
4283 UChar32 c;
4284 c = testText.char32At(ci);
4285
4286 std::string currentLineFlag = " ";
4287 if (ci == i) {
4288 currentLineFlag = "-->"; // Error position
4289 }
4290
4291 // BMP or SMP character in hex
4292 char hexCodePoint[12];
4293 std::string format = " \\u%04x";
4294 if (c >= 0x10000) {
4295 format = "\\U%08x";
4296 }
4297 sprintf(hexCodePoint, format.c_str(), c);
4298
4299 // Get the class name and character name for the character.
4300 char cName[200];
4301 UErrorCode status = U_ZERO_ERROR;
4302 u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
4303
4304 char buffer[200];
4305 auto ret = snprintf(buffer, UPRV_LENGTHOF(buffer),
4306 "%4s %3i : %1s %1s %10s %-*s %-40s %-40s",
4307 currentLineFlag.c_str(),
4308 ci,
4309 expectedBreaks[ci] == 0 ? "." : "|", // Reference break
4310 currentBreakData[ci] == 0 ? "." : "|", // Actual break
4311 hexCodePoint,
4312 classNameSize,
4313 mk.classNameFromCodepoint(c).c_str(),
4314 mk.getAppliedRule(ci).c_str(), cName);
4315 (void)ret;
4316 U_ASSERT(0 <= ret && ret < UPRV_LENGTHOF(buffer));
4317
4318 // Output the error
4319 if (ci == i) {
4320 errln(buffer);
4321 } else {
4322 infoln(buffer);
4323 }
4324
4325 if (ci >= endContext) { break; }
4326 }
4327 break;
4328 }
4329 }
4330
4331 loopCount++;
4332 }
4333 #endif
4334 }
4335
4336
4337 // Bug 5532. UTF-8 based UText fails in dictionary code.
4338 // This test checks the initial patch,
4339 // which is to just keep it from crashing. Correct word boundaries
4340 // await a proper fix to the dictionary code.
4341 //
TestBug5532(void)4342 void RBBITest::TestBug5532(void) {
4343 // Text includes a mixture of Thai and Latin.
4344 const unsigned char utf8Data[] = {
4345 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4346 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4347 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4348 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4349 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4350 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4351 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4352 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4353 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4354 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4355 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4356
4357 UErrorCode status = U_ZERO_ERROR;
4358 UText utext=UTEXT_INITIALIZER;
4359 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4360 TEST_ASSERT_SUCCESS(status);
4361
4362 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4363 TEST_ASSERT_SUCCESS(status);
4364 if (U_SUCCESS(status)) {
4365 bi->setText(&utext, status);
4366 TEST_ASSERT_SUCCESS(status);
4367
4368 int32_t breakCount = 0;
4369 int32_t previousBreak = -1;
4370 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4371 // For now, just make sure that the break iterator doesn't hang.
4372 TEST_ASSERT(previousBreak < bi->current());
4373 previousBreak = bi->current();
4374 }
4375 TEST_ASSERT(breakCount > 0);
4376 }
4377 delete bi;
4378 utext_close(&utext);
4379 }
4380
4381
TestBug9983(void)4382 void RBBITest::TestBug9983(void) {
4383 UnicodeString text = UnicodeString("\\u002A" // * Other
4384 "\\uFF65" // Other
4385 "\\u309C" // Katakana
4386 "\\uFF9F" // Extend
4387 "\\uFF65" // Other
4388 "\\u0020" // Other
4389 "\\u0000").unescape();
4390
4391 UErrorCode status = U_ZERO_ERROR;
4392 LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4393 BreakIterator::createWordInstance(Locale::getRoot(), status)));
4394 TEST_ASSERT_SUCCESS(status);
4395 LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4396 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4397 TEST_ASSERT_SUCCESS(status);
4398 if (U_FAILURE(status)) {
4399 return;
4400 }
4401 int32_t offset, rstatus, iterationCount;
4402
4403 brkiter->setText(text);
4404 brkiter->last();
4405 iterationCount = 0;
4406 while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4407 iterationCount++;
4408 rstatus = brkiter->getRuleStatus();
4409 (void)rstatus; // Suppress set but not used warning.
4410 if (iterationCount >= 10) {
4411 break;
4412 }
4413 }
4414 TEST_ASSERT(iterationCount == 6);
4415
4416 brkiterPOSIX->setText(text);
4417 brkiterPOSIX->last();
4418 iterationCount = 0;
4419 while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4420 iterationCount++;
4421 rstatus = brkiterPOSIX->getRuleStatus();
4422 (void)rstatus; // Suppress set but not used warning.
4423 if (iterationCount >= 10) {
4424 break;
4425 }
4426 }
4427 TEST_ASSERT(iterationCount == 6);
4428 }
4429
4430 // Bug 7547 - verify that building a break itereator from empty rules produces an error.
4431 //
TestBug7547()4432 void RBBITest::TestBug7547() {
4433 UnicodeString rules;
4434 UErrorCode status = U_ZERO_ERROR;
4435 UParseError parseError;
4436 RuleBasedBreakIterator breakIterator(rules, parseError, status);
4437 if (status != U_BRK_RULE_SYNTAX) {
4438 errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4439 }
4440 if (parseError.line != 1 || parseError.offset != 0) {
4441 errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4442 }
4443 }
4444
4445
TestBug12797()4446 void RBBITest::TestBug12797() {
4447 UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4448 UErrorCode status = U_ZERO_ERROR;
4449 UParseError parseError;
4450 RuleBasedBreakIterator bi(rules, parseError, status);
4451 if (U_FAILURE(status)) {
4452 errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4453 return;
4454 }
4455 UnicodeString text = "abc";
4456 bi.setText(text);
4457 bi.first();
4458 int32_t boundary = bi.next();
4459 if (boundary != 3) {
4460 errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4461 }
4462 }
4463
TestBug12918()4464 void RBBITest::TestBug12918() {
4465 // This test triggers an assertion failure in dictbe.cpp
4466 const UChar *crasherString = u"\u3325\u4a16";
4467 UErrorCode status = U_ZERO_ERROR;
4468 UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
4469 if (U_FAILURE(status)) {
4470 dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4471 return;
4472 }
4473 ubrk_first(iter);
4474 int32_t pos = 0;
4475 int32_t lastPos = -1;
4476 while((pos = ubrk_next(iter)) != UBRK_DONE) {
4477 if (pos <= lastPos) {
4478 errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4479 break;
4480 }
4481 }
4482 ubrk_close(iter);
4483 }
4484
TestBug12932()4485 void RBBITest::TestBug12932() {
4486 // Node Stack overflow in the RBBI rule parser caused a seg fault.
4487 UnicodeString ruleStr(
4488 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4489 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4490 "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4491 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4492 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4493 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4494
4495 UErrorCode status = U_ZERO_ERROR;
4496 UParseError parseError;
4497 RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
4498 if (status != U_BRK_RULE_SYNTAX) {
4499 errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4500 __FILE__, __LINE__, u_errorName(status));
4501 }
4502 }
4503
4504
// Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
//             remain undivided by ICU char, word and line break.
TestEmoji()4507 void RBBITest::TestEmoji() {
4508 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4509 UErrorCode status = U_ZERO_ERROR;
4510
4511 CharString testFileName;
4512 testFileName.append(IntlTest::getSourceTestData(status), status);
4513 testFileName.appendPathPart("emoji-test.txt", status);
4514 if (U_FAILURE(status)) {
4515 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4516 return;
4517 }
4518 logln("Opening data file %s\n", testFileName.data());
4519
4520 int len;
4521 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
4522 if (U_FAILURE(status) || testFile == NULL) {
4523 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4524 return;
4525 }
4526 UnicodeString testFileAsString(testFile, len);
4527 delete [] testFile;
4528
4529 RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
4530 RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
4531 // hexMatcher group(1) is a hex number, or empty string if no hex number present.
4532 int32_t lineNumber = 0;
4533
4534 LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4535 LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4536 LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
4537 if (U_FAILURE(status)) {
4538 dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
4539 return;
4540 }
4541
4542 while (lineMatcher.find()) {
4543 ++lineNumber;
4544 UnicodeString line = lineMatcher.group(status);
4545 hexMatcher.reset(line);
4546 UnicodeString testString; // accumulates the emoji sequence.
4547 while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
4548 UnicodeString hex = hexMatcher.group(1, status);
4549 if (hex.length() > 8) {
4550 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
4551 break;
4552 }
4553 CharString hex8;
4554 hex8.appendInvariantChars(hex, status);
4555 UChar32 c = (UChar32)strtol(hex8.data(), NULL, 16);
4556 if (c<=0x10ffff) {
4557 testString.append(c);
4558 } else {
4559 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4560 __FILE__, __LINE__, lineNumber, hex8.data());
4561 break;
4562 }
4563 }
4564
4565 if (testString.length() > 1) {
4566 charBreaks->setText(testString);
4567 charBreaks->first();
4568 int32_t firstBreak = charBreaks->next();
4569 if (testString.length() != firstBreak) {
4570 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4571 __FILE__, __LINE__, lineNumber, firstBreak);
4572 }
4573 wordBreaks->setText(testString);
4574 wordBreaks->first();
4575 firstBreak = wordBreaks->next();
4576 if (testString.length() != firstBreak) {
4577 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4578 __FILE__, __LINE__, lineNumber, firstBreak);
4579 }
4580 lineBreaks->setText(testString);
4581 lineBreaks->first();
4582 firstBreak = lineBreaks->next();
4583 if (testString.length() != firstBreak) {
4584 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4585 __FILE__, __LINE__, lineNumber, firstBreak);
4586 }
4587 }
4588 }
4589 #endif
4590 }
4591
4592
4593 // TestBug12519 - Correct handling of Locales by assignment / copy / clone
4594
TestBug12519()4595 void RBBITest::TestBug12519() {
4596 UErrorCode status = U_ZERO_ERROR;
4597 LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4598 LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status));
4599 if (!assertSuccess(WHERE, status)) {
4600 dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
4601 return;
4602 }
4603 assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
4604
4605 assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
4606 assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
4607
4608 LocalPointer<RuleBasedBreakIterator>cloneEn(biEn->clone());
4609 assertTrue(WHERE, *biEn == *cloneEn);
4610 assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
4611
4612 LocalPointer<RuleBasedBreakIterator>cloneFr(biFr->clone());
4613 assertTrue(WHERE, *biFr == *cloneFr);
4614 assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
4615
4616 LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status));
4617 UnicodeString text("Hallo Welt");
4618 biDe->setText(text);
4619 assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
4620 *biDe = *biFr;
4621 assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
4622 }
4623
TestBug12677()4624 void RBBITest::TestBug12677() {
4625 // Check that stripping of comments from rules for getRules() is not confused by
4626 // the presence of '#' characters in the rules that do not introduce comments.
4627 UnicodeString rules(u"!!forward; \n"
4628 "$x = [ab#]; # a set with a # literal. \n"
4629 " # .; # a comment that looks sort of like a rule. \n"
4630 " '#' '?'; # a rule with a quoted # \n"
4631 );
4632
4633 UErrorCode status = U_ZERO_ERROR;
4634 UParseError pe;
4635 RuleBasedBreakIterator bi(rules, pe, status);
4636 assertSuccess(WHERE, status);
4637 UnicodeString rtRules = bi.getRules();
4638 assertEquals(WHERE, UnicodeString(u"!!forward;$x=[ab#];'#''?';"), rtRules);
4639 }
4640
4641
TestTableRedundancies()4642 void RBBITest::TestTableRedundancies() {
4643 UErrorCode status = U_ZERO_ERROR;
4644
4645 LocalPointer<RuleBasedBreakIterator> bi (
4646 (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
4647 assertSuccess(WHERE, status);
4648 if (U_FAILURE(status)) return;
4649
4650 RBBIDataWrapper *dw = bi->fData;
4651 const RBBIStateTable *fwtbl = dw->fForwardTable;
4652 UBool in8Bits = fwtbl->fFlags & RBBI_8BITS_ROWS;
4653 int32_t numCharClasses = dw->fHeader->fCatCount;
4654 // printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
4655
4656 // Check for duplicate columns (character categories)
4657
4658 std::vector<UnicodeString> columns;
4659 for (int32_t column = 0; column < numCharClasses; column++) {
4660 UnicodeString s;
4661 for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
4662 RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4663 s.append(in8Bits ? row->r8.fNextState[column] : row->r16.fNextState[column]);
4664 }
4665 columns.push_back(s);
4666 }
4667 // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
4668 for (int c1=1; c1<numCharClasses; c1++) {
4669 int limit = c1 < (int)fwtbl->fDictCategoriesStart ? fwtbl->fDictCategoriesStart : numCharClasses;
4670 for (int c2 = c1+1; c2 < limit; c2++) {
4671 if (columns.at(c1) == columns.at(c2)) {
4672 errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
4673 goto out;
4674 }
4675 }
4676 }
4677 out:
4678
4679 // Check for duplicate states
4680 std::vector<UnicodeString> rows;
4681 for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
4682 UnicodeString s;
4683 RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4684 if (in8Bits) {
4685 s.append(row->r8.fAccepting);
4686 s.append(row->r8.fLookAhead);
4687 s.append(row->r8.fTagsIdx);
4688 for (int32_t column = 0; column < numCharClasses; column++) {
4689 s.append(row->r8.fNextState[column]);
4690 }
4691 } else {
4692 s.append(row->r16.fAccepting);
4693 s.append(row->r16.fLookAhead);
4694 s.append(row->r16.fTagsIdx);
4695 for (int32_t column = 0; column < numCharClasses; column++) {
4696 s.append(row->r16.fNextState[column]);
4697 }
4698 }
4699 rows.push_back(s);
4700 }
4701 for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
4702 for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
4703 if (rows.at(r1) == rows.at(r2)) {
4704 errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
4705 return;
4706 }
4707 }
4708 }
4709 }
4710
4711 // Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
4712 // even after next() has returned DONE.
4713
TestBug13447()4714 void RBBITest::TestBug13447() {
4715 UErrorCode status = U_ZERO_ERROR;
4716 LocalPointer<RuleBasedBreakIterator> bi(
4717 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4718 assertSuccess(WHERE, status);
4719 if (U_FAILURE(status)) return;
4720 UnicodeString data(u"1234");
4721 bi->setText(data);
4722 assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus());
4723 assertEquals(WHERE, 4, bi->next());
4724 assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4725 assertEquals(WHERE, UBRK_DONE, bi->next());
4726 assertEquals(WHERE, 4, bi->current());
4727 assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4728 }
4729
4730 // TestReverse exercises both the synthesized safe reverse rules and the logic
4731 // for filling the break iterator cache when starting from random positions
4732 // in the text.
4733 //
4734 // It's a monkey test, working on random data, with the expected data obtained
4735 // from forward iteration (no safe rules involved), comparing with results
4736 // when indexing into the interior of the string (safe rules needed).
4737
TestReverse()4738 void RBBITest::TestReverse() {
4739 UErrorCode status = U_ZERO_ERROR;
4740
4741 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4742 BreakIterator::createCharacterInstance(Locale::getEnglish(), status)));
4743 assertSuccess(WHERE, status, true);
4744 status = U_ZERO_ERROR;
4745 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4746 BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4747 assertSuccess(WHERE, status, true);
4748 status = U_ZERO_ERROR;
4749 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4750 BreakIterator::createLineInstance(Locale::getEnglish(), status)));
4751 assertSuccess(WHERE, status, true);
4752 status = U_ZERO_ERROR;
4753 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4754 BreakIterator::createSentenceInstance(Locale::getEnglish(), status)));
4755 assertSuccess(WHERE, status, true);
4756 }
4757
TestReverse(std::unique_ptr<RuleBasedBreakIterator> bi)4758 void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
4759 if (!bi) {
4760 return;
4761 }
4762
4763 // From the mapping trie in the break iterator's internal data, create a
4764 // vector of UnicodeStrings, one for each character category, containing
4765 // all of the code points that map to that category. Unicode planes 0 and 1 only,
4766 // to avoid an execess of unassigned code points.
4767
4768 RBBIDataWrapper *data = bi->fData;
4769 int32_t categoryCount = data->fHeader->fCatCount;
4770 UCPTrie *trie = data->fTrie;
4771 bool use8BitsTrie = ucptrie_getValueWidth(trie) == UCPTRIE_VALUE_BITS_8;
4772 uint32_t dictBit = use8BitsTrie ? 0x0080 : 0x4000;
4773
4774 std::vector<UnicodeString> strings(categoryCount, UnicodeString());
4775 for (int cp=0; cp<0x1fff0; ++cp) {
4776 int cat = ucptrie_get(trie, cp);
4777 cat &= ~dictBit; // And off the dictionary bit from the category.
4778 assertTrue(WHERE, cat < categoryCount && cat >= 0);
4779 if (cat < 0 || cat >= categoryCount) return;
4780 strings[cat].append(cp);
4781 }
4782
4783 icu_rand randomGen;
4784 const int testStringLength = 10000;
4785 UnicodeString testString;
4786
4787 for (int i=0; i<testStringLength; ++i) {
4788 int charClass = randomGen() % categoryCount;
4789 if (strings[charClass].length() > 0) {
4790 int cp = strings[charClass].char32At(randomGen() % strings[charClass].length());
4791 testString.append(cp);
4792 }
4793 }
4794
4795 typedef std::pair<UBool, int32_t> Result;
4796 std::vector<Result> expectedResults;
4797 bi->setText(testString);
4798 for (int i=0; i<testString.length(); ++i) {
4799 bool isboundary = bi->isBoundary(i);
4800 int ruleStatus = bi->getRuleStatus();
4801 expectedResults.push_back(std::make_pair(isboundary, ruleStatus));
4802 }
4803
4804 for (int i=testString.length()-1; i>=0; --i) {
4805 bi->setText(testString); // clears the internal break cache
4806 Result expected = expectedResults[i];
4807 assertEquals(WHERE, expected.first, bi->isBoundary(i));
4808 assertEquals(WHERE, expected.second, bi->getRuleStatus());
4809 }
4810 }
4811
4812
// Ticket 13692 - finding word boundaries in very large numbers or words could
//                be very time consuming. When the problem was present, this test
//                would run more than fifteen minutes, which is to say, the failure was noticeable.
4816
TestBug13692()4817 void RBBITest::TestBug13692() {
4818 UErrorCode status = U_ZERO_ERROR;
4819 LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4820 BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4821 if (!assertSuccess(WHERE, status, true)) {
4822 return;
4823 }
4824 constexpr int32_t LENGTH = 1000000;
4825 UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH);
4826 for (int i=0; i<20; i+=2) {
4827 longNumber.setCharAt(i, u' ');
4828 }
4829 bi->setText(longNumber);
4830 assertFalse(WHERE, bi->isBoundary(LENGTH-5));
4831 assertSuccess(WHERE, status);
4832 }
4833
4834
TestProperties()4835 void RBBITest::TestProperties() {
4836 UErrorCode errorCode = U_ZERO_ERROR;
4837 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4838 if (!prependSet.isEmpty()) {
4839 errln(
4840 "[:GCB=Prepend:] is not empty any more. "
4841 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4842 "change this test to the opposite condition.");
4843 }
4844 }
4845
4846
4847 //
4848 // TestDebug - A place-holder test for debugging purposes.
4849 // For putting in fragments of other tests that can be invoked
4850 // for tracing without a lot of unwanted extra stuff happening.
4851 //
TestDebug(void)4852 void RBBITest::TestDebug(void) {
4853 UErrorCode status = U_ZERO_ERROR;
4854 LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4855 BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4856 if (!assertSuccess(WHERE, status, true)) {
4857 return;
4858 }
4859 const UnicodeString &rules = bi->getRules();
4860 UParseError pe;
4861 LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
4862 assertSuccess(WHERE, status);
4863 }
4864
4865
4866 //
4867 // TestDebugRules A stub test for use in debugging rule compilation problems.
4868 // Can be freely altered as needed or convenient.
//                  Leave disabled - #ifdef'ed out - when not actively debugging. The rule source
//                  data files may not be available in all environments.
4871 // Any permanent test cases should be moved to rbbitst.txt
4872 // (see Bug 20303 in that file, for example), or to another test function in this file.
4873 //
void RBBITest::TestDebugRules() {
    // The whole body is compiled out; as written, this test does nothing.
#if 0
    // Inline rule source, used unless the assignment from testFile further
    // down is re-enabled.
    const char16_t *rules = u""
        "!!quoted_literals_only; \n"
        "!!chain; \n"
        "!!lookAheadHardBreak; \n"
        " \n"
        // "[a] / ; \n"
        "[a] [b] / [c] [d]; \n"
        "[a] [b] / [c] [d] {100}; \n"
        "[x] [a] [b] / [c] [d] {100}; \n"
        "[a] [b] [c] / [d] {100}; \n"
        //" [c] [d] / [e] [f]; \n"
        //"[a] [b] / [c]; \n"
        ;

    // Read brkitr/rules/line.txt from the data directory. Its content is only
    // used when the commented-out "rules = testFile.get()" line is enabled.
    UErrorCode status = U_ZERO_ERROR;
    CharString path(pathToDataDirectory(), status);
    path.appendPathPart("brkitr", status);
    path.appendPathPart("rules", status);
    path.appendPathPart("line.txt", status);
    int len;
    std::unique_ptr<UChar []> testFile(ReadAndConvertFile(path.data(), len, "UTF-8", status));
    if (!assertSuccess(WHERE, status)) {
        return;
    }

    UParseError pe;
    // rules = testFile.get();
    // Compile the rules; a failure is reported through assertSuccess() below.
    RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rules, pe, status);

    if (!assertSuccess(WHERE, status)) {
        delete bi;
        return;
    }
    // bi->dumpTables();

    delete bi;
#endif
}
4914
testTrieStateTable(int32_t numChar,bool expectedTrieWidthIn8Bits,bool expectedStateRowIn8Bits)4915 void RBBITest::testTrieStateTable(int32_t numChar, bool expectedTrieWidthIn8Bits, bool expectedStateRowIn8Bits) {
4916 UCPTrieValueWidth expectedTrieWidth = expectedTrieWidthIn8Bits ? UCPTRIE_VALUE_BITS_8 : UCPTRIE_VALUE_BITS_16;
4917 int32_t expectedStateRowBits = expectedStateRowIn8Bits ? RBBI_8BITS_ROWS : 0;
4918 // Text are duplicate characters from U+4E00 to U+4FFF
4919 UnicodeString text;
4920 for (UChar c = 0x4e00; c < 0x5000; c++) {
4921 text.append(c).append(c);
4922 }
4923 // Generate rule which will caused length+4 character classes and
4924 // length+3 states
4925 UnicodeString rules(u"!!quoted_literals_only;");
4926 for (UChar c = 0x4e00; c < 0x4e00 + numChar; c++) {
4927 rules.append(u'\'').append(c).append(c).append(u"';");
4928 }
4929 rules.append(u".;");
4930 UErrorCode status = U_ZERO_ERROR;
4931 UParseError parseError;
4932 RuleBasedBreakIterator bi(rules, parseError, status);
4933
4934 assertEquals(WHERE, numChar + 4, bi.fData->fHeader->fCatCount);
4935 assertEquals(WHERE, numChar + 3, bi.fData->fForwardTable->fNumStates);
4936 assertEquals(WHERE, expectedTrieWidth, ucptrie_getValueWidth(bi.fData->fTrie));
4937 assertEquals(WHERE, expectedStateRowBits, bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS);
4938 assertEquals(WHERE, expectedStateRowBits, bi.fData->fReverseTable->fFlags & RBBI_8BITS_ROWS);
4939
4940 bi.setText(text);
4941
4942 int32_t pos;
4943 int32_t i = 0;
4944 while ((pos = bi.next()) > 0) {
4945 // The first numChar should not break between the pair
4946 if (i++ < numChar) {
4947 assertEquals(WHERE, i * 2, pos);
4948 } else {
4949 // After the first numChar next(), break on each character.
4950 assertEquals(WHERE, i + numChar, pos);
4951 }
4952 }
4953 while ((pos = bi.previous()) > 0) {
4954 // The first numChar should not break between the pair
4955 if (--i < numChar) {
4956 assertEquals(WHERE, i * 2, pos);
4957 } else {
4958 // After the first numChar next(), break on each character.
4959 assertEquals(WHERE, i + numChar, pos);
4960 }
4961 }
4962 }
4963
Test8BitsTrieWith8BitStateTable()4964 void RBBITest::Test8BitsTrieWith8BitStateTable() {
4965 testTrieStateTable(251, true /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
4966 }
4967
Test16BitsTrieWith8BitStateTable()4968 void RBBITest::Test16BitsTrieWith8BitStateTable() {
4969 testTrieStateTable(252, false /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
4970 }
4971
Test16BitsTrieWith16BitStateTable()4972 void RBBITest::Test16BitsTrieWith16BitStateTable() {
4973 testTrieStateTable(253, false /* expectedTrieWidthIn8Bits */, false /* expectedStateRowIn8Bits */);
4974 }
4975
Test8BitsTrieWith16BitStateTable()4976 void RBBITest::Test8BitsTrieWith16BitStateTable() {
4977 // Test UCPTRIE_VALUE_BITS_8 with 16 bits rows. Use a different approach to
4978 // create state table in 16 bits.
4979
4980 // Generate 510 'a' as text
4981 UnicodeString text;
4982 for (int32_t i = 0; i < 510; i++) {
4983 text.append(u'a');
4984 }
4985
4986 UnicodeString rules(u"!!quoted_literals_only;'");
4987 // 254 'a' in the rule will cause 256 states
4988 for (int32_t i = 0; i < 254; i++) {
4989 rules.append(u'a');
4990 }
4991 rules.append(u"';.;");
4992
4993 UErrorCode status = U_ZERO_ERROR;
4994 UParseError parseError;
4995 LocalPointer<RuleBasedBreakIterator> bi(new RuleBasedBreakIterator(rules, parseError, status));
4996
4997 assertEquals(WHERE, 256, bi->fData->fForwardTable->fNumStates);
4998 assertEquals(WHERE, UCPTRIE_VALUE_BITS_8, ucptrie_getValueWidth(bi->fData->fTrie));
4999 assertEquals(WHERE,
5000 false, RBBI_8BITS_ROWS == (bi->fData->fForwardTable->fFlags & RBBI_8BITS_ROWS));
5001 bi->setText(text);
5002
5003 // break positions:
5004 // 254, 508, 509, ... 510
5005 assertEquals("next()", 254, bi->next());
5006 int32_t i = 0;
5007 int32_t pos;
5008 while ((pos = bi->next()) > 0) {
5009 assertEquals(WHERE, 508 + i , pos);
5010 i++;
5011 }
5012 i = 0;
5013 while ((pos = bi->previous()) > 0) {
5014 i++;
5015 if (pos >= 508) {
5016 assertEquals(WHERE, 510 - i , pos);
5017 } else {
5018 assertEquals(WHERE, 254 , pos);
5019 }
5020 }
5021 }
5022
5023 // Test that both compact (8 bit) and full sized (16 bit) rbbi tables work, and
5024 // that there are no problems with rules at the size that transitions between the two.
5025 //
5026 // A rule that matches a literal string, like 'abcdefghij', will require one state and
5027 // one character class per character in the string. So we can make a rule to tickle the
5028 // boundaries by using literal strings of various lengths.
5029 //
5030 // For both the number of states and the number of character classes, the eight bit format
5031 // only has 7 bits available, allowing for 128 values. For both, a few values are reserved,
5032 // leaving 120 something available. This test runs the string over the range of 120 - 130,
5033 // which allows some margin for changes to the number of values reserved by the rule builder
5034 // without breaking the test.
5035
TestTable_8_16_Bits()5036 void RBBITest::TestTable_8_16_Bits() {
5037
5038 // testStr serves as both the source of the rule string (truncated to the desired length)
5039 // and as test data to check matching behavior. A break rule consisting of the first 120
5040 // characters of testStr will match the first 120 chars of the full-length testStr.
5041 UnicodeString testStr;
5042 for (UChar c=0x3000; c<0x3200; ++c) {
5043 testStr.append(c);
5044 }
5045
5046 const int32_t startLength = 120; // The shortest rule string to test.
5047 const int32_t endLength = 260; // The longest rule string to test
5048 const int32_t increment = this->quick ? endLength - startLength : 1;
5049
5050 for (int32_t ruleLen=startLength; ruleLen <= endLength; ruleLen += increment) {
5051 UParseError parseError;
5052 UErrorCode status = U_ZERO_ERROR;
5053
5054 UnicodeString ruleString{u"!!quoted_literals_only; '#';"};
5055 ruleString.findAndReplace(UnicodeString(u"#"), UnicodeString(testStr, 0, ruleLen));
5056 RuleBasedBreakIterator bi(ruleString, parseError, status);
5057 if (!assertSuccess(WHERE, status)) {
5058 errln(ruleString);
5059 break;
5060 }
5061 // bi.dumpTables();
5062
5063 // Verify that the break iterator is functioning - that the first boundary found
5064 // in testStr is at the length of the rule string.
5065 bi.setText(testStr);
5066 assertEquals(WHERE, ruleLen, bi.next());
5067
5068 // Reverse iteration. Do a setText() first, to flush the break iterator's internal cache
5069 // of previously detected boundaries, thus forcing the engine to run the safe reverse rules.
5070 bi.setText(testStr);
5071 int32_t result = bi.preceding(ruleLen);
5072 assertEquals(WHERE, 0, result);
5073
5074 // Verify that the range of rule lengths being tested cover the transations
5075 // from 8 to 16 bit data.
5076 bool has8BitRowData = bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS;
5077 bool has8BitsTrie = ucptrie_getValueWidth(bi.fData->fTrie) == UCPTRIE_VALUE_BITS_8;
5078
5079 if (ruleLen == startLength) {
5080 assertEquals(WHERE, true, has8BitRowData);
5081 assertEquals(WHERE, true, has8BitsTrie);
5082 }
5083 if (ruleLen == endLength) {
5084 assertEquals(WHERE, false, has8BitRowData);
5085 assertEquals(WHERE, false, has8BitsTrie);
5086 }
5087 }
5088 }
5089
5090 /* Test handling of a large number of look-ahead rules.
5091 * The number of rules in the test exceeds the implementation limits prior to the
5092 * improvements introduced with #13590.
5093 *
5094 * The test look-ahead rules have the form "AB / CE"; "CD / EG"; ...
5095 * The text being matched is sequential, "ABCDEFGHI..."
5096 *
5097 * The upshot is that the look-ahead rules all match on their preceding context,
5098 * and consequently must save a potential result, but then fail to match on their
5099 * trailing context, so that they don't actually cause a boundary.
5100 *
5101 * Additionally, add a ".*" rule, so there are no boundaries unless a
5102 * look-ahead hard-break rule forces one.
5103 */
TestBug13590()5104 void RBBITest::TestBug13590() {
5105 UnicodeString rules {u"!!quoted_literals_only; !!chain; .*;\n"};
5106
5107 const int NUM_LOOKAHEAD_RULES = 50;
5108 const char16_t STARTING_CHAR = u'\u5000';
5109 char16_t firstChar;
5110 for (int ruleNum = 0; ruleNum < NUM_LOOKAHEAD_RULES; ++ruleNum) {
5111 firstChar = STARTING_CHAR + ruleNum*2;
5112 rules.append(u'\'') .append(firstChar) .append(firstChar+1) .append(u'\'')
5113 .append(u' ') .append(u'/') .append(u' ')
5114 .append(u'\'') .append(firstChar+2) .append(firstChar+4) .append(u'\'')
5115 .append(u';') .append(u'\n');
5116 }
5117
5118 // Change the last rule added from the form "UV / WY" to "UV / WX".
5119 // Changes the rule so that it will match - all 4 chars are in ascending sequence.
5120 rules.findAndReplace(UnicodeString(firstChar+4), UnicodeString(firstChar+3));
5121
5122 UErrorCode status = U_ZERO_ERROR;
5123 UParseError parseError;
5124 RuleBasedBreakIterator bi(rules, parseError, status);
5125 if (!assertSuccess(WHERE, status)) {
5126 errln(rules);
5127 return;
5128 }
5129 // bi.dumpTables();
5130
5131 UnicodeString testString;
5132 for (char16_t c = STARTING_CHAR-200; c < STARTING_CHAR + NUM_LOOKAHEAD_RULES*4; ++c) {
5133 testString.append(c);
5134 }
5135 bi.setText(testString);
5136
5137 int breaksFound = 0;
5138 while (bi.next() != UBRK_DONE) {
5139 ++breaksFound;
5140 }
5141
5142 // Two matches are expected, one from the last rule that was explicitly modified,
5143 // and one at the end of the text.
5144 assertEquals(WHERE, 2, breaksFound);
5145 }
5146
5147
5148 #if U_ENABLE_TRACING
// Records filled in by the trace callbacks below and emptied by SetupTestTrace().
// Only calls whose function number lies in the UTRACE_UBRK_START..UTRACE_UBRK_LIMIT
// range are recorded.
static std::vector<std::string> gData;     // first variadic argument of each traceData() call
static std::vector<int32_t> gEntryFn;      // function number of each traceEntry() call
static std::vector<int32_t> gExitFn;       // function number of each traceExit() call
static std::vector<int32_t> gDataFn;       // function number of each traceData() call
5153
traceData(const void *,int32_t fnNumber,int32_t,const char *,va_list args)5154 static void U_CALLCONV traceData(
5155 const void*,
5156 int32_t fnNumber,
5157 int32_t,
5158 const char *,
5159 va_list args) {
5160 if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5161 const char* data = va_arg(args, const char*);
5162 gDataFn.push_back(fnNumber);
5163 gData.push_back(data);
5164 }
5165 }
5166
traceEntry(const void *,int32_t fnNumber)5167 static void traceEntry(const void *, int32_t fnNumber) {
5168 if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5169 gEntryFn.push_back(fnNumber);
5170 }
5171 }
5172
traceExit(const void *,int32_t fnNumber,const char *,va_list)5173 static void traceExit(const void *, int32_t fnNumber, const char *, va_list) {
5174 if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5175 gExitFn.push_back(fnNumber);
5176 }
5177 }
5178
5179
assertTestTraceResult(int32_t fnNumber,const char * expectedData)5180 void RBBITest::assertTestTraceResult(int32_t fnNumber, const char* expectedData) {
5181 assertEquals("utrace_entry should be called ", 1, gEntryFn.size());
5182 assertEquals("utrace_entry should be called with ", fnNumber, gEntryFn[0]);
5183 assertEquals("utrace_exit should be called ", 1, gExitFn.size());
5184 assertEquals("utrace_exit should be called with ", fnNumber, gExitFn[0]);
5185
5186 if (expectedData == nullptr) {
5187 assertEquals("utrace_data should not be called ", 0, gDataFn.size());
5188 assertEquals("utrace_data should not be called ", 0, gData.size());
5189 } else {
5190 assertEquals("utrace_data should be called ", 1, gDataFn.size());
5191 assertEquals("utrace_data should be called with ", fnNumber, gDataFn[0]);
5192 assertEquals("utrace_data should be called ", 1, gData.size());
5193 assertEquals("utrace_data should pass in ", expectedData, gData[0].c_str());
5194 }
5195 }
5196
SetupTestTrace()5197 void SetupTestTrace() {
5198 gEntryFn.clear();
5199 gExitFn.clear();
5200 gDataFn.clear();
5201 gData.clear();
5202
5203 const void* context = nullptr;
5204 utrace_setFunctions(context, traceEntry, traceExit, traceData);
5205 utrace_setLevel(UTRACE_INFO);
5206 }
5207
TestTraceCreateCharacter(void)5208 void RBBITest::TestTraceCreateCharacter(void) {
5209 SetupTestTrace();
5210 IcuTestErrorCode status(*this, "TestTraceCreateCharacter");
5211 LocalPointer<BreakIterator> brkitr(
5212 BreakIterator::createCharacterInstance("zh-CN", status));
5213 status.errIfFailureAndReset();
5214 assertTestTraceResult(UTRACE_UBRK_CREATE_CHARACTER, nullptr);
5215 }
5216
TestTraceCreateTitle(void)5217 void RBBITest::TestTraceCreateTitle(void) {
5218 SetupTestTrace();
5219 IcuTestErrorCode status(*this, "TestTraceCreateTitle");
5220 LocalPointer<BreakIterator> brkitr(
5221 BreakIterator::createTitleInstance("zh-CN", status));
5222 status.errIfFailureAndReset();
5223 assertTestTraceResult(UTRACE_UBRK_CREATE_TITLE, nullptr);
5224 }
5225
TestTraceCreateSentence(void)5226 void RBBITest::TestTraceCreateSentence(void) {
5227 SetupTestTrace();
5228 IcuTestErrorCode status(*this, "TestTraceCreateSentence");
5229 LocalPointer<BreakIterator> brkitr(
5230 BreakIterator::createSentenceInstance("zh-CN", status));
5231 status.errIfFailureAndReset();
5232 assertTestTraceResult(UTRACE_UBRK_CREATE_SENTENCE, nullptr);
5233 }
5234
TestTraceCreateWord(void)5235 void RBBITest::TestTraceCreateWord(void) {
5236 SetupTestTrace();
5237 IcuTestErrorCode status(*this, "TestTraceCreateWord");
5238 LocalPointer<BreakIterator> brkitr(
5239 BreakIterator::createWordInstance("zh-CN", status));
5240 status.errIfFailureAndReset();
5241 assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5242 }
5243
TestTraceCreateLine(void)5244 void RBBITest::TestTraceCreateLine(void) {
5245 SetupTestTrace();
5246 IcuTestErrorCode status(*this, "TestTraceCreateLine");
5247 LocalPointer<BreakIterator> brkitr(
5248 BreakIterator::createLineInstance("zh-CN", status));
5249 status.errIfFailureAndReset();
5250 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "");
5251 }
5252
TestTraceCreateLineStrict(void)5253 void RBBITest::TestTraceCreateLineStrict(void) {
5254 SetupTestTrace();
5255 IcuTestErrorCode status(*this, "TestTraceCreateLineStrict");
5256 LocalPointer<BreakIterator> brkitr(
5257 BreakIterator::createLineInstance("zh-CN-u-lb-strict", status));
5258 status.errIfFailureAndReset();
5259 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "strict");
5260 }
5261
TestTraceCreateLineNormal(void)5262 void RBBITest::TestTraceCreateLineNormal(void) {
5263 SetupTestTrace();
5264 IcuTestErrorCode status(*this, "TestTraceCreateLineNormal");
5265 LocalPointer<BreakIterator> brkitr(
5266 BreakIterator::createLineInstance("zh-CN-u-lb-normal", status));
5267 status.errIfFailureAndReset();
5268 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "normal");
5269 }
5270
TestTraceCreateLineLoose(void)5271 void RBBITest::TestTraceCreateLineLoose(void) {
5272 SetupTestTrace();
5273 IcuTestErrorCode status(*this, "TestTraceCreateLineLoose");
5274 LocalPointer<BreakIterator> brkitr(
5275 BreakIterator::createLineInstance("zh-CN-u-lb-loose", status));
5276 status.errIfFailureAndReset();
5277 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "loose");
5278 }
5279
TestTraceCreateBreakEngine(void)5280 void RBBITest::TestTraceCreateBreakEngine(void) {
5281 rbbi_cleanup();
5282 SetupTestTrace();
5283 IcuTestErrorCode status(*this, "TestTraceCreateBreakEngine");
5284 LocalPointer<BreakIterator> brkitr(
5285 BreakIterator::createWordInstance("zh-CN", status));
5286 status.errIfFailureAndReset();
5287 assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5288
5289 // To word break the following text, BreakIterator will create 5 dictionary
5290 // break engine internally.
5291 brkitr->setText(
5292 u"test "
5293 u"測試 " // Hani
5294 u"សាកល្បង " // Khmr
5295 u"ທົດສອບ " // Laoo
5296 u"စမ်းသပ်မှု " // Mymr
5297 u"ทดสอบ " // Thai
5298 u"test "
5299 );
5300
5301 // Loop through all the text.
5302 while (brkitr->next() > 0) ;
5303
5304 assertEquals("utrace_entry should be called ", 6, gEntryFn.size());
5305 assertEquals("utrace_exit should be called ", 6, gExitFn.size());
5306 assertEquals("utrace_data should be called ", 5, gDataFn.size());
5307
5308 for (std::vector<int>::size_type i = 0; i < gDataFn.size(); i++) {
5309 assertEquals("utrace_entry should be called ",
5310 UTRACE_UBRK_CREATE_BREAK_ENGINE, gEntryFn[i+1]);
5311 assertEquals("utrace_exit should be called ",
5312 UTRACE_UBRK_CREATE_BREAK_ENGINE, gExitFn[i+1]);
5313 assertEquals("utrace_data should be called ",
5314 UTRACE_UBRK_CREATE_BREAK_ENGINE, gDataFn[i]);
5315 }
5316
5317 assertEquals("utrace_data should pass ", "Hani", gData[0].c_str());
5318 assertEquals("utrace_data should pass ", "Khmr", gData[1].c_str());
5319 assertEquals("utrace_data should pass ", "Laoo", gData[2].c_str());
5320 assertEquals("utrace_data should pass ", "Mymr", gData[3].c_str());
5321 assertEquals("utrace_data should pass ", "Thai", gData[4].c_str());
5322
5323 }
5324 #endif
5325
5326 #endif // #if !UCONFIG_NO_BREAK_ITERATION
5327