1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 1999-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8 /************************************************************************
9 * Date Name Description
10 * 12/15/99 Madhu Creation.
11 * 01/12/2000 Madhu Updated for changed API and added new tests
12 ************************************************************************/
13
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_BREAK_ITERATION
16
17 #include <algorithm>
18 #include <sstream>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <utility>
23 #include <vector>
24
25 #include "unicode/brkiter.h"
26 #include "unicode/localpointer.h"
27 #include "unicode/numfmt.h"
28 #include "unicode/rbbi.h"
29 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
30 #include "unicode/regex.h"
31 #endif
32 #include "unicode/schriter.h"
33 #include "unicode/uchar.h"
34 #include "unicode/utf16.h"
35 #include "unicode/ucnv.h"
36 #include "unicode/uniset.h"
37 #include "unicode/uscript.h"
38 #include "unicode/ustring.h"
39 #include "unicode/utext.h"
40 #include "unicode/utrace.h"
41
42 #include "charstr.h"
43 #include "cmemory.h"
44 #include "cstr.h"
45 #include "cstring.h"
46 #include "intltest.h"
47 #include "lstmbe.h"
48 #include "rbbitst.h"
49 #include "rbbidata.h"
50 #include "utypeinfo.h" // for 'typeid' to work
51 #include "uvector.h"
52 #include "uvectr32.h"
53
54
55 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
56 #include "unicode/filteredbrk.h"
57 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
58
// TEST_ASSERT(x): if the condition x evaluates to false, report a test failure
// through errln(), tagged with the source file and line of the macro invocation.
// Nothing is returned or thrown; execution continues after the macro.
#define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
} UPRV_BLOCK_MACRO_END
64
// TEST_ASSERT_SUCCESS(errcode): if errcode is a failure code (U_FAILURE), report it
// through errcheckln() together with the source file, line, and the error's name.
// The error code itself is left unchanged, so callers still need their own
// U_FAILURE() check if they want to stop.
#define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
    } \
} UPRV_BLOCK_MACRO_END
70
// MONKEY_ERROR(msg, fRuleFileName, index, seed): report a failure through
// IntlTest::gTest->errln(). The message includes the source file and line, msg,
// the text index, and a "type=... seed=... loop=1" parameter string built from
// fRuleFileName and seed so that the failing run can be reproduced.
// Note: unlike the TEST_ASSERT macros above, this one expands to a bare { } block
// rather than using UPRV_BLOCK_MACRO_BEGIN / UPRV_BLOCK_MACRO_END.
#define MONKEY_ERROR(msg, fRuleFileName, index, seed) { \
    IntlTest::gTest->errln("\n%s:%d %s at index %d. Parameters to reproduce: @\"type=%s seed=%u loop=1\"", \
                    __FILE__, __LINE__, msg, index, fRuleFileName, seed); \
}
75
76 //---------------------------------------------
77 // runIndexedTest
78 //---------------------------------------------
79
80
81 // Note: Before adding new tests to this file, check whether the desired test data can
82 // simply be added to the file testdata/rbbitest.txt. In most cases it can,
83 // it's much less work than writing a new test, diagnostic output in the event of failures
//       is good, and the test data file is shared with ICU4J, so eventually the test
85 // will run there as well, without additional effort.
86
// Test dispatcher for the RBBITest suite.
//
// Stores the command-line parameters in fTestParams so that individual tests can
// read them, then lists every test case between TESTCASE_AUTO_BEGIN and
// TESTCASE_AUTO_END. The TESTCASE_AUTO macros are defined outside this file;
// presumably they use index/exec/name to select, run, and name the test cases,
// which makes the order of the entries below significant.
//
// Tests that need data files, regular expressions, or tracing support are only
// compiled in when the corresponding UCONFIG_* / U_ENABLE_TRACING setting allows it.
void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
{
    if (exec) logln("TestSuite RuleBasedBreakIterator: ");
    fTestParams = params;   // Kept for tests that take parameters.

    TESTCASE_AUTO_BEGIN;
#if !UCONFIG_NO_FILE_IO
    TESTCASE_AUTO(TestBug4153072);
#endif
#if !UCONFIG_NO_FILE_IO
    TESTCASE_AUTO(TestUnicodeFiles);
#endif
    TESTCASE_AUTO(TestGetAvailableLocales);
    TESTCASE_AUTO(TestGetDisplayName);
#if !UCONFIG_NO_FILE_IO
    TESTCASE_AUTO(TestEndBehaviour);
    TESTCASE_AUTO(TestWordBreaks);
    TESTCASE_AUTO(TestWordBoundary);
    TESTCASE_AUTO(TestLineBreaks);
    TESTCASE_AUTO(TestSentBreaks);
    TESTCASE_AUTO(TestExtended);
#endif
    // The monkey test needs both regular expressions and file I/O.
#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
    TESTCASE_AUTO(TestMonkey);
#endif
#if !UCONFIG_NO_FILE_IO
    TESTCASE_AUTO(TestBug3818);
#endif
    TESTCASE_AUTO(TestDebug);
#if !UCONFIG_NO_FILE_IO
    TESTCASE_AUTO(TestBug5775);
#endif
    TESTCASE_AUTO(TestBug9983);
    TESTCASE_AUTO(TestDictRules);
    TESTCASE_AUTO(TestBug5532);
    TESTCASE_AUTO(TestBug7547);
    TESTCASE_AUTO(TestBug12797);
    TESTCASE_AUTO(TestBug12918);
    TESTCASE_AUTO(TestBug12932);
    TESTCASE_AUTO(TestEmoji);
    TESTCASE_AUTO(TestBug12519);
    TESTCASE_AUTO(TestBug12677);
    TESTCASE_AUTO(TestTableRedundancies);
    TESTCASE_AUTO(TestBug13447);
    TESTCASE_AUTO(TestReverse);
    TESTCASE_AUTO(TestBug13692);
    TESTCASE_AUTO(TestDebugRules);
    TESTCASE_AUTO(Test8BitsTrieWith8BitStateTable);
    TESTCASE_AUTO(Test8BitsTrieWith16BitStateTable);
    TESTCASE_AUTO(Test16BitsTrieWith8BitStateTable);
    TESTCASE_AUTO(Test16BitsTrieWith16BitStateTable);
    TESTCASE_AUTO(TestTable_8_16_Bits);
    TESTCASE_AUTO(TestBug13590);
    TESTCASE_AUTO(TestUnpairedSurrogate);
    TESTCASE_AUTO(TestLSTMThai);
    TESTCASE_AUTO(TestLSTMBurmese);
    TESTCASE_AUTO(TestRandomAccess);

    // Tracing tests exist only in builds with tracing enabled.
#if U_ENABLE_TRACING
    TESTCASE_AUTO(TestTraceCreateCharacter);
    TESTCASE_AUTO(TestTraceCreateWord);
    TESTCASE_AUTO(TestTraceCreateSentence);
    TESTCASE_AUTO(TestTraceCreateTitle);
    TESTCASE_AUTO(TestTraceCreateLine);
    TESTCASE_AUTO(TestTraceCreateLineNormal);
    TESTCASE_AUTO(TestTraceCreateLineLoose);
    TESTCASE_AUTO(TestTraceCreateLineStrict);
    TESTCASE_AUTO(TestTraceCreateLineNormalPhrase);
    TESTCASE_AUTO(TestTraceCreateLineLoosePhrase);
    TESTCASE_AUTO(TestTraceCreateLineStrictPhrase);
    TESTCASE_AUTO(TestTraceCreateLinePhrase);
    TESTCASE_AUTO(TestTraceCreateBreakEngine);
#endif

    TESTCASE_AUTO_END;
}
163
164
165 //--------------------------------------------------------------------------------------
166 //
167 // RBBITest constructor and destructor
168 //
169 //--------------------------------------------------------------------------------------
170
RBBITest()171 RBBITest::RBBITest() {
172 fTestParams = nullptr;
173 }
174
175
~RBBITest()176 RBBITest::~RBBITest() {
177 }
178
179
printStringBreaks(UText * tstr,int expected[],int expectedCount)180 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
181 UErrorCode status = U_ZERO_ERROR;
182 char name[100];
183 printf("code alpha extend alphanum type word sent line name\n");
184 int nextExpectedIndex = 0;
185 utext_setNativeIndex(tstr, 0);
186 for (int j = 0; j < static_cast<int>(utext_nativeLength(tstr)); j=static_cast<int>(utext_getNativeIndex(tstr))) {
187 if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
188 printf("------------------------------------------------ %d\n", j);
189 ++nextExpectedIndex;
190 }
191
192 UChar32 c = utext_next32(tstr);
193 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
194 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
195 u_isUAlphabetic(c),
196 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
197 u_isalnum(c),
198 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
199 u_charType(c),
200 U_SHORT_PROPERTY_NAME),
201 u_getPropertyValueName(UCHAR_WORD_BREAK,
202 u_getIntPropertyValue(c,
203 UCHAR_WORD_BREAK),
204 U_SHORT_PROPERTY_NAME),
205 u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
206 u_getIntPropertyValue(c,
207 UCHAR_SENTENCE_BREAK),
208 U_SHORT_PROPERTY_NAME),
209 u_getPropertyValueName(UCHAR_LINE_BREAK,
210 u_getIntPropertyValue(c,
211 UCHAR_LINE_BREAK),
212 U_SHORT_PROPERTY_NAME),
213 name);
214 }
215 }
216
217
printStringBreaks(const UnicodeString & ustr,int expected[],int expectedCount)218 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
219 UErrorCode status = U_ZERO_ERROR;
220 UText *tstr = nullptr;
221 tstr = utext_openConstUnicodeString(nullptr, &ustr, &status);
222 if (U_FAILURE(status)) {
223 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
224 return;
225 }
226 printStringBreaks(tstr, expected, expectedCount);
227 utext_close(tstr);
228 }
229
230
TestBug3818()231 void RBBITest::TestBug3818() {
232 UErrorCode status = U_ZERO_ERROR;
233
234 // Four Thai words...
235 static const char16_t thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
236 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
237 UnicodeString thaiStr(thaiWordData);
238
239 BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
240 if (U_FAILURE(status) || bi == nullptr) {
241 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
242 return;
243 }
244 bi->setText(thaiStr);
245
246 int32_t startOfSecondWord = bi->following(1);
247 if (startOfSecondWord != 4) {
248 errln("Fail at file %s, line %d expected start of word at 4, got %d",
249 __FILE__, __LINE__, startOfSecondWord);
250 }
251 startOfSecondWord = bi->following(0);
252 if (startOfSecondWord != 4) {
253 errln("Fail at file %s, line %d expected start of word at 4, got %d",
254 __FILE__, __LINE__, startOfSecondWord);
255 }
256 delete bi;
257 }
258
259
260 //---------------------------------------------
261 //
262 // other tests
263 //
264 //---------------------------------------------
265
TestGetAvailableLocales()266 void RBBITest::TestGetAvailableLocales()
267 {
268 int32_t locCount = 0;
269 const Locale* locList = BreakIterator::getAvailableLocales(locCount);
270
271 if (locCount == 0)
272 dataerrln("getAvailableLocales() returned an empty list!");
273 // Just make sure that it's returning good memory.
274 int32_t i;
275 for (i = 0; i < locCount; ++i) {
276 logln(locList[i].getName());
277 }
278 }
279
280 //Testing the BreakIterator::getDisplayName() function
TestGetDisplayName()281 void RBBITest::TestGetDisplayName()
282 {
283 UnicodeString result;
284
285 BreakIterator::getDisplayName(Locale::getUS(), result);
286 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
287 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
288 + result);
289
290 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
291 if (result != "French (France)")
292 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
293 + result);
294 }
295 /**
296 * Test End Behaviour
297 * @bug 4068137
298 */
TestEndBehaviour()299 void RBBITest::TestEndBehaviour()
300 {
301 UErrorCode status = U_ZERO_ERROR;
302 UnicodeString testString("boo.");
303 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
304 if (U_FAILURE(status))
305 {
306 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
307 return;
308 }
309 wb->setText(testString);
310
311 if (wb->first() != 0)
312 errln("Didn't get break at beginning of string.");
313 if (wb->next() != 3)
314 errln("Didn't get break before period in \"boo.\"");
315 if (wb->current() != 4 && wb->next() != 4)
316 errln("Didn't get break at end of string.");
317 delete wb;
318 }
319 /*
320 * @bug 4153072
321 */
TestBug4153072()322 void RBBITest::TestBug4153072() {
323 UErrorCode status = U_ZERO_ERROR;
324 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
325 if (U_FAILURE(status))
326 {
327 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
328 return;
329 }
330 UnicodeString str("...Hello, World!...");
331 int32_t begin = 3;
332 int32_t end = str.length() - 3;
333 UBool onBoundary;
334
335 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
336 iter->adoptText(textIterator);
337 int index;
338 // Note: with the switch to UText, there is no way to restrict the
339 // iteration range to begin at an index other than zero.
340 // String character iterators created with a non-zero bound are
341 // treated by RBBI as being empty.
342 for (index = -1; index < begin + 1; ++index) {
343 onBoundary = iter->isBoundary(index);
344 if (index == 0? !onBoundary : onBoundary) {
345 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
346 " and begin index = " + begin);
347 }
348 }
349 delete iter;
350 }
351
352
353 //
354 // Test for problem reported by Ashok Matoria on 9 July 2007
355 // One.<kSoftHyphen><kSpace>Two.
356 //
357 // Sentence break at start (0) and then on calling next() it breaks at
358 // 'T' of "Two". Now, at this point if I do next() and
//       then previous(), it breaks at <kSoftHyphen> instead of 'T' of "Two".
360 //
TestBug5775()361 void RBBITest::TestBug5775() {
362 UErrorCode status = U_ZERO_ERROR;
363 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
364 TEST_ASSERT_SUCCESS(status);
365 if (U_FAILURE(status)) {
366 return;
367 }
368 // Check for status first for better handling of no data errors.
369 TEST_ASSERT(bi != nullptr);
370 if (bi == nullptr) {
371 return;
372 }
373
374 UnicodeString s("One.\\u00ad Two.", -1, US_INV);
375 // 01234 56789
376 s = s.unescape();
377 bi->setText(s);
378 int pos = bi->next();
379 TEST_ASSERT(pos == 6);
380 pos = bi->next();
381 TEST_ASSERT(pos == 10);
382 pos = bi->previous();
383 TEST_ASSERT(pos == 6);
384 delete bi;
385 }
386
387
388
389 //------------------------------------------------------------------------------
390 //
391 // RBBITest::Extended Run RBBI Tests from an external test data file
392 //
393 //------------------------------------------------------------------------------
394
395 struct TestParams {
396 BreakIterator *bi; // Break iterator is set while parsing test source.
397 // Changed out whenever test data changes break type.
398
399 UnicodeString dataToBreak; // Data that is built up while parsing the test.
400 UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString.
401 UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak.
402 UVector32 *srcCol;
403
404 UText *textToBreak; // UText, could be UTF8 or UTF16.
405 UVector32 *textMap; // Map from UTF-16 dataToBreak offsets to UText offsets.
406 CharString utf8String; // UTF-8 form of text to break.
407
TestParamsTestParams408 TestParams(UErrorCode &status) : dataToBreak() {
409 bi = nullptr;
410 expectedBreaks = new UVector32(status);
411 srcLine = new UVector32(status);
412 srcCol = new UVector32(status);
413 textToBreak = nullptr;
414 textMap = new UVector32(status);
415 }
416
~TestParamsTestParams417 ~TestParams() {
418 delete bi;
419 delete expectedBreaks;
420 delete srcLine;
421 delete srcCol;
422 utext_close(textToBreak);
423 delete textMap;
424 }
425
426 int32_t getSrcLine(int32_t bp);
427 int32_t getExpectedBreak(int32_t bp);
428 int32_t getSrcCol(int32_t bp);
429
430 void setUTF16(UErrorCode &status);
431 void setUTF8(UErrorCode &status);
432 };
433
434 // Append a UnicodeString to a CharString with UTF-8 encoding.
435 // Substitute any invalid chars.
436 // Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
CharStringAppend(CharString & dest,const UnicodeString & src,UErrorCode & status)437 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
438 if (U_FAILURE(status)) {
439 return;
440 }
441 int32_t utf8Length;
442 u_strToUTF8WithSub(nullptr, 0, &utf8Length, // Output Buffer, nullptr for preflight.
443 src.getBuffer(), src.length(), // UTF-16 data
444 0xfffd, nullptr, // Substitution char, number of subs.
445 &status);
446 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
447 return;
448 }
449 status = U_ZERO_ERROR;
450 int32_t capacity;
451 char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
452 u_strToUTF8WithSub(buffer, utf8Length, nullptr,
453 src.getBuffer(), src.length(),
454 0xfffd, nullptr, &status);
455 dest.append(buffer, utf8Length, status);
456 }
457
458
setUTF16(UErrorCode & status)459 void TestParams::setUTF16(UErrorCode &status) {
460 textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
461 textMap->removeAllElements();
462 for (int32_t i=0; i<dataToBreak.length(); i++) {
463 if (i == dataToBreak.getChar32Start(i)) {
464 textMap->addElement(i, status);
465 } else {
466 textMap->addElement(-1, status);
467 }
468 }
469 textMap->addElement(dataToBreak.length(), status);
470 U_ASSERT(dataToBreak.length() + 1 == textMap->size());
471 }
472
473
setUTF8(UErrorCode & status)474 void TestParams::setUTF8(UErrorCode &status) {
475 if (U_FAILURE(status)) {
476 return;
477 }
478 utf8String.clear();
479 CharStringAppend(utf8String, dataToBreak, status);
480 textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
481 if (U_FAILURE(status)) {
482 return;
483 }
484
485 textMap->removeAllElements();
486 int32_t utf16Index = 0;
487 for (;;) {
488 textMap->addElement(utf16Index, status);
489 UChar32 c32 = utext_current32(textToBreak);
490 if (c32 < 0) {
491 break;
492 }
493 utf16Index += U16_LENGTH(c32);
494 utext_next32(textToBreak);
495 while (textMap->size() < utext_getNativeIndex(textToBreak)) {
496 textMap->addElement(-1, status);
497 }
498 }
499 U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
500 }
501
502
getSrcLine(int32_t bp)503 int32_t TestParams::getSrcLine(int32_t bp) {
504 if (bp >= textMap->size()) {
505 bp = textMap->size() - 1;
506 }
507 int32_t i = 0;
508 for(; bp >= 0 ; --bp) {
509 // Move to a character boundary if we are not on one already.
510 i = textMap->elementAti(bp);
511 if (i >= 0) {
512 break;
513 }
514 }
515 return srcLine->elementAti(i);
516 }
517
518
getExpectedBreak(int32_t bp)519 int32_t TestParams::getExpectedBreak(int32_t bp) {
520 if (bp >= textMap->size()) {
521 return 0;
522 }
523 int32_t i = textMap->elementAti(bp);
524 int32_t retVal = 0;
525 if (i >= 0) {
526 retVal = expectedBreaks->elementAti(i);
527 }
528 return retVal;
529 }
530
531
getSrcCol(int32_t bp)532 int32_t TestParams::getSrcCol(int32_t bp) {
533 if (bp >= textMap->size()) {
534 bp = textMap->size() - 1;
535 }
536 int32_t i = 0;
537 for(; bp >= 0; --bp) {
538 // Move bp to a character boundary if we are not on one already.
539 i = textMap->elementAti(bp);
540 if (i >= 0) {
541 break;
542 }
543 }
544 return srcCol->elementAti(i);
545 }
546
547
executeTest(TestParams * t,UErrorCode & status)548 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
549 int32_t bp;
550 int32_t prevBP;
551 int32_t i;
552
553 TEST_ASSERT_SUCCESS(status);
554 if (U_FAILURE(status)) {
555 return;
556 }
557
558 if (t->bi == nullptr) {
559 return;
560 }
561
562 t->bi->setText(t->textToBreak, status);
563 //
564 // Run the iterator forward
565 //
566 prevBP = -1;
567 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
568 if (prevBP == bp) {
569 // Fail for lack of forward progress.
570 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
571 bp, t->getSrcLine(bp), t->getSrcCol(bp));
572 break;
573 }
574
575 // Check that there we didn't miss an expected break between the last one
576 // and this one.
577 for (i=prevBP+1; i<bp; i++) {
578 if (t->getExpectedBreak(i) != 0) {
579 int expected[] = {0, i};
580 printStringBreaks(t->dataToBreak, expected, 2);
581 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
582 i, t->getSrcLine(i), t->getSrcCol(i));
583 }
584 }
585
586 // Check that the break we did find was expected
587 if (t->getExpectedBreak(bp) == 0) {
588 int expected[] = {0, bp};
589 printStringBreaks(t->textToBreak, expected, 2);
590 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
591 bp, t->getSrcLine(bp), t->getSrcCol(bp));
592 } else {
593 // The break was expected.
594 // Check that the {nnn} tag value is correct.
595 int32_t expectedTagVal = t->getExpectedBreak(bp);
596 if (expectedTagVal == -1) {
597 expectedTagVal = 0;
598 }
599 int32_t line = t->getSrcLine(bp);
600 int32_t rs = t->bi->getRuleStatus();
601 if (rs != expectedTagVal) {
602 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
603 " Actual, Expected status = %4d, %4d",
604 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
605 }
606 }
607
608 prevBP = bp;
609 }
610
611 // Verify that there were no missed expected breaks after the last one found
612 for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
613 if (t->getExpectedBreak(i) != 0) {
614 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
615 i, t->getSrcLine(i), t->getSrcCol(i));
616 }
617 }
618
619 //
620 // Run the iterator backwards, verify that the same breaks are found.
621 //
622 prevBP = static_cast<int32_t>(utext_nativeLength(t->textToBreak) + 2); // start with a phony value for the last break pos seen.
623 bp = t->bi->last();
624 while (bp != BreakIterator::DONE) {
625 if (prevBP == bp) {
626 // Fail for lack of progress.
627 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
628 bp, t->getSrcLine(bp), t->getSrcCol(bp));
629 break;
630 }
631
632 // Check that we didn't miss an expected break between the last one
633 // and this one. (UVector returns zeros for index out of bounds.)
634 for (i=prevBP-1; i>bp; i--) {
635 if (t->getExpectedBreak(i) != 0) {
636 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
637 i, t->getSrcLine(i), t->getSrcCol(i));
638 }
639 }
640
641 // Check that the break we did find was expected
642 if (t->getExpectedBreak(bp) == 0) {
643 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
644 bp, t->getSrcLine(bp), t->getSrcCol(bp));
645 } else {
646 // The break was expected.
647 // Check that the {nnn} tag value is correct.
648 int32_t expectedTagVal = t->getExpectedBreak(bp);
649 if (expectedTagVal == -1) {
650 expectedTagVal = 0;
651 }
652 int line = t->getSrcLine(bp);
653 int32_t rs = t->bi->getRuleStatus();
654 if (rs != expectedTagVal) {
655 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
656 " Actual, Expected status = %4d, %4d",
657 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
658 }
659 }
660
661 prevBP = bp;
662 bp = t->bi->previous();
663 }
664
665 // Verify that there were no missed breaks prior to the last one found
666 for (i=prevBP-1; i>=0; i--) {
667 if (t->getExpectedBreak(i) != 0) {
668 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
669 i, t->getSrcLine(i), t->getSrcCol(i));
670 }
671 }
672
673 // Check isBoundary()
674 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
675 UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
676 UBool boundaryFound = t->bi->isBoundary(i);
677 if (boundaryExpected != boundaryFound) {
678 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
679 " Expected, Actual= %s, %s",
680 i, t->getSrcLine(i), t->getSrcCol(i),
681 boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
682 }
683 }
684
685 // Check following()
686 for (i=0; i < static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i++) {
687 int32_t actualBreak = t->bi->following(i);
688 int32_t expectedBreak = BreakIterator::DONE;
689 for (int32_t j=i+1; j <= static_cast<int32_t>(utext_nativeLength(t->textToBreak)); j++) {
690 if (t->getExpectedBreak(j) != 0) {
691 expectedBreak = j;
692 break;
693 }
694 }
695 if (expectedBreak != actualBreak) {
696 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
697 " Expected, Actual= %d, %d",
698 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
699 }
700 }
701
702 // Check preceding()
703 for (i=static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i>=0; i--) {
704 int32_t actualBreak = t->bi->preceding(i);
705 int32_t expectedBreak = BreakIterator::DONE;
706
707 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
708 // preceding(trailing byte) will return the index of some preceding code point,
709 // not the lead byte of the current code point, even though that has a smaller index.
710 // Therefore, start looking at the expected break data not at i-1, but at
711 // the start of code point index - 1.
712 utext_setNativeIndex(t->textToBreak, i);
713 int32_t j = static_cast<int32_t>(utext_getNativeIndex(t->textToBreak) - 1);
714 for (; j >= 0; j--) {
715 if (t->getExpectedBreak(j) != 0) {
716 expectedBreak = j;
717 break;
718 }
719 }
720 if (expectedBreak != actualBreak) {
721 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
722 " Expected, Actual= %d, %d",
723 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
724 }
725 }
726 }
727
TestExtended()728 void RBBITest::TestExtended() {
729 // The expectations in this test heavily depends on the Thai dictionary.
730 // Therefore, we skip this test under the LSTM configuration.
731 if (skipDictionaryTest()) {
732 return;
733 }
734 // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
735 // data driven test closely entangles filtered and regular data.
736 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
737 UErrorCode status = U_ZERO_ERROR;
738 Locale locale("");
739
740 TestParams tp(status);
741
742 RegexMatcher localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
743 if (U_FAILURE(status)) {
744 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
745 }
746
747 //
748 // Open and read the test data file.
749 //
750 const char *testDataDirectory = IntlTest::getSourceTestData(status);
751 CharString testFileName(testDataDirectory, -1, status);
752 testFileName.append("rbbitst.txt", -1, status);
753
754 int len;
755 char16_t *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
756 if (U_FAILURE(status)) {
757 errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
758 return;
759 }
760
761 bool skipTest = false; // Skip this test?
762
763 //
764 // Put the test data into a UnicodeString
765 //
766 UnicodeString testString(false, testFile, len);
767
768 enum EParseState{
769 PARSE_COMMENT,
770 PARSE_TAG,
771 PARSE_DATA,
772 PARSE_NUM,
773 PARSE_RULES
774 }
775 parseState = PARSE_TAG;
776
777 EParseState savedState = PARSE_TAG;
778
779 int32_t lineNum = 1;
780 int32_t colStart = 0;
781 int32_t column = 0;
782 int32_t charIdx = 0;
783
784 int32_t tagValue = 0; // The numeric value of a <nnn> tag.
785
786 UnicodeString rules; // Holds rules from a <rules> ... </rules> block
787 int32_t rulesFirstLine = 0; // Line number of the start of current <rules> block
788
789 for (charIdx = 0; charIdx < len; ) {
790 status = U_ZERO_ERROR;
791 char16_t c = testString.charAt(charIdx);
792 charIdx++;
793 if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
794 // treat CRLF as a unit
795 c = u'\n';
796 charIdx++;
797 }
798 if (c == u'\n' || c == u'\r') {
799 lineNum++;
800 colStart = charIdx;
801 }
802 column = charIdx - colStart + 1;
803
804 switch (parseState) {
805 case PARSE_COMMENT:
806 if (c == u'\n' || c == u'\r') {
807 parseState = savedState;
808 }
809 break;
810
811 case PARSE_TAG:
812 {
813 if (c == u'#') {
814 parseState = PARSE_COMMENT;
815 savedState = PARSE_TAG;
816 break;
817 }
818 if (u_isUWhiteSpace(c)) {
819 break;
820 }
821 if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
822 delete tp.bi;
823 tp.bi = BreakIterator::createWordInstance(locale, status);
824 skipTest = false;
825 charIdx += 5;
826 break;
827 }
828 if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
829 delete tp.bi;
830 tp.bi = BreakIterator::createCharacterInstance(locale, status);
831 skipTest = false;
832 charIdx += 5;
833 break;
834 }
835 if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
836 delete tp.bi;
837 tp.bi = BreakIterator::createLineInstance(locale, status);
838 skipTest = false;
839 #if UCONFIG_USE_ML_PHRASE_BREAKING
840 if(uprv_strcmp(locale.getName(), "ja@lw=phrase") == 0) {
841 // skip <line> test cases of JP's phrase breaking when ML is enabled.
842 skipTest = true;
843 }
844 #endif
845 charIdx += 5;
846 break;
847 }
848 if (testString.compare(charIdx-1, 8, u"<lineML>") == 0) {
849 delete tp.bi;
850 tp.bi = BreakIterator::createLineInstance(locale, status);
851 skipTest = false;
852 #if !UCONFIG_USE_ML_PHRASE_BREAKING
853 if(uprv_strcmp(locale.getName(), "ja@lw=phrase") == 0) {
854 // skip <lineML> test cases of JP's phrase breaking when ML is disabled.
855 skipTest = true;
856 }
857 #endif
858 charIdx += 7;
859 break;
860 }
861 if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
862 delete tp.bi;
863 tp.bi = BreakIterator::createSentenceInstance(locale, status);
864 skipTest = false;
865 charIdx += 5;
866 break;
867 }
868 if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
869 delete tp.bi;
870 tp.bi = BreakIterator::createTitleInstance(locale, status);
871 charIdx += 6;
872 break;
873 }
874
875 if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
876 testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
877 charIdx = testString.indexOf(u'>', charIdx) + 1;
878 parseState = PARSE_RULES;
879 rules.remove();
880 rulesFirstLine = lineNum;
881 break;
882 }
883
884 // <locale loc_name>
885 localeMatcher.reset(testString);
886 if (localeMatcher.lookingAt(charIdx-1, status)) {
887 UnicodeString localeName = localeMatcher.group(1, status);
888 char localeName8[100];
889 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
890 locale = Locale::createFromName(localeName8);
891 charIdx += localeMatcher.group(0, status).length() - 1;
892 TEST_ASSERT_SUCCESS(status);
893 break;
894 }
895 if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
896 parseState = PARSE_DATA;
897 charIdx += 5;
898 tp.dataToBreak = "";
899 tp.expectedBreaks->removeAllElements();
900 tp.srcCol ->removeAllElements();
901 tp.srcLine->removeAllElements();
902 break;
903 }
904
905 errln("line %d: Tag expected in test file.", lineNum);
906 parseState = PARSE_COMMENT;
907 savedState = PARSE_DATA;
908 goto end_test; // Stop the test.
909 }
910 break;
911
912 case PARSE_RULES:
913 if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
914 charIdx += 7;
915 parseState = PARSE_TAG;
916 delete tp.bi;
917 UParseError pe;
918 tp.bi = new RuleBasedBreakIterator(rules, pe, status);
919 skipTest = U_FAILURE(status);
920 if (U_FAILURE(status)) {
921 errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
922 rulesFirstLine + pe.line - 1, u_errorName(status));
923 }
924 } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
925 charIdx += 10;
926 parseState = PARSE_TAG;
927 UErrorCode ec = U_ZERO_ERROR;
928 UParseError pe;
929 RuleBasedBreakIterator bi(rules, pe, ec);
930 if (U_SUCCESS(ec)) {
931 errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
932 rulesFirstLine + pe.line - 1);
933 }
934 } else {
935 rules.append(c);
936 }
937 break;
938
939 case PARSE_DATA:
940 if (c == u'•') {
941 int32_t breakIdx = tp.dataToBreak.length();
942 if (tp.expectedBreaks->size() > breakIdx) {
943 errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
944 lineNum, column);
945 }
946 tp.expectedBreaks->setSize(breakIdx+1);
947 tp.expectedBreaks->setElementAt(-1, breakIdx);
948 tp.srcLine->setSize(breakIdx+1);
949 tp.srcLine->setElementAt(lineNum, breakIdx);
950 tp.srcCol ->setSize(breakIdx+1);
951 tp.srcCol ->setElementAt(column, breakIdx);
952 break;
953 }
954
955 if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
956 // Add final entry to mappings from break location to source file position.
957 // Need one extra because last break position returned is after the
958 // last char in the data, not at the last char.
959 tp.srcLine->addElement(lineNum, status);
960 tp.srcCol ->addElement(column, status);
961
962 parseState = PARSE_TAG;
963 charIdx += 6;
964
965 if (!skipTest) {
966 // RUN THE TEST!
967 status = U_ZERO_ERROR;
968 tp.setUTF16(status);
969 executeTest(&tp, status);
970 TEST_ASSERT_SUCCESS(status);
971
972 // Run again, this time with UTF-8 text wrapped in a UText.
973 status = U_ZERO_ERROR;
974 tp.setUTF8(status);
975 TEST_ASSERT_SUCCESS(status);
976 executeTest(&tp, status);
977 }
978 break;
979 }
980
981 if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
982 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
983 // Get the code point from the name and insert it into the test data.
984 // (Damn, no API takes names in Unicode !!!
985 // we've got to take it back to char *)
986 int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
987 int32_t nameLength = nameEndIdx - (charIdx+2);
988 char charNameBuf[200];
989 UChar32 theChar = -1;
990 if (nameEndIdx != -1) {
991 UErrorCode status = U_ZERO_ERROR;
992 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
993 charNameBuf[sizeof(charNameBuf)-1] = 0;
994 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
995 if (U_FAILURE(status)) {
996 theChar = -1;
997 }
998 }
999 if (theChar == -1) {
1000 errln("Error in named character in test file at line %d, col %d",
1001 lineNum, column);
1002 } else {
1003 // Named code point was recognized. Insert it
1004 // into the test data.
1005 tp.dataToBreak.append(theChar);
1006 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1007 tp.srcLine->addElement(lineNum, status);
1008 tp.srcCol ->addElement(column, status);
1009 }
1010 }
1011 if (nameEndIdx > charIdx) {
1012 charIdx = nameEndIdx+1;
1013
1014 }
1015 break;
1016 }
1017
1018
1019
1020 if (testString.compare(charIdx-1, 2, u"<>") == 0) {
1021 charIdx++;
1022 int32_t breakIdx = tp.dataToBreak.length();
1023 tp.expectedBreaks->setSize(breakIdx+1);
1024 tp.expectedBreaks->setElementAt(-1, breakIdx);
1025 tp.srcLine->setSize(breakIdx+1);
1026 tp.srcLine->setElementAt(lineNum, breakIdx);
1027 tp.srcCol ->setSize(breakIdx+1);
1028 tp.srcCol ->setElementAt(column, breakIdx);
1029 break;
1030 }
1031
1032 if (c == u'<') {
1033 tagValue = 0;
1034 parseState = PARSE_NUM;
1035 break;
1036 }
1037
1038 if (c == u'#' && column==3) { // TODO: why is column off so far?
1039 parseState = PARSE_COMMENT;
1040 savedState = PARSE_DATA;
1041 break;
1042 }
1043
1044 if (c == u'\\') {
1045 // Check for \ at end of line, a line continuation.
1046 // Advance over (discard) the newline
1047 UChar32 cp = testString.char32At(charIdx);
1048 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
1049 // We have a CR LF
1050 // Need an extra increment of the input ptr to move over both of them
1051 charIdx++;
1052 }
1053 if (cp == u'\n' || cp == u'\r') {
1054 lineNum++;
1055 colStart = charIdx;
1056 charIdx++;
1057 break;
1058 }
1059
1060 // Let unescape handle the back slash.
1061 cp = testString.unescapeAt(charIdx);
1062 if (cp != -1) {
1063 // Escape sequence was recognized. Insert the char
1064 // into the test data.
1065 tp.dataToBreak.append(cp);
1066 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1067 tp.srcLine->addElement(lineNum, status);
1068 tp.srcCol ->addElement(column, status);
1069 }
1070 break;
1071 }
1072
1073
1074 // Not a recognized backslash escape sequence.
1075 // Take the next char as a literal.
1076 // TODO: Should this be an error?
1077 c = testString.charAt(charIdx);
1078 charIdx = testString.moveIndex32(charIdx, 1);
1079 }
1080
1081 // Normal, non-escaped data char.
1082 tp.dataToBreak.append(c);
1083
1084 // Save the mapping from offset in the data to line/column numbers in
1085 // the original input file. Will be used for better error messages only.
1086 // If there's an expected break before this char, the slot in the mapping
1087 // vector will already be set for this char; don't overwrite it.
1088 if (tp.dataToBreak.length() > tp.srcLine->size()) {
1089 tp.srcLine->addElement(lineNum, status);
1090 tp.srcCol ->addElement(column, status);
1091 }
1092 break;
1093
1094
1095 case PARSE_NUM:
1096 // We are parsing an expected numeric tag value, like <1234>,
1097 // within a chunk of data.
1098 if (u_isUWhiteSpace(c)) {
1099 break;
1100 }
1101
1102 if (c == u'>') {
1103 // Finished the number. Add the info to the expected break data,
1104 // and switch parse state back to doing plain data.
1105 parseState = PARSE_DATA;
1106 if (tagValue == 0) {
1107 tagValue = -1;
1108 }
1109 int32_t breakIdx = tp.dataToBreak.length();
1110 if (tp.expectedBreaks->size() > breakIdx) {
1111 errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
1112 lineNum, column);
1113 }
1114 tp.expectedBreaks->setSize(breakIdx+1);
1115 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1116 tp.srcLine->setSize(breakIdx+1);
1117 tp.srcLine->setElementAt(lineNum, breakIdx);
1118 tp.srcCol ->setSize(breakIdx+1);
1119 tp.srcCol ->setElementAt(column, breakIdx);
1120 break;
1121 }
1122
1123 if (u_isdigit(c)) {
1124 tagValue = tagValue*10 + u_charDigitValue(c);
1125 break;
1126 }
1127
1128 errln("Syntax Error in test file at line %d, col %d",
1129 lineNum, column);
1130 parseState = PARSE_COMMENT;
1131 goto end_test; // Stop the test
1132 break;
1133 }
1134
1135
1136 if (U_FAILURE(status)) {
1137 dataerrln("ICU Error %s while parsing test file at line %d.",
1138 u_errorName(status), lineNum);
1139 status = U_ZERO_ERROR;
1140 goto end_test; // Stop the test
1141 }
1142
1143 }
1144
1145 // Reached end of test file. Raise an error if parseState indicates that we are
1146 // within a block that should have been terminated.
1147
1148 if (parseState == PARSE_RULES) {
1149 errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
1150 lineNum, rulesFirstLine);
1151 }
1152 if (parseState == PARSE_DATA) {
1153 errln("rbbitst.txt:%d <data> block not closed.", lineNum);
1154 }
1155
1156
1157 end_test:
1158 delete [] testFile;
1159 #endif
1160 }
1161
1162 //-------------------------------------------------------------------------------
1163 //
// TestDictRules   Create a break iterator from source rules that include a
//                 dictionary range. Regression test for bug #7130. The source rules
//                 do not declare a break iterator type (word, line, sentence, etc.),
//                 and the dictionary code, when given no type, would loop.
1168 //
1169 //-------------------------------------------------------------------------------
TestDictRules()1170 void RBBITest::TestDictRules() {
1171 const char *rules = "$dictionary = [a-z]; \n"
1172 "!!forward; \n"
1173 "$dictionary $dictionary; \n"
1174 "!!reverse; \n"
1175 "$dictionary $dictionary; \n";
1176 const char *text = "aa";
1177 UErrorCode status = U_ZERO_ERROR;
1178 UParseError parseError;
1179
1180 RuleBasedBreakIterator bi(rules, parseError, status);
1181 if (U_SUCCESS(status)) {
1182 UnicodeString utext = text;
1183 bi.setText(utext);
1184 int32_t position;
1185 int32_t loops;
1186 for (loops = 0; loops<10; loops++) {
1187 position = bi.next();
1188 if (position == RuleBasedBreakIterator::DONE) {
1189 break;
1190 }
1191 }
1192 TEST_ASSERT(loops == 1);
1193 } else {
1194 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1195 }
1196 }
1197
1198
1199
1200 //--------------------------------------------------------------------------------------------
1201 //
1202 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
1203 //
1204 //-------------------------------------------------------------------------------------------
TestUnicodeFiles()1205 void RBBITest::TestUnicodeFiles() {
1206 RuleBasedBreakIterator *bi;
1207 UErrorCode status = U_ZERO_ERROR;
1208
1209 bi = dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createCharacterInstance(Locale::getEnglish(), status));
1210 TEST_ASSERT_SUCCESS(status);
1211 if (U_SUCCESS(status)) {
1212 runUnicodeTestData("GraphemeBreakTest.txt", bi);
1213 }
1214 delete bi;
1215
1216 bi = dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createWordInstance(Locale::getEnglish(), status));
1217 TEST_ASSERT_SUCCESS(status);
1218 if (U_SUCCESS(status)) {
1219 runUnicodeTestData("WordBreakTest.txt", bi);
1220 }
1221 delete bi;
1222
1223 bi = dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createSentenceInstance(Locale::getEnglish(), status));
1224 TEST_ASSERT_SUCCESS(status);
1225 if (U_SUCCESS(status)) {
1226 runUnicodeTestData("SentenceBreakTest.txt", bi);
1227 }
1228 delete bi;
1229
1230 bi = dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createLineInstance(Locale::getEnglish(), status));
1231 TEST_ASSERT_SUCCESS(status);
1232 if (U_SUCCESS(status)) {
1233 runUnicodeTestData("LineBreakTest.txt", bi);
1234 }
1235 delete bi;
1236 }
1237
1238
1239 // Check for test cases from the Unicode test data files that are known to fail
1240 // and should be skipped as known issues because ICU does not fully implement
1241 // the Unicode specifications, or because ICU includes tailorings that differ from
1242 // the Unicode standard.
1243 //
1244 // Test cases are identified by the test data sequence, which tends to be more stable
1245 // across Unicode versions than the test file line numbers.
1246 //
1247 // The test case with ticket "10666" is a dummy, included as an example.
1248
testCaseIsKnownIssue(const UnicodeString & testCase,const char * fileName)1249 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1250 static struct TestCase {
1251 const char *fTicketNum;
1252 const char *fFileName;
1253 const char16_t *fString;
1254 } badTestCases[] = {
1255 {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"}, // Fake example, for illustration.
1256 // The following tests were originally for
1257 // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
1258 // However, that ticket has been closed as fixed but these tests still fail, so
1259 // ICU-21097 has been created to investigate and address these remaining issues.
1260 {"21097", "LineBreakTest.txt", u"-#"},
1261 {"21097", "LineBreakTest.txt", u"\u002d\u0308\u0023"},
1262 {"21097", "LineBreakTest.txt", u"\u002d\u00a7"},
1263 {"21097", "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
1264 {"21097", "LineBreakTest.txt", u"\u002d\U00050005"},
1265 {"21097", "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
1266 {"21097", "LineBreakTest.txt", u"\u002d\u0e01"},
1267 {"21097", "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
1268
1269 // The following tests were originally for
1270 // Issue ICU-12017 Improve line break around numbers.
1271 // However, that ticket has been closed as fixed but these tests still fail, so
1272 // ICU-21097 has been created to investigate and address these remaining issues.
1273 {"21097", "LineBreakTest.txt", u"\u002C\u0030"}, // ",0"
1274 {"21097", "LineBreakTest.txt", u"\u002C\u0308\u0030"},
1275 {"21097", "LineBreakTest.txt", u"equals .35 cents"},
1276 {"21097", "LineBreakTest.txt", u"a.2 "},
1277 {"21097", "LineBreakTest.txt", u"a.2 \u0915"},
1278 {"21097", "LineBreakTest.txt", u"a.2 \u672C"},
1279 {"21097", "LineBreakTest.txt", u"a.2\u3000\u672C"},
1280 {"21097", "LineBreakTest.txt", u"a.2\u3000\u307E"},
1281 {"21097", "LineBreakTest.txt", u"a.2\u3000\u0033"},
1282 {"21097", "LineBreakTest.txt", u"A.1 \uBABB"},
1283 {"21097", "LineBreakTest.txt", u"\uBD24\uC5B4\u002E\u0020\u0041\u002E\u0032\u0020\uBCFC"},
1284 {"21097", "LineBreakTest.txt", u"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"},
1285 {"21097", "LineBreakTest.txt", u"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"},
1286 {"21097", "LineBreakTest.txt", u"a.2\u3000\u300C"},
1287
1288 // ICU-22127 until UAX #29 wordbreak is update for the colon changes in ICU-22112,
1289 // need to skip some tests in WordBreakTest.txt
1290 {"22127", "WordBreakTest.txt", u"a:"},
1291 {"22127", "WordBreakTest.txt", u"A:"},
1292 };
1293
1294 for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
1295 const TestCase &badCase = badTestCases[n];
1296 if (!strcmp(fileName, badCase.fFileName) &&
1297 testCase.startsWith(UnicodeString(badCase.fString))) {
1298 return logKnownIssue(badCase.fTicketNum);
1299 }
1300 }
1301 return false;
1302 }
1303
1304
1305 //--------------------------------------------------------------------------------------------
1306 //
1307 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
1308 //
1309 //-------------------------------------------------------------------------------------------
runUnicodeTestData(const char * fileName,RuleBasedBreakIterator * bi)1310 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1311 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1312 UErrorCode status = U_ZERO_ERROR;
1313
1314 //
1315 // Open and read the test data file, put it into a UnicodeString.
1316 //
1317 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1318 char testFileName[1000];
1319 if (testDataDirectory == nullptr || strlen(testDataDirectory) >= sizeof(testFileName)) {
1320 dataerrln("Can't open test data. Path too long.");
1321 return;
1322 }
1323 strcpy(testFileName, testDataDirectory);
1324 strcat(testFileName, fileName);
1325
1326 logln("Opening data file %s\n", fileName);
1327
1328 int len;
1329 char16_t *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1330 if (status != U_FILE_ACCESS_ERROR) {
1331 TEST_ASSERT_SUCCESS(status);
1332 TEST_ASSERT(testFile != nullptr);
1333 }
1334 if (U_FAILURE(status) || testFile == nullptr) {
1335 return; /* something went wrong, error already output */
1336 }
1337 UnicodeString testFileAsString(true, testFile, len);
1338
1339 //
1340 // Parse the test data file using a regular expression.
1341 // Each kind of token is recognized in its own capture group; what type of item was scanned
1342 // is identified by which group had a match.
1343 //
1344 // Capture Group # 1 2 3 4 5
1345 // Parses this item: divide x hex digits comment \n unrecognized \n
1346 //
1347 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1348 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1349 UnicodeString testString;
1350 UVector32 breakPositions(status);
1351 int lineNumber = 1;
1352 TEST_ASSERT_SUCCESS(status);
1353 if (U_FAILURE(status)) {
1354 return;
1355 }
1356
1357 //
1358 // Scan through each test case, building up the string to be broken in testString,
1359 // and the positions that should be boundaries in the breakPositions vector.
1360 //
1361 int spin = 0;
1362 while (tokenMatcher.find()) {
1363 if(tokenMatcher.hitEnd()) {
1364 /* Shouldn't Happen(TM). This means we didn't find the symbols we were looking for.
1365 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1366 and caused an infinite loop here on EBCDIC systems!
1367 */
1368 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1369 // return;
1370 }
1371 if (tokenMatcher.start(1, status) >= 0) {
1372 // Scanned a divide sign, indicating a break position in the test data.
1373 if (testString.length()>0) {
1374 breakPositions.addElement(testString.length(), status);
1375 }
1376 }
1377 else if (tokenMatcher.start(2, status) >= 0) {
1378 // Scanned an 'x', meaning no break at this position in the test data
1379 // Nothing to be done here.
1380 }
1381 else if (tokenMatcher.start(3, status) >= 0) {
1382 // Scanned Hex digits. Convert them to binary, append to the character data string.
1383 const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1384 int length = hexNumber.length();
1385 if (length<=8) {
1386 char buf[10];
1387 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1388 UChar32 c = (UChar32)strtol(buf, nullptr, 16);
1389 if (c<=0x10ffff) {
1390 testString.append(c);
1391 } else {
1392 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1393 fileName, lineNumber);
1394 }
1395 } else {
1396 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1397 fileName, lineNumber);
1398 }
1399 }
1400 else if (tokenMatcher.start(4, status) >= 0) {
1401 // Scanned to end of a line, possibly skipping over a comment in the process.
1402 // If the line from the file contained test data, run the test now.
1403 if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1404 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1405 }
1406
1407 // Clear out this test case.
1408 // The string and breakPositions vector will be refilled as the next
1409 // test case is parsed.
1410 testString.remove();
1411 breakPositions.removeAllElements();
1412 lineNumber++;
1413 } else {
1414 // Scanner catchall. Something unrecognized appeared on the line.
1415 char token[16];
1416 UnicodeString uToken = tokenMatcher.group(0, status);
1417 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1418 token[sizeof(token)-1] = 0;
1419 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token);
1420
1421 // Clean up, in preparation for continuing with the next line.
1422 testString.remove();
1423 breakPositions.removeAllElements();
1424 lineNumber++;
1425 }
1426 TEST_ASSERT_SUCCESS(status);
1427 if (U_FAILURE(status)) {
1428 break;
1429 }
1430 }
1431
1432 delete [] testFile;
1433 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1434 }
1435
1436 //--------------------------------------------------------------------------------------------
1437 //
1438 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1439 // test data files. Do only a simple, forward-only check -
1440 // this test is mostly to check that ICU and the Unicode
1441 // data agree with each other.
1442 //
1443 //--------------------------------------------------------------------------------------------
checkUnicodeTestCase(const char * testFileName,int lineNumber,const UnicodeString & testString,UVector32 * breakPositions,RuleBasedBreakIterator * bi)1444 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1445 const UnicodeString &testString, // Text data to be broken
1446 UVector32 *breakPositions, // Positions where breaks should be found.
1447 RuleBasedBreakIterator *bi) {
1448 int32_t pos; // Break Position in the test string
1449 int32_t expectedI = 0; // Index of expected break position in the vector of expected results.
1450 int32_t expectedPos; // Expected break position (index into test string)
1451
1452 bi->setText(testString);
1453 pos = bi->first();
1454 pos = bi->next();
1455
1456 while (pos != BreakIterator::DONE) {
1457 if (expectedI >= breakPositions->size()) {
1458 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1459 testFileName, lineNumber, pos);
1460 break;
1461 }
1462 expectedPos = breakPositions->elementAti(expectedI);
1463 if (pos < expectedPos) {
1464 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1465 testFileName, lineNumber, pos);
1466 break;
1467 }
1468 if (pos > expectedPos) {
1469 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1470 testFileName, lineNumber, expectedPos);
1471 break;
1472 }
1473 pos = bi->next();
1474 expectedI++;
1475 }
1476
1477 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1478 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1479 testFileName, lineNumber, breakPositions->elementAti(expectedI));
1480 }
1481 }
1482
1483
1484
1485 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1486 //---------------------------------------------------------------------------------------
1487 //
1488 // class RBBIMonkeyKind
1489 //
1490 // Monkey Test for Break Iteration
1491 // Abstract interface class. Concrete derived classes independently
1492 // implement the break rules for different iterator types.
1493 //
//      The Monkey Test itself doesn't know which type of break iterator it is
//      testing, but works purely in terms of the interface defined here.
1496 //
1497 //---------------------------------------------------------------------------------------
// Abstract base for the reference ("monkey") boundary implementations.
// Besides the pure virtual interface it holds two pieces of shared bookkeeping:
// the names of the character classes and, per text position, a string naming
// the rule that was applied there.
class RBBIMonkeyKind {
public:
    // Return a UVector of UnicodeSets, representing the character classes used
    //   for this type of iterator.
    virtual UVector *charClasses() = 0;

    // Set the test text on which subsequent calls to next() will operate
    virtual void setText(const UnicodeString &s) = 0;

    // Find the next break position, starting from the prev break position, or from zero.
    // Return -1 after reaching end of string.
    virtual int32_t next(int32_t i) = 0;

    // Name of each character class, parallel with charClasses. Used for debugging output
    // of characters.
    virtual std::vector<std::string>& characterClassNames();

    // Record `value` as the rule applied at text offset `position`.
    // `position` indexes appliedRules directly, with no range check.
    void setAppliedRule(int32_t position, const char* value);

    // Return a copy of the rule string recorded for text offset `position`
    // (again with no range check).
    std::string getAppliedRule(int32_t position);

    virtual ~RBBIMonkeyKind();

    // Initialized to U_ZERO_ERROR by the constructor of this class.
    UErrorCode deferredStatus;

    // Return the name of the first character class (in charClasses() order) that
    // contains `c`.
    std::string classNameFromCodepoint(const UChar32 c);

    // Length of the longest class name, over the first charClasses()->size() names.
    unsigned int maxClassNameSize();

protected:
    RBBIMonkeyKind();

    // Class names, parallel with charClasses().
    std::vector<std::string> classNames;

    // One entry per text offset; see prepareAppliedRules().
    std::vector<std::string> appliedRules;

    // Clear `appliedRules` and fill it with empty strings in the size of test text.
    void prepareAppliedRules(int32_t size );

private:

};
1536
RBBIMonkeyKind()1537 RBBIMonkeyKind::RBBIMonkeyKind() {
1538 deferredStatus = U_ZERO_ERROR;
1539 }
1540
~RBBIMonkeyKind()1541 RBBIMonkeyKind::~RBBIMonkeyKind() {
1542 }
1543
characterClassNames()1544 std::vector<std::string>& RBBIMonkeyKind::characterClassNames() {
1545 return classNames;
1546 }
1547
prepareAppliedRules(int32_t size)1548 void RBBIMonkeyKind::prepareAppliedRules(int32_t size) {
1549 // Remove all the information in the `appliedRules`.
1550 appliedRules.clear();
1551 appliedRules.resize(size + 1);
1552 }
1553
setAppliedRule(int32_t position,const char * value)1554 void RBBIMonkeyKind::setAppliedRule(int32_t position, const char* value) {
1555 appliedRules[position] = value;
1556 }
1557
getAppliedRule(int32_t position)1558 std::string RBBIMonkeyKind::getAppliedRule(int32_t position){
1559 return appliedRules[position];
1560 }
1561
classNameFromCodepoint(const UChar32 c)1562 std::string RBBIMonkeyKind::classNameFromCodepoint(const UChar32 c) {
1563 // Simply iterate through charClasses to find character's class
1564 for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1565 UnicodeSet *classSet = static_cast<UnicodeSet *>(charClasses()->elementAt(aClassNum));
1566 if (classSet->contains(c)) {
1567 return classNames[aClassNum];
1568 }
1569 }
1570 U_ASSERT(false); // This should not happen.
1571 return "bad class name";
1572 }
1573
maxClassNameSize()1574 unsigned int RBBIMonkeyKind::maxClassNameSize() {
1575 unsigned int maxSize = 0;
1576 for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1577 auto aClassNumSize = static_cast<unsigned int>(classNames[aClassNum].size());
1578 if (aClassNumSize > maxSize) {
1579 maxSize = aClassNumSize;
1580 }
1581 }
1582 return maxSize;
1583 }
1584
1585 //----------------------------------------------------------------------------------------
1586 //
1587 // Random Numbers. Similar to standard lib rand() and srand()
1588 // Not using library to
1589 // 1. Get same results on all platforms.
1590 // 2. Get access to current seed, to more easily reproduce failures.
1591 //
1592 //---------------------------------------------------------------------------------------
// Current state of the generator. Assign to it to replay a particular sequence.
static uint32_t m_seed = 1;

// Advance the generator one step and return a value in the range 0..32767.
// The state update is a linear congruential step in 32-bit unsigned arithmetic;
// the result is taken from bits 16..30 of the new state.
static uint32_t m_rand() {
    const uint32_t advanced = 1103515245u * m_seed + 12345u;
    m_seed = advanced;
    return (advanced >> 16) & 0x7FFFu;
}
1600
1601
1602 //------------------------------------------------------------------------------------------
1603 //
1604 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
1605 // of RBBIMonkeyKind.
1606 //
1607 //------------------------------------------------------------------------------------------
// Reference implementation of character (grapheme cluster) boundaries for the
// monkey test. Holds one UnicodeSet per character class consulted by next().
class RBBICharMonkey: public RBBIMonkeyKind {
public:
    RBBICharMonkey();
    virtual ~RBBICharMonkey();
    virtual UVector *charClasses() override;
    virtual void setText(const UnicodeString &s) override;
    virtual int32_t next(int32_t i) override;
private:
    // The sets registered as character classes, returned by charClasses().
    UVector *fSets;

    // Character class sets, allocated in the constructor.
    UnicodeSet *fCRLFSet;
    UnicodeSet *fControlSet;
    UnicodeSet *fExtendSet;
    UnicodeSet *fZWJSet;
    UnicodeSet *fRegionalIndicatorSet;
    UnicodeSet *fPrependSet;
    UnicodeSet *fSpacingSet;
    UnicodeSet *fLSet;
    UnicodeSet *fVSet;
    UnicodeSet *fTSet;
    UnicodeSet *fLVSet;
    UnicodeSet *fLVTSet;
    UnicodeSet *fHangulSet;             // Union of the L, V, T, LV and LVT sets.
    UnicodeSet *fExtendedPictSet;
    UnicodeSet *fViramaSet;
    UnicodeSet *fLinkingConsonantSet;
    UnicodeSet *fExtCccZwjSet;
    UnicodeSet *fAnySet;                // All code points, 0 through 0x10ffff.

    // Address of the string most recently passed to setText(); not a copy.
    const UnicodeString *fText;
};
1639
1640
RBBICharMonkey()1641 RBBICharMonkey::RBBICharMonkey() {
1642 UErrorCode status = U_ZERO_ERROR;
1643
1644 fText = nullptr;
1645
1646 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1647 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
1648 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
1649 fZWJSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
1650 fRegionalIndicatorSet =
1651 new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1652 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1653 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1654 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1655 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1656 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1657 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1658 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1659 fHangulSet = new UnicodeSet();
1660 fHangulSet->addAll(*fLSet);
1661 fHangulSet->addAll(*fVSet);
1662 fHangulSet->addAll(*fTSet);
1663 fHangulSet->addAll(*fLVSet);
1664 fHangulSet->addAll(*fLVTSet);
1665
1666 fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1667 fViramaSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1668 "\\p{Indic_Syllabic_Category=Virama}]", status);
1669 fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1670 "\\p{Indic_Syllabic_Category=Consonant}]", status);
1671 fExtCccZwjSet = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status);
1672 fAnySet = new UnicodeSet(0, 0x10ffff);
1673
1674 // Create sets of characters, and add the names of the above character sets.
1675 // In each new ICU release, add new names corresponding to the sets above.
1676 fSets = new UVector(status);
1677
1678 // Important: Keep class names the same as the class contents.
1679 fSets->addElement(fCRLFSet, status); classNames.push_back("CRLF");
1680 fSets->addElement(fControlSet, status); classNames.push_back("Control");
1681 fSets->addElement(fExtendSet, status); classNames.push_back("Extended");
1682 fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
1683 if (!fPrependSet->isEmpty()) {
1684 fSets->addElement(fPrependSet, status); classNames.push_back("Prepend");
1685 }
1686 fSets->addElement(fSpacingSet, status); classNames.push_back("Spacing");
1687 fSets->addElement(fHangulSet, status); classNames.push_back("Hangul");
1688 fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
1689 fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
1690 fSets->addElement(fViramaSet, status); classNames.push_back("Virama");
1691 fSets->addElement(fLinkingConsonantSet, status); classNames.push_back("LinkingConsonant");
1692 fSets->addElement(fExtCccZwjSet, status); classNames.push_back("ExtCcccZwj");
1693 fSets->addElement(fAnySet, status); classNames.push_back("Any");
1694
1695 if (U_FAILURE(status)) {
1696 deferredStatus = status;
1697 }
1698 }
1699
1700
setText(const UnicodeString & s)1701 void RBBICharMonkey::setText(const UnicodeString &s) {
1702 fText = &s;
1703 prepareAppliedRules(s.length());
1704 }
1705
1706
1707
// Find the next grapheme cluster boundary after prevPos in the text set by setText().
//   prevPos   offset of the previous boundary, or zero to start from the beginning.
// Returns the offset of the next boundary, or -1 if deferredStatus holds a failure
// or prevPos is at or beyond the end of the text.
// For every candidate position examined, the rule that decided it is recorded
// with setAppliedRule(). The rules are tried in a fixed order; the first one that
// matches either ends the search (break) or moves on to the next position (continue).
int32_t RBBICharMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
    UChar32 cBase;            // for (X Extend*) patterns, the X character.

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Previous break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }

    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = cBase = 0;
    (void)p0;   // suppress set but not used warning.
    (void)c0;

    // Loop runs once per "significant" character position in the input text.
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by one codepoint
        p3 = fText->moveIndex32(p3, 1);
        c3 = fText->char32At(p3);

        if (p1 == p2) {
            // Still warming up the loop.  (Won't work with zero length strings, but we don't care.)
            continue;
        }

        // The end of the text is always a boundary.
        if (p2 == fText->length()) {
            setAppliedRule(p2, "End of String");
            break;
        }

        //     No Extend or Format characters may appear between the CR and LF,
        //     which requires the additional check for p2 immediately following p1.
        //
        if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
            setAppliedRule(p2, "GB3 CR x LF");
            continue;
        }

        if (fControlSet->contains(c1) ||
            c1 == 0x0D ||
            c1 == 0x0A)  {
            setAppliedRule(p2, "GB4 ( Control | CR | LF ) <break>");
            break;
        }

        if (fControlSet->contains(c2) ||
            c2 == 0x0D ||
            c2 == 0x0A)  {
            setAppliedRule(p2, "GB5 <break> ( Control | CR | LF )");
            break;
        }

        if (fLSet->contains(c1) &&
               (fLSet->contains(c2)  ||
                fVSet->contains(c2)  ||
                fLVSet->contains(c2) ||
                fLVTSet->contains(c2))) {
            setAppliedRule(p2, "GB6 L x ( L | V | LV | LVT )");
            continue;
        }

        if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
            (fVSet->contains(c2) || fTSet->contains(c2)))  {
            setAppliedRule(p2, "GB7 ( LV | V ) x ( V | T )");
            continue;
        }

        if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
            fTSet->contains(c2))  {
            setAppliedRule(p2, "GB8 ( LVT | T) x T");
            continue;
        }

        if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
            // Remember the character preceding a run of Extend characters; it is
            // consulted by the GB11 check below. cBase is only updated when c1 is
            // not itself an Extend character.
            if (!fExtendSet->contains(c1)) {
                cBase = c1;
            }
            setAppliedRule(p2, "GB9 x (Extend | ZWJ)");
            continue;
        }

        if (fSpacingSet->contains(c2)) {
            setAppliedRule(p2, "GB9a x SpacingMark");
            continue;
        }

        if (fPrependSet->contains(c1)) {
            setAppliedRule(p2, "GB9b Prepend x");
            continue;
        }

        // Note: Viramas are also included in the ExtCccZwj class.
        if (fLinkingConsonantSet->contains(c2)) {
            // Scan backwards from p1 over ExtCccZwj characters, noting whether any
            // of them is a Virama, then test the character the scan stopped on.
            int pi = p1;
            bool sawVirama = false;
            while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) {
                if (fViramaSet->contains(fText->char32At(pi))) {
                    sawVirama = true;
                }
                pi = fText->moveIndex32(pi, -1);
            }
            if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) {
                setAppliedRule(p2, "GB9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant");
                continue;
            }
        }

        if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
            setAppliedRule(p2, "GB11 Extended_Pictographic Extend * ZWJ x Extended_Pictographic");
            continue;
        }

        // Note: The first if condition is a little tricky. We only need to force
        //      a break if there are three or more contiguous RIs. If there are
        //      only two, a break following will occur via other rules, and will include
        //      any trailing extend characters, which is needed behavior.
        if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
                && fRegionalIndicatorSet->contains(c2)) {
            // NOTE(review): this branch breaks, yet it records the same "x" (no break)
            // rule text as the branch below - confirm whether that is intended.
            setAppliedRule(p2, "GB12-13 Regional_Indicator x Regional_Indicator");
            break;
        }
        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
            setAppliedRule(p2, "GB12-13 Regional_Indicator x Regional_Indicator");
            continue;
        }

        // No rule prevented a break here.
        setAppliedRule(p2, "GB999 Any <break> Any");
        break;
    }

    breakPos = p2;
    return breakPos;
}
1857
1858
1859
charClasses()1860 UVector *RBBICharMonkey::charClasses() {
1861 return fSets;
1862 }
1863
~RBBICharMonkey()1864 RBBICharMonkey::~RBBICharMonkey() {
1865 delete fSets;
1866 delete fCRLFSet;
1867 delete fControlSet;
1868 delete fExtendSet;
1869 delete fRegionalIndicatorSet;
1870 delete fPrependSet;
1871 delete fSpacingSet;
1872 delete fLSet;
1873 delete fVSet;
1874 delete fTSet;
1875 delete fLVSet;
1876 delete fLVTSet;
1877 delete fHangulSet;
1878 delete fAnySet;
1879 delete fZWJSet;
1880 delete fExtendedPictSet;
1881 delete fViramaSet;
1882 delete fLinkingConsonantSet;
1883 delete fExtCccZwjSet;
1884 }
1885
1886 //------------------------------------------------------------------------------------------
1887 //
1888 // class RBBIWordMonkey Word Break specific implementation
1889 // of RBBIMonkeyKind.
1890 //
1891 //------------------------------------------------------------------------------------------
1892 class RBBIWordMonkey: public RBBIMonkeyKind {
1893 public:
1894 RBBIWordMonkey();
1895 virtual ~RBBIWordMonkey();
1896 virtual UVector *charClasses() override;
1897 virtual void setText(const UnicodeString &s) override;
1898 virtual int32_t next(int32_t i) override;
1899 private:
1900 UVector *fSets;
1901
1902 UnicodeSet *fCRSet;
1903 UnicodeSet *fLFSet;
1904 UnicodeSet *fNewlineSet;
1905 UnicodeSet *fRegionalIndicatorSet;
1906 UnicodeSet *fKatakanaSet;
1907 UnicodeSet *fHebrew_LetterSet;
1908 UnicodeSet *fALetterSet;
1909 UnicodeSet *fSingle_QuoteSet;
1910 UnicodeSet *fDouble_QuoteSet;
1911 UnicodeSet *fMidNumLetSet;
1912 UnicodeSet *fMidLetterSet;
1913 UnicodeSet *fMidNumSet;
1914 UnicodeSet *fNumericSet;
1915 UnicodeSet *fFormatSet;
1916 UnicodeSet *fOtherSet = nullptr;
1917 UnicodeSet *fExtendSet;
1918 UnicodeSet *fExtendNumLetSet;
1919 UnicodeSet *fWSegSpaceSet;
1920 UnicodeSet *fDictionarySet = nullptr;
1921 UnicodeSet *fZWJSet;
1922 UnicodeSet *fExtendedPictSet;
1923
1924 const UnicodeString *fText;
1925 };
1926
1927
RBBIWordMonkey()1928 RBBIWordMonkey::RBBIWordMonkey()
1929 {
1930 UErrorCode status = U_ZERO_ERROR;
1931
1932 fSets = new UVector(status);
1933
1934 fCRSet = new UnicodeSet(u"[\\p{Word_Break = CR}]", status);
1935 fLFSet = new UnicodeSet(u"[\\p{Word_Break = LF}]", status);
1936 fNewlineSet = new UnicodeSet(u"[\\p{Word_Break = Newline}]", status);
1937 fKatakanaSet = new UnicodeSet(u"[\\p{Word_Break = Katakana}]", status);
1938 fRegionalIndicatorSet = new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
1939 fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
1940 fALetterSet = new UnicodeSet(u"[\\p{Word_Break = ALetter} @]", status);
1941 fSingle_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]", status);
1942 fDouble_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]", status);
1943 fMidNumLetSet = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]", status);
1944 fMidLetterSet = new UnicodeSet(u"[\\p{Word_Break = MidLetter} - [\\: \\uFE55 \\uFF1A]]", status);
1945 fMidNumSet = new UnicodeSet(u"[\\p{Word_Break = MidNum}]", status);
1946 fNumericSet = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
1947 fFormatSet = new UnicodeSet(u"[\\p{Word_Break = Format}]", status);
1948 fExtendNumLetSet = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
1949 // There are some sc=Hani characters with WB=Extend.
1950 // The break rules need to pick one or the other because
1951 // Extend overlapping with something else is messy.
1952 // For Unicode 13, we chose to keep U+16FF0 & U+16FF1
1953 // in $Han (for $dictionary) and out of $Extend.
1954 fExtendSet = new UnicodeSet(u"[\\p{Word_Break = Extend}-[:Hani:]]", status);
1955 fWSegSpaceSet = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]", status);
1956
1957 fZWJSet = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]", status);
1958 fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1959 if(U_FAILURE(status)) {
1960 IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
1961 deferredStatus = status;
1962 return;
1963 }
1964
1965 fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
1966 fDictionarySet->addAll(*fKatakanaSet);
1967 fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
1968
1969 fALetterSet->removeAll(*fDictionarySet);
1970
1971 fOtherSet = new UnicodeSet();
1972 if(U_FAILURE(status)) {
1973 IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
1974 deferredStatus = status;
1975 return;
1976 }
1977
1978 fOtherSet->complement();
1979 fOtherSet->removeAll(*fCRSet);
1980 fOtherSet->removeAll(*fLFSet);
1981 fOtherSet->removeAll(*fNewlineSet);
1982 fOtherSet->removeAll(*fKatakanaSet);
1983 fOtherSet->removeAll(*fHebrew_LetterSet);
1984 fOtherSet->removeAll(*fALetterSet);
1985 fOtherSet->removeAll(*fSingle_QuoteSet);
1986 fOtherSet->removeAll(*fDouble_QuoteSet);
1987 fOtherSet->removeAll(*fMidLetterSet);
1988 fOtherSet->removeAll(*fMidNumSet);
1989 fOtherSet->removeAll(*fNumericSet);
1990 fOtherSet->removeAll(*fExtendNumLetSet);
1991 fOtherSet->removeAll(*fWSegSpaceSet);
1992 fOtherSet->removeAll(*fFormatSet);
1993 fOtherSet->removeAll(*fExtendSet);
1994 fOtherSet->removeAll(*fRegionalIndicatorSet);
1995 fOtherSet->removeAll(*fZWJSet);
1996 fOtherSet->removeAll(*fExtendedPictSet);
1997
1998 // Inhibit dictionary characters from being tested at all.
1999 fOtherSet->removeAll(*fDictionarySet);
2000
2001 // Add classes and their names
2002 fSets->addElement(fCRSet, status); classNames.push_back("CR");
2003 fSets->addElement(fLFSet, status); classNames.push_back("LF");
2004 fSets->addElement(fNewlineSet, status); classNames.push_back("Newline");
2005 fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
2006 fSets->addElement(fHebrew_LetterSet, status); classNames.push_back("Hebrew");
2007 fSets->addElement(fALetterSet, status); classNames.push_back("ALetter");
2008 fSets->addElement(fSingle_QuoteSet, status); classNames.push_back("Single Quote");
2009 fSets->addElement(fDouble_QuoteSet, status); classNames.push_back("Double Quote");
2010 // Omit Katakana from fSets, which omits Katakana characters
2011 // from the test data. They are all in the dictionary set,
2012 // which this (old, to be retired) monkey test cannot handle.
2013 //fSets->addElement(fKatakanaSet, status);
2014
2015 fSets->addElement(fMidLetterSet, status); classNames.push_back("MidLetter");
2016 fSets->addElement(fMidNumLetSet, status); classNames.push_back("MidNumLet");
2017 fSets->addElement(fMidNumSet, status); classNames.push_back("MidNum");
2018 fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
2019 fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2020 fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2021 fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2022 fSets->addElement(fExtendNumLetSet, status); classNames.push_back("ExtendNumLet");
2023 fSets->addElement(fWSegSpaceSet, status); classNames.push_back("WSegSpace");
2024
2025 fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
2026 fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
2027
2028 if (U_FAILURE(status)) {
2029 deferredStatus = status;
2030 }
2031 }
2032
setText(const UnicodeString & s)2033 void RBBIWordMonkey::setText(const UnicodeString &s) {
2034 fText = &s;
2035 prepareAppliedRules(s.length());
2036 }
2037
2038
next(int32_t prevPos)2039 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2040 int p0, p1, p2, p3; // Indices of the significant code points around the
2041 // break position being tested. The candidate break
2042 // location is before p2.
2043
2044 int breakPos = -1;
2045
2046 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2047
2048 if (U_FAILURE(deferredStatus)) {
2049 return -1;
2050 }
2051
2052 // Prev break at end of string. return DONE.
2053 if (prevPos >= fText->length()) {
2054 return -1;
2055 }
2056 p0 = p1 = p2 = p3 = prevPos;
2057 c3 = fText->char32At(prevPos);
2058 c0 = c1 = c2 = 0;
2059 (void)p0; // Suppress set but not used warning.
2060
2061 // Loop runs once per "significant" character position in the input text.
2062 for (;;) {
2063 // Move all of the positions forward in the input string.
2064 p0 = p1; c0 = c1;
2065 p1 = p2; c1 = c2;
2066 p2 = p3; c2 = c3;
2067
2068 // Advance p3 by X(Extend | Format)* Rule 4
2069 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2070 do {
2071 p3 = fText->moveIndex32(p3, 1);
2072 c3 = fText->char32At(p3);
2073 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2074 break;
2075 }
2076 }
2077 while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
2078
2079
2080 if (p1 == p2) {
2081 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2082 continue;
2083 }
2084
2085 if (p2 == fText->length()) {
2086 // Reached end of string. Always a break position.
2087 break;
2088 }
2089
2090 // No Extend or Format characters may appear between the CR and LF,
2091 // which requires the additional check for p2 immediately following p1.
2092 //
2093 if (c1==0x0D && c2==0x0A) {
2094 setAppliedRule(p2, "WB3 CR x LF");
2095 continue;
2096 }
2097
2098 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2099 setAppliedRule(p2, "WB3a Break before and after newlines (including CR and LF)");
2100 break;
2101 }
2102 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2103 setAppliedRule(p2, "WB3a Break before and after newlines (including CR and LF)");
2104 break;
2105 }
2106
2107 // Not ignoring extend chars, so peek into input text to
2108 // get the potential ZWJ, the character immediately preceding c2.
2109 // Sloppy UChar32 indexing: p2-1 may reference trail half
2110 // but char32At will get the full code point.
2111 if (fZWJSet->contains(fText->char32At(p2 - 1)) && fExtendedPictSet->contains(c2)){
2112 setAppliedRule(p2, "WB3c ZWJ x Extended_Pictographic");
2113 continue;
2114 }
2115
2116 if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
2117 setAppliedRule(p2, "WB3d Keep horizontal whitespace together.");
2118 continue;
2119 }
2120
2121 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2122 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2123 setAppliedRule(p2, "WB4 (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)");
2124 continue;
2125 }
2126
2127 if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2128 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2129 (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2130 setAppliedRule(p2,
2131 "WB6 (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter _Letter)");
2132 continue;
2133 }
2134
2135 if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2136 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2137 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2138 setAppliedRule(p2,
2139 "WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)");
2140 continue;
2141 }
2142
2143 if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2144 setAppliedRule(p2, "WB7a Hebrew_Letter x Single_Quote");
2145 continue;
2146 }
2147
2148 if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2149 setAppliedRule(p2, "WB7b Hebrew_Letter x Double_Quote Hebrew_Letter");
2150 continue;
2151 }
2152
2153 if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2154 setAppliedRule(p2, "WB7c Hebrew_Letter Double_Quote x Hebrew_Letter");
2155 continue;
2156 }
2157
2158 if (fNumericSet->contains(c1) &&
2159 fNumericSet->contains(c2)) {
2160 setAppliedRule(p2, "WB8 Numeric x Numeric");
2161 continue;
2162 }
2163
2164 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2165 fNumericSet->contains(c2)) {
2166 setAppliedRule(p2, "WB9 (ALetter | Hebrew_Letter) x Numeric");
2167 continue;
2168 }
2169
2170 if (fNumericSet->contains(c1) &&
2171 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2172 setAppliedRule(p2, "WB10 Numeric x (ALetter | Hebrew_Letter)");
2173 continue;
2174 }
2175
2176 if (fNumericSet->contains(c0) &&
2177 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2178 fNumericSet->contains(c2)) {
2179 setAppliedRule(p2, "WB11 Numeric (MidNum | MidNumLet | Single_Quote) x Numeric");
2180 continue;
2181 }
2182
2183 if (fNumericSet->contains(c1) &&
2184 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2185 fNumericSet->contains(c3)) {
2186 setAppliedRule(p2, "WB12 Numeric x (MidNum | MidNumLet | SingleQuote) Numeric");
2187 continue;
2188 }
2189
2190 // Note: matches UAX 29 rules, but doesn't come into play for ICU because
2191 // all Katakana are handled by the dictionary breaker.
2192 if (fKatakanaSet->contains(c1) &&
2193 fKatakanaSet->contains(c2)) {
2194 setAppliedRule(p2, "WB13 Katakana x Katakana");
2195 continue;
2196 }
2197
2198 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2199 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2200 fExtendNumLetSet->contains(c2)) {
2201 setAppliedRule(p2,
2202 "WB13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet");
2203 continue;
2204 }
2205
2206 if (fExtendNumLetSet->contains(c1) &&
2207 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2208 fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) {
2209 setAppliedRule(p2, "WB13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)");
2210 continue;
2211 }
2212
2213 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2214 setAppliedRule(p2, "WB15 - WB17 Group pairs of Regional Indicators.");
2215 break;
2216 }
2217 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2218 setAppliedRule(p2, "WB15 - WB17 Group pairs of Regional Indicators.");
2219 continue;
2220 }
2221
2222 setAppliedRule(p2, "WB999");
2223 break;
2224 }
2225
2226 breakPos = p2;
2227 return breakPos;
2228 }
2229
2230
charClasses()2231 UVector *RBBIWordMonkey::charClasses() {
2232 return fSets;
2233 }
2234
~RBBIWordMonkey()2235 RBBIWordMonkey::~RBBIWordMonkey() {
2236 delete fSets;
2237 delete fCRSet;
2238 delete fLFSet;
2239 delete fNewlineSet;
2240 delete fKatakanaSet;
2241 delete fHebrew_LetterSet;
2242 delete fALetterSet;
2243 delete fSingle_QuoteSet;
2244 delete fDouble_QuoteSet;
2245 delete fMidNumLetSet;
2246 delete fMidLetterSet;
2247 delete fMidNumSet;
2248 delete fNumericSet;
2249 delete fFormatSet;
2250 delete fExtendSet;
2251 delete fExtendNumLetSet;
2252 delete fWSegSpaceSet;
2253 delete fRegionalIndicatorSet;
2254 delete fDictionarySet;
2255 delete fOtherSet;
2256 delete fZWJSet;
2257 delete fExtendedPictSet;
2258 }
2259
2260
2261
2262
2263 //------------------------------------------------------------------------------------------
2264 //
2265 // class RBBISentMonkey Sentence Break specific implementation
2266 // of RBBIMonkeyKind.
2267 //
2268 //------------------------------------------------------------------------------------------
2269 class RBBISentMonkey: public RBBIMonkeyKind {
2270 public:
2271 RBBISentMonkey();
2272 virtual ~RBBISentMonkey();
2273 virtual UVector *charClasses() override;
2274 virtual void setText(const UnicodeString &s) override;
2275 virtual int32_t next(int32_t i) override;
2276 private:
2277 int moveBack(int posFrom);
2278 int moveForward(int posFrom);
2279 UChar32 cAt(int pos);
2280
2281 UVector *fSets;
2282
2283 UnicodeSet *fSepSet;
2284 UnicodeSet *fFormatSet;
2285 UnicodeSet *fSpSet;
2286 UnicodeSet *fLowerSet;
2287 UnicodeSet *fUpperSet;
2288 UnicodeSet *fOLetterSet;
2289 UnicodeSet *fNumericSet;
2290 UnicodeSet *fATermSet;
2291 UnicodeSet *fSContinueSet;
2292 UnicodeSet *fSTermSet;
2293 UnicodeSet *fCloseSet;
2294 UnicodeSet *fOtherSet;
2295 UnicodeSet *fExtendSet;
2296
2297 const UnicodeString *fText;
2298 };
2299
RBBISentMonkey()2300 RBBISentMonkey::RBBISentMonkey()
2301 {
2302 UErrorCode status = U_ZERO_ERROR;
2303
2304 fSets = new UVector(status);
2305
2306 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2307 // set and made into character classes of their own. For the monkey impl,
2308 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2309 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status);
2310 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status);
2311 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status);
2312 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status);
2313 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status);
2314 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status);
2315 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status);
2316 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status);
2317 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2318 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status);
2319 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status);
2320 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status);
2321 fOtherSet = new UnicodeSet();
2322
2323 if(U_FAILURE(status)) {
2324 deferredStatus = status;
2325 return;
2326 }
2327
2328 fOtherSet->complement();
2329 fOtherSet->removeAll(*fSepSet);
2330 fOtherSet->removeAll(*fFormatSet);
2331 fOtherSet->removeAll(*fSpSet);
2332 fOtherSet->removeAll(*fLowerSet);
2333 fOtherSet->removeAll(*fUpperSet);
2334 fOtherSet->removeAll(*fOLetterSet);
2335 fOtherSet->removeAll(*fNumericSet);
2336 fOtherSet->removeAll(*fATermSet);
2337 fOtherSet->removeAll(*fSContinueSet);
2338 fOtherSet->removeAll(*fSTermSet);
2339 fOtherSet->removeAll(*fCloseSet);
2340 fOtherSet->removeAll(*fExtendSet);
2341
2342 fSets->addElement(fSepSet, status); classNames.push_back("Sep");
2343 fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2344 fSets->addElement(fSpSet, status); classNames.push_back("Sp");
2345 fSets->addElement(fLowerSet, status); classNames.push_back("Lower");
2346 fSets->addElement(fUpperSet, status); classNames.push_back("Upper");
2347 fSets->addElement(fOLetterSet, status); classNames.push_back("OLetter");
2348 fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
2349 fSets->addElement(fATermSet, status); classNames.push_back("ATerm");
2350 fSets->addElement(fSContinueSet, status); classNames.push_back("SContinue");
2351 fSets->addElement(fSTermSet, status); classNames.push_back("STerm");
2352 fSets->addElement(fCloseSet, status); classNames.push_back("Close");
2353 fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2354 fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2355
2356 if (U_FAILURE(status)) {
2357 deferredStatus = status;
2358 }
2359 }
2360
2361
2362
setText(const UnicodeString & s)2363 void RBBISentMonkey::setText(const UnicodeString &s) {
2364 fText = &s;
2365 prepareAppliedRules(s.length());
2366 }
2367
charClasses()2368 UVector *RBBISentMonkey::charClasses() {
2369 return fSets;
2370 }
2371
2372 // moveBack() Find the "significant" code point preceding the index i.
2373 // Skips over ($Extend | $Format)* .
2374 //
moveBack(int i)2375 int RBBISentMonkey::moveBack(int i) {
2376 if (i <= 0) {
2377 return -1;
2378 }
2379 UChar32 c;
2380 int32_t j = i;
2381 do {
2382 j = fText->moveIndex32(j, -1);
2383 c = fText->char32At(j);
2384 }
2385 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2386 return j;
2387
2388 }
2389
2390
moveForward(int i)2391 int RBBISentMonkey::moveForward(int i) {
2392 if (i>=fText->length()) {
2393 return fText->length();
2394 }
2395 UChar32 c;
2396 int32_t j = i;
2397 do {
2398 j = fText->moveIndex32(j, 1);
2399 c = cAt(j);
2400 }
2401 while (fFormatSet->contains(c) || fExtendSet->contains(c));
2402 return j;
2403 }
2404
cAt(int pos)2405 UChar32 RBBISentMonkey::cAt(int pos) {
2406 if (pos<0 || pos>=fText->length()) {
2407 return -1;
2408 } else {
2409 return fText->char32At(pos);
2410 }
2411 }
2412
next(int32_t prevPos)2413 int32_t RBBISentMonkey::next(int32_t prevPos) {
2414 int p0, p1, p2, p3; // Indices of the significant code points around the
2415 // break position being tested. The candidate break
2416 // location is before p2.
2417
2418 int breakPos = -1;
2419
2420 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2421 UChar32 c;
2422
2423 if (U_FAILURE(deferredStatus)) {
2424 return -1;
2425 }
2426
2427 // Prev break at end of string. return DONE.
2428 if (prevPos >= fText->length()) {
2429 return -1;
2430 }
2431 p0 = p1 = p2 = p3 = prevPos;
2432 c3 = fText->char32At(prevPos);
2433 c0 = c1 = c2 = 0;
2434 (void)p0; // Suppress set but not used warning.
2435
2436 // Loop runs once per "significant" character position in the input text.
2437 for (;;) {
2438 // Move all of the positions forward in the input string.
2439 p0 = p1; c0 = c1;
2440 p1 = p2; c1 = c2;
2441 p2 = p3; c2 = c3;
2442
2443 // Advance p3 by X(Extend | Format)* Rule 4
2444 p3 = moveForward(p3);
2445 c3 = cAt(p3);
2446
2447 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2448 setAppliedRule(p2, "SB3 CR x LF");
2449 continue;
2450 }
2451
2452 if (fSepSet->contains(c1)) {
2453 p2 = p1+1; // Separators don't combine with Extend or Format.
2454
2455 setAppliedRule(p2, "SB4 Sep <break>");
2456 break;
2457 }
2458
2459 if (p2 >= fText->length()) {
2460 // Reached end of string. Always a break position.
2461 setAppliedRule(p2, "SB4 Sep <break>");
2462 break;
2463 }
2464
2465 if (p2 == prevPos) {
2466 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2467 setAppliedRule(p2, "SB4 Sep <break>");
2468 continue;
2469 }
2470
2471 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) {
2472 setAppliedRule(p2, "SB6 ATerm x Numeric");
2473 continue;
2474 }
2475
2476 if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2477 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2478 setAppliedRule(p2, "SB7 (Upper | Lower) ATerm x Uppper");
2479 continue;
2480 }
2481
2482 // Note: STerm | ATerm are added to the negated part of the expression by a
2483 // note to the Unicode 5.0 documents.
2484 int p8 = p1;
2485 while (fSpSet->contains(cAt(p8))) {
2486 p8 = moveBack(p8);
2487 }
2488 while (fCloseSet->contains(cAt(p8))) {
2489 p8 = moveBack(p8);
2490 }
2491 if (fATermSet->contains(cAt(p8))) {
2492 p8=p2;
2493 for (;;) {
2494 c = cAt(p8);
2495 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2496 fLowerSet->contains(c) || fSepSet->contains(c) ||
2497 fATermSet->contains(c) || fSTermSet->contains(c)) {
2498
2499 setAppliedRule(p2,
2500 "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2501 break;
2502 }
2503 p8 = moveForward(p8);
2504 }
2505 if (fLowerSet->contains(cAt(p8))) {
2506
2507 setAppliedRule(p2,
2508 "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2509 continue;
2510 }
2511 }
2512
2513 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2514 p8 = p1;
2515 while (fSpSet->contains(cAt(p8))) {
2516 p8 = moveBack(p8);
2517 }
2518 while (fCloseSet->contains(cAt(p8))) {
2519 p8 = moveBack(p8);
2520 }
2521 c = cAt(p8);
2522 if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2523 setAppliedRule(p2, "SB8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm)");
2524 continue;
2525 }
2526 }
2527
2528 int p9 = p1;
2529 while (fCloseSet->contains(cAt(p9))) {
2530 p9 = moveBack(p9);
2531 }
2532 c = cAt(p9);
2533 if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2534 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2535
2536 setAppliedRule(p2, "SB9 (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)");
2537 continue;
2538 }
2539 }
2540
2541 int p10 = p1;
2542 while (fSpSet->contains(cAt(p10))) {
2543 p10 = moveBack(p10);
2544 }
2545 while (fCloseSet->contains(cAt(p10))) {
2546 p10 = moveBack(p10);
2547 }
2548 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2549 if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2550 setAppliedRule(p2, "SB10 (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)");
2551 continue;
2552 }
2553 }
2554
2555 int p11 = p1;
2556 if (fSepSet->contains(cAt(p11))) {
2557 p11 = moveBack(p11);
2558 }
2559 while (fSpSet->contains(cAt(p11))) {
2560 p11 = moveBack(p11);
2561 }
2562 while (fCloseSet->contains(cAt(p11))) {
2563 p11 = moveBack(p11);
2564 }
2565 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2566 setAppliedRule(p2, "SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>");
2567 break;
2568 }
2569
2570 setAppliedRule(p2, "SB12 Any x Any");
2571 continue;
2572 }
2573
2574 breakPos = p2;
2575 return breakPos;
2576 }
2577
~RBBISentMonkey()2578 RBBISentMonkey::~RBBISentMonkey() {
2579 delete fSets;
2580 delete fSepSet;
2581 delete fFormatSet;
2582 delete fSpSet;
2583 delete fLowerSet;
2584 delete fUpperSet;
2585 delete fOLetterSet;
2586 delete fNumericSet;
2587 delete fATermSet;
2588 delete fSContinueSet;
2589 delete fSTermSet;
2590 delete fCloseSet;
2591 delete fOtherSet;
2592 delete fExtendSet;
2593 }
2594
2595
2596
2597 //-------------------------------------------------------------------------------------------
2598 //
2599 // RBBILineMonkey
2600 //
2601 //-------------------------------------------------------------------------------------------
2602
2603 class RBBILineMonkey: public RBBIMonkeyKind {
2604 public:
2605 RBBILineMonkey();
2606 virtual ~RBBILineMonkey();
2607 virtual UVector *charClasses() override;
2608 virtual void setText(const UnicodeString &s) override;
2609 virtual int32_t next(int32_t i) override;
2610 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2611 private:
2612 UVector *fSets;
2613
2614 UnicodeSet *fBK;
2615 UnicodeSet *fCR;
2616 UnicodeSet *fLF;
2617 UnicodeSet *fCM;
2618 UnicodeSet *fNL;
2619 UnicodeSet *fSG;
2620 UnicodeSet *fWJ;
2621 UnicodeSet *fZW;
2622 UnicodeSet *fGL;
2623 UnicodeSet *fCB;
2624 UnicodeSet *fSP;
2625 UnicodeSet *fB2;
2626 UnicodeSet *fBA;
2627 UnicodeSet *fBB;
2628 UnicodeSet *fHH;
2629 UnicodeSet *fHY;
2630 UnicodeSet *fH2;
2631 UnicodeSet *fH3;
2632 UnicodeSet *fCL;
2633 UnicodeSet *fCP;
2634 UnicodeSet *fEX;
2635 UnicodeSet *fIN;
2636 UnicodeSet *fJL;
2637 UnicodeSet *fJV;
2638 UnicodeSet *fJT;
2639 UnicodeSet *fNS;
2640 UnicodeSet *fOP;
2641 UnicodeSet *fQU;
2642 UnicodeSet *fIS;
2643 UnicodeSet *fNU;
2644 UnicodeSet *fPO;
2645 UnicodeSet *fPR;
2646 UnicodeSet *fSY;
2647 UnicodeSet *fAI;
2648 UnicodeSet *fAL;
2649 UnicodeSet *fCJ;
2650 UnicodeSet *fHL;
2651 UnicodeSet *fID;
2652 UnicodeSet *fRI;
2653 UnicodeSet *fXX;
2654 UnicodeSet *fEB;
2655 UnicodeSet *fEM;
2656 UnicodeSet *fZWJ;
2657 UnicodeSet *fOP30;
2658 UnicodeSet *fCP30;
2659 UnicodeSet *fExtPictUnassigned;
2660
2661 BreakIterator *fCharBI;
2662 const UnicodeString *fText;
2663 RegexMatcher *fNumberMatcher;
2664 };
2665
RBBILineMonkey()2666 RBBILineMonkey::RBBILineMonkey() :
2667 RBBIMonkeyKind(),
2668 fSets(nullptr),
2669
2670 fCharBI(nullptr),
2671 fText(nullptr),
2672 fNumberMatcher(nullptr)
2673
2674 {
2675 if (U_FAILURE(deferredStatus)) {
2676 return;
2677 }
2678
2679 UErrorCode status = U_ZERO_ERROR;
2680
2681 fSets = new UVector(status);
2682
2683 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2684 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2685 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2686 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2687 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2688 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2689 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2690 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2691 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2692 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2693 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2694 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2695 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2696 fHH = new UnicodeSet();
2697 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2698 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2699 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2700 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2701 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2702 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2703 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2704 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2705 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2706 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2707 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2708 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2709 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2710 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2711 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2712 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2713 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2714 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2715 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2716 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2717 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2718 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2719 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2720 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2721 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2722 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2723 fEB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status);
2724 fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
2725 fZWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
2726 fOP30 = new UnicodeSet(u"[\\p{Line_break=OP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2727 fCP30 = new UnicodeSet(u"[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2728 fExtPictUnassigned = new UnicodeSet(u"[\\p{Extended_Pictographic}&\\p{Cn}]", status);
2729
2730 if (U_FAILURE(status)) {
2731 deferredStatus = status;
2732 return;
2733 }
2734
2735 fAL->addAll(*fXX); // Default behavior for XX is identical to AL
2736 fAL->addAll(*fAI); // Default behavior for AI is identical to AL
2737 fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
2738
2739 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
2740 fCM->addAll(*fZWJ); // ZWJ behaves as a CM.
2741
2742 fHH->add(u'\u2010'); // Hyphen, '‐'
2743
2744 // Sets and names.
2745 fSets->addElement(fBK, status); classNames.push_back("fBK");
2746 fSets->addElement(fCR, status); classNames.push_back("fCR");
2747 fSets->addElement(fLF, status); classNames.push_back("fLF");
2748 fSets->addElement(fCM, status); classNames.push_back("fCM");
2749 fSets->addElement(fNL, status); classNames.push_back("fNL");
2750 fSets->addElement(fWJ, status); classNames.push_back("fWJ");
2751 fSets->addElement(fZW, status); classNames.push_back("fZW");
2752 fSets->addElement(fGL, status); classNames.push_back("fGL");
2753 fSets->addElement(fCB, status); classNames.push_back("fCB");
2754 fSets->addElement(fSP, status); classNames.push_back("fSP");
2755 fSets->addElement(fB2, status); classNames.push_back("fB2");
2756 fSets->addElement(fBA, status); classNames.push_back("fBA");
2757 fSets->addElement(fBB, status); classNames.push_back("fBB");
2758 fSets->addElement(fHY, status); classNames.push_back("fHY");
2759 fSets->addElement(fH2, status); classNames.push_back("fH2");
2760 fSets->addElement(fH3, status); classNames.push_back("fH3");
2761 fSets->addElement(fCL, status); classNames.push_back("fCL");
2762 fSets->addElement(fCP, status); classNames.push_back("fCP");
2763 fSets->addElement(fEX, status); classNames.push_back("fEX");
2764 fSets->addElement(fIN, status); classNames.push_back("fIN");
2765 fSets->addElement(fJL, status); classNames.push_back("fJL");
2766 fSets->addElement(fJT, status); classNames.push_back("fJT");
2767 fSets->addElement(fJV, status); classNames.push_back("fJV");
2768 fSets->addElement(fNS, status); classNames.push_back("fNS");
2769 fSets->addElement(fOP, status); classNames.push_back("fOP");
2770 fSets->addElement(fQU, status); classNames.push_back("fQU");
2771 fSets->addElement(fIS, status); classNames.push_back("fIS");
2772 fSets->addElement(fNU, status); classNames.push_back("fNU");
2773 fSets->addElement(fPO, status); classNames.push_back("fPO");
2774 fSets->addElement(fPR, status); classNames.push_back("fPR");
2775 fSets->addElement(fSY, status); classNames.push_back("fSY");
2776 fSets->addElement(fAI, status); classNames.push_back("fAI");
2777 fSets->addElement(fAL, status); classNames.push_back("fAL");
2778 fSets->addElement(fHL, status); classNames.push_back("fHL");
2779 fSets->addElement(fID, status); classNames.push_back("fID");
2780 fSets->addElement(fRI, status); classNames.push_back("fRI");
2781 fSets->addElement(fSG, status); classNames.push_back("fSG");
2782 fSets->addElement(fEB, status); classNames.push_back("fEB");
2783 fSets->addElement(fEM, status); classNames.push_back("fEM");
2784 fSets->addElement(fZWJ, status); classNames.push_back("fZWJ");
2785 // TODO: fOP30 & fCP30 overlap with plain fOP. Probably OK, but fOP/CP chars will be over-represented.
2786 fSets->addElement(fOP30, status); classNames.push_back("fOP30");
2787 fSets->addElement(fCP30, status); classNames.push_back("fCP30");
2788 fSets->addElement(fExtPictUnassigned, status); classNames.push_back("fExtPictUnassigned");
2789
2790 const char *rules =
2791 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
2792 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
2793 "((\\p{Line_Break=IS})(\\p{Line_Break=CM}|\\u200d)*)?"
2794 "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
2795 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
2796 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
2797 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
2798
2799 fNumberMatcher = new RegexMatcher(
2800 UnicodeString(rules, -1, US_INV), 0, status);
2801
2802 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2803
2804 if (U_FAILURE(status)) {
2805 deferredStatus = status;
2806 }
2807
2808 }
2809
2810
setText(const UnicodeString & s)2811 void RBBILineMonkey::setText(const UnicodeString &s) {
2812 fText = &s;
2813 fCharBI->setText(s);
2814 prepareAppliedRules(s.length());
2815 fNumberMatcher->reset(s);
2816 }
2817
2818 //
2819 // rule9Adjust
2820 // Line Break TR rules 9 and 10 implementation.
//           This deals with combining marks and other sequences that
//           must be treated as if they were something other than what they actually are.
2823 //
2824 // This is factored out into a separate function because it must be applied twice for
2825 // each potential break, once to the chars before the position being checked, then
2826 // again to the text following the possible break.
2827 //
rule9Adjust(int32_t pos,UChar32 * posChar,int32_t * nextPos,UChar32 * nextChar)2828 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2829 if (pos == -1) {
2830 // Invalid initial position. Happens during the warmup iteration of the
2831 // main loop in next().
2832 return;
2833 }
2834
2835 int32_t nPos = *nextPos;
2836
2837 // LB 9 Keep combining sequences together.
2838 // advance over any CM class chars. Note that Line Break CM is different
2839 // from the normal Grapheme Extend property.
2840 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2841 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2842 for (;;) {
2843 *nextChar = fText->char32At(nPos);
2844 if (!fCM->contains(*nextChar)) {
2845 break;
2846 }
2847 nPos = fText->moveIndex32(nPos, 1);
2848 }
2849 }
2850
2851
2852 // LB 9 Treat X CM* as if it were x.
2853 // No explicit action required.
2854
2855 // LB 10 Treat any remaining combining mark as AL
2856 if (fCM->contains(*posChar)) {
2857 *posChar = u'A';
2858 }
2859
2860 // Push the updated nextPos and nextChar back to our caller.
2861 // This only makes a difference if posChar got bigger by consuming a
2862 // combining sequence.
2863 *nextPos = nPos;
2864 *nextChar = fText->char32At(nPos);
2865 }
2866
2867
2868
//
//  next
//           Reference ("monkey") implementation of line breaking: find the first
//           break position that follows startPos in the text given to setText().
//
//           The function walks forward one (combining-sequence adjusted) character
//           at a time, keeping a sliding window of prevCharX2 / prevChar / thisChar,
//           and tests the line break rules in order at each candidate position.
//           A rule that forbids a break does "continue" (try the next position);
//           a rule that requires a break does "break" out of the loop.
//           Each decision is recorded with setAppliedRule(pos, ruleName).
//
//           Returns the index of the break, or -1 if construction of this monkey
//           failed (deferredStatus) or startPos is at or beyond the end of the text.
//
int32_t RBBILineMonkey::next(int32_t startPos) {
    UErrorCode status = U_ZERO_ERROR;
    int32_t    pos;       //  Index of the char following a potential break position
    UChar32    thisChar;  //  Character at above position "pos"

    int32_t    prevPos;   //  Index of the char preceding a potential break position
    UChar32    prevChar;  //  Character at above position.  Note that prevChar
                          //   and thisChar may not be adjacent because combining
                          //   characters between them will be ignored.

    int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
    UChar32    prevCharX2;

    int32_t    nextPos;   //  Index of the next character following pos.
                          //     Usually skips over combining marks.
    int32_t    nextCPPos; //  Index of the code point following "pos."
                          //     May point to a combining mark.
    int32_t    tPos;      //  temp value.
    UChar32    c;

    // The constructor failed; the character class sets may not be usable.
    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Nothing left to scan.
    if (startPos >= fText->length()) {
        return -1;
    }


    // Initial values for loop.  Loop will run the first time without finding breaks,
    //                           while the invalid values shift out and the "this" and
    //                           "prev" positions are filled in with good values.
    pos      = prevPos   = prevPosX2 = -1;    // Invalid value, serves as flag for initial loop iteration.
    thisChar = prevChar  = prevCharX2 = 0;
    nextPos  = nextCPPos = startPos;


    // Loop runs once per position in the test text, until a break position
    //  is found.
    for (;;) {
        // Slide the three-character window forward by one position.
        prevPosX2 = prevPos;
        prevCharX2 = prevChar;

        prevPos   = pos;
        prevChar  = thisChar;

        pos       = nextPos;
        thisChar  = fText->char32At(pos);

        nextCPPos = fText->moveIndex32(pos, 1);
        nextPos   = nextCPPos;


        if (pos >= fText->length()) {
            setAppliedRule(pos, "LB2 - Break at end of text.");
            break;
        }


        // We do this one out-of-order because the adjustment does not change anything
        // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
        // be applied.
        // First call: extend prevChar over any combining marks that follow it (may move pos).
        // Second call: do the same for thisChar (may move nextPos).
        rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
        nextCPPos = nextPos = fText->moveIndex32(pos, 1);
        c = fText->char32At(nextPos);
        rule9Adjust(pos,     &thisChar, &nextPos, &c);

        // If the loop is still warming up - if we haven't shifted the initial
        //   -1 positions out of prevPos yet - loop back to advance the
        //    position in the input without any further looking for breaks.
        if (prevPos == -1) {
            setAppliedRule(pos, "LB 9 - adjust for combining sequences.");
            continue;
        }


        if (fBK->contains(prevChar)) {
            setAppliedRule(pos, "LB 4 Always break after hard line breaks");
            break;
        }


        // CR LF is kept together; any other CR, LF or NL (U+0085) is followed by a break.
        if (prevChar == 0x0d && thisChar == 0x0a) {
            setAppliedRule(pos, "LB 5 Break after CR, LF, NL, but not inside CR LF");
            continue;
        }
        if (prevChar == 0x0d ||
            prevChar == 0x0a ||
            prevChar == 0x85)  {
            setAppliedRule(pos, "LB 5 Break after CR, LF, NL, but not inside CR LF");
            break;
        }


        if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
            fBK->contains(thisChar)) {
            setAppliedRule(pos, "LB 6 Don't break before hard line breaks");
            continue;
        }


        if (fSP->contains(thisChar)) {
            setAppliedRule(pos, "LB 7 Don't break before spaces or zero-width space.");
            continue;
        }

        // !!! ??? Is this the right text for the applied rule?
        if (fZW->contains(thisChar)) {
            setAppliedRule(pos, "LB 7 Don't break before spaces or zero-width space.");
            continue;
        }


        //    ZW SP* ÷
        //       Scan backwards from prevChar for SP* ZW
        tPos = prevPos;
        while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
            tPos = fText->moveIndex32(tPos, -1);
        }
        if (fZW->contains(fText->char32At(tPos))) {
            setAppliedRule(pos, "LB 8 Break after zero width space");
            break;
        }


        // Move this test up, before LB8a, because numbers can match a longer sequence that would
        // also match 8a.  e.g. NU ZWJ IS PO     (ZWJ acts like CM)
        if (fNumberMatcher->lookingAt(prevPos, status)) {
            // A regex failure is reported as a break at this position.
            if (U_FAILURE(status)) {
                setAppliedRule(pos, "LB 25 Numbers");
                break;
            }
            // Matched a number.  But could have been just a single digit, which would
            //    not represent a "no break here" between prevChar and thisChar
            int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
            if (numEndIdx > pos) {
                // Number match includes at least our two chars being checked
                if (numEndIdx > nextPos) {
                    // Number match includes additional chars.  Update pos and nextPos
                    //   so that next loop iteration will continue at the end of the number,
                    //   checking for breaks between last char in number & whatever follows.
                    pos = nextPos = numEndIdx;
                    // Back up over trailing combining marks to the last base char of the number.
                    do {
                        pos = fText->moveIndex32(pos, -1);
                        thisChar = fText->char32At(pos);
                    } while (fCM->contains(thisChar));
                }
                setAppliedRule(pos, "LB 25 Numbers");
                continue;
            }
        }


        // The monkey test's way of ignoring combining characters doesn't work
        // for this rule. ZJ is also a CM. Need to get the actual character
        // preceding "thisChar", not ignoring combining marks, possibly ZJ.
        {
            int32_t prevIdx = fText->moveIndex32(pos, -1);
            UChar32 prevC = fText->char32At(prevIdx);
            if (fZWJ->contains(prevC)) {
                setAppliedRule(pos, "LB 8a ZWJ x");
                continue;
            }
        }


        // appliedRule: "LB 9, 10"; //  Already done, at top of loop.";
        //


        //    x  WJ
        //    WJ  x
        //
        if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
            setAppliedRule(pos, "LB 11 Do not break before or after WORD JOINER and related characters.");
            continue;
        }


        if (fGL->contains(prevChar)) {
            setAppliedRule(pos, "LB 12 GL x");
            continue;
        }


        if (!(fSP->contains(prevChar) ||
              fBA->contains(prevChar) ||
              fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
            setAppliedRule(pos, "LB 12a [^SP BA HY] x GL");
            continue;
        }


        if (fCL->contains(thisChar) ||
                fCP->contains(thisChar) ||
                fEX->contains(thisChar) ||
                fSY->contains(thisChar)) {
            setAppliedRule(pos, "LB 13 Don't break before closings.");
            continue;
        }


        //       Scan backwards, checking for this sequence.
        //       The OP char could include combining marks, so we actually check for
        //           OP CM* SP*
        //       Another Twist: The Rule 9 fixes may have changed a SP CM
        //       sequence into a ID char, so before scanning back through spaces,
        //       verify that prevChar is indeed a space.  The prevChar variable
        //       may differ from fText[prevPos]
        tPos = prevPos;
        if (fSP->contains(prevChar)) {
            while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
                tPos=fText->moveIndex32(tPos, -1);
            }
        }
        while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
            tPos=fText->moveIndex32(tPos, -1);
        }
        if (fOP->contains(fText->char32At(tPos))) {
            setAppliedRule(pos, "LB 14 Don't break after OP SP*");
            continue;
        }


        if (nextPos < fText->length()) {
            // note: UnicodeString::char32At(length) returns ffff, not distinguishable
            //       from a legit ffff character. So test length separately.
            UChar32 nextChar = fText->char32At(nextPos);
            if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) {
                setAppliedRule(pos, "LB 14a Break before an IS that begins a number and follows a space");
                break;
            }
        }


        if (fIS->contains(thisChar)) {
            setAppliedRule(pos, "LB 14b Do not break before numeric separators, even after spaces.");
            continue;
        }


        if (fOP->contains(thisChar)) {
            // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
            // NOTE(review): this local tPos shadows the function-level tPos declared above.
            int tPos = prevPos;
            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            if (fQU->contains(fText->char32At(tPos))) {
                setAppliedRule(pos, "LB 15 QU SP* x OP");
                continue;
            }
        }


        //    Scan backwards for SP* CM* (CL | CP)
        if (fNS->contains(thisChar)) {
            // NOTE(review): this local tPos shadows the function-level tPos declared above.
            int tPos = prevPos;
            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
                setAppliedRule(pos, "LB 16 (CL | CP) SP* x NS");
                continue;
            }
        }


        if (fB2->contains(thisChar)) {
            //  Scan backwards, checking for the B2 CM* SP* sequence.
            tPos = prevPos;
            if (fSP->contains(prevChar)) {
                while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
                    tPos=fText->moveIndex32(tPos, -1);
                }
            }
            while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
                tPos=fText->moveIndex32(tPos, -1);
            }
            if (fB2->contains(fText->char32At(tPos))) {
                setAppliedRule(pos, "LB 17 B2 SP* x B2");
                continue;
            }
        }


        if (fSP->contains(prevChar)) {
            setAppliedRule(pos, "LB 18 break after space");
            break;
        }

        //    x   QU
        //    QU  x
        if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
            setAppliedRule(pos, "LB 19");
            continue;
        }

        if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
            setAppliedRule(pos, "LB 20 Break around a CB");
            break;
        }

        // Don't break between Hyphens and letters if a break precedes the hyphen.
        // Formerly this was a Finnish tailoring.
        // Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
        //    ^($HY | $HH) $AL;
        // prevPosX2 == -1 means the hyphen is the first character examined by this call.
        if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
                prevPosX2 == -1) {
            setAppliedRule(pos, "LB 20.09");
            continue;
        }

        if (fBA->contains(thisChar) ||
            fHY->contains(thisChar) ||
            fNS->contains(thisChar) ||
            fBB->contains(prevChar) )   {
            setAppliedRule(pos, "LB 21");
            continue;
        }

        if (fHL->contains(prevCharX2) &&
                (fHY->contains(prevChar) || fBA->contains(prevChar))) {
            setAppliedRule(pos, "LB 21a HL (HY | BA) x");
            continue;
        }

        if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
            setAppliedRule(pos, "LB 21b SY x HL");
            continue;
        }

        if (fIN->contains(thisChar)) {
            setAppliedRule(pos, "LB 22");
            continue;
        }


        //          (AL | HL) x NU
        //          NU x (AL | HL)
        if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
            setAppliedRule(pos, "LB 23");
            continue;
        }
        if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
            setAppliedRule(pos, "LB 23");
            continue;
        }

        // Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
        //      PR x (ID | EB | EM)
        //     (ID | EB | EM) x PO
        if (fPR->contains(prevChar) &&
                (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar)))  {
            setAppliedRule(pos, "LB 23a");
            continue;
        }
        if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
                fPO->contains(thisChar)) {
            setAppliedRule(pos, "LB 23a");
            continue;
        }

        // Do not break between prefix and letters or ideographs.
        //         (PR | PO) x (AL | HL)
        //         (AL | HL) x (PR | PO)
        if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
                (fAL->contains(thisChar) || fHL->contains(thisChar))) {
            setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
            continue;
        }
        if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
                (fPR->contains(thisChar) || fPO->contains(thisChar))) {
            setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
            continue;
        }

        // appliedRule: "LB 25 numbers match"; // moved up, before LB 8a,

        if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
                                        fJV->contains(thisChar) ||
                                        fH2->contains(thisChar) ||
                                        fH3->contains(thisChar))) {
            setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
            continue;
        }

        if ((fJV->contains(prevChar) || fH2->contains(prevChar)) &&
            (fJV->contains(thisChar) || fJT->contains(thisChar))) {
            setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
            continue;
        }

        if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
            fJT->contains(thisChar)) {
            setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
            continue;
        }

        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
            fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
            fPO->contains(thisChar)) {
            setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
            continue;
        }
        if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
            fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
            setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
            continue;
        }



        if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
            setAppliedRule(pos, "LB 28 Do not break between alphabetics (\"at\").");
            continue;
        }

        if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
            setAppliedRule(pos, "LB 29 Do not break between numeric punctuation and alphabetics (\"e.g.\").");
            continue;
        }

        //          (AL | NU) x OP
        //          CP x (AL | NU)
        // fOP30 / fCP30 are OP / CP with the East Asian Width F, W and H characters removed.
        if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP30->contains(thisChar)) {
            setAppliedRule(pos, "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
            continue;
        }
        if (fCP30->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
            setAppliedRule(pos, "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
            continue;
        }

        //             RI  x  RI
        // Three RI in a row: the first two already form a pair, so break before the third.
        if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
            setAppliedRule(pos, "LB30a RI RI ÷ RI");
            break;
        }
        if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
            // Two Regional Indicators have been paired.
            // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
            // following RI. This is a hack.
            thisChar = -1;
            // NOTE(review): this branch does NOT break (RI x RI), yet it records the same
            // rule text as the breaking branch above - confirm whether that is intended.
            setAppliedRule(pos, "LB30a RI RI ÷ RI");
            continue;
        }

        // LB30b Do not break between an emoji base (or potential emoji) and an emoji modifier.
        if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
            setAppliedRule(pos, "LB30b Emoji Base x Emoji Modifier");
            continue;
        }

        if (fExtPictUnassigned->contains(prevChar) && fEM->contains(thisChar)) {
            setAppliedRule(pos, "LB30b [\\p{Extended_Pictographic}&\\p{Cn}] × EM");
            continue;
        }

        // No rule forbade a break here.
        setAppliedRule(pos, "LB 31 Break everywhere else");
        break;
    }

    return pos;
}
3339
3340
charClasses()3341 UVector *RBBILineMonkey::charClasses() {
3342 return fSets;
3343 }
3344
3345
~RBBILineMonkey()3346 RBBILineMonkey::~RBBILineMonkey() {
3347 delete fSets;
3348
3349 delete fBK;
3350 delete fCR;
3351 delete fLF;
3352 delete fCM;
3353 delete fNL;
3354 delete fWJ;
3355 delete fZW;
3356 delete fGL;
3357 delete fCB;
3358 delete fSP;
3359 delete fB2;
3360 delete fBA;
3361 delete fBB;
3362 delete fHH;
3363 delete fHY;
3364 delete fH2;
3365 delete fH3;
3366 delete fCL;
3367 delete fCP;
3368 delete fEX;
3369 delete fIN;
3370 delete fJL;
3371 delete fJV;
3372 delete fJT;
3373 delete fNS;
3374 delete fOP;
3375 delete fQU;
3376 delete fIS;
3377 delete fNU;
3378 delete fPO;
3379 delete fPR;
3380 delete fSY;
3381 delete fAI;
3382 delete fAL;
3383 delete fCJ;
3384 delete fHL;
3385 delete fID;
3386 delete fRI;
3387 delete fSG;
3388 delete fXX;
3389 delete fEB;
3390 delete fEM;
3391 delete fZWJ;
3392 delete fOP30;
3393 delete fCP30;
3394 delete fExtPictUnassigned;
3395
3396 delete fCharBI;
3397 delete fNumberMatcher;
3398 }
3399
3400
3401 //-------------------------------------------------------------------------------------------
3402 //
3403 // TestMonkey
3404 //
3405 // params
3406 // seed=nnnnn Random number starting seed.
3407 // Setting the seed allows errors to be reproduced.
3408 // loop=nnn Looping count. Controls running time.
3409 // -1: run forever.
3410 // 0 or greater: run length.
3411 //
3412 // type = char | word | line | sent | title
3413 //
3414 // Example:
3415 // intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3416 //
3417 //-------------------------------------------------------------------------------------------
3418
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)3419 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) {
3420 int32_t val = defaultVal;
3421 name.append(" *= *(-?\\d+)");
3422 UErrorCode status = U_ZERO_ERROR;
3423 RegexMatcher m(name, params, 0, status);
3424 if (m.find()) {
3425 // The param exists. Convert the string to an int.
3426 char valString[100];
3427 int32_t paramLength = m.end(1, status) - m.start(1, status);
3428 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3429 paramLength = (int32_t)(sizeof(valString)-2);
3430 }
3431 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3432 val = strtol(valString, nullptr, 10);
3433
3434 // Delete this parameter from the params string.
3435 m.reset();
3436 params = m.replaceFirst("", status);
3437 }
3438 U_ASSERT(U_SUCCESS(status));
3439 return val;
3440 }
3441 #endif
3442
3443 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
testBreakBoundPreceding(RBBITest * test,UnicodeString ustr,BreakIterator * bi,int expected[],int expectedcount)3444 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3445 BreakIterator *bi,
3446 int expected[],
3447 int expectedcount)
3448 {
3449 int count = 0;
3450 int i = 0;
3451 int forward[50];
3452 bi->setText(ustr);
3453 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3454 forward[count] = i;
3455 if (count < expectedcount && expected[count] != i) {
3456 test->errln("%s:%d break forward test failed: expected %d but got %d",
3457 __FILE__, __LINE__, expected[count], i);
3458 break;
3459 }
3460 count ++;
3461 }
3462 if (count != expectedcount) {
3463 printStringBreaks(ustr, expected, expectedcount);
3464 test->errln("%s:%d break forward test failed: missed %d match",
3465 __FILE__, __LINE__, expectedcount - count);
3466 return;
3467 }
3468 // testing boundaries
3469 for (i = 1; i < expectedcount; i ++) {
3470 int j = expected[i - 1];
3471 if (!bi->isBoundary(j)) {
3472 printStringBreaks(ustr, expected, expectedcount);
3473 test->errln("%s:%d isBoundary() failed. Expected boundary at position %d",
3474 __FILE__, __LINE__, j);
3475 return;
3476 }
3477 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3478 if (bi->isBoundary(j)) {
3479 printStringBreaks(ustr, expected, expectedcount);
3480 test->errln("%s:%d isBoundary() failed. Not expecting boundary at position %d",
3481 __FILE__, __LINE__, j);
3482 return;
3483 }
3484 }
3485 }
3486
3487 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3488 count --;
3489 if (forward[count] != i) {
3490 printStringBreaks(ustr, expected, expectedcount);
3491 test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
3492 __FILE__, __LINE__, forward[count], i);
3493 break;
3494 }
3495 }
3496 if (count != 0) {
3497 printStringBreaks(ustr, expected, expectedcount);
3498 test->errln("break test previous() failed: missed a match");
3499 return;
3500 }
3501
3502 // testing preceding
3503 for (i = 0; i < expectedcount - 1; i ++) {
3504 // int j = expected[i] + 1;
3505 int j = ustr.moveIndex32(expected[i], 1);
3506 for (; j <= expected[i + 1]; j ++) {
3507 int32_t expectedPreceding = expected[i];
3508 int32_t actualPreceding = bi->preceding(j);
3509 if (actualPreceding != expectedPreceding) {
3510 printStringBreaks(ustr, expected, expectedcount);
3511 test->errln("%s:%d preceding(%d): expected %d, got %d",
3512 __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
3513 return;
3514 }
3515 }
3516 }
3517 }
3518 #endif
3519
TestWordBreaks()3520 void RBBITest::TestWordBreaks()
3521 {
3522 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3523
3524 Locale locale("en");
3525 UErrorCode status = U_ZERO_ERROR;
3526 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3527 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3528 // Replaced any C+J characters in a row with a random sequence of characters
3529 // of the same length to make our C+J segmentation not get in the way.
3530 static const char *strlist[] =
3531 {
3532 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3533 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3534 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3535 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3536 "\\uac00\\u3588\\u009c\\u0953\\u194b",
3537 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3538 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3539 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3540 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3541 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3542 "\\u2027\\U000e0067\\u0a47\\u00b7",
3543 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3544 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3545 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3546 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3547 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3548 "\\u0027\\u11af\\U000e0057\\u0602",
3549 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3550 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3551 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3552 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3553 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3554 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3555 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3556 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3557 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3558 "\\u18f4\\U000e0049\\u20e7\\u2027",
3559 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3560 "\\ua183\\u102d\\u0bec\\u003a",
3561 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3562 "\\u003a\\u0e57\\u0fad\\u002e",
3563 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3564 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3565 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3566 "\\u003a\\u0664\\u00b7\\u1fba",
3567 "\\u003b\\u0027\\u00b7\\u47a3",
3568 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3569 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3570 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3571 };
3572 int loop;
3573 if (U_FAILURE(status)) {
3574 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3575 return;
3576 }
3577 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3578 // printf("looping %d\n", loop);
3579 UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3580 // RBBICharMonkey monkey;
3581 RBBIWordMonkey monkey;
3582
3583 int expected[50];
3584 int expectedcount = 0;
3585
3586 monkey.setText(ustr);
3587 int i;
3588 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3589 expected[expectedcount ++] = i;
3590 }
3591
3592 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3593 }
3594 delete bi;
3595 #endif
3596 }
3597
TestWordBoundary()3598 void RBBITest::TestWordBoundary()
3599 {
3600 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3601 Locale locale("en");
3602 UErrorCode status = U_ZERO_ERROR;
3603 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3604 LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
3605 if (U_FAILURE(status)) {
3606 errcheckln(status, "%s:%d Creation of break iterator failed %s",
3607 __FILE__, __LINE__, u_errorName(status));
3608 return;
3609 }
3610 char16_t str[50];
3611 static const char *strlist[] =
3612 {
3613 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3614 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3615 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3616 "\\u2027\\U000e0067\\u0a47\\u00b7",
3617 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3618 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3619 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3620 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3621 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3622 "\\u0027\\u11af\\U000e0057\\u0602",
3623 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3624 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3625 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3626 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3627 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3628 "\\U000e0065\\u302c\\u09ee\\U000e0068",
3629 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3630 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3631 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3632 "\\u58f4\\U000e0049\\u20e7\\u2027",
3633 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3634 "\\ua183\\u102d\\u0bec\\u003a",
3635 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3636 "\\u003a\\u0e57\\u0fad\\u002e",
3637 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3638 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3639 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3640 "\\u003a\\u0664\\u00b7\\u1fba",
3641 "\\u003b\\u0027\\u00b7\\u47a3",
3642 };
3643 int loop;
3644 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3645 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3646 UnicodeString ustr(str);
3647 int forward[50];
3648 int count = 0;
3649
3650 bi->setText(ustr);
3651 int prev = -1;
3652 for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
3653 ++count;
3654 if (count >= UPRV_LENGTHOF(forward)) {
3655 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
3656 __FILE__, __LINE__, loop, count, boundary);
3657 return;
3658 }
3659 forward[count] = boundary;
3660 if (boundary <= prev) {
3661 errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
3662 __FILE__, __LINE__, loop, prev, boundary);
3663 break;
3664 }
3665 for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
3666 if (bi->isBoundary(nonBoundary)) {
3667 printStringBreaks(ustr, forward, count);
3668 errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
3669 __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
3670 return;
3671 }
3672 }
3673 if (!bi->isBoundary(boundary)) {
3674 printStringBreaks(ustr, forward, count);
3675 errln("%s:%d happy boundary test failed: expected %d a boundary",
3676 __FILE__, __LINE__, boundary);
3677 return;
3678 }
3679 prev = boundary;
3680 }
3681 }
3682 }
3683
TestLineBreaks()3684 void RBBITest::TestLineBreaks()
3685 {
3686 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3687 Locale locale("en");
3688 UErrorCode status = U_ZERO_ERROR;
3689 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3690 const int32_t STRSIZE = 50;
3691 char16_t str[STRSIZE];
3692 static const char *strlist[] =
3693 {
3694 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3695 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3696 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3697 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3698 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3699 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3700 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3701 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3702 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3703 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3704 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3705 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3706 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3707 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3708 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3709 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3710 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3711 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3712 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3713 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3714 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3715 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3716 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3717 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3718 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3719 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3720 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3721 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3722 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3723 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3724 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3725 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3726 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3727 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3728 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3729 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3730 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3731 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3732 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3733 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3734 };
3735 int loop;
3736 TEST_ASSERT_SUCCESS(status);
3737 if (U_FAILURE(status)) {
3738 return;
3739 }
3740 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3741 // printf("looping %d\n", loop);
3742 int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3743 if (t >= STRSIZE) {
3744 TEST_ASSERT(false);
3745 continue;
3746 }
3747
3748
3749 UnicodeString ustr(str);
3750 RBBILineMonkey monkey;
3751 if (U_FAILURE(monkey.deferredStatus)) {
3752 continue;
3753 }
3754
3755 const int EXPECTEDSIZE = 50;
3756 int expected[EXPECTEDSIZE];
3757 int expectedcount = 0;
3758
3759 monkey.setText(ustr);
3760
3761 int i;
3762 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3763 if (expectedcount >= EXPECTEDSIZE) {
3764 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3765 return;
3766 }
3767 expected[expectedcount ++] = i;
3768 }
3769
3770 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3771 }
3772 delete bi;
3773 #endif
3774 }
3775
TestSentBreaks()3776 void RBBITest::TestSentBreaks()
3777 {
3778 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3779 Locale locale("en");
3780 UErrorCode status = U_ZERO_ERROR;
3781 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3782 char16_t str[200];
3783 static const char *strlist[] =
3784 {
3785 "Now\ris\nthe\r\ntime\n\rfor\r\r",
3786 "This\n",
3787 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3788 "\"Sentence ending with a quote.\" Bye.",
3789 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
3790 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3791 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3792 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3793 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3794 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3795 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3796 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3797 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3798 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3799 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3800 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3801 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3802 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3803 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3804 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3805 };
3806 int loop;
3807 if (U_FAILURE(status)) {
3808 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3809 return;
3810 }
3811 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3812 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3813 UnicodeString ustr(str);
3814
3815 RBBISentMonkey monkey;
3816 if (U_FAILURE(monkey.deferredStatus)) {
3817 continue;
3818 }
3819
3820 const int EXPECTEDSIZE = 50;
3821 int expected[EXPECTEDSIZE];
3822 int expectedcount = 0;
3823
3824 monkey.setText(ustr);
3825
3826 int i;
3827 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3828 if (expectedcount >= EXPECTEDSIZE) {
3829 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3830 return;
3831 }
3832 expected[expectedcount ++] = i;
3833 }
3834
3835 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3836 }
3837 delete bi;
3838 #endif
3839 }
3840
TestMonkey()3841 void RBBITest::TestMonkey() {
3842 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3843
3844 UErrorCode status = U_ZERO_ERROR;
3845 int32_t loopCount = 500;
3846 int32_t seed = 1;
3847 UnicodeString breakType = "all";
3848 Locale locale("en");
3849 UBool useUText = false;
3850
3851 if (quick == false) {
3852 loopCount = 10000;
3853 }
3854
3855 if (fTestParams) {
3856 UnicodeString p(fTestParams);
3857 loopCount = getIntParam("loop", p, loopCount);
3858 seed = getIntParam("seed", p, seed);
3859
3860 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3861 if (m.find()) {
3862 breakType = m.group(1, status);
3863 m.reset();
3864 p = m.replaceFirst("", status);
3865 }
3866
3867 RegexMatcher u(" *utext", p, 0, status);
3868 if (u.find()) {
3869 useUText = true;
3870 u.reset();
3871 p = u.replaceFirst("", status);
3872 }
3873
3874
3875 // m.reset(p);
3876 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3877 // Each option is stripped out of the option string as it is processed.
3878 // All options have been checked. The option string should have been completely emptied..
3879 char buf[100];
3880 p.extract(buf, sizeof(buf), nullptr, status);
3881 buf[sizeof(buf)-1] = 0;
3882 errln("Unrecognized or extra parameter: %s\n", buf);
3883 return;
3884 }
3885
3886 }
3887
3888 if (breakType == "char" || breakType == "all") {
3889 RBBICharMonkey m;
3890 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3891 if (U_SUCCESS(status)) {
3892 RunMonkey(bi, m, "char", seed, loopCount, useUText);
3893 if (breakType == "all" && useUText==false) {
3894 // Also run a quick test with UText when "all" is specified
3895 RunMonkey(bi, m, "char", seed, loopCount, true);
3896 }
3897 }
3898 else {
3899 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3900 }
3901 delete bi;
3902 }
3903
3904 if (breakType == "word" || breakType == "all") {
3905 logln("Word Break Monkey Test");
3906 RBBIWordMonkey m;
3907 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3908 if (U_SUCCESS(status)) {
3909 RunMonkey(bi, m, "word", seed, loopCount, useUText);
3910 }
3911 else {
3912 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3913 }
3914 delete bi;
3915 }
3916
3917 if (breakType == "line" || breakType == "all") {
3918 logln("Line Break Monkey Test");
3919 RBBILineMonkey m;
3920 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3921 if (loopCount >= 10) {
3922 loopCount = loopCount / 5; // Line break runs slower than the others.
3923 }
3924 if (U_SUCCESS(status)) {
3925 RunMonkey(bi, m, "line", seed, loopCount, useUText);
3926 }
3927 else {
3928 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3929 }
3930 delete bi;
3931 }
3932
3933 if (breakType == "sent" || breakType == "all" ) {
3934 logln("Sentence Break Monkey Test");
3935 RBBISentMonkey m;
3936 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3937 if (loopCount >= 10) {
3938 loopCount = loopCount / 10; // Sentence runs slower than the other break types
3939 }
3940 if (U_SUCCESS(status)) {
3941 RunMonkey(bi, m, "sent", seed, loopCount, useUText);
3942 }
3943 else {
3944 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3945 }
3946 delete bi;
3947 }
3948
3949 #endif
3950 }
3951
3952 //
3953 // Run a RBBI monkey test. Common routine, for all break iterator types.
3954 // Parameters:
3955 // bi - the break iterator to use
3956 // mk - MonkeyKind, abstraction for obtaining expected results
3957 // name - Name of test (char, word, etc.) for use in error messages
3958 // seed - Seed for starting random number generator (parameter from user)
3959 // numIterations
3960 //
RunMonkey(BreakIterator * bi,RBBIMonkeyKind & mk,const char * name,uint32_t seed,int32_t numIterations,UBool useUText)3961 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
3962 int32_t numIterations, UBool useUText) {
3963
3964 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3965
3966 const int32_t TESTSTRINGLEN = 500;
3967 UnicodeString testText;
3968 int32_t numCharClasses;
3969 UVector *chClasses;
3970 int expectedCount = 0;
3971 char expectedBreaks[TESTSTRINGLEN*2 + 1];
3972 char forwardBreaks[TESTSTRINGLEN*2 + 1];
3973 char reverseBreaks[TESTSTRINGLEN*2+1];
3974 char isBoundaryBreaks[TESTSTRINGLEN*2+1];
3975 char followingBreaks[TESTSTRINGLEN*2+1];
3976 char precedingBreaks[TESTSTRINGLEN*2+1];
3977 int i;
3978 int loopCount = 0;
3979
3980
3981 m_seed = seed;
3982
3983 numCharClasses = mk.charClasses()->size();
3984 chClasses = mk.charClasses();
3985
3986 // Check for errors that occurred during the construction of the MonkeyKind object.
3987 // Can't report them where they occurred because errln() is a method coming from intlTest,
3988 // and is not visible outside of RBBITest :-(
3989 if (U_FAILURE(mk.deferredStatus)) {
3990 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
3991 return;
3992 }
3993
3994 // Verify that the character classes all have at least one member.
3995 for (i=0; i<numCharClasses; i++) {
3996 UnicodeSet *s = static_cast<UnicodeSet *>(chClasses->elementAt(i));
3997 if (s == nullptr || s->size() == 0) {
3998 errln("Character Class #%d is null or of zero size.", i);
3999 return;
4000 }
4001 }
4002
4003 // For minimizing width of class name output.
4004 int classNameSize = mk.maxClassNameSize();
4005
4006 while (loopCount < numIterations || numIterations == -1) {
4007 if (numIterations == -1 && loopCount % 10 == 0) {
4008 // If test is running in an infinite loop, display a periodic tic so
4009 // we can tell that it is making progress.
4010 fprintf(stderr, ".");
4011 }
4012 // Save current random number seed, so that we can recreate the random numbers
4013 // for this loop iteration in event of an error.
4014 seed = m_seed;
4015
4016 // Populate a test string with data.
4017 testText.truncate(0);
4018 for (i=0; i<TESTSTRINGLEN; i++) {
4019 int32_t aClassNum = m_rand() % numCharClasses;
4020 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4021 int32_t charIdx = m_rand() % classSet->size();
4022 UChar32 c = classSet->charAt(charIdx);
4023 if (c < 0) { // TODO: deal with sets containing strings.
4024 errln("%s:%d c < 0", __FILE__, __LINE__);
4025 break;
4026 }
4027 // Do not assemble a supplementary character from randomly generated separate surrogates.
4028 // (It could be a dictionary character)
4029 if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4030 continue;
4031 }
4032
4033 testText.append(c);
4034 }
4035
4036 // Calculate the expected results for this test string and reset applied rules.
4037 mk.setText(testText);
4038
4039 memset(expectedBreaks, 0, sizeof(expectedBreaks));
4040 expectedBreaks[0] = 1;
4041 int32_t breakPos = 0;
4042 expectedCount = 0;
4043 for (;;) {
4044 breakPos = mk.next(breakPos);
4045 if (breakPos == -1) {
4046 break;
4047 }
4048 if (breakPos > testText.length()) {
4049 errln("breakPos > testText.length()");
4050 }
4051 expectedBreaks[breakPos] = 1;
4052 expectedCount++;
4053 U_ASSERT(expectedCount<testText.length());
4054 }
4055
4056 // Find the break positions using forward iteration
4057 memset(forwardBreaks, 0, sizeof(forwardBreaks));
4058 if (useUText) {
4059 UErrorCode status = U_ZERO_ERROR;
4060 UText *testUText = utext_openReplaceable(nullptr, &testText, &status);
4061 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4062 bi->setText(testUText, status);
4063 TEST_ASSERT_SUCCESS(status);
4064 utext_close(testUText); // The break iterator does a shallow clone of the UText
4065 // This UText can be closed immediately, so long as the
4066 // testText string continues to exist.
4067 } else {
4068 bi->setText(testText);
4069 }
4070
4071 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4072 if (i < 0 || i > testText.length()) {
4073 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4074 break;
4075 }
4076 forwardBreaks[i] = 1;
4077 }
4078
4079 // Find the break positions using reverse iteration
4080 memset(reverseBreaks, 0, sizeof(reverseBreaks));
4081 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4082 if (i < 0 || i > testText.length()) {
4083 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4084 break;
4085 }
4086 reverseBreaks[i] = 1;
4087 }
4088
4089 // Find the break positions using isBoundary() tests.
4090 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4091 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4092 for (i=0; i<=testText.length(); i++) {
4093 isBoundaryBreaks[i] = bi->isBoundary(i);
4094 }
4095
4096
4097 // Find the break positions using the following() function.
4098 // printf(".");
4099 memset(followingBreaks, 0, sizeof(followingBreaks));
4100 int32_t lastBreakPos = 0;
4101 followingBreaks[0] = 1;
4102 for (i=0; i<testText.length(); i++) {
4103 breakPos = bi->following(i);
4104 if (breakPos <= i ||
4105 breakPos < lastBreakPos ||
4106 breakPos > testText.length() ||
4107 (breakPos > lastBreakPos && lastBreakPos > i)) {
4108 errln("%s break monkey test: "
4109 "Out of range value returned by BreakIterator::following().\n"
4110 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4111 name, seed, i, breakPos, lastBreakPos);
4112 break;
4113 }
4114 followingBreaks[breakPos] = 1;
4115 lastBreakPos = breakPos;
4116 }
4117
4118 // Find the break positions using the preceding() function.
4119 memset(precedingBreaks, 0, sizeof(precedingBreaks));
4120 lastBreakPos = testText.length();
4121 precedingBreaks[testText.length()] = 1;
4122 for (i=testText.length(); i>0; i--) {
4123 breakPos = bi->preceding(i);
4124 if (breakPos >= i ||
4125 breakPos > lastBreakPos ||
4126 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4127 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4128 errln("%s break monkey test: "
4129 "Out of range value returned by BreakIterator::preceding().\n"
4130 "index=%d; prev returned %d; lastBreak=%d" ,
4131 name, i, breakPos, lastBreakPos);
4132 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4133 precedingBreaks[i] = 2; // Forces an error.
4134 }
4135 } else {
4136 if (breakPos >= 0) {
4137 precedingBreaks[breakPos] = 1;
4138 }
4139 lastBreakPos = breakPos;
4140 }
4141 }
4142
4143 // Compare the expected and actual results.
4144 for (i=0; i<=testText.length(); i++) {
4145 const char *errorType = nullptr;
4146 const char* currentBreakData = nullptr;
4147 if (forwardBreaks[i] != expectedBreaks[i]) {
4148 errorType = "next()";
4149 currentBreakData = forwardBreaks;
4150 } else if (reverseBreaks[i] != forwardBreaks[i]) {
4151 errorType = "previous()";
4152 currentBreakData = reverseBreaks;
4153 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4154 errorType = "isBoundary()";
4155 currentBreakData = isBoundaryBreaks;
4156 } else if (followingBreaks[i] != expectedBreaks[i]) {
4157 errorType = "following()";
4158 currentBreakData = followingBreaks;
4159 } else if (precedingBreaks[i] != expectedBreaks[i]) {
4160 errorType = "preceding()";
4161 currentBreakData = precedingBreaks;
4162 }
4163
4164 if (errorType != nullptr) {
4165 // Format a range of the test text that includes the failure as
4166 // a data item that can be included in the rbbi test data file.
4167
4168 // Start of the range is the last point where expected and actual results
4169 // both agreed that there was a break position.
4170
4171 int startContext = i;
4172 int32_t count = 0;
4173 for (;;) {
4174 if (startContext==0) { break; }
4175 startContext --;
4176 if (expectedBreaks[startContext] != 0) {
4177 if (count == 2) break;
4178 count ++;
4179 }
4180 }
4181
4182 // End of range is two expected breaks past the start position.
4183 int endContext = i + 1;
4184 int ci;
4185 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
4186 for (;;) {
4187 if (endContext >= testText.length()) {break;}
4188 if (expectedBreaks[endContext-1] != 0) {
4189 if (count == 0) break;
4190 count --;
4191 }
4192 endContext ++;
4193 }
4194 }
4195
4196 // Formatting of each line includes:
4197 // character code
4198 // reference break: '|' -> a break, '.' -> no break
4199 // actual break: '|' -> a break, '.' -> no break
4200 // (name of character clase)
4201 // Unicode name of character
4202 // '-->' indicates location of the difference.
4203
4204 MONKEY_ERROR(
4205 (expectedBreaks[i] ? "Break expected but not found" :
4206 "Break found but not expected"),
4207 name, i, seed);
4208
4209 for (ci=startContext; (ci = testText.moveIndex32(ci, 1));) {
4210 UChar32 c;
4211 c = testText.char32At(ci);
4212
4213 std::string currentLineFlag = " ";
4214 if (ci == i) {
4215 currentLineFlag = "-->"; // Error position
4216 }
4217
4218 // BMP or SMP character in hex
4219 char hexCodePoint[12];
4220 std::string format = " \\u%04x";
4221 if (c >= 0x10000) {
4222 format = "\\U%08x";
4223 }
4224 snprintf(hexCodePoint, sizeof(hexCodePoint), format.c_str(), c);
4225
4226 // Get the class name and character name for the character.
4227 char cName[200];
4228 UErrorCode status = U_ZERO_ERROR;
4229 u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
4230
4231 char buffer[200];
4232 auto ret = snprintf(buffer, sizeof(buffer),
4233 "%4s %3i : %1s %1s %10s %-*s %-40s %-40s",
4234 currentLineFlag.c_str(),
4235 ci,
4236 expectedBreaks[ci] == 0 ? "." : "|", // Reference break
4237 currentBreakData[ci] == 0 ? "." : "|", // Actual break
4238 hexCodePoint,
4239 classNameSize,
4240 mk.classNameFromCodepoint(c).c_str(),
4241 mk.getAppliedRule(ci).c_str(), cName);
4242 (void)ret;
4243 U_ASSERT(0 <= ret && ret < UPRV_LENGTHOF(buffer));
4244
4245 // Output the error
4246 if (ci == i) {
4247 errln(buffer);
4248 } else {
4249 infoln(buffer);
4250 }
4251
4252 if (ci >= endContext) { break; }
4253 }
4254 break;
4255 }
4256 }
4257
4258 loopCount++;
4259 }
4260 #endif
4261 }
4262
4263
4264 // Bug 5532. UTF-8 based UText fails in dictionary code.
4265 // This test checks the initial patch,
4266 // which is to just keep it from crashing. Correct word boundaries
4267 // await a proper fix to the dictionary code.
4268 //
TestBug5532()4269 void RBBITest::TestBug5532() {
4270 // Text includes a mixture of Thai and Latin.
4271 const unsigned char utf8Data[] = {
4272 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4273 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4274 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4275 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4276 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4277 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4278 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4279 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4280 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4281 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4282 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4283
4284 UErrorCode status = U_ZERO_ERROR;
4285 UText utext=UTEXT_INITIALIZER;
4286 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4287 TEST_ASSERT_SUCCESS(status);
4288
4289 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4290 TEST_ASSERT_SUCCESS(status);
4291 if (U_SUCCESS(status)) {
4292 bi->setText(&utext, status);
4293 TEST_ASSERT_SUCCESS(status);
4294
4295 int32_t breakCount = 0;
4296 int32_t previousBreak = -1;
4297 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4298 // For now, just make sure that the break iterator doesn't hang.
4299 TEST_ASSERT(previousBreak < bi->current());
4300 previousBreak = bi->current();
4301 }
4302 TEST_ASSERT(breakCount > 0);
4303 }
4304 delete bi;
4305 utext_close(&utext);
4306 }
4307
4308
TestBug9983()4309 void RBBITest::TestBug9983() {
4310 UnicodeString text = UnicodeString("\\u002A" // * Other
4311 "\\uFF65" // Other
4312 "\\u309C" // Katakana
4313 "\\uFF9F" // Extend
4314 "\\uFF65" // Other
4315 "\\u0020" // Other
4316 "\\u0000").unescape();
4317
4318 UErrorCode status = U_ZERO_ERROR;
4319 LocalPointer<RuleBasedBreakIterator> brkiter(dynamic_cast<RuleBasedBreakIterator *>(
4320 BreakIterator::createWordInstance(Locale::getRoot(), status)));
4321 TEST_ASSERT_SUCCESS(status);
4322 LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(dynamic_cast<RuleBasedBreakIterator *>(
4323 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4324 TEST_ASSERT_SUCCESS(status);
4325 if (U_FAILURE(status)) {
4326 return;
4327 }
4328 int32_t offset, rstatus, iterationCount;
4329
4330 brkiter->setText(text);
4331 brkiter->last();
4332 iterationCount = 0;
4333 while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4334 iterationCount++;
4335 rstatus = brkiter->getRuleStatus();
4336 (void)rstatus; // Suppress set but not used warning.
4337 if (iterationCount >= 10) {
4338 break;
4339 }
4340 }
4341 TEST_ASSERT(iterationCount == 6);
4342
4343 brkiterPOSIX->setText(text);
4344 brkiterPOSIX->last();
4345 iterationCount = 0;
4346 while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4347 iterationCount++;
4348 rstatus = brkiterPOSIX->getRuleStatus();
4349 (void)rstatus; // Suppress set but not used warning.
4350 if (iterationCount >= 10) {
4351 break;
4352 }
4353 }
4354 TEST_ASSERT(iterationCount == 6);
4355 }
4356
4357 // Bug 7547 - verify that building a break itereator from empty rules produces an error.
4358 //
TestBug7547()4359 void RBBITest::TestBug7547() {
4360 UnicodeString rules;
4361 UErrorCode status = U_ZERO_ERROR;
4362 UParseError parseError;
4363 RuleBasedBreakIterator breakIterator(rules, parseError, status);
4364 if (status != U_BRK_RULE_SYNTAX) {
4365 errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4366 }
4367 if (parseError.line != 1 || parseError.offset != 0) {
4368 errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4369 }
4370 }
4371
4372
TestBug12797()4373 void RBBITest::TestBug12797() {
4374 UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4375 UErrorCode status = U_ZERO_ERROR;
4376 UParseError parseError;
4377 RuleBasedBreakIterator bi(rules, parseError, status);
4378 if (U_FAILURE(status)) {
4379 errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4380 return;
4381 }
4382 UnicodeString text = "abc";
4383 bi.setText(text);
4384 bi.first();
4385 int32_t boundary = bi.next();
4386 if (boundary != 3) {
4387 errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4388 }
4389 }
4390
TestBug12918()4391 void RBBITest::TestBug12918() {
4392 // This test triggers an assertion failure in dictbe.cpp
4393 const char16_t *crasherString = u"\u3325\u4a16";
4394 UErrorCode status = U_ZERO_ERROR;
4395 UBreakIterator* iter = ubrk_open(UBRK_WORD, nullptr, crasherString, -1, &status);
4396 if (U_FAILURE(status)) {
4397 dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4398 return;
4399 }
4400 ubrk_first(iter);
4401 int32_t pos = 0;
4402 int32_t lastPos = -1;
4403 while((pos = ubrk_next(iter)) != UBRK_DONE) {
4404 if (pos <= lastPos) {
4405 errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4406 break;
4407 }
4408 }
4409 ubrk_close(iter);
4410 }
4411
TestBug12932()4412 void RBBITest::TestBug12932() {
4413 // Node Stack overflow in the RBBI rule parser caused a seg fault.
4414 UnicodeString ruleStr(
4415 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4416 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4417 "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4418 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4419 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4420 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4421
4422 UErrorCode status = U_ZERO_ERROR;
4423 UParseError parseError;
4424 RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
4425 if (status != U_BRK_RULE_SYNTAX) {
4426 errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4427 __FILE__, __LINE__, u_errorName(status));
4428 }
4429 }
4430
4431
// Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
//             remain undivided by ICU char, word and line break.
TestEmoji()4434 void RBBITest::TestEmoji() {
4435 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4436 UErrorCode status = U_ZERO_ERROR;
4437
4438 CharString testFileName;
4439 testFileName.append(IntlTest::getSourceTestData(status), status);
4440 testFileName.appendPathPart("emoji-test.txt", status);
4441 if (U_FAILURE(status)) {
4442 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4443 return;
4444 }
4445 logln("Opening data file %s\n", testFileName.data());
4446
4447 int len;
4448 char16_t *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
4449 if (U_FAILURE(status) || testFile == nullptr) {
4450 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4451 return;
4452 }
4453 UnicodeString testFileAsString(testFile, len);
4454 delete [] testFile;
4455
4456 RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
4457 RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
4458 // hexMatcher group(1) is a hex number, or empty string if no hex number present.
4459 int32_t lineNumber = 0;
4460
4461 LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4462 LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4463 LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
4464 if (U_FAILURE(status)) {
4465 dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
4466 return;
4467 }
4468
4469 while (lineMatcher.find()) {
4470 ++lineNumber;
4471 UnicodeString line = lineMatcher.group(status);
4472 hexMatcher.reset(line);
4473 UnicodeString testString; // accumulates the emoji sequence.
4474 while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
4475 UnicodeString hex = hexMatcher.group(1, status);
4476 if (hex.length() > 8) {
4477 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
4478 break;
4479 }
4480 CharString hex8;
4481 hex8.appendInvariantChars(hex, status);
4482 UChar32 c = (UChar32)strtol(hex8.data(), nullptr, 16);
4483 if (c<=0x10ffff) {
4484 testString.append(c);
4485 } else {
4486 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4487 __FILE__, __LINE__, lineNumber, hex8.data());
4488 break;
4489 }
4490 }
4491
4492 if (testString.length() > 1) {
4493 charBreaks->setText(testString);
4494 charBreaks->first();
4495 int32_t firstBreak = charBreaks->next();
4496 if (testString.length() != firstBreak) {
4497 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4498 __FILE__, __LINE__, lineNumber, firstBreak);
4499 }
4500 wordBreaks->setText(testString);
4501 wordBreaks->first();
4502 firstBreak = wordBreaks->next();
4503 if (testString.length() != firstBreak) {
4504 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4505 __FILE__, __LINE__, lineNumber, firstBreak);
4506 }
4507 lineBreaks->setText(testString);
4508 lineBreaks->first();
4509 firstBreak = lineBreaks->next();
4510 if (testString.length() != firstBreak) {
4511 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4512 __FILE__, __LINE__, lineNumber, firstBreak);
4513 }
4514 }
4515 }
4516 #endif
4517 }
4518
4519
4520 // TestBug12519 - Correct handling of Locales by assignment / copy / clone
4521
TestBug12519()4522 void RBBITest::TestBug12519() {
4523 UErrorCode status = U_ZERO_ERROR;
4524 LocalPointer<RuleBasedBreakIterator> biEn(dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4525 LocalPointer<RuleBasedBreakIterator> biFr(dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createWordInstance(Locale::getFrance(), status)));
4526 if (!assertSuccess(WHERE, status)) {
4527 dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
4528 return;
4529 }
4530 assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
4531
4532 assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
4533 assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
4534
4535 LocalPointer<RuleBasedBreakIterator>cloneEn(biEn->clone());
4536 assertTrue(WHERE, *biEn == *cloneEn);
4537 assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
4538
4539 LocalPointer<RuleBasedBreakIterator>cloneFr(biFr->clone());
4540 assertTrue(WHERE, *biFr == *cloneFr);
4541 assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
4542
4543 LocalPointer<RuleBasedBreakIterator>biDe(dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createLineInstance(Locale::getGerman(), status)));
4544 UnicodeString text("Hallo Welt");
4545 biDe->setText(text);
4546 assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
4547 *biDe = *biFr;
4548 assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
4549 }
4550
TestBug12677()4551 void RBBITest::TestBug12677() {
4552 // Check that stripping of comments from rules for getRules() is not confused by
4553 // the presence of '#' characters in the rules that do not introduce comments.
4554 UnicodeString rules(u"!!forward; \n"
4555 "$x = [ab#]; # a set with a # literal. \n"
4556 " # .; # a comment that looks sort of like a rule. \n"
4557 " '#' '?'; # a rule with a quoted # \n"
4558 );
4559
4560 UErrorCode status = U_ZERO_ERROR;
4561 UParseError pe;
4562 RuleBasedBreakIterator bi(rules, pe, status);
4563 assertSuccess(WHERE, status);
4564 UnicodeString rtRules = bi.getRules();
4565 assertEquals(WHERE, UnicodeString(u"!!forward;$x=[ab#];'#''?';"), rtRules);
4566 }
4567
4568
TestTableRedundancies()4569 void RBBITest::TestTableRedundancies() {
4570 UErrorCode status = U_ZERO_ERROR;
4571
4572 LocalPointer<RuleBasedBreakIterator> bi (
4573 dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createLineInstance(Locale::getEnglish(), status)));
4574 assertSuccess(WHERE, status);
4575 if (U_FAILURE(status)) return;
4576
4577 RBBIDataWrapper *dw = bi->fData;
4578 const RBBIStateTable *fwtbl = dw->fForwardTable;
4579 UBool in8Bits = fwtbl->fFlags & RBBI_8BITS_ROWS;
4580 int32_t numCharClasses = dw->fHeader->fCatCount;
4581 // printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
4582
4583 // Check for duplicate columns (character categories)
4584
4585 std::vector<UnicodeString> columns;
4586 for (int32_t column = 0; column < numCharClasses; column++) {
4587 UnicodeString s;
4588 for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
4589 RBBIStateTableRow *row = reinterpret_cast<RBBIStateTableRow *>(const_cast<char*>(fwtbl->fTableData + (fwtbl->fRowLen * r)));
4590 s.append(in8Bits ? row->r8.fNextState[column] : row->r16.fNextState[column]);
4591 }
4592 columns.push_back(s);
4593 }
4594 // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
4595 for (int c1=1; c1<numCharClasses; c1++) {
4596 int limit = c1 < (int)fwtbl->fDictCategoriesStart ? fwtbl->fDictCategoriesStart : numCharClasses;
4597 for (int c2 = c1+1; c2 < limit; c2++) {
4598 if (columns.at(c1) == columns.at(c2)) {
4599 errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
4600 goto out;
4601 }
4602 }
4603 }
4604 out:
4605
4606 // Check for duplicate states
4607 std::vector<UnicodeString> rows;
4608 for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
4609 UnicodeString s;
4610 RBBIStateTableRow *row = reinterpret_cast<RBBIStateTableRow *>(const_cast<char*>((fwtbl->fTableData + (fwtbl->fRowLen * r))));
4611 if (in8Bits) {
4612 s.append(row->r8.fAccepting);
4613 s.append(row->r8.fLookAhead);
4614 s.append(row->r8.fTagsIdx);
4615 for (int32_t column = 0; column < numCharClasses; column++) {
4616 s.append(row->r8.fNextState[column]);
4617 }
4618 } else {
4619 s.append(row->r16.fAccepting);
4620 s.append(row->r16.fLookAhead);
4621 s.append(row->r16.fTagsIdx);
4622 for (int32_t column = 0; column < numCharClasses; column++) {
4623 s.append(row->r16.fNextState[column]);
4624 }
4625 }
4626 rows.push_back(s);
4627 }
4628 for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
4629 for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
4630 if (rows.at(r1) == rows.at(r2)) {
4631 errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
4632 return;
4633 }
4634 }
4635 }
4636 }
4637
4638 // Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
4639 // even after next() has returned DONE.
4640
TestBug13447()4641 void RBBITest::TestBug13447() {
4642 UErrorCode status = U_ZERO_ERROR;
4643 LocalPointer<RuleBasedBreakIterator> bi(
4644 dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4645 assertSuccess(WHERE, status);
4646 if (U_FAILURE(status)) return;
4647 UnicodeString data(u"1234");
4648 bi->setText(data);
4649 assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus());
4650 assertEquals(WHERE, 4, bi->next());
4651 assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4652 assertEquals(WHERE, UBRK_DONE, bi->next());
4653 assertEquals(WHERE, 4, bi->current());
4654 assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4655 }
4656
4657 // TestReverse exercises both the synthesized safe reverse rules and the logic
4658 // for filling the break iterator cache when starting from random positions
4659 // in the text.
4660 //
4661 // It's a monkey test, working on random data, with the expected data obtained
4662 // from forward iteration (no safe rules involved), comparing with results
4663 // when indexing into the interior of the string (safe rules needed).
4664
TestReverse()4665 void RBBITest::TestReverse() {
4666 UErrorCode status = U_ZERO_ERROR;
4667
4668 TestReverse(std::unique_ptr<RuleBasedBreakIterator>(dynamic_cast<RuleBasedBreakIterator*>(
4669 BreakIterator::createCharacterInstance(Locale::getEnglish(), status))));
4670 assertSuccess(WHERE, status, true);
4671 status = U_ZERO_ERROR;
4672 TestReverse(std::unique_ptr<RuleBasedBreakIterator>(dynamic_cast<RuleBasedBreakIterator*>(
4673 BreakIterator::createWordInstance(Locale::getEnglish(), status))));
4674 assertSuccess(WHERE, status, true);
4675 status = U_ZERO_ERROR;
4676 TestReverse(std::unique_ptr<RuleBasedBreakIterator>(dynamic_cast<RuleBasedBreakIterator*>(
4677 BreakIterator::createLineInstance(Locale::getEnglish(), status))));
4678 assertSuccess(WHERE, status, true);
4679 status = U_ZERO_ERROR;
4680 TestReverse(std::unique_ptr<RuleBasedBreakIterator>(dynamic_cast<RuleBasedBreakIterator*>(
4681 BreakIterator::createSentenceInstance(Locale::getEnglish(), status))));
4682 assertSuccess(WHERE, status, true);
4683 }
4684
TestReverse(std::unique_ptr<RuleBasedBreakIterator> bi)4685 void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
4686 if (!bi) {
4687 return;
4688 }
4689
4690 // From the mapping trie in the break iterator's internal data, create a
4691 // vector of UnicodeStrings, one for each character category, containing
4692 // all of the code points that map to that category. Unicode planes 0 and 1 only,
4693 // to avoid an execess of unassigned code points.
4694
4695 RBBIDataWrapper *data = bi->fData;
4696 int32_t categoryCount = data->fHeader->fCatCount;
4697 UCPTrie *trie = data->fTrie;
4698 bool use8BitsTrie = ucptrie_getValueWidth(trie) == UCPTRIE_VALUE_BITS_8;
4699 uint32_t dictBit = use8BitsTrie ? 0x0080 : 0x4000;
4700
4701 std::vector<UnicodeString> strings(categoryCount, UnicodeString());
4702 for (int cp=0; cp<0x1fff0; ++cp) {
4703 int cat = ucptrie_get(trie, cp);
4704 cat &= ~dictBit; // And off the dictionary bit from the category.
4705 assertTrue(WHERE, cat < categoryCount && cat >= 0);
4706 if (cat < 0 || cat >= categoryCount) return;
4707 strings[cat].append(cp);
4708 }
4709
4710 icu_rand randomGen;
4711 const int testStringLength = 10000;
4712 UnicodeString testString;
4713
4714 for (int i=0; i<testStringLength; ++i) {
4715 int charClass = randomGen() % categoryCount;
4716 if (strings[charClass].length() > 0) {
4717 int cp = strings[charClass].char32At(randomGen() % strings[charClass].length());
4718 testString.append(cp);
4719 }
4720 }
4721
4722 typedef std::pair<UBool, int32_t> Result;
4723 std::vector<Result> expectedResults;
4724 bi->setText(testString);
4725 for (int i=0; i<testString.length(); ++i) {
4726 bool isboundary = bi->isBoundary(i);
4727 int ruleStatus = bi->getRuleStatus();
4728 expectedResults.push_back(std::make_pair(isboundary, ruleStatus));
4729 }
4730
4731 for (int i=testString.length()-1; i>=0; --i) {
4732 bi->setText(testString); // clears the internal break cache
4733 Result expected = expectedResults[i];
4734 assertEquals(WHERE, expected.first, bi->isBoundary(i));
4735 assertEquals(WHERE, expected.second, bi->getRuleStatus());
4736 }
4737 }
4738
4739
// Ticket 13692 - finding word boundaries in very large numbers or words could
//                be very time consuming. When the problem was present, this test
//                would run more than fifteen minutes, which is to say, the failure was noticeable.
4743
TestBug13692()4744 void RBBITest::TestBug13692() {
4745 UErrorCode status = U_ZERO_ERROR;
4746 LocalPointer<RuleBasedBreakIterator> bi (dynamic_cast<RuleBasedBreakIterator*>(
4747 BreakIterator::createWordInstance(Locale::getEnglish(), status)), status);
4748 if (!assertSuccess(WHERE, status, true)) {
4749 return;
4750 }
4751 constexpr int32_t LENGTH = 1000000;
4752 UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH);
4753 for (int i=0; i<20; i+=2) {
4754 longNumber.setCharAt(i, u' ');
4755 }
4756 bi->setText(longNumber);
4757 assertFalse(WHERE, bi->isBoundary(LENGTH-5));
4758 assertSuccess(WHERE, status);
4759 }
4760
4761
TestProperties()4762 void RBBITest::TestProperties() {
4763 UErrorCode errorCode = U_ZERO_ERROR;
4764 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4765 if (!prependSet.isEmpty()) {
4766 errln(
4767 "[:GCB=Prepend:] is not empty any more. "
4768 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4769 "change this test to the opposite condition.");
4770 }
4771 }
4772
4773
4774 //
4775 // TestDebug - A place-holder test for debugging purposes.
4776 // For putting in fragments of other tests that can be invoked
4777 // for tracing without a lot of unwanted extra stuff happening.
4778 //
TestDebug()4779 void RBBITest::TestDebug() {
4780 UErrorCode status = U_ZERO_ERROR;
4781 LocalPointer<RuleBasedBreakIterator> bi (dynamic_cast<RuleBasedBreakIterator*>(
4782 BreakIterator::createCharacterInstance(Locale::getEnglish(), status)), status);
4783 if (!assertSuccess(WHERE, status, true)) {
4784 return;
4785 }
4786 const UnicodeString &rules = bi->getRules();
4787 UParseError pe;
4788 LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
4789 assertSuccess(WHERE, status);
4790 }
4791
4792
4793 //
4794 // TestDebugRules A stub test for use in debugging rule compilation problems.
4795 // Can be freely altered as needed or convenient.
//                   Leave disabled - #ifdef'ed out - when not actively debugging. The rule source
//                   data files may not be available in all environments.
4798 // Any permanent test cases should be moved to rbbitst.txt
4799 // (see Bug 20303 in that file, for example), or to another test function in this file.
4800 //
TestDebugRules()4801 void RBBITest::TestDebugRules() {
4802 #if 0
4803 const char16_t *rules = u""
4804 "!!quoted_literals_only; \n"
4805 "!!chain; \n"
4806 "!!lookAheadHardBreak; \n"
4807 " \n"
4808 // "[a] / ; \n"
4809 "[a] [b] / [c] [d]; \n"
4810 "[a] [b] / [c] [d] {100}; \n"
4811 "[x] [a] [b] / [c] [d] {100}; \n"
4812 "[a] [b] [c] / [d] {100}; \n"
4813 //" [c] [d] / [e] [f]; \n"
4814 //"[a] [b] / [c]; \n"
4815 ;
4816
4817 UErrorCode status = U_ZERO_ERROR;
4818 CharString path(pathToDataDirectory(), status);
4819 path.appendPathPart("brkitr", status);
4820 path.appendPathPart("rules", status);
4821 path.appendPathPart("line.txt", status);
4822 int len;
4823 std::unique_ptr<char16_t []> testFile(ReadAndConvertFile(path.data(), len, "UTF-8", status));
4824 if (!assertSuccess(WHERE, status)) {
4825 return;
4826 }
4827
4828 UParseError pe;
4829 // rules = testFile.get();
4830 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rules, pe, status);
4831
4832 if (!assertSuccess(WHERE, status)) {
4833 delete bi;
4834 return;
4835 }
4836 // bi->dumpTables();
4837
4838 delete bi;
4839 #endif
4840 }
4841
testTrieStateTable(int32_t numChar,bool expectedTrieWidthIn8Bits,bool expectedStateRowIn8Bits)4842 void RBBITest::testTrieStateTable(int32_t numChar, bool expectedTrieWidthIn8Bits, bool expectedStateRowIn8Bits) {
4843 UCPTrieValueWidth expectedTrieWidth = expectedTrieWidthIn8Bits ? UCPTRIE_VALUE_BITS_8 : UCPTRIE_VALUE_BITS_16;
4844 int32_t expectedStateRowBits = expectedStateRowIn8Bits ? RBBI_8BITS_ROWS : 0;
4845 // Text are duplicate characters from U+4E00 to U+4FFF
4846 UnicodeString text;
4847 for (char16_t c = 0x4e00; c < 0x5000; c++) {
4848 text.append(c).append(c);
4849 }
4850 // Generate rule which will caused length+4 character classes and
4851 // length+3 states
4852 UnicodeString rules(u"!!quoted_literals_only;");
4853 for (char16_t c = 0x4e00; c < 0x4e00 + numChar; c++) {
4854 rules.append(u'\'').append(c).append(c).append(u"';");
4855 }
4856 rules.append(u".;");
4857 UErrorCode status = U_ZERO_ERROR;
4858 UParseError parseError;
4859 RuleBasedBreakIterator bi(rules, parseError, status);
4860
4861 assertEquals(WHERE, numChar + 4, bi.fData->fHeader->fCatCount);
4862 assertEquals(WHERE, numChar + 3, bi.fData->fForwardTable->fNumStates);
4863 assertEquals(WHERE, expectedTrieWidth, ucptrie_getValueWidth(bi.fData->fTrie));
4864 assertEquals(WHERE, expectedStateRowBits, bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS);
4865 assertEquals(WHERE, expectedStateRowBits, bi.fData->fReverseTable->fFlags & RBBI_8BITS_ROWS);
4866
4867 bi.setText(text);
4868
4869 int32_t pos;
4870 int32_t i = 0;
4871 while ((pos = bi.next()) > 0) {
4872 // The first numChar should not break between the pair
4873 if (i++ < numChar) {
4874 assertEquals(WHERE, i * 2, pos);
4875 } else {
4876 // After the first numChar next(), break on each character.
4877 assertEquals(WHERE, i + numChar, pos);
4878 }
4879 }
4880 while ((pos = bi.previous()) > 0) {
4881 // The first numChar should not break between the pair
4882 if (--i < numChar) {
4883 assertEquals(WHERE, i * 2, pos);
4884 } else {
4885 // After the first numChar next(), break on each character.
4886 assertEquals(WHERE, i + numChar, pos);
4887 }
4888 }
4889 }
4890
Test8BitsTrieWith8BitStateTable()4891 void RBBITest::Test8BitsTrieWith8BitStateTable() {
4892 testTrieStateTable(251, true /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
4893 }
4894
Test16BitsTrieWith8BitStateTable()4895 void RBBITest::Test16BitsTrieWith8BitStateTable() {
4896 testTrieStateTable(252, false /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
4897 }
4898
Test16BitsTrieWith16BitStateTable()4899 void RBBITest::Test16BitsTrieWith16BitStateTable() {
4900 testTrieStateTable(253, false /* expectedTrieWidthIn8Bits */, false /* expectedStateRowIn8Bits */);
4901 }
4902
Test8BitsTrieWith16BitStateTable()4903 void RBBITest::Test8BitsTrieWith16BitStateTable() {
4904 // Test UCPTRIE_VALUE_BITS_8 with 16 bits rows. Use a different approach to
4905 // create state table in 16 bits.
4906
4907 // Generate 510 'a' as text
4908 UnicodeString text;
4909 for (int32_t i = 0; i < 510; i++) {
4910 text.append(u'a');
4911 }
4912
4913 UnicodeString rules(u"!!quoted_literals_only;'");
4914 // 254 'a' in the rule will cause 256 states
4915 for (int32_t i = 0; i < 254; i++) {
4916 rules.append(u'a');
4917 }
4918 rules.append(u"';.;");
4919
4920 UErrorCode status = U_ZERO_ERROR;
4921 UParseError parseError;
4922 LocalPointer<RuleBasedBreakIterator> bi(new RuleBasedBreakIterator(rules, parseError, status));
4923
4924 assertEquals(WHERE, 256, bi->fData->fForwardTable->fNumStates);
4925 assertEquals(WHERE, UCPTRIE_VALUE_BITS_8, ucptrie_getValueWidth(bi->fData->fTrie));
4926 assertEquals(WHERE,
4927 false, RBBI_8BITS_ROWS == (bi->fData->fForwardTable->fFlags & RBBI_8BITS_ROWS));
4928 bi->setText(text);
4929
4930 // break positions:
4931 // 254, 508, 509, ... 510
4932 assertEquals("next()", 254, bi->next());
4933 int32_t i = 0;
4934 int32_t pos;
4935 while ((pos = bi->next()) > 0) {
4936 assertEquals(WHERE, 508 + i , pos);
4937 i++;
4938 }
4939 i = 0;
4940 while ((pos = bi->previous()) > 0) {
4941 i++;
4942 if (pos >= 508) {
4943 assertEquals(WHERE, 510 - i , pos);
4944 } else {
4945 assertEquals(WHERE, 254 , pos);
4946 }
4947 }
4948 }
4949
4950 // Test that both compact (8 bit) and full sized (16 bit) rbbi tables work, and
4951 // that there are no problems with rules at the size that transitions between the two.
4952 //
4953 // A rule that matches a literal string, like 'abcdefghij', will require one state and
4954 // one character class per character in the string. So we can make a rule to tickle the
4955 // boundaries by using literal strings of various lengths.
4956 //
// For both the number of states and the number of character classes, the eight bit format
// only has 7 bits available, allowing for 128 values. For both, a few values are reserved,
// leaving 120 something available. This test runs the string over the range of 120 - 260
// (startLength to endLength below), which allows some margin for changes to the number of
// values reserved by the rule builder without breaking the test.
4962
TestTable_8_16_Bits()4963 void RBBITest::TestTable_8_16_Bits() {
4964
4965 // testStr serves as both the source of the rule string (truncated to the desired length)
4966 // and as test data to check matching behavior. A break rule consisting of the first 120
4967 // characters of testStr will match the first 120 chars of the full-length testStr.
4968 UnicodeString testStr;
4969 for (char16_t c=0x3000; c<0x3200; ++c) {
4970 testStr.append(c);
4971 }
4972
4973 const int32_t startLength = 120; // The shortest rule string to test.
4974 const int32_t endLength = 260; // The longest rule string to test
4975 const int32_t increment = this->quick ? endLength - startLength : 1;
4976
4977 for (int32_t ruleLen=startLength; ruleLen <= endLength; ruleLen += increment) {
4978 UParseError parseError;
4979 UErrorCode status = U_ZERO_ERROR;
4980
4981 UnicodeString ruleString{u"!!quoted_literals_only; '#';"};
4982 ruleString.findAndReplace(UnicodeString(u"#"), UnicodeString(testStr, 0, ruleLen));
4983 RuleBasedBreakIterator bi(ruleString, parseError, status);
4984 if (!assertSuccess(WHERE, status)) {
4985 errln(ruleString);
4986 break;
4987 }
4988 // bi.dumpTables();
4989
4990 // Verify that the break iterator is functioning - that the first boundary found
4991 // in testStr is at the length of the rule string.
4992 bi.setText(testStr);
4993 assertEquals(WHERE, ruleLen, bi.next());
4994
4995 // Reverse iteration. Do a setText() first, to flush the break iterator's internal cache
4996 // of previously detected boundaries, thus forcing the engine to run the safe reverse rules.
4997 bi.setText(testStr);
4998 int32_t result = bi.preceding(ruleLen);
4999 assertEquals(WHERE, 0, result);
5000
5001 // Verify that the range of rule lengths being tested cover the translations
5002 // from 8 to 16 bit data.
5003 bool has8BitRowData = bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS;
5004 bool has8BitsTrie = ucptrie_getValueWidth(bi.fData->fTrie) == UCPTRIE_VALUE_BITS_8;
5005
5006 if (ruleLen == startLength) {
5007 assertEquals(WHERE, true, has8BitRowData);
5008 assertEquals(WHERE, true, has8BitsTrie);
5009 }
5010 if (ruleLen == endLength) {
5011 assertEquals(WHERE, false, has8BitRowData);
5012 assertEquals(WHERE, false, has8BitsTrie);
5013 }
5014 }
5015 }
5016
5017 /* Test handling of a large number of look-ahead rules.
5018 * The number of rules in the test exceeds the implementation limits prior to the
5019 * improvements introduced with #13590.
5020 *
5021 * The test look-ahead rules have the form "AB / CE"; "CD / EG"; ...
5022 * The text being matched is sequential, "ABCDEFGHI..."
5023 *
5024 * The upshot is that the look-ahead rules all match on their preceding context,
5025 * and consequently must save a potential result, but then fail to match on their
5026 * trailing context, so that they don't actually cause a boundary.
5027 *
5028 * Additionally, add a ".*" rule, so there are no boundaries unless a
5029 * look-ahead hard-break rule forces one.
5030 */
TestBug13590()5031 void RBBITest::TestBug13590() {
5032 UnicodeString rules {u"!!quoted_literals_only; !!chain; .*;\n"};
5033
5034 const int NUM_LOOKAHEAD_RULES = 50;
5035 const char16_t STARTING_CHAR = u'\u5000';
5036 char16_t firstChar;
5037 for (int ruleNum = 0; ruleNum < NUM_LOOKAHEAD_RULES; ++ruleNum) {
5038 firstChar = STARTING_CHAR + ruleNum*2;
5039 rules.append(u'\'') .append(firstChar) .append(firstChar+1) .append(u'\'')
5040 .append(u' ') .append(u'/') .append(u' ')
5041 .append(u'\'') .append(firstChar+2) .append(firstChar+4) .append(u'\'')
5042 .append(u';') .append(u'\n');
5043 }
5044
5045 // Change the last rule added from the form "UV / WY" to "UV / WX".
5046 // Changes the rule so that it will match - all 4 chars are in ascending sequence.
5047 rules.findAndReplace(UnicodeString(firstChar+4), UnicodeString(firstChar+3));
5048
5049 UErrorCode status = U_ZERO_ERROR;
5050 UParseError parseError;
5051 RuleBasedBreakIterator bi(rules, parseError, status);
5052 if (!assertSuccess(WHERE, status)) {
5053 errln(rules);
5054 return;
5055 }
5056 // bi.dumpTables();
5057
5058 UnicodeString testString;
5059 for (char16_t c = STARTING_CHAR-200; c < STARTING_CHAR + NUM_LOOKAHEAD_RULES*4; ++c) {
5060 testString.append(c);
5061 }
5062 bi.setText(testString);
5063
5064 int breaksFound = 0;
5065 while (bi.next() != UBRK_DONE) {
5066 ++breaksFound;
5067 }
5068
5069 // Two matches are expected, one from the last rule that was explicitly modified,
5070 // and one at the end of the text.
5071 assertEquals(WHERE, 2, breaksFound);
5072 }
5073
5074
5075 #if U_ENABLE_TRACING
// Records of the utrace callbacks received since the last SetupTestTrace()
// call. The callbacks only record calls whose function number lies in
// [UTRACE_UBRK_START, UTRACE_UBRK_LIMIT].
static std::vector<std::string> gData;    // first variadic argument of each traceData() call
static std::vector<int32_t> gEntryFn;     // function numbers passed to traceEntry()
static std::vector<int32_t> gExitFn;      // function numbers passed to traceExit()
static std::vector<int32_t> gDataFn;      // function numbers passed to traceData()
5080
traceData(const void *,int32_t fnNumber,int32_t,const char *,va_list args)5081 static void U_CALLCONV traceData(
5082 const void*,
5083 int32_t fnNumber,
5084 int32_t,
5085 const char *,
5086 va_list args) {
5087 if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5088 const char* data = va_arg(args, const char*);
5089 gDataFn.push_back(fnNumber);
5090 gData.push_back(data);
5091 }
5092 }
5093
traceEntry(const void *,int32_t fnNumber)5094 static void traceEntry(const void *, int32_t fnNumber) {
5095 if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5096 gEntryFn.push_back(fnNumber);
5097 }
5098 }
5099
traceExit(const void *,int32_t fnNumber,const char *,va_list)5100 static void traceExit(const void *, int32_t fnNumber, const char *, va_list) {
5101 if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5102 gExitFn.push_back(fnNumber);
5103 }
5104 }
5105
5106
assertTestTraceResult(int32_t fnNumber,const char * expectedData)5107 void RBBITest::assertTestTraceResult(int32_t fnNumber, const char* expectedData) {
5108 assertEquals("utrace_entry should be called ", 1, gEntryFn.size());
5109 assertEquals("utrace_entry should be called with ", fnNumber, gEntryFn[0]);
5110 assertEquals("utrace_exit should be called ", 1, gExitFn.size());
5111 assertEquals("utrace_exit should be called with ", fnNumber, gExitFn[0]);
5112
5113 if (expectedData == nullptr) {
5114 assertEquals("utrace_data should not be called ", 0, gDataFn.size());
5115 assertEquals("utrace_data should not be called ", 0, gData.size());
5116 } else {
5117 assertEquals("utrace_data should be called ", 1, gDataFn.size());
5118 assertEquals("utrace_data should be called with ", fnNumber, gDataFn[0]);
5119 assertEquals("utrace_data should be called ", 1, gData.size());
5120 assertEquals("utrace_data should pass in ", expectedData, gData[0].c_str());
5121 }
5122 }
5123
SetupTestTrace()5124 void SetupTestTrace() {
5125 gEntryFn.clear();
5126 gExitFn.clear();
5127 gDataFn.clear();
5128 gData.clear();
5129
5130 const void* context = nullptr;
5131 utrace_setFunctions(context, traceEntry, traceExit, traceData);
5132 utrace_setLevel(UTRACE_INFO);
5133 }
5134
TestTraceCreateCharacter()5135 void RBBITest::TestTraceCreateCharacter() {
5136 SetupTestTrace();
5137 IcuTestErrorCode status(*this, "TestTraceCreateCharacter");
5138 LocalPointer<BreakIterator> brkitr(
5139 BreakIterator::createCharacterInstance("zh-CN", status));
5140 status.errIfFailureAndReset();
5141 assertTestTraceResult(UTRACE_UBRK_CREATE_CHARACTER, nullptr);
5142 }
5143
TestTraceCreateTitle()5144 void RBBITest::TestTraceCreateTitle() {
5145 SetupTestTrace();
5146 IcuTestErrorCode status(*this, "TestTraceCreateTitle");
5147 LocalPointer<BreakIterator> brkitr(
5148 BreakIterator::createTitleInstance("zh-CN", status));
5149 status.errIfFailureAndReset();
5150 assertTestTraceResult(UTRACE_UBRK_CREATE_TITLE, nullptr);
5151 }
5152
TestTraceCreateSentence()5153 void RBBITest::TestTraceCreateSentence() {
5154 SetupTestTrace();
5155 IcuTestErrorCode status(*this, "TestTraceCreateSentence");
5156 LocalPointer<BreakIterator> brkitr(
5157 BreakIterator::createSentenceInstance("zh-CN", status));
5158 status.errIfFailureAndReset();
5159 assertTestTraceResult(UTRACE_UBRK_CREATE_SENTENCE, nullptr);
5160 }
5161
TestTraceCreateWord()5162 void RBBITest::TestTraceCreateWord() {
5163 SetupTestTrace();
5164 IcuTestErrorCode status(*this, "TestTraceCreateWord");
5165 LocalPointer<BreakIterator> brkitr(
5166 BreakIterator::createWordInstance("zh-CN", status));
5167 status.errIfFailureAndReset();
5168 assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5169 }
5170
TestTraceCreateLine()5171 void RBBITest::TestTraceCreateLine() {
5172 SetupTestTrace();
5173 IcuTestErrorCode status(*this, "TestTraceCreateLine");
5174 LocalPointer<BreakIterator> brkitr(
5175 BreakIterator::createLineInstance("zh-CN", status));
5176 status.errIfFailureAndReset();
5177 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line");
5178 }
5179
TestTraceCreateLineStrict()5180 void RBBITest::TestTraceCreateLineStrict() {
5181 SetupTestTrace();
5182 IcuTestErrorCode status(*this, "TestTraceCreateLineStrict");
5183 LocalPointer<BreakIterator> brkitr(
5184 BreakIterator::createLineInstance("zh-CN-u-lb-strict", status));
5185 status.errIfFailureAndReset();
5186 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_strict");
5187 }
5188
TestTraceCreateLineNormal()5189 void RBBITest::TestTraceCreateLineNormal() {
5190 SetupTestTrace();
5191 IcuTestErrorCode status(*this, "TestTraceCreateLineNormal");
5192 LocalPointer<BreakIterator> brkitr(
5193 BreakIterator::createLineInstance("zh-CN-u-lb-normal", status));
5194 status.errIfFailureAndReset();
5195 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_normal");
5196 }
5197
TestTraceCreateLineLoose()5198 void RBBITest::TestTraceCreateLineLoose() {
5199 SetupTestTrace();
5200 IcuTestErrorCode status(*this, "TestTraceCreateLineLoose");
5201 LocalPointer<BreakIterator> brkitr(
5202 BreakIterator::createLineInstance("zh-CN-u-lb-loose", status));
5203 status.errIfFailureAndReset();
5204 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_loose");
5205 }
5206
TestTraceCreateLineLoosePhrase()5207 void RBBITest::TestTraceCreateLineLoosePhrase() {
5208 SetupTestTrace();
5209 IcuTestErrorCode status(*this, "TestTraceCreateLineLoosePhrase");
5210 LocalPointer<BreakIterator> brkitr(
5211 BreakIterator::createLineInstance("ja-u-lb-loose-lw-phrase", status));
5212 status.errIfFailureAndReset();
5213 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_loose_phrase");
5214 }
5215
TestTraceCreateLineNormalPhrase()5216 void RBBITest::TestTraceCreateLineNormalPhrase() {
5217 SetupTestTrace();
5218 IcuTestErrorCode status(*this, "TestTraceCreateLineNormalPhrase");
5219 LocalPointer<BreakIterator> brkitr(
5220 BreakIterator::createLineInstance("ja-u-lb-normal-lw-phrase", status));
5221 status.errIfFailureAndReset();
5222 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_normal_phrase");
5223 }
5224
TestTraceCreateLineStrictPhrase()5225 void RBBITest::TestTraceCreateLineStrictPhrase() {
5226 SetupTestTrace();
5227 IcuTestErrorCode status(*this, "TestTraceCreateLineStrictPhrase");
5228 LocalPointer<BreakIterator> brkitr(
5229 BreakIterator::createLineInstance("ja-u-lb-strict-lw-phrase", status));
5230 status.errIfFailureAndReset();
5231 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_strict_phrase");
5232 }
5233
TestTraceCreateLinePhrase()5234 void RBBITest::TestTraceCreateLinePhrase() {
5235 SetupTestTrace();
5236 IcuTestErrorCode status(*this, "TestTraceCreateLinePhrase");
5237 LocalPointer<BreakIterator> brkitr(
5238 BreakIterator::createLineInstance("ja-u-lw-phrase", status));
5239 status.errIfFailureAndReset();
5240 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_phrase");
5241 }
5242
TestTraceCreateBreakEngine()5243 void RBBITest::TestTraceCreateBreakEngine() {
5244 rbbi_cleanup();
5245 SetupTestTrace();
5246 IcuTestErrorCode status(*this, "TestTraceCreateBreakEngine");
5247 LocalPointer<BreakIterator> brkitr(
5248 BreakIterator::createWordInstance("zh-CN", status));
5249 status.errIfFailureAndReset();
5250 assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5251
5252 // To word break the following text, BreakIterator will create 5 dictionary
5253 // break engine internally.
5254 UnicodeString text(
5255 u"test "
5256 u"測試 " // Hani
5257 u"សាកល្បង " // Khmr
5258 u"ທົດສອບ " // Laoo
5259 u"စမ်းသပ်မှု " // Mymr
5260 u"ทดสอบ " // Thai
5261 u"test "
5262 );
5263 brkitr->setText(text);
5264
5265 // Loop through all the text.
5266 while (brkitr->next() > 0) ;
5267
5268 assertEquals("utrace_entry should be called ", 6, gEntryFn.size());
5269 assertEquals("utrace_exit should be called ", 6, gExitFn.size());
5270 assertEquals("utrace_data should be called ", 5, gDataFn.size());
5271
5272 for (std::vector<int>::size_type i = 0; i < gDataFn.size(); i++) {
5273 assertEquals("utrace_entry should be called ",
5274 UTRACE_UBRK_CREATE_BREAK_ENGINE, gEntryFn[i+1]);
5275 assertEquals("utrace_exit should be called ",
5276 UTRACE_UBRK_CREATE_BREAK_ENGINE, gExitFn[i+1]);
5277 assertEquals("utrace_data should be called ",
5278 UTRACE_UBRK_CREATE_BREAK_ENGINE, gDataFn[i]);
5279 }
5280
5281 assertEquals("utrace_data should pass ", "Hani", gData[0].c_str());
5282 assertEquals("utrace_data should pass ", "Khmr", gData[1].c_str());
5283 assertEquals("utrace_data should pass ", "Laoo", gData[2].c_str());
5284 assertEquals("utrace_data should pass ", "Mymr", gData[3].c_str());
5285 assertEquals("utrace_data should pass ", "Thai", gData[4].c_str());
5286
5287 }
5288 #endif
5289
TestUnpairedSurrogate()5290 void RBBITest::TestUnpairedSurrogate() {
5291 UnicodeString rules(u"ab;");
5292
5293 UErrorCode status = U_ZERO_ERROR;
5294 UParseError pe;
5295 RuleBasedBreakIterator bi1(rules, pe, status);
5296 assertSuccess(WHERE, status);
5297 UnicodeString rtRules = bi1.getRules();
5298 // make sure the simple one work first.
5299 assertEquals(WHERE, rules, rtRules);
5300
5301
5302 rules = UnicodeString(u"a\\ud800b;").unescape();
5303 pe.line = 0;
5304 pe.offset = 0;
5305 RuleBasedBreakIterator bi2(rules, pe, status);
5306 assertEquals(WHERE "unpaired lead surrogate", U_ILLEGAL_CHAR_FOUND , status);
5307 if (pe.line != 1 || pe.offset != 1) {
5308 errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
5309 }
5310
5311 status = U_ZERO_ERROR;
5312 rules = UnicodeString(u"a\\ude00b;").unescape();
5313 pe.line = 0;
5314 pe.offset = 0;
5315 RuleBasedBreakIterator bi3(rules, pe, status);
5316 assertEquals(WHERE "unpaired tail surrogate", U_ILLEGAL_CHAR_FOUND , status);
5317 if (pe.line != 1 || pe.offset != 1) {
5318 errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
5319 }
5320
5321 // make sure the surrogate one work too.
5322 status = U_ZERO_ERROR;
5323 rules = UnicodeString(u"ab;");
5324 RuleBasedBreakIterator bi4(rules, pe, status);
5325 rtRules = bi4.getRules();
5326 assertEquals(WHERE, rules, rtRules);
5327 }
5328
5329 // Read file generated by
5330 // https://github.com/unicode-org/lstm_word_segmentation/blob/master/segment_text.py
5331 // as test cases and compare the Output.
5332 // Format of the file
5333 // Model:\t[Model Name (such as 'Thai_graphclust_model4_heavy')]
5334 // Embedding:\t[Embedding type (such as 'grapheme_clusters_tf')]
5335 // Input:\t[source text]
5336 // Output:\t[expected output separated by | ]
5337 // Input: ...
5338 // Output: ...
5339
runLSTMTestFromFile(const char * filename,UScriptCode script)5340 void RBBITest::runLSTMTestFromFile(const char* filename, UScriptCode script) {
5341 // The expectation in this test depends on LSTM, skip the test if the
5342 // configuration is not build with LSTM data.
5343 if (skipLSTMTest()) {
5344 return;
5345 }
5346 UErrorCode status = U_ZERO_ERROR;
5347 LocalPointer<BreakIterator> iterator(BreakIterator::createWordInstance(Locale(), status));
5348 if (U_FAILURE(status)) {
5349 errln("%s:%d Error %s Cannot create Word BreakIterator", __FILE__, __LINE__, u_errorName(status));
5350 return;
5351 }
5352 // Open and read the test data file.
5353 const char *testDataDirectory = IntlTest::getSourceTestData(status);
5354 CharString testFileName(testDataDirectory, -1, status);
5355 testFileName.append(filename, -1, status);
5356
5357 int len;
5358 char16_t *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
5359 if (U_FAILURE(status)) {
5360 errln("%s:%d Error %s opening test file %s", __FILE__, __LINE__, u_errorName(status), filename);
5361 return;
5362 }
5363
5364 // Put the test data into a UnicodeString
5365 UnicodeString testString(false, testFile, len);
5366
5367 int32_t start = 0;
5368
5369 UnicodeString line;
5370 int32_t end;
5371 std::string actual_sep_str;
5372 int32_t caseNum = 0;
5373 // Iterate through all the lines in the test file.
5374 do {
5375 int32_t cr = testString.indexOf(u'\r', start);
5376 int32_t lf = testString.indexOf(u'\n', start);
5377 end = cr >= 0 ? (lf >= 0 ? std::min(cr, lf) : cr) : lf;
5378 line = testString.tempSubString(start, end < 0 ? INT32_MAX : end - start);
5379 if (line.length() > 0) {
5380 // Separate each line to key and value by TAB.
5381 int32_t tab = line.indexOf(u'\t');
5382 UnicodeString key = line.tempSubString(0, tab);
5383 const UnicodeString value = line.tempSubString(tab+1);
5384
5385 if (key == "Model:") {
5386 // Verify the expectation in the test file match the LSTM model
5387 // we are using now.
5388 const LSTMData* data = CreateLSTMDataForScript(script, status);
5389 if (U_FAILURE(status)) {
5390 dataerrln("%s:%d Error %s Cannot create LSTM data for script %s",
5391 __FILE__, __LINE__, u_errorName(status), uscript_getName(script));
5392 return;
5393 }
5394 UnicodeString name(LSTMDataName(data));
5395 DeleteLSTMData(data);
5396 if (value != name) {
5397 std::string utf8Name, utf8Value;
5398 dataerrln("%s:%d Error %s The LSTM data for script %s is %s instead of %s",
5399 __FILE__, __LINE__, u_errorName(status), uscript_getName(script),
5400 name.toUTF8String<std::string>(utf8Name).c_str(),
5401 value.toUTF8String<std::string>(utf8Value).c_str());
5402 return;
5403 }
5404 } else if (key == "Input:") {
5405 UnicodeString input("prefix ");
5406 input += value + " suffix";
5407 std::stringstream ss;
5408
5409 // Construct the UText which is expected by the the engine as
5410 // input from the UnicodeString.
5411 UText ut = UTEXT_INITIALIZER;
5412 utext_openConstUnicodeString(&ut, &input, &status);
5413 if (U_FAILURE(status)) {
5414 dataerrln("Could not utext_openConstUnicodeString for " + value + UnicodeString(u_errorName(status)));
5415 return;
5416 }
5417
5418 iterator->setText(&ut, status);
5419 if (U_FAILURE(status)) {
5420 errln("%s:%d Error %s Could not setText to BreakIterator", __FILE__, __LINE__, u_errorName(status));
5421 return;
5422 }
5423
5424 int32_t bp;
5425 for (bp = iterator->first(); bp != BreakIterator::DONE; bp = iterator->next()) {
5426 ss << bp;
5427 if (bp != input.length()) {
5428 ss << ", ";
5429 }
5430 }
5431
5432 utext_close(&ut);
5433 // Turn the break points into a string for easy comparison
5434 // output.
5435 actual_sep_str = "{" + ss.str() + "}";
5436 } else if (key == "Output:" && !actual_sep_str.empty()) {
5437 UnicodeString input("prefix| |");
5438 input += value + "| |suffix";
5439 std::string d;
5440 int32_t sep;
5441 int32_t start = 0;
5442 int32_t curr = 0;
5443 std::stringstream ss;
5444 // Include 0 as the break point.
5445 ss << "0, ";
5446 while ((sep = input.indexOf(u'|', start)) >= 0) {
5447 int32_t len = sep - start;
5448 if (len > 0) {
5449 if (curr > 0) {
5450 ss << ", ";
5451 }
5452 curr += len;
5453 ss << curr;
5454 }
5455 start = sep + 1;
5456 }
5457 // Include end of the string as break point.
5458 ss << ", " << curr + input.length() - start;
5459 // Turn the break points into a string for easy comparison
5460 // output.
5461 std::string expected = "{" + ss.str() + "}";
5462 std::string utf8;
5463
5464 assertEquals((input + " Test Case#" + caseNum).toUTF8String<std::string>(utf8).c_str(),
5465 expected.c_str(), actual_sep_str.c_str());
5466 actual_sep_str.clear();
5467 }
5468 }
5469 start = std::max(cr, lf) + 1;
5470 } while (end >= 0);
5471
5472 delete [] testFile;
5473 }
5474
TestLSTMThai()5475 void RBBITest::TestLSTMThai() {
5476 runLSTMTestFromFile("Thai_graphclust_model4_heavy_Test.txt", USCRIPT_THAI);
5477 }
5478
TestLSTMBurmese()5479 void RBBITest::TestLSTMBurmese() {
5480 runLSTMTestFromFile("Burmese_graphclust_model5_heavy_Test.txt", USCRIPT_MYANMAR);
5481 }
5482
5483
5484 // Test preceding(index) and following(index), with semi-random indexes.
5485 // The random indexes are produced in clusters that are relatively closely spaced,
5486 // to increase the occurrences of hits to the internal break cache.
5487
TestRandomAccess()5488 void RBBITest::TestRandomAccess() {
5489 static constexpr int32_t CACHE_SIZE = 128;
5490
5491 UnicodeString testData;
5492 for (int i=0; i<CACHE_SIZE*2; ++i) {
5493 testData.append(u"aaaa\n");
5494 }
5495
5496 UErrorCode status = U_ZERO_ERROR;
5497 LocalPointer<RuleBasedBreakIterator> bi(
5498 dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createLineInstance(Locale::getEnglish(), status)),
5499 status);
5500 if (!assertSuccess(WHERE, status)) { return; };
5501
5502 bi->setText(testData);
5503
5504 auto expectedPreceding = [](int from) {
5505 if (from == 0) {return UBRK_DONE;}
5506 if (from % 5 == 0) {return from - 5;}
5507 return from - (from % 5);
5508 };
5509
5510 auto expectedFollow = [testData](int from) {
5511 if (from >= testData.length()) {return UBRK_DONE;}
5512 if (from % 5 == 0) {return from + 5;}
5513 return from + (5 - (from % 5));
5514 };
5515
5516 auto randomStringIndex = [testData]() {
5517 static icu_rand randomGenerator; // produces random uint32_t values.
5518 static int lastNum;
5519 static int clusterCount;
5520 static constexpr int CLUSTER_SIZE = 100;
5521 static constexpr int CLUSTER_LENGTH = 10;
5522
5523 if (clusterCount < CLUSTER_LENGTH) {
5524 ++clusterCount;
5525 lastNum += (randomGenerator() % CLUSTER_SIZE);
5526 lastNum -= CLUSTER_SIZE / 2;
5527 lastNum = std::max(0, lastNum);
5528 // Deliberately test indexes > testData.length.
5529 lastNum = std::min(testData.length() + 5, lastNum);
5530 } else {
5531 clusterCount = 0;
5532 lastNum = randomGenerator() % testData.length();
5533 }
5534 return lastNum;
5535 };
5536
5537 for (int i=0; i<5000; ++i) {
5538 int idx = randomStringIndex();
5539 assertEquals(WHERE, expectedFollow(idx), bi->following(idx));
5540 idx = randomStringIndex();
5541 assertEquals(WHERE, expectedPreceding(idx), bi->preceding(idx));
5542 }
5543 }
5544
5545 #endif // #if !UCONFIG_NO_BREAK_ITERATION
5546