1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 1999-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8 /************************************************************************
9 * Date Name Description
10 * 12/15/99 Madhu Creation.
11 * 01/12/2000 Madhu Updated for changed API and added new tests
12 ************************************************************************/
13
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_BREAK_ITERATION
16
17 #include <algorithm>
18 #include <sstream>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <utility>
23 #include <vector>
24
25 #include "unicode/brkiter.h"
26 #include "unicode/localpointer.h"
27 #include "unicode/numfmt.h"
28 #include "unicode/rbbi.h"
29 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
30 #include "unicode/regex.h"
31 #endif
32 #include "unicode/schriter.h"
33 #include "unicode/uchar.h"
34 #include "unicode/utf16.h"
35 #include "unicode/ucnv.h"
36 #include "unicode/uniset.h"
37 #include "unicode/uscript.h"
38 #include "unicode/ustring.h"
39 #include "unicode/utext.h"
40 #include "unicode/utrace.h"
41
42 #include "charstr.h"
43 #include "cmemory.h"
44 #include "cstr.h"
45 #include "intltest.h"
46 #include "lstmbe.h"
47 #include "rbbitst.h"
48 #include "rbbidata.h"
49 #include "utypeinfo.h" // for 'typeid' to work
50 #include "uvector.h"
51 #include "uvectr32.h"
52
53
54 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
55 #include "unicode/filteredbrk.h"
56 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
57
// Report a test failure, identified by source file and line, when the condition x is false.
// The macro calls errln() unqualified, so it has to be expanded in a scope where errln() is visible.
// Execution continues after the report; the macro does not return from the enclosing function.
#define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
} UPRV_BLOCK_MACRO_END

// Report a test failure, identified by source file, line and error name, when errcode is a
// failing UErrorCode. Reporting goes through errcheckln(), which is also given the error code.
// Execution continues after the report; callers that must stop test U_FAILURE() themselves.
#define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
    } \
} UPRV_BLOCK_MACRO_END

// Report a monkey-test failure through the global test object IntlTest::gTest.
//   msg            description of the failure (printed with %s).
//   fRuleFileName  printed as the "type=" parameter of the reproduction string.
//   index          text index at which the failure was detected.
//   seed           printed as the "seed=" parameter of the reproduction string.
// Note: unlike the two macros above, this one expands to a plain brace block rather than
// a UPRV_BLOCK_MACRO_BEGIN/END pair.
#define MONKEY_ERROR(msg, fRuleFileName, index, seed) { \
    IntlTest::gTest->errln("\n%s:%d %s at index %d. Parameters to reproduce: @\"type=%s seed=%u loop=1\"", \
        __FILE__, __LINE__, msg, index, fRuleFileName, seed); \
}
74
75 //---------------------------------------------
76 // runIndexedTest
77 //---------------------------------------------
78
79
// Note: Before adding new tests to this file, check whether the desired test data can
// simply be added to the file testdata/rbbitst.txt. In most cases it can,
// it's much less work than writing a new test, diagnostic output in the event of failures
// is good, and the test data file is shared with ICU4J, so eventually the test
// will run there as well, without additional effort.
85
// Test dispatcher for this suite.
//   index   selects one entry of the table below.
//   exec    when set, the suite banner is logged; it is also consumed by the TESTCASE_AUTO
//           macros (defined outside this file -- presumably it decides whether the selected
//           test is run or only named; TODO confirm against intltest.h).
//   name    output, filled in by the TESTCASE_AUTO macros.
//   params  extra test parameters; stored in fTestParams for the tests to read.
//
// NOTE(review): the entries presumably receive consecutive index values in the order they
// appear, so the order of the table and of the preprocessor guards matters -- confirm before
// reordering anything.
void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
{
    if (exec) logln("TestSuite RuleBasedBreakIterator: ");
    fTestParams = params;

    TESTCASE_AUTO_BEGIN;
    // Entries guarded by UCONFIG_NO_FILE_IO are compiled out when file I/O is configured off.
#if !UCONFIG_NO_FILE_IO
    TESTCASE_AUTO(TestBug4153072);
#endif
#if !UCONFIG_NO_FILE_IO
    TESTCASE_AUTO(TestUnicodeFiles);
#endif
    TESTCASE_AUTO(TestGetAvailableLocales);
    TESTCASE_AUTO(TestGetDisplayName);
#if !UCONFIG_NO_FILE_IO
    TESTCASE_AUTO(TestEndBehaviour);
    TESTCASE_AUTO(TestWordBreaks);
    TESTCASE_AUTO(TestWordBoundary);
    TESTCASE_AUTO(TestLineBreaks);
    TESTCASE_AUTO(TestSentBreaks);
    TESTCASE_AUTO(TestExtended);
#endif
    // The monkey test needs both regular expressions and file I/O.
#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
    TESTCASE_AUTO(TestMonkey);
#endif
#if !UCONFIG_NO_FILE_IO
    TESTCASE_AUTO(TestBug3818);
#endif
    TESTCASE_AUTO(TestDebug);
#if !UCONFIG_NO_FILE_IO
    TESTCASE_AUTO(TestBug5775);
#endif
    TESTCASE_AUTO(TestBug9983);
    TESTCASE_AUTO(TestDictRules);
    TESTCASE_AUTO(TestBug5532);
    TESTCASE_AUTO(TestBug7547);
    TESTCASE_AUTO(TestBug12797);
    TESTCASE_AUTO(TestBug12918);
    TESTCASE_AUTO(TestBug12932);
    TESTCASE_AUTO(TestEmoji);
    TESTCASE_AUTO(TestBug12519);
    TESTCASE_AUTO(TestBug12677);
    TESTCASE_AUTO(TestTableRedundancies);
    TESTCASE_AUTO(TestBug13447);
    TESTCASE_AUTO(TestReverse);
    TESTCASE_AUTO(TestBug13692);
    TESTCASE_AUTO(TestDebugRules);
    TESTCASE_AUTO(Test8BitsTrieWith8BitStateTable);
    TESTCASE_AUTO(Test8BitsTrieWith16BitStateTable);
    TESTCASE_AUTO(Test16BitsTrieWith8BitStateTable);
    TESTCASE_AUTO(Test16BitsTrieWith16BitStateTable);
    TESTCASE_AUTO(TestTable_8_16_Bits);
    TESTCASE_AUTO(TestBug13590);
    TESTCASE_AUTO(TestUnpairedSurrogate);
    TESTCASE_AUTO(TestLSTMThai);
    TESTCASE_AUTO(TestLSTMBurmese);
    TESTCASE_AUTO(TestRandomAccess);

    // Tracing tests exist only in builds with U_ENABLE_TRACING.
#if U_ENABLE_TRACING
    TESTCASE_AUTO(TestTraceCreateCharacter);
    TESTCASE_AUTO(TestTraceCreateWord);
    TESTCASE_AUTO(TestTraceCreateSentence);
    TESTCASE_AUTO(TestTraceCreateTitle);
    TESTCASE_AUTO(TestTraceCreateLine);
    TESTCASE_AUTO(TestTraceCreateLineNormal);
    TESTCASE_AUTO(TestTraceCreateLineLoose);
    TESTCASE_AUTO(TestTraceCreateLineStrict);
    TESTCASE_AUTO(TestTraceCreateLineNormalPhrase);
    TESTCASE_AUTO(TestTraceCreateLineLoosePhrase);
    TESTCASE_AUTO(TestTraceCreateLineStrictPhrase);
    TESTCASE_AUTO(TestTraceCreateLinePhrase);
    TESTCASE_AUTO(TestTraceCreateBreakEngine);
#endif

    TESTCASE_AUTO_END;
}
162
163
164 //--------------------------------------------------------------------------------------
165 //
166 // RBBITest constructor and destructor
167 //
168 //--------------------------------------------------------------------------------------
169
RBBITest()170 RBBITest::RBBITest() {
171 fTestParams = NULL;
172 }
173
174
~RBBITest()175 RBBITest::~RBBITest() {
176 }
177
178
printStringBreaks(UText * tstr,int expected[],int expectedCount)179 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
180 UErrorCode status = U_ZERO_ERROR;
181 char name[100];
182 printf("code alpha extend alphanum type word sent line name\n");
183 int nextExpectedIndex = 0;
184 utext_setNativeIndex(tstr, 0);
185 for (int j = 0; j < static_cast<int>(utext_nativeLength(tstr)); j=static_cast<int>(utext_getNativeIndex(tstr))) {
186 if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
187 printf("------------------------------------------------ %d\n", j);
188 ++nextExpectedIndex;
189 }
190
191 UChar32 c = utext_next32(tstr);
192 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
193 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
194 u_isUAlphabetic(c),
195 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
196 u_isalnum(c),
197 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
198 u_charType(c),
199 U_SHORT_PROPERTY_NAME),
200 u_getPropertyValueName(UCHAR_WORD_BREAK,
201 u_getIntPropertyValue(c,
202 UCHAR_WORD_BREAK),
203 U_SHORT_PROPERTY_NAME),
204 u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
205 u_getIntPropertyValue(c,
206 UCHAR_SENTENCE_BREAK),
207 U_SHORT_PROPERTY_NAME),
208 u_getPropertyValueName(UCHAR_LINE_BREAK,
209 u_getIntPropertyValue(c,
210 UCHAR_LINE_BREAK),
211 U_SHORT_PROPERTY_NAME),
212 name);
213 }
214 }
215
216
printStringBreaks(const UnicodeString & ustr,int expected[],int expectedCount)217 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
218 UErrorCode status = U_ZERO_ERROR;
219 UText *tstr = NULL;
220 tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
221 if (U_FAILURE(status)) {
222 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
223 return;
224 }
225 printStringBreaks(tstr, expected, expectedCount);
226 utext_close(tstr);
227 }
228
229
TestBug3818()230 void RBBITest::TestBug3818() {
231 UErrorCode status = U_ZERO_ERROR;
232
233 // Four Thai words...
234 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
235 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
236 UnicodeString thaiStr(thaiWordData);
237
238 BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
239 if (U_FAILURE(status) || bi == NULL) {
240 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
241 return;
242 }
243 bi->setText(thaiStr);
244
245 int32_t startOfSecondWord = bi->following(1);
246 if (startOfSecondWord != 4) {
247 errln("Fail at file %s, line %d expected start of word at 4, got %d",
248 __FILE__, __LINE__, startOfSecondWord);
249 }
250 startOfSecondWord = bi->following(0);
251 if (startOfSecondWord != 4) {
252 errln("Fail at file %s, line %d expected start of word at 4, got %d",
253 __FILE__, __LINE__, startOfSecondWord);
254 }
255 delete bi;
256 }
257
258
259 //---------------------------------------------
260 //
261 // other tests
262 //
263 //---------------------------------------------
264
TestGetAvailableLocales()265 void RBBITest::TestGetAvailableLocales()
266 {
267 int32_t locCount = 0;
268 const Locale* locList = BreakIterator::getAvailableLocales(locCount);
269
270 if (locCount == 0)
271 dataerrln("getAvailableLocales() returned an empty list!");
272 // Just make sure that it's returning good memory.
273 int32_t i;
274 for (i = 0; i < locCount; ++i) {
275 logln(locList[i].getName());
276 }
277 }
278
279 //Testing the BreakIterator::getDisplayName() function
TestGetDisplayName()280 void RBBITest::TestGetDisplayName()
281 {
282 UnicodeString result;
283
284 BreakIterator::getDisplayName(Locale::getUS(), result);
285 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
286 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
287 + result);
288
289 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
290 if (result != "French (France)")
291 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
292 + result);
293 }
294 /**
295 * Test End Behaviour
296 * @bug 4068137
297 */
TestEndBehaviour()298 void RBBITest::TestEndBehaviour()
299 {
300 UErrorCode status = U_ZERO_ERROR;
301 UnicodeString testString("boo.");
302 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
303 if (U_FAILURE(status))
304 {
305 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
306 return;
307 }
308 wb->setText(testString);
309
310 if (wb->first() != 0)
311 errln("Didn't get break at beginning of string.");
312 if (wb->next() != 3)
313 errln("Didn't get break before period in \"boo.\"");
314 if (wb->current() != 4 && wb->next() != 4)
315 errln("Didn't get break at end of string.");
316 delete wb;
317 }
318 /*
319 * @bug 4153072
320 */
TestBug4153072()321 void RBBITest::TestBug4153072() {
322 UErrorCode status = U_ZERO_ERROR;
323 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
324 if (U_FAILURE(status))
325 {
326 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
327 return;
328 }
329 UnicodeString str("...Hello, World!...");
330 int32_t begin = 3;
331 int32_t end = str.length() - 3;
332 UBool onBoundary;
333
334 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
335 iter->adoptText(textIterator);
336 int index;
337 // Note: with the switch to UText, there is no way to restrict the
338 // iteration range to begin at an index other than zero.
339 // String character iterators created with a non-zero bound are
340 // treated by RBBI as being empty.
341 for (index = -1; index < begin + 1; ++index) {
342 onBoundary = iter->isBoundary(index);
343 if (index == 0? !onBoundary : onBoundary) {
344 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
345 " and begin index = " + begin);
346 }
347 }
348 delete iter;
349 }
350
351
352 //
353 // Test for problem reported by Ashok Matoria on 9 July 2007
354 // One.<kSoftHyphen><kSpace>Two.
355 //
// Sentence break at start (0) and then on calling next() it breaks at
// 'T' of "Two". Now, at this point if I do next() and
// then previous(), it breaks at <kSoftHyphen> instead of 'T' of "Two".
359 //
TestBug5775()360 void RBBITest::TestBug5775() {
361 UErrorCode status = U_ZERO_ERROR;
362 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
363 TEST_ASSERT_SUCCESS(status);
364 if (U_FAILURE(status)) {
365 return;
366 }
367 // Check for status first for better handling of no data errors.
368 TEST_ASSERT(bi != NULL);
369 if (bi == NULL) {
370 return;
371 }
372
373 UnicodeString s("One.\\u00ad Two.", -1, US_INV);
374 // 01234 56789
375 s = s.unescape();
376 bi->setText(s);
377 int pos = bi->next();
378 TEST_ASSERT(pos == 6);
379 pos = bi->next();
380 TEST_ASSERT(pos == 10);
381 pos = bi->previous();
382 TEST_ASSERT(pos == 6);
383 delete bi;
384 }
385
386
387
388 //------------------------------------------------------------------------------
389 //
390 // RBBITest::Extended Run RBBI Tests from an external test data file
391 //
392 //------------------------------------------------------------------------------
393
394 struct TestParams {
395 BreakIterator *bi; // Break iterator is set while parsing test source.
396 // Changed out whenever test data changes break type.
397
398 UnicodeString dataToBreak; // Data that is built up while parsing the test.
399 UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString.
400 UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak.
401 UVector32 *srcCol;
402
403 UText *textToBreak; // UText, could be UTF8 or UTF16.
404 UVector32 *textMap; // Map from UTF-16 dataToBreak offsets to UText offsets.
405 CharString utf8String; // UTF-8 form of text to break.
406
TestParamsTestParams407 TestParams(UErrorCode &status) : dataToBreak() {
408 bi = NULL;
409 expectedBreaks = new UVector32(status);
410 srcLine = new UVector32(status);
411 srcCol = new UVector32(status);
412 textToBreak = NULL;
413 textMap = new UVector32(status);
414 }
415
~TestParamsTestParams416 ~TestParams() {
417 delete bi;
418 delete expectedBreaks;
419 delete srcLine;
420 delete srcCol;
421 utext_close(textToBreak);
422 delete textMap;
423 }
424
425 int32_t getSrcLine(int32_t bp);
426 int32_t getExpectedBreak(int32_t bp);
427 int32_t getSrcCol(int32_t bp);
428
429 void setUTF16(UErrorCode &status);
430 void setUTF8(UErrorCode &status);
431 };
432
433 // Append a UnicodeString to a CharString with UTF-8 encoding.
434 // Substitute any invalid chars.
435 // Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
CharStringAppend(CharString & dest,const UnicodeString & src,UErrorCode & status)436 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
437 if (U_FAILURE(status)) {
438 return;
439 }
440 int32_t utf8Length;
441 u_strToUTF8WithSub(NULL, 0, &utf8Length, // Output Buffer, NULL for preflight.
442 src.getBuffer(), src.length(), // UTF-16 data
443 0xfffd, NULL, // Substitution char, number of subs.
444 &status);
445 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
446 return;
447 }
448 status = U_ZERO_ERROR;
449 int32_t capacity;
450 char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
451 u_strToUTF8WithSub(buffer, utf8Length, NULL,
452 src.getBuffer(), src.length(),
453 0xfffd, NULL, &status);
454 dest.append(buffer, utf8Length, status);
455 }
456
457
setUTF16(UErrorCode & status)458 void TestParams::setUTF16(UErrorCode &status) {
459 textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
460 textMap->removeAllElements();
461 for (int32_t i=0; i<dataToBreak.length(); i++) {
462 if (i == dataToBreak.getChar32Start(i)) {
463 textMap->addElement(i, status);
464 } else {
465 textMap->addElement(-1, status);
466 }
467 }
468 textMap->addElement(dataToBreak.length(), status);
469 U_ASSERT(dataToBreak.length() + 1 == textMap->size());
470 }
471
472
setUTF8(UErrorCode & status)473 void TestParams::setUTF8(UErrorCode &status) {
474 if (U_FAILURE(status)) {
475 return;
476 }
477 utf8String.clear();
478 CharStringAppend(utf8String, dataToBreak, status);
479 textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
480 if (U_FAILURE(status)) {
481 return;
482 }
483
484 textMap->removeAllElements();
485 int32_t utf16Index = 0;
486 for (;;) {
487 textMap->addElement(utf16Index, status);
488 UChar32 c32 = utext_current32(textToBreak);
489 if (c32 < 0) {
490 break;
491 }
492 utf16Index += U16_LENGTH(c32);
493 utext_next32(textToBreak);
494 while (textMap->size() < utext_getNativeIndex(textToBreak)) {
495 textMap->addElement(-1, status);
496 }
497 }
498 U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
499 }
500
501
getSrcLine(int32_t bp)502 int32_t TestParams::getSrcLine(int32_t bp) {
503 if (bp >= textMap->size()) {
504 bp = textMap->size() - 1;
505 }
506 int32_t i = 0;
507 for(; bp >= 0 ; --bp) {
508 // Move to a character boundary if we are not on one already.
509 i = textMap->elementAti(bp);
510 if (i >= 0) {
511 break;
512 }
513 }
514 return srcLine->elementAti(i);
515 }
516
517
getExpectedBreak(int32_t bp)518 int32_t TestParams::getExpectedBreak(int32_t bp) {
519 if (bp >= textMap->size()) {
520 return 0;
521 }
522 int32_t i = textMap->elementAti(bp);
523 int32_t retVal = 0;
524 if (i >= 0) {
525 retVal = expectedBreaks->elementAti(i);
526 }
527 return retVal;
528 }
529
530
getSrcCol(int32_t bp)531 int32_t TestParams::getSrcCol(int32_t bp) {
532 if (bp >= textMap->size()) {
533 bp = textMap->size() - 1;
534 }
535 int32_t i = 0;
536 for(; bp >= 0; --bp) {
537 // Move bp to a character boundary if we are not on one already.
538 i = textMap->elementAti(bp);
539 if (i >= 0) {
540 break;
541 }
542 }
543 return srcCol->elementAti(i);
544 }
545
546
executeTest(TestParams * t,UErrorCode & status)547 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
548 int32_t bp;
549 int32_t prevBP;
550 int32_t i;
551
552 TEST_ASSERT_SUCCESS(status);
553 if (U_FAILURE(status)) {
554 return;
555 }
556
557 if (t->bi == NULL) {
558 return;
559 }
560
561 t->bi->setText(t->textToBreak, status);
562 //
563 // Run the iterator forward
564 //
565 prevBP = -1;
566 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
567 if (prevBP == bp) {
568 // Fail for lack of forward progress.
569 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
570 bp, t->getSrcLine(bp), t->getSrcCol(bp));
571 break;
572 }
573
574 // Check that there we didn't miss an expected break between the last one
575 // and this one.
576 for (i=prevBP+1; i<bp; i++) {
577 if (t->getExpectedBreak(i) != 0) {
578 int expected[] = {0, i};
579 printStringBreaks(t->dataToBreak, expected, 2);
580 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
581 i, t->getSrcLine(i), t->getSrcCol(i));
582 }
583 }
584
585 // Check that the break we did find was expected
586 if (t->getExpectedBreak(bp) == 0) {
587 int expected[] = {0, bp};
588 printStringBreaks(t->textToBreak, expected, 2);
589 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
590 bp, t->getSrcLine(bp), t->getSrcCol(bp));
591 } else {
592 // The break was expected.
593 // Check that the {nnn} tag value is correct.
594 int32_t expectedTagVal = t->getExpectedBreak(bp);
595 if (expectedTagVal == -1) {
596 expectedTagVal = 0;
597 }
598 int32_t line = t->getSrcLine(bp);
599 int32_t rs = t->bi->getRuleStatus();
600 if (rs != expectedTagVal) {
601 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
602 " Actual, Expected status = %4d, %4d",
603 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
604 }
605 }
606
607 prevBP = bp;
608 }
609
610 // Verify that there were no missed expected breaks after the last one found
611 for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
612 if (t->getExpectedBreak(i) != 0) {
613 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
614 i, t->getSrcLine(i), t->getSrcCol(i));
615 }
616 }
617
618 //
619 // Run the iterator backwards, verify that the same breaks are found.
620 //
621 prevBP = static_cast<int32_t>(utext_nativeLength(t->textToBreak) + 2); // start with a phony value for the last break pos seen.
622 bp = t->bi->last();
623 while (bp != BreakIterator::DONE) {
624 if (prevBP == bp) {
625 // Fail for lack of progress.
626 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
627 bp, t->getSrcLine(bp), t->getSrcCol(bp));
628 break;
629 }
630
631 // Check that we didn't miss an expected break between the last one
632 // and this one. (UVector returns zeros for index out of bounds.)
633 for (i=prevBP-1; i>bp; i--) {
634 if (t->getExpectedBreak(i) != 0) {
635 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
636 i, t->getSrcLine(i), t->getSrcCol(i));
637 }
638 }
639
640 // Check that the break we did find was expected
641 if (t->getExpectedBreak(bp) == 0) {
642 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
643 bp, t->getSrcLine(bp), t->getSrcCol(bp));
644 } else {
645 // The break was expected.
646 // Check that the {nnn} tag value is correct.
647 int32_t expectedTagVal = t->getExpectedBreak(bp);
648 if (expectedTagVal == -1) {
649 expectedTagVal = 0;
650 }
651 int line = t->getSrcLine(bp);
652 int32_t rs = t->bi->getRuleStatus();
653 if (rs != expectedTagVal) {
654 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
655 " Actual, Expected status = %4d, %4d",
656 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
657 }
658 }
659
660 prevBP = bp;
661 bp = t->bi->previous();
662 }
663
664 // Verify that there were no missed breaks prior to the last one found
665 for (i=prevBP-1; i>=0; i--) {
666 if (t->getExpectedBreak(i) != 0) {
667 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
668 i, t->getSrcLine(i), t->getSrcCol(i));
669 }
670 }
671
672 // Check isBoundary()
673 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
674 UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
675 UBool boundaryFound = t->bi->isBoundary(i);
676 if (boundaryExpected != boundaryFound) {
677 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
678 " Expected, Actual= %s, %s",
679 i, t->getSrcLine(i), t->getSrcCol(i),
680 boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
681 }
682 }
683
684 // Check following()
685 for (i=0; i < static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i++) {
686 int32_t actualBreak = t->bi->following(i);
687 int32_t expectedBreak = BreakIterator::DONE;
688 for (int32_t j=i+1; j <= static_cast<int32_t>(utext_nativeLength(t->textToBreak)); j++) {
689 if (t->getExpectedBreak(j) != 0) {
690 expectedBreak = j;
691 break;
692 }
693 }
694 if (expectedBreak != actualBreak) {
695 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
696 " Expected, Actual= %d, %d",
697 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
698 }
699 }
700
701 // Check preceding()
702 for (i=static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i>=0; i--) {
703 int32_t actualBreak = t->bi->preceding(i);
704 int32_t expectedBreak = BreakIterator::DONE;
705
706 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
707 // preceding(trailing byte) will return the index of some preceding code point,
708 // not the lead byte of the current code point, even though that has a smaller index.
709 // Therefore, start looking at the expected break data not at i-1, but at
710 // the start of code point index - 1.
711 utext_setNativeIndex(t->textToBreak, i);
712 int32_t j = static_cast<int32_t>(utext_getNativeIndex(t->textToBreak) - 1);
713 for (; j >= 0; j--) {
714 if (t->getExpectedBreak(j) != 0) {
715 expectedBreak = j;
716 break;
717 }
718 }
719 if (expectedBreak != actualBreak) {
720 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
721 " Expected, Actual= %d, %d",
722 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
723 }
724 }
725 }
726
TestExtended()727 void RBBITest::TestExtended() {
728 // The expectations in this test heavily depends on the Thai dictionary.
729 // Therefore, we skip this test under the LSTM configuration.
730 if (skipDictionaryTest()) {
731 return;
732 }
733 // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
734 // data driven test closely entangles filtered and regular data.
735 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
736 UErrorCode status = U_ZERO_ERROR;
737 Locale locale("");
738
739 TestParams tp(status);
740
741 RegexMatcher localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
742 if (U_FAILURE(status)) {
743 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
744 }
745
746 //
747 // Open and read the test data file.
748 //
749 const char *testDataDirectory = IntlTest::getSourceTestData(status);
750 CharString testFileName(testDataDirectory, -1, status);
751 testFileName.append("rbbitst.txt", -1, status);
752
753 int len;
754 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
755 if (U_FAILURE(status)) {
756 errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
757 return;
758 }
759
760 bool skipTest = false; // Skip this test?
761
762 //
763 // Put the test data into a UnicodeString
764 //
765 UnicodeString testString(false, testFile, len);
766
767 enum EParseState{
768 PARSE_COMMENT,
769 PARSE_TAG,
770 PARSE_DATA,
771 PARSE_NUM,
772 PARSE_RULES
773 }
774 parseState = PARSE_TAG;
775
776 EParseState savedState = PARSE_TAG;
777
778 int32_t lineNum = 1;
779 int32_t colStart = 0;
780 int32_t column = 0;
781 int32_t charIdx = 0;
782
783 int32_t tagValue = 0; // The numeric value of a <nnn> tag.
784
785 UnicodeString rules; // Holds rules from a <rules> ... </rules> block
786 int32_t rulesFirstLine = 0; // Line number of the start of current <rules> block
787
788 for (charIdx = 0; charIdx < len; ) {
789 status = U_ZERO_ERROR;
790 UChar c = testString.charAt(charIdx);
791 charIdx++;
792 if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
793 // treat CRLF as a unit
794 c = u'\n';
795 charIdx++;
796 }
797 if (c == u'\n' || c == u'\r') {
798 lineNum++;
799 colStart = charIdx;
800 }
801 column = charIdx - colStart + 1;
802
803 switch (parseState) {
804 case PARSE_COMMENT:
805 if (c == u'\n' || c == u'\r') {
806 parseState = savedState;
807 }
808 break;
809
810 case PARSE_TAG:
811 {
812 if (c == u'#') {
813 parseState = PARSE_COMMENT;
814 savedState = PARSE_TAG;
815 break;
816 }
817 if (u_isUWhiteSpace(c)) {
818 break;
819 }
820 if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
821 delete tp.bi;
822 tp.bi = BreakIterator::createWordInstance(locale, status);
823 skipTest = false;
824 charIdx += 5;
825 break;
826 }
827 if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
828 delete tp.bi;
829 tp.bi = BreakIterator::createCharacterInstance(locale, status);
830 skipTest = false;
831 charIdx += 5;
832 break;
833 }
834 if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
835 delete tp.bi;
836 tp.bi = BreakIterator::createLineInstance(locale, status);
837 skipTest = false;
838 charIdx += 5;
839 break;
840 }
841 if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
842 delete tp.bi;
843 tp.bi = BreakIterator::createSentenceInstance(locale, status);
844 skipTest = false;
845 charIdx += 5;
846 break;
847 }
848 if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
849 delete tp.bi;
850 tp.bi = BreakIterator::createTitleInstance(locale, status);
851 charIdx += 6;
852 break;
853 }
854
855 if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
856 testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
857 charIdx = testString.indexOf(u'>', charIdx) + 1;
858 parseState = PARSE_RULES;
859 rules.remove();
860 rulesFirstLine = lineNum;
861 break;
862 }
863
864 // <locale loc_name>
865 localeMatcher.reset(testString);
866 if (localeMatcher.lookingAt(charIdx-1, status)) {
867 UnicodeString localeName = localeMatcher.group(1, status);
868 char localeName8[100];
869 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
870 locale = Locale::createFromName(localeName8);
871 charIdx += localeMatcher.group(0, status).length() - 1;
872 TEST_ASSERT_SUCCESS(status);
873 break;
874 }
875 if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
876 parseState = PARSE_DATA;
877 charIdx += 5;
878 tp.dataToBreak = "";
879 tp.expectedBreaks->removeAllElements();
880 tp.srcCol ->removeAllElements();
881 tp.srcLine->removeAllElements();
882 break;
883 }
884
885 errln("line %d: Tag expected in test file.", lineNum);
886 parseState = PARSE_COMMENT;
887 savedState = PARSE_DATA;
888 goto end_test; // Stop the test.
889 }
890 break;
891
892 case PARSE_RULES:
893 if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
894 charIdx += 7;
895 parseState = PARSE_TAG;
896 delete tp.bi;
897 UParseError pe;
898 tp.bi = new RuleBasedBreakIterator(rules, pe, status);
899 skipTest = U_FAILURE(status);
900 if (U_FAILURE(status)) {
901 errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
902 rulesFirstLine + pe.line - 1, u_errorName(status));
903 }
904 } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
905 charIdx += 10;
906 parseState = PARSE_TAG;
907 UErrorCode ec = U_ZERO_ERROR;
908 UParseError pe;
909 RuleBasedBreakIterator bi(rules, pe, ec);
910 if (U_SUCCESS(ec)) {
911 errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
912 rulesFirstLine + pe.line - 1);
913 }
914 } else {
915 rules.append(c);
916 }
917 break;
918
919 case PARSE_DATA:
920 if (c == u'•') {
921 int32_t breakIdx = tp.dataToBreak.length();
922 if (tp.expectedBreaks->size() > breakIdx) {
923 errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
924 lineNum, column);
925 }
926 tp.expectedBreaks->setSize(breakIdx+1);
927 tp.expectedBreaks->setElementAt(-1, breakIdx);
928 tp.srcLine->setSize(breakIdx+1);
929 tp.srcLine->setElementAt(lineNum, breakIdx);
930 tp.srcCol ->setSize(breakIdx+1);
931 tp.srcCol ->setElementAt(column, breakIdx);
932 break;
933 }
934
935 if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
936 // Add final entry to mappings from break location to source file position.
937 // Need one extra because last break position returned is after the
938 // last char in the data, not at the last char.
939 tp.srcLine->addElement(lineNum, status);
940 tp.srcCol ->addElement(column, status);
941
942 parseState = PARSE_TAG;
943 charIdx += 6;
944
945 if (!skipTest) {
946 // RUN THE TEST!
947 status = U_ZERO_ERROR;
948 tp.setUTF16(status);
949 executeTest(&tp, status);
950 TEST_ASSERT_SUCCESS(status);
951
952 // Run again, this time with UTF-8 text wrapped in a UText.
953 status = U_ZERO_ERROR;
954 tp.setUTF8(status);
955 TEST_ASSERT_SUCCESS(status);
956 executeTest(&tp, status);
957 }
958 break;
959 }
960
961 if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
962 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
963 // Get the code point from the name and insert it into the test data.
964 // (Damn, no API takes names in Unicode !!!
965 // we've got to take it back to char *)
966 int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
967 int32_t nameLength = nameEndIdx - (charIdx+2);
968 char charNameBuf[200];
969 UChar32 theChar = -1;
970 if (nameEndIdx != -1) {
971 UErrorCode status = U_ZERO_ERROR;
972 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
973 charNameBuf[sizeof(charNameBuf)-1] = 0;
974 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
975 if (U_FAILURE(status)) {
976 theChar = -1;
977 }
978 }
979 if (theChar == -1) {
980 errln("Error in named character in test file at line %d, col %d",
981 lineNum, column);
982 } else {
983 // Named code point was recognized. Insert it
984 // into the test data.
985 tp.dataToBreak.append(theChar);
986 while (tp.dataToBreak.length() > tp.srcLine->size()) {
987 tp.srcLine->addElement(lineNum, status);
988 tp.srcCol ->addElement(column, status);
989 }
990 }
991 if (nameEndIdx > charIdx) {
992 charIdx = nameEndIdx+1;
993
994 }
995 break;
996 }
997
998
999
1000 if (testString.compare(charIdx-1, 2, u"<>") == 0) {
1001 charIdx++;
1002 int32_t breakIdx = tp.dataToBreak.length();
1003 tp.expectedBreaks->setSize(breakIdx+1);
1004 tp.expectedBreaks->setElementAt(-1, breakIdx);
1005 tp.srcLine->setSize(breakIdx+1);
1006 tp.srcLine->setElementAt(lineNum, breakIdx);
1007 tp.srcCol ->setSize(breakIdx+1);
1008 tp.srcCol ->setElementAt(column, breakIdx);
1009 break;
1010 }
1011
1012 if (c == u'<') {
1013 tagValue = 0;
1014 parseState = PARSE_NUM;
1015 break;
1016 }
1017
1018 if (c == u'#' && column==3) { // TODO: why is column off so far?
1019 parseState = PARSE_COMMENT;
1020 savedState = PARSE_DATA;
1021 break;
1022 }
1023
1024 if (c == u'\\') {
1025 // Check for \ at end of line, a line continuation.
1026 // Advance over (discard) the newline
1027 UChar32 cp = testString.char32At(charIdx);
1028 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
1029 // We have a CR LF
1030 // Need an extra increment of the input ptr to move over both of them
1031 charIdx++;
1032 }
1033 if (cp == u'\n' || cp == u'\r') {
1034 lineNum++;
1035 colStart = charIdx;
1036 charIdx++;
1037 break;
1038 }
1039
1040 // Let unescape handle the back slash.
1041 cp = testString.unescapeAt(charIdx);
1042 if (cp != -1) {
1043 // Escape sequence was recognized. Insert the char
1044 // into the test data.
1045 tp.dataToBreak.append(cp);
1046 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1047 tp.srcLine->addElement(lineNum, status);
1048 tp.srcCol ->addElement(column, status);
1049 }
1050 break;
1051 }
1052
1053
1054 // Not a recognized backslash escape sequence.
1055 // Take the next char as a literal.
1056 // TODO: Should this be an error?
1057 c = testString.charAt(charIdx);
1058 charIdx = testString.moveIndex32(charIdx, 1);
1059 }
1060
1061 // Normal, non-escaped data char.
1062 tp.dataToBreak.append(c);
1063
1064 // Save the mapping from offset in the data to line/column numbers in
1065 // the original input file. Will be used for better error messages only.
1066 // If there's an expected break before this char, the slot in the mapping
1067 // vector will already be set for this char; don't overwrite it.
1068 if (tp.dataToBreak.length() > tp.srcLine->size()) {
1069 tp.srcLine->addElement(lineNum, status);
1070 tp.srcCol ->addElement(column, status);
1071 }
1072 break;
1073
1074
1075 case PARSE_NUM:
1076 // We are parsing an expected numeric tag value, like <1234>,
1077 // within a chunk of data.
1078 if (u_isUWhiteSpace(c)) {
1079 break;
1080 }
1081
1082 if (c == u'>') {
1083 // Finished the number. Add the info to the expected break data,
1084 // and switch parse state back to doing plain data.
1085 parseState = PARSE_DATA;
1086 if (tagValue == 0) {
1087 tagValue = -1;
1088 }
1089 int32_t breakIdx = tp.dataToBreak.length();
1090 if (tp.expectedBreaks->size() > breakIdx) {
1091 errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
1092 lineNum, column);
1093 }
1094 tp.expectedBreaks->setSize(breakIdx+1);
1095 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1096 tp.srcLine->setSize(breakIdx+1);
1097 tp.srcLine->setElementAt(lineNum, breakIdx);
1098 tp.srcCol ->setSize(breakIdx+1);
1099 tp.srcCol ->setElementAt(column, breakIdx);
1100 break;
1101 }
1102
1103 if (u_isdigit(c)) {
1104 tagValue = tagValue*10 + u_charDigitValue(c);
1105 break;
1106 }
1107
1108 errln("Syntax Error in test file at line %d, col %d",
1109 lineNum, column);
1110 parseState = PARSE_COMMENT;
1111 goto end_test; // Stop the test
1112 break;
1113 }
1114
1115
1116 if (U_FAILURE(status)) {
1117 dataerrln("ICU Error %s while parsing test file at line %d.",
1118 u_errorName(status), lineNum);
1119 status = U_ZERO_ERROR;
1120 goto end_test; // Stop the test
1121 }
1122
1123 }
1124
1125 // Reached end of test file. Raise an error if parseState indicates that we are
1126 // within a block that should have been terminated.
1127
1128 if (parseState == PARSE_RULES) {
1129 errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
1130 lineNum, rulesFirstLine);
1131 }
1132 if (parseState == PARSE_DATA) {
1133 errln("rbbitst.txt:%d <data> block not closed.", lineNum);
1134 }
1135
1136
1137 end_test:
1138 delete [] testFile;
1139 #endif
1140 }
1141
1142 //-------------------------------------------------------------------------------
1143 //
//  TestDictRules   Create a break iterator from source rules that include a
//                  dictionary range.  Regression test for bug #7130.  The source
//                  rules do not declare a break iterator type (word, line,
//                  sentence, etc.), and the dictionary code, without a type, would loop.
1148 //
1149 //-------------------------------------------------------------------------------
TestDictRules()1150 void RBBITest::TestDictRules() {
1151 const char *rules = "$dictionary = [a-z]; \n"
1152 "!!forward; \n"
1153 "$dictionary $dictionary; \n"
1154 "!!reverse; \n"
1155 "$dictionary $dictionary; \n";
1156 const char *text = "aa";
1157 UErrorCode status = U_ZERO_ERROR;
1158 UParseError parseError;
1159
1160 RuleBasedBreakIterator bi(rules, parseError, status);
1161 if (U_SUCCESS(status)) {
1162 UnicodeString utext = text;
1163 bi.setText(utext);
1164 int32_t position;
1165 int32_t loops;
1166 for (loops = 0; loops<10; loops++) {
1167 position = bi.next();
1168 if (position == RuleBasedBreakIterator::DONE) {
1169 break;
1170 }
1171 }
1172 TEST_ASSERT(loops == 1);
1173 } else {
1174 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1175 }
1176 }
1177
1178
1179
1180 //--------------------------------------------------------------------------------------------
1181 //
1182 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
1183 //
1184 //-------------------------------------------------------------------------------------------
TestUnicodeFiles()1185 void RBBITest::TestUnicodeFiles() {
1186 RuleBasedBreakIterator *bi;
1187 UErrorCode status = U_ZERO_ERROR;
1188
1189 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1190 TEST_ASSERT_SUCCESS(status);
1191 if (U_SUCCESS(status)) {
1192 runUnicodeTestData("GraphemeBreakTest.txt", bi);
1193 }
1194 delete bi;
1195
1196 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1197 TEST_ASSERT_SUCCESS(status);
1198 if (U_SUCCESS(status)) {
1199 runUnicodeTestData("WordBreakTest.txt", bi);
1200 }
1201 delete bi;
1202
1203 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1204 TEST_ASSERT_SUCCESS(status);
1205 if (U_SUCCESS(status)) {
1206 runUnicodeTestData("SentenceBreakTest.txt", bi);
1207 }
1208 delete bi;
1209
1210 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1211 TEST_ASSERT_SUCCESS(status);
1212 if (U_SUCCESS(status)) {
1213 runUnicodeTestData("LineBreakTest.txt", bi);
1214 }
1215 delete bi;
1216 }
1217
1218
1219 // Check for test cases from the Unicode test data files that are known to fail
1220 // and should be skipped as known issues because ICU does not fully implement
1221 // the Unicode specifications, or because ICU includes tailorings that differ from
1222 // the Unicode standard.
1223 //
1224 // Test cases are identified by the test data sequence, which tends to be more stable
1225 // across Unicode versions than the test file line numbers.
1226 //
1227 // The test case with ticket "10666" is a dummy, included as an example.
1228
testCaseIsKnownIssue(const UnicodeString & testCase,const char * fileName)1229 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1230 static struct TestCase {
1231 const char *fTicketNum;
1232 const char *fFileName;
1233 const UChar *fString;
1234 } badTestCases[] = {
1235 {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"}, // Fake example, for illustration.
1236 // The following tests were originally for
1237 // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
1238 // However, that ticket has been closed as fixed but these tests still fail, so
1239 // ICU-21097 has been created to investigate and address these remaining issues.
1240 {"21097", "LineBreakTest.txt", u"-#"},
1241 {"21097", "LineBreakTest.txt", u"\u002d\u0308\u0023"},
1242 {"21097", "LineBreakTest.txt", u"\u002d\u00a7"},
1243 {"21097", "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
1244 {"21097", "LineBreakTest.txt", u"\u002d\U00050005"},
1245 {"21097", "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
1246 {"21097", "LineBreakTest.txt", u"\u002d\u0e01"},
1247 {"21097", "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
1248
1249 // The following tests were originally for
1250 // Issue ICU-12017 Improve line break around numbers.
1251 // However, that ticket has been closed as fixed but these tests still fail, so
1252 // ICU-21097 has been created to investigate and address these remaining issues.
1253 {"21097", "LineBreakTest.txt", u"\u002C\u0030"}, // ",0"
1254 {"21097", "LineBreakTest.txt", u"\u002C\u0308\u0030"},
1255 {"21097", "LineBreakTest.txt", u"equals .35 cents"},
1256 {"21097", "LineBreakTest.txt", u"a.2 "},
1257 {"21097", "LineBreakTest.txt", u"a.2 \u0915"},
1258 {"21097", "LineBreakTest.txt", u"a.2 \u672C"},
1259 {"21097", "LineBreakTest.txt", u"a.2\u3000\u672C"},
1260 {"21097", "LineBreakTest.txt", u"a.2\u3000\u307E"},
1261 {"21097", "LineBreakTest.txt", u"a.2\u3000\u0033"},
1262 {"21097", "LineBreakTest.txt", u"A.1 \uBABB"},
1263 {"21097", "LineBreakTest.txt", u"\uBD24\uC5B4\u002E\u0020\u0041\u002E\u0032\u0020\uBCFC"},
1264 {"21097", "LineBreakTest.txt", u"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"},
1265 {"21097", "LineBreakTest.txt", u"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"},
1266 {"21097", "LineBreakTest.txt", u"a.2\u3000\u300C"},
1267
1268 // ICU-22127 until UAX #29 wordbreak is update for the colon changes in ICU-22112,
1269 // need to skip some tests in WordBreakTest.txt
1270 {"22127", "WordBreakTest.txt", u"a:"},
1271 {"22127", "WordBreakTest.txt", u"A:"},
1272 };
1273
1274 for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
1275 const TestCase &badCase = badTestCases[n];
1276 if (!strcmp(fileName, badCase.fFileName) &&
1277 testCase.startsWith(UnicodeString(badCase.fString))) {
1278 return logKnownIssue(badCase.fTicketNum);
1279 }
1280 }
1281 return false;
1282 }
1283
1284
1285 //--------------------------------------------------------------------------------------------
1286 //
1287 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
1288 //
1289 //-------------------------------------------------------------------------------------------
runUnicodeTestData(const char * fileName,RuleBasedBreakIterator * bi)1290 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1291 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1292 UErrorCode status = U_ZERO_ERROR;
1293
1294 //
1295 // Open and read the test data file, put it into a UnicodeString.
1296 //
1297 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1298 char testFileName[1000];
1299 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1300 dataerrln("Can't open test data. Path too long.");
1301 return;
1302 }
1303 strcpy(testFileName, testDataDirectory);
1304 strcat(testFileName, fileName);
1305
1306 logln("Opening data file %s\n", fileName);
1307
1308 int len;
1309 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1310 if (status != U_FILE_ACCESS_ERROR) {
1311 TEST_ASSERT_SUCCESS(status);
1312 TEST_ASSERT(testFile != NULL);
1313 }
1314 if (U_FAILURE(status) || testFile == NULL) {
1315 return; /* something went wrong, error already output */
1316 }
1317 UnicodeString testFileAsString(true, testFile, len);
1318
1319 //
1320 // Parse the test data file using a regular expression.
1321 // Each kind of token is recognized in its own capture group; what type of item was scanned
1322 // is identified by which group had a match.
1323 //
1324 // Capture Group # 1 2 3 4 5
1325 // Parses this item: divide x hex digits comment \n unrecognized \n
1326 //
1327 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1328 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1329 UnicodeString testString;
1330 UVector32 breakPositions(status);
1331 int lineNumber = 1;
1332 TEST_ASSERT_SUCCESS(status);
1333 if (U_FAILURE(status)) {
1334 return;
1335 }
1336
1337 //
1338 // Scan through each test case, building up the string to be broken in testString,
1339 // and the positions that should be boundaries in the breakPositions vector.
1340 //
1341 int spin = 0;
1342 while (tokenMatcher.find()) {
1343 if(tokenMatcher.hitEnd()) {
1344 /* Shouldn't Happen(TM). This means we didn't find the symbols we were looking for.
1345 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1346 and caused an infinite loop here on EBCDIC systems!
1347 */
1348 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1349 // return;
1350 }
1351 if (tokenMatcher.start(1, status) >= 0) {
1352 // Scanned a divide sign, indicating a break position in the test data.
1353 if (testString.length()>0) {
1354 breakPositions.addElement(testString.length(), status);
1355 }
1356 }
1357 else if (tokenMatcher.start(2, status) >= 0) {
1358 // Scanned an 'x', meaning no break at this position in the test data
1359 // Nothing to be done here.
1360 }
1361 else if (tokenMatcher.start(3, status) >= 0) {
1362 // Scanned Hex digits. Convert them to binary, append to the character data string.
1363 const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1364 int length = hexNumber.length();
1365 if (length<=8) {
1366 char buf[10];
1367 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1368 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1369 if (c<=0x10ffff) {
1370 testString.append(c);
1371 } else {
1372 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1373 fileName, lineNumber);
1374 }
1375 } else {
1376 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1377 fileName, lineNumber);
1378 }
1379 }
1380 else if (tokenMatcher.start(4, status) >= 0) {
1381 // Scanned to end of a line, possibly skipping over a comment in the process.
1382 // If the line from the file contained test data, run the test now.
1383 if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1384 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1385 }
1386
1387 // Clear out this test case.
1388 // The string and breakPositions vector will be refilled as the next
1389 // test case is parsed.
1390 testString.remove();
1391 breakPositions.removeAllElements();
1392 lineNumber++;
1393 } else {
1394 // Scanner catchall. Something unrecognized appeared on the line.
1395 char token[16];
1396 UnicodeString uToken = tokenMatcher.group(0, status);
1397 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1398 token[sizeof(token)-1] = 0;
1399 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token);
1400
1401 // Clean up, in preparation for continuing with the next line.
1402 testString.remove();
1403 breakPositions.removeAllElements();
1404 lineNumber++;
1405 }
1406 TEST_ASSERT_SUCCESS(status);
1407 if (U_FAILURE(status)) {
1408 break;
1409 }
1410 }
1411
1412 delete [] testFile;
1413 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1414 }
1415
1416 //--------------------------------------------------------------------------------------------
1417 //
1418 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1419 // test data files. Do only a simple, forward-only check -
1420 // this test is mostly to check that ICU and the Unicode
1421 // data agree with each other.
1422 //
1423 //--------------------------------------------------------------------------------------------
checkUnicodeTestCase(const char * testFileName,int lineNumber,const UnicodeString & testString,UVector32 * breakPositions,RuleBasedBreakIterator * bi)1424 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1425 const UnicodeString &testString, // Text data to be broken
1426 UVector32 *breakPositions, // Positions where breaks should be found.
1427 RuleBasedBreakIterator *bi) {
1428 int32_t pos; // Break Position in the test string
1429 int32_t expectedI = 0; // Index of expected break position in the vector of expected results.
1430 int32_t expectedPos; // Expected break position (index into test string)
1431
1432 bi->setText(testString);
1433 pos = bi->first();
1434 pos = bi->next();
1435
1436 while (pos != BreakIterator::DONE) {
1437 if (expectedI >= breakPositions->size()) {
1438 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1439 testFileName, lineNumber, pos);
1440 break;
1441 }
1442 expectedPos = breakPositions->elementAti(expectedI);
1443 if (pos < expectedPos) {
1444 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1445 testFileName, lineNumber, pos);
1446 break;
1447 }
1448 if (pos > expectedPos) {
1449 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1450 testFileName, lineNumber, expectedPos);
1451 break;
1452 }
1453 pos = bi->next();
1454 expectedI++;
1455 }
1456
1457 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1458 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1459 testFileName, lineNumber, breakPositions->elementAti(expectedI));
1460 }
1461 }
1462
1463
1464
1465 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1466 //---------------------------------------------------------------------------------------
1467 //
1468 // class RBBIMonkeyKind
1469 //
1470 // Monkey Test for Break Iteration
1471 // Abstract interface class. Concrete derived classes independently
1472 // implement the break rules for different iterator types.
1473 //
//      The Monkey Test itself doesn't know which type of break iterator it is
//      testing, but works purely in terms of the interface defined here.
1476 //
1477 //---------------------------------------------------------------------------------------
1478 class RBBIMonkeyKind {
1479 public:
1480 // Return a UVector of UnicodeSets, representing the character classes used
1481 // for this type of iterator.
1482 virtual UVector *charClasses() = 0;
1483
1484 // Set the test text on which subsequent calls to next() will operate
1485 virtual void setText(const UnicodeString &s) = 0;
1486
1487 // Find the next break position, starting from the prev break position, or from zero.
1488 // Return -1 after reaching end of string.
1489 virtual int32_t next(int32_t i) = 0;
1490
1491 // Name of each character class, parallel with charClasses. Used for debugging output
1492 // of characters.
1493 virtual std::vector<std::string>& characterClassNames();
1494
1495 void setAppliedRule(int32_t position, const char* value);
1496
1497 std::string getAppliedRule(int32_t position);
1498
1499 virtual ~RBBIMonkeyKind();
1500 UErrorCode deferredStatus;
1501
1502 std::string classNameFromCodepoint(const UChar32 c);
1503 unsigned int maxClassNameSize();
1504
1505 protected:
1506 RBBIMonkeyKind();
1507 std::vector<std::string> classNames;
1508 std::vector<std::string> appliedRules;
1509
1510 // Clear `appliedRules` and fill it with empty strings in the size of test text.
1511 void prepareAppliedRules(int32_t size );
1512
1513 private:
1514
1515 };
1516
RBBIMonkeyKind()1517 RBBIMonkeyKind::RBBIMonkeyKind() {
1518 deferredStatus = U_ZERO_ERROR;
1519 }
1520
~RBBIMonkeyKind()1521 RBBIMonkeyKind::~RBBIMonkeyKind() {
1522 }
1523
characterClassNames()1524 std::vector<std::string>& RBBIMonkeyKind::characterClassNames() {
1525 return classNames;
1526 }
1527
prepareAppliedRules(int32_t size)1528 void RBBIMonkeyKind::prepareAppliedRules(int32_t size) {
1529 // Remove all the information in the `appliedRules`.
1530 appliedRules.clear();
1531 appliedRules.resize(size + 1);
1532 }
1533
setAppliedRule(int32_t position,const char * value)1534 void RBBIMonkeyKind::setAppliedRule(int32_t position, const char* value) {
1535 appliedRules[position] = value;
1536 }
1537
getAppliedRule(int32_t position)1538 std::string RBBIMonkeyKind::getAppliedRule(int32_t position){
1539 return appliedRules[position];
1540 }
1541
classNameFromCodepoint(const UChar32 c)1542 std::string RBBIMonkeyKind::classNameFromCodepoint(const UChar32 c) {
1543 // Simply iterate through charClasses to find character's class
1544 for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1545 UnicodeSet *classSet = (UnicodeSet *)charClasses()->elementAt(aClassNum);
1546 if (classSet->contains(c)) {
1547 return classNames[aClassNum];
1548 }
1549 }
1550 U_ASSERT(false); // This should not happen.
1551 return "bad class name";
1552 }
1553
maxClassNameSize()1554 unsigned int RBBIMonkeyKind::maxClassNameSize() {
1555 unsigned int maxSize = 0;
1556 for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1557 auto aClassNumSize = static_cast<unsigned int>(classNames[aClassNum].size());
1558 if (aClassNumSize > maxSize) {
1559 maxSize = aClassNumSize;
1560 }
1561 }
1562 return maxSize;
1563 }
1564
1565 //----------------------------------------------------------------------------------------
1566 //
1567 // Random Numbers. Similar to standard lib rand() and srand()
1568 // Not using library to
1569 // 1. Get same results on all platforms.
1570 // 2. Get access to current seed, to more easily reproduce failures.
1571 //
1572 //---------------------------------------------------------------------------------------
// Current state of the generator. Kept at file scope so that a failing run can
// report the seed and be reproduced.
static uint32_t m_seed = 1;

// Advances the linear congruential generator by one step and returns a value
// in the range 0..32767, taken from bits 16..30 of the new state.
static uint32_t m_rand()
{
    constexpr uint32_t kMultiplier = 1103515245;
    constexpr uint32_t kIncrement  = 12345;
    m_seed = m_seed * kMultiplier + kIncrement;   // wraps modulo 2^32
    return (m_seed >> 16) & 0x7fffu;
}
1580
1581
1582 //------------------------------------------------------------------------------------------
1583 //
1584 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
1585 // of RBBIMonkeyKind.
1586 //
1587 //------------------------------------------------------------------------------------------
1588 class RBBICharMonkey: public RBBIMonkeyKind {
1589 public:
1590 RBBICharMonkey();
1591 virtual ~RBBICharMonkey();
1592 virtual UVector *charClasses() override;
1593 virtual void setText(const UnicodeString &s) override;
1594 virtual int32_t next(int32_t i) override;
1595 private:
1596 UVector *fSets;
1597
1598 UnicodeSet *fCRLFSet;
1599 UnicodeSet *fControlSet;
1600 UnicodeSet *fExtendSet;
1601 UnicodeSet *fZWJSet;
1602 UnicodeSet *fRegionalIndicatorSet;
1603 UnicodeSet *fPrependSet;
1604 UnicodeSet *fSpacingSet;
1605 UnicodeSet *fLSet;
1606 UnicodeSet *fVSet;
1607 UnicodeSet *fTSet;
1608 UnicodeSet *fLVSet;
1609 UnicodeSet *fLVTSet;
1610 UnicodeSet *fHangulSet;
1611 UnicodeSet *fExtendedPictSet;
1612 UnicodeSet *fViramaSet;
1613 UnicodeSet *fLinkingConsonantSet;
1614 UnicodeSet *fExtCccZwjSet;
1615 UnicodeSet *fAnySet;
1616
1617 const UnicodeString *fText;
1618 };
1619
1620
RBBICharMonkey()1621 RBBICharMonkey::RBBICharMonkey() {
1622 UErrorCode status = U_ZERO_ERROR;
1623
1624 fText = NULL;
1625
1626 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1627 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
1628 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
1629 fZWJSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
1630 fRegionalIndicatorSet =
1631 new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1632 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1633 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1634 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1635 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1636 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1637 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1638 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1639 fHangulSet = new UnicodeSet();
1640 fHangulSet->addAll(*fLSet);
1641 fHangulSet->addAll(*fVSet);
1642 fHangulSet->addAll(*fTSet);
1643 fHangulSet->addAll(*fLVSet);
1644 fHangulSet->addAll(*fLVTSet);
1645
1646 fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1647 fViramaSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1648 "\\p{Indic_Syllabic_Category=Virama}]", status);
1649 fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1650 "\\p{Indic_Syllabic_Category=Consonant}]", status);
1651 fExtCccZwjSet = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status);
1652 fAnySet = new UnicodeSet(0, 0x10ffff);
1653
1654 // Create sets of characters, and add the names of the above character sets.
1655 // In each new ICU release, add new names corresponding to the sets above.
1656 fSets = new UVector(status);
1657
1658 // Important: Keep class names the same as the class contents.
1659 fSets->addElement(fCRLFSet, status); classNames.push_back("CRLF");
1660 fSets->addElement(fControlSet, status); classNames.push_back("Control");
1661 fSets->addElement(fExtendSet, status); classNames.push_back("Extended");
1662 fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
1663 if (!fPrependSet->isEmpty()) {
1664 fSets->addElement(fPrependSet, status); classNames.push_back("Prepend");
1665 }
1666 fSets->addElement(fSpacingSet, status); classNames.push_back("Spacing");
1667 fSets->addElement(fHangulSet, status); classNames.push_back("Hangul");
1668 fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
1669 fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
1670 fSets->addElement(fViramaSet, status); classNames.push_back("Virama");
1671 fSets->addElement(fLinkingConsonantSet, status); classNames.push_back("LinkingConsonant");
1672 fSets->addElement(fExtCccZwjSet, status); classNames.push_back("ExtCcccZwj");
1673 fSets->addElement(fAnySet, status); classNames.push_back("Any");
1674
1675 if (U_FAILURE(status)) {
1676 deferredStatus = status;
1677 }
1678 }
1679
1680
setText(const UnicodeString & s)1681 void RBBICharMonkey::setText(const UnicodeString &s) {
1682 fText = &s;
1683 prepareAppliedRules(s.length());
1684 }
1685
1686
1687
next(int32_t prevPos)1688 int32_t RBBICharMonkey::next(int32_t prevPos) {
1689 int p0, p1, p2, p3; // Indices of the significant code points around the
1690 // break position being tested. The candidate break
1691 // location is before p2.
1692
1693 int breakPos = -1;
1694
1695 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
1696 UChar32 cBase; // for (X Extend*) patterns, the X character.
1697
1698 if (U_FAILURE(deferredStatus)) {
1699 return -1;
1700 }
1701
1702 // Previous break at end of string. return DONE.
1703 if (prevPos >= fText->length()) {
1704 return -1;
1705 }
1706
1707 p0 = p1 = p2 = p3 = prevPos;
1708 c3 = fText->char32At(prevPos);
1709 c0 = c1 = c2 = cBase = 0;
1710 (void)p0; // suppress set but not used warning.
1711 (void)c0;
1712
1713 // Loop runs once per "significant" character position in the input text.
1714 for (;;) {
1715 // Move all of the positions forward in the input string.
1716 p0 = p1; c0 = c1;
1717 p1 = p2; c1 = c2;
1718 p2 = p3; c2 = c3;
1719
1720 // Advance p3 by one codepoint
1721 p3 = fText->moveIndex32(p3, 1);
1722 c3 = fText->char32At(p3);
1723
1724 if (p1 == p2) {
1725 // Still warming up the loop. (won't work with zero length strings, but we don't care)
1726 continue;
1727 }
1728
1729 if (p2 == fText->length()) {
1730 setAppliedRule(p2, "End of String");
1731 break;
1732 }
1733
1734 // No Extend or Format characters may appear between the CR and LF,
1735 // which requires the additional check for p2 immediately following p1.
1736 //
1737 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
1738 setAppliedRule(p2, "GB3 CR x LF");
1739 continue;
1740 }
1741
1742 if (fControlSet->contains(c1) ||
1743 c1 == 0x0D ||
1744 c1 == 0x0A) {
1745 setAppliedRule(p2, "GB4 ( Control | CR | LF ) <break>");
1746 break;
1747 }
1748
1749 if (fControlSet->contains(c2) ||
1750 c2 == 0x0D ||
1751 c2 == 0x0A) {
1752 setAppliedRule(p2, "GB5 <break> ( Control | CR | LF )");
1753 break;
1754 }
1755
1756 if (fLSet->contains(c1) &&
1757 (fLSet->contains(c2) ||
1758 fVSet->contains(c2) ||
1759 fLVSet->contains(c2) ||
1760 fLVTSet->contains(c2))) {
1761 setAppliedRule(p2, "GB6 L x ( L | V | LV | LVT )");
1762 continue;
1763 }
1764
1765 if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1766 (fVSet->contains(c2) || fTSet->contains(c2))) {
1767 setAppliedRule(p2, "GB7 ( LV | V ) x ( V | T )");
1768 continue;
1769 }
1770
1771 if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1772 fTSet->contains(c2)) {
1773 setAppliedRule(p2, "GB8 ( LVT | T) x T");
1774 continue;
1775 }
1776
1777 if (fExtendSet->contains(c2) || fZWJSet->contains(c2)) {
1778 if (!fExtendSet->contains(c1)) {
1779 cBase = c1;
1780 }
1781 setAppliedRule(p2, "GB9 x (Extend | ZWJ)");
1782 continue;
1783 }
1784
1785 if (fSpacingSet->contains(c2)) {
1786 setAppliedRule(p2, "GB9a x SpacingMark");
1787 continue;
1788 }
1789
1790 if (fPrependSet->contains(c1)) {
1791 setAppliedRule(p2, "GB9b Prepend x");
1792 continue;
1793 }
1794
1795 // Note: Viramas are also included in the ExtCccZwj class.
1796 if (fLinkingConsonantSet->contains(c2)) {
1797 int pi = p1;
1798 bool sawVirama = false;
1799 while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) {
1800 if (fViramaSet->contains(fText->char32At(pi))) {
1801 sawVirama = true;
1802 }
1803 pi = fText->moveIndex32(pi, -1);
1804 }
1805 if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) {
1806 setAppliedRule(p2, "GB9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant");
1807 continue;
1808 }
1809 }
1810
1811 if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
1812 setAppliedRule(p2, "GB11 Extended_Pictographic Extend * ZWJ x Extended_Pictographic");
1813 continue;
1814 }
1815
1816 // Note: The first if condition is a little tricky. We only need to force
1817 // a break if there are three or more contiguous RIs. If there are
1818 // only two, a break following will occur via other rules, and will include
1819 // any trailing extend characters, which is needed behavior.
1820 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
1821 && fRegionalIndicatorSet->contains(c2)) {
1822 setAppliedRule(p2, "GB12-13 Regional_Indicator x Regional_Indicator");
1823 break;
1824 }
1825 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
1826 setAppliedRule(p2, "GB12-13 Regional_Indicator x Regional_Indicator");
1827 continue;
1828 }
1829
1830 setAppliedRule(p2, "GB999 Any <break> Any");
1831 break;
1832 }
1833
1834 breakPos = p2;
1835 return breakPos;
1836 }
1837
1838
1839
charClasses()1840 UVector *RBBICharMonkey::charClasses() {
1841 return fSets;
1842 }
1843
~RBBICharMonkey()1844 RBBICharMonkey::~RBBICharMonkey() {
1845 delete fSets;
1846 delete fCRLFSet;
1847 delete fControlSet;
1848 delete fExtendSet;
1849 delete fRegionalIndicatorSet;
1850 delete fPrependSet;
1851 delete fSpacingSet;
1852 delete fLSet;
1853 delete fVSet;
1854 delete fTSet;
1855 delete fLVSet;
1856 delete fLVTSet;
1857 delete fHangulSet;
1858 delete fAnySet;
1859 delete fZWJSet;
1860 delete fExtendedPictSet;
1861 delete fViramaSet;
1862 delete fLinkingConsonantSet;
1863 delete fExtCccZwjSet;
1864 }
1865
1866 //------------------------------------------------------------------------------------------
1867 //
1868 // class RBBIWordMonkey Word Break specific implementation
1869 // of RBBIMonkeyKind.
1870 //
1871 //------------------------------------------------------------------------------------------
1872 class RBBIWordMonkey: public RBBIMonkeyKind {
1873 public:
1874 RBBIWordMonkey();
1875 virtual ~RBBIWordMonkey();
1876 virtual UVector *charClasses() override;
1877 virtual void setText(const UnicodeString &s) override;
1878 virtual int32_t next(int32_t i) override;
1879 private:
1880 UVector *fSets;
1881
1882 UnicodeSet *fCRSet;
1883 UnicodeSet *fLFSet;
1884 UnicodeSet *fNewlineSet;
1885 UnicodeSet *fRegionalIndicatorSet;
1886 UnicodeSet *fKatakanaSet;
1887 UnicodeSet *fHebrew_LetterSet;
1888 UnicodeSet *fALetterSet;
1889 UnicodeSet *fSingle_QuoteSet;
1890 UnicodeSet *fDouble_QuoteSet;
1891 UnicodeSet *fMidNumLetSet;
1892 UnicodeSet *fMidLetterSet;
1893 UnicodeSet *fMidNumSet;
1894 UnicodeSet *fNumericSet;
1895 UnicodeSet *fFormatSet;
1896 UnicodeSet *fOtherSet = nullptr;
1897 UnicodeSet *fExtendSet;
1898 UnicodeSet *fExtendNumLetSet;
1899 UnicodeSet *fWSegSpaceSet;
1900 UnicodeSet *fDictionarySet = nullptr;
1901 UnicodeSet *fZWJSet;
1902 UnicodeSet *fExtendedPictSet;
1903
1904 const UnicodeString *fText;
1905 };
1906
1907
RBBIWordMonkey()1908 RBBIWordMonkey::RBBIWordMonkey()
1909 {
1910 UErrorCode status = U_ZERO_ERROR;
1911
1912 fSets = new UVector(status);
1913
1914 fCRSet = new UnicodeSet(u"[\\p{Word_Break = CR}]", status);
1915 fLFSet = new UnicodeSet(u"[\\p{Word_Break = LF}]", status);
1916 fNewlineSet = new UnicodeSet(u"[\\p{Word_Break = Newline}]", status);
1917 fKatakanaSet = new UnicodeSet(u"[\\p{Word_Break = Katakana}]", status);
1918 fRegionalIndicatorSet = new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
1919 fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
1920 fALetterSet = new UnicodeSet(u"[\\p{Word_Break = ALetter} @]", status);
1921 fSingle_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]", status);
1922 fDouble_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]", status);
1923 fMidNumLetSet = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]", status);
1924 fMidLetterSet = new UnicodeSet(u"[\\p{Word_Break = MidLetter} - [\\: \\uFE55 \\uFF1A]]", status);
1925 fMidNumSet = new UnicodeSet(u"[\\p{Word_Break = MidNum}]", status);
1926 fNumericSet = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
1927 fFormatSet = new UnicodeSet(u"[\\p{Word_Break = Format}]", status);
1928 fExtendNumLetSet = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
1929 // There are some sc=Hani characters with WB=Extend.
1930 // The break rules need to pick one or the other because
1931 // Extend overlapping with something else is messy.
1932 // For Unicode 13, we chose to keep U+16FF0 & U+16FF1
1933 // in $Han (for $dictionary) and out of $Extend.
1934 fExtendSet = new UnicodeSet(u"[\\p{Word_Break = Extend}-[:Hani:]]", status);
1935 fWSegSpaceSet = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]", status);
1936
1937 fZWJSet = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]", status);
1938 fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1939 if(U_FAILURE(status)) {
1940 IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
1941 deferredStatus = status;
1942 return;
1943 }
1944
1945 fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
1946 fDictionarySet->addAll(*fKatakanaSet);
1947 fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
1948
1949 fALetterSet->removeAll(*fDictionarySet);
1950
1951 fOtherSet = new UnicodeSet();
1952 if(U_FAILURE(status)) {
1953 IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
1954 deferredStatus = status;
1955 return;
1956 }
1957
1958 fOtherSet->complement();
1959 fOtherSet->removeAll(*fCRSet);
1960 fOtherSet->removeAll(*fLFSet);
1961 fOtherSet->removeAll(*fNewlineSet);
1962 fOtherSet->removeAll(*fKatakanaSet);
1963 fOtherSet->removeAll(*fHebrew_LetterSet);
1964 fOtherSet->removeAll(*fALetterSet);
1965 fOtherSet->removeAll(*fSingle_QuoteSet);
1966 fOtherSet->removeAll(*fDouble_QuoteSet);
1967 fOtherSet->removeAll(*fMidLetterSet);
1968 fOtherSet->removeAll(*fMidNumSet);
1969 fOtherSet->removeAll(*fNumericSet);
1970 fOtherSet->removeAll(*fExtendNumLetSet);
1971 fOtherSet->removeAll(*fWSegSpaceSet);
1972 fOtherSet->removeAll(*fFormatSet);
1973 fOtherSet->removeAll(*fExtendSet);
1974 fOtherSet->removeAll(*fRegionalIndicatorSet);
1975 fOtherSet->removeAll(*fZWJSet);
1976 fOtherSet->removeAll(*fExtendedPictSet);
1977
1978 // Inhibit dictionary characters from being tested at all.
1979 fOtherSet->removeAll(*fDictionarySet);
1980
1981 // Add classes and their names
1982 fSets->addElement(fCRSet, status); classNames.push_back("CR");
1983 fSets->addElement(fLFSet, status); classNames.push_back("LF");
1984 fSets->addElement(fNewlineSet, status); classNames.push_back("Newline");
1985 fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
1986 fSets->addElement(fHebrew_LetterSet, status); classNames.push_back("Hebrew");
1987 fSets->addElement(fALetterSet, status); classNames.push_back("ALetter");
1988 fSets->addElement(fSingle_QuoteSet, status); classNames.push_back("Single Quote");
1989 fSets->addElement(fDouble_QuoteSet, status); classNames.push_back("Double Quote");
1990 // Omit Katakana from fSets, which omits Katakana characters
1991 // from the test data. They are all in the dictionary set,
1992 // which this (old, to be retired) monkey test cannot handle.
1993 //fSets->addElement(fKatakanaSet, status);
1994
1995 fSets->addElement(fMidLetterSet, status); classNames.push_back("MidLetter");
1996 fSets->addElement(fMidNumLetSet, status); classNames.push_back("MidNumLet");
1997 fSets->addElement(fMidNumSet, status); classNames.push_back("MidNum");
1998 fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
1999 fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2000 fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2001 fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2002 fSets->addElement(fExtendNumLetSet, status); classNames.push_back("ExtendNumLet");
2003 fSets->addElement(fWSegSpaceSet, status); classNames.push_back("WSegSpace");
2004
2005 fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
2006 fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
2007
2008 if (U_FAILURE(status)) {
2009 deferredStatus = status;
2010 }
2011 }
2012
setText(const UnicodeString & s)2013 void RBBIWordMonkey::setText(const UnicodeString &s) {
2014 fText = &s;
2015 prepareAppliedRules(s.length());
2016 }
2017
2018
next(int32_t prevPos)2019 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2020 int p0, p1, p2, p3; // Indices of the significant code points around the
2021 // break position being tested. The candidate break
2022 // location is before p2.
2023
2024 int breakPos = -1;
2025
2026 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2027
2028 if (U_FAILURE(deferredStatus)) {
2029 return -1;
2030 }
2031
2032 // Prev break at end of string. return DONE.
2033 if (prevPos >= fText->length()) {
2034 return -1;
2035 }
2036 p0 = p1 = p2 = p3 = prevPos;
2037 c3 = fText->char32At(prevPos);
2038 c0 = c1 = c2 = 0;
2039 (void)p0; // Suppress set but not used warning.
2040
2041 // Loop runs once per "significant" character position in the input text.
2042 for (;;) {
2043 // Move all of the positions forward in the input string.
2044 p0 = p1; c0 = c1;
2045 p1 = p2; c1 = c2;
2046 p2 = p3; c2 = c3;
2047
2048 // Advance p3 by X(Extend | Format)* Rule 4
2049 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2050 do {
2051 p3 = fText->moveIndex32(p3, 1);
2052 c3 = fText->char32At(p3);
2053 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2054 break;
2055 }
2056 }
2057 while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
2058
2059
2060 if (p1 == p2) {
2061 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2062 continue;
2063 }
2064
2065 if (p2 == fText->length()) {
2066 // Reached end of string. Always a break position.
2067 break;
2068 }
2069
2070 // No Extend or Format characters may appear between the CR and LF,
2071 // which requires the additional check for p2 immediately following p1.
2072 //
2073 if (c1==0x0D && c2==0x0A) {
2074 setAppliedRule(p2, "WB3 CR x LF");
2075 continue;
2076 }
2077
2078 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2079 setAppliedRule(p2, "WB3a Break before and after newlines (including CR and LF)");
2080 break;
2081 }
2082 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2083 setAppliedRule(p2, "WB3a Break before and after newlines (including CR and LF)");
2084 break;
2085 }
2086
2087 // Not ignoring extend chars, so peek into input text to
2088 // get the potential ZWJ, the character immediately preceding c2.
2089 // Sloppy UChar32 indexing: p2-1 may reference trail half
2090 // but char32At will get the full code point.
2091 if (fZWJSet->contains(fText->char32At(p2 - 1)) && fExtendedPictSet->contains(c2)){
2092 setAppliedRule(p2, "WB3c ZWJ x Extended_Pictographic");
2093 continue;
2094 }
2095
2096 if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
2097 setAppliedRule(p2, "WB3d Keep horizontal whitespace together.");
2098 continue;
2099 }
2100
2101 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2102 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2103 setAppliedRule(p2, "WB4 (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)");
2104 continue;
2105 }
2106
2107 if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2108 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2109 (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2110 setAppliedRule(p2,
2111 "WB6 (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter _Letter)");
2112 continue;
2113 }
2114
2115 if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2116 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2117 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2118 setAppliedRule(p2,
2119 "WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)");
2120 continue;
2121 }
2122
2123 if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2124 setAppliedRule(p2, "WB7a Hebrew_Letter x Single_Quote");
2125 continue;
2126 }
2127
2128 if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2129 setAppliedRule(p2, "WB7b Hebrew_Letter x Double_Quote Hebrew_Letter");
2130 continue;
2131 }
2132
2133 if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2134 setAppliedRule(p2, "WB7c Hebrew_Letter Double_Quote x Hebrew_Letter");
2135 continue;
2136 }
2137
2138 if (fNumericSet->contains(c1) &&
2139 fNumericSet->contains(c2)) {
2140 setAppliedRule(p2, "WB8 Numeric x Numeric");
2141 continue;
2142 }
2143
2144 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2145 fNumericSet->contains(c2)) {
2146 setAppliedRule(p2, "WB9 (ALetter | Hebrew_Letter) x Numeric");
2147 continue;
2148 }
2149
2150 if (fNumericSet->contains(c1) &&
2151 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2152 setAppliedRule(p2, "WB10 Numeric x (ALetter | Hebrew_Letter)");
2153 continue;
2154 }
2155
2156 if (fNumericSet->contains(c0) &&
2157 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2158 fNumericSet->contains(c2)) {
2159 setAppliedRule(p2, "WB11 Numeric (MidNum | MidNumLet | Single_Quote) x Numeric");
2160 continue;
2161 }
2162
2163 if (fNumericSet->contains(c1) &&
2164 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2165 fNumericSet->contains(c3)) {
2166 setAppliedRule(p2, "WB12 Numeric x (MidNum | MidNumLet | SingleQuote) Numeric");
2167 continue;
2168 }
2169
2170 // Note: matches UAX 29 rules, but doesn't come into play for ICU because
2171 // all Katakana are handled by the dictionary breaker.
2172 if (fKatakanaSet->contains(c1) &&
2173 fKatakanaSet->contains(c2)) {
2174 setAppliedRule(p2, "WB13 Katakana x Katakana");
2175 continue;
2176 }
2177
2178 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2179 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2180 fExtendNumLetSet->contains(c2)) {
2181 setAppliedRule(p2,
2182 "WB13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet");
2183 continue;
2184 }
2185
2186 if (fExtendNumLetSet->contains(c1) &&
2187 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2188 fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) {
2189 setAppliedRule(p2, "WB13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)");
2190 continue;
2191 }
2192
2193 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2194 setAppliedRule(p2, "WB15 - WB17 Group pairs of Regional Indicators.");
2195 break;
2196 }
2197 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2198 setAppliedRule(p2, "WB15 - WB17 Group pairs of Regional Indicators.");
2199 continue;
2200 }
2201
2202 setAppliedRule(p2, "WB999");
2203 break;
2204 }
2205
2206 breakPos = p2;
2207 return breakPos;
2208 }
2209
2210
charClasses()2211 UVector *RBBIWordMonkey::charClasses() {
2212 return fSets;
2213 }
2214
~RBBIWordMonkey()2215 RBBIWordMonkey::~RBBIWordMonkey() {
2216 delete fSets;
2217 delete fCRSet;
2218 delete fLFSet;
2219 delete fNewlineSet;
2220 delete fKatakanaSet;
2221 delete fHebrew_LetterSet;
2222 delete fALetterSet;
2223 delete fSingle_QuoteSet;
2224 delete fDouble_QuoteSet;
2225 delete fMidNumLetSet;
2226 delete fMidLetterSet;
2227 delete fMidNumSet;
2228 delete fNumericSet;
2229 delete fFormatSet;
2230 delete fExtendSet;
2231 delete fExtendNumLetSet;
2232 delete fWSegSpaceSet;
2233 delete fRegionalIndicatorSet;
2234 delete fDictionarySet;
2235 delete fOtherSet;
2236 delete fZWJSet;
2237 delete fExtendedPictSet;
2238 }
2239
2240
2241
2242
2243 //------------------------------------------------------------------------------------------
2244 //
2245 // class RBBISentMonkey Sentence Break specific implementation
2246 // of RBBIMonkeyKind.
2247 //
2248 //------------------------------------------------------------------------------------------
2249 class RBBISentMonkey: public RBBIMonkeyKind {
2250 public:
2251 RBBISentMonkey();
2252 virtual ~RBBISentMonkey();
2253 virtual UVector *charClasses() override;
2254 virtual void setText(const UnicodeString &s) override;
2255 virtual int32_t next(int32_t i) override;
2256 private:
2257 int moveBack(int posFrom);
2258 int moveForward(int posFrom);
2259 UChar32 cAt(int pos);
2260
2261 UVector *fSets;
2262
2263 UnicodeSet *fSepSet;
2264 UnicodeSet *fFormatSet;
2265 UnicodeSet *fSpSet;
2266 UnicodeSet *fLowerSet;
2267 UnicodeSet *fUpperSet;
2268 UnicodeSet *fOLetterSet;
2269 UnicodeSet *fNumericSet;
2270 UnicodeSet *fATermSet;
2271 UnicodeSet *fSContinueSet;
2272 UnicodeSet *fSTermSet;
2273 UnicodeSet *fCloseSet;
2274 UnicodeSet *fOtherSet;
2275 UnicodeSet *fExtendSet;
2276
2277 const UnicodeString *fText;
2278 };
2279
RBBISentMonkey()2280 RBBISentMonkey::RBBISentMonkey()
2281 {
2282 UErrorCode status = U_ZERO_ERROR;
2283
2284 fSets = new UVector(status);
2285
2286 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2287 // set and made into character classes of their own. For the monkey impl,
2288 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2289 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status);
2290 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status);
2291 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status);
2292 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status);
2293 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status);
2294 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status);
2295 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status);
2296 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status);
2297 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2298 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status);
2299 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status);
2300 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status);
2301 fOtherSet = new UnicodeSet();
2302
2303 if(U_FAILURE(status)) {
2304 deferredStatus = status;
2305 return;
2306 }
2307
2308 fOtherSet->complement();
2309 fOtherSet->removeAll(*fSepSet);
2310 fOtherSet->removeAll(*fFormatSet);
2311 fOtherSet->removeAll(*fSpSet);
2312 fOtherSet->removeAll(*fLowerSet);
2313 fOtherSet->removeAll(*fUpperSet);
2314 fOtherSet->removeAll(*fOLetterSet);
2315 fOtherSet->removeAll(*fNumericSet);
2316 fOtherSet->removeAll(*fATermSet);
2317 fOtherSet->removeAll(*fSContinueSet);
2318 fOtherSet->removeAll(*fSTermSet);
2319 fOtherSet->removeAll(*fCloseSet);
2320 fOtherSet->removeAll(*fExtendSet);
2321
2322 fSets->addElement(fSepSet, status); classNames.push_back("Sep");
2323 fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2324 fSets->addElement(fSpSet, status); classNames.push_back("Sp");
2325 fSets->addElement(fLowerSet, status); classNames.push_back("Lower");
2326 fSets->addElement(fUpperSet, status); classNames.push_back("Upper");
2327 fSets->addElement(fOLetterSet, status); classNames.push_back("OLetter");
2328 fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
2329 fSets->addElement(fATermSet, status); classNames.push_back("ATerm");
2330 fSets->addElement(fSContinueSet, status); classNames.push_back("SContinue");
2331 fSets->addElement(fSTermSet, status); classNames.push_back("STerm");
2332 fSets->addElement(fCloseSet, status); classNames.push_back("Close");
2333 fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2334 fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2335
2336 if (U_FAILURE(status)) {
2337 deferredStatus = status;
2338 }
2339 }
2340
2341
2342
setText(const UnicodeString & s)2343 void RBBISentMonkey::setText(const UnicodeString &s) {
2344 fText = &s;
2345 prepareAppliedRules(s.length());
2346 }
2347
charClasses()2348 UVector *RBBISentMonkey::charClasses() {
2349 return fSets;
2350 }
2351
2352 // moveBack() Find the "significant" code point preceding the index i.
2353 // Skips over ($Extend | $Format)* .
2354 //
moveBack(int i)2355 int RBBISentMonkey::moveBack(int i) {
2356 if (i <= 0) {
2357 return -1;
2358 }
2359 UChar32 c;
2360 int32_t j = i;
2361 do {
2362 j = fText->moveIndex32(j, -1);
2363 c = fText->char32At(j);
2364 }
2365 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2366 return j;
2367
2368 }
2369
2370
moveForward(int i)2371 int RBBISentMonkey::moveForward(int i) {
2372 if (i>=fText->length()) {
2373 return fText->length();
2374 }
2375 UChar32 c;
2376 int32_t j = i;
2377 do {
2378 j = fText->moveIndex32(j, 1);
2379 c = cAt(j);
2380 }
2381 while (fFormatSet->contains(c) || fExtendSet->contains(c));
2382 return j;
2383 }
2384
cAt(int pos)2385 UChar32 RBBISentMonkey::cAt(int pos) {
2386 if (pos<0 || pos>=fText->length()) {
2387 return -1;
2388 } else {
2389 return fText->char32At(pos);
2390 }
2391 }
2392
next(int32_t prevPos)2393 int32_t RBBISentMonkey::next(int32_t prevPos) {
2394 int p0, p1, p2, p3; // Indices of the significant code points around the
2395 // break position being tested. The candidate break
2396 // location is before p2.
2397
2398 int breakPos = -1;
2399
2400 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2401 UChar32 c;
2402
2403 if (U_FAILURE(deferredStatus)) {
2404 return -1;
2405 }
2406
2407 // Prev break at end of string. return DONE.
2408 if (prevPos >= fText->length()) {
2409 return -1;
2410 }
2411 p0 = p1 = p2 = p3 = prevPos;
2412 c3 = fText->char32At(prevPos);
2413 c0 = c1 = c2 = 0;
2414 (void)p0; // Suppress set but not used warning.
2415
2416 // Loop runs once per "significant" character position in the input text.
2417 for (;;) {
2418 // Move all of the positions forward in the input string.
2419 p0 = p1; c0 = c1;
2420 p1 = p2; c1 = c2;
2421 p2 = p3; c2 = c3;
2422
2423 // Advance p3 by X(Extend | Format)* Rule 4
2424 p3 = moveForward(p3);
2425 c3 = cAt(p3);
2426
2427 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2428 setAppliedRule(p2, "SB3 CR x LF");
2429 continue;
2430 }
2431
2432 if (fSepSet->contains(c1)) {
2433 p2 = p1+1; // Separators don't combine with Extend or Format.
2434
2435 setAppliedRule(p2, "SB4 Sep <break>");
2436 break;
2437 }
2438
2439 if (p2 >= fText->length()) {
2440 // Reached end of string. Always a break position.
2441 setAppliedRule(p2, "SB4 Sep <break>");
2442 break;
2443 }
2444
2445 if (p2 == prevPos) {
2446 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2447 setAppliedRule(p2, "SB4 Sep <break>");
2448 continue;
2449 }
2450
2451 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) {
2452 setAppliedRule(p2, "SB6 ATerm x Numeric");
2453 continue;
2454 }
2455
2456 if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2457 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2458 setAppliedRule(p2, "SB7 (Upper | Lower) ATerm x Uppper");
2459 continue;
2460 }
2461
2462 // Note: STerm | ATerm are added to the negated part of the expression by a
2463 // note to the Unicode 5.0 documents.
2464 int p8 = p1;
2465 while (fSpSet->contains(cAt(p8))) {
2466 p8 = moveBack(p8);
2467 }
2468 while (fCloseSet->contains(cAt(p8))) {
2469 p8 = moveBack(p8);
2470 }
2471 if (fATermSet->contains(cAt(p8))) {
2472 p8=p2;
2473 for (;;) {
2474 c = cAt(p8);
2475 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2476 fLowerSet->contains(c) || fSepSet->contains(c) ||
2477 fATermSet->contains(c) || fSTermSet->contains(c)) {
2478
2479 setAppliedRule(p2,
2480 "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2481 break;
2482 }
2483 p8 = moveForward(p8);
2484 }
2485 if (fLowerSet->contains(cAt(p8))) {
2486
2487 setAppliedRule(p2,
2488 "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2489 continue;
2490 }
2491 }
2492
2493 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2494 p8 = p1;
2495 while (fSpSet->contains(cAt(p8))) {
2496 p8 = moveBack(p8);
2497 }
2498 while (fCloseSet->contains(cAt(p8))) {
2499 p8 = moveBack(p8);
2500 }
2501 c = cAt(p8);
2502 if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2503 setAppliedRule(p2, "SB8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm)");
2504 continue;
2505 }
2506 }
2507
2508 int p9 = p1;
2509 while (fCloseSet->contains(cAt(p9))) {
2510 p9 = moveBack(p9);
2511 }
2512 c = cAt(p9);
2513 if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2514 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2515
2516 setAppliedRule(p2, "SB9 (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)");
2517 continue;
2518 }
2519 }
2520
2521 int p10 = p1;
2522 while (fSpSet->contains(cAt(p10))) {
2523 p10 = moveBack(p10);
2524 }
2525 while (fCloseSet->contains(cAt(p10))) {
2526 p10 = moveBack(p10);
2527 }
2528 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2529 if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2530 setAppliedRule(p2, "SB10 (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)");
2531 continue;
2532 }
2533 }
2534
2535 int p11 = p1;
2536 if (fSepSet->contains(cAt(p11))) {
2537 p11 = moveBack(p11);
2538 }
2539 while (fSpSet->contains(cAt(p11))) {
2540 p11 = moveBack(p11);
2541 }
2542 while (fCloseSet->contains(cAt(p11))) {
2543 p11 = moveBack(p11);
2544 }
2545 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2546 setAppliedRule(p2, "SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>");
2547 break;
2548 }
2549
2550 setAppliedRule(p2, "SB12 Any x Any");
2551 continue;
2552 }
2553
2554 breakPos = p2;
2555 return breakPos;
2556 }
2557
~RBBISentMonkey()2558 RBBISentMonkey::~RBBISentMonkey() {
2559 delete fSets;
2560 delete fSepSet;
2561 delete fFormatSet;
2562 delete fSpSet;
2563 delete fLowerSet;
2564 delete fUpperSet;
2565 delete fOLetterSet;
2566 delete fNumericSet;
2567 delete fATermSet;
2568 delete fSContinueSet;
2569 delete fSTermSet;
2570 delete fCloseSet;
2571 delete fOtherSet;
2572 delete fExtendSet;
2573 }
2574
2575
2576
2577 //-------------------------------------------------------------------------------------------
2578 //
2579 // RBBILineMonkey
2580 //
2581 //-------------------------------------------------------------------------------------------
2582
2583 class RBBILineMonkey: public RBBIMonkeyKind {
2584 public:
2585 RBBILineMonkey();
2586 virtual ~RBBILineMonkey();
2587 virtual UVector *charClasses() override;
2588 virtual void setText(const UnicodeString &s) override;
2589 virtual int32_t next(int32_t i) override;
2590 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2591 private:
2592 UVector *fSets;
2593
2594 UnicodeSet *fBK;
2595 UnicodeSet *fCR;
2596 UnicodeSet *fLF;
2597 UnicodeSet *fCM;
2598 UnicodeSet *fNL;
2599 UnicodeSet *fSG;
2600 UnicodeSet *fWJ;
2601 UnicodeSet *fZW;
2602 UnicodeSet *fGL;
2603 UnicodeSet *fCB;
2604 UnicodeSet *fSP;
2605 UnicodeSet *fB2;
2606 UnicodeSet *fBA;
2607 UnicodeSet *fBB;
2608 UnicodeSet *fHH;
2609 UnicodeSet *fHY;
2610 UnicodeSet *fH2;
2611 UnicodeSet *fH3;
2612 UnicodeSet *fCL;
2613 UnicodeSet *fCP;
2614 UnicodeSet *fEX;
2615 UnicodeSet *fIN;
2616 UnicodeSet *fJL;
2617 UnicodeSet *fJV;
2618 UnicodeSet *fJT;
2619 UnicodeSet *fNS;
2620 UnicodeSet *fOP;
2621 UnicodeSet *fQU;
2622 UnicodeSet *fIS;
2623 UnicodeSet *fNU;
2624 UnicodeSet *fPO;
2625 UnicodeSet *fPR;
2626 UnicodeSet *fSY;
2627 UnicodeSet *fAI;
2628 UnicodeSet *fAL;
2629 UnicodeSet *fCJ;
2630 UnicodeSet *fHL;
2631 UnicodeSet *fID;
2632 UnicodeSet *fRI;
2633 UnicodeSet *fXX;
2634 UnicodeSet *fEB;
2635 UnicodeSet *fEM;
2636 UnicodeSet *fZWJ;
2637 UnicodeSet *fOP30;
2638 UnicodeSet *fCP30;
2639 UnicodeSet *fExtPictUnassigned;
2640
2641 BreakIterator *fCharBI;
2642 const UnicodeString *fText;
2643 RegexMatcher *fNumberMatcher;
2644 };
2645
RBBILineMonkey()2646 RBBILineMonkey::RBBILineMonkey() :
2647 RBBIMonkeyKind(),
2648 fSets(NULL),
2649
2650 fCharBI(NULL),
2651 fText(NULL),
2652 fNumberMatcher(NULL)
2653
2654 {
2655 if (U_FAILURE(deferredStatus)) {
2656 return;
2657 }
2658
2659 UErrorCode status = U_ZERO_ERROR;
2660
2661 fSets = new UVector(status);
2662
2663 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2664 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2665 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2666 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2667 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2668 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2669 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2670 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2671 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2672 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2673 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2674 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2675 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2676 fHH = new UnicodeSet();
2677 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2678 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2679 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2680 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2681 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2682 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2683 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2684 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2685 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2686 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2687 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2688 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2689 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2690 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2691 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2692 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2693 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2694 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2695 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2696 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2697 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2698 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2699 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2700 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2701 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2702 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2703 fEB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status);
2704 fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
2705 fZWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
2706 fOP30 = new UnicodeSet(u"[\\p{Line_break=OP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2707 fCP30 = new UnicodeSet(u"[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2708 fExtPictUnassigned = new UnicodeSet(u"[\\p{Extended_Pictographic}&\\p{Cn}]", status);
2709
2710 if (U_FAILURE(status)) {
2711 deferredStatus = status;
2712 return;
2713 }
2714
2715 fAL->addAll(*fXX); // Default behavior for XX is identical to AL
2716 fAL->addAll(*fAI); // Default behavior for AI is identical to AL
2717 fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
2718
2719 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
2720 fCM->addAll(*fZWJ); // ZWJ behaves as a CM.
2721
2722 fHH->add(u'\u2010'); // Hyphen, '‐'
2723
2724 // Sets and names.
2725 fSets->addElement(fBK, status); classNames.push_back("fBK");
2726 fSets->addElement(fCR, status); classNames.push_back("fCR");
2727 fSets->addElement(fLF, status); classNames.push_back("fLF");
2728 fSets->addElement(fCM, status); classNames.push_back("fCM");
2729 fSets->addElement(fNL, status); classNames.push_back("fNL");
2730 fSets->addElement(fWJ, status); classNames.push_back("fWJ");
2731 fSets->addElement(fZW, status); classNames.push_back("fZW");
2732 fSets->addElement(fGL, status); classNames.push_back("fGL");
2733 fSets->addElement(fCB, status); classNames.push_back("fCB");
2734 fSets->addElement(fSP, status); classNames.push_back("fSP");
2735 fSets->addElement(fB2, status); classNames.push_back("fB2");
2736 fSets->addElement(fBA, status); classNames.push_back("fBA");
2737 fSets->addElement(fBB, status); classNames.push_back("fBB");
2738 fSets->addElement(fHY, status); classNames.push_back("fHY");
2739 fSets->addElement(fH2, status); classNames.push_back("fH2");
2740 fSets->addElement(fH3, status); classNames.push_back("fH3");
2741 fSets->addElement(fCL, status); classNames.push_back("fCL");
2742 fSets->addElement(fCP, status); classNames.push_back("fCP");
2743 fSets->addElement(fEX, status); classNames.push_back("fEX");
2744 fSets->addElement(fIN, status); classNames.push_back("fIN");
2745 fSets->addElement(fJL, status); classNames.push_back("fJL");
2746 fSets->addElement(fJT, status); classNames.push_back("fJT");
2747 fSets->addElement(fJV, status); classNames.push_back("fJV");
2748 fSets->addElement(fNS, status); classNames.push_back("fNS");
2749 fSets->addElement(fOP, status); classNames.push_back("fOP");
2750 fSets->addElement(fQU, status); classNames.push_back("fQU");
2751 fSets->addElement(fIS, status); classNames.push_back("fIS");
2752 fSets->addElement(fNU, status); classNames.push_back("fNU");
2753 fSets->addElement(fPO, status); classNames.push_back("fPO");
2754 fSets->addElement(fPR, status); classNames.push_back("fPR");
2755 fSets->addElement(fSY, status); classNames.push_back("fSY");
2756 fSets->addElement(fAI, status); classNames.push_back("fAI");
2757 fSets->addElement(fAL, status); classNames.push_back("fAL");
2758 fSets->addElement(fHL, status); classNames.push_back("fHL");
2759 fSets->addElement(fID, status); classNames.push_back("fID");
2760 fSets->addElement(fRI, status); classNames.push_back("fRI");
2761 fSets->addElement(fSG, status); classNames.push_back("fSG");
2762 fSets->addElement(fEB, status); classNames.push_back("fEB");
2763 fSets->addElement(fEM, status); classNames.push_back("fEM");
2764 fSets->addElement(fZWJ, status); classNames.push_back("fZWJ");
2765 // TODO: fOP30 & fCP30 overlap with plain fOP. Probably OK, but fOP/CP chars will be over-represented.
2766 fSets->addElement(fOP30, status); classNames.push_back("fOP30");
2767 fSets->addElement(fCP30, status); classNames.push_back("fCP30");
2768 fSets->addElement(fExtPictUnassigned, status); classNames.push_back("fExtPictUnassigned");
2769
2770 const char *rules =
2771 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
2772 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
2773 "((\\p{Line_Break=IS})(\\p{Line_Break=CM}|\\u200d)*)?"
2774 "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
2775 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
2776 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
2777 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
2778
2779 fNumberMatcher = new RegexMatcher(
2780 UnicodeString(rules, -1, US_INV), 0, status);
2781
2782 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2783
2784 if (U_FAILURE(status)) {
2785 deferredStatus = status;
2786 }
2787
2788 }
2789
2790
setText(const UnicodeString & s)2791 void RBBILineMonkey::setText(const UnicodeString &s) {
2792 fText = &s;
2793 fCharBI->setText(s);
2794 prepareAppliedRules(s.length());
2795 fNumberMatcher->reset(s);
2796 }
2797
2798 //
2799 // rule9Adjust
2800 // Line Break TR rules 9 and 10 implementation.
2801 // This deals with combining marks and other sequences that
2802 // that must be treated as if they were something other than what they actually are.
2803 //
2804 // This is factored out into a separate function because it must be applied twice for
2805 // each potential break, once to the chars before the position being checked, then
2806 // again to the text following the possible break.
2807 //
rule9Adjust(int32_t pos,UChar32 * posChar,int32_t * nextPos,UChar32 * nextChar)2808 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2809 if (pos == -1) {
2810 // Invalid initial position. Happens during the warmup iteration of the
2811 // main loop in next().
2812 return;
2813 }
2814
2815 int32_t nPos = *nextPos;
2816
2817 // LB 9 Keep combining sequences together.
2818 // advance over any CM class chars. Note that Line Break CM is different
2819 // from the normal Grapheme Extend property.
2820 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2821 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2822 for (;;) {
2823 *nextChar = fText->char32At(nPos);
2824 if (!fCM->contains(*nextChar)) {
2825 break;
2826 }
2827 nPos = fText->moveIndex32(nPos, 1);
2828 }
2829 }
2830
2831
2832 // LB 9 Treat X CM* as if it were x.
2833 // No explicit action required.
2834
2835 // LB 10 Treat any remaining combining mark as AL
2836 if (fCM->contains(*posChar)) {
2837 *posChar = u'A';
2838 }
2839
2840 // Push the updated nextPos and nextChar back to our caller.
2841 // This only makes a difference if posChar got bigger by consuming a
2842 // combining sequence.
2843 *nextPos = nPos;
2844 *nextChar = fText->char32At(nPos);
2845 }
2846
2847
2848
next(int32_t startPos)2849 int32_t RBBILineMonkey::next(int32_t startPos) {
2850 UErrorCode status = U_ZERO_ERROR;
2851 int32_t pos; // Index of the char following a potential break position
2852 UChar32 thisChar; // Character at above position "pos"
2853
2854 int32_t prevPos; // Index of the char preceding a potential break position
2855 UChar32 prevChar; // Character at above position. Note that prevChar
2856 // and thisChar may not be adjacent because combining
2857 // characters between them will be ignored.
2858
2859 int32_t prevPosX2; // Second previous character. Wider context for LB21a.
2860 UChar32 prevCharX2;
2861
2862 int32_t nextPos; // Index of the next character following pos.
2863 // Usually skips over combining marks.
2864 int32_t nextCPPos; // Index of the code point following "pos."
2865 // May point to a combining mark.
2866 int32_t tPos; // temp value.
2867 UChar32 c;
2868
2869 if (U_FAILURE(deferredStatus)) {
2870 return -1;
2871 }
2872
2873 if (startPos >= fText->length()) {
2874 return -1;
2875 }
2876
2877
2878 // Initial values for loop. Loop will run the first time without finding breaks,
2879 // while the invalid values shift out and the "this" and
2880 // "prev" positions are filled in with good values.
2881 pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration.
2882 thisChar = prevChar = prevCharX2 = 0;
2883 nextPos = nextCPPos = startPos;
2884
2885
2886 // Loop runs once per position in the test text, until a break position
2887 // is found.
2888 for (;;) {
2889 prevPosX2 = prevPos;
2890 prevCharX2 = prevChar;
2891
2892 prevPos = pos;
2893 prevChar = thisChar;
2894
2895 pos = nextPos;
2896 thisChar = fText->char32At(pos);
2897
2898 nextCPPos = fText->moveIndex32(pos, 1);
2899 nextPos = nextCPPos;
2900
2901
2902 if (pos >= fText->length()) {
2903 setAppliedRule(pos, "LB2 - Break at end of text.");
2904 break;
2905 }
2906
2907
2908 // We do this one out-of-order because the adjustment does not change anything
2909 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
2910 // be applied.
2911 rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
2912 nextCPPos = nextPos = fText->moveIndex32(pos, 1);
2913 c = fText->char32At(nextPos);
2914 rule9Adjust(pos, &thisChar, &nextPos, &c);
2915
2916 // If the loop is still warming up - if we haven't shifted the initial
2917 // -1 positions out of prevPos yet - loop back to advance the
2918 // position in the input without any further looking for breaks.
2919 if (prevPos == -1) {
2920 setAppliedRule(pos, "LB 9 - adjust for combining sequences.");
2921 continue;
2922 }
2923
2924
2925 if (fBK->contains(prevChar)) {
2926 setAppliedRule(pos, "LB 4 Always break after hard line breaks");
2927 break;
2928 }
2929
2930
2931 if (prevChar == 0x0d && thisChar == 0x0a) {
2932 setAppliedRule(pos, "LB 5 Break after CR, LF, NL, but not inside CR LF");
2933 continue;
2934 }
2935 if (prevChar == 0x0d ||
2936 prevChar == 0x0a ||
2937 prevChar == 0x85) {
2938 setAppliedRule(pos, "LB 5 Break after CR, LF, NL, but not inside CR LF");
2939 break;
2940 }
2941
2942
2943 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
2944 fBK->contains(thisChar)) {
2945 setAppliedRule(pos, "LB 6 Don't break before hard line breaks");
2946 continue;
2947 }
2948
2949
2950 if (fSP->contains(thisChar)) {
2951 setAppliedRule(pos, "LB 7 Don't break before spaces or zero-width space.");
2952 continue;
2953 }
2954
2955 // !!! ??? Is this the right text for the applied rule?
2956 if (fZW->contains(thisChar)) {
2957 setAppliedRule(pos, "LB 7 Don't break before spaces or zero-width space.");
2958 continue;
2959 }
2960
2961
2962 // ZW SP* ÷
2963 // Scan backwards from prevChar for SP* ZW
2964 tPos = prevPos;
2965 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
2966 tPos = fText->moveIndex32(tPos, -1);
2967 }
2968 if (fZW->contains(fText->char32At(tPos))) {
2969 setAppliedRule(pos, "LB 8 Break after zero width space");
2970 break;
2971 }
2972
2973
2974 // Move this test up, before LB8a, because numbers can match a longer sequence that would
2975 // also match 8a. e.g. NU ZWJ IS PO (ZWJ acts like CM)
2976 if (fNumberMatcher->lookingAt(prevPos, status)) {
2977 if (U_FAILURE(status)) {
2978 setAppliedRule(pos, "LB 25 Numbers");
2979 break;
2980 }
2981 // Matched a number. But could have been just a single digit, which would
2982 // not represent a "no break here" between prevChar and thisChar
2983 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
2984 if (numEndIdx > pos) {
2985 // Number match includes at least our two chars being checked
2986 if (numEndIdx > nextPos) {
2987 // Number match includes additional chars. Update pos and nextPos
2988 // so that next loop iteration will continue at the end of the number,
2989 // checking for breaks between last char in number & whatever follows.
2990 pos = nextPos = numEndIdx;
2991 do {
2992 pos = fText->moveIndex32(pos, -1);
2993 thisChar = fText->char32At(pos);
2994 } while (fCM->contains(thisChar));
2995 }
2996 setAppliedRule(pos, "LB 25 Numbers");
2997 continue;
2998 }
2999 }
3000
3001
3002 // The monkey test's way of ignoring combining characters doesn't work
3003 // for this rule. ZJ is also a CM. Need to get the actual character
3004 // preceding "thisChar", not ignoring combining marks, possibly ZJ.
3005 {
3006 int32_t prevIdx = fText->moveIndex32(pos, -1);
3007 UChar32 prevC = fText->char32At(prevIdx);
3008 if (fZWJ->contains(prevC)) {
3009 setAppliedRule(pos, "LB 8a ZWJ x");
3010 continue;
3011 }
3012 }
3013
3014
3015 // appliedRule: "LB 9, 10"; // Already done, at top of loop.";
3016 //
3017
3018
3019 // x WJ
3020 // WJ x
3021 //
3022 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3023 setAppliedRule(pos, "LB 11 Do not break before or after WORD JOINER and related characters.");
3024 continue;
3025 }
3026
3027
3028 if (fGL->contains(prevChar)) {
3029 setAppliedRule(pos, "LB 12 GL x");
3030 continue;
3031 }
3032
3033
3034 if (!(fSP->contains(prevChar) ||
3035 fBA->contains(prevChar) ||
3036 fHY->contains(prevChar) ) && fGL->contains(thisChar)) {
3037 setAppliedRule(pos, "LB 12a [^SP BA HY] x GL");
3038 continue;
3039 }
3040
3041
3042 if (fCL->contains(thisChar) ||
3043 fCP->contains(thisChar) ||
3044 fEX->contains(thisChar) ||
3045 fSY->contains(thisChar)) {
3046 setAppliedRule(pos, "LB 13 Don't break before closings.");
3047 continue;
3048 }
3049
3050
3051 // Scan backwards, checking for this sequence.
3052 // The OP char could include combining marks, so we actually check for
3053 // OP CM* SP*
3054 // Another Twist: The Rule 9 fixes may have changed a SP CM
3055 // sequence into a ID char, so before scanning back through spaces,
3056 // verify that prevChar is indeed a space. The prevChar variable
3057 // may differ from fText[prevPos]
3058 tPos = prevPos;
3059 if (fSP->contains(prevChar)) {
3060 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3061 tPos=fText->moveIndex32(tPos, -1);
3062 }
3063 }
3064 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3065 tPos=fText->moveIndex32(tPos, -1);
3066 }
3067 if (fOP->contains(fText->char32At(tPos))) {
3068 setAppliedRule(pos, "LB 14 Don't break after OP SP*");
3069 continue;
3070 }
3071
3072
3073 if (nextPos < fText->length()) {
3074 // note: UnicodeString::char32At(length) returns ffff, not distinguishable
3075 // from a legit ffff character. So test length separately.
3076 UChar32 nextChar = fText->char32At(nextPos);
3077 if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) {
3078 setAppliedRule(pos, "LB 14a Break before an IS that begins a number and follows a space");
3079 break;
3080 }
3081 }
3082
3083
3084 if (fIS->contains(thisChar)) {
3085 setAppliedRule(pos, "LB 14b Do not break before numeric separators, even after spaces.");
3086 continue;
3087 }
3088
3089
3090 if (fOP->contains(thisChar)) {
3091 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3092 int tPos = prevPos;
3093 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3094 tPos = fText->moveIndex32(tPos, -1);
3095 }
3096 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3097 tPos = fText->moveIndex32(tPos, -1);
3098 }
3099 if (fQU->contains(fText->char32At(tPos))) {
3100 setAppliedRule(pos, "LB 15 QU SP* x OP");
3101 continue;
3102 }
3103 }
3104
3105
3106 // Scan backwards for SP* CM* (CL | CP)
3107 if (fNS->contains(thisChar)) {
3108 int tPos = prevPos;
3109 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3110 tPos = fText->moveIndex32(tPos, -1);
3111 }
3112 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3113 tPos = fText->moveIndex32(tPos, -1);
3114 }
3115 if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3116 setAppliedRule(pos, "LB 16 (CL | CP) SP* x NS");
3117 continue;
3118 }
3119 }
3120
3121
3122 if (fB2->contains(thisChar)) {
3123 // Scan backwards, checking for the B2 CM* SP* sequence.
3124 tPos = prevPos;
3125 if (fSP->contains(prevChar)) {
3126 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3127 tPos=fText->moveIndex32(tPos, -1);
3128 }
3129 }
3130 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3131 tPos=fText->moveIndex32(tPos, -1);
3132 }
3133 if (fB2->contains(fText->char32At(tPos))) {
3134 setAppliedRule(pos, "LB 17 B2 SP* x B2");
3135 continue;
3136 }
3137 }
3138
3139
3140 if (fSP->contains(prevChar)) {
3141 setAppliedRule(pos, "LB 18 break after space");
3142 break;
3143 }
3144
3145 // x QU
3146 // QU x
3147 if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3148 setAppliedRule(pos, "LB 19");
3149 continue;
3150 }
3151
3152 if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3153 setAppliedRule(pos, "LB 20 Break around a CB");
3154 break;
3155 }
3156
3157 // Don't break between Hyphens and letters if a break precedes the hyphen.
3158 // Formerly this was a Finnish tailoring.
3159 // Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
3160 // ^($HY | $HH) $AL;
3161 if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
3162 prevPosX2 == -1) {
3163 setAppliedRule(pos, "LB 20.09");
3164 continue;
3165 }
3166
3167 if (fBA->contains(thisChar) ||
3168 fHY->contains(thisChar) ||
3169 fNS->contains(thisChar) ||
3170 fBB->contains(prevChar) ) {
3171 setAppliedRule(pos, "LB 21");
3172 continue;
3173 }
3174
3175 if (fHL->contains(prevCharX2) &&
3176 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3177 setAppliedRule(pos, "LB 21a HL (HY | BA) x");
3178 continue;
3179 }
3180
3181 if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3182 setAppliedRule(pos, "LB 21b SY x HL");
3183 continue;
3184 }
3185
3186 if (fIN->contains(thisChar)) {
3187 setAppliedRule(pos, "LB 22");
3188 continue;
3189 }
3190
3191
3192 // (AL | HL) x NU
3193 // NU x (AL | HL)
3194 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
3195 setAppliedRule(pos, "LB 23");
3196 continue;
3197 }
3198 if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3199 setAppliedRule(pos, "LB 23");
3200 continue;
3201 }
3202
3203 // Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3204 // PR x (ID | EB | EM)
3205 // (ID | EB | EM) x PO
3206 if (fPR->contains(prevChar) &&
3207 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar))) {
3208 setAppliedRule(pos, "LB 23a");
3209 continue;
3210 }
3211 if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
3212 fPO->contains(thisChar)) {
3213 setAppliedRule(pos, "LB 23a");
3214 continue;
3215 }
3216
3217 // Do not break between prefix and letters or ideographs.
3218 // (PR | PO) x (AL | HL)
3219 // (AL | HL) x (PR | PO)
3220 if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
3221 (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3222 setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3223 continue;
3224 }
3225 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
3226 (fPR->contains(thisChar) || fPO->contains(thisChar))) {
3227 setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3228 continue;
3229 }
3230
3231 // appliedRule: "LB 25 numbers match"; // moved up, before LB 8a,
3232
3233 if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3234 fJV->contains(thisChar) ||
3235 fH2->contains(thisChar) ||
3236 fH3->contains(thisChar))) {
3237 setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3238 continue;
3239 }
3240
3241 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) &&
3242 (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3243 setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3244 continue;
3245 }
3246
3247 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3248 fJT->contains(thisChar)) {
3249 setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3250 continue;
3251 }
3252
3253 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3254 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3255 fPO->contains(thisChar)) {
3256 setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3257 continue;
3258 }
3259 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3260 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3261 setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3262 continue;
3263 }
3264
3265
3266
3267 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3268 setAppliedRule(pos, "LB 28 Do not break between alphabetics (\"at\").");
3269 continue;
3270 }
3271
3272 if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3273 setAppliedRule(pos, "LB 29 Do not break between numeric punctuation and alphabetics (\"e.g.\").");
3274 continue;
3275 }
3276
3277 // (AL | NU) x OP
3278 // CP x (AL | NU)
3279 if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP30->contains(thisChar)) {
3280 setAppliedRule(pos, "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3281 continue;
3282 }
3283 if (fCP30->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3284 setAppliedRule(pos, "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3285 continue;
3286 }
3287
3288 // RI x RI
3289 if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3290 setAppliedRule(pos, "LB30a RI RI ÷ RI");
3291 break;
3292 }
3293 if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3294 // Two Regional Indicators have been paired.
3295 // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
3296 // following RI. This is a hack.
3297 thisChar = -1;
3298 setAppliedRule(pos, "LB30a RI RI ÷ RI");
3299 continue;
3300 }
3301
3302 // LB30b Do not break between an emoji base (or potential emoji) and an emoji modifier.
3303 if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3304 setAppliedRule(pos, "LB30b Emoji Base x Emoji Modifier");
3305 continue;
3306 }
3307
3308 if (fExtPictUnassigned->contains(prevChar) && fEM->contains(thisChar)) {
3309 setAppliedRule(pos, "LB30b [\\p{Extended_Pictographic}&\\p{Cn}] × EM");
3310 continue;
3311 }
3312
3313 setAppliedRule(pos, "LB 31 Break everywhere else");
3314 break;
3315 }
3316
3317 return pos;
3318 }
3319
3320
charClasses()3321 UVector *RBBILineMonkey::charClasses() {
3322 return fSets;
3323 }
3324
3325
~RBBILineMonkey()3326 RBBILineMonkey::~RBBILineMonkey() {
3327 delete fSets;
3328
3329 delete fBK;
3330 delete fCR;
3331 delete fLF;
3332 delete fCM;
3333 delete fNL;
3334 delete fWJ;
3335 delete fZW;
3336 delete fGL;
3337 delete fCB;
3338 delete fSP;
3339 delete fB2;
3340 delete fBA;
3341 delete fBB;
3342 delete fHH;
3343 delete fHY;
3344 delete fH2;
3345 delete fH3;
3346 delete fCL;
3347 delete fCP;
3348 delete fEX;
3349 delete fIN;
3350 delete fJL;
3351 delete fJV;
3352 delete fJT;
3353 delete fNS;
3354 delete fOP;
3355 delete fQU;
3356 delete fIS;
3357 delete fNU;
3358 delete fPO;
3359 delete fPR;
3360 delete fSY;
3361 delete fAI;
3362 delete fAL;
3363 delete fCJ;
3364 delete fHL;
3365 delete fID;
3366 delete fRI;
3367 delete fSG;
3368 delete fXX;
3369 delete fEB;
3370 delete fEM;
3371 delete fZWJ;
3372 delete fOP30;
3373 delete fCP30;
3374 delete fExtPictUnassigned;
3375
3376 delete fCharBI;
3377 delete fNumberMatcher;
3378 }
3379
3380
3381 //-------------------------------------------------------------------------------------------
3382 //
3383 // TestMonkey
3384 //
3385 // params
3386 // seed=nnnnn Random number starting seed.
3387 // Setting the seed allows errors to be reproduced.
3388 // loop=nnn Looping count. Controls running time.
3389 // -1: run forever.
3390 // 0 or greater: run length.
3391 //
3392 // type = char | word | line | sent | title
3393 //
3394 // Example:
3395 // intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3396 //
3397 //-------------------------------------------------------------------------------------------
3398
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)3399 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) {
3400 int32_t val = defaultVal;
3401 name.append(" *= *(-?\\d+)");
3402 UErrorCode status = U_ZERO_ERROR;
3403 RegexMatcher m(name, params, 0, status);
3404 if (m.find()) {
3405 // The param exists. Convert the string to an int.
3406 char valString[100];
3407 int32_t paramLength = m.end(1, status) - m.start(1, status);
3408 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3409 paramLength = (int32_t)(sizeof(valString)-2);
3410 }
3411 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3412 val = strtol(valString, NULL, 10);
3413
3414 // Delete this parameter from the params string.
3415 m.reset();
3416 params = m.replaceFirst("", status);
3417 }
3418 U_ASSERT(U_SUCCESS(status));
3419 return val;
3420 }
3421 #endif
3422
3423 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
testBreakBoundPreceding(RBBITest * test,UnicodeString ustr,BreakIterator * bi,int expected[],int expectedcount)3424 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3425 BreakIterator *bi,
3426 int expected[],
3427 int expectedcount)
3428 {
3429 int count = 0;
3430 int i = 0;
3431 int forward[50];
3432 bi->setText(ustr);
3433 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3434 forward[count] = i;
3435 if (count < expectedcount && expected[count] != i) {
3436 test->errln("%s:%d break forward test failed: expected %d but got %d",
3437 __FILE__, __LINE__, expected[count], i);
3438 break;
3439 }
3440 count ++;
3441 }
3442 if (count != expectedcount) {
3443 printStringBreaks(ustr, expected, expectedcount);
3444 test->errln("%s:%d break forward test failed: missed %d match",
3445 __FILE__, __LINE__, expectedcount - count);
3446 return;
3447 }
3448 // testing boundaries
3449 for (i = 1; i < expectedcount; i ++) {
3450 int j = expected[i - 1];
3451 if (!bi->isBoundary(j)) {
3452 printStringBreaks(ustr, expected, expectedcount);
3453 test->errln("%s:%d isBoundary() failed. Expected boundary at position %d",
3454 __FILE__, __LINE__, j);
3455 return;
3456 }
3457 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3458 if (bi->isBoundary(j)) {
3459 printStringBreaks(ustr, expected, expectedcount);
3460 test->errln("%s:%d isBoundary() failed. Not expecting boundary at position %d",
3461 __FILE__, __LINE__, j);
3462 return;
3463 }
3464 }
3465 }
3466
3467 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3468 count --;
3469 if (forward[count] != i) {
3470 printStringBreaks(ustr, expected, expectedcount);
3471 test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
3472 __FILE__, __LINE__, forward[count], i);
3473 break;
3474 }
3475 }
3476 if (count != 0) {
3477 printStringBreaks(ustr, expected, expectedcount);
3478 test->errln("break test previous() failed: missed a match");
3479 return;
3480 }
3481
3482 // testing preceding
3483 for (i = 0; i < expectedcount - 1; i ++) {
3484 // int j = expected[i] + 1;
3485 int j = ustr.moveIndex32(expected[i], 1);
3486 for (; j <= expected[i + 1]; j ++) {
3487 int32_t expectedPreceding = expected[i];
3488 int32_t actualPreceding = bi->preceding(j);
3489 if (actualPreceding != expectedPreceding) {
3490 printStringBreaks(ustr, expected, expectedcount);
3491 test->errln("%s:%d preceding(%d): expected %d, got %d",
3492 __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
3493 return;
3494 }
3495 }
3496 }
3497 }
3498 #endif
3499
TestWordBreaks(void)3500 void RBBITest::TestWordBreaks(void)
3501 {
3502 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3503
3504 Locale locale("en");
3505 UErrorCode status = U_ZERO_ERROR;
3506 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3507 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3508 // Replaced any C+J characters in a row with a random sequence of characters
3509 // of the same length to make our C+J segmentation not get in the way.
3510 static const char *strlist[] =
3511 {
3512 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3513 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3514 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3515 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3516 "\\uac00\\u3588\\u009c\\u0953\\u194b",
3517 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3518 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3519 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3520 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3521 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3522 "\\u2027\\U000e0067\\u0a47\\u00b7",
3523 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3524 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3525 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3526 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3527 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3528 "\\u0027\\u11af\\U000e0057\\u0602",
3529 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3530 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3531 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3532 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3533 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3534 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3535 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3536 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3537 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3538 "\\u18f4\\U000e0049\\u20e7\\u2027",
3539 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3540 "\\ua183\\u102d\\u0bec\\u003a",
3541 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3542 "\\u003a\\u0e57\\u0fad\\u002e",
3543 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3544 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3545 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3546 "\\u003a\\u0664\\u00b7\\u1fba",
3547 "\\u003b\\u0027\\u00b7\\u47a3",
3548 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3549 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3550 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3551 };
3552 int loop;
3553 if (U_FAILURE(status)) {
3554 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3555 return;
3556 }
3557 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3558 // printf("looping %d\n", loop);
3559 UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3560 // RBBICharMonkey monkey;
3561 RBBIWordMonkey monkey;
3562
3563 int expected[50];
3564 int expectedcount = 0;
3565
3566 monkey.setText(ustr);
3567 int i;
3568 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3569 expected[expectedcount ++] = i;
3570 }
3571
3572 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3573 }
3574 delete bi;
3575 #endif
3576 }
3577
TestWordBoundary(void)3578 void RBBITest::TestWordBoundary(void)
3579 {
3580 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3581 Locale locale("en");
3582 UErrorCode status = U_ZERO_ERROR;
3583 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3584 LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
3585 if (U_FAILURE(status)) {
3586 errcheckln(status, "%s:%d Creation of break iterator failed %s",
3587 __FILE__, __LINE__, u_errorName(status));
3588 return;
3589 }
3590 UChar str[50];
3591 static const char *strlist[] =
3592 {
3593 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3594 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3595 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3596 "\\u2027\\U000e0067\\u0a47\\u00b7",
3597 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3598 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3599 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3600 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3601 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3602 "\\u0027\\u11af\\U000e0057\\u0602",
3603 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3604 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3605 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3606 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3607 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3608 "\\U000e0065\\u302c\\u09ee\\U000e0068",
3609 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3610 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3611 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3612 "\\u58f4\\U000e0049\\u20e7\\u2027",
3613 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3614 "\\ua183\\u102d\\u0bec\\u003a",
3615 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3616 "\\u003a\\u0e57\\u0fad\\u002e",
3617 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3618 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3619 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3620 "\\u003a\\u0664\\u00b7\\u1fba",
3621 "\\u003b\\u0027\\u00b7\\u47a3",
3622 };
3623 int loop;
3624 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3625 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3626 UnicodeString ustr(str);
3627 int forward[50];
3628 int count = 0;
3629
3630 bi->setText(ustr);
3631 int prev = -1;
3632 for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
3633 ++count;
3634 if (count >= UPRV_LENGTHOF(forward)) {
3635 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
3636 __FILE__, __LINE__, loop, count, boundary);
3637 return;
3638 }
3639 forward[count] = boundary;
3640 if (boundary <= prev) {
3641 errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
3642 __FILE__, __LINE__, loop, prev, boundary);
3643 break;
3644 }
3645 for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
3646 if (bi->isBoundary(nonBoundary)) {
3647 printStringBreaks(ustr, forward, count);
3648 errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
3649 __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
3650 return;
3651 }
3652 }
3653 if (!bi->isBoundary(boundary)) {
3654 printStringBreaks(ustr, forward, count);
3655 errln("%s:%d happy boundary test failed: expected %d a boundary",
3656 __FILE__, __LINE__, boundary);
3657 return;
3658 }
3659 prev = boundary;
3660 }
3661 }
3662 }
3663
TestLineBreaks(void)3664 void RBBITest::TestLineBreaks(void)
3665 {
3666 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3667 Locale locale("en");
3668 UErrorCode status = U_ZERO_ERROR;
3669 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3670 const int32_t STRSIZE = 50;
3671 UChar str[STRSIZE];
3672 static const char *strlist[] =
3673 {
3674 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3675 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3676 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3677 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3678 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3679 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3680 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3681 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3682 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3683 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3684 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3685 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3686 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3687 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3688 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3689 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3690 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3691 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3692 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3693 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3694 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3695 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3696 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3697 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3698 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3699 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3700 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3701 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3702 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3703 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3704 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3705 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3706 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3707 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3708 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3709 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3710 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3711 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3712 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3713 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3714 };
3715 int loop;
3716 TEST_ASSERT_SUCCESS(status);
3717 if (U_FAILURE(status)) {
3718 return;
3719 }
3720 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3721 // printf("looping %d\n", loop);
3722 int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3723 if (t >= STRSIZE) {
3724 TEST_ASSERT(false);
3725 continue;
3726 }
3727
3728
3729 UnicodeString ustr(str);
3730 RBBILineMonkey monkey;
3731 if (U_FAILURE(monkey.deferredStatus)) {
3732 continue;
3733 }
3734
3735 const int EXPECTEDSIZE = 50;
3736 int expected[EXPECTEDSIZE];
3737 int expectedcount = 0;
3738
3739 monkey.setText(ustr);
3740
3741 int i;
3742 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3743 if (expectedcount >= EXPECTEDSIZE) {
3744 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3745 return;
3746 }
3747 expected[expectedcount ++] = i;
3748 }
3749
3750 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3751 }
3752 delete bi;
3753 #endif
3754 }
3755
TestSentBreaks(void)3756 void RBBITest::TestSentBreaks(void)
3757 {
3758 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3759 Locale locale("en");
3760 UErrorCode status = U_ZERO_ERROR;
3761 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3762 UChar str[200];
3763 static const char *strlist[] =
3764 {
3765 "Now\ris\nthe\r\ntime\n\rfor\r\r",
3766 "This\n",
3767 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3768 "\"Sentence ending with a quote.\" Bye.",
3769 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
3770 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3771 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3772 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3773 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3774 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3775 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3776 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3777 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3778 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3779 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3780 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3781 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3782 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3783 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3784 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3785 };
3786 int loop;
3787 if (U_FAILURE(status)) {
3788 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3789 return;
3790 }
3791 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3792 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3793 UnicodeString ustr(str);
3794
3795 RBBISentMonkey monkey;
3796 if (U_FAILURE(monkey.deferredStatus)) {
3797 continue;
3798 }
3799
3800 const int EXPECTEDSIZE = 50;
3801 int expected[EXPECTEDSIZE];
3802 int expectedcount = 0;
3803
3804 monkey.setText(ustr);
3805
3806 int i;
3807 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3808 if (expectedcount >= EXPECTEDSIZE) {
3809 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3810 return;
3811 }
3812 expected[expectedcount ++] = i;
3813 }
3814
3815 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3816 }
3817 delete bi;
3818 #endif
3819 }
3820
TestMonkey()3821 void RBBITest::TestMonkey() {
3822 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3823
3824 UErrorCode status = U_ZERO_ERROR;
3825 int32_t loopCount = 500;
3826 int32_t seed = 1;
3827 UnicodeString breakType = "all";
3828 Locale locale("en");
3829 UBool useUText = false;
3830
3831 if (quick == false) {
3832 loopCount = 10000;
3833 }
3834
3835 if (fTestParams) {
3836 UnicodeString p(fTestParams);
3837 loopCount = getIntParam("loop", p, loopCount);
3838 seed = getIntParam("seed", p, seed);
3839
3840 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3841 if (m.find()) {
3842 breakType = m.group(1, status);
3843 m.reset();
3844 p = m.replaceFirst("", status);
3845 }
3846
3847 RegexMatcher u(" *utext", p, 0, status);
3848 if (u.find()) {
3849 useUText = true;
3850 u.reset();
3851 p = u.replaceFirst("", status);
3852 }
3853
3854
3855 // m.reset(p);
3856 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3857 // Each option is stripped out of the option string as it is processed.
3858 // All options have been checked. The option string should have been completely emptied..
3859 char buf[100];
3860 p.extract(buf, sizeof(buf), NULL, status);
3861 buf[sizeof(buf)-1] = 0;
3862 errln("Unrecognized or extra parameter: %s\n", buf);
3863 return;
3864 }
3865
3866 }
3867
3868 if (breakType == "char" || breakType == "all") {
3869 RBBICharMonkey m;
3870 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3871 if (U_SUCCESS(status)) {
3872 RunMonkey(bi, m, "char", seed, loopCount, useUText);
3873 if (breakType == "all" && useUText==false) {
3874 // Also run a quick test with UText when "all" is specified
3875 RunMonkey(bi, m, "char", seed, loopCount, true);
3876 }
3877 }
3878 else {
3879 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3880 }
3881 delete bi;
3882 }
3883
3884 if (breakType == "word" || breakType == "all") {
3885 logln("Word Break Monkey Test");
3886 RBBIWordMonkey m;
3887 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3888 if (U_SUCCESS(status)) {
3889 RunMonkey(bi, m, "word", seed, loopCount, useUText);
3890 }
3891 else {
3892 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3893 }
3894 delete bi;
3895 }
3896
3897 if (breakType == "line" || breakType == "all") {
3898 logln("Line Break Monkey Test");
3899 RBBILineMonkey m;
3900 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3901 if (loopCount >= 10) {
3902 loopCount = loopCount / 5; // Line break runs slower than the others.
3903 }
3904 if (U_SUCCESS(status)) {
3905 RunMonkey(bi, m, "line", seed, loopCount, useUText);
3906 }
3907 else {
3908 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3909 }
3910 delete bi;
3911 }
3912
3913 if (breakType == "sent" || breakType == "all" ) {
3914 logln("Sentence Break Monkey Test");
3915 RBBISentMonkey m;
3916 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3917 if (loopCount >= 10) {
3918 loopCount = loopCount / 10; // Sentence runs slower than the other break types
3919 }
3920 if (U_SUCCESS(status)) {
3921 RunMonkey(bi, m, "sent", seed, loopCount, useUText);
3922 }
3923 else {
3924 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3925 }
3926 delete bi;
3927 }
3928
3929 #endif
3930 }
3931
3932 //
3933 // Run a RBBI monkey test. Common routine, for all break iterator types.
3934 // Parameters:
3935 // bi - the break iterator to use
3936 // mk - MonkeyKind, abstraction for obtaining expected results
3937 // name - Name of test (char, word, etc.) for use in error messages
3938 // seed - Seed for starting random number generator (parameter from user)
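//       numIterations - number of random test strings to check; -1 means run forever
//       useUText - if true, hand each test string to the break iterator through a UText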
3940 //
RunMonkey(BreakIterator * bi,RBBIMonkeyKind & mk,const char * name,uint32_t seed,int32_t numIterations,UBool useUText)3941 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
3942 int32_t numIterations, UBool useUText) {
3943
3944 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3945
3946 const int32_t TESTSTRINGLEN = 500;
3947 UnicodeString testText;
3948 int32_t numCharClasses;
3949 UVector *chClasses;
3950 int expectedCount = 0;
3951 char expectedBreaks[TESTSTRINGLEN*2 + 1];
3952 char forwardBreaks[TESTSTRINGLEN*2 + 1];
3953 char reverseBreaks[TESTSTRINGLEN*2+1];
3954 char isBoundaryBreaks[TESTSTRINGLEN*2+1];
3955 char followingBreaks[TESTSTRINGLEN*2+1];
3956 char precedingBreaks[TESTSTRINGLEN*2+1];
3957 int i;
3958 int loopCount = 0;
3959
3960
3961 m_seed = seed;
3962
3963 numCharClasses = mk.charClasses()->size();
3964 chClasses = mk.charClasses();
3965
3966 // Check for errors that occurred during the construction of the MonkeyKind object.
3967 // Can't report them where they occurred because errln() is a method coming from intlTest,
3968 // and is not visible outside of RBBITest :-(
3969 if (U_FAILURE(mk.deferredStatus)) {
3970 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
3971 return;
3972 }
3973
3974 // Verify that the character classes all have at least one member.
3975 for (i=0; i<numCharClasses; i++) {
3976 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
3977 if (s == NULL || s->size() == 0) {
3978 errln("Character Class #%d is null or of zero size.", i);
3979 return;
3980 }
3981 }
3982
3983 // For minimizing width of class name output.
3984 int classNameSize = mk.maxClassNameSize();
3985
3986 while (loopCount < numIterations || numIterations == -1) {
3987 if (numIterations == -1 && loopCount % 10 == 0) {
3988 // If test is running in an infinite loop, display a periodic tic so
3989 // we can tell that it is making progress.
3990 fprintf(stderr, ".");
3991 }
3992 // Save current random number seed, so that we can recreate the random numbers
3993 // for this loop iteration in event of an error.
3994 seed = m_seed;
3995
3996 // Populate a test string with data.
3997 testText.truncate(0);
3998 for (i=0; i<TESTSTRINGLEN; i++) {
3999 int32_t aClassNum = m_rand() % numCharClasses;
4000 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4001 int32_t charIdx = m_rand() % classSet->size();
4002 UChar32 c = classSet->charAt(charIdx);
4003 if (c < 0) { // TODO: deal with sets containing strings.
4004 errln("%s:%d c < 0", __FILE__, __LINE__);
4005 break;
4006 }
4007 // Do not assemble a supplementary character from randomly generated separate surrogates.
4008 // (It could be a dictionary character)
4009 if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4010 continue;
4011 }
4012
4013 testText.append(c);
4014 }
4015
4016 // Calculate the expected results for this test string and reset applied rules.
4017 mk.setText(testText);
4018
4019 memset(expectedBreaks, 0, sizeof(expectedBreaks));
4020 expectedBreaks[0] = 1;
4021 int32_t breakPos = 0;
4022 expectedCount = 0;
4023 for (;;) {
4024 breakPos = mk.next(breakPos);
4025 if (breakPos == -1) {
4026 break;
4027 }
4028 if (breakPos > testText.length()) {
4029 errln("breakPos > testText.length()");
4030 }
4031 expectedBreaks[breakPos] = 1;
4032 expectedCount++;
4033 U_ASSERT(expectedCount<testText.length());
4034 }
4035
4036 // Find the break positions using forward iteration
4037 memset(forwardBreaks, 0, sizeof(forwardBreaks));
4038 if (useUText) {
4039 UErrorCode status = U_ZERO_ERROR;
4040 UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4041 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4042 bi->setText(testUText, status);
4043 TEST_ASSERT_SUCCESS(status);
4044 utext_close(testUText); // The break iterator does a shallow clone of the UText
4045 // This UText can be closed immediately, so long as the
4046 // testText string continues to exist.
4047 } else {
4048 bi->setText(testText);
4049 }
4050
4051 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4052 if (i < 0 || i > testText.length()) {
4053 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4054 break;
4055 }
4056 forwardBreaks[i] = 1;
4057 }
4058
4059 // Find the break positions using reverse iteration
4060 memset(reverseBreaks, 0, sizeof(reverseBreaks));
4061 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4062 if (i < 0 || i > testText.length()) {
4063 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4064 break;
4065 }
4066 reverseBreaks[i] = 1;
4067 }
4068
4069 // Find the break positions using isBoundary() tests.
4070 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4071 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4072 for (i=0; i<=testText.length(); i++) {
4073 isBoundaryBreaks[i] = bi->isBoundary(i);
4074 }
4075
4076
4077 // Find the break positions using the following() function.
4078 // printf(".");
4079 memset(followingBreaks, 0, sizeof(followingBreaks));
4080 int32_t lastBreakPos = 0;
4081 followingBreaks[0] = 1;
4082 for (i=0; i<testText.length(); i++) {
4083 breakPos = bi->following(i);
4084 if (breakPos <= i ||
4085 breakPos < lastBreakPos ||
4086 breakPos > testText.length() ||
4087 (breakPos > lastBreakPos && lastBreakPos > i)) {
4088 errln("%s break monkey test: "
4089 "Out of range value returned by BreakIterator::following().\n"
4090 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4091 name, seed, i, breakPos, lastBreakPos);
4092 break;
4093 }
4094 followingBreaks[breakPos] = 1;
4095 lastBreakPos = breakPos;
4096 }
4097
4098 // Find the break positions using the preceding() function.
4099 memset(precedingBreaks, 0, sizeof(precedingBreaks));
4100 lastBreakPos = testText.length();
4101 precedingBreaks[testText.length()] = 1;
4102 for (i=testText.length(); i>0; i--) {
4103 breakPos = bi->preceding(i);
4104 if (breakPos >= i ||
4105 breakPos > lastBreakPos ||
4106 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4107 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4108 errln("%s break monkey test: "
4109 "Out of range value returned by BreakIterator::preceding().\n"
4110 "index=%d; prev returned %d; lastBreak=%d" ,
4111 name, i, breakPos, lastBreakPos);
4112 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4113 precedingBreaks[i] = 2; // Forces an error.
4114 }
4115 } else {
4116 if (breakPos >= 0) {
4117 precedingBreaks[breakPos] = 1;
4118 }
4119 lastBreakPos = breakPos;
4120 }
4121 }
4122
4123 // Compare the expected and actual results.
4124 for (i=0; i<=testText.length(); i++) {
4125 const char *errorType = NULL;
4126 const char* currentBreakData = NULL;
4127 if (forwardBreaks[i] != expectedBreaks[i]) {
4128 errorType = "next()";
4129 currentBreakData = forwardBreaks;
4130 } else if (reverseBreaks[i] != forwardBreaks[i]) {
4131 errorType = "previous()";
4132 currentBreakData = reverseBreaks;
4133 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4134 errorType = "isBoundary()";
4135 currentBreakData = isBoundaryBreaks;
4136 } else if (followingBreaks[i] != expectedBreaks[i]) {
4137 errorType = "following()";
4138 currentBreakData = followingBreaks;
4139 } else if (precedingBreaks[i] != expectedBreaks[i]) {
4140 errorType = "preceding()";
4141 currentBreakData = precedingBreaks;
4142 }
4143
4144 if (errorType != NULL) {
4145 // Format a range of the test text that includes the failure as
4146 // a data item that can be included in the rbbi test data file.
4147
4148 // Start of the range is the last point where expected and actual results
4149 // both agreed that there was a break position.
4150
4151 int startContext = i;
4152 int32_t count = 0;
4153 for (;;) {
4154 if (startContext==0) { break; }
4155 startContext --;
4156 if (expectedBreaks[startContext] != 0) {
4157 if (count == 2) break;
4158 count ++;
4159 }
4160 }
4161
4162 // End of range is two expected breaks past the start position.
4163 int endContext = i + 1;
4164 int ci;
4165 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
4166 for (;;) {
4167 if (endContext >= testText.length()) {break;}
4168 if (expectedBreaks[endContext-1] != 0) {
4169 if (count == 0) break;
4170 count --;
4171 }
4172 endContext ++;
4173 }
4174 }
4175
4176 // Formatting of each line includes:
4177 // character code
4178 // reference break: '|' -> a break, '.' -> no break
4179 // actual break: '|' -> a break, '.' -> no break
4180 // (name of character clase)
4181 // Unicode name of character
4182 // '-->' indicates location of the difference.
4183
4184 MONKEY_ERROR(
4185 (expectedBreaks[i] ? "Break expected but not found" :
4186 "Break found but not expected"),
4187 name, i, seed);
4188
4189 for (ci=startContext; (ci = testText.moveIndex32(ci, 1));) {
4190 UChar32 c;
4191 c = testText.char32At(ci);
4192
4193 std::string currentLineFlag = " ";
4194 if (ci == i) {
4195 currentLineFlag = "-->"; // Error position
4196 }
4197
4198 // BMP or SMP character in hex
4199 char hexCodePoint[12];
4200 std::string format = " \\u%04x";
4201 if (c >= 0x10000) {
4202 format = "\\U%08x";
4203 }
4204 sprintf(hexCodePoint, format.c_str(), c);
4205
4206 // Get the class name and character name for the character.
4207 char cName[200];
4208 UErrorCode status = U_ZERO_ERROR;
4209 u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
4210
4211 char buffer[200];
4212 auto ret = snprintf(buffer, UPRV_LENGTHOF(buffer),
4213 "%4s %3i : %1s %1s %10s %-*s %-40s %-40s",
4214 currentLineFlag.c_str(),
4215 ci,
4216 expectedBreaks[ci] == 0 ? "." : "|", // Reference break
4217 currentBreakData[ci] == 0 ? "." : "|", // Actual break
4218 hexCodePoint,
4219 classNameSize,
4220 mk.classNameFromCodepoint(c).c_str(),
4221 mk.getAppliedRule(ci).c_str(), cName);
4222 (void)ret;
4223 U_ASSERT(0 <= ret && ret < UPRV_LENGTHOF(buffer));
4224
4225 // Output the error
4226 if (ci == i) {
4227 errln(buffer);
4228 } else {
4229 infoln(buffer);
4230 }
4231
4232 if (ci >= endContext) { break; }
4233 }
4234 break;
4235 }
4236 }
4237
4238 loopCount++;
4239 }
4240 #endif
4241 }
4242
4243
4244 // Bug 5532. UTF-8 based UText fails in dictionary code.
4245 // This test checks the initial patch,
4246 // which is to just keep it from crashing. Correct word boundaries
4247 // await a proper fix to the dictionary code.
4248 //
TestBug5532(void)4249 void RBBITest::TestBug5532(void) {
4250 // Text includes a mixture of Thai and Latin.
4251 const unsigned char utf8Data[] = {
4252 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4253 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4254 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4255 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4256 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4257 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4258 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4259 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4260 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4261 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4262 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4263
4264 UErrorCode status = U_ZERO_ERROR;
4265 UText utext=UTEXT_INITIALIZER;
4266 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4267 TEST_ASSERT_SUCCESS(status);
4268
4269 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4270 TEST_ASSERT_SUCCESS(status);
4271 if (U_SUCCESS(status)) {
4272 bi->setText(&utext, status);
4273 TEST_ASSERT_SUCCESS(status);
4274
4275 int32_t breakCount = 0;
4276 int32_t previousBreak = -1;
4277 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4278 // For now, just make sure that the break iterator doesn't hang.
4279 TEST_ASSERT(previousBreak < bi->current());
4280 previousBreak = bi->current();
4281 }
4282 TEST_ASSERT(breakCount > 0);
4283 }
4284 delete bi;
4285 utext_close(&utext);
4286 }
4287
4288
TestBug9983(void)4289 void RBBITest::TestBug9983(void) {
4290 UnicodeString text = UnicodeString("\\u002A" // * Other
4291 "\\uFF65" // Other
4292 "\\u309C" // Katakana
4293 "\\uFF9F" // Extend
4294 "\\uFF65" // Other
4295 "\\u0020" // Other
4296 "\\u0000").unescape();
4297
4298 UErrorCode status = U_ZERO_ERROR;
4299 LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4300 BreakIterator::createWordInstance(Locale::getRoot(), status)));
4301 TEST_ASSERT_SUCCESS(status);
4302 LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4303 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4304 TEST_ASSERT_SUCCESS(status);
4305 if (U_FAILURE(status)) {
4306 return;
4307 }
4308 int32_t offset, rstatus, iterationCount;
4309
4310 brkiter->setText(text);
4311 brkiter->last();
4312 iterationCount = 0;
4313 while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4314 iterationCount++;
4315 rstatus = brkiter->getRuleStatus();
4316 (void)rstatus; // Suppress set but not used warning.
4317 if (iterationCount >= 10) {
4318 break;
4319 }
4320 }
4321 TEST_ASSERT(iterationCount == 6);
4322
4323 brkiterPOSIX->setText(text);
4324 brkiterPOSIX->last();
4325 iterationCount = 0;
4326 while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4327 iterationCount++;
4328 rstatus = brkiterPOSIX->getRuleStatus();
4329 (void)rstatus; // Suppress set but not used warning.
4330 if (iterationCount >= 10) {
4331 break;
4332 }
4333 }
4334 TEST_ASSERT(iterationCount == 6);
4335 }
4336
//  Bug 7547 - verify that building a break iterator from empty rules produces an error.
4338 //
TestBug7547()4339 void RBBITest::TestBug7547() {
4340 UnicodeString rules;
4341 UErrorCode status = U_ZERO_ERROR;
4342 UParseError parseError;
4343 RuleBasedBreakIterator breakIterator(rules, parseError, status);
4344 if (status != U_BRK_RULE_SYNTAX) {
4345 errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4346 }
4347 if (parseError.line != 1 || parseError.offset != 0) {
4348 errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4349 }
4350 }
4351
4352
TestBug12797()4353 void RBBITest::TestBug12797() {
4354 UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4355 UErrorCode status = U_ZERO_ERROR;
4356 UParseError parseError;
4357 RuleBasedBreakIterator bi(rules, parseError, status);
4358 if (U_FAILURE(status)) {
4359 errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4360 return;
4361 }
4362 UnicodeString text = "abc";
4363 bi.setText(text);
4364 bi.first();
4365 int32_t boundary = bi.next();
4366 if (boundary != 3) {
4367 errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4368 }
4369 }
4370
TestBug12918()4371 void RBBITest::TestBug12918() {
4372 // This test triggers an assertion failure in dictbe.cpp
4373 const UChar *crasherString = u"\u3325\u4a16";
4374 UErrorCode status = U_ZERO_ERROR;
4375 UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
4376 if (U_FAILURE(status)) {
4377 dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4378 return;
4379 }
4380 ubrk_first(iter);
4381 int32_t pos = 0;
4382 int32_t lastPos = -1;
4383 while((pos = ubrk_next(iter)) != UBRK_DONE) {
4384 if (pos <= lastPos) {
4385 errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4386 break;
4387 }
4388 }
4389 ubrk_close(iter);
4390 }
4391
TestBug12932()4392 void RBBITest::TestBug12932() {
4393 // Node Stack overflow in the RBBI rule parser caused a seg fault.
4394 UnicodeString ruleStr(
4395 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4396 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4397 "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4398 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4399 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4400 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4401
4402 UErrorCode status = U_ZERO_ERROR;
4403 UParseError parseError;
4404 RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
4405 if (status != U_BRK_RULE_SYNTAX) {
4406 errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4407 __FILE__, __LINE__, u_errorName(status));
4408 }
4409 }
4410
4411
4412 // Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
//             remain undivided by ICU char, word and line break.
TestEmoji()4414 void RBBITest::TestEmoji() {
4415 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4416 UErrorCode status = U_ZERO_ERROR;
4417
4418 CharString testFileName;
4419 testFileName.append(IntlTest::getSourceTestData(status), status);
4420 testFileName.appendPathPart("emoji-test.txt", status);
4421 if (U_FAILURE(status)) {
4422 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4423 return;
4424 }
4425 logln("Opening data file %s\n", testFileName.data());
4426
4427 int len;
4428 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
4429 if (U_FAILURE(status) || testFile == NULL) {
4430 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4431 return;
4432 }
4433 UnicodeString testFileAsString(testFile, len);
4434 delete [] testFile;
4435
4436 RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
4437 RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
4438 // hexMatcher group(1) is a hex number, or empty string if no hex number present.
4439 int32_t lineNumber = 0;
4440
4441 LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4442 LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4443 LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
4444 if (U_FAILURE(status)) {
4445 dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
4446 return;
4447 }
4448
4449 while (lineMatcher.find()) {
4450 ++lineNumber;
4451 UnicodeString line = lineMatcher.group(status);
4452 hexMatcher.reset(line);
4453 UnicodeString testString; // accumulates the emoji sequence.
4454 while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
4455 UnicodeString hex = hexMatcher.group(1, status);
4456 if (hex.length() > 8) {
4457 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
4458 break;
4459 }
4460 CharString hex8;
4461 hex8.appendInvariantChars(hex, status);
4462 UChar32 c = (UChar32)strtol(hex8.data(), NULL, 16);
4463 if (c<=0x10ffff) {
4464 testString.append(c);
4465 } else {
4466 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4467 __FILE__, __LINE__, lineNumber, hex8.data());
4468 break;
4469 }
4470 }
4471
4472 if (testString.length() > 1) {
4473 charBreaks->setText(testString);
4474 charBreaks->first();
4475 int32_t firstBreak = charBreaks->next();
4476 if (testString.length() != firstBreak) {
4477 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4478 __FILE__, __LINE__, lineNumber, firstBreak);
4479 }
4480 wordBreaks->setText(testString);
4481 wordBreaks->first();
4482 firstBreak = wordBreaks->next();
4483 if (testString.length() != firstBreak) {
4484 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4485 __FILE__, __LINE__, lineNumber, firstBreak);
4486 }
4487 lineBreaks->setText(testString);
4488 lineBreaks->first();
4489 firstBreak = lineBreaks->next();
4490 if (testString.length() != firstBreak) {
4491 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4492 __FILE__, __LINE__, lineNumber, firstBreak);
4493 }
4494 }
4495 }
4496 #endif
4497 }
4498
4499
4500 // TestBug12519 - Correct handling of Locales by assignment / copy / clone
4501
TestBug12519()4502 void RBBITest::TestBug12519() {
4503 UErrorCode status = U_ZERO_ERROR;
4504 LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4505 LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status));
4506 if (!assertSuccess(WHERE, status)) {
4507 dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
4508 return;
4509 }
4510 assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
4511
4512 assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
4513 assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
4514
4515 LocalPointer<RuleBasedBreakIterator>cloneEn(biEn->clone());
4516 assertTrue(WHERE, *biEn == *cloneEn);
4517 assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
4518
4519 LocalPointer<RuleBasedBreakIterator>cloneFr(biFr->clone());
4520 assertTrue(WHERE, *biFr == *cloneFr);
4521 assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
4522
4523 LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status));
4524 UnicodeString text("Hallo Welt");
4525 biDe->setText(text);
4526 assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
4527 *biDe = *biFr;
4528 assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
4529 }
4530
TestBug12677()4531 void RBBITest::TestBug12677() {
4532 // Check that stripping of comments from rules for getRules() is not confused by
4533 // the presence of '#' characters in the rules that do not introduce comments.
4534 UnicodeString rules(u"!!forward; \n"
4535 "$x = [ab#]; # a set with a # literal. \n"
4536 " # .; # a comment that looks sort of like a rule. \n"
4537 " '#' '?'; # a rule with a quoted # \n"
4538 );
4539
4540 UErrorCode status = U_ZERO_ERROR;
4541 UParseError pe;
4542 RuleBasedBreakIterator bi(rules, pe, status);
4543 assertSuccess(WHERE, status);
4544 UnicodeString rtRules = bi.getRules();
4545 assertEquals(WHERE, UnicodeString(u"!!forward;$x=[ab#];'#''?';"), rtRules);
4546 }
4547
4548
TestTableRedundancies()4549 void RBBITest::TestTableRedundancies() {
4550 UErrorCode status = U_ZERO_ERROR;
4551
4552 LocalPointer<RuleBasedBreakIterator> bi (
4553 (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
4554 assertSuccess(WHERE, status);
4555 if (U_FAILURE(status)) return;
4556
4557 RBBIDataWrapper *dw = bi->fData;
4558 const RBBIStateTable *fwtbl = dw->fForwardTable;
4559 UBool in8Bits = fwtbl->fFlags & RBBI_8BITS_ROWS;
4560 int32_t numCharClasses = dw->fHeader->fCatCount;
4561 // printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
4562
4563 // Check for duplicate columns (character categories)
4564
4565 std::vector<UnicodeString> columns;
4566 for (int32_t column = 0; column < numCharClasses; column++) {
4567 UnicodeString s;
4568 for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
4569 RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4570 s.append(in8Bits ? row->r8.fNextState[column] : row->r16.fNextState[column]);
4571 }
4572 columns.push_back(s);
4573 }
4574 // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
4575 for (int c1=1; c1<numCharClasses; c1++) {
4576 int limit = c1 < (int)fwtbl->fDictCategoriesStart ? fwtbl->fDictCategoriesStart : numCharClasses;
4577 for (int c2 = c1+1; c2 < limit; c2++) {
4578 if (columns.at(c1) == columns.at(c2)) {
4579 errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
4580 goto out;
4581 }
4582 }
4583 }
4584 out:
4585
4586 // Check for duplicate states
4587 std::vector<UnicodeString> rows;
4588 for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
4589 UnicodeString s;
4590 RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4591 if (in8Bits) {
4592 s.append(row->r8.fAccepting);
4593 s.append(row->r8.fLookAhead);
4594 s.append(row->r8.fTagsIdx);
4595 for (int32_t column = 0; column < numCharClasses; column++) {
4596 s.append(row->r8.fNextState[column]);
4597 }
4598 } else {
4599 s.append(row->r16.fAccepting);
4600 s.append(row->r16.fLookAhead);
4601 s.append(row->r16.fTagsIdx);
4602 for (int32_t column = 0; column < numCharClasses; column++) {
4603 s.append(row->r16.fNextState[column]);
4604 }
4605 }
4606 rows.push_back(s);
4607 }
4608 for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
4609 for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
4610 if (rows.at(r1) == rows.at(r2)) {
4611 errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
4612 return;
4613 }
4614 }
4615 }
4616 }
4617
4618 // Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
4619 // even after next() has returned DONE.
4620
TestBug13447()4621 void RBBITest::TestBug13447() {
4622 UErrorCode status = U_ZERO_ERROR;
4623 LocalPointer<RuleBasedBreakIterator> bi(
4624 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4625 assertSuccess(WHERE, status);
4626 if (U_FAILURE(status)) return;
4627 UnicodeString data(u"1234");
4628 bi->setText(data);
4629 assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus());
4630 assertEquals(WHERE, 4, bi->next());
4631 assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4632 assertEquals(WHERE, UBRK_DONE, bi->next());
4633 assertEquals(WHERE, 4, bi->current());
4634 assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4635 }
4636
4637 // TestReverse exercises both the synthesized safe reverse rules and the logic
4638 // for filling the break iterator cache when starting from random positions
4639 // in the text.
4640 //
4641 // It's a monkey test, working on random data, with the expected data obtained
4642 // from forward iteration (no safe rules involved), comparing with results
4643 // when indexing into the interior of the string (safe rules needed).
4644
TestReverse()4645 void RBBITest::TestReverse() {
4646 UErrorCode status = U_ZERO_ERROR;
4647
4648 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4649 BreakIterator::createCharacterInstance(Locale::getEnglish(), status)));
4650 assertSuccess(WHERE, status, true);
4651 status = U_ZERO_ERROR;
4652 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4653 BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4654 assertSuccess(WHERE, status, true);
4655 status = U_ZERO_ERROR;
4656 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4657 BreakIterator::createLineInstance(Locale::getEnglish(), status)));
4658 assertSuccess(WHERE, status, true);
4659 status = U_ZERO_ERROR;
4660 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4661 BreakIterator::createSentenceInstance(Locale::getEnglish(), status)));
4662 assertSuccess(WHERE, status, true);
4663 }
4664
TestReverse(std::unique_ptr<RuleBasedBreakIterator> bi)4665 void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
4666 if (!bi) {
4667 return;
4668 }
4669
4670 // From the mapping trie in the break iterator's internal data, create a
4671 // vector of UnicodeStrings, one for each character category, containing
4672 // all of the code points that map to that category. Unicode planes 0 and 1 only,
4673 // to avoid an execess of unassigned code points.
4674
4675 RBBIDataWrapper *data = bi->fData;
4676 int32_t categoryCount = data->fHeader->fCatCount;
4677 UCPTrie *trie = data->fTrie;
4678 bool use8BitsTrie = ucptrie_getValueWidth(trie) == UCPTRIE_VALUE_BITS_8;
4679 uint32_t dictBit = use8BitsTrie ? 0x0080 : 0x4000;
4680
4681 std::vector<UnicodeString> strings(categoryCount, UnicodeString());
4682 for (int cp=0; cp<0x1fff0; ++cp) {
4683 int cat = ucptrie_get(trie, cp);
4684 cat &= ~dictBit; // And off the dictionary bit from the category.
4685 assertTrue(WHERE, cat < categoryCount && cat >= 0);
4686 if (cat < 0 || cat >= categoryCount) return;
4687 strings[cat].append(cp);
4688 }
4689
4690 icu_rand randomGen;
4691 const int testStringLength = 10000;
4692 UnicodeString testString;
4693
4694 for (int i=0; i<testStringLength; ++i) {
4695 int charClass = randomGen() % categoryCount;
4696 if (strings[charClass].length() > 0) {
4697 int cp = strings[charClass].char32At(randomGen() % strings[charClass].length());
4698 testString.append(cp);
4699 }
4700 }
4701
4702 typedef std::pair<UBool, int32_t> Result;
4703 std::vector<Result> expectedResults;
4704 bi->setText(testString);
4705 for (int i=0; i<testString.length(); ++i) {
4706 bool isboundary = bi->isBoundary(i);
4707 int ruleStatus = bi->getRuleStatus();
4708 expectedResults.push_back(std::make_pair(isboundary, ruleStatus));
4709 }
4710
4711 for (int i=testString.length()-1; i>=0; --i) {
4712 bi->setText(testString); // clears the internal break cache
4713 Result expected = expectedResults[i];
4714 assertEquals(WHERE, expected.first, bi->isBoundary(i));
4715 assertEquals(WHERE, expected.second, bi->getRuleStatus());
4716 }
4717 }
4718
4719
// Ticket 13692 - finding word boundaries in very large numbers or words could
//                be very time consuming. When the problem was present, this test
//                would run for more than fifteen minutes, which is to say, the failure was noticeable.
4723
TestBug13692()4724 void RBBITest::TestBug13692() {
4725 UErrorCode status = U_ZERO_ERROR;
4726 LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4727 BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4728 if (!assertSuccess(WHERE, status, true)) {
4729 return;
4730 }
4731 constexpr int32_t LENGTH = 1000000;
4732 UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH);
4733 for (int i=0; i<20; i+=2) {
4734 longNumber.setCharAt(i, u' ');
4735 }
4736 bi->setText(longNumber);
4737 assertFalse(WHERE, bi->isBoundary(LENGTH-5));
4738 assertSuccess(WHERE, status);
4739 }
4740
4741
TestProperties()4742 void RBBITest::TestProperties() {
4743 UErrorCode errorCode = U_ZERO_ERROR;
4744 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4745 if (!prependSet.isEmpty()) {
4746 errln(
4747 "[:GCB=Prepend:] is not empty any more. "
4748 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4749 "change this test to the opposite condition.");
4750 }
4751 }
4752
4753
4754 //
4755 // TestDebug - A place-holder test for debugging purposes.
4756 // For putting in fragments of other tests that can be invoked
4757 // for tracing without a lot of unwanted extra stuff happening.
4758 //
TestDebug(void)4759 void RBBITest::TestDebug(void) {
4760 UErrorCode status = U_ZERO_ERROR;
4761 LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4762 BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4763 if (!assertSuccess(WHERE, status, true)) {
4764 return;
4765 }
4766 const UnicodeString &rules = bi->getRules();
4767 UParseError pe;
4768 LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
4769 assertSuccess(WHERE, status);
4770 }
4771
4772
4773 //
4774 // TestDebugRules A stub test for use in debugging rule compilation problems.
4775 // Can be freely altered as needed or convenient.
//                    Leave disabled - #ifdef'ed out - when not actively debugging. The rule source
4777 // data files may not be available in all environments.
4778 // Any permanent test cases should be moved to rbbitst.txt
4779 // (see Bug 20303 in that file, for example), or to another test function in this file.
4780 //
TestDebugRules()4781 void RBBITest::TestDebugRules() {
4782 #if 0
4783 const char16_t *rules = u""
4784 "!!quoted_literals_only; \n"
4785 "!!chain; \n"
4786 "!!lookAheadHardBreak; \n"
4787 " \n"
4788 // "[a] / ; \n"
4789 "[a] [b] / [c] [d]; \n"
4790 "[a] [b] / [c] [d] {100}; \n"
4791 "[x] [a] [b] / [c] [d] {100}; \n"
4792 "[a] [b] [c] / [d] {100}; \n"
4793 //" [c] [d] / [e] [f]; \n"
4794 //"[a] [b] / [c]; \n"
4795 ;
4796
4797 UErrorCode status = U_ZERO_ERROR;
4798 CharString path(pathToDataDirectory(), status);
4799 path.appendPathPart("brkitr", status);
4800 path.appendPathPart("rules", status);
4801 path.appendPathPart("line.txt", status);
4802 int len;
4803 std::unique_ptr<UChar []> testFile(ReadAndConvertFile(path.data(), len, "UTF-8", status));
4804 if (!assertSuccess(WHERE, status)) {
4805 return;
4806 }
4807
4808 UParseError pe;
4809 // rules = testFile.get();
4810 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rules, pe, status);
4811
4812 if (!assertSuccess(WHERE, status)) {
4813 delete bi;
4814 return;
4815 }
4816 // bi->dumpTables();
4817
4818 delete bi;
4819 #endif
4820 }
4821
testTrieStateTable(int32_t numChar,bool expectedTrieWidthIn8Bits,bool expectedStateRowIn8Bits)4822 void RBBITest::testTrieStateTable(int32_t numChar, bool expectedTrieWidthIn8Bits, bool expectedStateRowIn8Bits) {
4823 UCPTrieValueWidth expectedTrieWidth = expectedTrieWidthIn8Bits ? UCPTRIE_VALUE_BITS_8 : UCPTRIE_VALUE_BITS_16;
4824 int32_t expectedStateRowBits = expectedStateRowIn8Bits ? RBBI_8BITS_ROWS : 0;
4825 // Text are duplicate characters from U+4E00 to U+4FFF
4826 UnicodeString text;
4827 for (UChar c = 0x4e00; c < 0x5000; c++) {
4828 text.append(c).append(c);
4829 }
4830 // Generate rule which will caused length+4 character classes and
4831 // length+3 states
4832 UnicodeString rules(u"!!quoted_literals_only;");
4833 for (UChar c = 0x4e00; c < 0x4e00 + numChar; c++) {
4834 rules.append(u'\'').append(c).append(c).append(u"';");
4835 }
4836 rules.append(u".;");
4837 UErrorCode status = U_ZERO_ERROR;
4838 UParseError parseError;
4839 RuleBasedBreakIterator bi(rules, parseError, status);
4840
4841 assertEquals(WHERE, numChar + 4, bi.fData->fHeader->fCatCount);
4842 assertEquals(WHERE, numChar + 3, bi.fData->fForwardTable->fNumStates);
4843 assertEquals(WHERE, expectedTrieWidth, ucptrie_getValueWidth(bi.fData->fTrie));
4844 assertEquals(WHERE, expectedStateRowBits, bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS);
4845 assertEquals(WHERE, expectedStateRowBits, bi.fData->fReverseTable->fFlags & RBBI_8BITS_ROWS);
4846
4847 bi.setText(text);
4848
4849 int32_t pos;
4850 int32_t i = 0;
4851 while ((pos = bi.next()) > 0) {
4852 // The first numChar should not break between the pair
4853 if (i++ < numChar) {
4854 assertEquals(WHERE, i * 2, pos);
4855 } else {
4856 // After the first numChar next(), break on each character.
4857 assertEquals(WHERE, i + numChar, pos);
4858 }
4859 }
4860 while ((pos = bi.previous()) > 0) {
4861 // The first numChar should not break between the pair
4862 if (--i < numChar) {
4863 assertEquals(WHERE, i * 2, pos);
4864 } else {
4865 // After the first numChar next(), break on each character.
4866 assertEquals(WHERE, i + numChar, pos);
4867 }
4868 }
4869 }
4870
Test8BitsTrieWith8BitStateTable()4871 void RBBITest::Test8BitsTrieWith8BitStateTable() {
4872 testTrieStateTable(251, true /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
4873 }
4874
Test16BitsTrieWith8BitStateTable()4875 void RBBITest::Test16BitsTrieWith8BitStateTable() {
4876 testTrieStateTable(252, false /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
4877 }
4878
Test16BitsTrieWith16BitStateTable()4879 void RBBITest::Test16BitsTrieWith16BitStateTable() {
4880 testTrieStateTable(253, false /* expectedTrieWidthIn8Bits */, false /* expectedStateRowIn8Bits */);
4881 }
4882
Test8BitsTrieWith16BitStateTable()4883 void RBBITest::Test8BitsTrieWith16BitStateTable() {
4884 // Test UCPTRIE_VALUE_BITS_8 with 16 bits rows. Use a different approach to
4885 // create state table in 16 bits.
4886
4887 // Generate 510 'a' as text
4888 UnicodeString text;
4889 for (int32_t i = 0; i < 510; i++) {
4890 text.append(u'a');
4891 }
4892
4893 UnicodeString rules(u"!!quoted_literals_only;'");
4894 // 254 'a' in the rule will cause 256 states
4895 for (int32_t i = 0; i < 254; i++) {
4896 rules.append(u'a');
4897 }
4898 rules.append(u"';.;");
4899
4900 UErrorCode status = U_ZERO_ERROR;
4901 UParseError parseError;
4902 LocalPointer<RuleBasedBreakIterator> bi(new RuleBasedBreakIterator(rules, parseError, status));
4903
4904 assertEquals(WHERE, 256, bi->fData->fForwardTable->fNumStates);
4905 assertEquals(WHERE, UCPTRIE_VALUE_BITS_8, ucptrie_getValueWidth(bi->fData->fTrie));
4906 assertEquals(WHERE,
4907 false, RBBI_8BITS_ROWS == (bi->fData->fForwardTable->fFlags & RBBI_8BITS_ROWS));
4908 bi->setText(text);
4909
4910 // break positions:
4911 // 254, 508, 509, ... 510
4912 assertEquals("next()", 254, bi->next());
4913 int32_t i = 0;
4914 int32_t pos;
4915 while ((pos = bi->next()) > 0) {
4916 assertEquals(WHERE, 508 + i , pos);
4917 i++;
4918 }
4919 i = 0;
4920 while ((pos = bi->previous()) > 0) {
4921 i++;
4922 if (pos >= 508) {
4923 assertEquals(WHERE, 510 - i , pos);
4924 } else {
4925 assertEquals(WHERE, 254 , pos);
4926 }
4927 }
4928 }
4929
4930 // Test that both compact (8 bit) and full sized (16 bit) rbbi tables work, and
4931 // that there are no problems with rules at the size that transitions between the two.
4932 //
4933 // A rule that matches a literal string, like 'abcdefghij', will require one state and
4934 // one character class per character in the string. So we can make a rule to tickle the
4935 // boundaries by using literal strings of various lengths.
4936 //
4937 // For both the number of states and the number of character classes, the eight bit format
4938 // only has 7 bits available, allowing for 128 values. For both, a few values are reserved,
4939 // leaving 120 something available. This test runs the string over the range of 120 - 130,
4940 // which allows some margin for changes to the number of values reserved by the rule builder
4941 // without breaking the test.
4942
TestTable_8_16_Bits()4943 void RBBITest::TestTable_8_16_Bits() {
4944
4945 // testStr serves as both the source of the rule string (truncated to the desired length)
4946 // and as test data to check matching behavior. A break rule consisting of the first 120
4947 // characters of testStr will match the first 120 chars of the full-length testStr.
4948 UnicodeString testStr;
4949 for (UChar c=0x3000; c<0x3200; ++c) {
4950 testStr.append(c);
4951 }
4952
4953 const int32_t startLength = 120; // The shortest rule string to test.
4954 const int32_t endLength = 260; // The longest rule string to test
4955 const int32_t increment = this->quick ? endLength - startLength : 1;
4956
4957 for (int32_t ruleLen=startLength; ruleLen <= endLength; ruleLen += increment) {
4958 UParseError parseError;
4959 UErrorCode status = U_ZERO_ERROR;
4960
4961 UnicodeString ruleString{u"!!quoted_literals_only; '#';"};
4962 ruleString.findAndReplace(UnicodeString(u"#"), UnicodeString(testStr, 0, ruleLen));
4963 RuleBasedBreakIterator bi(ruleString, parseError, status);
4964 if (!assertSuccess(WHERE, status)) {
4965 errln(ruleString);
4966 break;
4967 }
4968 // bi.dumpTables();
4969
4970 // Verify that the break iterator is functioning - that the first boundary found
4971 // in testStr is at the length of the rule string.
4972 bi.setText(testStr);
4973 assertEquals(WHERE, ruleLen, bi.next());
4974
4975 // Reverse iteration. Do a setText() first, to flush the break iterator's internal cache
4976 // of previously detected boundaries, thus forcing the engine to run the safe reverse rules.
4977 bi.setText(testStr);
4978 int32_t result = bi.preceding(ruleLen);
4979 assertEquals(WHERE, 0, result);
4980
4981 // Verify that the range of rule lengths being tested cover the translations
4982 // from 8 to 16 bit data.
4983 bool has8BitRowData = bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS;
4984 bool has8BitsTrie = ucptrie_getValueWidth(bi.fData->fTrie) == UCPTRIE_VALUE_BITS_8;
4985
4986 if (ruleLen == startLength) {
4987 assertEquals(WHERE, true, has8BitRowData);
4988 assertEquals(WHERE, true, has8BitsTrie);
4989 }
4990 if (ruleLen == endLength) {
4991 assertEquals(WHERE, false, has8BitRowData);
4992 assertEquals(WHERE, false, has8BitsTrie);
4993 }
4994 }
4995 }
4996
4997 /* Test handling of a large number of look-ahead rules.
4998 * The number of rules in the test exceeds the implementation limits prior to the
4999 * improvements introduced with #13590.
5000 *
5001 * The test look-ahead rules have the form "AB / CE"; "CD / EG"; ...
5002 * The text being matched is sequential, "ABCDEFGHI..."
5003 *
5004 * The upshot is that the look-ahead rules all match on their preceding context,
5005 * and consequently must save a potential result, but then fail to match on their
5006 * trailing context, so that they don't actually cause a boundary.
5007 *
5008 * Additionally, add a ".*" rule, so there are no boundaries unless a
5009 * look-ahead hard-break rule forces one.
5010 */
TestBug13590()5011 void RBBITest::TestBug13590() {
5012 UnicodeString rules {u"!!quoted_literals_only; !!chain; .*;\n"};
5013
5014 const int NUM_LOOKAHEAD_RULES = 50;
5015 const char16_t STARTING_CHAR = u'\u5000';
5016 char16_t firstChar;
5017 for (int ruleNum = 0; ruleNum < NUM_LOOKAHEAD_RULES; ++ruleNum) {
5018 firstChar = STARTING_CHAR + ruleNum*2;
5019 rules.append(u'\'') .append(firstChar) .append(firstChar+1) .append(u'\'')
5020 .append(u' ') .append(u'/') .append(u' ')
5021 .append(u'\'') .append(firstChar+2) .append(firstChar+4) .append(u'\'')
5022 .append(u';') .append(u'\n');
5023 }
5024
5025 // Change the last rule added from the form "UV / WY" to "UV / WX".
5026 // Changes the rule so that it will match - all 4 chars are in ascending sequence.
5027 rules.findAndReplace(UnicodeString(firstChar+4), UnicodeString(firstChar+3));
5028
5029 UErrorCode status = U_ZERO_ERROR;
5030 UParseError parseError;
5031 RuleBasedBreakIterator bi(rules, parseError, status);
5032 if (!assertSuccess(WHERE, status)) {
5033 errln(rules);
5034 return;
5035 }
5036 // bi.dumpTables();
5037
5038 UnicodeString testString;
5039 for (char16_t c = STARTING_CHAR-200; c < STARTING_CHAR + NUM_LOOKAHEAD_RULES*4; ++c) {
5040 testString.append(c);
5041 }
5042 bi.setText(testString);
5043
5044 int breaksFound = 0;
5045 while (bi.next() != UBRK_DONE) {
5046 ++breaksFound;
5047 }
5048
5049 // Two matches are expected, one from the last rule that was explicitly modified,
5050 // and one at the end of the text.
5051 assertEquals(WHERE, 2, breaksFound);
5052 }
5053
5054
5055 #if U_ENABLE_TRACING
// Records filled in by the trace callbacks below and examined by the trace tests.
// Function numbers seen by the entry callback, in call order.
static std::vector<int32_t>         gEntryFn;
// Function numbers seen by the exit callback, in call order.
static std::vector<int32_t>         gExitFn;
// Function numbers seen by the data callback, in call order.
static std::vector<int32_t>         gDataFn;
// String argument captured by each data callback; parallel to gDataFn.
static std::vector<std::string>     gData;
5060
traceData(const void *,int32_t fnNumber,int32_t,const char *,va_list args)5061 static void U_CALLCONV traceData(
5062 const void*,
5063 int32_t fnNumber,
5064 int32_t,
5065 const char *,
5066 va_list args) {
5067 if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5068 const char* data = va_arg(args, const char*);
5069 gDataFn.push_back(fnNumber);
5070 gData.push_back(data);
5071 }
5072 }
5073
traceEntry(const void *,int32_t fnNumber)5074 static void traceEntry(const void *, int32_t fnNumber) {
5075 if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5076 gEntryFn.push_back(fnNumber);
5077 }
5078 }
5079
traceExit(const void *,int32_t fnNumber,const char *,va_list)5080 static void traceExit(const void *, int32_t fnNumber, const char *, va_list) {
5081 if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5082 gExitFn.push_back(fnNumber);
5083 }
5084 }
5085
5086
assertTestTraceResult(int32_t fnNumber,const char * expectedData)5087 void RBBITest::assertTestTraceResult(int32_t fnNumber, const char* expectedData) {
5088 assertEquals("utrace_entry should be called ", 1, gEntryFn.size());
5089 assertEquals("utrace_entry should be called with ", fnNumber, gEntryFn[0]);
5090 assertEquals("utrace_exit should be called ", 1, gExitFn.size());
5091 assertEquals("utrace_exit should be called with ", fnNumber, gExitFn[0]);
5092
5093 if (expectedData == nullptr) {
5094 assertEquals("utrace_data should not be called ", 0, gDataFn.size());
5095 assertEquals("utrace_data should not be called ", 0, gData.size());
5096 } else {
5097 assertEquals("utrace_data should be called ", 1, gDataFn.size());
5098 assertEquals("utrace_data should be called with ", fnNumber, gDataFn[0]);
5099 assertEquals("utrace_data should be called ", 1, gData.size());
5100 assertEquals("utrace_data should pass in ", expectedData, gData[0].c_str());
5101 }
5102 }
5103
SetupTestTrace()5104 void SetupTestTrace() {
5105 gEntryFn.clear();
5106 gExitFn.clear();
5107 gDataFn.clear();
5108 gData.clear();
5109
5110 const void* context = nullptr;
5111 utrace_setFunctions(context, traceEntry, traceExit, traceData);
5112 utrace_setLevel(UTRACE_INFO);
5113 }
5114
TestTraceCreateCharacter(void)5115 void RBBITest::TestTraceCreateCharacter(void) {
5116 SetupTestTrace();
5117 IcuTestErrorCode status(*this, "TestTraceCreateCharacter");
5118 LocalPointer<BreakIterator> brkitr(
5119 BreakIterator::createCharacterInstance("zh-CN", status));
5120 status.errIfFailureAndReset();
5121 assertTestTraceResult(UTRACE_UBRK_CREATE_CHARACTER, nullptr);
5122 }
5123
TestTraceCreateTitle(void)5124 void RBBITest::TestTraceCreateTitle(void) {
5125 SetupTestTrace();
5126 IcuTestErrorCode status(*this, "TestTraceCreateTitle");
5127 LocalPointer<BreakIterator> brkitr(
5128 BreakIterator::createTitleInstance("zh-CN", status));
5129 status.errIfFailureAndReset();
5130 assertTestTraceResult(UTRACE_UBRK_CREATE_TITLE, nullptr);
5131 }
5132
TestTraceCreateSentence(void)5133 void RBBITest::TestTraceCreateSentence(void) {
5134 SetupTestTrace();
5135 IcuTestErrorCode status(*this, "TestTraceCreateSentence");
5136 LocalPointer<BreakIterator> brkitr(
5137 BreakIterator::createSentenceInstance("zh-CN", status));
5138 status.errIfFailureAndReset();
5139 assertTestTraceResult(UTRACE_UBRK_CREATE_SENTENCE, nullptr);
5140 }
5141
TestTraceCreateWord(void)5142 void RBBITest::TestTraceCreateWord(void) {
5143 SetupTestTrace();
5144 IcuTestErrorCode status(*this, "TestTraceCreateWord");
5145 LocalPointer<BreakIterator> brkitr(
5146 BreakIterator::createWordInstance("zh-CN", status));
5147 status.errIfFailureAndReset();
5148 assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5149 }
5150
TestTraceCreateLine(void)5151 void RBBITest::TestTraceCreateLine(void) {
5152 SetupTestTrace();
5153 IcuTestErrorCode status(*this, "TestTraceCreateLine");
5154 LocalPointer<BreakIterator> brkitr(
5155 BreakIterator::createLineInstance("zh-CN", status));
5156 status.errIfFailureAndReset();
5157 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line");
5158 }
5159
TestTraceCreateLineStrict(void)5160 void RBBITest::TestTraceCreateLineStrict(void) {
5161 SetupTestTrace();
5162 IcuTestErrorCode status(*this, "TestTraceCreateLineStrict");
5163 LocalPointer<BreakIterator> brkitr(
5164 BreakIterator::createLineInstance("zh-CN-u-lb-strict", status));
5165 status.errIfFailureAndReset();
5166 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_strict");
5167 }
5168
TestTraceCreateLineNormal(void)5169 void RBBITest::TestTraceCreateLineNormal(void) {
5170 SetupTestTrace();
5171 IcuTestErrorCode status(*this, "TestTraceCreateLineNormal");
5172 LocalPointer<BreakIterator> brkitr(
5173 BreakIterator::createLineInstance("zh-CN-u-lb-normal", status));
5174 status.errIfFailureAndReset();
5175 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_normal");
5176 }
5177
TestTraceCreateLineLoose(void)5178 void RBBITest::TestTraceCreateLineLoose(void) {
5179 SetupTestTrace();
5180 IcuTestErrorCode status(*this, "TestTraceCreateLineLoose");
5181 LocalPointer<BreakIterator> brkitr(
5182 BreakIterator::createLineInstance("zh-CN-u-lb-loose", status));
5183 status.errIfFailureAndReset();
5184 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_loose");
5185 }
5186
TestTraceCreateLineLoosePhrase(void)5187 void RBBITest::TestTraceCreateLineLoosePhrase(void) {
5188 SetupTestTrace();
5189 IcuTestErrorCode status(*this, "TestTraceCreateLineLoosePhrase");
5190 LocalPointer<BreakIterator> brkitr(
5191 BreakIterator::createLineInstance("ja-u-lb-loose-lw-phrase", status));
5192 status.errIfFailureAndReset();
5193 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_loose_phrase");
5194 }
5195
TestTraceCreateLineNormalPhrase(void)5196 void RBBITest::TestTraceCreateLineNormalPhrase(void) {
5197 SetupTestTrace();
5198 IcuTestErrorCode status(*this, "TestTraceCreateLineNormalPhrase");
5199 LocalPointer<BreakIterator> brkitr(
5200 BreakIterator::createLineInstance("ja-u-lb-normal-lw-phrase", status));
5201 status.errIfFailureAndReset();
5202 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_normal_phrase");
5203 }
5204
TestTraceCreateLineStrictPhrase(void)5205 void RBBITest::TestTraceCreateLineStrictPhrase(void) {
5206 SetupTestTrace();
5207 IcuTestErrorCode status(*this, "TestTraceCreateLineStrictPhrase");
5208 LocalPointer<BreakIterator> brkitr(
5209 BreakIterator::createLineInstance("ja-u-lb-strict-lw-phrase", status));
5210 status.errIfFailureAndReset();
5211 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_strict_phrase");
5212 }
5213
TestTraceCreateLinePhrase(void)5214 void RBBITest::TestTraceCreateLinePhrase(void) {
5215 SetupTestTrace();
5216 IcuTestErrorCode status(*this, "TestTraceCreateLinePhrase");
5217 LocalPointer<BreakIterator> brkitr(
5218 BreakIterator::createLineInstance("ja-u-lw-phrase", status));
5219 status.errIfFailureAndReset();
5220 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_phrase");
5221 }
5222
TestTraceCreateBreakEngine(void)5223 void RBBITest::TestTraceCreateBreakEngine(void) {
5224 rbbi_cleanup();
5225 SetupTestTrace();
5226 IcuTestErrorCode status(*this, "TestTraceCreateBreakEngine");
5227 LocalPointer<BreakIterator> brkitr(
5228 BreakIterator::createWordInstance("zh-CN", status));
5229 status.errIfFailureAndReset();
5230 assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5231
5232 // To word break the following text, BreakIterator will create 5 dictionary
5233 // break engine internally.
5234 brkitr->setText(
5235 u"test "
5236 u"測試 " // Hani
5237 u"សាកល្បង " // Khmr
5238 u"ທົດສອບ " // Laoo
5239 u"စမ်းသပ်မှု " // Mymr
5240 u"ทดสอบ " // Thai
5241 u"test "
5242 );
5243
5244 // Loop through all the text.
5245 while (brkitr->next() > 0) ;
5246
5247 assertEquals("utrace_entry should be called ", 6, gEntryFn.size());
5248 assertEquals("utrace_exit should be called ", 6, gExitFn.size());
5249 assertEquals("utrace_data should be called ", 5, gDataFn.size());
5250
5251 for (std::vector<int>::size_type i = 0; i < gDataFn.size(); i++) {
5252 assertEquals("utrace_entry should be called ",
5253 UTRACE_UBRK_CREATE_BREAK_ENGINE, gEntryFn[i+1]);
5254 assertEquals("utrace_exit should be called ",
5255 UTRACE_UBRK_CREATE_BREAK_ENGINE, gExitFn[i+1]);
5256 assertEquals("utrace_data should be called ",
5257 UTRACE_UBRK_CREATE_BREAK_ENGINE, gDataFn[i]);
5258 }
5259
5260 assertEquals("utrace_data should pass ", "Hani", gData[0].c_str());
5261 assertEquals("utrace_data should pass ", "Khmr", gData[1].c_str());
5262 assertEquals("utrace_data should pass ", "Laoo", gData[2].c_str());
5263 assertEquals("utrace_data should pass ", "Mymr", gData[3].c_str());
5264 assertEquals("utrace_data should pass ", "Thai", gData[4].c_str());
5265
5266 }
5267 #endif
5268
TestUnpairedSurrogate()5269 void RBBITest::TestUnpairedSurrogate() {
5270 UnicodeString rules(u"ab;");
5271
5272 UErrorCode status = U_ZERO_ERROR;
5273 UParseError pe;
5274 RuleBasedBreakIterator bi1(rules, pe, status);
5275 assertSuccess(WHERE, status);
5276 UnicodeString rtRules = bi1.getRules();
5277 // make sure the simple one work first.
5278 assertEquals(WHERE, rules, rtRules);
5279
5280
5281 rules = UnicodeString(u"a\\ud800b;").unescape();
5282 pe.line = 0;
5283 pe.offset = 0;
5284 RuleBasedBreakIterator bi2(rules, pe, status);
5285 assertEquals(WHERE "unpaired lead surrogate", U_ILLEGAL_CHAR_FOUND , status);
5286 if (pe.line != 1 || pe.offset != 1) {
5287 errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
5288 }
5289
5290 status = U_ZERO_ERROR;
5291 rules = UnicodeString(u"a\\ude00b;").unescape();
5292 pe.line = 0;
5293 pe.offset = 0;
5294 RuleBasedBreakIterator bi3(rules, pe, status);
5295 assertEquals(WHERE "unpaired tail surrogate", U_ILLEGAL_CHAR_FOUND , status);
5296 if (pe.line != 1 || pe.offset != 1) {
5297 errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
5298 }
5299
5300 // make sure the surrogate one work too.
5301 status = U_ZERO_ERROR;
5302 rules = UnicodeString(u"a��b;");
5303 RuleBasedBreakIterator bi4(rules, pe, status);
5304 rtRules = bi4.getRules();
5305 assertEquals(WHERE, rules, rtRules);
5306 }
5307
5308 // Read file generated by
5309 // https://github.com/unicode-org/lstm_word_segmentation/blob/master/segment_text.py
5310 // as test cases and compare the Output.
5311 // Format of the file
5312 // Model:\t[Model Name (such as 'Thai_graphclust_model4_heavy')]
5313 // Embedding:\t[Embedding type (such as 'grapheme_clusters_tf')]
5314 // Input:\t[source text]
5315 // Output:\t[expected output separated by | ]
5316 // Input: ...
5317 // Output: ...
5318
runLSTMTestFromFile(const char * filename,UScriptCode script)5319 void RBBITest::runLSTMTestFromFile(const char* filename, UScriptCode script) {
5320 // The expectation in this test depends on LSTM, skip the test if the
5321 // configuration is not build with LSTM data.
5322 if (skipLSTMTest()) {
5323 return;
5324 }
5325 UErrorCode status = U_ZERO_ERROR;
5326 LocalPointer<BreakIterator> iterator(BreakIterator::createWordInstance(Locale(), status));
5327 if (U_FAILURE(status)) {
5328 errln("%s:%d Error %s Cannot create Word BreakIterator", __FILE__, __LINE__, u_errorName(status));
5329 return;
5330 }
5331 // Open and read the test data file.
5332 const char *testDataDirectory = IntlTest::getSourceTestData(status);
5333 CharString testFileName(testDataDirectory, -1, status);
5334 testFileName.append(filename, -1, status);
5335
5336 int len;
5337 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
5338 if (U_FAILURE(status)) {
5339 errln("%s:%d Error %s opening test file %s", __FILE__, __LINE__, u_errorName(status), filename);
5340 return;
5341 }
5342
5343 // Put the test data into a UnicodeString
5344 UnicodeString testString(false, testFile, len);
5345
5346 int32_t start = 0;
5347
5348 UnicodeString line;
5349 int32_t end;
5350 std::string actual_sep_str;
5351 int32_t caseNum = 0;
5352 // Iterate through all the lines in the test file.
5353 do {
5354 int32_t cr = testString.indexOf(u'\r', start);
5355 int32_t lf = testString.indexOf(u'\n', start);
5356 end = cr >= 0 ? (lf >= 0 ? std::min(cr, lf) : cr) : lf;
5357 line = testString.tempSubString(start, end < 0 ? INT32_MAX : end - start);
5358 if (line.length() > 0) {
5359 // Separate each line to key and value by TAB.
5360 int32_t tab = line.indexOf(u'\t');
5361 UnicodeString key = line.tempSubString(0, tab);
5362 const UnicodeString value = line.tempSubString(tab+1);
5363
5364 if (key == "Model:") {
5365 // Verify the expectation in the test file match the LSTM model
5366 // we are using now.
5367 const LSTMData* data = CreateLSTMDataForScript(script, status);
5368 if (U_FAILURE(status)) {
5369 dataerrln("%s:%d Error %s Cannot create LSTM data for script %s",
5370 __FILE__, __LINE__, u_errorName(status), uscript_getName(script));
5371 return;
5372 }
5373 UnicodeString name(LSTMDataName(data));
5374 DeleteLSTMData(data);
5375 if (value != name) {
5376 std::string utf8Name, utf8Value;
5377 dataerrln("%s:%d Error %s The LSTM data for script %s is %s instead of %s",
5378 __FILE__, __LINE__, u_errorName(status), uscript_getName(script),
5379 name.toUTF8String<std::string>(utf8Name).c_str(),
5380 value.toUTF8String<std::string>(utf8Value).c_str());
5381 return;
5382 }
5383 } else if (key == "Input:") {
5384 UnicodeString input("prefix ");
5385 input += value + " suffix";
5386 std::stringstream ss;
5387
5388 // Construct the UText which is expected by the the engine as
5389 // input from the UnicodeString.
5390 UText ut = UTEXT_INITIALIZER;
5391 utext_openConstUnicodeString(&ut, &input, &status);
5392 if (U_FAILURE(status)) {
5393 dataerrln("Could not utext_openConstUnicodeString for " + value + UnicodeString(u_errorName(status)));
5394 return;
5395 }
5396
5397 iterator->setText(&ut, status);
5398 if (U_FAILURE(status)) {
5399 errln("%s:%d Error %s Could not setText to BreakIterator", __FILE__, __LINE__, u_errorName(status));
5400 return;
5401 }
5402
5403 int32_t bp;
5404 for (bp = iterator->first(); bp != BreakIterator::DONE; bp = iterator->next()) {
5405 ss << bp;
5406 if (bp != input.length()) {
5407 ss << ", ";
5408 }
5409 }
5410
5411 utext_close(&ut);
5412 // Turn the break points into a string for easy comparison
5413 // output.
5414 actual_sep_str = "{" + ss.str() + "}";
5415 } else if (key == "Output:" && !actual_sep_str.empty()) {
5416 UnicodeString input("prefix| |");
5417 input += value + "| |suffix";
5418 std::string d;
5419 int32_t sep;
5420 int32_t start = 0;
5421 int32_t curr = 0;
5422 std::stringstream ss;
5423 // Include 0 as the break point.
5424 ss << "0, ";
5425 while ((sep = input.indexOf(u'|', start)) >= 0) {
5426 int32_t len = sep - start;
5427 if (len > 0) {
5428 if (curr > 0) {
5429 ss << ", ";
5430 }
5431 curr += len;
5432 ss << curr;
5433 }
5434 start = sep + 1;
5435 }
5436 // Include end of the string as break point.
5437 ss << ", " << curr + input.length() - start;
5438 // Turn the break points into a string for easy comparison
5439 // output.
5440 std::string expected = "{" + ss.str() + "}";
5441 std::string utf8;
5442
5443 assertEquals((input + " Test Case#" + caseNum).toUTF8String<std::string>(utf8).c_str(),
5444 expected.c_str(), actual_sep_str.c_str());
5445 actual_sep_str.clear();
5446 }
5447 }
5448 start = std::max(cr, lf) + 1;
5449 } while (end >= 0);
5450
5451 delete [] testFile;
5452 }
5453
TestLSTMThai()5454 void RBBITest::TestLSTMThai() {
5455 runLSTMTestFromFile("Thai_graphclust_model4_heavy_Test.txt", USCRIPT_THAI);
5456 }
5457
TestLSTMBurmese()5458 void RBBITest::TestLSTMBurmese() {
5459 runLSTMTestFromFile("Burmese_graphclust_model5_heavy_Test.txt", USCRIPT_MYANMAR);
5460 }
5461
5462
5463 // Test preceding(index) and following(index), with semi-random indexes.
5464 // The random indexes are produced in clusters that are relatively closely spaced,
5465 // to increase the occurrences of hits to the internal break cache.
5466
TestRandomAccess()5467 void RBBITest::TestRandomAccess() {
5468 static constexpr int32_t CACHE_SIZE = 128;
5469
5470 UnicodeString testData;
5471 for (int i=0; i<CACHE_SIZE*2; ++i) {
5472 testData.append(u"aaaa\n");
5473 }
5474
5475 UErrorCode status = U_ZERO_ERROR;
5476 LocalPointer<RuleBasedBreakIterator> bi(
5477 (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status),
5478 status);
5479 if (!assertSuccess(WHERE, status)) { return; };
5480
5481 bi->setText(testData);
5482
5483 auto expectedPreceding = [](int from) {
5484 if (from == 0) {return UBRK_DONE;}
5485 if (from % 5 == 0) {return from - 5;}
5486 return from - (from % 5);
5487 };
5488
5489 auto expectedFollow = [testData](int from) {
5490 if (from >= testData.length()) {return UBRK_DONE;}
5491 if (from % 5 == 0) {return from + 5;}
5492 return from + (5 - (from % 5));
5493 };
5494
5495 auto randomStringIndex = [testData]() {
5496 static icu_rand randomGenerator; // produces random uint32_t values.
5497 static int lastNum;
5498 static int clusterCount;
5499 static constexpr int CLUSTER_SIZE = 100;
5500 static constexpr int CLUSTER_LENGTH = 10;
5501
5502 if (clusterCount < CLUSTER_LENGTH) {
5503 ++clusterCount;
5504 lastNum += (randomGenerator() % CLUSTER_SIZE);
5505 lastNum -= CLUSTER_SIZE / 2;
5506 lastNum = std::max(0, lastNum);
5507 // Deliberately test indexes > testData.length.
5508 lastNum = std::min(testData.length() + 5, lastNum);
5509 } else {
5510 clusterCount = 0;
5511 lastNum = randomGenerator() % testData.length();
5512 }
5513 return lastNum;
5514 };
5515
5516 for (int i=0; i<5000; ++i) {
5517 int idx = randomStringIndex();
5518 assertEquals(WHERE, expectedFollow(idx), bi->following(idx));
5519 idx = randomStringIndex();
5520 assertEquals(WHERE, expectedPreceding(idx), bi->preceding(idx));
5521 }
5522 }
5523
5524 #endif // #if !UCONFIG_NO_BREAK_ITERATION
5525