1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 1999-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8 /************************************************************************
9 * Date Name Description
10 * 12/15/99 Madhu Creation.
11 * 01/12/2000 Madhu Updated for changed API and added new tests
12 ************************************************************************/
13
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_BREAK_ITERATION
16
17 #include <algorithm>
18 #include <sstream>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <utility>
23 #include <vector>
24
25 #include "unicode/brkiter.h"
26 #include "unicode/localpointer.h"
27 #include "unicode/numfmt.h"
28 #include "unicode/rbbi.h"
29 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
30 #include "unicode/regex.h"
31 #endif
32 #include "unicode/schriter.h"
33 #include "unicode/uchar.h"
34 #include "unicode/utf16.h"
35 #include "unicode/ucnv.h"
36 #include "unicode/uniset.h"
37 #include "unicode/uscript.h"
38 #include "unicode/ustring.h"
39 #include "unicode/utext.h"
40 #include "unicode/utrace.h"
41
42 #include "charstr.h"
43 #include "cmemory.h"
44 #include "cstr.h"
45 #include "intltest.h"
46 #include "lstmbe.h"
47 #include "rbbitst.h"
48 #include "rbbidata.h"
49 #include "utypeinfo.h" // for 'typeid' to work
50 #include "uvector.h"
51 #include "uvectr32.h"
52
53
54 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
55 #include "unicode/filteredbrk.h"
56 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
57
// Non-fatal assertion for use inside test methods.
// When the condition `x` evaluates to false, a failure naming the source file
// and line of the check is logged through errln(). The macro contains no
// return statement, so the statements following it still run.
#define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
} UPRV_BLOCK_MACRO_END
63
// Non-fatal check of an ICU error code.
// When `errcode` is a failure code, the failure is logged through errcheckln()
// together with the source file, the line and the symbolic name of the error.
// The macro does not return from the caller; callers that must stop on
// failure test U_FAILURE(errcode) themselves afterwards.
#define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
    } \
} UPRV_BLOCK_MACRO_END
69
// Reports a monkey-test failure through IntlTest::gTest.
//   msg            text describing what went wrong (printed with %s)
//   fRuleFileName  printed as the "type=" value of the reproduction parameters
//   index          position at which the problem was seen (printed with %d)
//   seed           printed as the "seed=" value (printed with %u)
// Note: unlike the TEST_ASSERT macros, this one expands to a plain brace block
// rather than a UPRV_BLOCK_MACRO_BEGIN/END pair.
#define MONKEY_ERROR(msg, fRuleFileName, index, seed) { \
    IntlTest::gTest->errln("\n%s:%d %s at index %d. Parameters to reproduce: @\"type=%s seed=%u loop=1\"", \
                           __FILE__, __LINE__, msg, index, fRuleFileName, seed); \
}
74
75 //---------------------------------------------
76 // runIndexedTest
77 //---------------------------------------------
78
79
// Note: Before adding new tests to this file, check whether the desired test data can
// simply be added to the file testdata/rbbitst.txt. In most cases it can,
// it's much less work than writing a new test, diagnostic output in the event of failures
// is good, and the test data file is shared with ICU4J, so eventually the test
// will run there as well, without additional effort.
85
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)86 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
87 {
88 if (exec) logln("TestSuite RuleBasedBreakIterator: ");
89 fTestParams = params;
90
91 TESTCASE_AUTO_BEGIN;
92 #if !UCONFIG_NO_FILE_IO
93 TESTCASE_AUTO(TestBug4153072);
94 #endif
95 #if !UCONFIG_NO_FILE_IO
96 TESTCASE_AUTO(TestUnicodeFiles);
97 #endif
98 TESTCASE_AUTO(TestGetAvailableLocales);
99 TESTCASE_AUTO(TestGetDisplayName);
100 #if !UCONFIG_NO_FILE_IO
101 TESTCASE_AUTO(TestEndBehaviour);
102 TESTCASE_AUTO(TestWordBreaks);
103 TESTCASE_AUTO(TestWordBoundary);
104 TESTCASE_AUTO(TestLineBreaks);
105 TESTCASE_AUTO(TestSentBreaks);
106 TESTCASE_AUTO(TestExtended);
107 #endif
108 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
109 TESTCASE_AUTO(TestMonkey);
110 #endif
111 #if !UCONFIG_NO_FILE_IO
112 TESTCASE_AUTO(TestBug3818);
113 #endif
114 TESTCASE_AUTO(TestDebug);
115 #if !UCONFIG_NO_FILE_IO
116 TESTCASE_AUTO(TestBug5775);
117 #endif
118 TESTCASE_AUTO(TestBug9983);
119 TESTCASE_AUTO(TestDictRules);
120 TESTCASE_AUTO(TestBug5532);
121 TESTCASE_AUTO(TestBug7547);
122 TESTCASE_AUTO(TestBug12797);
123 TESTCASE_AUTO(TestBug12918);
124 TESTCASE_AUTO(TestBug12932);
125 TESTCASE_AUTO(TestEmoji);
126 TESTCASE_AUTO(TestBug12519);
127 TESTCASE_AUTO(TestBug12677);
128 TESTCASE_AUTO(TestTableRedundancies);
129 TESTCASE_AUTO(TestBug13447);
130 TESTCASE_AUTO(TestReverse);
131 TESTCASE_AUTO(TestBug13692);
132 TESTCASE_AUTO(TestDebugRules);
133 TESTCASE_AUTO(Test8BitsTrieWith8BitStateTable);
134 TESTCASE_AUTO(Test8BitsTrieWith16BitStateTable);
135 TESTCASE_AUTO(Test16BitsTrieWith8BitStateTable);
136 TESTCASE_AUTO(Test16BitsTrieWith16BitStateTable);
137 TESTCASE_AUTO(TestTable_8_16_Bits);
138 TESTCASE_AUTO(TestBug13590);
139 TESTCASE_AUTO(TestUnpairedSurrogate);
140 TESTCASE_AUTO(TestLSTMThai);
141 TESTCASE_AUTO(TestLSTMBurmese);
142
143 #if U_ENABLE_TRACING
144 TESTCASE_AUTO(TestTraceCreateCharacter);
145 TESTCASE_AUTO(TestTraceCreateWord);
146 TESTCASE_AUTO(TestTraceCreateSentence);
147 TESTCASE_AUTO(TestTraceCreateTitle);
148 TESTCASE_AUTO(TestTraceCreateLine);
149 TESTCASE_AUTO(TestTraceCreateLineNormal);
150 TESTCASE_AUTO(TestTraceCreateLineLoose);
151 TESTCASE_AUTO(TestTraceCreateLineStrict);
152 TESTCASE_AUTO(TestTraceCreateLineNormalPhrase);
153 TESTCASE_AUTO(TestTraceCreateLineLoosePhrase);
154 TESTCASE_AUTO(TestTraceCreateLineStrictPhrase);
155 TESTCASE_AUTO(TestTraceCreateLinePhrase);
156 TESTCASE_AUTO(TestTraceCreateBreakEngine);
157 #endif
158
159 TESTCASE_AUTO_END;
160 }
161
162
163 //--------------------------------------------------------------------------------------
164 //
165 // RBBITest constructor and destructor
166 //
167 //--------------------------------------------------------------------------------------
168
RBBITest()169 RBBITest::RBBITest() {
170 fTestParams = NULL;
171 }
172
173
~RBBITest()174 RBBITest::~RBBITest() {
175 }
176
177
printStringBreaks(UText * tstr,int expected[],int expectedCount)178 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
179 UErrorCode status = U_ZERO_ERROR;
180 char name[100];
181 printf("code alpha extend alphanum type word sent line name\n");
182 int nextExpectedIndex = 0;
183 utext_setNativeIndex(tstr, 0);
184 for (int j = 0; j < static_cast<int>(utext_nativeLength(tstr)); j=static_cast<int>(utext_getNativeIndex(tstr))) {
185 if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
186 printf("------------------------------------------------ %d\n", j);
187 ++nextExpectedIndex;
188 }
189
190 UChar32 c = utext_next32(tstr);
191 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
192 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
193 u_isUAlphabetic(c),
194 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
195 u_isalnum(c),
196 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
197 u_charType(c),
198 U_SHORT_PROPERTY_NAME),
199 u_getPropertyValueName(UCHAR_WORD_BREAK,
200 u_getIntPropertyValue(c,
201 UCHAR_WORD_BREAK),
202 U_SHORT_PROPERTY_NAME),
203 u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
204 u_getIntPropertyValue(c,
205 UCHAR_SENTENCE_BREAK),
206 U_SHORT_PROPERTY_NAME),
207 u_getPropertyValueName(UCHAR_LINE_BREAK,
208 u_getIntPropertyValue(c,
209 UCHAR_LINE_BREAK),
210 U_SHORT_PROPERTY_NAME),
211 name);
212 }
213 }
214
215
printStringBreaks(const UnicodeString & ustr,int expected[],int expectedCount)216 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
217 UErrorCode status = U_ZERO_ERROR;
218 UText *tstr = NULL;
219 tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
220 if (U_FAILURE(status)) {
221 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
222 return;
223 }
224 printStringBreaks(tstr, expected, expectedCount);
225 utext_close(tstr);
226 }
227
228
TestBug3818()229 void RBBITest::TestBug3818() {
230 UErrorCode status = U_ZERO_ERROR;
231
232 // Four Thai words...
233 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
234 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
235 UnicodeString thaiStr(thaiWordData);
236
237 BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
238 if (U_FAILURE(status) || bi == NULL) {
239 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
240 return;
241 }
242 bi->setText(thaiStr);
243
244 int32_t startOfSecondWord = bi->following(1);
245 if (startOfSecondWord != 4) {
246 errln("Fail at file %s, line %d expected start of word at 4, got %d",
247 __FILE__, __LINE__, startOfSecondWord);
248 }
249 startOfSecondWord = bi->following(0);
250 if (startOfSecondWord != 4) {
251 errln("Fail at file %s, line %d expected start of word at 4, got %d",
252 __FILE__, __LINE__, startOfSecondWord);
253 }
254 delete bi;
255 }
256
257
258 //---------------------------------------------
259 //
260 // other tests
261 //
262 //---------------------------------------------
263
TestGetAvailableLocales()264 void RBBITest::TestGetAvailableLocales()
265 {
266 int32_t locCount = 0;
267 const Locale* locList = BreakIterator::getAvailableLocales(locCount);
268
269 if (locCount == 0)
270 dataerrln("getAvailableLocales() returned an empty list!");
271 // Just make sure that it's returning good memory.
272 int32_t i;
273 for (i = 0; i < locCount; ++i) {
274 logln(locList[i].getName());
275 }
276 }
277
278 //Testing the BreakIterator::getDisplayName() function
TestGetDisplayName()279 void RBBITest::TestGetDisplayName()
280 {
281 UnicodeString result;
282
283 BreakIterator::getDisplayName(Locale::getUS(), result);
284 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
285 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
286 + result);
287
288 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
289 if (result != "French (France)")
290 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
291 + result);
292 }
293 /**
294 * Test End Behaviour
295 * @bug 4068137
296 */
TestEndBehaviour()297 void RBBITest::TestEndBehaviour()
298 {
299 UErrorCode status = U_ZERO_ERROR;
300 UnicodeString testString("boo.");
301 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
302 if (U_FAILURE(status))
303 {
304 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
305 return;
306 }
307 wb->setText(testString);
308
309 if (wb->first() != 0)
310 errln("Didn't get break at beginning of string.");
311 if (wb->next() != 3)
312 errln("Didn't get break before period in \"boo.\"");
313 if (wb->current() != 4 && wb->next() != 4)
314 errln("Didn't get break at end of string.");
315 delete wb;
316 }
317 /*
318 * @bug 4153072
319 */
TestBug4153072()320 void RBBITest::TestBug4153072() {
321 UErrorCode status = U_ZERO_ERROR;
322 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
323 if (U_FAILURE(status))
324 {
325 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
326 return;
327 }
328 UnicodeString str("...Hello, World!...");
329 int32_t begin = 3;
330 int32_t end = str.length() - 3;
331 UBool onBoundary;
332
333 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
334 iter->adoptText(textIterator);
335 int index;
336 // Note: with the switch to UText, there is no way to restrict the
337 // iteration range to begin at an index other than zero.
338 // String character iterators created with a non-zero bound are
339 // treated by RBBI as being empty.
340 for (index = -1; index < begin + 1; ++index) {
341 onBoundary = iter->isBoundary(index);
342 if (index == 0? !onBoundary : onBoundary) {
343 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
344 " and begin index = " + begin);
345 }
346 }
347 delete iter;
348 }
349
350
351 //
352 // Test for problem reported by Ashok Matoria on 9 July 2007
353 // One.<kSoftHyphen><kSpace>Two.
354 //
355 // Sentence break at start (0) and then on calling next() it breaks at
356 // 'T' of "Two". Now, at this point if I do next() and
357 // then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
358 //
TestBug5775()359 void RBBITest::TestBug5775() {
360 UErrorCode status = U_ZERO_ERROR;
361 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
362 TEST_ASSERT_SUCCESS(status);
363 if (U_FAILURE(status)) {
364 return;
365 }
366 // Check for status first for better handling of no data errors.
367 TEST_ASSERT(bi != NULL);
368 if (bi == NULL) {
369 return;
370 }
371
372 UnicodeString s("One.\\u00ad Two.", -1, US_INV);
373 // 01234 56789
374 s = s.unescape();
375 bi->setText(s);
376 int pos = bi->next();
377 TEST_ASSERT(pos == 6);
378 pos = bi->next();
379 TEST_ASSERT(pos == 10);
380 pos = bi->previous();
381 TEST_ASSERT(pos == 6);
382 delete bi;
383 }
384
385
386
387 //------------------------------------------------------------------------------
388 //
389 // RBBITest::Extended Run RBBI Tests from an external test data file
390 //
391 //------------------------------------------------------------------------------
392
393 struct TestParams {
394 BreakIterator *bi; // Break iterator is set while parsing test source.
395 // Changed out whenever test data changes break type.
396
397 UnicodeString dataToBreak; // Data that is built up while parsing the test.
398 UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString.
399 UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak.
400 UVector32 *srcCol;
401
402 UText *textToBreak; // UText, could be UTF8 or UTF16.
403 UVector32 *textMap; // Map from UTF-16 dataToBreak offsets to UText offsets.
404 CharString utf8String; // UTF-8 form of text to break.
405
TestParamsTestParams406 TestParams(UErrorCode &status) : dataToBreak() {
407 bi = NULL;
408 expectedBreaks = new UVector32(status);
409 srcLine = new UVector32(status);
410 srcCol = new UVector32(status);
411 textToBreak = NULL;
412 textMap = new UVector32(status);
413 }
414
~TestParamsTestParams415 ~TestParams() {
416 delete bi;
417 delete expectedBreaks;
418 delete srcLine;
419 delete srcCol;
420 utext_close(textToBreak);
421 delete textMap;
422 }
423
424 int32_t getSrcLine(int32_t bp);
425 int32_t getExpectedBreak(int32_t bp);
426 int32_t getSrcCol(int32_t bp);
427
428 void setUTF16(UErrorCode &status);
429 void setUTF8(UErrorCode &status);
430 };
431
432 // Append a UnicodeString to a CharString with UTF-8 encoding.
433 // Substitute any invalid chars.
434 // Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
CharStringAppend(CharString & dest,const UnicodeString & src,UErrorCode & status)435 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
436 if (U_FAILURE(status)) {
437 return;
438 }
439 int32_t utf8Length;
440 u_strToUTF8WithSub(NULL, 0, &utf8Length, // Output Buffer, NULL for preflight.
441 src.getBuffer(), src.length(), // UTF-16 data
442 0xfffd, NULL, // Substitution char, number of subs.
443 &status);
444 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
445 return;
446 }
447 status = U_ZERO_ERROR;
448 int32_t capacity;
449 char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
450 u_strToUTF8WithSub(buffer, utf8Length, NULL,
451 src.getBuffer(), src.length(),
452 0xfffd, NULL, &status);
453 dest.append(buffer, utf8Length, status);
454 }
455
456
setUTF16(UErrorCode & status)457 void TestParams::setUTF16(UErrorCode &status) {
458 textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
459 textMap->removeAllElements();
460 for (int32_t i=0; i<dataToBreak.length(); i++) {
461 if (i == dataToBreak.getChar32Start(i)) {
462 textMap->addElement(i, status);
463 } else {
464 textMap->addElement(-1, status);
465 }
466 }
467 textMap->addElement(dataToBreak.length(), status);
468 U_ASSERT(dataToBreak.length() + 1 == textMap->size());
469 }
470
471
setUTF8(UErrorCode & status)472 void TestParams::setUTF8(UErrorCode &status) {
473 if (U_FAILURE(status)) {
474 return;
475 }
476 utf8String.clear();
477 CharStringAppend(utf8String, dataToBreak, status);
478 textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
479 if (U_FAILURE(status)) {
480 return;
481 }
482
483 textMap->removeAllElements();
484 int32_t utf16Index = 0;
485 for (;;) {
486 textMap->addElement(utf16Index, status);
487 UChar32 c32 = utext_current32(textToBreak);
488 if (c32 < 0) {
489 break;
490 }
491 utf16Index += U16_LENGTH(c32);
492 utext_next32(textToBreak);
493 while (textMap->size() < utext_getNativeIndex(textToBreak)) {
494 textMap->addElement(-1, status);
495 }
496 }
497 U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
498 }
499
500
getSrcLine(int32_t bp)501 int32_t TestParams::getSrcLine(int32_t bp) {
502 if (bp >= textMap->size()) {
503 bp = textMap->size() - 1;
504 }
505 int32_t i = 0;
506 for(; bp >= 0 ; --bp) {
507 // Move to a character boundary if we are not on one already.
508 i = textMap->elementAti(bp);
509 if (i >= 0) {
510 break;
511 }
512 }
513 return srcLine->elementAti(i);
514 }
515
516
getExpectedBreak(int32_t bp)517 int32_t TestParams::getExpectedBreak(int32_t bp) {
518 if (bp >= textMap->size()) {
519 return 0;
520 }
521 int32_t i = textMap->elementAti(bp);
522 int32_t retVal = 0;
523 if (i >= 0) {
524 retVal = expectedBreaks->elementAti(i);
525 }
526 return retVal;
527 }
528
529
getSrcCol(int32_t bp)530 int32_t TestParams::getSrcCol(int32_t bp) {
531 if (bp >= textMap->size()) {
532 bp = textMap->size() - 1;
533 }
534 int32_t i = 0;
535 for(; bp >= 0; --bp) {
536 // Move bp to a character boundary if we are not on one already.
537 i = textMap->elementAti(bp);
538 if (i >= 0) {
539 break;
540 }
541 }
542 return srcCol->elementAti(i);
543 }
544
545
executeTest(TestParams * t,UErrorCode & status)546 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
547 int32_t bp;
548 int32_t prevBP;
549 int32_t i;
550
551 TEST_ASSERT_SUCCESS(status);
552 if (U_FAILURE(status)) {
553 return;
554 }
555
556 if (t->bi == NULL) {
557 return;
558 }
559
560 t->bi->setText(t->textToBreak, status);
561 //
562 // Run the iterator forward
563 //
564 prevBP = -1;
565 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
566 if (prevBP == bp) {
567 // Fail for lack of forward progress.
568 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
569 bp, t->getSrcLine(bp), t->getSrcCol(bp));
570 break;
571 }
572
573 // Check that there we didn't miss an expected break between the last one
574 // and this one.
575 for (i=prevBP+1; i<bp; i++) {
576 if (t->getExpectedBreak(i) != 0) {
577 int expected[] = {0, i};
578 printStringBreaks(t->dataToBreak, expected, 2);
579 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
580 i, t->getSrcLine(i), t->getSrcCol(i));
581 }
582 }
583
584 // Check that the break we did find was expected
585 if (t->getExpectedBreak(bp) == 0) {
586 int expected[] = {0, bp};
587 printStringBreaks(t->textToBreak, expected, 2);
588 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
589 bp, t->getSrcLine(bp), t->getSrcCol(bp));
590 } else {
591 // The break was expected.
592 // Check that the {nnn} tag value is correct.
593 int32_t expectedTagVal = t->getExpectedBreak(bp);
594 if (expectedTagVal == -1) {
595 expectedTagVal = 0;
596 }
597 int32_t line = t->getSrcLine(bp);
598 int32_t rs = t->bi->getRuleStatus();
599 if (rs != expectedTagVal) {
600 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
601 " Actual, Expected status = %4d, %4d",
602 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
603 }
604 }
605
606 prevBP = bp;
607 }
608
609 // Verify that there were no missed expected breaks after the last one found
610 for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
611 if (t->getExpectedBreak(i) != 0) {
612 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
613 i, t->getSrcLine(i), t->getSrcCol(i));
614 }
615 }
616
617 //
618 // Run the iterator backwards, verify that the same breaks are found.
619 //
620 prevBP = static_cast<int32_t>(utext_nativeLength(t->textToBreak) + 2); // start with a phony value for the last break pos seen.
621 bp = t->bi->last();
622 while (bp != BreakIterator::DONE) {
623 if (prevBP == bp) {
624 // Fail for lack of progress.
625 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
626 bp, t->getSrcLine(bp), t->getSrcCol(bp));
627 break;
628 }
629
630 // Check that we didn't miss an expected break between the last one
631 // and this one. (UVector returns zeros for index out of bounds.)
632 for (i=prevBP-1; i>bp; i--) {
633 if (t->getExpectedBreak(i) != 0) {
634 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
635 i, t->getSrcLine(i), t->getSrcCol(i));
636 }
637 }
638
639 // Check that the break we did find was expected
640 if (t->getExpectedBreak(bp) == 0) {
641 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
642 bp, t->getSrcLine(bp), t->getSrcCol(bp));
643 } else {
644 // The break was expected.
645 // Check that the {nnn} tag value is correct.
646 int32_t expectedTagVal = t->getExpectedBreak(bp);
647 if (expectedTagVal == -1) {
648 expectedTagVal = 0;
649 }
650 int line = t->getSrcLine(bp);
651 int32_t rs = t->bi->getRuleStatus();
652 if (rs != expectedTagVal) {
653 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
654 " Actual, Expected status = %4d, %4d",
655 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
656 }
657 }
658
659 prevBP = bp;
660 bp = t->bi->previous();
661 }
662
663 // Verify that there were no missed breaks prior to the last one found
664 for (i=prevBP-1; i>=0; i--) {
665 if (t->getExpectedBreak(i) != 0) {
666 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
667 i, t->getSrcLine(i), t->getSrcCol(i));
668 }
669 }
670
671 // Check isBoundary()
672 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
673 UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
674 UBool boundaryFound = t->bi->isBoundary(i);
675 if (boundaryExpected != boundaryFound) {
676 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
677 " Expected, Actual= %s, %s",
678 i, t->getSrcLine(i), t->getSrcCol(i),
679 boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
680 }
681 }
682
683 // Check following()
684 for (i=0; i < static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i++) {
685 int32_t actualBreak = t->bi->following(i);
686 int32_t expectedBreak = BreakIterator::DONE;
687 for (int32_t j=i+1; j <= static_cast<int32_t>(utext_nativeLength(t->textToBreak)); j++) {
688 if (t->getExpectedBreak(j) != 0) {
689 expectedBreak = j;
690 break;
691 }
692 }
693 if (expectedBreak != actualBreak) {
694 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
695 " Expected, Actual= %d, %d",
696 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
697 }
698 }
699
700 // Check preceding()
701 for (i=static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i>=0; i--) {
702 int32_t actualBreak = t->bi->preceding(i);
703 int32_t expectedBreak = BreakIterator::DONE;
704
705 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
706 // preceding(trailing byte) will return the index of some preceding code point,
707 // not the lead byte of the current code point, even though that has a smaller index.
708 // Therefore, start looking at the expected break data not at i-1, but at
709 // the start of code point index - 1.
710 utext_setNativeIndex(t->textToBreak, i);
711 int32_t j = static_cast<int32_t>(utext_getNativeIndex(t->textToBreak) - 1);
712 for (; j >= 0; j--) {
713 if (t->getExpectedBreak(j) != 0) {
714 expectedBreak = j;
715 break;
716 }
717 }
718 if (expectedBreak != actualBreak) {
719 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
720 " Expected, Actual= %d, %d",
721 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
722 }
723 }
724 }
725
TestExtended()726 void RBBITest::TestExtended() {
727 // The expectations in this test heavily depends on the Thai dictionary.
728 // Therefore, we skip this test under the LSTM configuration.
729 if (skipDictionaryTest()) {
730 return;
731 }
732 // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
733 // data driven test closely entangles filtered and regular data.
734 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
735 UErrorCode status = U_ZERO_ERROR;
736 Locale locale("");
737
738 TestParams tp(status);
739
740 RegexMatcher localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
741 if (U_FAILURE(status)) {
742 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
743 }
744
745 //
746 // Open and read the test data file.
747 //
748 const char *testDataDirectory = IntlTest::getSourceTestData(status);
749 CharString testFileName(testDataDirectory, -1, status);
750 testFileName.append("rbbitst.txt", -1, status);
751
752 int len;
753 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
754 if (U_FAILURE(status)) {
755 errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
756 return;
757 }
758
759 bool skipTest = false; // Skip this test?
760
761 //
762 // Put the test data into a UnicodeString
763 //
764 UnicodeString testString(FALSE, testFile, len);
765
766 enum EParseState{
767 PARSE_COMMENT,
768 PARSE_TAG,
769 PARSE_DATA,
770 PARSE_NUM,
771 PARSE_RULES
772 }
773 parseState = PARSE_TAG;
774
775 EParseState savedState = PARSE_TAG;
776
777 int32_t lineNum = 1;
778 int32_t colStart = 0;
779 int32_t column = 0;
780 int32_t charIdx = 0;
781
782 int32_t tagValue = 0; // The numeric value of a <nnn> tag.
783
784 UnicodeString rules; // Holds rules from a <rules> ... </rules> block
785 int32_t rulesFirstLine = 0; // Line number of the start of current <rules> block
786
787 for (charIdx = 0; charIdx < len; ) {
788 status = U_ZERO_ERROR;
789 UChar c = testString.charAt(charIdx);
790 charIdx++;
791 if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
792 // treat CRLF as a unit
793 c = u'\n';
794 charIdx++;
795 }
796 if (c == u'\n' || c == u'\r') {
797 lineNum++;
798 colStart = charIdx;
799 }
800 column = charIdx - colStart + 1;
801
802 switch (parseState) {
803 case PARSE_COMMENT:
804 if (c == u'\n' || c == u'\r') {
805 parseState = savedState;
806 }
807 break;
808
809 case PARSE_TAG:
810 {
811 if (c == u'#') {
812 parseState = PARSE_COMMENT;
813 savedState = PARSE_TAG;
814 break;
815 }
816 if (u_isUWhiteSpace(c)) {
817 break;
818 }
819 if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
820 delete tp.bi;
821 tp.bi = BreakIterator::createWordInstance(locale, status);
822 skipTest = false;
823 charIdx += 5;
824 break;
825 }
826 if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
827 delete tp.bi;
828 tp.bi = BreakIterator::createCharacterInstance(locale, status);
829 skipTest = false;
830 charIdx += 5;
831 break;
832 }
833 if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
834 delete tp.bi;
835 tp.bi = BreakIterator::createLineInstance(locale, status);
836 skipTest = false;
837 charIdx += 5;
838 break;
839 }
840 if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
841 delete tp.bi;
842 tp.bi = BreakIterator::createSentenceInstance(locale, status);
843 skipTest = false;
844 charIdx += 5;
845 break;
846 }
847 if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
848 delete tp.bi;
849 tp.bi = BreakIterator::createTitleInstance(locale, status);
850 charIdx += 6;
851 break;
852 }
853
854 if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
855 testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
856 charIdx = testString.indexOf(u'>', charIdx) + 1;
857 parseState = PARSE_RULES;
858 rules.remove();
859 rulesFirstLine = lineNum;
860 break;
861 }
862
863 // <locale loc_name>
864 localeMatcher.reset(testString);
865 if (localeMatcher.lookingAt(charIdx-1, status)) {
866 UnicodeString localeName = localeMatcher.group(1, status);
867 char localeName8[100];
868 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
869 locale = Locale::createFromName(localeName8);
870 charIdx += localeMatcher.group(0, status).length() - 1;
871 TEST_ASSERT_SUCCESS(status);
872 break;
873 }
874 if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
875 parseState = PARSE_DATA;
876 charIdx += 5;
877 tp.dataToBreak = "";
878 tp.expectedBreaks->removeAllElements();
879 tp.srcCol ->removeAllElements();
880 tp.srcLine->removeAllElements();
881 break;
882 }
883
884 errln("line %d: Tag expected in test file.", lineNum);
885 parseState = PARSE_COMMENT;
886 savedState = PARSE_DATA;
887 goto end_test; // Stop the test.
888 }
889 break;
890
891 case PARSE_RULES:
892 if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
893 charIdx += 7;
894 parseState = PARSE_TAG;
895 delete tp.bi;
896 UParseError pe;
897 tp.bi = new RuleBasedBreakIterator(rules, pe, status);
898 skipTest = U_FAILURE(status);
899 if (U_FAILURE(status)) {
900 errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
901 rulesFirstLine + pe.line - 1, u_errorName(status));
902 }
903 } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
904 charIdx += 10;
905 parseState = PARSE_TAG;
906 UErrorCode ec = U_ZERO_ERROR;
907 UParseError pe;
908 RuleBasedBreakIterator bi(rules, pe, ec);
909 if (U_SUCCESS(ec)) {
910 errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
911 rulesFirstLine + pe.line - 1);
912 }
913 } else {
914 rules.append(c);
915 }
916 break;
917
918 case PARSE_DATA:
919 if (c == u'•') {
920 int32_t breakIdx = tp.dataToBreak.length();
921 if (tp.expectedBreaks->size() > breakIdx) {
922 errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
923 lineNum, column);
924 }
925 tp.expectedBreaks->setSize(breakIdx+1);
926 tp.expectedBreaks->setElementAt(-1, breakIdx);
927 tp.srcLine->setSize(breakIdx+1);
928 tp.srcLine->setElementAt(lineNum, breakIdx);
929 tp.srcCol ->setSize(breakIdx+1);
930 tp.srcCol ->setElementAt(column, breakIdx);
931 break;
932 }
933
934 if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
935 // Add final entry to mappings from break location to source file position.
936 // Need one extra because last break position returned is after the
937 // last char in the data, not at the last char.
938 tp.srcLine->addElement(lineNum, status);
939 tp.srcCol ->addElement(column, status);
940
941 parseState = PARSE_TAG;
942 charIdx += 6;
943
944 if (!skipTest) {
945 // RUN THE TEST!
946 status = U_ZERO_ERROR;
947 tp.setUTF16(status);
948 executeTest(&tp, status);
949 TEST_ASSERT_SUCCESS(status);
950
951 // Run again, this time with UTF-8 text wrapped in a UText.
952 status = U_ZERO_ERROR;
953 tp.setUTF8(status);
954 TEST_ASSERT_SUCCESS(status);
955 executeTest(&tp, status);
956 }
957 break;
958 }
959
960 if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
961 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
962 // Get the code point from the name and insert it into the test data.
963 // (Damn, no API takes names in Unicode !!!
964 // we've got to take it back to char *)
965 int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
966 int32_t nameLength = nameEndIdx - (charIdx+2);
967 char charNameBuf[200];
968 UChar32 theChar = -1;
969 if (nameEndIdx != -1) {
970 UErrorCode status = U_ZERO_ERROR;
971 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
972 charNameBuf[sizeof(charNameBuf)-1] = 0;
973 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
974 if (U_FAILURE(status)) {
975 theChar = -1;
976 }
977 }
978 if (theChar == -1) {
979 errln("Error in named character in test file at line %d, col %d",
980 lineNum, column);
981 } else {
982 // Named code point was recognized. Insert it
983 // into the test data.
984 tp.dataToBreak.append(theChar);
985 while (tp.dataToBreak.length() > tp.srcLine->size()) {
986 tp.srcLine->addElement(lineNum, status);
987 tp.srcCol ->addElement(column, status);
988 }
989 }
990 if (nameEndIdx > charIdx) {
991 charIdx = nameEndIdx+1;
992
993 }
994 break;
995 }
996
997
998
999 if (testString.compare(charIdx-1, 2, u"<>") == 0) {
1000 charIdx++;
1001 int32_t breakIdx = tp.dataToBreak.length();
1002 tp.expectedBreaks->setSize(breakIdx+1);
1003 tp.expectedBreaks->setElementAt(-1, breakIdx);
1004 tp.srcLine->setSize(breakIdx+1);
1005 tp.srcLine->setElementAt(lineNum, breakIdx);
1006 tp.srcCol ->setSize(breakIdx+1);
1007 tp.srcCol ->setElementAt(column, breakIdx);
1008 break;
1009 }
1010
1011 if (c == u'<') {
1012 tagValue = 0;
1013 parseState = PARSE_NUM;
1014 break;
1015 }
1016
1017 if (c == u'#' && column==3) { // TODO: why is column off so far?
1018 parseState = PARSE_COMMENT;
1019 savedState = PARSE_DATA;
1020 break;
1021 }
1022
1023 if (c == u'\\') {
1024 // Check for \ at end of line, a line continuation.
1025 // Advance over (discard) the newline
1026 UChar32 cp = testString.char32At(charIdx);
1027 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
1028 // We have a CR LF
1029 // Need an extra increment of the input ptr to move over both of them
1030 charIdx++;
1031 }
1032 if (cp == u'\n' || cp == u'\r') {
1033 lineNum++;
1034 colStart = charIdx;
1035 charIdx++;
1036 break;
1037 }
1038
1039 // Let unescape handle the back slash.
1040 cp = testString.unescapeAt(charIdx);
1041 if (cp != -1) {
1042 // Escape sequence was recognized. Insert the char
1043 // into the test data.
1044 tp.dataToBreak.append(cp);
1045 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1046 tp.srcLine->addElement(lineNum, status);
1047 tp.srcCol ->addElement(column, status);
1048 }
1049 break;
1050 }
1051
1052
1053 // Not a recognized backslash escape sequence.
1054 // Take the next char as a literal.
1055 // TODO: Should this be an error?
1056 c = testString.charAt(charIdx);
1057 charIdx = testString.moveIndex32(charIdx, 1);
1058 }
1059
1060 // Normal, non-escaped data char.
1061 tp.dataToBreak.append(c);
1062
1063 // Save the mapping from offset in the data to line/column numbers in
1064 // the original input file. Will be used for better error messages only.
1065 // If there's an expected break before this char, the slot in the mapping
1066 // vector will already be set for this char; don't overwrite it.
1067 if (tp.dataToBreak.length() > tp.srcLine->size()) {
1068 tp.srcLine->addElement(lineNum, status);
1069 tp.srcCol ->addElement(column, status);
1070 }
1071 break;
1072
1073
1074 case PARSE_NUM:
1075 // We are parsing an expected numeric tag value, like <1234>,
1076 // within a chunk of data.
1077 if (u_isUWhiteSpace(c)) {
1078 break;
1079 }
1080
1081 if (c == u'>') {
1082 // Finished the number. Add the info to the expected break data,
1083 // and switch parse state back to doing plain data.
1084 parseState = PARSE_DATA;
1085 if (tagValue == 0) {
1086 tagValue = -1;
1087 }
1088 int32_t breakIdx = tp.dataToBreak.length();
1089 if (tp.expectedBreaks->size() > breakIdx) {
1090 errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
1091 lineNum, column);
1092 }
1093 tp.expectedBreaks->setSize(breakIdx+1);
1094 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1095 tp.srcLine->setSize(breakIdx+1);
1096 tp.srcLine->setElementAt(lineNum, breakIdx);
1097 tp.srcCol ->setSize(breakIdx+1);
1098 tp.srcCol ->setElementAt(column, breakIdx);
1099 break;
1100 }
1101
1102 if (u_isdigit(c)) {
1103 tagValue = tagValue*10 + u_charDigitValue(c);
1104 break;
1105 }
1106
1107 errln("Syntax Error in test file at line %d, col %d",
1108 lineNum, column);
1109 parseState = PARSE_COMMENT;
1110 goto end_test; // Stop the test
1111 break;
1112 }
1113
1114
1115 if (U_FAILURE(status)) {
1116 dataerrln("ICU Error %s while parsing test file at line %d.",
1117 u_errorName(status), lineNum);
1118 status = U_ZERO_ERROR;
1119 goto end_test; // Stop the test
1120 }
1121
1122 }
1123
1124 // Reached end of test file. Raise an error if parseState indicates that we are
1125 // within a block that should have been terminated.
1126
1127 if (parseState == PARSE_RULES) {
1128 errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
1129 lineNum, rulesFirstLine);
1130 }
1131 if (parseState == PARSE_DATA) {
1132 errln("rbbitst.txt:%d <data> block not closed.", lineNum);
1133 }
1134
1135
1136 end_test:
1137 delete [] testFile;
1138 #endif
1139 }
1140
1141 //-------------------------------------------------------------------------------
1142 //
//   TestDictRules   Create a break iterator from source rules that include a
//                   dictionary range.  Regression test for bug #7130: the source
//                   rules do not declare a break iterator type (word, line, sentence, etc.),
//                   and the dictionary code, without a type, would loop.
1147 //
1148 //-------------------------------------------------------------------------------
TestDictRules()1149 void RBBITest::TestDictRules() {
1150 const char *rules = "$dictionary = [a-z]; \n"
1151 "!!forward; \n"
1152 "$dictionary $dictionary; \n"
1153 "!!reverse; \n"
1154 "$dictionary $dictionary; \n";
1155 const char *text = "aa";
1156 UErrorCode status = U_ZERO_ERROR;
1157 UParseError parseError;
1158
1159 RuleBasedBreakIterator bi(rules, parseError, status);
1160 if (U_SUCCESS(status)) {
1161 UnicodeString utext = text;
1162 bi.setText(utext);
1163 int32_t position;
1164 int32_t loops;
1165 for (loops = 0; loops<10; loops++) {
1166 position = bi.next();
1167 if (position == RuleBasedBreakIterator::DONE) {
1168 break;
1169 }
1170 }
1171 TEST_ASSERT(loops == 1);
1172 } else {
1173 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1174 }
1175 }
1176
1177
1178
1179 //--------------------------------------------------------------------------------------------
1180 //
1181 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
1182 //
1183 //-------------------------------------------------------------------------------------------
TestUnicodeFiles()1184 void RBBITest::TestUnicodeFiles() {
1185 RuleBasedBreakIterator *bi;
1186 UErrorCode status = U_ZERO_ERROR;
1187
1188 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1189 TEST_ASSERT_SUCCESS(status);
1190 if (U_SUCCESS(status)) {
1191 runUnicodeTestData("GraphemeBreakTest.txt", bi);
1192 }
1193 delete bi;
1194
1195 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1196 TEST_ASSERT_SUCCESS(status);
1197 if (U_SUCCESS(status)) {
1198 runUnicodeTestData("WordBreakTest.txt", bi);
1199 }
1200 delete bi;
1201
1202 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1203 TEST_ASSERT_SUCCESS(status);
1204 if (U_SUCCESS(status)) {
1205 runUnicodeTestData("SentenceBreakTest.txt", bi);
1206 }
1207 delete bi;
1208
1209 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1210 TEST_ASSERT_SUCCESS(status);
1211 if (U_SUCCESS(status)) {
1212 runUnicodeTestData("LineBreakTest.txt", bi);
1213 }
1214 delete bi;
1215 }
1216
1217
1218 // Check for test cases from the Unicode test data files that are known to fail
1219 // and should be skipped as known issues because ICU does not fully implement
1220 // the Unicode specifications, or because ICU includes tailorings that differ from
1221 // the Unicode standard.
1222 //
1223 // Test cases are identified by the test data sequence, which tends to be more stable
1224 // across Unicode versions than the test file line numbers.
1225 //
1226 // The test case with ticket "10666" is a dummy, included as an example.
1227
testCaseIsKnownIssue(const UnicodeString & testCase,const char * fileName)1228 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1229 static struct TestCase {
1230 const char *fTicketNum;
1231 const char *fFileName;
1232 const UChar *fString;
1233 } badTestCases[] = {
1234 {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"}, // Fake example, for illustration.
1235 // The following tests were originally for
1236 // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
1237 // However, that ticket has been closed as fixed but these tests still fail, so
1238 // ICU-21097 has been created to investigate and address these remaining issues.
1239 {"21097", "LineBreakTest.txt", u"-#"},
1240 {"21097", "LineBreakTest.txt", u"\u002d\u0308\u0023"},
1241 {"21097", "LineBreakTest.txt", u"\u002d\u00a7"},
1242 {"21097", "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
1243 {"21097", "LineBreakTest.txt", u"\u002d\U00050005"},
1244 {"21097", "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
1245 {"21097", "LineBreakTest.txt", u"\u002d\u0e01"},
1246 {"21097", "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
1247
1248 // The following tests were originally for
1249 // Issue ICU-12017 Improve line break around numbers.
1250 // However, that ticket has been closed as fixed but these tests still fail, so
1251 // ICU-21097 has been created to investigate and address these remaining issues.
1252 {"21097", "LineBreakTest.txt", u"\u002C\u0030"}, // ",0"
1253 {"21097", "LineBreakTest.txt", u"\u002C\u0308\u0030"},
1254 {"21097", "LineBreakTest.txt", u"equals .35 cents"},
1255 {"21097", "LineBreakTest.txt", u"a.2 "},
1256 {"21097", "LineBreakTest.txt", u"a.2 \u0915"},
1257 {"21097", "LineBreakTest.txt", u"a.2 \u672C"},
1258 {"21097", "LineBreakTest.txt", u"a.2\u3000\u672C"},
1259 {"21097", "LineBreakTest.txt", u"a.2\u3000\u307E"},
1260 {"21097", "LineBreakTest.txt", u"a.2\u3000\u0033"},
1261 {"21097", "LineBreakTest.txt", u"A.1 \uBABB"},
1262 {"21097", "LineBreakTest.txt", u"\uBD24\uC5B4\u002E\u0020\u0041\u002E\u0032\u0020\uBCFC"},
1263 {"21097", "LineBreakTest.txt", u"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"},
1264 {"21097", "LineBreakTest.txt", u"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"},
1265 {"21097", "LineBreakTest.txt", u"a.2\u3000\u300C"},
1266 };
1267
1268 for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
1269 const TestCase &badCase = badTestCases[n];
1270 if (!strcmp(fileName, badCase.fFileName) &&
1271 testCase == UnicodeString(badCase.fString)) {
1272 return logKnownIssue(badCase.fTicketNum);
1273 }
1274 }
1275 return FALSE;
1276 }
1277
1278
1279 //--------------------------------------------------------------------------------------------
1280 //
1281 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
1282 //
1283 //-------------------------------------------------------------------------------------------
runUnicodeTestData(const char * fileName,RuleBasedBreakIterator * bi)1284 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1285 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1286 UErrorCode status = U_ZERO_ERROR;
1287
1288 //
1289 // Open and read the test data file, put it into a UnicodeString.
1290 //
1291 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1292 char testFileName[1000];
1293 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1294 dataerrln("Can't open test data. Path too long.");
1295 return;
1296 }
1297 strcpy(testFileName, testDataDirectory);
1298 strcat(testFileName, fileName);
1299
1300 logln("Opening data file %s\n", fileName);
1301
1302 int len;
1303 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1304 if (status != U_FILE_ACCESS_ERROR) {
1305 TEST_ASSERT_SUCCESS(status);
1306 TEST_ASSERT(testFile != NULL);
1307 }
1308 if (U_FAILURE(status) || testFile == NULL) {
1309 return; /* something went wrong, error already output */
1310 }
1311 UnicodeString testFileAsString(TRUE, testFile, len);
1312
1313 //
1314 // Parse the test data file using a regular expression.
1315 // Each kind of token is recognized in its own capture group; what type of item was scanned
1316 // is identified by which group had a match.
1317 //
1318 // Capture Group # 1 2 3 4 5
1319 // Parses this item: divide x hex digits comment \n unrecognized \n
1320 //
1321 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1322 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1323 UnicodeString testString;
1324 UVector32 breakPositions(status);
1325 int lineNumber = 1;
1326 TEST_ASSERT_SUCCESS(status);
1327 if (U_FAILURE(status)) {
1328 return;
1329 }
1330
1331 //
1332 // Scan through each test case, building up the string to be broken in testString,
1333 // and the positions that should be boundaries in the breakPositions vector.
1334 //
1335 int spin = 0;
1336 while (tokenMatcher.find()) {
1337 if(tokenMatcher.hitEnd()) {
1338 /* Shouldn't Happen(TM). This means we didn't find the symbols we were looking for.
1339 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1340 and caused an infinite loop here on EBCDIC systems!
1341 */
1342 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1343 // return;
1344 }
1345 if (tokenMatcher.start(1, status) >= 0) {
1346 // Scanned a divide sign, indicating a break position in the test data.
1347 if (testString.length()>0) {
1348 breakPositions.addElement(testString.length(), status);
1349 }
1350 }
1351 else if (tokenMatcher.start(2, status) >= 0) {
1352 // Scanned an 'x', meaning no break at this position in the test data
1353 // Nothing to be done here.
1354 }
1355 else if (tokenMatcher.start(3, status) >= 0) {
1356 // Scanned Hex digits. Convert them to binary, append to the character data string.
1357 const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1358 int length = hexNumber.length();
1359 if (length<=8) {
1360 char buf[10];
1361 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1362 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1363 if (c<=0x10ffff) {
1364 testString.append(c);
1365 } else {
1366 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1367 fileName, lineNumber);
1368 }
1369 } else {
1370 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1371 fileName, lineNumber);
1372 }
1373 }
1374 else if (tokenMatcher.start(4, status) >= 0) {
1375 // Scanned to end of a line, possibly skipping over a comment in the process.
1376 // If the line from the file contained test data, run the test now.
1377 if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1378 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1379 }
1380
1381 // Clear out this test case.
1382 // The string and breakPositions vector will be refilled as the next
1383 // test case is parsed.
1384 testString.remove();
1385 breakPositions.removeAllElements();
1386 lineNumber++;
1387 } else {
1388 // Scanner catchall. Something unrecognized appeared on the line.
1389 char token[16];
1390 UnicodeString uToken = tokenMatcher.group(0, status);
1391 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1392 token[sizeof(token)-1] = 0;
1393 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token);
1394
1395 // Clean up, in preparation for continuing with the next line.
1396 testString.remove();
1397 breakPositions.removeAllElements();
1398 lineNumber++;
1399 }
1400 TEST_ASSERT_SUCCESS(status);
1401 if (U_FAILURE(status)) {
1402 break;
1403 }
1404 }
1405
1406 delete [] testFile;
1407 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1408 }
1409
1410 //--------------------------------------------------------------------------------------------
1411 //
1412 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1413 // test data files. Do only a simple, forward-only check -
1414 // this test is mostly to check that ICU and the Unicode
1415 // data agree with each other.
1416 //
1417 //--------------------------------------------------------------------------------------------
checkUnicodeTestCase(const char * testFileName,int lineNumber,const UnicodeString & testString,UVector32 * breakPositions,RuleBasedBreakIterator * bi)1418 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1419 const UnicodeString &testString, // Text data to be broken
1420 UVector32 *breakPositions, // Positions where breaks should be found.
1421 RuleBasedBreakIterator *bi) {
1422 int32_t pos; // Break Position in the test string
1423 int32_t expectedI = 0; // Index of expected break position in the vector of expected results.
1424 int32_t expectedPos; // Expected break position (index into test string)
1425
1426 bi->setText(testString);
1427 pos = bi->first();
1428 pos = bi->next();
1429
1430 while (pos != BreakIterator::DONE) {
1431 if (expectedI >= breakPositions->size()) {
1432 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1433 testFileName, lineNumber, pos);
1434 break;
1435 }
1436 expectedPos = breakPositions->elementAti(expectedI);
1437 if (pos < expectedPos) {
1438 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1439 testFileName, lineNumber, pos);
1440 break;
1441 }
1442 if (pos > expectedPos) {
1443 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1444 testFileName, lineNumber, expectedPos);
1445 break;
1446 }
1447 pos = bi->next();
1448 expectedI++;
1449 }
1450
1451 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1452 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1453 testFileName, lineNumber, breakPositions->elementAti(expectedI));
1454 }
1455 }
1456
1457
1458
1459 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1460 //---------------------------------------------------------------------------------------
1461 //
1462 // class RBBIMonkeyKind
1463 //
1464 // Monkey Test for Break Iteration
1465 // Abstract interface class. Concrete derived classes independently
1466 // implement the break rules for different iterator types.
1467 //
//         The Monkey Test itself doesn't know which type of break iterator it is
1469 // testing, but works purely in terms of the interface defined here.
1470 //
1471 //---------------------------------------------------------------------------------------
1472 class RBBIMonkeyKind {
1473 public:
1474 // Return a UVector of UnicodeSets, representing the character classes used
1475 // for this type of iterator.
1476 virtual UVector *charClasses() = 0;
1477
1478 // Set the test text on which subsequent calls to next() will operate
1479 virtual void setText(const UnicodeString &s) = 0;
1480
1481 // Find the next break position, starting from the prev break position, or from zero.
1482 // Return -1 after reaching end of string.
1483 virtual int32_t next(int32_t i) = 0;
1484
1485 // Name of each character class, parallel with charClasses. Used for debugging output
1486 // of characters.
1487 virtual std::vector<std::string>& characterClassNames();
1488
1489 void setAppliedRule(int32_t position, const char* value);
1490
1491 std::string getAppliedRule(int32_t position);
1492
1493 virtual ~RBBIMonkeyKind();
1494 UErrorCode deferredStatus;
1495
1496 std::string classNameFromCodepoint(const UChar32 c);
1497 unsigned int maxClassNameSize();
1498
1499 protected:
1500 RBBIMonkeyKind();
1501 std::vector<std::string> classNames;
1502 std::vector<std::string> appliedRules;
1503
1504 // Clear `appliedRules` and fill it with empty strings in the size of test text.
1505 void prepareAppliedRules(int32_t size );
1506
1507 private:
1508
1509 };
1510
RBBIMonkeyKind()1511 RBBIMonkeyKind::RBBIMonkeyKind() {
1512 deferredStatus = U_ZERO_ERROR;
1513 }
1514
~RBBIMonkeyKind()1515 RBBIMonkeyKind::~RBBIMonkeyKind() {
1516 }
1517
characterClassNames()1518 std::vector<std::string>& RBBIMonkeyKind::characterClassNames() {
1519 return classNames;
1520 }
1521
prepareAppliedRules(int32_t size)1522 void RBBIMonkeyKind::prepareAppliedRules(int32_t size) {
1523 // Remove all the information in the `appliedRules`.
1524 appliedRules.clear();
1525 appliedRules.resize(size + 1);
1526 }
1527
setAppliedRule(int32_t position,const char * value)1528 void RBBIMonkeyKind::setAppliedRule(int32_t position, const char* value) {
1529 appliedRules[position] = value;
1530 }
1531
getAppliedRule(int32_t position)1532 std::string RBBIMonkeyKind::getAppliedRule(int32_t position){
1533 return appliedRules[position];
1534 }
1535
classNameFromCodepoint(const UChar32 c)1536 std::string RBBIMonkeyKind::classNameFromCodepoint(const UChar32 c) {
1537 // Simply iterate through charClasses to find character's class
1538 for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1539 UnicodeSet *classSet = (UnicodeSet *)charClasses()->elementAt(aClassNum);
1540 if (classSet->contains(c)) {
1541 return classNames[aClassNum];
1542 }
1543 }
1544 U_ASSERT(FALSE); // This should not happen.
1545 return "bad class name";
1546 }
1547
maxClassNameSize()1548 unsigned int RBBIMonkeyKind::maxClassNameSize() {
1549 unsigned int maxSize = 0;
1550 for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1551 auto aClassNumSize = static_cast<unsigned int>(classNames[aClassNum].size());
1552 if (aClassNumSize > maxSize) {
1553 maxSize = aClassNumSize;
1554 }
1555 }
1556 return maxSize;
1557 }
1558
1559 //----------------------------------------------------------------------------------------
1560 //
1561 // Random Numbers. Similar to standard lib rand() and srand()
1562 // Not using library to
1563 // 1. Get same results on all platforms.
1564 // 2. Get access to current seed, to more easily reproduce failures.
1565 //
1566 //---------------------------------------------------------------------------------------
// Current state of the generator. Assign to it to reseed.
static uint32_t m_seed = 1;

// Advances the linear congruential generator one step and returns a value in
// the range 0..32767 taken from bits 16..30 of the new state.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245u;
    const uint32_t kIncrement  = 12345u;

    m_seed = m_seed * kMultiplier + kIncrement;
    return (m_seed >> 16) & 0x7fffu;
}
1574
1575
1576 //------------------------------------------------------------------------------------------
1577 //
1578 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
1579 // of RBBIMonkeyKind.
1580 //
1581 //------------------------------------------------------------------------------------------
1582 class RBBICharMonkey: public RBBIMonkeyKind {
1583 public:
1584 RBBICharMonkey();
1585 virtual ~RBBICharMonkey();
1586 virtual UVector *charClasses() override;
1587 virtual void setText(const UnicodeString &s) override;
1588 virtual int32_t next(int32_t i) override;
1589 private:
1590 UVector *fSets;
1591
1592 UnicodeSet *fCRLFSet;
1593 UnicodeSet *fControlSet;
1594 UnicodeSet *fExtendSet;
1595 UnicodeSet *fZWJSet;
1596 UnicodeSet *fRegionalIndicatorSet;
1597 UnicodeSet *fPrependSet;
1598 UnicodeSet *fSpacingSet;
1599 UnicodeSet *fLSet;
1600 UnicodeSet *fVSet;
1601 UnicodeSet *fTSet;
1602 UnicodeSet *fLVSet;
1603 UnicodeSet *fLVTSet;
1604 UnicodeSet *fHangulSet;
1605 UnicodeSet *fExtendedPictSet;
1606 UnicodeSet *fViramaSet;
1607 UnicodeSet *fLinkingConsonantSet;
1608 UnicodeSet *fExtCccZwjSet;
1609 UnicodeSet *fAnySet;
1610
1611 const UnicodeString *fText;
1612 };
1613
1614
RBBICharMonkey()1615 RBBICharMonkey::RBBICharMonkey() {
1616 UErrorCode status = U_ZERO_ERROR;
1617
1618 fText = NULL;
1619
1620 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1621 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
1622 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
1623 fZWJSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
1624 fRegionalIndicatorSet =
1625 new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1626 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1627 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1628 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1629 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1630 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1631 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1632 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1633 fHangulSet = new UnicodeSet();
1634 fHangulSet->addAll(*fLSet);
1635 fHangulSet->addAll(*fVSet);
1636 fHangulSet->addAll(*fTSet);
1637 fHangulSet->addAll(*fLVSet);
1638 fHangulSet->addAll(*fLVTSet);
1639
1640 fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1641 fViramaSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1642 "\\p{Indic_Syllabic_Category=Virama}]", status);
1643 fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1644 "\\p{Indic_Syllabic_Category=Consonant}]", status);
1645 fExtCccZwjSet = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status);
1646 fAnySet = new UnicodeSet(0, 0x10ffff);
1647
1648 // Create sets of characters, and add the names of the above character sets.
1649 // In each new ICU release, add new names corresponding to the sets above.
1650 fSets = new UVector(status);
1651
1652 // Important: Keep class names the same as the class contents.
1653 fSets->addElement(fCRLFSet, status); classNames.push_back("CRLF");
1654 fSets->addElement(fControlSet, status); classNames.push_back("Control");
1655 fSets->addElement(fExtendSet, status); classNames.push_back("Extended");
1656 fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
1657 if (!fPrependSet->isEmpty()) {
1658 fSets->addElement(fPrependSet, status); classNames.push_back("Prepend");
1659 }
1660 fSets->addElement(fSpacingSet, status); classNames.push_back("Spacing");
1661 fSets->addElement(fHangulSet, status); classNames.push_back("Hangul");
1662 fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
1663 fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
1664 fSets->addElement(fViramaSet, status); classNames.push_back("Virama");
1665 fSets->addElement(fLinkingConsonantSet, status); classNames.push_back("LinkingConsonant");
1666 fSets->addElement(fExtCccZwjSet, status); classNames.push_back("ExtCcccZwj");
1667 fSets->addElement(fAnySet, status); classNames.push_back("Any");
1668
1669 if (U_FAILURE(status)) {
1670 deferredStatus = status;
1671 }
1672 }
1673
1674
setText(const UnicodeString & s)1675 void RBBICharMonkey::setText(const UnicodeString &s) {
1676 fText = &s;
1677 prepareAppliedRules(s.length());
1678 }
1679
1680
1681
// Finds the grapheme cluster boundary that follows prevPos in the text given to setText().
//
//   prevPos   Index of the previous boundary (or the position to start from).
//   returns   Index of the next boundary, or -1 if construction of this object failed
//             (deferredStatus) or prevPos is at or beyond the end of the text.
//
// The rules are tried in order at each candidate position; the first rule that matches
// decides whether to break there or keep scanning. The name of the deciding rule is
// recorded for the candidate position with setAppliedRule().
int32_t RBBICharMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
    UChar32 cBase;            // for (X Extend*) patterns, the X character.

    // Set construction failed in the constructor; nothing can be tested.
    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Previous break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }

    // All four positions start at prevPos; the first passes through the loop
    // shift them apart until p1 and p2 refer to distinct positions.
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = cBase = 0;
    (void)p0;   // suppress set but not used warning.
    (void)c0;

    // Loop runs once per "significant" character position in the input text.
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by one codepoint
        p3 = fText->moveIndex32(p3, 1);
        c3 = fText->char32At(p3);

        if (p1 == p2) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }

        // The end of the text is always a boundary.
        if (p2 == fText->length()) {
            setAppliedRule(p2, "End of String");
            break;
        }

        //     No Extend or Format characters may appear between the CR and LF,
        //     which requires the additional check for p2 immediately following p1.
        //
        if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
            setAppliedRule(p2, "GB3 CR x LF");
            continue;
        }

        // Break after a control character, CR or LF.
        if (fControlSet->contains(c1) ||
            c1 == 0x0D ||
            c1 == 0x0A)  {
            setAppliedRule(p2, "GB4 ( Control | CR | LF ) <break>");
            break;
        }

        // Break before a control character, CR or LF.
        if (fControlSet->contains(c2) ||
            c2 == 0x0D ||
            c2 == 0x0A)  {
            setAppliedRule(p2, "GB5 <break> ( Control | CR | LF )");
            break;
        }

        // Hangul syllable sequences are kept together (GB6, GB7, GB8).
        if (fLSet->contains(c1) &&
               (fLSet->contains(c2)  ||
                fVSet->contains(c2)  ||
                fLVSet->contains(c2) ||
                fLVTSet->contains(c2))) {
            setAppliedRule(p2, "GB6 L x ( L | V | LV | LVT )");
            continue;
        }

        if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
            (fVSet->contains(c2) || fTSet->contains(c2)))  {
            setAppliedRule(p2, "GB7 ( LV | V ) x ( V | T )");
            continue;
        }

        if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
            fTSet->contains(c2))  {
            setAppliedRule(p2, "GB8 ( LVT | T) x T");
            continue;
        }

        // No break before Extend or ZWJ. When the preceding character is not itself
        // an Extend, remember it in cBase; the GB11 check below tests it.
        if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
            if (!fExtendSet->contains(c1)) {
                cBase = c1;
            }
            setAppliedRule(p2, "GB9 x (Extend | ZWJ)");
            continue;
        }

        if (fSpacingSet->contains(c2)) {
            setAppliedRule(p2, "GB9a x SpacingMark");
            continue;
        }

        if (fPrependSet->contains(c1)) {
            setAppliedRule(p2, "GB9b Prepend x");
            continue;
        }

        // Note: Viramas are also included in the ExtCccZwj class.
        if (fLinkingConsonantSet->contains(c2)) {
            // Scan backwards from p1 over ExtCccZwj characters, noting whether a
            // Virama was among them, then test the character where the scan stopped.
            int pi = p1;
            bool sawVirama = false;
            while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) {
                if (fViramaSet->contains(fText->char32At(pi))) {
                    sawVirama = true;
                }
                pi = fText->moveIndex32(pi, -1);
            }
            if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) {
                setAppliedRule(p2, "GB9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant");
                continue;
            }
        }

        // cBase is the character recorded by the GB9 branch above.
        if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
            setAppliedRule(p2, "GB11 Extended_Pictographic Extend * ZWJ x Extended_Pictographic");
            continue;
        }

        // Note: The first if condition is a little tricky. We only need to force
        //      a break if there are three or more contiguous RIs. If there are
        //      only two, a break following will occur via other rules, and will include
        //      any trailing extend characters, which is needed behavior.
        if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
                && fRegionalIndicatorSet->contains(c2)) {
            setAppliedRule(p2, "GB12-13 Regional_Indicator x Regional_Indicator");
            break;
        }
        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
            setAppliedRule(p2, "GB12-13 Regional_Indicator x Regional_Indicator");
            continue;
        }

        // No rule kept the characters together: break here.
        setAppliedRule(p2, "GB999 Any <break> Any");
        break;
    }

    // The loop exits only by `break`, with p2 at the boundary position.
    breakPos = p2;
    return breakPos;
}
1831
1832
1833
charClasses()1834 UVector *RBBICharMonkey::charClasses() {
1835 return fSets;
1836 }
1837
// Destructor: deletes the class vector (fSets) and then each character-class
// set member individually.
// NOTE(review): the member declarations and the constructor are outside this
// excerpt; presumably fSets holds non-owning pointers to these same sets,
// which is why each set is deleted here by hand - confirm against the ctor.
RBBICharMonkey::~RBBICharMonkey() {
    delete fSets;
    delete fCRLFSet;
    delete fControlSet;
    delete fExtendSet;
    delete fRegionalIndicatorSet;
    delete fPrependSet;
    delete fSpacingSet;
    delete fLSet;
    delete fVSet;
    delete fTSet;
    delete fLVSet;
    delete fLVTSet;
    delete fHangulSet;
    delete fAnySet;
    delete fZWJSet;
    delete fExtendedPictSet;
    delete fViramaSet;
    delete fLinkingConsonantSet;
    delete fExtCccZwjSet;
}
1859
1860 //------------------------------------------------------------------------------------------
1861 //
1862 // class RBBIWordMonkey Word Break specific implementation
1863 // of RBBIMonkeyKind.
1864 //
1865 //------------------------------------------------------------------------------------------
1866 class RBBIWordMonkey: public RBBIMonkeyKind {
1867 public:
1868 RBBIWordMonkey();
1869 virtual ~RBBIWordMonkey();
1870 virtual UVector *charClasses() override;
1871 virtual void setText(const UnicodeString &s) override;
1872 virtual int32_t next(int32_t i) override;
1873 private:
1874 UVector *fSets;
1875
1876 UnicodeSet *fCRSet;
1877 UnicodeSet *fLFSet;
1878 UnicodeSet *fNewlineSet;
1879 UnicodeSet *fRegionalIndicatorSet;
1880 UnicodeSet *fKatakanaSet;
1881 UnicodeSet *fHebrew_LetterSet;
1882 UnicodeSet *fALetterSet;
1883 UnicodeSet *fSingle_QuoteSet;
1884 UnicodeSet *fDouble_QuoteSet;
1885 UnicodeSet *fMidNumLetSet;
1886 UnicodeSet *fMidLetterSet;
1887 UnicodeSet *fMidNumSet;
1888 UnicodeSet *fNumericSet;
1889 UnicodeSet *fFormatSet;
1890 UnicodeSet *fOtherSet = nullptr;
1891 UnicodeSet *fExtendSet;
1892 UnicodeSet *fExtendNumLetSet;
1893 UnicodeSet *fWSegSpaceSet;
1894 UnicodeSet *fDictionarySet = nullptr;
1895 UnicodeSet *fZWJSet;
1896 UnicodeSet *fExtendedPictSet;
1897
1898 const UnicodeString *fText;
1899 };
1900
1901
RBBIWordMonkey()1902 RBBIWordMonkey::RBBIWordMonkey()
1903 {
1904 UErrorCode status = U_ZERO_ERROR;
1905
1906 fSets = new UVector(status);
1907
1908 fCRSet = new UnicodeSet(u"[\\p{Word_Break = CR}]", status);
1909 fLFSet = new UnicodeSet(u"[\\p{Word_Break = LF}]", status);
1910 fNewlineSet = new UnicodeSet(u"[\\p{Word_Break = Newline}]", status);
1911 fKatakanaSet = new UnicodeSet(u"[\\p{Word_Break = Katakana}]", status);
1912 fRegionalIndicatorSet = new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
1913 fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
1914 fALetterSet = new UnicodeSet(u"[\\p{Word_Break = ALetter}]", status);
1915 fSingle_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]", status);
1916 fDouble_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]", status);
1917 fMidNumLetSet = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]", status);
1918 fMidLetterSet = new UnicodeSet(u"[\\p{Word_Break = MidLetter}]", status);
1919 fMidNumSet = new UnicodeSet(u"[\\p{Word_Break = MidNum}]", status);
1920 fNumericSet = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
1921 fFormatSet = new UnicodeSet(u"[\\p{Word_Break = Format}]", status);
1922 fExtendNumLetSet = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
1923 // There are some sc=Hani characters with WB=Extend.
1924 // The break rules need to pick one or the other because
1925 // Extend overlapping with something else is messy.
1926 // For Unicode 13, we chose to keep U+16FF0 & U+16FF1
1927 // in $Han (for $dictionary) and out of $Extend.
1928 fExtendSet = new UnicodeSet(u"[\\p{Word_Break = Extend}-[:Hani:]]", status);
1929 fWSegSpaceSet = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]", status);
1930
1931 fZWJSet = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]", status);
1932 fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1933 if(U_FAILURE(status)) {
1934 IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
1935 deferredStatus = status;
1936 return;
1937 }
1938
1939 fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
1940 fDictionarySet->addAll(*fKatakanaSet);
1941 fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
1942
1943 fALetterSet->removeAll(*fDictionarySet);
1944
1945 fOtherSet = new UnicodeSet();
1946 if(U_FAILURE(status)) {
1947 IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
1948 deferredStatus = status;
1949 return;
1950 }
1951
1952 fOtherSet->complement();
1953 fOtherSet->removeAll(*fCRSet);
1954 fOtherSet->removeAll(*fLFSet);
1955 fOtherSet->removeAll(*fNewlineSet);
1956 fOtherSet->removeAll(*fKatakanaSet);
1957 fOtherSet->removeAll(*fHebrew_LetterSet);
1958 fOtherSet->removeAll(*fALetterSet);
1959 fOtherSet->removeAll(*fSingle_QuoteSet);
1960 fOtherSet->removeAll(*fDouble_QuoteSet);
1961 fOtherSet->removeAll(*fMidLetterSet);
1962 fOtherSet->removeAll(*fMidNumSet);
1963 fOtherSet->removeAll(*fNumericSet);
1964 fOtherSet->removeAll(*fExtendNumLetSet);
1965 fOtherSet->removeAll(*fWSegSpaceSet);
1966 fOtherSet->removeAll(*fFormatSet);
1967 fOtherSet->removeAll(*fExtendSet);
1968 fOtherSet->removeAll(*fRegionalIndicatorSet);
1969 fOtherSet->removeAll(*fZWJSet);
1970 fOtherSet->removeAll(*fExtendedPictSet);
1971
1972 // Inhibit dictionary characters from being tested at all.
1973 fOtherSet->removeAll(*fDictionarySet);
1974
1975 // Add classes and their names
1976 fSets->addElement(fCRSet, status); classNames.push_back("CR");
1977 fSets->addElement(fLFSet, status); classNames.push_back("LF");
1978 fSets->addElement(fNewlineSet, status); classNames.push_back("Newline");
1979 fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
1980 fSets->addElement(fHebrew_LetterSet, status); classNames.push_back("Hebrew");
1981 fSets->addElement(fALetterSet, status); classNames.push_back("ALetter");
1982 fSets->addElement(fSingle_QuoteSet, status); classNames.push_back("Single Quote");
1983 fSets->addElement(fDouble_QuoteSet, status); classNames.push_back("Double Quote");
1984 // Omit Katakana from fSets, which omits Katakana characters
1985 // from the test data. They are all in the dictionary set,
1986 // which this (old, to be retired) monkey test cannot handle.
1987 //fSets->addElement(fKatakanaSet, status);
1988
1989 fSets->addElement(fMidLetterSet, status); classNames.push_back("MidLetter");
1990 fSets->addElement(fMidNumLetSet, status); classNames.push_back("MidNumLet");
1991 fSets->addElement(fMidNumSet, status); classNames.push_back("MidNum");
1992 fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
1993 fSets->addElement(fFormatSet, status); classNames.push_back("Format");
1994 fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
1995 fSets->addElement(fOtherSet, status); classNames.push_back("Other");
1996 fSets->addElement(fExtendNumLetSet, status); classNames.push_back("ExtendNumLet");
1997 fSets->addElement(fWSegSpaceSet, status); classNames.push_back("WSegSpace");
1998
1999 fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
2000 fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
2001
2002 if (U_FAILURE(status)) {
2003 deferredStatus = status;
2004 }
2005 }
2006
setText(const UnicodeString & s)2007 void RBBIWordMonkey::setText(const UnicodeString &s) {
2008 fText = &s;
2009 prepareAppliedRules(s.length());
2010 }
2011
2012
next(int32_t prevPos)2013 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2014 int p0, p1, p2, p3; // Indices of the significant code points around the
2015 // break position being tested. The candidate break
2016 // location is before p2.
2017
2018 int breakPos = -1;
2019
2020 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2021
2022 if (U_FAILURE(deferredStatus)) {
2023 return -1;
2024 }
2025
2026 // Prev break at end of string. return DONE.
2027 if (prevPos >= fText->length()) {
2028 return -1;
2029 }
2030 p0 = p1 = p2 = p3 = prevPos;
2031 c3 = fText->char32At(prevPos);
2032 c0 = c1 = c2 = 0;
2033 (void)p0; // Suppress set but not used warning.
2034
2035 // Loop runs once per "significant" character position in the input text.
2036 for (;;) {
2037 // Move all of the positions forward in the input string.
2038 p0 = p1; c0 = c1;
2039 p1 = p2; c1 = c2;
2040 p2 = p3; c2 = c3;
2041
2042 // Advance p3 by X(Extend | Format)* Rule 4
2043 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2044 do {
2045 p3 = fText->moveIndex32(p3, 1);
2046 c3 = fText->char32At(p3);
2047 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2048 break;
2049 }
2050 }
2051 while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
2052
2053
2054 if (p1 == p2) {
2055 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2056 continue;
2057 }
2058
2059 if (p2 == fText->length()) {
2060 // Reached end of string. Always a break position.
2061 break;
2062 }
2063
2064 // No Extend or Format characters may appear between the CR and LF,
2065 // which requires the additional check for p2 immediately following p1.
2066 //
2067 if (c1==0x0D && c2==0x0A) {
2068 setAppliedRule(p2, "WB3 CR x LF");
2069 continue;
2070 }
2071
2072 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2073 setAppliedRule(p2, "WB3a Break before and after newlines (including CR and LF)");
2074 break;
2075 }
2076 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2077 setAppliedRule(p2, "WB3a Break before and after newlines (including CR and LF)");
2078 break;
2079 }
2080
2081 // Not ignoring extend chars, so peek into input text to
2082 // get the potential ZWJ, the character immediately preceding c2.
2083 // Sloppy UChar32 indexing: p2-1 may reference trail half
2084 // but char32At will get the full code point.
2085 if (fZWJSet->contains(fText->char32At(p2 - 1)) && fExtendedPictSet->contains(c2)){
2086 setAppliedRule(p2, "WB3c ZWJ x Extended_Pictographic");
2087 continue;
2088 }
2089
2090 if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
2091 setAppliedRule(p2, "WB3d Keep horizontal whitespace together.");
2092 continue;
2093 }
2094
2095 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2096 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2097 setAppliedRule(p2, "WB4 (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)");
2098 continue;
2099 }
2100
2101 if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2102 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2103 (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2104 setAppliedRule(p2,
2105 "WB6 (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter _Letter)");
2106 continue;
2107 }
2108
2109 if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2110 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2111 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2112 setAppliedRule(p2,
2113 "WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)");
2114 continue;
2115 }
2116
2117 if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2118 setAppliedRule(p2, "WB7a Hebrew_Letter x Single_Quote");
2119 continue;
2120 }
2121
2122 if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2123 setAppliedRule(p2, "WB7b Hebrew_Letter x Double_Quote Hebrew_Letter");
2124 continue;
2125 }
2126
2127 if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2128 setAppliedRule(p2, "WB7c Hebrew_Letter Double_Quote x Hebrew_Letter");
2129 continue;
2130 }
2131
2132 if (fNumericSet->contains(c1) &&
2133 fNumericSet->contains(c2)) {
2134 setAppliedRule(p2, "WB8 Numeric x Numeric");
2135 continue;
2136 }
2137
2138 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2139 fNumericSet->contains(c2)) {
2140 setAppliedRule(p2, "WB9 (ALetter | Hebrew_Letter) x Numeric");
2141 continue;
2142 }
2143
2144 if (fNumericSet->contains(c1) &&
2145 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2146 setAppliedRule(p2, "WB10 Numeric x (ALetter | Hebrew_Letter)");
2147 continue;
2148 }
2149
2150 if (fNumericSet->contains(c0) &&
2151 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2152 fNumericSet->contains(c2)) {
2153 setAppliedRule(p2, "WB11 Numeric (MidNum | MidNumLet | Single_Quote) x Numeric");
2154 continue;
2155 }
2156
2157 if (fNumericSet->contains(c1) &&
2158 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2159 fNumericSet->contains(c3)) {
2160 setAppliedRule(p2, "WB12 Numeric x (MidNum | MidNumLet | SingleQuote) Numeric");
2161 continue;
2162 }
2163
2164 // Note: matches UAX 29 rules, but doesn't come into play for ICU because
2165 // all Katakana are handled by the dictionary breaker.
2166 if (fKatakanaSet->contains(c1) &&
2167 fKatakanaSet->contains(c2)) {
2168 setAppliedRule(p2, "WB13 Katakana x Katakana");
2169 continue;
2170 }
2171
2172 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2173 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2174 fExtendNumLetSet->contains(c2)) {
2175 setAppliedRule(p2,
2176 "WB13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet");
2177 continue;
2178 }
2179
2180 if (fExtendNumLetSet->contains(c1) &&
2181 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2182 fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) {
2183 setAppliedRule(p2, "WB13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)");
2184 continue;
2185 }
2186
2187 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2188 setAppliedRule(p2, "WB15 - WB17 Group pairs of Regional Indicators.");
2189 break;
2190 }
2191 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2192 setAppliedRule(p2, "WB15 - WB17 Group pairs of Regional Indicators.");
2193 continue;
2194 }
2195
2196 setAppliedRule(p2, "WB999");
2197 break;
2198 }
2199
2200 breakPos = p2;
2201 return breakPos;
2202 }
2203
2204
charClasses()2205 UVector *RBBIWordMonkey::charClasses() {
2206 return fSets;
2207 }
2208
~RBBIWordMonkey()2209 RBBIWordMonkey::~RBBIWordMonkey() {
2210 delete fSets;
2211 delete fCRSet;
2212 delete fLFSet;
2213 delete fNewlineSet;
2214 delete fKatakanaSet;
2215 delete fHebrew_LetterSet;
2216 delete fALetterSet;
2217 delete fSingle_QuoteSet;
2218 delete fDouble_QuoteSet;
2219 delete fMidNumLetSet;
2220 delete fMidLetterSet;
2221 delete fMidNumSet;
2222 delete fNumericSet;
2223 delete fFormatSet;
2224 delete fExtendSet;
2225 delete fExtendNumLetSet;
2226 delete fWSegSpaceSet;
2227 delete fRegionalIndicatorSet;
2228 delete fDictionarySet;
2229 delete fOtherSet;
2230 delete fZWJSet;
2231 delete fExtendedPictSet;
2232 }
2233
2234
2235
2236
2237 //------------------------------------------------------------------------------------------
2238 //
2239 // class RBBISentMonkey Sentence Break specific implementation
2240 // of RBBIMonkeyKind.
2241 //
2242 //------------------------------------------------------------------------------------------
2243 class RBBISentMonkey: public RBBIMonkeyKind {
2244 public:
2245 RBBISentMonkey();
2246 virtual ~RBBISentMonkey();
2247 virtual UVector *charClasses() override;
2248 virtual void setText(const UnicodeString &s) override;
2249 virtual int32_t next(int32_t i) override;
2250 private:
2251 int moveBack(int posFrom);
2252 int moveForward(int posFrom);
2253 UChar32 cAt(int pos);
2254
2255 UVector *fSets;
2256
2257 UnicodeSet *fSepSet;
2258 UnicodeSet *fFormatSet;
2259 UnicodeSet *fSpSet;
2260 UnicodeSet *fLowerSet;
2261 UnicodeSet *fUpperSet;
2262 UnicodeSet *fOLetterSet;
2263 UnicodeSet *fNumericSet;
2264 UnicodeSet *fATermSet;
2265 UnicodeSet *fSContinueSet;
2266 UnicodeSet *fSTermSet;
2267 UnicodeSet *fCloseSet;
2268 UnicodeSet *fOtherSet;
2269 UnicodeSet *fExtendSet;
2270
2271 const UnicodeString *fText;
2272 };
2273
RBBISentMonkey()2274 RBBISentMonkey::RBBISentMonkey()
2275 {
2276 UErrorCode status = U_ZERO_ERROR;
2277
2278 fSets = new UVector(status);
2279
2280 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2281 // set and made into character classes of their own. For the monkey impl,
2282 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2283 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status);
2284 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status);
2285 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status);
2286 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status);
2287 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status);
2288 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status);
2289 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status);
2290 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status);
2291 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2292 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status);
2293 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status);
2294 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status);
2295 fOtherSet = new UnicodeSet();
2296
2297 if(U_FAILURE(status)) {
2298 deferredStatus = status;
2299 return;
2300 }
2301
2302 fOtherSet->complement();
2303 fOtherSet->removeAll(*fSepSet);
2304 fOtherSet->removeAll(*fFormatSet);
2305 fOtherSet->removeAll(*fSpSet);
2306 fOtherSet->removeAll(*fLowerSet);
2307 fOtherSet->removeAll(*fUpperSet);
2308 fOtherSet->removeAll(*fOLetterSet);
2309 fOtherSet->removeAll(*fNumericSet);
2310 fOtherSet->removeAll(*fATermSet);
2311 fOtherSet->removeAll(*fSContinueSet);
2312 fOtherSet->removeAll(*fSTermSet);
2313 fOtherSet->removeAll(*fCloseSet);
2314 fOtherSet->removeAll(*fExtendSet);
2315
2316 fSets->addElement(fSepSet, status); classNames.push_back("Sep");
2317 fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2318 fSets->addElement(fSpSet, status); classNames.push_back("Sp");
2319 fSets->addElement(fLowerSet, status); classNames.push_back("Lower");
2320 fSets->addElement(fUpperSet, status); classNames.push_back("Upper");
2321 fSets->addElement(fOLetterSet, status); classNames.push_back("OLetter");
2322 fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
2323 fSets->addElement(fATermSet, status); classNames.push_back("ATerm");
2324 fSets->addElement(fSContinueSet, status); classNames.push_back("SContinue");
2325 fSets->addElement(fSTermSet, status); classNames.push_back("STerm");
2326 fSets->addElement(fCloseSet, status); classNames.push_back("Close");
2327 fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2328 fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2329
2330 if (U_FAILURE(status)) {
2331 deferredStatus = status;
2332 }
2333 }
2334
2335
2336
setText(const UnicodeString & s)2337 void RBBISentMonkey::setText(const UnicodeString &s) {
2338 fText = &s;
2339 prepareAppliedRules(s.length());
2340 }
2341
charClasses()2342 UVector *RBBISentMonkey::charClasses() {
2343 return fSets;
2344 }
2345
2346 // moveBack() Find the "significant" code point preceding the index i.
2347 // Skips over ($Extend | $Format)* .
2348 //
moveBack(int i)2349 int RBBISentMonkey::moveBack(int i) {
2350 if (i <= 0) {
2351 return -1;
2352 }
2353 UChar32 c;
2354 int32_t j = i;
2355 do {
2356 j = fText->moveIndex32(j, -1);
2357 c = fText->char32At(j);
2358 }
2359 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2360 return j;
2361
2362 }
2363
2364
moveForward(int i)2365 int RBBISentMonkey::moveForward(int i) {
2366 if (i>=fText->length()) {
2367 return fText->length();
2368 }
2369 UChar32 c;
2370 int32_t j = i;
2371 do {
2372 j = fText->moveIndex32(j, 1);
2373 c = cAt(j);
2374 }
2375 while (fFormatSet->contains(c) || fExtendSet->contains(c));
2376 return j;
2377 }
2378
cAt(int pos)2379 UChar32 RBBISentMonkey::cAt(int pos) {
2380 if (pos<0 || pos>=fText->length()) {
2381 return -1;
2382 } else {
2383 return fText->char32At(pos);
2384 }
2385 }
2386
// Reference sentence-boundary finder used to cross-check the rule-based
// iterator.
//
// Starting at prevPos, slides a window of four "significant" code points
// (c0..c3 at indices p0..p3) forward through fText, using moveForward() so
// that Extend and Format characters are skipped. On every step the candidate
// boundary is the position just before p2: a rule that forbids a break there
// does `continue`, a rule that requires one does `break`. The name of the
// rule being applied is recorded with setAppliedRule().
//
// Returns the index of the next boundary, or -1 if deferredStatus is a
// failure or prevPos is at or beyond the end of the text.
int32_t RBBISentMonkey::next(int32_t prevPos) {
    int p0, p1, p2, p3;        // Indices of the significant code points around the
                               //   break position being tested. The candidate break
                               //   location is before p2.

    int breakPos = -1;

    UChar32 c0, c1, c2, c3;    // The code points at p0, p1, p2 & p3.
    UChar32 c;                 // Scratch code point for the look-behind / look-ahead scans.

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Prev break at end of string. return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }
    p0 = p1 = p2 = p3 = prevPos;
    c3 = fText->char32At(prevPos);
    c0 = c1 = c2 = 0;
    (void)p0;     // Suppress set but not used warning.

    // Loop runs once per "significant" character position in the input text.
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1; c0 = c1;
        p1 = p2; c1 = c2;
        p2 = p3; c2 = c3;

        // Advance p3 by X(Extend | Format)* Rule 4
        p3 = moveForward(p3);
        c3 = cAt(p3);

        // CR LF stays together only when the LF directly follows the CR,
        // with nothing skipped in between.
        if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
            setAppliedRule(p2, "SB3 CR x LF");
            continue;
        }

        if (fSepSet->contains(c1)) {
            p2 = p1+1;   // Separators don't combine with Extend or Format.

            setAppliedRule(p2, "SB4 Sep <break>");
            break;
        }

        if (p2 >= fText->length()) {
            // Reached end of string. Always a break position.
            setAppliedRule(p2, "SB4 Sep <break>");
            break;
        }

        if (p2 == prevPos) {
            // Still warming up the loop. (won't work with zero length strings, but we don't care)
            setAppliedRule(p2, "SB4 Sep <break>");
            continue;
        }

        if (fATermSet->contains(c1) && fNumericSet->contains(c2)) {
            setAppliedRule(p2, "SB6 ATerm x Numeric");
            continue;
        }

        if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
            fATermSet->contains(c1) && fUpperSet->contains(c2)) {
            setAppliedRule(p2, "SB7 (Upper | Lower) ATerm x Uppper");
            continue;
        }

        // Note: STerm | ATerm are added to the negated part of the expression by a
        // note to the Unicode 5.0 documents.
        //
        // Look behind from p1 over Sp* and then Close*; cAt() returns -1 once
        // moveBack() runs off the start, which ends each scan.
        int p8 = p1;
        while (fSpSet->contains(cAt(p8))) {
            p8 = moveBack(p8);
        }
        while (fCloseSet->contains(cAt(p8))) {
            p8 = moveBack(p8);
        }
        if (fATermSet->contains(cAt(p8))) {
            // Look ahead from p2 to the first code point that is one of
            // OLetter / Upper / Lower / Sep / ATerm / STerm, or the end of text.
            p8=p2;
            for (;;) {
                c = cAt(p8);
                if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
                    fLowerSet->contains(c) || fSepSet->contains(c) ||
                    fATermSet->contains(c) || fSTermSet->contains(c)) {

                    setAppliedRule(p2,
                            "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
                    break;
                }
                p8 = moveForward(p8);
            }
            // The rule suppresses the break only if the scan stopped on a Lower.
            if (fLowerSet->contains(cAt(p8))) {

                setAppliedRule(p2,
                        "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
                continue;
            }
        }

        if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
            // Same look-behind as above: skip Sp* then Close* before p2.
            p8 = p1;
            while (fSpSet->contains(cAt(p8))) {
                p8 = moveBack(p8);
            }
            while (fCloseSet->contains(cAt(p8))) {
                p8 = moveBack(p8);
            }
            c = cAt(p8);
            if (fSTermSet->contains(c) || fATermSet->contains(c)) {
                setAppliedRule(p2, "SB8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm)");
                continue;
            }
        }

        // SB9 look-behind: skip Close* only.
        int p9 = p1;
        while (fCloseSet->contains(cAt(p9))) {
            p9 = moveBack(p9);
        }
        c = cAt(p9);
        if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
            if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {

                setAppliedRule(p2, "SB9 (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)");
                continue;
            }
        }

        // SB10 look-behind: skip Sp* then Close*.
        int p10 = p1;
        while (fSpSet->contains(cAt(p10))) {
            p10 = moveBack(p10);
        }
        while (fCloseSet->contains(cAt(p10))) {
            p10 = moveBack(p10);
        }
        if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
            if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
                setAppliedRule(p2, "SB10 (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)");
                continue;
            }
        }

        // SB11 look-behind: skip at most one Sep, then Sp*, then Close*.
        int p11 = p1;
        if (fSepSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        while (fSpSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        while (fCloseSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
            setAppliedRule(p2, "SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>");
            break;
        }

        // No rule forced a break before p2: keep scanning.
        setAppliedRule(p2, "SB12 Any x Any");
        continue;
    }

    breakPos = p2;
    return breakPos;
}
2551
~RBBISentMonkey()2552 RBBISentMonkey::~RBBISentMonkey() {
2553 delete fSets;
2554 delete fSepSet;
2555 delete fFormatSet;
2556 delete fSpSet;
2557 delete fLowerSet;
2558 delete fUpperSet;
2559 delete fOLetterSet;
2560 delete fNumericSet;
2561 delete fATermSet;
2562 delete fSContinueSet;
2563 delete fSTermSet;
2564 delete fCloseSet;
2565 delete fOtherSet;
2566 delete fExtendSet;
2567 }
2568
2569
2570
2571 //-------------------------------------------------------------------------------------------
2572 //
2573 // RBBILineMonkey
2574 //
2575 //-------------------------------------------------------------------------------------------
2576
2577 class RBBILineMonkey: public RBBIMonkeyKind {
2578 public:
2579 RBBILineMonkey();
2580 virtual ~RBBILineMonkey();
2581 virtual UVector *charClasses() override;
2582 virtual void setText(const UnicodeString &s) override;
2583 virtual int32_t next(int32_t i) override;
2584 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2585 private:
2586 UVector *fSets;
2587
2588 UnicodeSet *fBK;
2589 UnicodeSet *fCR;
2590 UnicodeSet *fLF;
2591 UnicodeSet *fCM;
2592 UnicodeSet *fNL;
2593 UnicodeSet *fSG;
2594 UnicodeSet *fWJ;
2595 UnicodeSet *fZW;
2596 UnicodeSet *fGL;
2597 UnicodeSet *fCB;
2598 UnicodeSet *fSP;
2599 UnicodeSet *fB2;
2600 UnicodeSet *fBA;
2601 UnicodeSet *fBB;
2602 UnicodeSet *fHH;
2603 UnicodeSet *fHY;
2604 UnicodeSet *fH2;
2605 UnicodeSet *fH3;
2606 UnicodeSet *fCL;
2607 UnicodeSet *fCP;
2608 UnicodeSet *fEX;
2609 UnicodeSet *fIN;
2610 UnicodeSet *fJL;
2611 UnicodeSet *fJV;
2612 UnicodeSet *fJT;
2613 UnicodeSet *fNS;
2614 UnicodeSet *fOP;
2615 UnicodeSet *fQU;
2616 UnicodeSet *fIS;
2617 UnicodeSet *fNU;
2618 UnicodeSet *fPO;
2619 UnicodeSet *fPR;
2620 UnicodeSet *fSY;
2621 UnicodeSet *fAI;
2622 UnicodeSet *fAL;
2623 UnicodeSet *fCJ;
2624 UnicodeSet *fHL;
2625 UnicodeSet *fID;
2626 UnicodeSet *fRI;
2627 UnicodeSet *fXX;
2628 UnicodeSet *fEB;
2629 UnicodeSet *fEM;
2630 UnicodeSet *fZWJ;
2631 UnicodeSet *fOP30;
2632 UnicodeSet *fCP30;
2633 UnicodeSet *fExtPictUnassigned;
2634
2635 BreakIterator *fCharBI;
2636 const UnicodeString *fText;
2637 RegexMatcher *fNumberMatcher;
2638 };
2639
RBBILineMonkey()2640 RBBILineMonkey::RBBILineMonkey() :
2641 RBBIMonkeyKind(),
2642 fSets(NULL),
2643
2644 fCharBI(NULL),
2645 fText(NULL),
2646 fNumberMatcher(NULL)
2647
2648 {
2649 if (U_FAILURE(deferredStatus)) {
2650 return;
2651 }
2652
2653 UErrorCode status = U_ZERO_ERROR;
2654
2655 fSets = new UVector(status);
2656
2657 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2658 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2659 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2660 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2661 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2662 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2663 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2664 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2665 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2666 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2667 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2668 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2669 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2670 fHH = new UnicodeSet();
2671 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2672 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2673 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2674 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2675 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2676 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2677 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2678 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2679 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2680 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2681 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2682 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2683 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2684 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2685 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2686 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2687 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2688 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2689 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2690 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2691 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2692 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2693 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2694 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2695 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2696 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2697 fEB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status);
2698 fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
2699 fZWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
2700 fOP30 = new UnicodeSet(u"[\\p{Line_break=OP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2701 fCP30 = new UnicodeSet(u"[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2702 fExtPictUnassigned = new UnicodeSet(u"[\\p{Extended_Pictographic}&\\p{Cn}]", status);
2703
2704 if (U_FAILURE(status)) {
2705 deferredStatus = status;
2706 return;
2707 }
2708
2709 fAL->addAll(*fXX); // Default behavior for XX is identical to AL
2710 fAL->addAll(*fAI); // Default behavior for AI is identical to AL
2711 fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
2712
2713 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
2714 fCM->addAll(*fZWJ); // ZWJ behaves as a CM.
2715
2716 fHH->add(u'\u2010'); // Hyphen, '‐'
2717
2718 // Sets and names.
2719 fSets->addElement(fBK, status); classNames.push_back("fBK");
2720 fSets->addElement(fCR, status); classNames.push_back("fCR");
2721 fSets->addElement(fLF, status); classNames.push_back("fLF");
2722 fSets->addElement(fCM, status); classNames.push_back("fCM");
2723 fSets->addElement(fNL, status); classNames.push_back("fNL");
2724 fSets->addElement(fWJ, status); classNames.push_back("fWJ");
2725 fSets->addElement(fZW, status); classNames.push_back("fZW");
2726 fSets->addElement(fGL, status); classNames.push_back("fGL");
2727 fSets->addElement(fCB, status); classNames.push_back("fCB");
2728 fSets->addElement(fSP, status); classNames.push_back("fSP");
2729 fSets->addElement(fB2, status); classNames.push_back("fB2");
2730 fSets->addElement(fBA, status); classNames.push_back("fBA");
2731 fSets->addElement(fBB, status); classNames.push_back("fBB");
2732 fSets->addElement(fHY, status); classNames.push_back("fHY");
2733 fSets->addElement(fH2, status); classNames.push_back("fH2");
2734 fSets->addElement(fH3, status); classNames.push_back("fH3");
2735 fSets->addElement(fCL, status); classNames.push_back("fCL");
2736 fSets->addElement(fCP, status); classNames.push_back("fCP");
2737 fSets->addElement(fEX, status); classNames.push_back("fEX");
2738 fSets->addElement(fIN, status); classNames.push_back("fIN");
2739 fSets->addElement(fJL, status); classNames.push_back("fJL");
2740 fSets->addElement(fJT, status); classNames.push_back("fJT");
2741 fSets->addElement(fJV, status); classNames.push_back("fJV");
2742 fSets->addElement(fNS, status); classNames.push_back("fNS");
2743 fSets->addElement(fOP, status); classNames.push_back("fOP");
2744 fSets->addElement(fQU, status); classNames.push_back("fQU");
2745 fSets->addElement(fIS, status); classNames.push_back("fIS");
2746 fSets->addElement(fNU, status); classNames.push_back("fNU");
2747 fSets->addElement(fPO, status); classNames.push_back("fPO");
2748 fSets->addElement(fPR, status); classNames.push_back("fPR");
2749 fSets->addElement(fSY, status); classNames.push_back("fSY");
2750 fSets->addElement(fAI, status); classNames.push_back("fAI");
2751 fSets->addElement(fAL, status); classNames.push_back("fAL");
2752 fSets->addElement(fHL, status); classNames.push_back("fHL");
2753 fSets->addElement(fID, status); classNames.push_back("fID");
2754 fSets->addElement(fRI, status); classNames.push_back("fRI");
2755 fSets->addElement(fSG, status); classNames.push_back("fSG");
2756 fSets->addElement(fEB, status); classNames.push_back("fEB");
2757 fSets->addElement(fEM, status); classNames.push_back("fEM");
2758 fSets->addElement(fZWJ, status); classNames.push_back("fZWJ");
2759 // TODO: fOP30 & fCP30 overlap with plain fOP. Probably OK, but fOP/CP chars will be over-represented.
2760 fSets->addElement(fOP30, status); classNames.push_back("fOP30");
2761 fSets->addElement(fCP30, status); classNames.push_back("fCP30");
2762 fSets->addElement(fExtPictUnassigned, status); classNames.push_back("fExtPictUnassigned");
2763
2764 const char *rules =
2765 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
2766 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
2767 "((\\p{Line_Break=IS})(\\p{Line_Break=CM}|\\u200d)*)?"
2768 "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
2769 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
2770 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
2771 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
2772
2773 fNumberMatcher = new RegexMatcher(
2774 UnicodeString(rules, -1, US_INV), 0, status);
2775
2776 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2777
2778 if (U_FAILURE(status)) {
2779 deferredStatus = status;
2780 }
2781
2782 }
2783
2784
setText(const UnicodeString & s)2785 void RBBILineMonkey::setText(const UnicodeString &s) {
2786 fText = &s;
2787 fCharBI->setText(s);
2788 prepareAppliedRules(s.length());
2789 fNumberMatcher->reset(s);
2790 }
2791
2792 //
2793 // rule9Adjust
2794 // Line Break TR rules 9 and 10 implementation.
//     This deals with combining marks and other sequences
//     that must be treated as if they were something other than what they actually are.
2797 //
2798 // This is factored out into a separate function because it must be applied twice for
2799 // each potential break, once to the chars before the position being checked, then
2800 // again to the text following the possible break.
2801 //
rule9Adjust(int32_t pos,UChar32 * posChar,int32_t * nextPos,UChar32 * nextChar)2802 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2803 if (pos == -1) {
2804 // Invalid initial position. Happens during the warmup iteration of the
2805 // main loop in next().
2806 return;
2807 }
2808
2809 int32_t nPos = *nextPos;
2810
2811 // LB 9 Keep combining sequences together.
2812 // advance over any CM class chars. Note that Line Break CM is different
2813 // from the normal Grapheme Extend property.
2814 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2815 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2816 for (;;) {
2817 *nextChar = fText->char32At(nPos);
2818 if (!fCM->contains(*nextChar)) {
2819 break;
2820 }
2821 nPos = fText->moveIndex32(nPos, 1);
2822 }
2823 }
2824
2825
2826 // LB 9 Treat X CM* as if it were x.
2827 // No explicit action required.
2828
2829 // LB 10 Treat any remaining combining mark as AL
2830 if (fCM->contains(*posChar)) {
2831 *posChar = u'A';
2832 }
2833
2834 // Push the updated nextPos and nextChar back to our caller.
2835 // This only makes a difference if posChar got bigger by consuming a
2836 // combining sequence.
2837 *nextPos = nPos;
2838 *nextChar = fText->char32At(nPos);
2839 }
2840
2841
2842
next(int32_t startPos)2843 int32_t RBBILineMonkey::next(int32_t startPos) {
2844 UErrorCode status = U_ZERO_ERROR;
2845 int32_t pos; // Index of the char following a potential break position
2846 UChar32 thisChar; // Character at above position "pos"
2847
2848 int32_t prevPos; // Index of the char preceding a potential break position
2849 UChar32 prevChar; // Character at above position. Note that prevChar
2850 // and thisChar may not be adjacent because combining
2851 // characters between them will be ignored.
2852
2853 int32_t prevPosX2; // Second previous character. Wider context for LB21a.
2854 UChar32 prevCharX2;
2855
2856 int32_t nextPos; // Index of the next character following pos.
2857 // Usually skips over combining marks.
2858 int32_t nextCPPos; // Index of the code point following "pos."
2859 // May point to a combining mark.
2860 int32_t tPos; // temp value.
2861 UChar32 c;
2862
2863 if (U_FAILURE(deferredStatus)) {
2864 return -1;
2865 }
2866
2867 if (startPos >= fText->length()) {
2868 return -1;
2869 }
2870
2871
2872 // Initial values for loop. Loop will run the first time without finding breaks,
2873 // while the invalid values shift out and the "this" and
2874 // "prev" positions are filled in with good values.
2875 pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration.
2876 thisChar = prevChar = prevCharX2 = 0;
2877 nextPos = nextCPPos = startPos;
2878
2879
2880 // Loop runs once per position in the test text, until a break position
2881 // is found.
2882 for (;;) {
2883 prevPosX2 = prevPos;
2884 prevCharX2 = prevChar;
2885
2886 prevPos = pos;
2887 prevChar = thisChar;
2888
2889 pos = nextPos;
2890 thisChar = fText->char32At(pos);
2891
2892 nextCPPos = fText->moveIndex32(pos, 1);
2893 nextPos = nextCPPos;
2894
2895
2896 if (pos >= fText->length()) {
2897 setAppliedRule(pos, "LB2 - Break at end of text.");
2898 break;
2899 }
2900
2901
2902 // We do this one out-of-order because the adjustment does not change anything
2903 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
2904 // be applied.
2905 rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
2906 nextCPPos = nextPos = fText->moveIndex32(pos, 1);
2907 c = fText->char32At(nextPos);
2908 rule9Adjust(pos, &thisChar, &nextPos, &c);
2909
2910 // If the loop is still warming up - if we haven't shifted the initial
2911 // -1 positions out of prevPos yet - loop back to advance the
2912 // position in the input without any further looking for breaks.
2913 if (prevPos == -1) {
2914 setAppliedRule(pos, "LB 9 - adjust for combining sequences.");
2915 continue;
2916 }
2917
2918
2919 if (fBK->contains(prevChar)) {
2920 setAppliedRule(pos, "LB 4 Always break after hard line breaks");
2921 break;
2922 }
2923
2924
2925 if (prevChar == 0x0d && thisChar == 0x0a) {
2926 setAppliedRule(pos, "LB 5 Break after CR, LF, NL, but not inside CR LF");
2927 continue;
2928 }
2929 if (prevChar == 0x0d ||
2930 prevChar == 0x0a ||
2931 prevChar == 0x85) {
2932 setAppliedRule(pos, "LB 5 Break after CR, LF, NL, but not inside CR LF");
2933 break;
2934 }
2935
2936
2937 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
2938 fBK->contains(thisChar)) {
2939 setAppliedRule(pos, "LB 6 Don't break before hard line breaks");
2940 continue;
2941 }
2942
2943
2944 if (fSP->contains(thisChar)) {
2945 setAppliedRule(pos, "LB 7 Don't break before spaces or zero-width space.");
2946 continue;
2947 }
2948
2949 // !!! ??? Is this the right text for the applied rule?
2950 if (fZW->contains(thisChar)) {
2951 setAppliedRule(pos, "LB 7 Don't break before spaces or zero-width space.");
2952 continue;
2953 }
2954
2955
2956 // ZW SP* ÷
2957 // Scan backwards from prevChar for SP* ZW
2958 tPos = prevPos;
2959 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
2960 tPos = fText->moveIndex32(tPos, -1);
2961 }
2962 if (fZW->contains(fText->char32At(tPos))) {
2963 setAppliedRule(pos, "LB 8 Break after zero width space");
2964 break;
2965 }
2966
2967
2968 // Move this test up, before LB8a, because numbers can match a longer sequence that would
2969 // also match 8a. e.g. NU ZWJ IS PO (ZWJ acts like CM)
2970 if (fNumberMatcher->lookingAt(prevPos, status)) {
2971 if (U_FAILURE(status)) {
2972 setAppliedRule(pos, "LB 25 Numbers");
2973 break;
2974 }
2975 // Matched a number. But could have been just a single digit, which would
2976 // not represent a "no break here" between prevChar and thisChar
2977 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
2978 if (numEndIdx > pos) {
2979 // Number match includes at least our two chars being checked
2980 if (numEndIdx > nextPos) {
2981 // Number match includes additional chars. Update pos and nextPos
2982 // so that next loop iteration will continue at the end of the number,
2983 // checking for breaks between last char in number & whatever follows.
2984 pos = nextPos = numEndIdx;
2985 do {
2986 pos = fText->moveIndex32(pos, -1);
2987 thisChar = fText->char32At(pos);
2988 } while (fCM->contains(thisChar));
2989 }
2990 setAppliedRule(pos, "LB 25 Numbers");
2991 continue;
2992 }
2993 }
2994
2995
2996 // The monkey test's way of ignoring combining characters doesn't work
2997 // for this rule. ZJ is also a CM. Need to get the actual character
2998 // preceding "thisChar", not ignoring combining marks, possibly ZJ.
2999 {
3000 int32_t prevIdx = fText->moveIndex32(pos, -1);
3001 UChar32 prevC = fText->char32At(prevIdx);
3002 if (fZWJ->contains(prevC)) {
3003 setAppliedRule(pos, "LB 8a ZWJ x");
3004 continue;
3005 }
3006 }
3007
3008
3009 // appliedRule: "LB 9, 10"; // Already done, at top of loop.";
3010 //
3011
3012
3013 // x WJ
3014 // WJ x
3015 //
3016 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3017 setAppliedRule(pos, "LB 11 Do not break before or after WORD JOINER and related characters.");
3018 continue;
3019 }
3020
3021
3022 if (fGL->contains(prevChar)) {
3023 setAppliedRule(pos, "LB 12 GL x");
3024 continue;
3025 }
3026
3027
3028 if (!(fSP->contains(prevChar) ||
3029 fBA->contains(prevChar) ||
3030 fHY->contains(prevChar) ) && fGL->contains(thisChar)) {
3031 setAppliedRule(pos, "LB 12a [^SP BA HY] x GL");
3032 continue;
3033 }
3034
3035
3036 if (fCL->contains(thisChar) ||
3037 fCP->contains(thisChar) ||
3038 fEX->contains(thisChar) ||
3039 fSY->contains(thisChar)) {
3040 setAppliedRule(pos, "LB 13 Don't break before closings.");
3041 continue;
3042 }
3043
3044
3045 // Scan backwards, checking for this sequence.
3046 // The OP char could include combining marks, so we actually check for
3047 // OP CM* SP*
3048 // Another Twist: The Rule 9 fixes may have changed a SP CM
3049 // sequence into a ID char, so before scanning back through spaces,
3050 // verify that prevChar is indeed a space. The prevChar variable
3051 // may differ from fText[prevPos]
3052 tPos = prevPos;
3053 if (fSP->contains(prevChar)) {
3054 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3055 tPos=fText->moveIndex32(tPos, -1);
3056 }
3057 }
3058 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3059 tPos=fText->moveIndex32(tPos, -1);
3060 }
3061 if (fOP->contains(fText->char32At(tPos))) {
3062 setAppliedRule(pos, "LB 14 Don't break after OP SP*");
3063 continue;
3064 }
3065
3066
3067 if (nextPos < fText->length()) {
3068 // note: UnicodeString::char32At(length) returns ffff, not distinguishable
3069 // from a legit ffff character. So test length separately.
3070 UChar32 nextChar = fText->char32At(nextPos);
3071 if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) {
3072 setAppliedRule(pos, "LB 14a Break before an IS that begins a number and follows a space");
3073 break;
3074 }
3075 }
3076
3077
3078 if (fIS->contains(thisChar)) {
3079 setAppliedRule(pos, "LB 14b Do not break before numeric separators, even after spaces.");
3080 continue;
3081 }
3082
3083
3084 if (fOP->contains(thisChar)) {
3085 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3086 int tPos = prevPos;
3087 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3088 tPos = fText->moveIndex32(tPos, -1);
3089 }
3090 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3091 tPos = fText->moveIndex32(tPos, -1);
3092 }
3093 if (fQU->contains(fText->char32At(tPos))) {
3094 setAppliedRule(pos, "LB 15 QU SP* x OP");
3095 continue;
3096 }
3097 }
3098
3099
3100 // Scan backwards for SP* CM* (CL | CP)
3101 if (fNS->contains(thisChar)) {
3102 int tPos = prevPos;
3103 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3104 tPos = fText->moveIndex32(tPos, -1);
3105 }
3106 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3107 tPos = fText->moveIndex32(tPos, -1);
3108 }
3109 if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3110 setAppliedRule(pos, "LB 16 (CL | CP) SP* x NS");
3111 continue;
3112 }
3113 }
3114
3115
3116 if (fB2->contains(thisChar)) {
3117 // Scan backwards, checking for the B2 CM* SP* sequence.
3118 tPos = prevPos;
3119 if (fSP->contains(prevChar)) {
3120 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3121 tPos=fText->moveIndex32(tPos, -1);
3122 }
3123 }
3124 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3125 tPos=fText->moveIndex32(tPos, -1);
3126 }
3127 if (fB2->contains(fText->char32At(tPos))) {
3128 setAppliedRule(pos, "LB 17 B2 SP* x B2");
3129 continue;
3130 }
3131 }
3132
3133
3134 if (fSP->contains(prevChar)) {
3135 setAppliedRule(pos, "LB 18 break after space");
3136 break;
3137 }
3138
3139 // x QU
3140 // QU x
3141 if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3142 setAppliedRule(pos, "LB 19");
3143 continue;
3144 }
3145
3146 if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3147 setAppliedRule(pos, "LB 20 Break around a CB");
3148 break;
3149 }
3150
3151 // Don't break between Hyphens and letters if a break precedes the hyphen.
3152 // Formerly this was a Finnish tailoring.
3153 // Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
3154 // ^($HY | $HH) $AL;
3155 if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
3156 prevPosX2 == -1) {
3157 setAppliedRule(pos, "LB 20.09");
3158 continue;
3159 }
3160
3161 if (fBA->contains(thisChar) ||
3162 fHY->contains(thisChar) ||
3163 fNS->contains(thisChar) ||
3164 fBB->contains(prevChar) ) {
3165 setAppliedRule(pos, "LB 21");
3166 continue;
3167 }
3168
3169 if (fHL->contains(prevCharX2) &&
3170 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3171 setAppliedRule(pos, "LB 21a HL (HY | BA) x");
3172 continue;
3173 }
3174
3175 if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3176 setAppliedRule(pos, "LB 21b SY x HL");
3177 continue;
3178 }
3179
3180 if (fIN->contains(thisChar)) {
3181 setAppliedRule(pos, "LB 22");
3182 continue;
3183 }
3184
3185
3186 // (AL | HL) x NU
3187 // NU x (AL | HL)
3188 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
3189 setAppliedRule(pos, "LB 23");
3190 continue;
3191 }
3192 if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3193 setAppliedRule(pos, "LB 23");
3194 continue;
3195 }
3196
3197 // Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3198 // PR x (ID | EB | EM)
3199 // (ID | EB | EM) x PO
3200 if (fPR->contains(prevChar) &&
3201 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar))) {
3202 setAppliedRule(pos, "LB 23a");
3203 continue;
3204 }
3205 if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
3206 fPO->contains(thisChar)) {
3207 setAppliedRule(pos, "LB 23a");
3208 continue;
3209 }
3210
3211 // Do not break between prefix and letters or ideographs.
3212 // (PR | PO) x (AL | HL)
3213 // (AL | HL) x (PR | PO)
3214 if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
3215 (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3216 setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3217 continue;
3218 }
3219 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
3220 (fPR->contains(thisChar) || fPO->contains(thisChar))) {
3221 setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3222 continue;
3223 }
3224
3225 // appliedRule: "LB 25 numbers match"; // moved up, before LB 8a,
3226
3227 if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3228 fJV->contains(thisChar) ||
3229 fH2->contains(thisChar) ||
3230 fH3->contains(thisChar))) {
3231 setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3232 continue;
3233 }
3234
3235 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) &&
3236 (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3237 setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3238 continue;
3239 }
3240
3241 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3242 fJT->contains(thisChar)) {
3243 setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3244 continue;
3245 }
3246
3247 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3248 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3249 fPO->contains(thisChar)) {
3250 setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3251 continue;
3252 }
3253 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3254 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3255 setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3256 continue;
3257 }
3258
3259
3260
3261 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3262 setAppliedRule(pos, "LB 28 Do not break between alphabetics (\"at\").");
3263 continue;
3264 }
3265
3266 if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3267 setAppliedRule(pos, "LB 29 Do not break between numeric punctuation and alphabetics (\"e.g.\").");
3268 continue;
3269 }
3270
3271 // (AL | NU) x OP
3272 // CP x (AL | NU)
3273 if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP30->contains(thisChar)) {
3274 setAppliedRule(pos, "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3275 continue;
3276 }
3277 if (fCP30->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3278 setAppliedRule(pos, "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3279 continue;
3280 }
3281
3282 // RI x RI
3283 if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3284 setAppliedRule(pos, "LB30a RI RI ÷ RI");
3285 break;
3286 }
3287 if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3288 // Two Regional Indicators have been paired.
3289 // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
3290 // following RI. This is a hack.
3291 thisChar = -1;
3292 setAppliedRule(pos, "LB30a RI RI ÷ RI");
3293 continue;
3294 }
3295
3296 // LB30b Do not break between an emoji base (or potential emoji) and an emoji modifier.
3297 if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3298 setAppliedRule(pos, "LB30b Emoji Base x Emoji Modifier");
3299 continue;
3300 }
3301
3302 if (fExtPictUnassigned->contains(prevChar) && fEM->contains(thisChar)) {
3303 setAppliedRule(pos, "LB30b [\\p{Extended_Pictographic}&\\p{Cn}] × EM");
3304 continue;
3305 }
3306
3307 setAppliedRule(pos, "LB 31 Break everywhere else");
3308 break;
3309 }
3310
3311 return pos;
3312 }
3313
3314
//
//  charClasses
//      Return the vector of character-class sets (fSets) that the constructor
//      filled with this monkey's line-break class sets.
//      The returned pointer is the member itself, not a copy; it is deleted by
//      the destructor, so callers must not delete it.
//
UVector *RBBILineMonkey::charClasses() {
    return fSets;
}
3318
3319
~RBBILineMonkey()3320 RBBILineMonkey::~RBBILineMonkey() {
3321 delete fSets;
3322
3323 delete fBK;
3324 delete fCR;
3325 delete fLF;
3326 delete fCM;
3327 delete fNL;
3328 delete fWJ;
3329 delete fZW;
3330 delete fGL;
3331 delete fCB;
3332 delete fSP;
3333 delete fB2;
3334 delete fBA;
3335 delete fBB;
3336 delete fHH;
3337 delete fHY;
3338 delete fH2;
3339 delete fH3;
3340 delete fCL;
3341 delete fCP;
3342 delete fEX;
3343 delete fIN;
3344 delete fJL;
3345 delete fJV;
3346 delete fJT;
3347 delete fNS;
3348 delete fOP;
3349 delete fQU;
3350 delete fIS;
3351 delete fNU;
3352 delete fPO;
3353 delete fPR;
3354 delete fSY;
3355 delete fAI;
3356 delete fAL;
3357 delete fCJ;
3358 delete fHL;
3359 delete fID;
3360 delete fRI;
3361 delete fSG;
3362 delete fXX;
3363 delete fEB;
3364 delete fEM;
3365 delete fZWJ;
3366 delete fOP30;
3367 delete fCP30;
3368 delete fExtPictUnassigned;
3369
3370 delete fCharBI;
3371 delete fNumberMatcher;
3372 }
3373
3374
3375 //-------------------------------------------------------------------------------------------
3376 //
3377 // TestMonkey
3378 //
3379 // params
3380 // seed=nnnnn Random number starting seed.
3381 // Setting the seed allows errors to be reproduced.
3382 // loop=nnn Looping count. Controls running time.
3383 // -1: run forever.
3384 // 0 or greater: run length.
3385 //
3386 // type = char | word | line | sent | title
3387 //
3388 // Example:
3389 // intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3390 //
3391 //-------------------------------------------------------------------------------------------
3392
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)3393 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) {
3394 int32_t val = defaultVal;
3395 name.append(" *= *(-?\\d+)");
3396 UErrorCode status = U_ZERO_ERROR;
3397 RegexMatcher m(name, params, 0, status);
3398 if (m.find()) {
3399 // The param exists. Convert the string to an int.
3400 char valString[100];
3401 int32_t paramLength = m.end(1, status) - m.start(1, status);
3402 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3403 paramLength = (int32_t)(sizeof(valString)-2);
3404 }
3405 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3406 val = strtol(valString, NULL, 10);
3407
3408 // Delete this parameter from the params string.
3409 m.reset();
3410 params = m.replaceFirst("", status);
3411 }
3412 U_ASSERT(U_SUCCESS(status));
3413 return val;
3414 }
3415 #endif
3416
3417 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
testBreakBoundPreceding(RBBITest * test,UnicodeString ustr,BreakIterator * bi,int expected[],int expectedcount)3418 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3419 BreakIterator *bi,
3420 int expected[],
3421 int expectedcount)
3422 {
3423 int count = 0;
3424 int i = 0;
3425 int forward[50];
3426 bi->setText(ustr);
3427 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3428 forward[count] = i;
3429 if (count < expectedcount && expected[count] != i) {
3430 test->errln("%s:%d break forward test failed: expected %d but got %d",
3431 __FILE__, __LINE__, expected[count], i);
3432 break;
3433 }
3434 count ++;
3435 }
3436 if (count != expectedcount) {
3437 printStringBreaks(ustr, expected, expectedcount);
3438 test->errln("%s:%d break forward test failed: missed %d match",
3439 __FILE__, __LINE__, expectedcount - count);
3440 return;
3441 }
3442 // testing boundaries
3443 for (i = 1; i < expectedcount; i ++) {
3444 int j = expected[i - 1];
3445 if (!bi->isBoundary(j)) {
3446 printStringBreaks(ustr, expected, expectedcount);
3447 test->errln("%s:%d isBoundary() failed. Expected boundary at position %d",
3448 __FILE__, __LINE__, j);
3449 return;
3450 }
3451 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3452 if (bi->isBoundary(j)) {
3453 printStringBreaks(ustr, expected, expectedcount);
3454 test->errln("%s:%d isBoundary() failed. Not expecting boundary at position %d",
3455 __FILE__, __LINE__, j);
3456 return;
3457 }
3458 }
3459 }
3460
3461 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3462 count --;
3463 if (forward[count] != i) {
3464 printStringBreaks(ustr, expected, expectedcount);
3465 test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
3466 __FILE__, __LINE__, forward[count], i);
3467 break;
3468 }
3469 }
3470 if (count != 0) {
3471 printStringBreaks(ustr, expected, expectedcount);
3472 test->errln("break test previous() failed: missed a match");
3473 return;
3474 }
3475
3476 // testing preceding
3477 for (i = 0; i < expectedcount - 1; i ++) {
3478 // int j = expected[i] + 1;
3479 int j = ustr.moveIndex32(expected[i], 1);
3480 for (; j <= expected[i + 1]; j ++) {
3481 int32_t expectedPreceding = expected[i];
3482 int32_t actualPreceding = bi->preceding(j);
3483 if (actualPreceding != expectedPreceding) {
3484 printStringBreaks(ustr, expected, expectedcount);
3485 test->errln("%s:%d preceding(%d): expected %d, got %d",
3486 __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
3487 return;
3488 }
3489 }
3490 }
3491 }
3492 #endif
3493
TestWordBreaks(void)3494 void RBBITest::TestWordBreaks(void)
3495 {
3496 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3497
3498 Locale locale("en");
3499 UErrorCode status = U_ZERO_ERROR;
3500 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3501 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3502 // Replaced any C+J characters in a row with a random sequence of characters
3503 // of the same length to make our C+J segmentation not get in the way.
3504 static const char *strlist[] =
3505 {
3506 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3507 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3508 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3509 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3510 "\\uac00\\u3588\\u009c\\u0953\\u194b",
3511 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3512 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3513 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3514 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3515 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3516 "\\u2027\\U000e0067\\u0a47\\u00b7",
3517 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3518 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3519 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3520 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3521 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3522 "\\u0027\\u11af\\U000e0057\\u0602",
3523 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3524 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3525 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3526 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3527 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3528 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3529 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3530 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3531 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3532 "\\u18f4\\U000e0049\\u20e7\\u2027",
3533 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3534 "\\ua183\\u102d\\u0bec\\u003a",
3535 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3536 "\\u003a\\u0e57\\u0fad\\u002e",
3537 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3538 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3539 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3540 "\\u003a\\u0664\\u00b7\\u1fba",
3541 "\\u003b\\u0027\\u00b7\\u47a3",
3542 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3543 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3544 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3545 };
3546 int loop;
3547 if (U_FAILURE(status)) {
3548 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3549 return;
3550 }
3551 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3552 // printf("looping %d\n", loop);
3553 UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3554 // RBBICharMonkey monkey;
3555 RBBIWordMonkey monkey;
3556
3557 int expected[50];
3558 int expectedcount = 0;
3559
3560 monkey.setText(ustr);
3561 int i;
3562 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3563 expected[expectedcount ++] = i;
3564 }
3565
3566 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3567 }
3568 delete bi;
3569 #endif
3570 }
3571
TestWordBoundary(void)3572 void RBBITest::TestWordBoundary(void)
3573 {
3574 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3575 Locale locale("en");
3576 UErrorCode status = U_ZERO_ERROR;
3577 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3578 LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
3579 if (U_FAILURE(status)) {
3580 errcheckln(status, "%s:%d Creation of break iterator failed %s",
3581 __FILE__, __LINE__, u_errorName(status));
3582 return;
3583 }
3584 UChar str[50];
3585 static const char *strlist[] =
3586 {
3587 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3588 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3589 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3590 "\\u2027\\U000e0067\\u0a47\\u00b7",
3591 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3592 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3593 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3594 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3595 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3596 "\\u0027\\u11af\\U000e0057\\u0602",
3597 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3598 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3599 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3600 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3601 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3602 "\\U000e0065\\u302c\\u09ee\\U000e0068",
3603 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3604 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3605 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3606 "\\u58f4\\U000e0049\\u20e7\\u2027",
3607 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3608 "\\ua183\\u102d\\u0bec\\u003a",
3609 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3610 "\\u003a\\u0e57\\u0fad\\u002e",
3611 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3612 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3613 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3614 "\\u003a\\u0664\\u00b7\\u1fba",
3615 "\\u003b\\u0027\\u00b7\\u47a3",
3616 };
3617 int loop;
3618 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3619 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3620 UnicodeString ustr(str);
3621 int forward[50];
3622 int count = 0;
3623
3624 bi->setText(ustr);
3625 int prev = -1;
3626 for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
3627 ++count;
3628 if (count >= UPRV_LENGTHOF(forward)) {
3629 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
3630 __FILE__, __LINE__, loop, count, boundary);
3631 return;
3632 }
3633 forward[count] = boundary;
3634 if (boundary <= prev) {
3635 errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
3636 __FILE__, __LINE__, loop, prev, boundary);
3637 break;
3638 }
3639 for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
3640 if (bi->isBoundary(nonBoundary)) {
3641 printStringBreaks(ustr, forward, count);
3642 errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
3643 __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
3644 return;
3645 }
3646 }
3647 if (!bi->isBoundary(boundary)) {
3648 printStringBreaks(ustr, forward, count);
3649 errln("%s:%d happy boundary test failed: expected %d a boundary",
3650 __FILE__, __LINE__, boundary);
3651 return;
3652 }
3653 prev = boundary;
3654 }
3655 }
3656 }
3657
TestLineBreaks(void)3658 void RBBITest::TestLineBreaks(void)
3659 {
3660 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3661 Locale locale("en");
3662 UErrorCode status = U_ZERO_ERROR;
3663 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3664 const int32_t STRSIZE = 50;
3665 UChar str[STRSIZE];
3666 static const char *strlist[] =
3667 {
3668 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3669 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3670 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3671 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3672 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3673 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3674 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3675 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3676 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3677 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3678 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3679 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3680 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3681 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3682 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3683 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3684 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3685 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3686 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3687 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3688 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3689 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3690 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3691 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3692 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3693 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3694 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3695 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3696 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3697 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3698 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3699 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3700 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3701 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3702 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3703 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3704 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3705 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3706 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3707 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3708 };
3709 int loop;
3710 TEST_ASSERT_SUCCESS(status);
3711 if (U_FAILURE(status)) {
3712 return;
3713 }
3714 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3715 // printf("looping %d\n", loop);
3716 int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3717 if (t >= STRSIZE) {
3718 TEST_ASSERT(FALSE);
3719 continue;
3720 }
3721
3722
3723 UnicodeString ustr(str);
3724 RBBILineMonkey monkey;
3725 if (U_FAILURE(monkey.deferredStatus)) {
3726 continue;
3727 }
3728
3729 const int EXPECTEDSIZE = 50;
3730 int expected[EXPECTEDSIZE];
3731 int expectedcount = 0;
3732
3733 monkey.setText(ustr);
3734
3735 int i;
3736 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3737 if (expectedcount >= EXPECTEDSIZE) {
3738 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3739 return;
3740 }
3741 expected[expectedcount ++] = i;
3742 }
3743
3744 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3745 }
3746 delete bi;
3747 #endif
3748 }
3749
TestSentBreaks(void)3750 void RBBITest::TestSentBreaks(void)
3751 {
3752 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3753 Locale locale("en");
3754 UErrorCode status = U_ZERO_ERROR;
3755 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3756 UChar str[200];
3757 static const char *strlist[] =
3758 {
3759 "Now\ris\nthe\r\ntime\n\rfor\r\r",
3760 "This\n",
3761 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3762 "\"Sentence ending with a quote.\" Bye.",
3763 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
3764 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3765 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3766 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3767 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3768 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3769 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3770 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3771 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3772 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3773 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3774 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3775 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3776 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3777 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3778 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3779 };
3780 int loop;
3781 if (U_FAILURE(status)) {
3782 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3783 return;
3784 }
3785 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3786 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3787 UnicodeString ustr(str);
3788
3789 RBBISentMonkey monkey;
3790 if (U_FAILURE(monkey.deferredStatus)) {
3791 continue;
3792 }
3793
3794 const int EXPECTEDSIZE = 50;
3795 int expected[EXPECTEDSIZE];
3796 int expectedcount = 0;
3797
3798 monkey.setText(ustr);
3799
3800 int i;
3801 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3802 if (expectedcount >= EXPECTEDSIZE) {
3803 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3804 return;
3805 }
3806 expected[expectedcount ++] = i;
3807 }
3808
3809 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3810 }
3811 delete bi;
3812 #endif
3813 }
3814
TestMonkey()3815 void RBBITest::TestMonkey() {
3816 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3817
3818 UErrorCode status = U_ZERO_ERROR;
3819 int32_t loopCount = 500;
3820 int32_t seed = 1;
3821 UnicodeString breakType = "all";
3822 Locale locale("en");
3823 UBool useUText = FALSE;
3824
3825 if (quick == FALSE) {
3826 loopCount = 10000;
3827 }
3828
3829 if (fTestParams) {
3830 UnicodeString p(fTestParams);
3831 loopCount = getIntParam("loop", p, loopCount);
3832 seed = getIntParam("seed", p, seed);
3833
3834 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3835 if (m.find()) {
3836 breakType = m.group(1, status);
3837 m.reset();
3838 p = m.replaceFirst("", status);
3839 }
3840
3841 RegexMatcher u(" *utext", p, 0, status);
3842 if (u.find()) {
3843 useUText = TRUE;
3844 u.reset();
3845 p = u.replaceFirst("", status);
3846 }
3847
3848
3849 // m.reset(p);
3850 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3851 // Each option is stripped out of the option string as it is processed.
3852 // All options have been checked. The option string should have been completely emptied..
3853 char buf[100];
3854 p.extract(buf, sizeof(buf), NULL, status);
3855 buf[sizeof(buf)-1] = 0;
3856 errln("Unrecognized or extra parameter: %s\n", buf);
3857 return;
3858 }
3859
3860 }
3861
3862 if (breakType == "char" || breakType == "all") {
3863 RBBICharMonkey m;
3864 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3865 if (U_SUCCESS(status)) {
3866 RunMonkey(bi, m, "char", seed, loopCount, useUText);
3867 if (breakType == "all" && useUText==FALSE) {
3868 // Also run a quick test with UText when "all" is specified
3869 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3870 }
3871 }
3872 else {
3873 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3874 }
3875 delete bi;
3876 }
3877
3878 if (breakType == "word" || breakType == "all") {
3879 logln("Word Break Monkey Test");
3880 RBBIWordMonkey m;
3881 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3882 if (U_SUCCESS(status)) {
3883 RunMonkey(bi, m, "word", seed, loopCount, useUText);
3884 }
3885 else {
3886 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3887 }
3888 delete bi;
3889 }
3890
3891 if (breakType == "line" || breakType == "all") {
3892 logln("Line Break Monkey Test");
3893 RBBILineMonkey m;
3894 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3895 if (loopCount >= 10) {
3896 loopCount = loopCount / 5; // Line break runs slower than the others.
3897 }
3898 if (U_SUCCESS(status)) {
3899 RunMonkey(bi, m, "line", seed, loopCount, useUText);
3900 }
3901 else {
3902 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3903 }
3904 delete bi;
3905 }
3906
3907 if (breakType == "sent" || breakType == "all" ) {
3908 logln("Sentence Break Monkey Test");
3909 RBBISentMonkey m;
3910 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3911 if (loopCount >= 10) {
3912 loopCount = loopCount / 10; // Sentence runs slower than the other break types
3913 }
3914 if (U_SUCCESS(status)) {
3915 RunMonkey(bi, m, "sent", seed, loopCount, useUText);
3916 }
3917 else {
3918 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3919 }
3920 delete bi;
3921 }
3922
3923 #endif
3924 }
3925
3926 //
3927 // Run a RBBI monkey test. Common routine, for all break iterator types.
3928 // Parameters:
3929 // bi - the break iterator to use
3930 // mk - MonkeyKind, abstraction for obtaining expected results
3931 // name - Name of test (char, word, etc.) for use in error messages
3932 // seed - Seed for starting random number generator (parameter from user)
3933 // numIterations
3934 //
RunMonkey(BreakIterator * bi,RBBIMonkeyKind & mk,const char * name,uint32_t seed,int32_t numIterations,UBool useUText)3935 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
3936 int32_t numIterations, UBool useUText) {
3937
3938 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3939
3940 const int32_t TESTSTRINGLEN = 500;
3941 UnicodeString testText;
3942 int32_t numCharClasses;
3943 UVector *chClasses;
3944 int expectedCount = 0;
3945 char expectedBreaks[TESTSTRINGLEN*2 + 1];
3946 char forwardBreaks[TESTSTRINGLEN*2 + 1];
3947 char reverseBreaks[TESTSTRINGLEN*2+1];
3948 char isBoundaryBreaks[TESTSTRINGLEN*2+1];
3949 char followingBreaks[TESTSTRINGLEN*2+1];
3950 char precedingBreaks[TESTSTRINGLEN*2+1];
3951 int i;
3952 int loopCount = 0;
3953
3954
3955 m_seed = seed;
3956
3957 numCharClasses = mk.charClasses()->size();
3958 chClasses = mk.charClasses();
3959
3960 // Check for errors that occurred during the construction of the MonkeyKind object.
3961 // Can't report them where they occurred because errln() is a method coming from intlTest,
3962 // and is not visible outside of RBBITest :-(
3963 if (U_FAILURE(mk.deferredStatus)) {
3964 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
3965 return;
3966 }
3967
3968 // Verify that the character classes all have at least one member.
3969 for (i=0; i<numCharClasses; i++) {
3970 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
3971 if (s == NULL || s->size() == 0) {
3972 errln("Character Class #%d is null or of zero size.", i);
3973 return;
3974 }
3975 }
3976
3977 // For minimizing width of class name output.
3978 int classNameSize = mk.maxClassNameSize();
3979
3980 while (loopCount < numIterations || numIterations == -1) {
3981 if (numIterations == -1 && loopCount % 10 == 0) {
3982 // If test is running in an infinite loop, display a periodic tic so
3983 // we can tell that it is making progress.
3984 fprintf(stderr, ".");
3985 }
3986 // Save current random number seed, so that we can recreate the random numbers
3987 // for this loop iteration in event of an error.
3988 seed = m_seed;
3989
3990 // Populate a test string with data.
3991 testText.truncate(0);
3992 for (i=0; i<TESTSTRINGLEN; i++) {
3993 int32_t aClassNum = m_rand() % numCharClasses;
3994 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
3995 int32_t charIdx = m_rand() % classSet->size();
3996 UChar32 c = classSet->charAt(charIdx);
3997 if (c < 0) { // TODO: deal with sets containing strings.
3998 errln("%s:%d c < 0", __FILE__, __LINE__);
3999 break;
4000 }
4001 // Do not assemble a supplementary character from randomly generated separate surrogates.
4002 // (It could be a dictionary character)
4003 if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4004 continue;
4005 }
4006
4007 testText.append(c);
4008 }
4009
4010 // Calculate the expected results for this test string and reset applied rules.
4011 mk.setText(testText);
4012
4013 memset(expectedBreaks, 0, sizeof(expectedBreaks));
4014 expectedBreaks[0] = 1;
4015 int32_t breakPos = 0;
4016 expectedCount = 0;
4017 for (;;) {
4018 breakPos = mk.next(breakPos);
4019 if (breakPos == -1) {
4020 break;
4021 }
4022 if (breakPos > testText.length()) {
4023 errln("breakPos > testText.length()");
4024 }
4025 expectedBreaks[breakPos] = 1;
4026 expectedCount++;
4027 U_ASSERT(expectedCount<testText.length());
4028 }
4029
4030 // Find the break positions using forward iteration
4031 memset(forwardBreaks, 0, sizeof(forwardBreaks));
4032 if (useUText) {
4033 UErrorCode status = U_ZERO_ERROR;
4034 UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4035 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4036 bi->setText(testUText, status);
4037 TEST_ASSERT_SUCCESS(status);
4038 utext_close(testUText); // The break iterator does a shallow clone of the UText
4039 // This UText can be closed immediately, so long as the
4040 // testText string continues to exist.
4041 } else {
4042 bi->setText(testText);
4043 }
4044
4045 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4046 if (i < 0 || i > testText.length()) {
4047 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4048 break;
4049 }
4050 forwardBreaks[i] = 1;
4051 }
4052
4053 // Find the break positions using reverse iteration
4054 memset(reverseBreaks, 0, sizeof(reverseBreaks));
4055 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4056 if (i < 0 || i > testText.length()) {
4057 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4058 break;
4059 }
4060 reverseBreaks[i] = 1;
4061 }
4062
4063 // Find the break positions using isBoundary() tests.
4064 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4065 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4066 for (i=0; i<=testText.length(); i++) {
4067 isBoundaryBreaks[i] = bi->isBoundary(i);
4068 }
4069
4070
4071 // Find the break positions using the following() function.
4072 // printf(".");
4073 memset(followingBreaks, 0, sizeof(followingBreaks));
4074 int32_t lastBreakPos = 0;
4075 followingBreaks[0] = 1;
4076 for (i=0; i<testText.length(); i++) {
4077 breakPos = bi->following(i);
4078 if (breakPos <= i ||
4079 breakPos < lastBreakPos ||
4080 breakPos > testText.length() ||
4081 (breakPos > lastBreakPos && lastBreakPos > i)) {
4082 errln("%s break monkey test: "
4083 "Out of range value returned by BreakIterator::following().\n"
4084 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4085 name, seed, i, breakPos, lastBreakPos);
4086 break;
4087 }
4088 followingBreaks[breakPos] = 1;
4089 lastBreakPos = breakPos;
4090 }
4091
4092 // Find the break positions using the preceding() function.
4093 memset(precedingBreaks, 0, sizeof(precedingBreaks));
4094 lastBreakPos = testText.length();
4095 precedingBreaks[testText.length()] = 1;
4096 for (i=testText.length(); i>0; i--) {
4097 breakPos = bi->preceding(i);
4098 if (breakPos >= i ||
4099 breakPos > lastBreakPos ||
4100 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4101 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4102 errln("%s break monkey test: "
4103 "Out of range value returned by BreakIterator::preceding().\n"
4104 "index=%d; prev returned %d; lastBreak=%d" ,
4105 name, i, breakPos, lastBreakPos);
4106 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4107 precedingBreaks[i] = 2; // Forces an error.
4108 }
4109 } else {
4110 if (breakPos >= 0) {
4111 precedingBreaks[breakPos] = 1;
4112 }
4113 lastBreakPos = breakPos;
4114 }
4115 }
4116
4117 // Compare the expected and actual results.
4118 for (i=0; i<=testText.length(); i++) {
4119 const char *errorType = NULL;
4120 const char* currentBreakData = NULL;
4121 if (forwardBreaks[i] != expectedBreaks[i]) {
4122 errorType = "next()";
4123 currentBreakData = forwardBreaks;
4124 } else if (reverseBreaks[i] != forwardBreaks[i]) {
4125 errorType = "previous()";
4126 currentBreakData = reverseBreaks;
4127 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4128 errorType = "isBoundary()";
4129 currentBreakData = isBoundaryBreaks;
4130 } else if (followingBreaks[i] != expectedBreaks[i]) {
4131 errorType = "following()";
4132 currentBreakData = followingBreaks;
4133 } else if (precedingBreaks[i] != expectedBreaks[i]) {
4134 errorType = "preceding()";
4135 currentBreakData = precedingBreaks;
4136 }
4137
4138 if (errorType != NULL) {
4139 // Format a range of the test text that includes the failure as
4140 // a data item that can be included in the rbbi test data file.
4141
4142 // Start of the range is the last point where expected and actual results
4143 // both agreed that there was a break position.
4144
4145 int startContext = i;
4146 int32_t count = 0;
4147 for (;;) {
4148 if (startContext==0) { break; }
4149 startContext --;
4150 if (expectedBreaks[startContext] != 0) {
4151 if (count == 2) break;
4152 count ++;
4153 }
4154 }
4155
4156 // End of range is two expected breaks past the start position.
4157 int endContext = i + 1;
4158 int ci;
4159 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
4160 for (;;) {
4161 if (endContext >= testText.length()) {break;}
4162 if (expectedBreaks[endContext-1] != 0) {
4163 if (count == 0) break;
4164 count --;
4165 }
4166 endContext ++;
4167 }
4168 }
4169
4170 // Formatting of each line includes:
4171 // character code
4172 // reference break: '|' -> a break, '.' -> no break
4173 // actual break: '|' -> a break, '.' -> no break
4174 // (name of character clase)
4175 // Unicode name of character
4176 // '-->' indicates location of the difference.
4177
4178 MONKEY_ERROR(
4179 (expectedBreaks[i] ? "Break expected but not found" :
4180 "Break found but not expected"),
4181 name, i, seed);
4182
4183 for (ci=startContext; (ci = testText.moveIndex32(ci, 1));) {
4184 UChar32 c;
4185 c = testText.char32At(ci);
4186
4187 std::string currentLineFlag = " ";
4188 if (ci == i) {
4189 currentLineFlag = "-->"; // Error position
4190 }
4191
4192 // BMP or SMP character in hex
4193 char hexCodePoint[12];
4194 std::string format = " \\u%04x";
4195 if (c >= 0x10000) {
4196 format = "\\U%08x";
4197 }
4198 sprintf(hexCodePoint, format.c_str(), c);
4199
4200 // Get the class name and character name for the character.
4201 char cName[200];
4202 UErrorCode status = U_ZERO_ERROR;
4203 u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
4204
4205 char buffer[200];
4206 auto ret = snprintf(buffer, UPRV_LENGTHOF(buffer),
4207 "%4s %3i : %1s %1s %10s %-*s %-40s %-40s",
4208 currentLineFlag.c_str(),
4209 ci,
4210 expectedBreaks[ci] == 0 ? "." : "|", // Reference break
4211 currentBreakData[ci] == 0 ? "." : "|", // Actual break
4212 hexCodePoint,
4213 classNameSize,
4214 mk.classNameFromCodepoint(c).c_str(),
4215 mk.getAppliedRule(ci).c_str(), cName);
4216 (void)ret;
4217 U_ASSERT(0 <= ret && ret < UPRV_LENGTHOF(buffer));
4218
4219 // Output the error
4220 if (ci == i) {
4221 errln(buffer);
4222 } else {
4223 infoln(buffer);
4224 }
4225
4226 if (ci >= endContext) { break; }
4227 }
4228 break;
4229 }
4230 }
4231
4232 loopCount++;
4233 }
4234 #endif
4235 }
4236
4237
4238 // Bug 5532. UTF-8 based UText fails in dictionary code.
4239 // This test checks the initial patch,
4240 // which is to just keep it from crashing. Correct word boundaries
4241 // await a proper fix to the dictionary code.
4242 //
TestBug5532(void)4243 void RBBITest::TestBug5532(void) {
4244 // Text includes a mixture of Thai and Latin.
4245 const unsigned char utf8Data[] = {
4246 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4247 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4248 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4249 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4250 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4251 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4252 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4253 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4254 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4255 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4256 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4257
4258 UErrorCode status = U_ZERO_ERROR;
4259 UText utext=UTEXT_INITIALIZER;
4260 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4261 TEST_ASSERT_SUCCESS(status);
4262
4263 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4264 TEST_ASSERT_SUCCESS(status);
4265 if (U_SUCCESS(status)) {
4266 bi->setText(&utext, status);
4267 TEST_ASSERT_SUCCESS(status);
4268
4269 int32_t breakCount = 0;
4270 int32_t previousBreak = -1;
4271 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4272 // For now, just make sure that the break iterator doesn't hang.
4273 TEST_ASSERT(previousBreak < bi->current());
4274 previousBreak = bi->current();
4275 }
4276 TEST_ASSERT(breakCount > 0);
4277 }
4278 delete bi;
4279 utext_close(&utext);
4280 }
4281
4282
TestBug9983(void)4283 void RBBITest::TestBug9983(void) {
4284 UnicodeString text = UnicodeString("\\u002A" // * Other
4285 "\\uFF65" // Other
4286 "\\u309C" // Katakana
4287 "\\uFF9F" // Extend
4288 "\\uFF65" // Other
4289 "\\u0020" // Other
4290 "\\u0000").unescape();
4291
4292 UErrorCode status = U_ZERO_ERROR;
4293 LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4294 BreakIterator::createWordInstance(Locale::getRoot(), status)));
4295 TEST_ASSERT_SUCCESS(status);
4296 LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4297 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4298 TEST_ASSERT_SUCCESS(status);
4299 if (U_FAILURE(status)) {
4300 return;
4301 }
4302 int32_t offset, rstatus, iterationCount;
4303
4304 brkiter->setText(text);
4305 brkiter->last();
4306 iterationCount = 0;
4307 while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4308 iterationCount++;
4309 rstatus = brkiter->getRuleStatus();
4310 (void)rstatus; // Suppress set but not used warning.
4311 if (iterationCount >= 10) {
4312 break;
4313 }
4314 }
4315 TEST_ASSERT(iterationCount == 6);
4316
4317 brkiterPOSIX->setText(text);
4318 brkiterPOSIX->last();
4319 iterationCount = 0;
4320 while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4321 iterationCount++;
4322 rstatus = brkiterPOSIX->getRuleStatus();
4323 (void)rstatus; // Suppress set but not used warning.
4324 if (iterationCount >= 10) {
4325 break;
4326 }
4327 }
4328 TEST_ASSERT(iterationCount == 6);
4329 }
4330
4331 // Bug 7547 - verify that building a break itereator from empty rules produces an error.
4332 //
TestBug7547()4333 void RBBITest::TestBug7547() {
4334 UnicodeString rules;
4335 UErrorCode status = U_ZERO_ERROR;
4336 UParseError parseError;
4337 RuleBasedBreakIterator breakIterator(rules, parseError, status);
4338 if (status != U_BRK_RULE_SYNTAX) {
4339 errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4340 }
4341 if (parseError.line != 1 || parseError.offset != 0) {
4342 errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4343 }
4344 }
4345
4346
TestBug12797()4347 void RBBITest::TestBug12797() {
4348 UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4349 UErrorCode status = U_ZERO_ERROR;
4350 UParseError parseError;
4351 RuleBasedBreakIterator bi(rules, parseError, status);
4352 if (U_FAILURE(status)) {
4353 errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4354 return;
4355 }
4356 UnicodeString text = "abc";
4357 bi.setText(text);
4358 bi.first();
4359 int32_t boundary = bi.next();
4360 if (boundary != 3) {
4361 errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4362 }
4363 }
4364
TestBug12918()4365 void RBBITest::TestBug12918() {
4366 // This test triggers an assertion failure in dictbe.cpp
4367 const UChar *crasherString = u"\u3325\u4a16";
4368 UErrorCode status = U_ZERO_ERROR;
4369 UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
4370 if (U_FAILURE(status)) {
4371 dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4372 return;
4373 }
4374 ubrk_first(iter);
4375 int32_t pos = 0;
4376 int32_t lastPos = -1;
4377 while((pos = ubrk_next(iter)) != UBRK_DONE) {
4378 if (pos <= lastPos) {
4379 errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4380 break;
4381 }
4382 }
4383 ubrk_close(iter);
4384 }
4385
TestBug12932()4386 void RBBITest::TestBug12932() {
4387 // Node Stack overflow in the RBBI rule parser caused a seg fault.
4388 UnicodeString ruleStr(
4389 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4390 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4391 "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4392 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4393 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4394 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4395
4396 UErrorCode status = U_ZERO_ERROR;
4397 UParseError parseError;
4398 RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
4399 if (status != U_BRK_RULE_SYNTAX) {
4400 errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4401 __FILE__, __LINE__, u_errorName(status));
4402 }
4403 }
4404
4405
// Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
//             remain undivided by ICU char, word and line break.
TestEmoji()4408 void RBBITest::TestEmoji() {
4409 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4410 UErrorCode status = U_ZERO_ERROR;
4411
4412 CharString testFileName;
4413 testFileName.append(IntlTest::getSourceTestData(status), status);
4414 testFileName.appendPathPart("emoji-test.txt", status);
4415 if (U_FAILURE(status)) {
4416 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4417 return;
4418 }
4419 logln("Opening data file %s\n", testFileName.data());
4420
4421 int len;
4422 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
4423 if (U_FAILURE(status) || testFile == NULL) {
4424 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4425 return;
4426 }
4427 UnicodeString testFileAsString(testFile, len);
4428 delete [] testFile;
4429
4430 RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
4431 RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
4432 // hexMatcher group(1) is a hex number, or empty string if no hex number present.
4433 int32_t lineNumber = 0;
4434
4435 LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4436 LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4437 LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
4438 if (U_FAILURE(status)) {
4439 dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
4440 return;
4441 }
4442
4443 while (lineMatcher.find()) {
4444 ++lineNumber;
4445 UnicodeString line = lineMatcher.group(status);
4446 hexMatcher.reset(line);
4447 UnicodeString testString; // accumulates the emoji sequence.
4448 while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
4449 UnicodeString hex = hexMatcher.group(1, status);
4450 if (hex.length() > 8) {
4451 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
4452 break;
4453 }
4454 CharString hex8;
4455 hex8.appendInvariantChars(hex, status);
4456 UChar32 c = (UChar32)strtol(hex8.data(), NULL, 16);
4457 if (c<=0x10ffff) {
4458 testString.append(c);
4459 } else {
4460 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4461 __FILE__, __LINE__, lineNumber, hex8.data());
4462 break;
4463 }
4464 }
4465
4466 if (testString.length() > 1) {
4467 charBreaks->setText(testString);
4468 charBreaks->first();
4469 int32_t firstBreak = charBreaks->next();
4470 if (testString.length() != firstBreak) {
4471 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4472 __FILE__, __LINE__, lineNumber, firstBreak);
4473 }
4474 wordBreaks->setText(testString);
4475 wordBreaks->first();
4476 firstBreak = wordBreaks->next();
4477 if (testString.length() != firstBreak) {
4478 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4479 __FILE__, __LINE__, lineNumber, firstBreak);
4480 }
4481 lineBreaks->setText(testString);
4482 lineBreaks->first();
4483 firstBreak = lineBreaks->next();
4484 if (testString.length() != firstBreak) {
4485 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4486 __FILE__, __LINE__, lineNumber, firstBreak);
4487 }
4488 }
4489 }
4490 #endif
4491 }
4492
4493
4494 // TestBug12519 - Correct handling of Locales by assignment / copy / clone
4495
TestBug12519()4496 void RBBITest::TestBug12519() {
4497 UErrorCode status = U_ZERO_ERROR;
4498 LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4499 LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status));
4500 if (!assertSuccess(WHERE, status)) {
4501 dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
4502 return;
4503 }
4504 assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
4505
4506 assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
4507 assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
4508
4509 LocalPointer<RuleBasedBreakIterator>cloneEn(biEn->clone());
4510 assertTrue(WHERE, *biEn == *cloneEn);
4511 assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
4512
4513 LocalPointer<RuleBasedBreakIterator>cloneFr(biFr->clone());
4514 assertTrue(WHERE, *biFr == *cloneFr);
4515 assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
4516
4517 LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status));
4518 UnicodeString text("Hallo Welt");
4519 biDe->setText(text);
4520 assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
4521 *biDe = *biFr;
4522 assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
4523 }
4524
TestBug12677()4525 void RBBITest::TestBug12677() {
4526 // Check that stripping of comments from rules for getRules() is not confused by
4527 // the presence of '#' characters in the rules that do not introduce comments.
4528 UnicodeString rules(u"!!forward; \n"
4529 "$x = [ab#]; # a set with a # literal. \n"
4530 " # .; # a comment that looks sort of like a rule. \n"
4531 " '#' '?'; # a rule with a quoted # \n"
4532 );
4533
4534 UErrorCode status = U_ZERO_ERROR;
4535 UParseError pe;
4536 RuleBasedBreakIterator bi(rules, pe, status);
4537 assertSuccess(WHERE, status);
4538 UnicodeString rtRules = bi.getRules();
4539 assertEquals(WHERE, UnicodeString(u"!!forward;$x=[ab#];'#''?';"), rtRules);
4540 }
4541
4542
TestTableRedundancies()4543 void RBBITest::TestTableRedundancies() {
4544 UErrorCode status = U_ZERO_ERROR;
4545
4546 LocalPointer<RuleBasedBreakIterator> bi (
4547 (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
4548 assertSuccess(WHERE, status);
4549 if (U_FAILURE(status)) return;
4550
4551 RBBIDataWrapper *dw = bi->fData;
4552 const RBBIStateTable *fwtbl = dw->fForwardTable;
4553 UBool in8Bits = fwtbl->fFlags & RBBI_8BITS_ROWS;
4554 int32_t numCharClasses = dw->fHeader->fCatCount;
4555 // printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
4556
4557 // Check for duplicate columns (character categories)
4558
4559 std::vector<UnicodeString> columns;
4560 for (int32_t column = 0; column < numCharClasses; column++) {
4561 UnicodeString s;
4562 for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
4563 RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4564 s.append(in8Bits ? row->r8.fNextState[column] : row->r16.fNextState[column]);
4565 }
4566 columns.push_back(s);
4567 }
4568 // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
4569 for (int c1=1; c1<numCharClasses; c1++) {
4570 int limit = c1 < (int)fwtbl->fDictCategoriesStart ? fwtbl->fDictCategoriesStart : numCharClasses;
4571 for (int c2 = c1+1; c2 < limit; c2++) {
4572 if (columns.at(c1) == columns.at(c2)) {
4573 errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
4574 goto out;
4575 }
4576 }
4577 }
4578 out:
4579
4580 // Check for duplicate states
4581 std::vector<UnicodeString> rows;
4582 for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
4583 UnicodeString s;
4584 RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4585 if (in8Bits) {
4586 s.append(row->r8.fAccepting);
4587 s.append(row->r8.fLookAhead);
4588 s.append(row->r8.fTagsIdx);
4589 for (int32_t column = 0; column < numCharClasses; column++) {
4590 s.append(row->r8.fNextState[column]);
4591 }
4592 } else {
4593 s.append(row->r16.fAccepting);
4594 s.append(row->r16.fLookAhead);
4595 s.append(row->r16.fTagsIdx);
4596 for (int32_t column = 0; column < numCharClasses; column++) {
4597 s.append(row->r16.fNextState[column]);
4598 }
4599 }
4600 rows.push_back(s);
4601 }
4602 for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
4603 for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
4604 if (rows.at(r1) == rows.at(r2)) {
4605 errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
4606 return;
4607 }
4608 }
4609 }
4610 }
4611
4612 // Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
4613 // even after next() has returned DONE.
4614
TestBug13447()4615 void RBBITest::TestBug13447() {
4616 UErrorCode status = U_ZERO_ERROR;
4617 LocalPointer<RuleBasedBreakIterator> bi(
4618 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4619 assertSuccess(WHERE, status);
4620 if (U_FAILURE(status)) return;
4621 UnicodeString data(u"1234");
4622 bi->setText(data);
4623 assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus());
4624 assertEquals(WHERE, 4, bi->next());
4625 assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4626 assertEquals(WHERE, UBRK_DONE, bi->next());
4627 assertEquals(WHERE, 4, bi->current());
4628 assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4629 }
4630
4631 // TestReverse exercises both the synthesized safe reverse rules and the logic
4632 // for filling the break iterator cache when starting from random positions
4633 // in the text.
4634 //
4635 // It's a monkey test, working on random data, with the expected data obtained
4636 // from forward iteration (no safe rules involved), comparing with results
4637 // when indexing into the interior of the string (safe rules needed).
4638
TestReverse()4639 void RBBITest::TestReverse() {
4640 UErrorCode status = U_ZERO_ERROR;
4641
4642 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4643 BreakIterator::createCharacterInstance(Locale::getEnglish(), status)));
4644 assertSuccess(WHERE, status, true);
4645 status = U_ZERO_ERROR;
4646 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4647 BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4648 assertSuccess(WHERE, status, true);
4649 status = U_ZERO_ERROR;
4650 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4651 BreakIterator::createLineInstance(Locale::getEnglish(), status)));
4652 assertSuccess(WHERE, status, true);
4653 status = U_ZERO_ERROR;
4654 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4655 BreakIterator::createSentenceInstance(Locale::getEnglish(), status)));
4656 assertSuccess(WHERE, status, true);
4657 }
4658
TestReverse(std::unique_ptr<RuleBasedBreakIterator> bi)4659 void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
4660 if (!bi) {
4661 return;
4662 }
4663
4664 // From the mapping trie in the break iterator's internal data, create a
4665 // vector of UnicodeStrings, one for each character category, containing
4666 // all of the code points that map to that category. Unicode planes 0 and 1 only,
4667 // to avoid an execess of unassigned code points.
4668
4669 RBBIDataWrapper *data = bi->fData;
4670 int32_t categoryCount = data->fHeader->fCatCount;
4671 UCPTrie *trie = data->fTrie;
4672 bool use8BitsTrie = ucptrie_getValueWidth(trie) == UCPTRIE_VALUE_BITS_8;
4673 uint32_t dictBit = use8BitsTrie ? 0x0080 : 0x4000;
4674
4675 std::vector<UnicodeString> strings(categoryCount, UnicodeString());
4676 for (int cp=0; cp<0x1fff0; ++cp) {
4677 int cat = ucptrie_get(trie, cp);
4678 cat &= ~dictBit; // And off the dictionary bit from the category.
4679 assertTrue(WHERE, cat < categoryCount && cat >= 0);
4680 if (cat < 0 || cat >= categoryCount) return;
4681 strings[cat].append(cp);
4682 }
4683
4684 icu_rand randomGen;
4685 const int testStringLength = 10000;
4686 UnicodeString testString;
4687
4688 for (int i=0; i<testStringLength; ++i) {
4689 int charClass = randomGen() % categoryCount;
4690 if (strings[charClass].length() > 0) {
4691 int cp = strings[charClass].char32At(randomGen() % strings[charClass].length());
4692 testString.append(cp);
4693 }
4694 }
4695
4696 typedef std::pair<UBool, int32_t> Result;
4697 std::vector<Result> expectedResults;
4698 bi->setText(testString);
4699 for (int i=0; i<testString.length(); ++i) {
4700 bool isboundary = bi->isBoundary(i);
4701 int ruleStatus = bi->getRuleStatus();
4702 expectedResults.push_back(std::make_pair(isboundary, ruleStatus));
4703 }
4704
4705 for (int i=testString.length()-1; i>=0; --i) {
4706 bi->setText(testString); // clears the internal break cache
4707 Result expected = expectedResults[i];
4708 assertEquals(WHERE, expected.first, bi->isBoundary(i));
4709 assertEquals(WHERE, expected.second, bi->getRuleStatus());
4710 }
4711 }
4712
4713
// Ticket 13692 - finding word boundaries in very large numbers or words could
//                be very time consuming. When the problem was present, this test
//                would run for more than fifteen minutes, which is to say, the failure was noticeable.
4717
TestBug13692()4718 void RBBITest::TestBug13692() {
4719 UErrorCode status = U_ZERO_ERROR;
4720 LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4721 BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4722 if (!assertSuccess(WHERE, status, true)) {
4723 return;
4724 }
4725 constexpr int32_t LENGTH = 1000000;
4726 UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH);
4727 for (int i=0; i<20; i+=2) {
4728 longNumber.setCharAt(i, u' ');
4729 }
4730 bi->setText(longNumber);
4731 assertFalse(WHERE, bi->isBoundary(LENGTH-5));
4732 assertSuccess(WHERE, status);
4733 }
4734
4735
TestProperties()4736 void RBBITest::TestProperties() {
4737 UErrorCode errorCode = U_ZERO_ERROR;
4738 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4739 if (!prependSet.isEmpty()) {
4740 errln(
4741 "[:GCB=Prepend:] is not empty any more. "
4742 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4743 "change this test to the opposite condition.");
4744 }
4745 }
4746
4747
4748 //
4749 // TestDebug - A place-holder test for debugging purposes.
4750 // For putting in fragments of other tests that can be invoked
4751 // for tracing without a lot of unwanted extra stuff happening.
4752 //
TestDebug(void)4753 void RBBITest::TestDebug(void) {
4754 UErrorCode status = U_ZERO_ERROR;
4755 LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4756 BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4757 if (!assertSuccess(WHERE, status, true)) {
4758 return;
4759 }
4760 const UnicodeString &rules = bi->getRules();
4761 UParseError pe;
4762 LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
4763 assertSuccess(WHERE, status);
4764 }
4765
4766
4767 //
4768 // TestDebugRules A stub test for use in debugging rule compilation problems.
4769 // Can be freely altered as needed or convenient.
//                  Leave disabled - #ifdef'ed out - when not actively debugging. The rule source
4771 // data files may not be available in all environments.
4772 // Any permanent test cases should be moved to rbbitst.txt
4773 // (see Bug 20303 in that file, for example), or to another test function in this file.
4774 //
TestDebugRules()4775 void RBBITest::TestDebugRules() {
4776 #if 0
4777 const char16_t *rules = u""
4778 "!!quoted_literals_only; \n"
4779 "!!chain; \n"
4780 "!!lookAheadHardBreak; \n"
4781 " \n"
4782 // "[a] / ; \n"
4783 "[a] [b] / [c] [d]; \n"
4784 "[a] [b] / [c] [d] {100}; \n"
4785 "[x] [a] [b] / [c] [d] {100}; \n"
4786 "[a] [b] [c] / [d] {100}; \n"
4787 //" [c] [d] / [e] [f]; \n"
4788 //"[a] [b] / [c]; \n"
4789 ;
4790
4791 UErrorCode status = U_ZERO_ERROR;
4792 CharString path(pathToDataDirectory(), status);
4793 path.appendPathPart("brkitr", status);
4794 path.appendPathPart("rules", status);
4795 path.appendPathPart("line.txt", status);
4796 int len;
4797 std::unique_ptr<UChar []> testFile(ReadAndConvertFile(path.data(), len, "UTF-8", status));
4798 if (!assertSuccess(WHERE, status)) {
4799 return;
4800 }
4801
4802 UParseError pe;
4803 // rules = testFile.get();
4804 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rules, pe, status);
4805
4806 if (!assertSuccess(WHERE, status)) {
4807 delete bi;
4808 return;
4809 }
4810 // bi->dumpTables();
4811
4812 delete bi;
4813 #endif
4814 }
4815
testTrieStateTable(int32_t numChar,bool expectedTrieWidthIn8Bits,bool expectedStateRowIn8Bits)4816 void RBBITest::testTrieStateTable(int32_t numChar, bool expectedTrieWidthIn8Bits, bool expectedStateRowIn8Bits) {
4817 UCPTrieValueWidth expectedTrieWidth = expectedTrieWidthIn8Bits ? UCPTRIE_VALUE_BITS_8 : UCPTRIE_VALUE_BITS_16;
4818 int32_t expectedStateRowBits = expectedStateRowIn8Bits ? RBBI_8BITS_ROWS : 0;
4819 // Text are duplicate characters from U+4E00 to U+4FFF
4820 UnicodeString text;
4821 for (UChar c = 0x4e00; c < 0x5000; c++) {
4822 text.append(c).append(c);
4823 }
4824 // Generate rule which will caused length+4 character classes and
4825 // length+3 states
4826 UnicodeString rules(u"!!quoted_literals_only;");
4827 for (UChar c = 0x4e00; c < 0x4e00 + numChar; c++) {
4828 rules.append(u'\'').append(c).append(c).append(u"';");
4829 }
4830 rules.append(u".;");
4831 UErrorCode status = U_ZERO_ERROR;
4832 UParseError parseError;
4833 RuleBasedBreakIterator bi(rules, parseError, status);
4834
4835 assertEquals(WHERE, numChar + 4, bi.fData->fHeader->fCatCount);
4836 assertEquals(WHERE, numChar + 3, bi.fData->fForwardTable->fNumStates);
4837 assertEquals(WHERE, expectedTrieWidth, ucptrie_getValueWidth(bi.fData->fTrie));
4838 assertEquals(WHERE, expectedStateRowBits, bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS);
4839 assertEquals(WHERE, expectedStateRowBits, bi.fData->fReverseTable->fFlags & RBBI_8BITS_ROWS);
4840
4841 bi.setText(text);
4842
4843 int32_t pos;
4844 int32_t i = 0;
4845 while ((pos = bi.next()) > 0) {
4846 // The first numChar should not break between the pair
4847 if (i++ < numChar) {
4848 assertEquals(WHERE, i * 2, pos);
4849 } else {
4850 // After the first numChar next(), break on each character.
4851 assertEquals(WHERE, i + numChar, pos);
4852 }
4853 }
4854 while ((pos = bi.previous()) > 0) {
4855 // The first numChar should not break between the pair
4856 if (--i < numChar) {
4857 assertEquals(WHERE, i * 2, pos);
4858 } else {
4859 // After the first numChar next(), break on each character.
4860 assertEquals(WHERE, i + numChar, pos);
4861 }
4862 }
4863 }
4864
Test8BitsTrieWith8BitStateTable()4865 void RBBITest::Test8BitsTrieWith8BitStateTable() {
4866 testTrieStateTable(251, true /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
4867 }
4868
Test16BitsTrieWith8BitStateTable()4869 void RBBITest::Test16BitsTrieWith8BitStateTable() {
4870 testTrieStateTable(252, false /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
4871 }
4872
Test16BitsTrieWith16BitStateTable()4873 void RBBITest::Test16BitsTrieWith16BitStateTable() {
4874 testTrieStateTable(253, false /* expectedTrieWidthIn8Bits */, false /* expectedStateRowIn8Bits */);
4875 }
4876
Test8BitsTrieWith16BitStateTable()4877 void RBBITest::Test8BitsTrieWith16BitStateTable() {
4878 // Test UCPTRIE_VALUE_BITS_8 with 16 bits rows. Use a different approach to
4879 // create state table in 16 bits.
4880
4881 // Generate 510 'a' as text
4882 UnicodeString text;
4883 for (int32_t i = 0; i < 510; i++) {
4884 text.append(u'a');
4885 }
4886
4887 UnicodeString rules(u"!!quoted_literals_only;'");
4888 // 254 'a' in the rule will cause 256 states
4889 for (int32_t i = 0; i < 254; i++) {
4890 rules.append(u'a');
4891 }
4892 rules.append(u"';.;");
4893
4894 UErrorCode status = U_ZERO_ERROR;
4895 UParseError parseError;
4896 LocalPointer<RuleBasedBreakIterator> bi(new RuleBasedBreakIterator(rules, parseError, status));
4897
4898 assertEquals(WHERE, 256, bi->fData->fForwardTable->fNumStates);
4899 assertEquals(WHERE, UCPTRIE_VALUE_BITS_8, ucptrie_getValueWidth(bi->fData->fTrie));
4900 assertEquals(WHERE,
4901 false, RBBI_8BITS_ROWS == (bi->fData->fForwardTable->fFlags & RBBI_8BITS_ROWS));
4902 bi->setText(text);
4903
4904 // break positions:
4905 // 254, 508, 509, ... 510
4906 assertEquals("next()", 254, bi->next());
4907 int32_t i = 0;
4908 int32_t pos;
4909 while ((pos = bi->next()) > 0) {
4910 assertEquals(WHERE, 508 + i , pos);
4911 i++;
4912 }
4913 i = 0;
4914 while ((pos = bi->previous()) > 0) {
4915 i++;
4916 if (pos >= 508) {
4917 assertEquals(WHERE, 510 - i , pos);
4918 } else {
4919 assertEquals(WHERE, 254 , pos);
4920 }
4921 }
4922 }
4923
4924 // Test that both compact (8 bit) and full sized (16 bit) rbbi tables work, and
4925 // that there are no problems with rules at the size that transitions between the two.
4926 //
4927 // A rule that matches a literal string, like 'abcdefghij', will require one state and
4928 // one character class per character in the string. So we can make a rule to tickle the
4929 // boundaries by using literal strings of various lengths.
4930 //
4931 // For both the number of states and the number of character classes, the eight bit format
4932 // only has 7 bits available, allowing for 128 values. For both, a few values are reserved,
4933 // leaving 120 something available. This test runs the string over the range of 120 - 130,
4934 // which allows some margin for changes to the number of values reserved by the rule builder
4935 // without breaking the test.
4936
TestTable_8_16_Bits()4937 void RBBITest::TestTable_8_16_Bits() {
4938
4939 // testStr serves as both the source of the rule string (truncated to the desired length)
4940 // and as test data to check matching behavior. A break rule consisting of the first 120
4941 // characters of testStr will match the first 120 chars of the full-length testStr.
4942 UnicodeString testStr;
4943 for (UChar c=0x3000; c<0x3200; ++c) {
4944 testStr.append(c);
4945 }
4946
4947 const int32_t startLength = 120; // The shortest rule string to test.
4948 const int32_t endLength = 260; // The longest rule string to test
4949 const int32_t increment = this->quick ? endLength - startLength : 1;
4950
4951 for (int32_t ruleLen=startLength; ruleLen <= endLength; ruleLen += increment) {
4952 UParseError parseError;
4953 UErrorCode status = U_ZERO_ERROR;
4954
4955 UnicodeString ruleString{u"!!quoted_literals_only; '#';"};
4956 ruleString.findAndReplace(UnicodeString(u"#"), UnicodeString(testStr, 0, ruleLen));
4957 RuleBasedBreakIterator bi(ruleString, parseError, status);
4958 if (!assertSuccess(WHERE, status)) {
4959 errln(ruleString);
4960 break;
4961 }
4962 // bi.dumpTables();
4963
4964 // Verify that the break iterator is functioning - that the first boundary found
4965 // in testStr is at the length of the rule string.
4966 bi.setText(testStr);
4967 assertEquals(WHERE, ruleLen, bi.next());
4968
4969 // Reverse iteration. Do a setText() first, to flush the break iterator's internal cache
4970 // of previously detected boundaries, thus forcing the engine to run the safe reverse rules.
4971 bi.setText(testStr);
4972 int32_t result = bi.preceding(ruleLen);
4973 assertEquals(WHERE, 0, result);
4974
4975 // Verify that the range of rule lengths being tested cover the translations
4976 // from 8 to 16 bit data.
4977 bool has8BitRowData = bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS;
4978 bool has8BitsTrie = ucptrie_getValueWidth(bi.fData->fTrie) == UCPTRIE_VALUE_BITS_8;
4979
4980 if (ruleLen == startLength) {
4981 assertEquals(WHERE, true, has8BitRowData);
4982 assertEquals(WHERE, true, has8BitsTrie);
4983 }
4984 if (ruleLen == endLength) {
4985 assertEquals(WHERE, false, has8BitRowData);
4986 assertEquals(WHERE, false, has8BitsTrie);
4987 }
4988 }
4989 }
4990
4991 /* Test handling of a large number of look-ahead rules.
4992 * The number of rules in the test exceeds the implementation limits prior to the
4993 * improvements introduced with #13590.
4994 *
4995 * The test look-ahead rules have the form "AB / CE"; "CD / EG"; ...
4996 * The text being matched is sequential, "ABCDEFGHI..."
4997 *
4998 * The upshot is that the look-ahead rules all match on their preceding context,
4999 * and consequently must save a potential result, but then fail to match on their
5000 * trailing context, so that they don't actually cause a boundary.
5001 *
5002 * Additionally, add a ".*" rule, so there are no boundaries unless a
5003 * look-ahead hard-break rule forces one.
5004 */
TestBug13590()5005 void RBBITest::TestBug13590() {
5006 UnicodeString rules {u"!!quoted_literals_only; !!chain; .*;\n"};
5007
5008 const int NUM_LOOKAHEAD_RULES = 50;
5009 const char16_t STARTING_CHAR = u'\u5000';
5010 char16_t firstChar;
5011 for (int ruleNum = 0; ruleNum < NUM_LOOKAHEAD_RULES; ++ruleNum) {
5012 firstChar = STARTING_CHAR + ruleNum*2;
5013 rules.append(u'\'') .append(firstChar) .append(firstChar+1) .append(u'\'')
5014 .append(u' ') .append(u'/') .append(u' ')
5015 .append(u'\'') .append(firstChar+2) .append(firstChar+4) .append(u'\'')
5016 .append(u';') .append(u'\n');
5017 }
5018
5019 // Change the last rule added from the form "UV / WY" to "UV / WX".
5020 // Changes the rule so that it will match - all 4 chars are in ascending sequence.
5021 rules.findAndReplace(UnicodeString(firstChar+4), UnicodeString(firstChar+3));
5022
5023 UErrorCode status = U_ZERO_ERROR;
5024 UParseError parseError;
5025 RuleBasedBreakIterator bi(rules, parseError, status);
5026 if (!assertSuccess(WHERE, status)) {
5027 errln(rules);
5028 return;
5029 }
5030 // bi.dumpTables();
5031
5032 UnicodeString testString;
5033 for (char16_t c = STARTING_CHAR-200; c < STARTING_CHAR + NUM_LOOKAHEAD_RULES*4; ++c) {
5034 testString.append(c);
5035 }
5036 bi.setText(testString);
5037
5038 int breaksFound = 0;
5039 while (bi.next() != UBRK_DONE) {
5040 ++breaksFound;
5041 }
5042
5043 // Two matches are expected, one from the last rule that was explicitly modified,
5044 // and one at the end of the text.
5045 assertEquals(WHERE, 2, breaksFound);
5046 }
5047
5048
5049 #if U_ENABLE_TRACING
// Records filled in by the trace callbacks below and cleared by SetupTestTrace().
// Only calls whose function number lies in the UTRACE_UBRK range are recorded.
static std::vector<std::string> gData;      // string argument of each recorded data callback
static std::vector<int32_t> gEntryFn;       // function number of each recorded entry callback
static std::vector<int32_t> gExitFn;        // function number of each recorded exit callback
static std::vector<int32_t> gDataFn;        // function number of each recorded data callback
5054
traceData(const void *,int32_t fnNumber,int32_t,const char *,va_list args)5055 static void U_CALLCONV traceData(
5056 const void*,
5057 int32_t fnNumber,
5058 int32_t,
5059 const char *,
5060 va_list args) {
5061 if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5062 const char* data = va_arg(args, const char*);
5063 gDataFn.push_back(fnNumber);
5064 gData.push_back(data);
5065 }
5066 }
5067
traceEntry(const void *,int32_t fnNumber)5068 static void traceEntry(const void *, int32_t fnNumber) {
5069 if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5070 gEntryFn.push_back(fnNumber);
5071 }
5072 }
5073
traceExit(const void *,int32_t fnNumber,const char *,va_list)5074 static void traceExit(const void *, int32_t fnNumber, const char *, va_list) {
5075 if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5076 gExitFn.push_back(fnNumber);
5077 }
5078 }
5079
5080
assertTestTraceResult(int32_t fnNumber,const char * expectedData)5081 void RBBITest::assertTestTraceResult(int32_t fnNumber, const char* expectedData) {
5082 assertEquals("utrace_entry should be called ", 1, gEntryFn.size());
5083 assertEquals("utrace_entry should be called with ", fnNumber, gEntryFn[0]);
5084 assertEquals("utrace_exit should be called ", 1, gExitFn.size());
5085 assertEquals("utrace_exit should be called with ", fnNumber, gExitFn[0]);
5086
5087 if (expectedData == nullptr) {
5088 assertEquals("utrace_data should not be called ", 0, gDataFn.size());
5089 assertEquals("utrace_data should not be called ", 0, gData.size());
5090 } else {
5091 assertEquals("utrace_data should be called ", 1, gDataFn.size());
5092 assertEquals("utrace_data should be called with ", fnNumber, gDataFn[0]);
5093 assertEquals("utrace_data should be called ", 1, gData.size());
5094 assertEquals("utrace_data should pass in ", expectedData, gData[0].c_str());
5095 }
5096 }
5097
SetupTestTrace()5098 void SetupTestTrace() {
5099 gEntryFn.clear();
5100 gExitFn.clear();
5101 gDataFn.clear();
5102 gData.clear();
5103
5104 const void* context = nullptr;
5105 utrace_setFunctions(context, traceEntry, traceExit, traceData);
5106 utrace_setLevel(UTRACE_INFO);
5107 }
5108
TestTraceCreateCharacter(void)5109 void RBBITest::TestTraceCreateCharacter(void) {
5110 SetupTestTrace();
5111 IcuTestErrorCode status(*this, "TestTraceCreateCharacter");
5112 LocalPointer<BreakIterator> brkitr(
5113 BreakIterator::createCharacterInstance("zh-CN", status));
5114 status.errIfFailureAndReset();
5115 assertTestTraceResult(UTRACE_UBRK_CREATE_CHARACTER, nullptr);
5116 }
5117
TestTraceCreateTitle(void)5118 void RBBITest::TestTraceCreateTitle(void) {
5119 SetupTestTrace();
5120 IcuTestErrorCode status(*this, "TestTraceCreateTitle");
5121 LocalPointer<BreakIterator> brkitr(
5122 BreakIterator::createTitleInstance("zh-CN", status));
5123 status.errIfFailureAndReset();
5124 assertTestTraceResult(UTRACE_UBRK_CREATE_TITLE, nullptr);
5125 }
5126
TestTraceCreateSentence(void)5127 void RBBITest::TestTraceCreateSentence(void) {
5128 SetupTestTrace();
5129 IcuTestErrorCode status(*this, "TestTraceCreateSentence");
5130 LocalPointer<BreakIterator> brkitr(
5131 BreakIterator::createSentenceInstance("zh-CN", status));
5132 status.errIfFailureAndReset();
5133 assertTestTraceResult(UTRACE_UBRK_CREATE_SENTENCE, nullptr);
5134 }
5135
TestTraceCreateWord(void)5136 void RBBITest::TestTraceCreateWord(void) {
5137 SetupTestTrace();
5138 IcuTestErrorCode status(*this, "TestTraceCreateWord");
5139 LocalPointer<BreakIterator> brkitr(
5140 BreakIterator::createWordInstance("zh-CN", status));
5141 status.errIfFailureAndReset();
5142 assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5143 }
5144
TestTraceCreateLine(void)5145 void RBBITest::TestTraceCreateLine(void) {
5146 SetupTestTrace();
5147 IcuTestErrorCode status(*this, "TestTraceCreateLine");
5148 LocalPointer<BreakIterator> brkitr(
5149 BreakIterator::createLineInstance("zh-CN", status));
5150 status.errIfFailureAndReset();
5151 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line");
5152 }
5153
TestTraceCreateLineStrict(void)5154 void RBBITest::TestTraceCreateLineStrict(void) {
5155 SetupTestTrace();
5156 IcuTestErrorCode status(*this, "TestTraceCreateLineStrict");
5157 LocalPointer<BreakIterator> brkitr(
5158 BreakIterator::createLineInstance("zh-CN-u-lb-strict", status));
5159 status.errIfFailureAndReset();
5160 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_strict");
5161 }
5162
TestTraceCreateLineNormal(void)5163 void RBBITest::TestTraceCreateLineNormal(void) {
5164 SetupTestTrace();
5165 IcuTestErrorCode status(*this, "TestTraceCreateLineNormal");
5166 LocalPointer<BreakIterator> brkitr(
5167 BreakIterator::createLineInstance("zh-CN-u-lb-normal", status));
5168 status.errIfFailureAndReset();
5169 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_normal");
5170 }
5171
TestTraceCreateLineLoose(void)5172 void RBBITest::TestTraceCreateLineLoose(void) {
5173 SetupTestTrace();
5174 IcuTestErrorCode status(*this, "TestTraceCreateLineLoose");
5175 LocalPointer<BreakIterator> brkitr(
5176 BreakIterator::createLineInstance("zh-CN-u-lb-loose", status));
5177 status.errIfFailureAndReset();
5178 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_loose");
5179 }
5180
TestTraceCreateLineLoosePhrase(void)5181 void RBBITest::TestTraceCreateLineLoosePhrase(void) {
5182 SetupTestTrace();
5183 IcuTestErrorCode status(*this, "TestTraceCreateLineLoosePhrase");
5184 LocalPointer<BreakIterator> brkitr(
5185 BreakIterator::createLineInstance("ja-u-lb-loose-lw-phrase", status));
5186 status.errIfFailureAndReset();
5187 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_loose_phrase");
5188 }
5189
TestTraceCreateLineNormalPhrase(void)5190 void RBBITest::TestTraceCreateLineNormalPhrase(void) {
5191 SetupTestTrace();
5192 IcuTestErrorCode status(*this, "TestTraceCreateLineNormalPhrase");
5193 LocalPointer<BreakIterator> brkitr(
5194 BreakIterator::createLineInstance("ja-u-lb-normal-lw-phrase", status));
5195 status.errIfFailureAndReset();
5196 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_normal_phrase");
5197 }
5198
TestTraceCreateLineStrictPhrase(void)5199 void RBBITest::TestTraceCreateLineStrictPhrase(void) {
5200 SetupTestTrace();
5201 IcuTestErrorCode status(*this, "TestTraceCreateLineStrictPhrase");
5202 LocalPointer<BreakIterator> brkitr(
5203 BreakIterator::createLineInstance("ja-u-lb-strict-lw-phrase", status));
5204 status.errIfFailureAndReset();
5205 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_strict_phrase");
5206 }
5207
TestTraceCreateLinePhrase(void)5208 void RBBITest::TestTraceCreateLinePhrase(void) {
5209 SetupTestTrace();
5210 IcuTestErrorCode status(*this, "TestTraceCreateLinePhrase");
5211 LocalPointer<BreakIterator> brkitr(
5212 BreakIterator::createLineInstance("ja-u-lw-phrase", status));
5213 status.errIfFailureAndReset();
5214 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_phrase");
5215 }
5216
TestTraceCreateBreakEngine(void)5217 void RBBITest::TestTraceCreateBreakEngine(void) {
5218 rbbi_cleanup();
5219 SetupTestTrace();
5220 IcuTestErrorCode status(*this, "TestTraceCreateBreakEngine");
5221 LocalPointer<BreakIterator> brkitr(
5222 BreakIterator::createWordInstance("zh-CN", status));
5223 status.errIfFailureAndReset();
5224 assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5225
5226 // To word break the following text, BreakIterator will create 5 dictionary
5227 // break engine internally.
5228 brkitr->setText(
5229 u"test "
5230 u"測試 " // Hani
5231 u"សាកល្បង " // Khmr
5232 u"ທົດສອບ " // Laoo
5233 u"စမ်းသပ်မှု " // Mymr
5234 u"ทดสอบ " // Thai
5235 u"test "
5236 );
5237
5238 // Loop through all the text.
5239 while (brkitr->next() > 0) ;
5240
5241 assertEquals("utrace_entry should be called ", 6, gEntryFn.size());
5242 assertEquals("utrace_exit should be called ", 6, gExitFn.size());
5243 assertEquals("utrace_data should be called ", 5, gDataFn.size());
5244
5245 for (std::vector<int>::size_type i = 0; i < gDataFn.size(); i++) {
5246 assertEquals("utrace_entry should be called ",
5247 UTRACE_UBRK_CREATE_BREAK_ENGINE, gEntryFn[i+1]);
5248 assertEquals("utrace_exit should be called ",
5249 UTRACE_UBRK_CREATE_BREAK_ENGINE, gExitFn[i+1]);
5250 assertEquals("utrace_data should be called ",
5251 UTRACE_UBRK_CREATE_BREAK_ENGINE, gDataFn[i]);
5252 }
5253
5254 assertEquals("utrace_data should pass ", "Hani", gData[0].c_str());
5255 assertEquals("utrace_data should pass ", "Khmr", gData[1].c_str());
5256 assertEquals("utrace_data should pass ", "Laoo", gData[2].c_str());
5257 assertEquals("utrace_data should pass ", "Mymr", gData[3].c_str());
5258 assertEquals("utrace_data should pass ", "Thai", gData[4].c_str());
5259
5260 }
5261 #endif
5262
TestUnpairedSurrogate()5263 void RBBITest::TestUnpairedSurrogate() {
5264 UnicodeString rules(u"ab;");
5265
5266 UErrorCode status = U_ZERO_ERROR;
5267 UParseError pe;
5268 RuleBasedBreakIterator bi1(rules, pe, status);
5269 assertSuccess(WHERE, status);
5270 UnicodeString rtRules = bi1.getRules();
5271 // make sure the simple one work first.
5272 assertEquals(WHERE, rules, rtRules);
5273
5274
5275 rules = UnicodeString(u"a\\ud800b;").unescape();
5276 pe.line = 0;
5277 pe.offset = 0;
5278 RuleBasedBreakIterator bi2(rules, pe, status);
5279 assertEquals(WHERE "unpaired lead surrogate", U_ILLEGAL_CHAR_FOUND , status);
5280 if (pe.line != 1 || pe.offset != 1) {
5281 errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
5282 }
5283
5284 status = U_ZERO_ERROR;
5285 rules = UnicodeString(u"a\\ude00b;").unescape();
5286 pe.line = 0;
5287 pe.offset = 0;
5288 RuleBasedBreakIterator bi3(rules, pe, status);
5289 assertEquals(WHERE "unpaired tail surrogate", U_ILLEGAL_CHAR_FOUND , status);
5290 if (pe.line != 1 || pe.offset != 1) {
5291 errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
5292 }
5293
5294 // make sure the surrogate one work too.
5295 status = U_ZERO_ERROR;
5296 rules = UnicodeString(u"ab;");
5297 RuleBasedBreakIterator bi4(rules, pe, status);
5298 rtRules = bi4.getRules();
5299 assertEquals(WHERE, rules, rtRules);
5300 }
5301
// Reads a test file generated by
// https://github.com/unicode-org/lstm_word_segmentation/blob/master/segment_text.py
// and compares the word breaks found for each Input with the expected Output.
5305 // Format of the file
5306 // Model:\t[Model Name (such as 'Thai_graphclust_model4_heavy')]
5307 // Embedding:\t[Embedding type (such as 'grapheme_clusters_tf')]
5308 // Input:\t[source text]
5309 // Output:\t[expected output separated by | ]
5310 // Input: ...
5311 // Output: ...
5312
runLSTMTestFromFile(const char * filename,UScriptCode script)5313 void RBBITest::runLSTMTestFromFile(const char* filename, UScriptCode script) {
5314 // The expectation in this test depends on LSTM, skip the test if the
5315 // configuration is not build with LSTM data.
5316 if (skipLSTMTest()) {
5317 return;
5318 }
5319 UErrorCode status = U_ZERO_ERROR;
5320 LocalPointer<BreakIterator> iterator(BreakIterator::createWordInstance(Locale(), status));
5321 if (U_FAILURE(status)) {
5322 errln("%s:%d Error %s Cannot create Word BreakIterator", __FILE__, __LINE__, u_errorName(status));
5323 return;
5324 }
5325 // Open and read the test data file.
5326 const char *testDataDirectory = IntlTest::getSourceTestData(status);
5327 CharString testFileName(testDataDirectory, -1, status);
5328 testFileName.append(filename, -1, status);
5329
5330 int len;
5331 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
5332 if (U_FAILURE(status)) {
5333 errln("%s:%d Error %s opening test file %s", __FILE__, __LINE__, u_errorName(status), filename);
5334 return;
5335 }
5336
5337 // Put the test data into a UnicodeString
5338 UnicodeString testString(FALSE, testFile, len);
5339
5340 int32_t start = 0;
5341
5342 UnicodeString line;
5343 int32_t end;
5344 std::string actual_sep_str;
5345 int32_t caseNum = 0;
5346 // Iterate through all the lines in the test file.
5347 do {
5348 int32_t cr = testString.indexOf(u'\r', start);
5349 int32_t lf = testString.indexOf(u'\n', start);
5350 end = cr >= 0 ? (lf >= 0 ? std::min(cr, lf) : cr) : lf;
5351 line = testString.tempSubString(start, end < 0 ? INT32_MAX : end - start);
5352 if (line.length() > 0) {
5353 // Separate each line to key and value by TAB.
5354 int32_t tab = line.indexOf(u'\t');
5355 UnicodeString key = line.tempSubString(0, tab);
5356 const UnicodeString value = line.tempSubString(tab+1);
5357
5358 if (key == "Model:") {
5359 // Verify the expectation in the test file match the LSTM model
5360 // we are using now.
5361 const LSTMData* data = CreateLSTMDataForScript(script, status);
5362 if (U_FAILURE(status)) {
5363 dataerrln("%s:%d Error %s Cannot create LSTM data for script %s",
5364 __FILE__, __LINE__, u_errorName(status), uscript_getName(script));
5365 return;
5366 }
5367 UnicodeString name(LSTMDataName(data));
5368 DeleteLSTMData(data);
5369 if (value != name) {
5370 std::string utf8Name, utf8Value;
5371 dataerrln("%s:%d Error %s The LSTM data for script %s is %s instead of %s",
5372 __FILE__, __LINE__, u_errorName(status), uscript_getName(script),
5373 name.toUTF8String<std::string>(utf8Name).c_str(),
5374 value.toUTF8String<std::string>(utf8Value).c_str());
5375 return;
5376 }
5377 } else if (key == "Input:") {
5378 UnicodeString input("prefix ");
5379 input += value + " suffix";
5380 std::stringstream ss;
5381
5382 // Construct the UText which is expected by the the engine as
5383 // input from the UnicodeString.
5384 UText ut = UTEXT_INITIALIZER;
5385 utext_openConstUnicodeString(&ut, &input, &status);
5386 if (U_FAILURE(status)) {
5387 dataerrln("Could not utext_openConstUnicodeString for " + value + UnicodeString(u_errorName(status)));
5388 return;
5389 }
5390
5391 iterator->setText(&ut, status);
5392 if (U_FAILURE(status)) {
5393 errln("%s:%d Error %s Could not setText to BreakIterator", __FILE__, __LINE__, u_errorName(status));
5394 return;
5395 }
5396
5397 int32_t bp;
5398 for (bp = iterator->first(); bp != BreakIterator::DONE; bp = iterator->next()) {
5399 ss << bp;
5400 if (bp != input.length()) {
5401 ss << ", ";
5402 }
5403 }
5404
5405 utext_close(&ut);
5406 // Turn the break points into a string for easy comparison
5407 // output.
5408 actual_sep_str = "{" + ss.str() + "}";
5409 } else if (key == "Output:" && !actual_sep_str.empty()) {
5410 UnicodeString input("prefix| |");
5411 input += value + "| |suffix";
5412 std::string d;
5413 int32_t sep;
5414 int32_t start = 0;
5415 int32_t curr = 0;
5416 std::stringstream ss;
5417 // Include 0 as the break point.
5418 ss << "0, ";
5419 while ((sep = input.indexOf(u'|', start)) >= 0) {
5420 int32_t len = sep - start;
5421 if (len > 0) {
5422 if (curr > 0) {
5423 ss << ", ";
5424 }
5425 curr += len;
5426 ss << curr;
5427 }
5428 start = sep + 1;
5429 }
5430 // Include end of the string as break point.
5431 ss << ", " << curr + input.length() - start;
5432 // Turn the break points into a string for easy comparison
5433 // output.
5434 std::string expected = "{" + ss.str() + "}";
5435 std::string utf8;
5436
5437 assertEquals((input + " Test Case#" + caseNum).toUTF8String<std::string>(utf8).c_str(),
5438 expected.c_str(), actual_sep_str.c_str());
5439 actual_sep_str.clear();
5440 }
5441 }
5442 start = std::max(cr, lf) + 1;
5443 } while (end >= 0);
5444
5445 delete [] testFile;
5446 }
5447
TestLSTMThai()5448 void RBBITest::TestLSTMThai() {
5449 runLSTMTestFromFile("Thai_graphclust_model4_heavy_Test.txt", USCRIPT_THAI);
5450 }
5451
TestLSTMBurmese()5452 void RBBITest::TestLSTMBurmese() {
5453 runLSTMTestFromFile("Burmese_graphclust_model5_heavy_Test.txt", USCRIPT_MYANMAR);
5454 }
5455
5456 #endif // #if !UCONFIG_NO_BREAK_ITERATION
5457