1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 1999-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8 /************************************************************************
9 * Date Name Description
10 * 12/15/99 Madhu Creation.
11 * 01/12/2000 Madhu Updated for changed API and added new tests
12 ************************************************************************/
13
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_BREAK_ITERATION
16
17 #include <sstream>
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #include <utility>
22 #include <vector>
23
24 #include "unicode/brkiter.h"
25 #include "unicode/localpointer.h"
26 #include "unicode/numfmt.h"
27 #include "unicode/rbbi.h"
28 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
29 #include "unicode/regex.h"
30 #endif
31 #include "unicode/schriter.h"
32 #include "unicode/uchar.h"
33 #include "unicode/utf16.h"
34 #include "unicode/ucnv.h"
35 #include "unicode/uniset.h"
36 #include "unicode/uscript.h"
37 #include "unicode/ustring.h"
38 #include "unicode/utext.h"
39 #include "unicode/utrace.h"
40
41 #include "charstr.h"
42 #include "cmemory.h"
43 #include "cstr.h"
44 #include "intltest.h"
45 #include "rbbitst.h"
46 #include "rbbidata.h"
47 #include "utypeinfo.h" // for 'typeid' to work
48 #include "uvector.h"
49 #include "uvectr32.h"
50
51
52 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
53 #include "unicode/filteredbrk.h"
54 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
55
// Check a boolean test condition. When `condition` evaluates to false, a failure
// message carrying the source file and line is logged through errln().
// The macro only logs; it does not return from the enclosing function.
#define TEST_ASSERT(condition) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(condition)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
} UPRV_BLOCK_MACRO_END
61
// Check a UErrorCode. When U_FAILURE(ec) is true, a message with the source file,
// line and the error's name (u_errorName) is logged through errcheckln().
// The macro only logs; callers that must stop on failure test the status themselves.
#define TEST_ASSERT_SUCCESS(ec) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(ec)) { \
        errcheckln(ec, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(ec)); \
    } \
} UPRV_BLOCK_MACRO_END
67
// Log a monkey-test failure through the global IntlTest instance. The message
// includes the source location, the failure text, the text index, and the
// rule-file name and seed formatted as parameters for reproducing the run.
#define MONKEY_ERROR(message, ruleFileName, breakIndex, seedValue) { \
    IntlTest::gTest->errln("\n%s:%d %s at index %d. Parameters to reproduce: @\"type=%s seed=%u loop=1\"", \
                           __FILE__, __LINE__, message, breakIndex, ruleFileName, seedValue); \
}
72
73 //---------------------------------------------
74 // runIndexedTest
75 //---------------------------------------------
76
77
// Note: Before adding new tests to this file, check whether the desired test data can
// simply be added to the file testdata/rbbitst.txt. In most cases it can;
// that is much less work than writing a new test, the diagnostic output in the event
// of failures is good, and the test data file is shared with ICU4J, so eventually the
// test will run there as well, without additional effort.
83
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)84 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
85 {
86 if (exec) logln("TestSuite RuleBasedBreakIterator: ");
87 fTestParams = params;
88
89 TESTCASE_AUTO_BEGIN;
90 #if !UCONFIG_NO_FILE_IO
91 TESTCASE_AUTO(TestBug4153072);
92 #endif
93 #if !UCONFIG_NO_FILE_IO
94 TESTCASE_AUTO(TestUnicodeFiles);
95 #endif
96 TESTCASE_AUTO(TestGetAvailableLocales);
97 TESTCASE_AUTO(TestGetDisplayName);
98 #if !UCONFIG_NO_FILE_IO
99 TESTCASE_AUTO(TestEndBehaviour);
100 TESTCASE_AUTO(TestWordBreaks);
101 TESTCASE_AUTO(TestWordBoundary);
102 TESTCASE_AUTO(TestLineBreaks);
103 TESTCASE_AUTO(TestSentBreaks);
104 TESTCASE_AUTO(TestExtended);
105 #endif
106 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
107 TESTCASE_AUTO(TestMonkey);
108 #endif
109 #if !UCONFIG_NO_FILE_IO
110 TESTCASE_AUTO(TestBug3818);
111 #endif
112 TESTCASE_AUTO(TestDebug);
113 #if !UCONFIG_NO_FILE_IO
114 TESTCASE_AUTO(TestBug5775);
115 #endif
116 TESTCASE_AUTO(TestBug9983);
117 TESTCASE_AUTO(TestDictRules);
118 TESTCASE_AUTO(TestBug5532);
119 TESTCASE_AUTO(TestBug7547);
120 TESTCASE_AUTO(TestBug12797);
121 TESTCASE_AUTO(TestBug12918);
122 TESTCASE_AUTO(TestBug12932);
123 TESTCASE_AUTO(TestEmoji);
124 TESTCASE_AUTO(TestBug12519);
125 TESTCASE_AUTO(TestBug12677);
126 TESTCASE_AUTO(TestTableRedundancies);
127 TESTCASE_AUTO(TestBug13447);
128 TESTCASE_AUTO(TestReverse);
129 TESTCASE_AUTO(TestBug13692);
130 TESTCASE_AUTO(TestDebugRules);
131 TESTCASE_AUTO(Test8BitsTrieWith8BitStateTable);
132 TESTCASE_AUTO(Test8BitsTrieWith16BitStateTable);
133 TESTCASE_AUTO(Test16BitsTrieWith8BitStateTable);
134 TESTCASE_AUTO(Test16BitsTrieWith16BitStateTable);
135 TESTCASE_AUTO(TestTable_8_16_Bits);
136 TESTCASE_AUTO(TestBug13590);
137 TESTCASE_AUTO(TestUnpairedSurrogate);
138
139 #if U_ENABLE_TRACING
140 TESTCASE_AUTO(TestTraceCreateCharacter);
141 TESTCASE_AUTO(TestTraceCreateWord);
142 TESTCASE_AUTO(TestTraceCreateSentence);
143 TESTCASE_AUTO(TestTraceCreateTitle);
144 TESTCASE_AUTO(TestTraceCreateLine);
145 TESTCASE_AUTO(TestTraceCreateLineNormal);
146 TESTCASE_AUTO(TestTraceCreateLineLoose);
147 TESTCASE_AUTO(TestTraceCreateLineStrict);
148 TESTCASE_AUTO(TestTraceCreateBreakEngine);
149 #endif
150
151 TESTCASE_AUTO_END;
152 }
153
154
155 //--------------------------------------------------------------------------------------
156 //
157 // RBBITest constructor and destructor
158 //
159 //--------------------------------------------------------------------------------------
160
RBBITest()161 RBBITest::RBBITest() {
162 fTestParams = NULL;
163 }
164
165
~RBBITest()166 RBBITest::~RBBITest() {
167 }
168
169
printStringBreaks(UText * tstr,int expected[],int expectedCount)170 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
171 UErrorCode status = U_ZERO_ERROR;
172 char name[100];
173 printf("code alpha extend alphanum type word sent line name\n");
174 int nextExpectedIndex = 0;
175 utext_setNativeIndex(tstr, 0);
176 for (int j = 0; j < static_cast<int>(utext_nativeLength(tstr)); j=static_cast<int>(utext_getNativeIndex(tstr))) {
177 if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
178 printf("------------------------------------------------ %d\n", j);
179 ++nextExpectedIndex;
180 }
181
182 UChar32 c = utext_next32(tstr);
183 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
184 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
185 u_isUAlphabetic(c),
186 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
187 u_isalnum(c),
188 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
189 u_charType(c),
190 U_SHORT_PROPERTY_NAME),
191 u_getPropertyValueName(UCHAR_WORD_BREAK,
192 u_getIntPropertyValue(c,
193 UCHAR_WORD_BREAK),
194 U_SHORT_PROPERTY_NAME),
195 u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
196 u_getIntPropertyValue(c,
197 UCHAR_SENTENCE_BREAK),
198 U_SHORT_PROPERTY_NAME),
199 u_getPropertyValueName(UCHAR_LINE_BREAK,
200 u_getIntPropertyValue(c,
201 UCHAR_LINE_BREAK),
202 U_SHORT_PROPERTY_NAME),
203 name);
204 }
205 }
206
207
printStringBreaks(const UnicodeString & ustr,int expected[],int expectedCount)208 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
209 UErrorCode status = U_ZERO_ERROR;
210 UText *tstr = NULL;
211 tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
212 if (U_FAILURE(status)) {
213 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
214 return;
215 }
216 printStringBreaks(tstr, expected, expectedCount);
217 utext_close(tstr);
218 }
219
220
TestBug3818()221 void RBBITest::TestBug3818() {
222 UErrorCode status = U_ZERO_ERROR;
223
224 // Four Thai words...
225 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
226 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
227 UnicodeString thaiStr(thaiWordData);
228
229 BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
230 if (U_FAILURE(status) || bi == NULL) {
231 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
232 return;
233 }
234 bi->setText(thaiStr);
235
236 int32_t startOfSecondWord = bi->following(1);
237 if (startOfSecondWord != 4) {
238 errln("Fail at file %s, line %d expected start of word at 4, got %d",
239 __FILE__, __LINE__, startOfSecondWord);
240 }
241 startOfSecondWord = bi->following(0);
242 if (startOfSecondWord != 4) {
243 errln("Fail at file %s, line %d expected start of word at 4, got %d",
244 __FILE__, __LINE__, startOfSecondWord);
245 }
246 delete bi;
247 }
248
249
250 //---------------------------------------------
251 //
252 // other tests
253 //
254 //---------------------------------------------
255
TestGetAvailableLocales()256 void RBBITest::TestGetAvailableLocales()
257 {
258 int32_t locCount = 0;
259 const Locale* locList = BreakIterator::getAvailableLocales(locCount);
260
261 if (locCount == 0)
262 dataerrln("getAvailableLocales() returned an empty list!");
263 // Just make sure that it's returning good memory.
264 int32_t i;
265 for (i = 0; i < locCount; ++i) {
266 logln(locList[i].getName());
267 }
268 }
269
270 //Testing the BreakIterator::getDisplayName() function
TestGetDisplayName()271 void RBBITest::TestGetDisplayName()
272 {
273 UnicodeString result;
274
275 BreakIterator::getDisplayName(Locale::getUS(), result);
276 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
277 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
278 + result);
279
280 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
281 if (result != "French (France)")
282 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
283 + result);
284 }
285 /**
286 * Test End Behaviour
287 * @bug 4068137
288 */
TestEndBehaviour()289 void RBBITest::TestEndBehaviour()
290 {
291 UErrorCode status = U_ZERO_ERROR;
292 UnicodeString testString("boo.");
293 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
294 if (U_FAILURE(status))
295 {
296 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
297 return;
298 }
299 wb->setText(testString);
300
301 if (wb->first() != 0)
302 errln("Didn't get break at beginning of string.");
303 if (wb->next() != 3)
304 errln("Didn't get break before period in \"boo.\"");
305 if (wb->current() != 4 && wb->next() != 4)
306 errln("Didn't get break at end of string.");
307 delete wb;
308 }
309 /*
310 * @bug 4153072
311 */
TestBug4153072()312 void RBBITest::TestBug4153072() {
313 UErrorCode status = U_ZERO_ERROR;
314 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
315 if (U_FAILURE(status))
316 {
317 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
318 return;
319 }
320 UnicodeString str("...Hello, World!...");
321 int32_t begin = 3;
322 int32_t end = str.length() - 3;
323 UBool onBoundary;
324
325 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
326 iter->adoptText(textIterator);
327 int index;
328 // Note: with the switch to UText, there is no way to restrict the
329 // iteration range to begin at an index other than zero.
330 // String character iterators created with a non-zero bound are
331 // treated by RBBI as being empty.
332 for (index = -1; index < begin + 1; ++index) {
333 onBoundary = iter->isBoundary(index);
334 if (index == 0? !onBoundary : onBoundary) {
335 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
336 " and begin index = " + begin);
337 }
338 }
339 delete iter;
340 }
341
342
//
// Test for problem reported by Ashok Matoria on 9 July 2007
//    One.<kSoftHyphen><kSpace>Two.
//
// The sentence break iterator starts at 0, and the first call to next()
// breaks at the 'T' of "Two". If next() and then previous() are called from
// that point, previous() breaks at <kSoftHyphen> instead of at the 'T' of "Two".
//
TestBug5775()351 void RBBITest::TestBug5775() {
352 UErrorCode status = U_ZERO_ERROR;
353 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
354 TEST_ASSERT_SUCCESS(status);
355 if (U_FAILURE(status)) {
356 return;
357 }
358 // Check for status first for better handling of no data errors.
359 TEST_ASSERT(bi != NULL);
360 if (bi == NULL) {
361 return;
362 }
363
364 UnicodeString s("One.\\u00ad Two.", -1, US_INV);
365 // 01234 56789
366 s = s.unescape();
367 bi->setText(s);
368 int pos = bi->next();
369 TEST_ASSERT(pos == 6);
370 pos = bi->next();
371 TEST_ASSERT(pos == 10);
372 pos = bi->previous();
373 TEST_ASSERT(pos == 6);
374 delete bi;
375 }
376
377
378
379 //------------------------------------------------------------------------------
380 //
381 // RBBITest::Extended Run RBBI Tests from an external test data file
382 //
383 //------------------------------------------------------------------------------
384
385 struct TestParams {
386 BreakIterator *bi; // Break iterator is set while parsing test source.
387 // Changed out whenever test data changes break type.
388
389 UnicodeString dataToBreak; // Data that is built up while parsing the test.
390 UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString.
391 UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak.
392 UVector32 *srcCol;
393
394 UText *textToBreak; // UText, could be UTF8 or UTF16.
395 UVector32 *textMap; // Map from UTF-16 dataToBreak offsets to UText offsets.
396 CharString utf8String; // UTF-8 form of text to break.
397
TestParamsTestParams398 TestParams(UErrorCode &status) : dataToBreak() {
399 bi = NULL;
400 expectedBreaks = new UVector32(status);
401 srcLine = new UVector32(status);
402 srcCol = new UVector32(status);
403 textToBreak = NULL;
404 textMap = new UVector32(status);
405 }
406
~TestParamsTestParams407 ~TestParams() {
408 delete bi;
409 delete expectedBreaks;
410 delete srcLine;
411 delete srcCol;
412 utext_close(textToBreak);
413 delete textMap;
414 }
415
416 int32_t getSrcLine(int32_t bp);
417 int32_t getExpectedBreak(int32_t bp);
418 int32_t getSrcCol(int32_t bp);
419
420 void setUTF16(UErrorCode &status);
421 void setUTF8(UErrorCode &status);
422 };
423
424 // Append a UnicodeString to a CharString with UTF-8 encoding.
425 // Substitute any invalid chars.
426 // Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
CharStringAppend(CharString & dest,const UnicodeString & src,UErrorCode & status)427 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
428 if (U_FAILURE(status)) {
429 return;
430 }
431 int32_t utf8Length;
432 u_strToUTF8WithSub(NULL, 0, &utf8Length, // Output Buffer, NULL for preflight.
433 src.getBuffer(), src.length(), // UTF-16 data
434 0xfffd, NULL, // Substitution char, number of subs.
435 &status);
436 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
437 return;
438 }
439 status = U_ZERO_ERROR;
440 int32_t capacity;
441 char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
442 u_strToUTF8WithSub(buffer, utf8Length, NULL,
443 src.getBuffer(), src.length(),
444 0xfffd, NULL, &status);
445 dest.append(buffer, utf8Length, status);
446 }
447
448
setUTF16(UErrorCode & status)449 void TestParams::setUTF16(UErrorCode &status) {
450 textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
451 textMap->removeAllElements();
452 for (int32_t i=0; i<dataToBreak.length(); i++) {
453 if (i == dataToBreak.getChar32Start(i)) {
454 textMap->addElement(i, status);
455 } else {
456 textMap->addElement(-1, status);
457 }
458 }
459 textMap->addElement(dataToBreak.length(), status);
460 U_ASSERT(dataToBreak.length() + 1 == textMap->size());
461 }
462
463
setUTF8(UErrorCode & status)464 void TestParams::setUTF8(UErrorCode &status) {
465 if (U_FAILURE(status)) {
466 return;
467 }
468 utf8String.clear();
469 CharStringAppend(utf8String, dataToBreak, status);
470 textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
471 if (U_FAILURE(status)) {
472 return;
473 }
474
475 textMap->removeAllElements();
476 int32_t utf16Index = 0;
477 for (;;) {
478 textMap->addElement(utf16Index, status);
479 UChar32 c32 = utext_current32(textToBreak);
480 if (c32 < 0) {
481 break;
482 }
483 utf16Index += U16_LENGTH(c32);
484 utext_next32(textToBreak);
485 while (textMap->size() < utext_getNativeIndex(textToBreak)) {
486 textMap->addElement(-1, status);
487 }
488 }
489 U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
490 }
491
492
getSrcLine(int32_t bp)493 int32_t TestParams::getSrcLine(int32_t bp) {
494 if (bp >= textMap->size()) {
495 bp = textMap->size() - 1;
496 }
497 int32_t i = 0;
498 for(; bp >= 0 ; --bp) {
499 // Move to a character boundary if we are not on one already.
500 i = textMap->elementAti(bp);
501 if (i >= 0) {
502 break;
503 }
504 }
505 return srcLine->elementAti(i);
506 }
507
508
getExpectedBreak(int32_t bp)509 int32_t TestParams::getExpectedBreak(int32_t bp) {
510 if (bp >= textMap->size()) {
511 return 0;
512 }
513 int32_t i = textMap->elementAti(bp);
514 int32_t retVal = 0;
515 if (i >= 0) {
516 retVal = expectedBreaks->elementAti(i);
517 }
518 return retVal;
519 }
520
521
getSrcCol(int32_t bp)522 int32_t TestParams::getSrcCol(int32_t bp) {
523 if (bp >= textMap->size()) {
524 bp = textMap->size() - 1;
525 }
526 int32_t i = 0;
527 for(; bp >= 0; --bp) {
528 // Move bp to a character boundary if we are not on one already.
529 i = textMap->elementAti(bp);
530 if (i >= 0) {
531 break;
532 }
533 }
534 return srcCol->elementAti(i);
535 }
536
537
executeTest(TestParams * t,UErrorCode & status)538 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
539 int32_t bp;
540 int32_t prevBP;
541 int32_t i;
542
543 TEST_ASSERT_SUCCESS(status);
544 if (U_FAILURE(status)) {
545 return;
546 }
547
548 if (t->bi == NULL) {
549 return;
550 }
551
552 t->bi->setText(t->textToBreak, status);
553 //
554 // Run the iterator forward
555 //
556 prevBP = -1;
557 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
558 if (prevBP == bp) {
559 // Fail for lack of forward progress.
560 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
561 bp, t->getSrcLine(bp), t->getSrcCol(bp));
562 break;
563 }
564
565 // Check that there we didn't miss an expected break between the last one
566 // and this one.
567 for (i=prevBP+1; i<bp; i++) {
568 if (t->getExpectedBreak(i) != 0) {
569 int expected[] = {0, i};
570 printStringBreaks(t->dataToBreak, expected, 2);
571 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
572 i, t->getSrcLine(i), t->getSrcCol(i));
573 }
574 }
575
576 // Check that the break we did find was expected
577 if (t->getExpectedBreak(bp) == 0) {
578 int expected[] = {0, bp};
579 printStringBreaks(t->textToBreak, expected, 2);
580 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
581 bp, t->getSrcLine(bp), t->getSrcCol(bp));
582 } else {
583 // The break was expected.
584 // Check that the {nnn} tag value is correct.
585 int32_t expectedTagVal = t->getExpectedBreak(bp);
586 if (expectedTagVal == -1) {
587 expectedTagVal = 0;
588 }
589 int32_t line = t->getSrcLine(bp);
590 int32_t rs = t->bi->getRuleStatus();
591 if (rs != expectedTagVal) {
592 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
593 " Actual, Expected status = %4d, %4d",
594 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
595 }
596 }
597
598 prevBP = bp;
599 }
600
601 // Verify that there were no missed expected breaks after the last one found
602 for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
603 if (t->getExpectedBreak(i) != 0) {
604 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
605 i, t->getSrcLine(i), t->getSrcCol(i));
606 }
607 }
608
609 //
610 // Run the iterator backwards, verify that the same breaks are found.
611 //
612 prevBP = static_cast<int32_t>(utext_nativeLength(t->textToBreak) + 2); // start with a phony value for the last break pos seen.
613 bp = t->bi->last();
614 while (bp != BreakIterator::DONE) {
615 if (prevBP == bp) {
616 // Fail for lack of progress.
617 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
618 bp, t->getSrcLine(bp), t->getSrcCol(bp));
619 break;
620 }
621
622 // Check that we didn't miss an expected break between the last one
623 // and this one. (UVector returns zeros for index out of bounds.)
624 for (i=prevBP-1; i>bp; i--) {
625 if (t->getExpectedBreak(i) != 0) {
626 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
627 i, t->getSrcLine(i), t->getSrcCol(i));
628 }
629 }
630
631 // Check that the break we did find was expected
632 if (t->getExpectedBreak(bp) == 0) {
633 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
634 bp, t->getSrcLine(bp), t->getSrcCol(bp));
635 } else {
636 // The break was expected.
637 // Check that the {nnn} tag value is correct.
638 int32_t expectedTagVal = t->getExpectedBreak(bp);
639 if (expectedTagVal == -1) {
640 expectedTagVal = 0;
641 }
642 int line = t->getSrcLine(bp);
643 int32_t rs = t->bi->getRuleStatus();
644 if (rs != expectedTagVal) {
645 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
646 " Actual, Expected status = %4d, %4d",
647 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
648 }
649 }
650
651 prevBP = bp;
652 bp = t->bi->previous();
653 }
654
655 // Verify that there were no missed breaks prior to the last one found
656 for (i=prevBP-1; i>=0; i--) {
657 if (t->getExpectedBreak(i) != 0) {
658 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
659 i, t->getSrcLine(i), t->getSrcCol(i));
660 }
661 }
662
663 // Check isBoundary()
664 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
665 UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
666 UBool boundaryFound = t->bi->isBoundary(i);
667 if (boundaryExpected != boundaryFound) {
668 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
669 " Expected, Actual= %s, %s",
670 i, t->getSrcLine(i), t->getSrcCol(i),
671 boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
672 }
673 }
674
675 // Check following()
676 for (i=0; i < static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i++) {
677 int32_t actualBreak = t->bi->following(i);
678 int32_t expectedBreak = BreakIterator::DONE;
679 for (int32_t j=i+1; j <= static_cast<int32_t>(utext_nativeLength(t->textToBreak)); j++) {
680 if (t->getExpectedBreak(j) != 0) {
681 expectedBreak = j;
682 break;
683 }
684 }
685 if (expectedBreak != actualBreak) {
686 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
687 " Expected, Actual= %d, %d",
688 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
689 }
690 }
691
692 // Check preceding()
693 for (i=static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i>=0; i--) {
694 int32_t actualBreak = t->bi->preceding(i);
695 int32_t expectedBreak = BreakIterator::DONE;
696
697 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
698 // preceding(trailing byte) will return the index of some preceding code point,
699 // not the lead byte of the current code point, even though that has a smaller index.
700 // Therefore, start looking at the expected break data not at i-1, but at
701 // the start of code point index - 1.
702 utext_setNativeIndex(t->textToBreak, i);
703 int32_t j = static_cast<int32_t>(utext_getNativeIndex(t->textToBreak) - 1);
704 for (; j >= 0; j--) {
705 if (t->getExpectedBreak(j) != 0) {
706 expectedBreak = j;
707 break;
708 }
709 }
710 if (expectedBreak != actualBreak) {
711 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
712 " Expected, Actual= %d, %d",
713 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
714 }
715 }
716 }
717
718
TestExtended()719 void RBBITest::TestExtended() {
720 // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
721 // data driven test closely entangles filtered and regular data.
722 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
723 UErrorCode status = U_ZERO_ERROR;
724 Locale locale("");
725
726 TestParams tp(status);
727
728 RegexMatcher localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
729 if (U_FAILURE(status)) {
730 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
731 }
732
733 //
734 // Open and read the test data file.
735 //
736 const char *testDataDirectory = IntlTest::getSourceTestData(status);
737 CharString testFileName(testDataDirectory, -1, status);
738 testFileName.append("rbbitst.txt", -1, status);
739
740 int len;
741 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
742 if (U_FAILURE(status)) {
743 errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
744 return;
745 }
746
747 bool skipTest = false; // Skip this test?
748
749 //
750 // Put the test data into a UnicodeString
751 //
752 UnicodeString testString(FALSE, testFile, len);
753
754 enum EParseState{
755 PARSE_COMMENT,
756 PARSE_TAG,
757 PARSE_DATA,
758 PARSE_NUM,
759 PARSE_RULES
760 }
761 parseState = PARSE_TAG;
762
763 EParseState savedState = PARSE_TAG;
764
765 int32_t lineNum = 1;
766 int32_t colStart = 0;
767 int32_t column = 0;
768 int32_t charIdx = 0;
769
770 int32_t tagValue = 0; // The numeric value of a <nnn> tag.
771
772 UnicodeString rules; // Holds rules from a <rules> ... </rules> block
773 int32_t rulesFirstLine = 0; // Line number of the start of current <rules> block
774
775 for (charIdx = 0; charIdx < len; ) {
776 status = U_ZERO_ERROR;
777 UChar c = testString.charAt(charIdx);
778 charIdx++;
779 if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
780 // treat CRLF as a unit
781 c = u'\n';
782 charIdx++;
783 }
784 if (c == u'\n' || c == u'\r') {
785 lineNum++;
786 colStart = charIdx;
787 }
788 column = charIdx - colStart + 1;
789
790 switch (parseState) {
791 case PARSE_COMMENT:
792 if (c == u'\n' || c == u'\r') {
793 parseState = savedState;
794 }
795 break;
796
797 case PARSE_TAG:
798 {
799 if (c == u'#') {
800 parseState = PARSE_COMMENT;
801 savedState = PARSE_TAG;
802 break;
803 }
804 if (u_isUWhiteSpace(c)) {
805 break;
806 }
807 if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
808 delete tp.bi;
809 tp.bi = BreakIterator::createWordInstance(locale, status);
810 skipTest = false;
811 charIdx += 5;
812 break;
813 }
814 if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
815 delete tp.bi;
816 tp.bi = BreakIterator::createCharacterInstance(locale, status);
817 skipTest = false;
818 charIdx += 5;
819 break;
820 }
821 if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
822 delete tp.bi;
823 tp.bi = BreakIterator::createLineInstance(locale, status);
824 skipTest = false;
825 charIdx += 5;
826 break;
827 }
828 if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
829 delete tp.bi;
830 tp.bi = BreakIterator::createSentenceInstance(locale, status);
831 skipTest = false;
832 charIdx += 5;
833 break;
834 }
835 if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
836 delete tp.bi;
837 tp.bi = BreakIterator::createTitleInstance(locale, status);
838 charIdx += 6;
839 break;
840 }
841
842 if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
843 testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
844 charIdx = testString.indexOf(u'>', charIdx) + 1;
845 parseState = PARSE_RULES;
846 rules.remove();
847 rulesFirstLine = lineNum;
848 break;
849 }
850
851 // <locale loc_name>
852 localeMatcher.reset(testString);
853 if (localeMatcher.lookingAt(charIdx-1, status)) {
854 UnicodeString localeName = localeMatcher.group(1, status);
855 char localeName8[100];
856 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
857 locale = Locale::createFromName(localeName8);
858 charIdx += localeMatcher.group(0, status).length() - 1;
859 TEST_ASSERT_SUCCESS(status);
860 break;
861 }
862 if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
863 parseState = PARSE_DATA;
864 charIdx += 5;
865 tp.dataToBreak = "";
866 tp.expectedBreaks->removeAllElements();
867 tp.srcCol ->removeAllElements();
868 tp.srcLine->removeAllElements();
869 break;
870 }
871
872 errln("line %d: Tag expected in test file.", lineNum);
873 parseState = PARSE_COMMENT;
874 savedState = PARSE_DATA;
875 goto end_test; // Stop the test.
876 }
877 break;
878
879 case PARSE_RULES:
880 if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
881 charIdx += 7;
882 parseState = PARSE_TAG;
883 delete tp.bi;
884 UParseError pe;
885 tp.bi = new RuleBasedBreakIterator(rules, pe, status);
886 skipTest = U_FAILURE(status);
887 if (U_FAILURE(status)) {
888 errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
889 rulesFirstLine + pe.line - 1, u_errorName(status));
890 }
891 } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
892 charIdx += 10;
893 parseState = PARSE_TAG;
894 UErrorCode ec = U_ZERO_ERROR;
895 UParseError pe;
896 RuleBasedBreakIterator bi(rules, pe, ec);
897 if (U_SUCCESS(ec)) {
898 errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
899 rulesFirstLine + pe.line - 1);
900 }
901 } else {
902 rules.append(c);
903 }
904 break;
905
906 case PARSE_DATA:
907 if (c == u'•') {
908 int32_t breakIdx = tp.dataToBreak.length();
909 if (tp.expectedBreaks->size() > breakIdx) {
910 errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
911 lineNum, column);
912 }
913 tp.expectedBreaks->setSize(breakIdx+1);
914 tp.expectedBreaks->setElementAt(-1, breakIdx);
915 tp.srcLine->setSize(breakIdx+1);
916 tp.srcLine->setElementAt(lineNum, breakIdx);
917 tp.srcCol ->setSize(breakIdx+1);
918 tp.srcCol ->setElementAt(column, breakIdx);
919 break;
920 }
921
922 if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
923 // Add final entry to mappings from break location to source file position.
924 // Need one extra because last break position returned is after the
925 // last char in the data, not at the last char.
926 tp.srcLine->addElement(lineNum, status);
927 tp.srcCol ->addElement(column, status);
928
929 parseState = PARSE_TAG;
930 charIdx += 6;
931
932 if (!skipTest) {
933 // RUN THE TEST!
934 status = U_ZERO_ERROR;
935 tp.setUTF16(status);
936 executeTest(&tp, status);
937 TEST_ASSERT_SUCCESS(status);
938
939 // Run again, this time with UTF-8 text wrapped in a UText.
940 status = U_ZERO_ERROR;
941 tp.setUTF8(status);
942 TEST_ASSERT_SUCCESS(status);
943 executeTest(&tp, status);
944 }
945 break;
946 }
947
948 if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
949 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
950 // Get the code point from the name and insert it into the test data.
951 // (Damn, no API takes names in Unicode !!!
952 // we've got to take it back to char *)
953 int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
954 int32_t nameLength = nameEndIdx - (charIdx+2);
955 char charNameBuf[200];
956 UChar32 theChar = -1;
957 if (nameEndIdx != -1) {
958 UErrorCode status = U_ZERO_ERROR;
959 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
960 charNameBuf[sizeof(charNameBuf)-1] = 0;
961 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
962 if (U_FAILURE(status)) {
963 theChar = -1;
964 }
965 }
966 if (theChar == -1) {
967 errln("Error in named character in test file at line %d, col %d",
968 lineNum, column);
969 } else {
970 // Named code point was recognized. Insert it
971 // into the test data.
972 tp.dataToBreak.append(theChar);
973 while (tp.dataToBreak.length() > tp.srcLine->size()) {
974 tp.srcLine->addElement(lineNum, status);
975 tp.srcCol ->addElement(column, status);
976 }
977 }
978 if (nameEndIdx > charIdx) {
979 charIdx = nameEndIdx+1;
980
981 }
982 break;
983 }
984
985
986
987 if (testString.compare(charIdx-1, 2, u"<>") == 0) {
988 charIdx++;
989 int32_t breakIdx = tp.dataToBreak.length();
990 tp.expectedBreaks->setSize(breakIdx+1);
991 tp.expectedBreaks->setElementAt(-1, breakIdx);
992 tp.srcLine->setSize(breakIdx+1);
993 tp.srcLine->setElementAt(lineNum, breakIdx);
994 tp.srcCol ->setSize(breakIdx+1);
995 tp.srcCol ->setElementAt(column, breakIdx);
996 break;
997 }
998
999 if (c == u'<') {
1000 tagValue = 0;
1001 parseState = PARSE_NUM;
1002 break;
1003 }
1004
1005 if (c == u'#' && column==3) { // TODO: why is column off so far?
1006 parseState = PARSE_COMMENT;
1007 savedState = PARSE_DATA;
1008 break;
1009 }
1010
1011 if (c == u'\\') {
1012 // Check for \ at end of line, a line continuation.
1013 // Advance over (discard) the newline
1014 UChar32 cp = testString.char32At(charIdx);
1015 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
1016 // We have a CR LF
1017 // Need an extra increment of the input ptr to move over both of them
1018 charIdx++;
1019 }
1020 if (cp == u'\n' || cp == u'\r') {
1021 lineNum++;
1022 colStart = charIdx;
1023 charIdx++;
1024 break;
1025 }
1026
1027 // Let unescape handle the back slash.
1028 cp = testString.unescapeAt(charIdx);
1029 if (cp != -1) {
1030 // Escape sequence was recognized. Insert the char
1031 // into the test data.
1032 tp.dataToBreak.append(cp);
1033 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1034 tp.srcLine->addElement(lineNum, status);
1035 tp.srcCol ->addElement(column, status);
1036 }
1037 break;
1038 }
1039
1040
1041 // Not a recognized backslash escape sequence.
1042 // Take the next char as a literal.
1043 // TODO: Should this be an error?
1044 c = testString.charAt(charIdx);
1045 charIdx = testString.moveIndex32(charIdx, 1);
1046 }
1047
1048 // Normal, non-escaped data char.
1049 tp.dataToBreak.append(c);
1050
1051 // Save the mapping from offset in the data to line/column numbers in
1052 // the original input file. Will be used for better error messages only.
1053 // If there's an expected break before this char, the slot in the mapping
1054 // vector will already be set for this char; don't overwrite it.
1055 if (tp.dataToBreak.length() > tp.srcLine->size()) {
1056 tp.srcLine->addElement(lineNum, status);
1057 tp.srcCol ->addElement(column, status);
1058 }
1059 break;
1060
1061
1062 case PARSE_NUM:
1063 // We are parsing an expected numeric tag value, like <1234>,
1064 // within a chunk of data.
1065 if (u_isUWhiteSpace(c)) {
1066 break;
1067 }
1068
1069 if (c == u'>') {
1070 // Finished the number. Add the info to the expected break data,
1071 // and switch parse state back to doing plain data.
1072 parseState = PARSE_DATA;
1073 if (tagValue == 0) {
1074 tagValue = -1;
1075 }
1076 int32_t breakIdx = tp.dataToBreak.length();
1077 if (tp.expectedBreaks->size() > breakIdx) {
1078 errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
1079 lineNum, column);
1080 }
1081 tp.expectedBreaks->setSize(breakIdx+1);
1082 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1083 tp.srcLine->setSize(breakIdx+1);
1084 tp.srcLine->setElementAt(lineNum, breakIdx);
1085 tp.srcCol ->setSize(breakIdx+1);
1086 tp.srcCol ->setElementAt(column, breakIdx);
1087 break;
1088 }
1089
1090 if (u_isdigit(c)) {
1091 tagValue = tagValue*10 + u_charDigitValue(c);
1092 break;
1093 }
1094
1095 errln("Syntax Error in test file at line %d, col %d",
1096 lineNum, column);
1097 parseState = PARSE_COMMENT;
1098 goto end_test; // Stop the test
1099 break;
1100 }
1101
1102
1103 if (U_FAILURE(status)) {
1104 dataerrln("ICU Error %s while parsing test file at line %d.",
1105 u_errorName(status), lineNum);
1106 status = U_ZERO_ERROR;
1107 goto end_test; // Stop the test
1108 }
1109
1110 }
1111
1112 // Reached end of test file. Raise an error if parseState indicates that we are
1113 // within a block that should have been terminated.
1114
1115 if (parseState == PARSE_RULES) {
1116 errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
1117 lineNum, rulesFirstLine);
1118 }
1119 if (parseState == PARSE_DATA) {
1120 errln("rbbitst.txt:%d <data> block not closed.", lineNum);
1121 }
1122
1123
1124 end_test:
1125 delete [] testFile;
1126 #endif
1127 }
1128
1129
1130 //-------------------------------------------------------------------------------
1131 //
// TestDictRules Create a break iterator from source rules that include a
// dictionary range. Regression for bug #7130. The source rules
// do not declare a break iterator type (word, line, sentence, etc.),
// and the dictionary code, without a type, would loop.
1136 //
1137 //-------------------------------------------------------------------------------
TestDictRules()1138 void RBBITest::TestDictRules() {
1139 const char *rules = "$dictionary = [a-z]; \n"
1140 "!!forward; \n"
1141 "$dictionary $dictionary; \n"
1142 "!!reverse; \n"
1143 "$dictionary $dictionary; \n";
1144 const char *text = "aa";
1145 UErrorCode status = U_ZERO_ERROR;
1146 UParseError parseError;
1147
1148 RuleBasedBreakIterator bi(rules, parseError, status);
1149 if (U_SUCCESS(status)) {
1150 UnicodeString utext = text;
1151 bi.setText(utext);
1152 int32_t position;
1153 int32_t loops;
1154 for (loops = 0; loops<10; loops++) {
1155 position = bi.next();
1156 if (position == RuleBasedBreakIterator::DONE) {
1157 break;
1158 }
1159 }
1160 TEST_ASSERT(loops == 1);
1161 } else {
1162 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1163 }
1164 }
1165
1166
1167
1168 //-------------------------------------------------------------------------------
1169 //
1170 // ReadAndConvertFile Read a text data file, convert it to UChars, and
1171 // return the data in one big UChar * buffer, which the caller must delete.
1172 //
1173 // parameters:
1174 // fileName: the name of the file, with no directory part. The test data directory
1175 // is assumed.
1176 // ulen an out parameter, receives the actual length (in UChars) of the file data.
1177 // encoding The file encoding. If the file contains a BOM, that will override the encoding
1178 // specified here. The BOM, if it exists, will be stripped from the returned data.
1179 // Pass NULL for the system default encoding.
1180 // status
1181 // returns:
1182 // The file data, converted to UChar.
1183 // The caller must delete this when done with
1184 // delete [] theBuffer;
1185 //
1186 // TODO: This is a clone of RegexTest::ReadAndConvertFile.
1187 // Move this function to some common place.
1188 //
1189 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int & ulen,const char * encoding,UErrorCode & status)1190 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1191 UChar *retPtr = NULL;
1192 char *fileBuf = NULL;
1193 UConverter* conv = NULL;
1194 FILE *f = NULL;
1195
1196 ulen = 0;
1197 if (U_FAILURE(status)) {
1198 return retPtr;
1199 }
1200
1201 //
1202 // Open the file.
1203 //
1204 f = fopen(fileName, "rb");
1205 if (f == 0) {
1206 dataerrln("Error opening test data file %s\n", fileName);
1207 status = U_FILE_ACCESS_ERROR;
1208 return NULL;
1209 }
1210 //
1211 // Read it in
1212 //
1213 int fileSize;
1214 int amt_read;
1215
1216 fseek( f, 0, SEEK_END);
1217 fileSize = ftell(f);
1218 fileBuf = new char[fileSize];
1219 fseek(f, 0, SEEK_SET);
1220 amt_read = static_cast<int>(fread(fileBuf, 1, fileSize, f));
1221 if (amt_read != fileSize || fileSize <= 0) {
1222 errln("Error reading test data file.");
1223 goto cleanUpAndReturn;
1224 }
1225
1226 //
1227 // Look for a Unicode Signature (BOM) on the data just read
1228 //
1229 int32_t signatureLength;
1230 const char * fileBufC;
1231 const char* bomEncoding;
1232
1233 fileBufC = fileBuf;
1234 bomEncoding = ucnv_detectUnicodeSignature(
1235 fileBuf, fileSize, &signatureLength, &status);
1236 if(bomEncoding!=NULL ){
1237 fileBufC += signatureLength;
1238 fileSize -= signatureLength;
1239 encoding = bomEncoding;
1240 }
1241
1242 //
1243 // Open a converter to take the rule file to UTF-16
1244 //
1245 conv = ucnv_open(encoding, &status);
1246 if (U_FAILURE(status)) {
1247 goto cleanUpAndReturn;
1248 }
1249
1250 //
1251 // Convert the rules to UChar.
1252 // Preflight first to determine required buffer size.
1253 //
1254 ulen = ucnv_toUChars(conv,
1255 NULL, // dest,
1256 0, // destCapacity,
1257 fileBufC,
1258 fileSize,
1259 &status);
1260 if (status == U_BUFFER_OVERFLOW_ERROR) {
1261 // Buffer Overflow is expected from the preflight operation.
1262 status = U_ZERO_ERROR;
1263
1264 retPtr = new UChar[ulen+1];
1265 ucnv_toUChars(conv,
1266 retPtr, // dest,
1267 ulen+1,
1268 fileBufC,
1269 fileSize,
1270 &status);
1271 }
1272
1273 cleanUpAndReturn:
1274 fclose(f);
1275 delete []fileBuf;
1276 ucnv_close(conv);
1277 if (U_FAILURE(status)) {
1278 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1279 delete []retPtr;
1280 retPtr = 0;
1281 ulen = 0;
1282 }
1283 return retPtr;
1284 }
1285
1286
1287
1288 //--------------------------------------------------------------------------------------------
1289 //
1290 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
1291 //
1292 //-------------------------------------------------------------------------------------------
TestUnicodeFiles()1293 void RBBITest::TestUnicodeFiles() {
1294 RuleBasedBreakIterator *bi;
1295 UErrorCode status = U_ZERO_ERROR;
1296
1297 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1298 TEST_ASSERT_SUCCESS(status);
1299 if (U_SUCCESS(status)) {
1300 runUnicodeTestData("GraphemeBreakTest.txt", bi);
1301 }
1302 delete bi;
1303
1304 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1305 TEST_ASSERT_SUCCESS(status);
1306 if (U_SUCCESS(status)) {
1307 runUnicodeTestData("WordBreakTest.txt", bi);
1308 }
1309 delete bi;
1310
1311 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1312 TEST_ASSERT_SUCCESS(status);
1313 if (U_SUCCESS(status)) {
1314 runUnicodeTestData("SentenceBreakTest.txt", bi);
1315 }
1316 delete bi;
1317
1318 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1319 TEST_ASSERT_SUCCESS(status);
1320 if (U_SUCCESS(status)) {
1321 runUnicodeTestData("LineBreakTest.txt", bi);
1322 }
1323 delete bi;
1324 }
1325
1326
1327 // Check for test cases from the Unicode test data files that are known to fail
1328 // and should be skipped as known issues because ICU does not fully implement
1329 // the Unicode specifications, or because ICU includes tailorings that differ from
1330 // the Unicode standard.
1331 //
1332 // Test cases are identified by the test data sequence, which tends to be more stable
1333 // across Unicode versions than the test file line numbers.
1334 //
1335 // The test case with ticket "10666" is a dummy, included as an example.
1336
testCaseIsKnownIssue(const UnicodeString & testCase,const char * fileName)1337 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1338 static struct TestCase {
1339 const char *fTicketNum;
1340 const char *fFileName;
1341 const UChar *fString;
1342 } badTestCases[] = {
1343 {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"}, // Fake example, for illustration.
1344 // The following tests were originally for
1345 // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
1346 // However, that ticket has been closed as fixed but these tests still fail, so
1347 // ICU-21097 has been created to investigate and address these remaining issues.
1348 {"21097", "LineBreakTest.txt", u"-#"},
1349 {"21097", "LineBreakTest.txt", u"\u002d\u0308\u0023"},
1350 {"21097", "LineBreakTest.txt", u"\u002d\u00a7"},
1351 {"21097", "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
1352 {"21097", "LineBreakTest.txt", u"\u002d\U00050005"},
1353 {"21097", "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
1354 {"21097", "LineBreakTest.txt", u"\u002d\u0e01"},
1355 {"21097", "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
1356
1357 // The following tests were originally for
1358 // Issue ICU-12017 Improve line break around numbers.
1359 // However, that ticket has been closed as fixed but these tests still fail, so
1360 // ICU-21097 has been created to investigate and address these remaining issues.
1361 {"21097", "LineBreakTest.txt", u"\u002C\u0030"}, // ",0"
1362 {"21097", "LineBreakTest.txt", u"\u002C\u0308\u0030"},
1363 {"21097", "LineBreakTest.txt", u"equals .35 cents"},
1364 {"21097", "LineBreakTest.txt", u"a.2 "},
1365 {"21097", "LineBreakTest.txt", u"a.2 \u0915"},
1366 {"21097", "LineBreakTest.txt", u"a.2 \u672C"},
1367 {"21097", "LineBreakTest.txt", u"a.2\u3000\u672C"},
1368 {"21097", "LineBreakTest.txt", u"a.2\u3000\u307E"},
1369 {"21097", "LineBreakTest.txt", u"a.2\u3000\u0033"},
1370 {"21097", "LineBreakTest.txt", u"A.1 \uBABB"},
1371 {"21097", "LineBreakTest.txt", u"\uBD24\uC5B4\u002E\u0020\u0041\u002E\u0032\u0020\uBCFC"},
1372 {"21097", "LineBreakTest.txt", u"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"},
1373 {"21097", "LineBreakTest.txt", u"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"},
1374 {"21097", "LineBreakTest.txt", u"a.2\u3000\u300C"},
1375 };
1376
1377 for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
1378 const TestCase &badCase = badTestCases[n];
1379 if (!strcmp(fileName, badCase.fFileName) &&
1380 testCase == UnicodeString(badCase.fString)) {
1381 return logKnownIssue(badCase.fTicketNum);
1382 }
1383 }
1384 return FALSE;
1385 }
1386
1387
1388 //--------------------------------------------------------------------------------------------
1389 //
1390 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
1391 //
1392 //-------------------------------------------------------------------------------------------
runUnicodeTestData(const char * fileName,RuleBasedBreakIterator * bi)1393 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1394 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1395 UErrorCode status = U_ZERO_ERROR;
1396
1397 //
1398 // Open and read the test data file, put it into a UnicodeString.
1399 //
1400 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1401 char testFileName[1000];
1402 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1403 dataerrln("Can't open test data. Path too long.");
1404 return;
1405 }
1406 strcpy(testFileName, testDataDirectory);
1407 strcat(testFileName, fileName);
1408
1409 logln("Opening data file %s\n", fileName);
1410
1411 int len;
1412 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1413 if (status != U_FILE_ACCESS_ERROR) {
1414 TEST_ASSERT_SUCCESS(status);
1415 TEST_ASSERT(testFile != NULL);
1416 }
1417 if (U_FAILURE(status) || testFile == NULL) {
1418 return; /* something went wrong, error already output */
1419 }
1420 UnicodeString testFileAsString(TRUE, testFile, len);
1421
1422 //
1423 // Parse the test data file using a regular expression.
1424 // Each kind of token is recognized in its own capture group; what type of item was scanned
1425 // is identified by which group had a match.
1426 //
1427 // Caputure Group # 1 2 3 4 5
1428 // Parses this item: divide x hex digits comment \n unrecognized \n
1429 //
1430 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1431 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1432 UnicodeString testString;
1433 UVector32 breakPositions(status);
1434 int lineNumber = 1;
1435 TEST_ASSERT_SUCCESS(status);
1436 if (U_FAILURE(status)) {
1437 return;
1438 }
1439
1440 //
1441 // Scan through each test case, building up the string to be broken in testString,
1442 // and the positions that should be boundaries in the breakPositions vector.
1443 //
1444 int spin = 0;
1445 while (tokenMatcher.find()) {
1446 if(tokenMatcher.hitEnd()) {
1447 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.
1448 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1449 and caused an infinite loop here on EBCDIC systems!
1450 */
1451 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1452 // return;
1453 }
1454 if (tokenMatcher.start(1, status) >= 0) {
1455 // Scanned a divide sign, indicating a break position in the test data.
1456 if (testString.length()>0) {
1457 breakPositions.addElement(testString.length(), status);
1458 }
1459 }
1460 else if (tokenMatcher.start(2, status) >= 0) {
1461 // Scanned an 'x', meaning no break at this position in the test data
1462 // Nothing to be done here.
1463 }
1464 else if (tokenMatcher.start(3, status) >= 0) {
1465 // Scanned Hex digits. Convert them to binary, append to the character data string.
1466 const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1467 int length = hexNumber.length();
1468 if (length<=8) {
1469 char buf[10];
1470 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1471 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1472 if (c<=0x10ffff) {
1473 testString.append(c);
1474 } else {
1475 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1476 fileName, lineNumber);
1477 }
1478 } else {
1479 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1480 fileName, lineNumber);
1481 }
1482 }
1483 else if (tokenMatcher.start(4, status) >= 0) {
1484 // Scanned to end of a line, possibly skipping over a comment in the process.
1485 // If the line from the file contained test data, run the test now.
1486 if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1487 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1488 }
1489
1490 // Clear out this test case.
1491 // The string and breakPositions vector will be refilled as the next
1492 // test case is parsed.
1493 testString.remove();
1494 breakPositions.removeAllElements();
1495 lineNumber++;
1496 } else {
1497 // Scanner catchall. Something unrecognized appeared on the line.
1498 char token[16];
1499 UnicodeString uToken = tokenMatcher.group(0, status);
1500 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1501 token[sizeof(token)-1] = 0;
1502 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token);
1503
1504 // Clean up, in preparation for continuing with the next line.
1505 testString.remove();
1506 breakPositions.removeAllElements();
1507 lineNumber++;
1508 }
1509 TEST_ASSERT_SUCCESS(status);
1510 if (U_FAILURE(status)) {
1511 break;
1512 }
1513 }
1514
1515 delete [] testFile;
1516 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1517 }
1518
1519 //--------------------------------------------------------------------------------------------
1520 //
1521 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1522 // test data files. Do only a simple, forward-only check -
1523 // this test is mostly to check that ICU and the Unicode
1524 // data agree with each other.
1525 //
1526 //--------------------------------------------------------------------------------------------
checkUnicodeTestCase(const char * testFileName,int lineNumber,const UnicodeString & testString,UVector32 * breakPositions,RuleBasedBreakIterator * bi)1527 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1528 const UnicodeString &testString, // Text data to be broken
1529 UVector32 *breakPositions, // Positions where breaks should be found.
1530 RuleBasedBreakIterator *bi) {
1531 int32_t pos; // Break Position in the test string
1532 int32_t expectedI = 0; // Index of expected break position in the vector of expected results.
1533 int32_t expectedPos; // Expected break position (index into test string)
1534
1535 bi->setText(testString);
1536 pos = bi->first();
1537 pos = bi->next();
1538
1539 while (pos != BreakIterator::DONE) {
1540 if (expectedI >= breakPositions->size()) {
1541 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1542 testFileName, lineNumber, pos);
1543 break;
1544 }
1545 expectedPos = breakPositions->elementAti(expectedI);
1546 if (pos < expectedPos) {
1547 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1548 testFileName, lineNumber, pos);
1549 break;
1550 }
1551 if (pos > expectedPos) {
1552 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1553 testFileName, lineNumber, expectedPos);
1554 break;
1555 }
1556 pos = bi->next();
1557 expectedI++;
1558 }
1559
1560 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1561 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1562 testFileName, lineNumber, breakPositions->elementAti(expectedI));
1563 }
1564 }
1565
1566
1567
1568 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1569 //---------------------------------------------------------------------------------------
1570 //
// class RBBIMonkeyKind
1572 //
1573 // Monkey Test for Break Iteration
1574 // Abstract interface class. Concrete derived classes independently
1575 // implement the break rules for different iterator types.
1576 //
// The Monkey Test itself doesn't know which type of break iterator it is
1578 // testing, but works purely in terms of the interface defined here.
1579 //
1580 //---------------------------------------------------------------------------------------
1581 class RBBIMonkeyKind {
1582 public:
1583 // Return a UVector of UnicodeSets, representing the character classes used
1584 // for this type of iterator.
1585 virtual UVector *charClasses() = 0;
1586
1587 // Set the test text on which subsequent calls to next() will operate
1588 virtual void setText(const UnicodeString &s) = 0;
1589
1590 // Find the next break postion, starting from the prev break position, or from zero.
1591 // Return -1 after reaching end of string.
1592 virtual int32_t next(int32_t i) = 0;
1593
1594 // Name of each character class, parallel with charClasses. Used for debugging output
1595 // of characters.
1596 virtual std::vector<std::string>& characterClassNames();
1597
1598 void setAppliedRule(int32_t position, const char* value);
1599
1600 std::string getAppliedRule(int32_t position);
1601
1602 virtual ~RBBIMonkeyKind();
1603 UErrorCode deferredStatus;
1604
1605 std::string classNameFromCodepoint(const UChar32 c);
1606 unsigned int maxClassNameSize();
1607
1608 protected:
1609 RBBIMonkeyKind();
1610 std::vector<std::string> classNames;
1611 std::vector<std::string> appliedRules;
1612
1613 // Clear `appliedRules` and fill it with empty strings in the size of test text.
1614 void prepareAppliedRules(int32_t size );
1615
1616 private:
1617
1618 };
1619
RBBIMonkeyKind()1620 RBBIMonkeyKind::RBBIMonkeyKind() {
1621 deferredStatus = U_ZERO_ERROR;
1622 }
1623
~RBBIMonkeyKind()1624 RBBIMonkeyKind::~RBBIMonkeyKind() {
1625 }
1626
characterClassNames()1627 std::vector<std::string>& RBBIMonkeyKind::characterClassNames() {
1628 return classNames;
1629 }
1630
prepareAppliedRules(int32_t size)1631 void RBBIMonkeyKind::prepareAppliedRules(int32_t size) {
1632 // Remove all the information in the `appliedRules`.
1633 appliedRules.clear();
1634 appliedRules.resize(size + 1);
1635 }
1636
setAppliedRule(int32_t position,const char * value)1637 void RBBIMonkeyKind::setAppliedRule(int32_t position, const char* value) {
1638 appliedRules[position] = value;
1639 }
1640
getAppliedRule(int32_t position)1641 std::string RBBIMonkeyKind::getAppliedRule(int32_t position){
1642 return appliedRules[position];
1643 }
1644
classNameFromCodepoint(const UChar32 c)1645 std::string RBBIMonkeyKind::classNameFromCodepoint(const UChar32 c) {
1646 // Simply iterate through charClasses to find character's class
1647 for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1648 UnicodeSet *classSet = (UnicodeSet *)charClasses()->elementAt(aClassNum);
1649 if (classSet->contains(c)) {
1650 return classNames[aClassNum];
1651 }
1652 }
1653 U_ASSERT(FALSE); // This should not happen.
1654 return "bad class name";
1655 }
1656
maxClassNameSize()1657 unsigned int RBBIMonkeyKind::maxClassNameSize() {
1658 unsigned int maxSize = 0;
1659 for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1660 auto aClassNumSize = static_cast<unsigned int>(classNames[aClassNum].size());
1661 if (aClassNumSize > maxSize) {
1662 maxSize = aClassNumSize;
1663 }
1664 }
1665 return maxSize;
1666 }
1667
1668 //----------------------------------------------------------------------------------------
1669 //
1670 // Random Numbers. Similar to standard lib rand() and srand()
1671 // Not using library to
1672 // 1. Get same results on all platforms.
1673 // 2. Get access to current seed, to more easily reproduce failures.
1674 //
1675 //---------------------------------------------------------------------------------------
// Current generator state. Assign to it to replay a particular sequence.
static uint32_t m_seed = 1;

// Advance the linear congruential generator by one step and return a value
// in the range 0..32767, taken from bits 16..30 of the new state.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    m_seed = m_seed * kMultiplier + kIncrement;   // wraps modulo 2^32
    return (m_seed >> 16) & 0x7fff;
}
1683
1684
1685 //------------------------------------------------------------------------------------------
1686 //
1687 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
1688 // of RBBIMonkeyKind.
1689 //
1690 //------------------------------------------------------------------------------------------
1691 class RBBICharMonkey: public RBBIMonkeyKind {
1692 public:
1693 RBBICharMonkey();
1694 virtual ~RBBICharMonkey();
1695 virtual UVector *charClasses();
1696 virtual void setText(const UnicodeString &s);
1697 virtual int32_t next(int32_t i);
1698 private:
1699 UVector *fSets;
1700
1701 UnicodeSet *fCRLFSet;
1702 UnicodeSet *fControlSet;
1703 UnicodeSet *fExtendSet;
1704 UnicodeSet *fZWJSet;
1705 UnicodeSet *fRegionalIndicatorSet;
1706 UnicodeSet *fPrependSet;
1707 UnicodeSet *fSpacingSet;
1708 UnicodeSet *fLSet;
1709 UnicodeSet *fVSet;
1710 UnicodeSet *fTSet;
1711 UnicodeSet *fLVSet;
1712 UnicodeSet *fLVTSet;
1713 UnicodeSet *fHangulSet;
1714 UnicodeSet *fExtendedPictSet;
1715 UnicodeSet *fViramaSet;
1716 UnicodeSet *fLinkingConsonantSet;
1717 UnicodeSet *fExtCccZwjSet;
1718 UnicodeSet *fAnySet;
1719
1720 const UnicodeString *fText;
1721 };
1722
1723
RBBICharMonkey()1724 RBBICharMonkey::RBBICharMonkey() {
1725 UErrorCode status = U_ZERO_ERROR;
1726
1727 fText = NULL;
1728
1729 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1730 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
1731 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
1732 fZWJSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
1733 fRegionalIndicatorSet =
1734 new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1735 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1736 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1737 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1738 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1739 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1740 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1741 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1742 fHangulSet = new UnicodeSet();
1743 fHangulSet->addAll(*fLSet);
1744 fHangulSet->addAll(*fVSet);
1745 fHangulSet->addAll(*fTSet);
1746 fHangulSet->addAll(*fLVSet);
1747 fHangulSet->addAll(*fLVTSet);
1748
1749 fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1750 fViramaSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1751 "\\p{Indic_Syllabic_Category=Virama}]", status);
1752 fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1753 "\\p{Indic_Syllabic_Category=Consonant}]", status);
1754 fExtCccZwjSet = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status);
1755 fAnySet = new UnicodeSet(0, 0x10ffff);
1756
1757 // Create sets of characters, and add the names of the above character sets.
1758 // In each new ICU release, add new names corresponding to the sets above.
1759 fSets = new UVector(status);
1760
1761 // Important: Keep class names the same as the class contents.
1762 fSets->addElement(fCRLFSet, status); classNames.push_back("CRLF");
1763 fSets->addElement(fControlSet, status); classNames.push_back("Control");
1764 fSets->addElement(fExtendSet, status); classNames.push_back("Extended");
1765 fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
1766 if (!fPrependSet->isEmpty()) {
1767 fSets->addElement(fPrependSet, status); classNames.push_back("Prepend");
1768 }
1769 fSets->addElement(fSpacingSet, status); classNames.push_back("Spacing");
1770 fSets->addElement(fHangulSet, status); classNames.push_back("Hangul");
1771 fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
1772 fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
1773 fSets->addElement(fViramaSet, status); classNames.push_back("Virama");
1774 fSets->addElement(fLinkingConsonantSet, status); classNames.push_back("LinkingConsonant");
1775 fSets->addElement(fExtCccZwjSet, status); classNames.push_back("ExtCcccZwj");
1776 fSets->addElement(fAnySet, status); classNames.push_back("Any");
1777
1778 if (U_FAILURE(status)) {
1779 deferredStatus = status;
1780 }
1781 }
1782
1783
setText(const UnicodeString & s)1784 void RBBICharMonkey::setText(const UnicodeString &s) {
1785 fText = &s;
1786 prepareAppliedRules(s.length());
1787 }
1788
1789
1790
// Reference ("monkey") implementation of grapheme cluster boundary detection.
//
// Starting at the boundary prevPos, steps forward one code point at a time and
// tests the grapheme cluster rules (GB3 ... GB999) in priority order at each
// candidate position p2. A rule that forbids a break `continue`s to the next
// position; a rule that forces a break leaves the loop. The name of the
// deciding rule is recorded for p2 through setAppliedRule().
//
// Returns the index of the first boundary after prevPos, or -1 if construction
// failed (deferredStatus) or prevPos is already at or beyond the end of the text.
int32_t RBBICharMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
    UChar32 cBase;            // for (X Extend*) patterns, the X character.

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Previous break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }

    // All four positions start on prevPos; the first loop iterations only
    // shift real characters into c1/c2 ("warming up").
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = cBase = 0;
    (void)p0;   // suppress set but not used warning.
    (void)c0;

    // Loop runs once per "significant" character position in the input text.
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by one codepoint
        p3 = fText->moveIndex32(p3, 1);
        c3 = fText->char32At(p3);

        if (p1 == p2) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }

        if (p2 == fText->length()) {
            // Reached the end of the text: always a boundary.
            setAppliedRule(p2, "End of String");
            break;
        }

        //     No Extend or Format characters may appear between the CR and LF,
        //     which requires the additional check for p2 immediately following p1.
        //
        if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
            setAppliedRule(p2, "GB3 CR x LF");
            continue;
        }

        // Break after a control character or newline ...
        if (fControlSet->contains(c1) ||
            c1 == 0x0D ||
            c1 == 0x0A)  {
            setAppliedRule(p2, "GB4 ( Control | CR | LF ) <break>");
            break;
        }

        // ... and before one.
        if (fControlSet->contains(c2) ||
            c2 == 0x0D ||
            c2 == 0x0A)  {
            setAppliedRule(p2, "GB5 <break> ( Control | CR | LF )");
            break;
        }

        // Hangul syllable sequences (GB6 - GB8) are kept together.
        if (fLSet->contains(c1) &&
               (fLSet->contains(c2)  ||
                fVSet->contains(c2)  ||
                fLVSet->contains(c2) ||
                fLVTSet->contains(c2))) {
            setAppliedRule(p2, "GB6 L x ( L | V | LV | LVT )");
            continue;
        }

        if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
            (fVSet->contains(c2) || fTSet->contains(c2)))  {
            setAppliedRule(p2, "GB7 ( LV | V ) x ( V | T )");
            continue;
        }

        if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
            fTSet->contains(c2))  {
            setAppliedRule(p2, "GB8 ( LVT | T) x T");
            continue;
        }

        if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
            // Remember the character that started the current (X Extend*) run;
            // the GB11 test below needs it.
            if (!fExtendSet->contains(c1)) {
                cBase = c1;
            }
            setAppliedRule(p2, "GB9 x (Extend | ZWJ)");
            continue;
        }

        if (fSpacingSet->contains(c2)) {
            setAppliedRule(p2, "GB9a x SpacingMark");
            continue;
        }

        if (fPrependSet->contains(c1)) {
            setAppliedRule(p2, "GB9b Prepend x");
            continue;
        }

        // Note: Viramas are also included in the ExtCccZwj class.
        if (fLinkingConsonantSet->contains(c2)) {
            // Scan backwards from p1 over the ExtCccZwj run, noting whether it
            // contains a Virama, then test the character that precedes the run.
            int pi = p1;
            bool sawVirama = false;
            while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) {
                if (fViramaSet->contains(fText->char32At(pi))) {
                    sawVirama = true;
                }
                pi = fText->moveIndex32(pi, -1);
            }
            if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) {
                setAppliedRule(p2, "GB9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant");
                continue;
            }
        }

        if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
            setAppliedRule(p2, "GB11 Extended_Pictographic Extend * ZWJ x Extended_Pictographic");
            continue;
        }

        // Note: The first if condition is a little tricky. We only need to force
        //      a break if there are three or more contiguous RIs. If there are
        //      only two, a break following will occur via other rules, and will include
        //      any trailing extend characters, which is needed behavior.
        if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
                && fRegionalIndicatorSet->contains(c2)) {
            setAppliedRule(p2, "GB12-13 Regional_Indicator x Regional_Indicator");
            break;
        }
        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
            setAppliedRule(p2, "GB12-13 Regional_Indicator x Regional_Indicator");
            continue;
        }

        // No rule joined c1 and c2: break everywhere else.
        setAppliedRule(p2, "GB999 Any <break> Any");
        break;
    }

    breakPos = p2;
    return breakPos;
}
1940
1941
1942
charClasses()1943 UVector *RBBICharMonkey::charClasses() {
1944 return fSets;
1945 }
1946
~RBBICharMonkey()1947 RBBICharMonkey::~RBBICharMonkey() {
1948 delete fSets;
1949 delete fCRLFSet;
1950 delete fControlSet;
1951 delete fExtendSet;
1952 delete fRegionalIndicatorSet;
1953 delete fPrependSet;
1954 delete fSpacingSet;
1955 delete fLSet;
1956 delete fVSet;
1957 delete fTSet;
1958 delete fLVSet;
1959 delete fLVTSet;
1960 delete fHangulSet;
1961 delete fAnySet;
1962 delete fZWJSet;
1963 delete fExtendedPictSet;
1964 delete fViramaSet;
1965 delete fLinkingConsonantSet;
1966 delete fExtCccZwjSet;
1967 }
1968
1969 //------------------------------------------------------------------------------------------
1970 //
1971 // class RBBIWordMonkey Word Break specific implementation
1972 // of RBBIMonkeyKind.
1973 //
1974 //------------------------------------------------------------------------------------------
// Word-break flavour of the monkey test: holds one UnicodeSet per word-break
// character class and implements next() as a direct, rule-by-rule reference
// implementation of word boundary detection.
class RBBIWordMonkey: public RBBIMonkeyKind {
public:
    RBBIWordMonkey();
    virtual          ~RBBIWordMonkey();
    // The character-class sets registered by the constructor.
    virtual  UVector *charClasses();
    // Selects the text to segment; only the address of s is stored.
    virtual  void     setText(const UnicodeString &s);
    // Returns the boundary following position i, or -1 when there is none.
    virtual int32_t   next(int32_t i);
private:
    UVector      *fSets;             // Registered classes; the constructor adds sets to it.

    // One set per Word_Break property value, built in the constructor.
    UnicodeSet  *fCRSet;
    UnicodeSet  *fLFSet;
    UnicodeSet  *fNewlineSet;
    UnicodeSet  *fRegionalIndicatorSet;
    UnicodeSet  *fKatakanaSet;
    UnicodeSet  *fHebrew_LetterSet;
    UnicodeSet  *fALetterSet;        // ALetter minus the dictionary characters.
    UnicodeSet  *fSingle_QuoteSet;
    UnicodeSet  *fDouble_QuoteSet;
    UnicodeSet  *fMidNumLetSet;
    UnicodeSet  *fMidLetterSet;
    UnicodeSet  *fMidNumSet;
    UnicodeSet  *fNumericSet;
    UnicodeSet  *fFormatSet;
    UnicodeSet  *fOtherSet;          // Complement of the explicit classes.
    UnicodeSet  *fExtendSet;         // Word_Break=Extend minus [:Hani:].
    UnicodeSet  *fExtendNumLetSet;
    UnicodeSet  *fWSegSpaceSet;
    UnicodeSet  *fDictionarySet;     // Characters kept out of the generated test data.
    UnicodeSet  *fZWJSet;
    UnicodeSet  *fExtendedPictSet;

    const UnicodeString  *fText;     // Text under test; not owned (set by setText()).
};
2009
2010
RBBIWordMonkey()2011 RBBIWordMonkey::RBBIWordMonkey()
2012 {
2013 UErrorCode status = U_ZERO_ERROR;
2014
2015 fSets = new UVector(status);
2016
2017 fCRSet = new UnicodeSet(u"[\\p{Word_Break = CR}]", status);
2018 fLFSet = new UnicodeSet(u"[\\p{Word_Break = LF}]", status);
2019 fNewlineSet = new UnicodeSet(u"[\\p{Word_Break = Newline}]", status);
2020 fKatakanaSet = new UnicodeSet(u"[\\p{Word_Break = Katakana}]", status);
2021 fRegionalIndicatorSet = new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
2022 fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
2023 fALetterSet = new UnicodeSet(u"[\\p{Word_Break = ALetter}]", status);
2024 fSingle_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]", status);
2025 fDouble_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]", status);
2026 fMidNumLetSet = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]", status);
2027 fMidLetterSet = new UnicodeSet(u"[\\p{Word_Break = MidLetter}]", status);
2028 fMidNumSet = new UnicodeSet(u"[\\p{Word_Break = MidNum}]", status);
2029 fNumericSet = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
2030 fFormatSet = new UnicodeSet(u"[\\p{Word_Break = Format}]", status);
2031 fExtendNumLetSet = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
2032 // There are some sc=Hani characters with WB=Extend.
2033 // The break rules need to pick one or the other because
2034 // Extend overlapping with something else is messy.
2035 // For Unicode 13, we chose to keep U+16FF0 & U+16FF1
2036 // in $Han (for $dictionary) and out of $Extend.
2037 fExtendSet = new UnicodeSet(u"[\\p{Word_Break = Extend}-[:Hani:]]", status);
2038 fWSegSpaceSet = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]", status);
2039
2040 fZWJSet = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]", status);
2041 fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
2042
2043 fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
2044 fDictionarySet->addAll(*fKatakanaSet);
2045 fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
2046
2047 fALetterSet->removeAll(*fDictionarySet);
2048
2049 fOtherSet = new UnicodeSet();
2050 if(U_FAILURE(status)) {
2051 IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
2052 deferredStatus = status;
2053 return;
2054 }
2055
2056 fOtherSet->complement();
2057 fOtherSet->removeAll(*fCRSet);
2058 fOtherSet->removeAll(*fLFSet);
2059 fOtherSet->removeAll(*fNewlineSet);
2060 fOtherSet->removeAll(*fKatakanaSet);
2061 fOtherSet->removeAll(*fHebrew_LetterSet);
2062 fOtherSet->removeAll(*fALetterSet);
2063 fOtherSet->removeAll(*fSingle_QuoteSet);
2064 fOtherSet->removeAll(*fDouble_QuoteSet);
2065 fOtherSet->removeAll(*fMidLetterSet);
2066 fOtherSet->removeAll(*fMidNumSet);
2067 fOtherSet->removeAll(*fNumericSet);
2068 fOtherSet->removeAll(*fExtendNumLetSet);
2069 fOtherSet->removeAll(*fWSegSpaceSet);
2070 fOtherSet->removeAll(*fFormatSet);
2071 fOtherSet->removeAll(*fExtendSet);
2072 fOtherSet->removeAll(*fRegionalIndicatorSet);
2073 fOtherSet->removeAll(*fZWJSet);
2074 fOtherSet->removeAll(*fExtendedPictSet);
2075
2076 // Inhibit dictionary characters from being tested at all.
2077 fOtherSet->removeAll(*fDictionarySet);
2078
2079 // Add classes and their names
2080 fSets->addElement(fCRSet, status); classNames.push_back("CR");
2081 fSets->addElement(fLFSet, status); classNames.push_back("LF");
2082 fSets->addElement(fNewlineSet, status); classNames.push_back("Newline");
2083 fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
2084 fSets->addElement(fHebrew_LetterSet, status); classNames.push_back("Hebrew");
2085 fSets->addElement(fALetterSet, status); classNames.push_back("ALetter");
2086 fSets->addElement(fSingle_QuoteSet, status); classNames.push_back("Single Quote");
2087 fSets->addElement(fDouble_QuoteSet, status); classNames.push_back("Double Quote");
2088 // Omit Katakana from fSets, which omits Katakana characters
2089 // from the test data. They are all in the dictionary set,
2090 // which this (old, to be retired) monkey test cannot handle.
2091 //fSets->addElement(fKatakanaSet, status);
2092
2093 fSets->addElement(fMidLetterSet, status); classNames.push_back("MidLetter");
2094 fSets->addElement(fMidNumLetSet, status); classNames.push_back("MidNumLet");
2095 fSets->addElement(fMidNumSet, status); classNames.push_back("MidNum");
2096 fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
2097 fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2098 fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2099 fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2100 fSets->addElement(fExtendNumLetSet, status); classNames.push_back("ExtendNumLet");
2101 fSets->addElement(fWSegSpaceSet, status); classNames.push_back("WSegSpace");
2102
2103 fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
2104 fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
2105
2106 if (U_FAILURE(status)) {
2107 deferredStatus = status;
2108 }
2109 }
2110
setText(const UnicodeString & s)2111 void RBBIWordMonkey::setText(const UnicodeString &s) {
2112 fText = &s;
2113 prepareAppliedRules(s.length());
2114 }
2115
2116
next(int32_t prevPos)2117 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2118 int p0, p1, p2, p3; // Indices of the significant code points around the
2119 // break position being tested. The candidate break
2120 // location is before p2.
2121
2122 int breakPos = -1;
2123
2124 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2125
2126 if (U_FAILURE(deferredStatus)) {
2127 return -1;
2128 }
2129
2130 // Prev break at end of string. return DONE.
2131 if (prevPos >= fText->length()) {
2132 return -1;
2133 }
2134 p0 = p1 = p2 = p3 = prevPos;
2135 c3 = fText->char32At(prevPos);
2136 c0 = c1 = c2 = 0;
2137 (void)p0; // Suppress set but not used warning.
2138
2139 // Loop runs once per "significant" character position in the input text.
2140 for (;;) {
2141 // Move all of the positions forward in the input string.
2142 p0 = p1; c0 = c1;
2143 p1 = p2; c1 = c2;
2144 p2 = p3; c2 = c3;
2145
2146 // Advance p3 by X(Extend | Format)* Rule 4
2147 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2148 do {
2149 p3 = fText->moveIndex32(p3, 1);
2150 c3 = fText->char32At(p3);
2151 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2152 break;
2153 }
2154 }
2155 while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
2156
2157
2158 if (p1 == p2) {
2159 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2160 continue;
2161 }
2162
2163 if (p2 == fText->length()) {
2164 // Reached end of string. Always a break position.
2165 break;
2166 }
2167
2168 // No Extend or Format characters may appear between the CR and LF,
2169 // which requires the additional check for p2 immediately following p1.
2170 //
2171 if (c1==0x0D && c2==0x0A) {
2172 setAppliedRule(p2, "WB3 CR x LF");
2173 continue;
2174 }
2175
2176 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2177 setAppliedRule(p2, "WB3a Break before and after newlines (including CR and LF)");
2178 break;
2179 }
2180 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2181 setAppliedRule(p2, "WB3a Break before and after newlines (including CR and LF)");
2182 break;
2183 }
2184
2185 // Not ignoring extend chars, so peek into input text to
2186 // get the potential ZWJ, the character immediately preceding c2.
2187 // Sloppy UChar32 indexing: p2-1 may reference trail half
2188 // but char32At will get the full code point.
2189 if (fZWJSet->contains(fText->char32At(p2 - 1)) && fExtendedPictSet->contains(c2)){
2190 setAppliedRule(p2, "WB3c ZWJ x Extended_Pictographic");
2191 continue;
2192 }
2193
2194 if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
2195 setAppliedRule(p2, "WB3d Keep horizontal whitespace together.");
2196 continue;
2197 }
2198
2199 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2200 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2201 setAppliedRule(p2, "WB4 (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)");
2202 continue;
2203 }
2204
2205 if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2206 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2207 (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2208 setAppliedRule(p2,
2209 "WB6 (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter _Letter)");
2210 continue;
2211 }
2212
2213 if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2214 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2215 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2216 setAppliedRule(p2,
2217 "WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)");
2218 continue;
2219 }
2220
2221 if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2222 setAppliedRule(p2, "WB7a Hebrew_Letter x Single_Quote");
2223 continue;
2224 }
2225
2226 if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2227 setAppliedRule(p2, "WB7b Hebrew_Letter x Double_Quote Hebrew_Letter");
2228 continue;
2229 }
2230
2231 if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2232 setAppliedRule(p2, "WB7c Hebrew_Letter Double_Quote x Hebrew_Letter");
2233 continue;
2234 }
2235
2236 if (fNumericSet->contains(c1) &&
2237 fNumericSet->contains(c2)) {
2238 setAppliedRule(p2, "WB8 Numeric x Numeric");
2239 continue;
2240 }
2241
2242 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2243 fNumericSet->contains(c2)) {
2244 setAppliedRule(p2, "WB9 (ALetter | Hebrew_Letter) x Numeric");
2245 continue;
2246 }
2247
2248 if (fNumericSet->contains(c1) &&
2249 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2250 setAppliedRule(p2, "WB10 Numeric x (ALetter | Hebrew_Letter)");
2251 continue;
2252 }
2253
2254 if (fNumericSet->contains(c0) &&
2255 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2256 fNumericSet->contains(c2)) {
2257 setAppliedRule(p2, "WB11 Numeric (MidNum | MidNumLet | Single_Quote) x Numeric");
2258 continue;
2259 }
2260
2261 if (fNumericSet->contains(c1) &&
2262 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2263 fNumericSet->contains(c3)) {
2264 setAppliedRule(p2, "WB12 Numeric x (MidNum | MidNumLet | SingleQuote) Numeric");
2265 continue;
2266 }
2267
2268 // Note: matches UAX 29 rules, but doesn't come into play for ICU because
2269 // all Katakana are handled by the dictionary breaker.
2270 if (fKatakanaSet->contains(c1) &&
2271 fKatakanaSet->contains(c2)) {
2272 setAppliedRule(p2, "WB13 Katakana x Katakana");
2273 continue;
2274 }
2275
2276 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2277 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2278 fExtendNumLetSet->contains(c2)) {
2279 setAppliedRule(p2,
2280 "WB13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet");
2281 continue;
2282 }
2283
2284 if (fExtendNumLetSet->contains(c1) &&
2285 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2286 fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) {
2287 setAppliedRule(p2, "WB13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)");
2288 continue;
2289 }
2290
2291 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2292 setAppliedRule(p2, "WB15 - WB17 Group pairs of Regional Indicators.");
2293 break;
2294 }
2295 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2296 setAppliedRule(p2, "WB15 - WB17 Group pairs of Regional Indicators.");
2297 continue;
2298 }
2299
2300 setAppliedRule(p2, "WB999");
2301 break;
2302 }
2303
2304 breakPos = p2;
2305 return breakPos;
2306 }
2307
2308
charClasses()2309 UVector *RBBIWordMonkey::charClasses() {
2310 return fSets;
2311 }
2312
~RBBIWordMonkey()2313 RBBIWordMonkey::~RBBIWordMonkey() {
2314 delete fSets;
2315 delete fCRSet;
2316 delete fLFSet;
2317 delete fNewlineSet;
2318 delete fKatakanaSet;
2319 delete fHebrew_LetterSet;
2320 delete fALetterSet;
2321 delete fSingle_QuoteSet;
2322 delete fDouble_QuoteSet;
2323 delete fMidNumLetSet;
2324 delete fMidLetterSet;
2325 delete fMidNumSet;
2326 delete fNumericSet;
2327 delete fFormatSet;
2328 delete fExtendSet;
2329 delete fExtendNumLetSet;
2330 delete fWSegSpaceSet;
2331 delete fRegionalIndicatorSet;
2332 delete fDictionarySet;
2333 delete fOtherSet;
2334 delete fZWJSet;
2335 delete fExtendedPictSet;
2336 }
2337
2338
2339
2340
2341 //------------------------------------------------------------------------------------------
2342 //
2343 // class RBBISentMonkey Sentence Break specific implementation
2344 // of RBBIMonkeyKind.
2345 //
2346 //------------------------------------------------------------------------------------------
// Sentence-break flavour of the monkey test: holds one UnicodeSet per
// sentence-break character class and implements next() as a direct,
// rule-by-rule reference implementation of sentence boundary detection.
class RBBISentMonkey: public RBBIMonkeyKind {
public:
    RBBISentMonkey();
    virtual          ~RBBISentMonkey();
    // The character-class sets registered by the constructor.
    virtual  UVector *charClasses();
    // Selects the text to segment; only the address of s is stored.
    virtual  void     setText(const UnicodeString &s);
    // Returns the boundary following position i, or -1 when there is none.
    virtual int32_t   next(int32_t i);
private:
    // Index of the significant code point before posFrom, skipping Extend and
    // Format characters; -1 when posFrom is at or before the start.
    int               moveBack(int posFrom);
    // Index of the next significant code point after posFrom, skipping Extend
    // and Format characters; the text length when there is none.
    int               moveForward(int posFrom);
    // Code point at pos, or -1 when pos is outside the text.
    UChar32           cAt(int pos);

    UVector      *fSets;             // Registered classes; the constructor adds sets to it.

    // One set per Sentence_Break property value, built in the constructor.
    UnicodeSet  *fSepSet;            // Sep plus CR and LF.
    UnicodeSet  *fFormatSet;
    UnicodeSet  *fSpSet;
    UnicodeSet  *fLowerSet;
    UnicodeSet  *fUpperSet;
    UnicodeSet  *fOLetterSet;
    UnicodeSet  *fNumericSet;
    UnicodeSet  *fATermSet;
    UnicodeSet  *fSContinueSet;
    UnicodeSet  *fSTermSet;
    UnicodeSet  *fCloseSet;
    UnicodeSet  *fOtherSet;          // Complement of all the sets above and fExtendSet.
    UnicodeSet  *fExtendSet;

    const UnicodeString  *fText;     // Text under test; not owned (set by setText()).
};
2377
RBBISentMonkey()2378 RBBISentMonkey::RBBISentMonkey()
2379 {
2380 UErrorCode status = U_ZERO_ERROR;
2381
2382 fSets = new UVector(status);
2383
2384 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2385 // set and made into character classes of their own. For the monkey impl,
2386 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2387 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status);
2388 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status);
2389 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status);
2390 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status);
2391 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status);
2392 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status);
2393 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status);
2394 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status);
2395 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2396 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status);
2397 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status);
2398 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status);
2399 fOtherSet = new UnicodeSet();
2400
2401 if(U_FAILURE(status)) {
2402 deferredStatus = status;
2403 return;
2404 }
2405
2406 fOtherSet->complement();
2407 fOtherSet->removeAll(*fSepSet);
2408 fOtherSet->removeAll(*fFormatSet);
2409 fOtherSet->removeAll(*fSpSet);
2410 fOtherSet->removeAll(*fLowerSet);
2411 fOtherSet->removeAll(*fUpperSet);
2412 fOtherSet->removeAll(*fOLetterSet);
2413 fOtherSet->removeAll(*fNumericSet);
2414 fOtherSet->removeAll(*fATermSet);
2415 fOtherSet->removeAll(*fSContinueSet);
2416 fOtherSet->removeAll(*fSTermSet);
2417 fOtherSet->removeAll(*fCloseSet);
2418 fOtherSet->removeAll(*fExtendSet);
2419
2420 fSets->addElement(fSepSet, status); classNames.push_back("Sep");
2421 fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2422 fSets->addElement(fSpSet, status); classNames.push_back("Sp");
2423 fSets->addElement(fLowerSet, status); classNames.push_back("Lower");
2424 fSets->addElement(fUpperSet, status); classNames.push_back("Upper");
2425 fSets->addElement(fOLetterSet, status); classNames.push_back("OLetter");
2426 fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
2427 fSets->addElement(fATermSet, status); classNames.push_back("ATerm");
2428 fSets->addElement(fSContinueSet, status); classNames.push_back("SContinue");
2429 fSets->addElement(fSTermSet, status); classNames.push_back("STerm");
2430 fSets->addElement(fCloseSet, status); classNames.push_back("Close");
2431 fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2432 fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2433
2434 if (U_FAILURE(status)) {
2435 deferredStatus = status;
2436 }
2437 }
2438
2439
2440
setText(const UnicodeString & s)2441 void RBBISentMonkey::setText(const UnicodeString &s) {
2442 fText = &s;
2443 prepareAppliedRules(s.length());
2444 }
2445
charClasses()2446 UVector *RBBISentMonkey::charClasses() {
2447 return fSets;
2448 }
2449
2450 // moveBack() Find the "significant" code point preceding the index i.
2451 // Skips over ($Extend | $Format)* .
2452 //
moveBack(int i)2453 int RBBISentMonkey::moveBack(int i) {
2454 if (i <= 0) {
2455 return -1;
2456 }
2457 UChar32 c;
2458 int32_t j = i;
2459 do {
2460 j = fText->moveIndex32(j, -1);
2461 c = fText->char32At(j);
2462 }
2463 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2464 return j;
2465
2466 }
2467
2468
moveForward(int i)2469 int RBBISentMonkey::moveForward(int i) {
2470 if (i>=fText->length()) {
2471 return fText->length();
2472 }
2473 UChar32 c;
2474 int32_t j = i;
2475 do {
2476 j = fText->moveIndex32(j, 1);
2477 c = cAt(j);
2478 }
2479 while (fFormatSet->contains(c) || fExtendSet->contains(c));
2480 return j;
2481 }
2482
cAt(int pos)2483 UChar32 RBBISentMonkey::cAt(int pos) {
2484 if (pos<0 || pos>=fText->length()) {
2485 return -1;
2486 } else {
2487 return fText->char32At(pos);
2488 }
2489 }
2490
next(int32_t prevPos)2491 int32_t RBBISentMonkey::next(int32_t prevPos) {
2492 int p0, p1, p2, p3; // Indices of the significant code points around the
2493 // break position being tested. The candidate break
2494 // location is before p2.
2495
2496 int breakPos = -1;
2497
2498 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2499 UChar32 c;
2500
2501 if (U_FAILURE(deferredStatus)) {
2502 return -1;
2503 }
2504
2505 // Prev break at end of string. return DONE.
2506 if (prevPos >= fText->length()) {
2507 return -1;
2508 }
2509 p0 = p1 = p2 = p3 = prevPos;
2510 c3 = fText->char32At(prevPos);
2511 c0 = c1 = c2 = 0;
2512 (void)p0; // Suppress set but not used warning.
2513
2514 // Loop runs once per "significant" character position in the input text.
2515 for (;;) {
2516 // Move all of the positions forward in the input string.
2517 p0 = p1; c0 = c1;
2518 p1 = p2; c1 = c2;
2519 p2 = p3; c2 = c3;
2520
2521 // Advance p3 by X(Extend | Format)* Rule 4
2522 p3 = moveForward(p3);
2523 c3 = cAt(p3);
2524
2525 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2526 setAppliedRule(p2, "SB3 CR x LF");
2527 continue;
2528 }
2529
2530 if (fSepSet->contains(c1)) {
2531 p2 = p1+1; // Separators don't combine with Extend or Format.
2532
2533 setAppliedRule(p2, "SB4 Sep <break>");
2534 break;
2535 }
2536
2537 if (p2 >= fText->length()) {
2538 // Reached end of string. Always a break position.
2539 setAppliedRule(p2, "SB4 Sep <break>");
2540 break;
2541 }
2542
2543 if (p2 == prevPos) {
2544 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2545 setAppliedRule(p2, "SB4 Sep <break>");
2546 continue;
2547 }
2548
2549 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) {
2550 setAppliedRule(p2, "SB6 ATerm x Numeric");
2551 continue;
2552 }
2553
2554 if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2555 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2556 setAppliedRule(p2, "SB7 (Upper | Lower) ATerm x Uppper");
2557 continue;
2558 }
2559
2560 // Note: STerm | ATerm are added to the negated part of the expression by a
2561 // note to the Unicode 5.0 documents.
2562 int p8 = p1;
2563 while (fSpSet->contains(cAt(p8))) {
2564 p8 = moveBack(p8);
2565 }
2566 while (fCloseSet->contains(cAt(p8))) {
2567 p8 = moveBack(p8);
2568 }
2569 if (fATermSet->contains(cAt(p8))) {
2570 p8=p2;
2571 for (;;) {
2572 c = cAt(p8);
2573 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2574 fLowerSet->contains(c) || fSepSet->contains(c) ||
2575 fATermSet->contains(c) || fSTermSet->contains(c)) {
2576
2577 setAppliedRule(p2,
2578 "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2579 break;
2580 }
2581 p8 = moveForward(p8);
2582 }
2583 if (fLowerSet->contains(cAt(p8))) {
2584
2585 setAppliedRule(p2,
2586 "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2587 continue;
2588 }
2589 }
2590
2591 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2592 p8 = p1;
2593 while (fSpSet->contains(cAt(p8))) {
2594 p8 = moveBack(p8);
2595 }
2596 while (fCloseSet->contains(cAt(p8))) {
2597 p8 = moveBack(p8);
2598 }
2599 c = cAt(p8);
2600 if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2601 setAppliedRule(p2, "SB8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm)");
2602 continue;
2603 }
2604 }
2605
2606 int p9 = p1;
2607 while (fCloseSet->contains(cAt(p9))) {
2608 p9 = moveBack(p9);
2609 }
2610 c = cAt(p9);
2611 if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2612 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2613
2614 setAppliedRule(p2, "SB9 (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)");
2615 continue;
2616 }
2617 }
2618
2619 int p10 = p1;
2620 while (fSpSet->contains(cAt(p10))) {
2621 p10 = moveBack(p10);
2622 }
2623 while (fCloseSet->contains(cAt(p10))) {
2624 p10 = moveBack(p10);
2625 }
2626 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2627 if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2628 setAppliedRule(p2, "SB10 (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)");
2629 continue;
2630 }
2631 }
2632
2633 int p11 = p1;
2634 if (fSepSet->contains(cAt(p11))) {
2635 p11 = moveBack(p11);
2636 }
2637 while (fSpSet->contains(cAt(p11))) {
2638 p11 = moveBack(p11);
2639 }
2640 while (fCloseSet->contains(cAt(p11))) {
2641 p11 = moveBack(p11);
2642 }
2643 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2644 setAppliedRule(p2, "SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>");
2645 break;
2646 }
2647
2648 setAppliedRule(p2, "SB12 Any x Any");
2649 continue;
2650 }
2651
2652 breakPos = p2;
2653 return breakPos;
2654 }
2655
~RBBISentMonkey()2656 RBBISentMonkey::~RBBISentMonkey() {
2657 delete fSets;
2658 delete fSepSet;
2659 delete fFormatSet;
2660 delete fSpSet;
2661 delete fLowerSet;
2662 delete fUpperSet;
2663 delete fOLetterSet;
2664 delete fNumericSet;
2665 delete fATermSet;
2666 delete fSContinueSet;
2667 delete fSTermSet;
2668 delete fCloseSet;
2669 delete fOtherSet;
2670 delete fExtendSet;
2671 }
2672
2673
2674
2675 //-------------------------------------------------------------------------------------------
2676 //
2677 // RBBILineMonkey
2678 //
2679 //-------------------------------------------------------------------------------------------
2680
// Line-break flavour of the monkey test. Holds one UnicodeSet per line-break
// character class; the constructor builds the sets from Line_Break property
// patterns.
class RBBILineMonkey: public RBBIMonkeyKind {
public:
    RBBILineMonkey();
    virtual          ~RBBILineMonkey();
    // The character-class sets registered by the constructor.
    virtual  UVector *charClasses();
    // Selects the text to segment.
    virtual  void     setText(const UnicodeString &s);
    // Returns the boundary following position i.
    virtual  int32_t  next(int32_t i);
    // NOTE(review): presumably applies the combining-mark handling of line
    // break rule 9 to the position/character pairs passed in; the definition
    // is outside this part of the file, so confirm there.
    virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
private:
    UVector      *fSets;             // Initialized to NULL, then allocated in the constructor.

    // Character classes. Most are named after the Line_Break property value
    // used to build them in the constructor (BK, CR, LF, ...).
    UnicodeSet  *fBK;
    UnicodeSet  *fCR;
    UnicodeSet  *fLF;
    UnicodeSet  *fCM;
    UnicodeSet  *fNL;
    UnicodeSet  *fSG;
    UnicodeSet  *fWJ;
    UnicodeSet  *fZW;
    UnicodeSet  *fGL;
    UnicodeSet  *fCB;
    UnicodeSet  *fSP;
    UnicodeSet  *fB2;
    UnicodeSet  *fBA;
    UnicodeSet  *fBB;
    UnicodeSet  *fHH;                // Created as an empty set by the constructor.
    UnicodeSet  *fHY;
    UnicodeSet  *fH2;
    UnicodeSet  *fH3;
    UnicodeSet  *fCL;
    UnicodeSet  *fCP;
    UnicodeSet  *fEX;
    UnicodeSet  *fIN;
    UnicodeSet  *fJL;
    UnicodeSet  *fJV;
    UnicodeSet  *fJT;
    UnicodeSet  *fNS;
    UnicodeSet  *fOP;
    UnicodeSet  *fQU;
    UnicodeSet  *fIS;
    UnicodeSet  *fNU;
    UnicodeSet  *fPO;
    UnicodeSet  *fPR;
    UnicodeSet  *fSY;
    UnicodeSet  *fAI;
    UnicodeSet  *fAL;
    UnicodeSet  *fCJ;
    UnicodeSet  *fHL;
    UnicodeSet  *fID;
    UnicodeSet  *fRI;
    UnicodeSet  *fXX;
    UnicodeSet  *fEB;
    UnicodeSet  *fEM;
    UnicodeSet  *fZWJ;
    UnicodeSet  *fOP30;
    UnicodeSet  *fCP30;

    BreakIterator        *fCharBI;          // Initialized to NULL by the constructor.
    const UnicodeString  *fText;            // Initialized to NULL by the constructor.
    RegexMatcher         *fNumberMatcher;   // Initialized to NULL by the constructor.
};
2742
RBBILineMonkey()2743 RBBILineMonkey::RBBILineMonkey() :
2744 RBBIMonkeyKind(),
2745 fSets(NULL),
2746
2747 fCharBI(NULL),
2748 fText(NULL),
2749 fNumberMatcher(NULL)
2750
2751 {
2752 if (U_FAILURE(deferredStatus)) {
2753 return;
2754 }
2755
2756 UErrorCode status = U_ZERO_ERROR;
2757
2758 fSets = new UVector(status);
2759
2760 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2761 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2762 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2763 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2764 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2765 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2766 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2767 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2768 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2769 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2770 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2771 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2772 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2773 fHH = new UnicodeSet();
2774 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2775 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2776 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2777 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2778 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2779 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2780 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2781 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2782 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2783 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2784 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2785 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2786 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2787 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2788 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2789 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2790 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2791 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2792 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2793 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2794 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2795 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2796 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2797 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2798 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2799 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2800 fEB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status);
2801 fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
2802 fZWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
2803 fOP30 = new UnicodeSet(u"[\\p{Line_break=OP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2804 fCP30 = new UnicodeSet(u"[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2805
2806 if (U_FAILURE(status)) {
2807 deferredStatus = status;
2808 return;
2809 }
2810
2811 fAL->addAll(*fXX); // Default behavior for XX is identical to AL
2812 fAL->addAll(*fAI); // Default behavior for AI is identical to AL
2813 fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
2814
2815 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
2816 fCM->addAll(*fZWJ); // ZWJ behaves as a CM.
2817
2818 fHH->add(u'\u2010'); // Hyphen, '‐'
2819
2820 // Sets and names.
2821 fSets->addElement(fBK, status); classNames.push_back("fBK");
2822 fSets->addElement(fCR, status); classNames.push_back("fCR");
2823 fSets->addElement(fLF, status); classNames.push_back("fLF");
2824 fSets->addElement(fCM, status); classNames.push_back("fCM");
2825 fSets->addElement(fNL, status); classNames.push_back("fNL");
2826 fSets->addElement(fWJ, status); classNames.push_back("fWJ");
2827 fSets->addElement(fZW, status); classNames.push_back("fZW");
2828 fSets->addElement(fGL, status); classNames.push_back("fGL");
2829 fSets->addElement(fCB, status); classNames.push_back("fCB");
2830 fSets->addElement(fSP, status); classNames.push_back("fSP");
2831 fSets->addElement(fB2, status); classNames.push_back("fB2");
2832 fSets->addElement(fBA, status); classNames.push_back("fBA");
2833 fSets->addElement(fBB, status); classNames.push_back("fBB");
2834 fSets->addElement(fHY, status); classNames.push_back("fHY");
2835 fSets->addElement(fH2, status); classNames.push_back("fH2");
2836 fSets->addElement(fH3, status); classNames.push_back("fH3");
2837 fSets->addElement(fCL, status); classNames.push_back("fCL");
2838 fSets->addElement(fCP, status); classNames.push_back("fCP");
2839 fSets->addElement(fEX, status); classNames.push_back("fEX");
2840 fSets->addElement(fIN, status); classNames.push_back("fIN");
2841 fSets->addElement(fJL, status); classNames.push_back("fJL");
2842 fSets->addElement(fJT, status); classNames.push_back("fJT");
2843 fSets->addElement(fJV, status); classNames.push_back("fJV");
2844 fSets->addElement(fNS, status); classNames.push_back("fNS");
2845 fSets->addElement(fOP, status); classNames.push_back("fOP");
2846 fSets->addElement(fQU, status); classNames.push_back("fQU");
2847 fSets->addElement(fIS, status); classNames.push_back("fIS");
2848 fSets->addElement(fNU, status); classNames.push_back("fNU");
2849 fSets->addElement(fPO, status); classNames.push_back("fPO");
2850 fSets->addElement(fPR, status); classNames.push_back("fPR");
2851 fSets->addElement(fSY, status); classNames.push_back("fSY");
2852 fSets->addElement(fAI, status); classNames.push_back("fAI");
2853 fSets->addElement(fAL, status); classNames.push_back("fAL");
2854 fSets->addElement(fHL, status); classNames.push_back("fHL");
2855 fSets->addElement(fID, status); classNames.push_back("fID");
2856 fSets->addElement(fWJ, status); classNames.push_back("fWJ");
2857 fSets->addElement(fRI, status); classNames.push_back("fRI");
2858 fSets->addElement(fSG, status); classNames.push_back("fSG");
2859 fSets->addElement(fEB, status); classNames.push_back("fEB");
2860 fSets->addElement(fEM, status); classNames.push_back("fEM");
2861 fSets->addElement(fZWJ, status); classNames.push_back("fZWJ");
2862 // TODO: fOP30 & fCP30 overlap with plain fOP. Probably OK, but fOP/CP chars will be over-represented.
2863 fSets->addElement(fOP30, status); classNames.push_back("fOP30");
2864 fSets->addElement(fCP30, status); classNames.push_back("fCP30");
2865
2866 const char *rules =
2867 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
2868 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
2869 "((\\p{Line_Break=IS})(\\p{Line_Break=CM}|\\u200d)*)?"
2870 "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
2871 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
2872 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
2873 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
2874
2875 fNumberMatcher = new RegexMatcher(
2876 UnicodeString(rules, -1, US_INV), 0, status);
2877
2878 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2879
2880 if (U_FAILURE(status)) {
2881 deferredStatus = status;
2882 }
2883
2884 }
2885
2886
setText(const UnicodeString & s)2887 void RBBILineMonkey::setText(const UnicodeString &s) {
2888 fText = &s;
2889 fCharBI->setText(s);
2890 prepareAppliedRules(s.length());
2891 fNumberMatcher->reset(s);
2892 }
2893
2894 //
2895 // rule9Adjust
2896 // Line Break TR rules 9 and 10 implementation.
2897 // This deals with combining marks and other sequences that
2898 // that must be treated as if they were something other than what they actually are.
2899 //
2900 // This is factored out into a separate function because it must be applied twice for
2901 // each potential break, once to the chars before the position being checked, then
2902 // again to the text following the possible break.
2903 //
rule9Adjust(int32_t pos,UChar32 * posChar,int32_t * nextPos,UChar32 * nextChar)2904 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2905 if (pos == -1) {
2906 // Invalid initial position. Happens during the warmup iteration of the
2907 // main loop in next().
2908 return;
2909 }
2910
2911 int32_t nPos = *nextPos;
2912
2913 // LB 9 Keep combining sequences together.
2914 // advance over any CM class chars. Note that Line Break CM is different
2915 // from the normal Grapheme Extend property.
2916 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2917 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2918 for (;;) {
2919 *nextChar = fText->char32At(nPos);
2920 if (!fCM->contains(*nextChar)) {
2921 break;
2922 }
2923 nPos = fText->moveIndex32(nPos, 1);
2924 }
2925 }
2926
2927
2928 // LB 9 Treat X CM* as if it were x.
2929 // No explicit action required.
2930
2931 // LB 10 Treat any remaining combining mark as AL
2932 if (fCM->contains(*posChar)) {
2933 *posChar = u'A';
2934 }
2935
2936 // Push the updated nextPos and nextChar back to our caller.
2937 // This only makes a difference if posChar got bigger by consuming a
2938 // combining sequence.
2939 *nextPos = nPos;
2940 *nextChar = fText->char32At(nPos);
2941 }
2942
2943
2944
//
//  next
//      Reference implementation of line break finding, used to check the
//      results of the real break iterator.
//
//      Walks forward through fText from startPos, testing each candidate
//      position against the line break rules in order. The first rule that
//      matches decides the outcome: "break" ends the scan, "continue" moves on
//      to the next position. The rule that decided each position is recorded
//      with setAppliedRule().
//
//      startPos   Index in fText at which to begin looking.
//      returns    The index of the break that was found, or -1 if construction
//                 of this object failed or startPos is at or beyond the end of
//                 the text.
//
int32_t RBBILineMonkey::next(int32_t startPos) {
    UErrorCode status = U_ZERO_ERROR;
    int32_t    pos;       //  Index of the char following a potential break position
    UChar32    thisChar;  //  Character at above position "pos"

    int32_t    prevPos;   //  Index of the char preceding a potential break position
    UChar32    prevChar;  //  Character at above position.  Note that prevChar
                          //   and thisChar may not be adjacent because combining
                          //   characters between them will be ignored.

    int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
    UChar32    prevCharX2;

    int32_t    nextPos;   //  Index of the next character following pos.
                          //     Usually skips over combining marks.
    int32_t    nextCPPos; //  Index of the code point following "pos."
                          //     May point to a combining mark.
    int32_t    tPos;      //  temp value.
    UChar32    c;

    // Errors from the constructor are reported by the caller; just refuse to run.
    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    if (startPos >= fText->length()) {
        return -1;
    }


    // Initial values for loop.  Loop will run the first time without finding breaks,
    //                           while the invalid values shift out and the "this" and
    //                           "prev" positions are filled in with good values.
    pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
    thisChar = prevChar  = prevCharX2 = 0;
    nextPos  = nextCPPos = startPos;


    // Loop runs once per position in the test text, until a break position
    //  is found.
    for (;;) {
        // Shift the context window forward by one position.
        prevPosX2 = prevPos;
        prevCharX2 = prevChar;

        prevPos   = pos;
        prevChar  = thisChar;

        pos       = nextPos;
        thisChar  = fText->char32At(pos);

        nextCPPos = fText->moveIndex32(pos, 1);
        nextPos   = nextCPPos;


        if (pos >= fText->length()) {
            setAppliedRule(pos, "LB2 - Break at end of text.");
            break;
        }


        //             We do this one out-of-order because the adjustment does not change anything
        //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
        //             be applied.
        // First call adjusts prevChar and may move pos past combining marks;
        // second call does the same for thisChar and nextPos.
        rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
        nextCPPos = nextPos = fText->moveIndex32(pos, 1);
        c = fText->char32At(nextPos);
        rule9Adjust(pos,     &thisChar, &nextPos, &c);

        // If the loop is still warming up - if we haven't shifted the initial
        //   -1 positions out of prevPos yet - loop back to advance the
        //    position in the input without any further looking for breaks.
        if (prevPos == -1) {
            setAppliedRule(pos, "LB 9 - adjust for combining sequences.");
            continue;
        }


        if (fBK->contains(prevChar)) {
            setAppliedRule(pos, "LB 4  Always break after hard line breaks");
            break;
        }


        // CR LF stays together; any other CR, LF or NL (0x85) forces a break after it.
        if (prevChar == 0x0d && thisChar == 0x0a) {
            setAppliedRule(pos, "LB 5  Break after CR, LF, NL, but not inside CR LF");
            continue;
        }
        if (prevChar == 0x0d ||
            prevChar == 0x0a ||
            prevChar == 0x85)  {
            setAppliedRule(pos, "LB 5  Break after CR, LF, NL, but not inside CR LF");
            break;
        }


        if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
            fBK->contains(thisChar)) {
            setAppliedRule(pos, "LB 6  Don't break before hard line breaks");
            continue;
        }


        if (fSP->contains(thisChar)) {
            setAppliedRule(pos, "LB 7  Don't break before spaces or zero-width space.");
            continue;
        }

        // !!! ??? Is this the right text for the applied rule?
        if (fZW->contains(thisChar)) {
            setAppliedRule(pos, "LB 7  Don't break before spaces or zero-width space.");
            continue;
        }


        //    ZW SP* ÷
        //       Scan backwards from prevChar for SP* ZW
        tPos = prevPos;
        while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
            tPos = fText->moveIndex32(tPos, -1);
        }
        if (fZW->contains(fText->char32At(tPos))) {
            setAppliedRule(pos, "LB 8  Break after zero width space");
            break;
        }


        //    Move this test up, before LB8a, because numbers can match a longer sequence that would
        //    also match 8a.  e.g. NU ZWJ IS PO     (ZWJ acts like CM)
        if (fNumberMatcher->lookingAt(prevPos, status)) {
            // NOTE(review): this failure check is only reached when lookingAt()
            //               returned true - confirm that is the intent.
            if (U_FAILURE(status)) {
                setAppliedRule(pos, "LB 25 Numbers");
                break;
            }
            // Matched a number.  But could have been just a single digit, which would
            //    not represent a "no break here" between prevChar and thisChar
            int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
            if (numEndIdx > pos) {
                // Number match includes at least our two chars being checked
                if (numEndIdx > nextPos) {
                    // Number match includes additional chars.  Update pos and nextPos
                    //   so that next loop iteration will continue at the end of the number,
                    //   checking for breaks between last char in number & whatever follows.
                    pos = nextPos = numEndIdx;
                    // Back up to the last non-CM character of the number.
                    do {
                        pos = fText->moveIndex32(pos, -1);
                        thisChar = fText->char32At(pos);
                    } while (fCM->contains(thisChar));
                }
                setAppliedRule(pos, "LB 25 Numbers");
                continue;
            }
        }


        //       The monkey test's way of ignoring combining characters doesn't work
        //       for this rule. ZJ is also a CM. Need to get the actual character
        //       preceding "thisChar", not ignoring combining marks, possibly ZJ.
        {
            int32_t prevIdx = fText->moveIndex32(pos, -1);
            UChar32 prevC = fText->char32At(prevIdx);
            if (fZWJ->contains(prevC)) {
                setAppliedRule(pos, "LB 8a ZWJ x");
                continue;
            }
        }


        // appliedRule: "LB 9, 10"; //  Already done, at top of loop.";
        //


        //    x WJ
        //    WJ x
        //
        if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
            setAppliedRule(pos, "LB 11  Do not break before or after WORD JOINER and related characters.");
            continue;
        }


        if (fGL->contains(prevChar)) {
            setAppliedRule(pos, "LB 12  GL  x");
            continue;
        }


        if (!(fSP->contains(prevChar) ||
              fBA->contains(prevChar) ||
              fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
            setAppliedRule(pos, "LB 12a  [^SP BA HY] x GL");
            continue;
        }


        if (fCL->contains(thisChar) ||
                fCP->contains(thisChar) ||
                fEX->contains(thisChar) ||
                fSY->contains(thisChar)) {
            setAppliedRule(pos, "LB 13  Don't break before closings.");
            continue;
        }


        //       Scan backwards, checking for this sequence.
        //       The OP char could include combining marks, so we actually check for
        //           OP CM* SP*
        //       Another Twist: The Rule 9 fixes may have changed a SP CM
        //       sequence into a ID char, so before scanning back through spaces,
        //       verify that prevChar is indeed a space.  The prevChar variable
        //       may differ from fText[prevPos]
        tPos = prevPos;
        if (fSP->contains(prevChar)) {
            while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
                tPos=fText->moveIndex32(tPos, -1);
            }
        }
        while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
            tPos=fText->moveIndex32(tPos, -1);
        }
        if (fOP->contains(fText->char32At(tPos))) {
            setAppliedRule(pos, "LB 14 Don't break after OP SP*");
            continue;
        }


        if (nextPos < fText->length()) {
            // note: UnicodeString::char32At(length) returns ffff, not distinguishable
            //       from a legit ffff character. So test length separately.
            UChar32 nextChar = fText->char32At(nextPos);
            if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) {
                setAppliedRule(pos, "LB 14a Break before an IS that begins a number and follows a space");
                break;
            }
        }


        if (fIS->contains(thisChar)) {
            setAppliedRule(pos, "LB 14b Do not break before numeric separators, even after spaces.");
            continue;
        }


        if (fOP->contains(thisChar)) {
            // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
            // (This block-local tPos shadows the function-level tPos.)
            int tPos = prevPos;
            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            if (fQU->contains(fText->char32At(tPos))) {
                setAppliedRule(pos, "LB 15 QU SP* x OP");
                continue;
            }
        }


        //    Scan backwards for SP* CM* (CL | CP)
        if (fNS->contains(thisChar)) {
            // (This block-local tPos shadows the function-level tPos.)
            int tPos = prevPos;
            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
                setAppliedRule(pos, "LB 16 (CL | CP) SP* x NS");
                continue;
            }
        }


        if (fB2->contains(thisChar)) {
            //  Scan backwards, checking for the B2 CM* SP* sequence.
            tPos = prevPos;
            if (fSP->contains(prevChar)) {
                while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
                    tPos=fText->moveIndex32(tPos, -1);
                }
            }
            while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
                tPos=fText->moveIndex32(tPos, -1);
            }
            if (fB2->contains(fText->char32At(tPos))) {
                setAppliedRule(pos, "LB 17 B2 SP* x B2");
                continue;
            }
        }


        if (fSP->contains(prevChar)) {
            setAppliedRule(pos, "LB 18 break after space");
            break;
        }

        //    x   QU
        //    QU  x
        if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
            setAppliedRule(pos, "LB 19");
            continue;
        }

        if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
            setAppliedRule(pos, "LB 20  Break around a CB");
            break;
        }

        //           Don't break between Hyphens and letters if a break precedes the hyphen.
        //           Formerly this was a Finnish tailoring.
        //           Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
        //    ^($HY | $HH) $AL;
        // prevPosX2 == -1 means the hyphen is the first character examined by this call.
        if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
                prevPosX2 == -1) {
            setAppliedRule(pos, "LB 20.09");
            continue;
        }

        if (fBA->contains(thisChar) ||
            fHY->contains(thisChar) ||
            fNS->contains(thisChar) ||
            fBB->contains(prevChar) )   {
            setAppliedRule(pos, "LB 21");
            continue;
        }

        if (fHL->contains(prevCharX2) &&
                (fHY->contains(prevChar) || fBA->contains(prevChar))) {
            setAppliedRule(pos, "LB 21a   HL (HY | BA) x");
            continue;
        }

        if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
            setAppliedRule(pos, "LB 21b   SY x HL");
            continue;
        }

        if (fIN->contains(thisChar)) {
            setAppliedRule(pos, "LB 22");
            continue;
        }


        //          (AL | HL) x NU
        //          NU x (AL | HL)
        if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
            setAppliedRule(pos, "LB 23");
            continue;
        }
        if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
            setAppliedRule(pos, "LB 23");
            continue;
        }

        // Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
        //      PR x (ID | EB | EM)
        //     (ID | EB | EM) x PO
        if (fPR->contains(prevChar) &&
                (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar)))  {
            setAppliedRule(pos, "LB 23a");
            continue;
        }
        if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
                fPO->contains(thisChar)) {
            setAppliedRule(pos, "LB 23a");
            continue;
        }

        //   Do not break between prefix and letters or ideographs.
        //         (PR | PO) x (AL | HL)
        //         (AL | HL) x (PR | PO)
        if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
                (fAL->contains(thisChar) || fHL->contains(thisChar))) {
            setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
            continue;
        }
        if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
                (fPR->contains(thisChar) || fPO->contains(thisChar))) {
            setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
            continue;
        }

        // appliedRule: "LB 25 numbers match"; // moved up, before LB 8a,

        if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
                                        fJV->contains(thisChar) ||
                                        fH2->contains(thisChar) ||
                                        fH3->contains(thisChar))) {
            setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
            continue;
        }

        if ((fJV->contains(prevChar) || fH2->contains(prevChar)) &&
            (fJV->contains(thisChar) || fJT->contains(thisChar))) {
            setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
            continue;
        }

        if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
            fJT->contains(thisChar)) {
            setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
            continue;
        }

        // NOTE(review): the LB 22 test above already continues for every IN
        //               thisChar, so this first LB 27 test looks unreachable.
        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
            fIN->contains(thisChar)) {
            setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
            continue;
        }
        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
            fPO->contains(thisChar)) {
            setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
            continue;
        }
        if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
            fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
            setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
            continue;
        }



        if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
            setAppliedRule(pos, "LB 28 Do not break between alphabetics (\"at\").");
            continue;
        }

        if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
            setAppliedRule(pos, "LB 29 Do not break between numeric punctuation and alphabetics (\"e.g.\").");
            continue;
        }

        //          (AL | NU) x OP
        //          CP x (AL | NU)
        // fOP30 / fCP30 are OP / CP with the East Asian wide characters removed.
        if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP30->contains(thisChar)) {
            setAppliedRule(pos, "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
            continue;
        }
        if (fCP30->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
            setAppliedRule(pos, "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
            continue;
        }

        //             RI  x  RI
        if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
            setAppliedRule(pos, "LB30a    RI RI  ÷  RI");
            break;
        }
        if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
            // Two Regional Indicators have been paired.
            // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
            // following RI. This is a hack.
            thisChar = -1;
            // NOTE(review): this no-break case reuses the rule text of the break case above.
            setAppliedRule(pos, "LB30a    RI RI  ÷  RI");
            continue;
        }

        if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
            setAppliedRule(pos, "LB30b    Emoji Base x Emoji Modifier");
            continue;
        }

        // No rule prevented a break here.
        setAppliedRule(pos, "LB 31 Break everywhere else");
        break;
    }

    return pos;
}
3415
3416
charClasses()3417 UVector *RBBILineMonkey::charClasses() {
3418 return fSets;
3419 }
3420
3421
~RBBILineMonkey()3422 RBBILineMonkey::~RBBILineMonkey() {
3423 delete fSets;
3424
3425 delete fBK;
3426 delete fCR;
3427 delete fLF;
3428 delete fCM;
3429 delete fNL;
3430 delete fWJ;
3431 delete fZW;
3432 delete fGL;
3433 delete fCB;
3434 delete fSP;
3435 delete fB2;
3436 delete fBA;
3437 delete fBB;
3438 delete fHH;
3439 delete fHY;
3440 delete fH2;
3441 delete fH3;
3442 delete fCL;
3443 delete fCP;
3444 delete fEX;
3445 delete fIN;
3446 delete fJL;
3447 delete fJV;
3448 delete fJT;
3449 delete fNS;
3450 delete fOP;
3451 delete fQU;
3452 delete fIS;
3453 delete fNU;
3454 delete fPO;
3455 delete fPR;
3456 delete fSY;
3457 delete fAI;
3458 delete fAL;
3459 delete fCJ;
3460 delete fHL;
3461 delete fID;
3462 delete fRI;
3463 delete fSG;
3464 delete fXX;
3465 delete fEB;
3466 delete fEM;
3467 delete fZWJ;
3468 delete fOP30;
3469 delete fCP30;
3470
3471 delete fCharBI;
3472 delete fNumberMatcher;
3473 }
3474
3475
3476 //-------------------------------------------------------------------------------------------
3477 //
3478 // TestMonkey
3479 //
3480 // params
3481 // seed=nnnnn Random number starting seed.
3482 // Setting the seed allows errors to be reproduced.
3483 // loop=nnn Looping count. Controls running time.
3484 // -1: run forever.
3485 // 0 or greater: run length.
3486 //
3487 // type = char | word | line | sent | title
3488 //
3489 // Example:
3490 // intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3491 //
3492 //-------------------------------------------------------------------------------------------
3493
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)3494 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) {
3495 int32_t val = defaultVal;
3496 name.append(" *= *(-?\\d+)");
3497 UErrorCode status = U_ZERO_ERROR;
3498 RegexMatcher m(name, params, 0, status);
3499 if (m.find()) {
3500 // The param exists. Convert the string to an int.
3501 char valString[100];
3502 int32_t paramLength = m.end(1, status) - m.start(1, status);
3503 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3504 paramLength = (int32_t)(sizeof(valString)-2);
3505 }
3506 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3507 val = strtol(valString, NULL, 10);
3508
3509 // Delete this parameter from the params string.
3510 m.reset();
3511 params = m.replaceFirst("", status);
3512 }
3513 U_ASSERT(U_SUCCESS(status));
3514 return val;
3515 }
3516 #endif
3517
3518 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
testBreakBoundPreceding(RBBITest * test,UnicodeString ustr,BreakIterator * bi,int expected[],int expectedcount)3519 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3520 BreakIterator *bi,
3521 int expected[],
3522 int expectedcount)
3523 {
3524 int count = 0;
3525 int i = 0;
3526 int forward[50];
3527 bi->setText(ustr);
3528 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3529 forward[count] = i;
3530 if (count < expectedcount && expected[count] != i) {
3531 test->errln("%s:%d break forward test failed: expected %d but got %d",
3532 __FILE__, __LINE__, expected[count], i);
3533 break;
3534 }
3535 count ++;
3536 }
3537 if (count != expectedcount) {
3538 printStringBreaks(ustr, expected, expectedcount);
3539 test->errln("%s:%d break forward test failed: missed %d match",
3540 __FILE__, __LINE__, expectedcount - count);
3541 return;
3542 }
3543 // testing boundaries
3544 for (i = 1; i < expectedcount; i ++) {
3545 int j = expected[i - 1];
3546 if (!bi->isBoundary(j)) {
3547 printStringBreaks(ustr, expected, expectedcount);
3548 test->errln("%s:%d isBoundary() failed. Expected boundary at position %d",
3549 __FILE__, __LINE__, j);
3550 return;
3551 }
3552 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3553 if (bi->isBoundary(j)) {
3554 printStringBreaks(ustr, expected, expectedcount);
3555 test->errln("%s:%d isBoundary() failed. Not expecting boundary at position %d",
3556 __FILE__, __LINE__, j);
3557 return;
3558 }
3559 }
3560 }
3561
3562 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3563 count --;
3564 if (forward[count] != i) {
3565 printStringBreaks(ustr, expected, expectedcount);
3566 test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
3567 __FILE__, __LINE__, forward[count], i);
3568 break;
3569 }
3570 }
3571 if (count != 0) {
3572 printStringBreaks(ustr, expected, expectedcount);
3573 test->errln("break test previous() failed: missed a match");
3574 return;
3575 }
3576
3577 // testing preceding
3578 for (i = 0; i < expectedcount - 1; i ++) {
3579 // int j = expected[i] + 1;
3580 int j = ustr.moveIndex32(expected[i], 1);
3581 for (; j <= expected[i + 1]; j ++) {
3582 int32_t expectedPreceding = expected[i];
3583 int32_t actualPreceding = bi->preceding(j);
3584 if (actualPreceding != expectedPreceding) {
3585 printStringBreaks(ustr, expected, expectedcount);
3586 test->errln("%s:%d preceding(%d): expected %d, got %d",
3587 __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
3588 return;
3589 }
3590 }
3591 }
3592 }
3593 #endif
3594
TestWordBreaks(void)3595 void RBBITest::TestWordBreaks(void)
3596 {
3597 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3598
3599 Locale locale("en");
3600 UErrorCode status = U_ZERO_ERROR;
3601 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3602 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3603 // Replaced any C+J characters in a row with a random sequence of characters
3604 // of the same length to make our C+J segmentation not get in the way.
3605 static const char *strlist[] =
3606 {
3607 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3608 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3609 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3610 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3611 "\\uac00\\u3588\\u009c\\u0953\\u194b",
3612 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3613 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3614 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3615 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3616 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3617 "\\u2027\\U000e0067\\u0a47\\u00b7",
3618 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3619 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3620 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3621 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3622 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3623 "\\u0027\\u11af\\U000e0057\\u0602",
3624 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3625 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3626 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3627 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3628 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3629 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3630 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3631 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3632 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3633 "\\u18f4\\U000e0049\\u20e7\\u2027",
3634 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3635 "\\ua183\\u102d\\u0bec\\u003a",
3636 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3637 "\\u003a\\u0e57\\u0fad\\u002e",
3638 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3639 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3640 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3641 "\\u003a\\u0664\\u00b7\\u1fba",
3642 "\\u003b\\u0027\\u00b7\\u47a3",
3643 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3644 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3645 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3646 };
3647 int loop;
3648 if (U_FAILURE(status)) {
3649 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3650 return;
3651 }
3652 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3653 // printf("looping %d\n", loop);
3654 UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3655 // RBBICharMonkey monkey;
3656 RBBIWordMonkey monkey;
3657
3658 int expected[50];
3659 int expectedcount = 0;
3660
3661 monkey.setText(ustr);
3662 int i;
3663 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3664 expected[expectedcount ++] = i;
3665 }
3666
3667 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3668 }
3669 delete bi;
3670 #endif
3671 }
3672
TestWordBoundary(void)3673 void RBBITest::TestWordBoundary(void)
3674 {
3675 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3676 Locale locale("en");
3677 UErrorCode status = U_ZERO_ERROR;
3678 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3679 LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
3680 if (U_FAILURE(status)) {
3681 errcheckln(status, "%s:%d Creation of break iterator failed %s",
3682 __FILE__, __LINE__, u_errorName(status));
3683 return;
3684 }
3685 UChar str[50];
3686 static const char *strlist[] =
3687 {
3688 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3689 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3690 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3691 "\\u2027\\U000e0067\\u0a47\\u00b7",
3692 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3693 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3694 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3695 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3696 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3697 "\\u0027\\u11af\\U000e0057\\u0602",
3698 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3699 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3700 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3701 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3702 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3703 "\\U000e0065\\u302c\\u09ee\\U000e0068",
3704 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3705 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3706 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3707 "\\u58f4\\U000e0049\\u20e7\\u2027",
3708 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3709 "\\ua183\\u102d\\u0bec\\u003a",
3710 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3711 "\\u003a\\u0e57\\u0fad\\u002e",
3712 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3713 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3714 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3715 "\\u003a\\u0664\\u00b7\\u1fba",
3716 "\\u003b\\u0027\\u00b7\\u47a3",
3717 };
3718 int loop;
3719 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3720 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3721 UnicodeString ustr(str);
3722 int forward[50];
3723 int count = 0;
3724
3725 bi->setText(ustr);
3726 int prev = -1;
3727 for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
3728 ++count;
3729 if (count >= UPRV_LENGTHOF(forward)) {
3730 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
3731 __FILE__, __LINE__, loop, count, boundary);
3732 return;
3733 }
3734 forward[count] = boundary;
3735 if (boundary <= prev) {
3736 errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
3737 __FILE__, __LINE__, loop, prev, boundary);
3738 break;
3739 }
3740 for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
3741 if (bi->isBoundary(nonBoundary)) {
3742 printStringBreaks(ustr, forward, count);
3743 errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
3744 __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
3745 return;
3746 }
3747 }
3748 if (!bi->isBoundary(boundary)) {
3749 printStringBreaks(ustr, forward, count);
3750 errln("%s:%d happy boundary test failed: expected %d a boundary",
3751 __FILE__, __LINE__, boundary);
3752 return;
3753 }
3754 prev = boundary;
3755 }
3756 }
3757 }
3758
TestLineBreaks(void)3759 void RBBITest::TestLineBreaks(void)
3760 {
3761 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3762 Locale locale("en");
3763 UErrorCode status = U_ZERO_ERROR;
3764 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3765 const int32_t STRSIZE = 50;
3766 UChar str[STRSIZE];
3767 static const char *strlist[] =
3768 {
3769 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3770 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3771 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3772 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3773 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3774 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3775 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3776 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3777 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3778 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3779 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3780 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3781 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3782 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3783 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3784 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3785 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3786 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3787 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3788 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3789 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3790 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3791 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3792 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3793 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3794 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3795 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3796 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3797 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3798 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3799 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3800 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3801 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3802 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3803 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3804 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3805 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3806 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3807 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3808 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3809 };
3810 int loop;
3811 TEST_ASSERT_SUCCESS(status);
3812 if (U_FAILURE(status)) {
3813 return;
3814 }
3815 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3816 // printf("looping %d\n", loop);
3817 int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3818 if (t >= STRSIZE) {
3819 TEST_ASSERT(FALSE);
3820 continue;
3821 }
3822
3823
3824 UnicodeString ustr(str);
3825 RBBILineMonkey monkey;
3826 if (U_FAILURE(monkey.deferredStatus)) {
3827 continue;
3828 }
3829
3830 const int EXPECTEDSIZE = 50;
3831 int expected[EXPECTEDSIZE];
3832 int expectedcount = 0;
3833
3834 monkey.setText(ustr);
3835
3836 int i;
3837 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3838 if (expectedcount >= EXPECTEDSIZE) {
3839 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3840 return;
3841 }
3842 expected[expectedcount ++] = i;
3843 }
3844
3845 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3846 }
3847 delete bi;
3848 #endif
3849 }
3850
TestSentBreaks(void)3851 void RBBITest::TestSentBreaks(void)
3852 {
3853 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3854 Locale locale("en");
3855 UErrorCode status = U_ZERO_ERROR;
3856 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3857 UChar str[200];
3858 static const char *strlist[] =
3859 {
3860 "Now\ris\nthe\r\ntime\n\rfor\r\r",
3861 "This\n",
3862 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3863 "\"Sentence ending with a quote.\" Bye.",
3864 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
3865 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3866 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3867 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3868 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3869 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3870 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3871 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3872 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3873 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3874 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3875 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3876 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3877 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3878 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3879 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3880 };
3881 int loop;
3882 if (U_FAILURE(status)) {
3883 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3884 return;
3885 }
3886 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3887 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3888 UnicodeString ustr(str);
3889
3890 RBBISentMonkey monkey;
3891 if (U_FAILURE(monkey.deferredStatus)) {
3892 continue;
3893 }
3894
3895 const int EXPECTEDSIZE = 50;
3896 int expected[EXPECTEDSIZE];
3897 int expectedcount = 0;
3898
3899 monkey.setText(ustr);
3900
3901 int i;
3902 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3903 if (expectedcount >= EXPECTEDSIZE) {
3904 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3905 return;
3906 }
3907 expected[expectedcount ++] = i;
3908 }
3909
3910 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3911 }
3912 delete bi;
3913 #endif
3914 }
3915
TestMonkey()3916 void RBBITest::TestMonkey() {
3917 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3918
3919 UErrorCode status = U_ZERO_ERROR;
3920 int32_t loopCount = 500;
3921 int32_t seed = 1;
3922 UnicodeString breakType = "all";
3923 Locale locale("en");
3924 UBool useUText = FALSE;
3925
3926 if (quick == FALSE) {
3927 loopCount = 10000;
3928 }
3929
3930 if (fTestParams) {
3931 UnicodeString p(fTestParams);
3932 loopCount = getIntParam("loop", p, loopCount);
3933 seed = getIntParam("seed", p, seed);
3934
3935 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3936 if (m.find()) {
3937 breakType = m.group(1, status);
3938 m.reset();
3939 p = m.replaceFirst("", status);
3940 }
3941
3942 RegexMatcher u(" *utext", p, 0, status);
3943 if (u.find()) {
3944 useUText = TRUE;
3945 u.reset();
3946 p = u.replaceFirst("", status);
3947 }
3948
3949
3950 // m.reset(p);
3951 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3952 // Each option is stripped out of the option string as it is processed.
3953 // All options have been checked. The option string should have been completely emptied..
3954 char buf[100];
3955 p.extract(buf, sizeof(buf), NULL, status);
3956 buf[sizeof(buf)-1] = 0;
3957 errln("Unrecognized or extra parameter: %s\n", buf);
3958 return;
3959 }
3960
3961 }
3962
3963 if (breakType == "char" || breakType == "all") {
3964 RBBICharMonkey m;
3965 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3966 if (U_SUCCESS(status)) {
3967 RunMonkey(bi, m, "char", seed, loopCount, useUText);
3968 if (breakType == "all" && useUText==FALSE) {
3969 // Also run a quick test with UText when "all" is specified
3970 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3971 }
3972 }
3973 else {
3974 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3975 }
3976 delete bi;
3977 }
3978
3979 if (breakType == "word" || breakType == "all") {
3980 logln("Word Break Monkey Test");
3981 RBBIWordMonkey m;
3982 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3983 if (U_SUCCESS(status)) {
3984 RunMonkey(bi, m, "word", seed, loopCount, useUText);
3985 }
3986 else {
3987 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3988 }
3989 delete bi;
3990 }
3991
3992 if (breakType == "line" || breakType == "all") {
3993 logln("Line Break Monkey Test");
3994 RBBILineMonkey m;
3995 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3996 if (loopCount >= 10) {
3997 loopCount = loopCount / 5; // Line break runs slower than the others.
3998 }
3999 if (U_SUCCESS(status)) {
4000 RunMonkey(bi, m, "line", seed, loopCount, useUText);
4001 }
4002 else {
4003 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4004 }
4005 delete bi;
4006 }
4007
4008 if (breakType == "sent" || breakType == "all" ) {
4009 logln("Sentence Break Monkey Test");
4010 RBBISentMonkey m;
4011 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4012 if (loopCount >= 10) {
4013 loopCount = loopCount / 10; // Sentence runs slower than the other break types
4014 }
4015 if (U_SUCCESS(status)) {
4016 RunMonkey(bi, m, "sent", seed, loopCount, useUText);
4017 }
4018 else {
4019 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4020 }
4021 delete bi;
4022 }
4023
4024 #endif
4025 }
4026
4027 //
4028 // Run a RBBI monkey test. Common routine, for all break iterator types.
4029 // Parameters:
4030 // bi - the break iterator to use
4031 // mk - MonkeyKind, abstraction for obtaining expected results
4032 // name - Name of test (char, word, etc.) for use in error messages
4033 // seed - Seed for starting random number generator (parameter from user)
4034 // numIterations
4035 //
RunMonkey(BreakIterator * bi,RBBIMonkeyKind & mk,const char * name,uint32_t seed,int32_t numIterations,UBool useUText)4036 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
4037 int32_t numIterations, UBool useUText) {
4038
4039 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4040
4041 const int32_t TESTSTRINGLEN = 500;
4042 UnicodeString testText;
4043 int32_t numCharClasses;
4044 UVector *chClasses;
4045 int expectedCount = 0;
4046 char expectedBreaks[TESTSTRINGLEN*2 + 1];
4047 char forwardBreaks[TESTSTRINGLEN*2 + 1];
4048 char reverseBreaks[TESTSTRINGLEN*2+1];
4049 char isBoundaryBreaks[TESTSTRINGLEN*2+1];
4050 char followingBreaks[TESTSTRINGLEN*2+1];
4051 char precedingBreaks[TESTSTRINGLEN*2+1];
4052 int i;
4053 int loopCount = 0;
4054
4055
4056 m_seed = seed;
4057
4058 numCharClasses = mk.charClasses()->size();
4059 chClasses = mk.charClasses();
4060
4061 // Check for errors that occured during the construction of the MonkeyKind object.
4062 // Can't report them where they occured because errln() is a method coming from intlTest,
4063 // and is not visible outside of RBBITest :-(
4064 if (U_FAILURE(mk.deferredStatus)) {
4065 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4066 return;
4067 }
4068
4069 // Verify that the character classes all have at least one member.
4070 for (i=0; i<numCharClasses; i++) {
4071 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4072 if (s == NULL || s->size() == 0) {
4073 errln("Character Class #%d is null or of zero size.", i);
4074 return;
4075 }
4076 }
4077
4078 // For minimizing width of class name output.
4079 int classNameSize = mk.maxClassNameSize();
4080
4081 while (loopCount < numIterations || numIterations == -1) {
4082 if (numIterations == -1 && loopCount % 10 == 0) {
4083 // If test is running in an infinite loop, display a periodic tic so
4084 // we can tell that it is making progress.
4085 fprintf(stderr, ".");
4086 }
4087 // Save current random number seed, so that we can recreate the random numbers
4088 // for this loop iteration in event of an error.
4089 seed = m_seed;
4090
4091 // Populate a test string with data.
4092 testText.truncate(0);
4093 for (i=0; i<TESTSTRINGLEN; i++) {
4094 int32_t aClassNum = m_rand() % numCharClasses;
4095 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4096 int32_t charIdx = m_rand() % classSet->size();
4097 UChar32 c = classSet->charAt(charIdx);
4098 if (c < 0) { // TODO: deal with sets containing strings.
4099 errln("%s:%d c < 0", __FILE__, __LINE__);
4100 break;
4101 }
4102 // Do not assemble a supplementary character from randomly generated separate surrogates.
4103 // (It could be a dictionary character)
4104 if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4105 continue;
4106 }
4107
4108 testText.append(c);
4109 }
4110
4111 // Calculate the expected results for this test string and reset applied rules.
4112 mk.setText(testText);
4113
4114 memset(expectedBreaks, 0, sizeof(expectedBreaks));
4115 expectedBreaks[0] = 1;
4116 int32_t breakPos = 0;
4117 expectedCount = 0;
4118 for (;;) {
4119 breakPos = mk.next(breakPos);
4120 if (breakPos == -1) {
4121 break;
4122 }
4123 if (breakPos > testText.length()) {
4124 errln("breakPos > testText.length()");
4125 }
4126 expectedBreaks[breakPos] = 1;
4127 U_ASSERT(expectedCount<testText.length());
4128 }
4129
4130 // Find the break positions using forward iteration
4131 memset(forwardBreaks, 0, sizeof(forwardBreaks));
4132 if (useUText) {
4133 UErrorCode status = U_ZERO_ERROR;
4134 UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4135 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4136 bi->setText(testUText, status);
4137 TEST_ASSERT_SUCCESS(status);
4138 utext_close(testUText); // The break iterator does a shallow clone of the UText
4139 // This UText can be closed immediately, so long as the
4140 // testText string continues to exist.
4141 } else {
4142 bi->setText(testText);
4143 }
4144
4145 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4146 if (i < 0 || i > testText.length()) {
4147 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4148 break;
4149 }
4150 forwardBreaks[i] = 1;
4151 }
4152
4153 // Find the break positions using reverse iteration
4154 memset(reverseBreaks, 0, sizeof(reverseBreaks));
4155 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4156 if (i < 0 || i > testText.length()) {
4157 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4158 break;
4159 }
4160 reverseBreaks[i] = 1;
4161 }
4162
4163 // Find the break positions using isBoundary() tests.
4164 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4165 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4166 for (i=0; i<=testText.length(); i++) {
4167 isBoundaryBreaks[i] = bi->isBoundary(i);
4168 }
4169
4170
4171 // Find the break positions using the following() function.
4172 // printf(".");
4173 memset(followingBreaks, 0, sizeof(followingBreaks));
4174 int32_t lastBreakPos = 0;
4175 followingBreaks[0] = 1;
4176 for (i=0; i<testText.length(); i++) {
4177 breakPos = bi->following(i);
4178 if (breakPos <= i ||
4179 breakPos < lastBreakPos ||
4180 breakPos > testText.length() ||
4181 (breakPos > lastBreakPos && lastBreakPos > i)) {
4182 errln("%s break monkey test: "
4183 "Out of range value returned by BreakIterator::following().\n"
4184 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4185 name, seed, i, breakPos, lastBreakPos);
4186 break;
4187 }
4188 followingBreaks[breakPos] = 1;
4189 lastBreakPos = breakPos;
4190 }
4191
4192 // Find the break positions using the preceding() function.
4193 memset(precedingBreaks, 0, sizeof(precedingBreaks));
4194 lastBreakPos = testText.length();
4195 precedingBreaks[testText.length()] = 1;
4196 for (i=testText.length(); i>0; i--) {
4197 breakPos = bi->preceding(i);
4198 if (breakPos >= i ||
4199 breakPos > lastBreakPos ||
4200 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4201 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4202 errln("%s break monkey test: "
4203 "Out of range value returned by BreakIterator::preceding().\n"
4204 "index=%d; prev returned %d; lastBreak=%d" ,
4205 name, i, breakPos, lastBreakPos);
4206 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4207 precedingBreaks[i] = 2; // Forces an error.
4208 }
4209 } else {
4210 if (breakPos >= 0) {
4211 precedingBreaks[breakPos] = 1;
4212 }
4213 lastBreakPos = breakPos;
4214 }
4215 }
4216
4217 // Compare the expected and actual results.
4218 for (i=0; i<=testText.length(); i++) {
4219 const char *errorType = NULL;
4220 const char* currentBreakData = NULL;
4221 if (forwardBreaks[i] != expectedBreaks[i]) {
4222 errorType = "next()";
4223 currentBreakData = forwardBreaks;
4224 } else if (reverseBreaks[i] != forwardBreaks[i]) {
4225 errorType = "previous()";
4226 currentBreakData = reverseBreaks;
4227 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4228 errorType = "isBoundary()";
4229 currentBreakData = isBoundaryBreaks;
4230 } else if (followingBreaks[i] != expectedBreaks[i]) {
4231 errorType = "following()";
4232 currentBreakData = followingBreaks;
4233 } else if (precedingBreaks[i] != expectedBreaks[i]) {
4234 errorType = "preceding()";
4235 currentBreakData = precedingBreaks;
4236 }
4237
4238 if (errorType != NULL) {
4239 // Format a range of the test text that includes the failure as
4240 // a data item that can be included in the rbbi test data file.
4241
4242 // Start of the range is the last point where expected and actual results
4243 // both agreed that there was a break position.
4244
4245 int startContext = i;
4246 int32_t count = 0;
4247 for (;;) {
4248 if (startContext==0) { break; }
4249 startContext --;
4250 if (expectedBreaks[startContext] != 0) {
4251 if (count == 2) break;
4252 count ++;
4253 }
4254 }
4255
4256 // End of range is two expected breaks past the start position.
4257 int endContext = i + 1;
4258 int ci;
4259 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
4260 for (;;) {
4261 if (endContext >= testText.length()) {break;}
4262 if (expectedBreaks[endContext-1] != 0) {
4263 if (count == 0) break;
4264 count --;
4265 }
4266 endContext ++;
4267 }
4268 }
4269
4270 // Formatting of each line includes:
4271 // character code
4272 // reference break: '|' -> a break, '.' -> no break
4273 // actual break: '|' -> a break, '.' -> no break
4274 // (name of character clase)
4275 // Unicode name of character
4276 // '-->' indicates location of the difference.
4277
4278 MONKEY_ERROR(
4279 (expectedBreaks[i] ? "Break expected but not found" :
4280 "Break found but not expected"),
4281 name, i, seed);
4282
4283 for (ci=startContext; (ci = testText.moveIndex32(ci, 1));) {
4284 UChar32 c;
4285 c = testText.char32At(ci);
4286
4287 std::string currentLineFlag = " ";
4288 if (ci == i) {
4289 currentLineFlag = "-->"; // Error position
4290 }
4291
4292 // BMP or SMP character in hex
4293 char hexCodePoint[12];
4294 std::string format = " \\u%04x";
4295 if (c >= 0x10000) {
4296 format = "\\U%08x";
4297 }
4298 sprintf(hexCodePoint, format.c_str(), c);
4299
4300 // Get the class name and character name for the character.
4301 char cName[200];
4302 UErrorCode status = U_ZERO_ERROR;
4303 u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
4304
4305 char buffer[200];
4306 auto ret = snprintf(buffer, UPRV_LENGTHOF(buffer),
4307 "%4s %3i : %1s %1s %10s %-*s %-40s %-40s",
4308 currentLineFlag.c_str(),
4309 ci,
4310 expectedBreaks[ci] == 0 ? "." : "|", // Reference break
4311 currentBreakData[ci] == 0 ? "." : "|", // Actual break
4312 hexCodePoint,
4313 classNameSize,
4314 mk.classNameFromCodepoint(c).c_str(),
4315 mk.getAppliedRule(ci).c_str(), cName);
4316 (void)ret;
4317 U_ASSERT(0 <= ret && ret < UPRV_LENGTHOF(buffer));
4318
4319 // Output the error
4320 if (ci == i) {
4321 errln(buffer);
4322 } else {
4323 infoln(buffer);
4324 }
4325
4326 if (ci >= endContext) { break; }
4327 }
4328 break;
4329 }
4330 }
4331
4332 loopCount++;
4333 }
4334 #endif
4335 }
4336
4337
4338 // Bug 5532. UTF-8 based UText fails in dictionary code.
4339 // This test checks the initial patch,
4340 // which is to just keep it from crashing. Correct word boundaries
4341 // await a proper fix to the dictionary code.
4342 //
TestBug5532(void)4343 void RBBITest::TestBug5532(void) {
4344 // Text includes a mixture of Thai and Latin.
4345 const unsigned char utf8Data[] = {
4346 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4347 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4348 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4349 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4350 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4351 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4352 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4353 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4354 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4355 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4356 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4357
4358 UErrorCode status = U_ZERO_ERROR;
4359 UText utext=UTEXT_INITIALIZER;
4360 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4361 TEST_ASSERT_SUCCESS(status);
4362
4363 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4364 TEST_ASSERT_SUCCESS(status);
4365 if (U_SUCCESS(status)) {
4366 bi->setText(&utext, status);
4367 TEST_ASSERT_SUCCESS(status);
4368
4369 int32_t breakCount = 0;
4370 int32_t previousBreak = -1;
4371 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4372 // For now, just make sure that the break iterator doesn't hang.
4373 TEST_ASSERT(previousBreak < bi->current());
4374 previousBreak = bi->current();
4375 }
4376 TEST_ASSERT(breakCount > 0);
4377 }
4378 delete bi;
4379 utext_close(&utext);
4380 }
4381
4382
TestBug9983(void)4383 void RBBITest::TestBug9983(void) {
4384 UnicodeString text = UnicodeString("\\u002A" // * Other
4385 "\\uFF65" // Other
4386 "\\u309C" // Katakana
4387 "\\uFF9F" // Extend
4388 "\\uFF65" // Other
4389 "\\u0020" // Other
4390 "\\u0000").unescape();
4391
4392 UErrorCode status = U_ZERO_ERROR;
4393 LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4394 BreakIterator::createWordInstance(Locale::getRoot(), status)));
4395 TEST_ASSERT_SUCCESS(status);
4396 LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4397 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4398 TEST_ASSERT_SUCCESS(status);
4399 if (U_FAILURE(status)) {
4400 return;
4401 }
4402 int32_t offset, rstatus, iterationCount;
4403
4404 brkiter->setText(text);
4405 brkiter->last();
4406 iterationCount = 0;
4407 while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4408 iterationCount++;
4409 rstatus = brkiter->getRuleStatus();
4410 (void)rstatus; // Suppress set but not used warning.
4411 if (iterationCount >= 10) {
4412 break;
4413 }
4414 }
4415 TEST_ASSERT(iterationCount == 6);
4416
4417 brkiterPOSIX->setText(text);
4418 brkiterPOSIX->last();
4419 iterationCount = 0;
4420 while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4421 iterationCount++;
4422 rstatus = brkiterPOSIX->getRuleStatus();
4423 (void)rstatus; // Suppress set but not used warning.
4424 if (iterationCount >= 10) {
4425 break;
4426 }
4427 }
4428 TEST_ASSERT(iterationCount == 6);
4429 }
4430
4431 // Bug 7547 - verify that building a break itereator from empty rules produces an error.
4432 //
TestBug7547()4433 void RBBITest::TestBug7547() {
4434 UnicodeString rules;
4435 UErrorCode status = U_ZERO_ERROR;
4436 UParseError parseError;
4437 RuleBasedBreakIterator breakIterator(rules, parseError, status);
4438 if (status != U_BRK_RULE_SYNTAX) {
4439 errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4440 }
4441 if (parseError.line != 1 || parseError.offset != 0) {
4442 errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4443 }
4444 }
4445
4446
TestBug12797()4447 void RBBITest::TestBug12797() {
4448 UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4449 UErrorCode status = U_ZERO_ERROR;
4450 UParseError parseError;
4451 RuleBasedBreakIterator bi(rules, parseError, status);
4452 if (U_FAILURE(status)) {
4453 errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4454 return;
4455 }
4456 UnicodeString text = "abc";
4457 bi.setText(text);
4458 bi.first();
4459 int32_t boundary = bi.next();
4460 if (boundary != 3) {
4461 errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4462 }
4463 }
4464
TestBug12918()4465 void RBBITest::TestBug12918() {
4466 // This test triggers an assertion failure in dictbe.cpp
4467 const UChar *crasherString = u"\u3325\u4a16";
4468 UErrorCode status = U_ZERO_ERROR;
4469 UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
4470 if (U_FAILURE(status)) {
4471 dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4472 return;
4473 }
4474 ubrk_first(iter);
4475 int32_t pos = 0;
4476 int32_t lastPos = -1;
4477 while((pos = ubrk_next(iter)) != UBRK_DONE) {
4478 if (pos <= lastPos) {
4479 errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4480 break;
4481 }
4482 }
4483 ubrk_close(iter);
4484 }
4485
TestBug12932()4486 void RBBITest::TestBug12932() {
4487 // Node Stack overflow in the RBBI rule parser caused a seg fault.
4488 UnicodeString ruleStr(
4489 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4490 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4491 "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4492 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4493 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4494 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4495
4496 UErrorCode status = U_ZERO_ERROR;
4497 UParseError parseError;
4498 RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
4499 if (status != U_BRK_RULE_SYNTAX) {
4500 errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4501 __FILE__, __LINE__, u_errorName(status));
4502 }
4503 }
4504
4505
// Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
//             remain undivided by ICU char, word and line break.
TestEmoji()4508 void RBBITest::TestEmoji() {
4509 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4510 UErrorCode status = U_ZERO_ERROR;
4511
4512 CharString testFileName;
4513 testFileName.append(IntlTest::getSourceTestData(status), status);
4514 testFileName.appendPathPart("emoji-test.txt", status);
4515 if (U_FAILURE(status)) {
4516 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4517 return;
4518 }
4519 logln("Opening data file %s\n", testFileName.data());
4520
4521 int len;
4522 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
4523 if (U_FAILURE(status) || testFile == NULL) {
4524 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4525 return;
4526 }
4527 UnicodeString testFileAsString(testFile, len);
4528 delete [] testFile;
4529
4530 RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
4531 RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
4532 // hexMatcher group(1) is a hex number, or empty string if no hex number present.
4533 int32_t lineNumber = 0;
4534
4535 LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4536 LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4537 LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
4538 if (U_FAILURE(status)) {
4539 dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
4540 return;
4541 }
4542
4543 while (lineMatcher.find()) {
4544 ++lineNumber;
4545 UnicodeString line = lineMatcher.group(status);
4546 hexMatcher.reset(line);
4547 UnicodeString testString; // accumulates the emoji sequence.
4548 while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
4549 UnicodeString hex = hexMatcher.group(1, status);
4550 if (hex.length() > 8) {
4551 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
4552 break;
4553 }
4554 CharString hex8;
4555 hex8.appendInvariantChars(hex, status);
4556 UChar32 c = (UChar32)strtol(hex8.data(), NULL, 16);
4557 if (c<=0x10ffff) {
4558 testString.append(c);
4559 } else {
4560 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4561 __FILE__, __LINE__, lineNumber, hex8.data());
4562 break;
4563 }
4564 }
4565
4566 if (testString.length() > 1) {
4567 charBreaks->setText(testString);
4568 charBreaks->first();
4569 int32_t firstBreak = charBreaks->next();
4570 if (testString.length() != firstBreak) {
4571 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4572 __FILE__, __LINE__, lineNumber, firstBreak);
4573 }
4574 wordBreaks->setText(testString);
4575 wordBreaks->first();
4576 firstBreak = wordBreaks->next();
4577 if (testString.length() != firstBreak) {
4578 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4579 __FILE__, __LINE__, lineNumber, firstBreak);
4580 }
4581 lineBreaks->setText(testString);
4582 lineBreaks->first();
4583 firstBreak = lineBreaks->next();
4584 if (testString.length() != firstBreak) {
4585 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4586 __FILE__, __LINE__, lineNumber, firstBreak);
4587 }
4588 }
4589 }
4590 #endif
4591 }
4592
4593
4594 // TestBug12519 - Correct handling of Locales by assignment / copy / clone
4595
TestBug12519()4596 void RBBITest::TestBug12519() {
4597 UErrorCode status = U_ZERO_ERROR;
4598 LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4599 LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status));
4600 if (!assertSuccess(WHERE, status)) {
4601 dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
4602 return;
4603 }
4604 assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
4605
4606 assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
4607 assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
4608
4609 LocalPointer<RuleBasedBreakIterator>cloneEn(biEn->clone());
4610 assertTrue(WHERE, *biEn == *cloneEn);
4611 assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
4612
4613 LocalPointer<RuleBasedBreakIterator>cloneFr(biFr->clone());
4614 assertTrue(WHERE, *biFr == *cloneFr);
4615 assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
4616
4617 LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status));
4618 UnicodeString text("Hallo Welt");
4619 biDe->setText(text);
4620 assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
4621 *biDe = *biFr;
4622 assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
4623 }
4624
TestBug12677()4625 void RBBITest::TestBug12677() {
4626 // Check that stripping of comments from rules for getRules() is not confused by
4627 // the presence of '#' characters in the rules that do not introduce comments.
4628 UnicodeString rules(u"!!forward; \n"
4629 "$x = [ab#]; # a set with a # literal. \n"
4630 " # .; # a comment that looks sort of like a rule. \n"
4631 " '#' '?'; # a rule with a quoted # \n"
4632 );
4633
4634 UErrorCode status = U_ZERO_ERROR;
4635 UParseError pe;
4636 RuleBasedBreakIterator bi(rules, pe, status);
4637 assertSuccess(WHERE, status);
4638 UnicodeString rtRules = bi.getRules();
4639 assertEquals(WHERE, UnicodeString(u"!!forward;$x=[ab#];'#''?';"), rtRules);
4640 }
4641
4642
TestTableRedundancies()4643 void RBBITest::TestTableRedundancies() {
4644 UErrorCode status = U_ZERO_ERROR;
4645
4646 LocalPointer<RuleBasedBreakIterator> bi (
4647 (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
4648 assertSuccess(WHERE, status);
4649 if (U_FAILURE(status)) return;
4650
4651 RBBIDataWrapper *dw = bi->fData;
4652 const RBBIStateTable *fwtbl = dw->fForwardTable;
4653 UBool in8Bits = fwtbl->fFlags & RBBI_8BITS_ROWS;
4654 int32_t numCharClasses = dw->fHeader->fCatCount;
4655 // printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
4656
4657 // Check for duplicate columns (character categories)
4658
4659 std::vector<UnicodeString> columns;
4660 for (int32_t column = 0; column < numCharClasses; column++) {
4661 UnicodeString s;
4662 for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
4663 RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4664 s.append(in8Bits ? row->r8.fNextState[column] : row->r16.fNextState[column]);
4665 }
4666 columns.push_back(s);
4667 }
4668 // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
4669 for (int c1=1; c1<numCharClasses; c1++) {
4670 int limit = c1 < (int)fwtbl->fDictCategoriesStart ? fwtbl->fDictCategoriesStart : numCharClasses;
4671 for (int c2 = c1+1; c2 < limit; c2++) {
4672 if (columns.at(c1) == columns.at(c2)) {
4673 errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
4674 goto out;
4675 }
4676 }
4677 }
4678 out:
4679
4680 // Check for duplicate states
4681 std::vector<UnicodeString> rows;
4682 for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
4683 UnicodeString s;
4684 RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4685 if (in8Bits) {
4686 s.append(row->r8.fAccepting);
4687 s.append(row->r8.fLookAhead);
4688 s.append(row->r8.fTagsIdx);
4689 for (int32_t column = 0; column < numCharClasses; column++) {
4690 s.append(row->r8.fNextState[column]);
4691 }
4692 } else {
4693 s.append(row->r16.fAccepting);
4694 s.append(row->r16.fLookAhead);
4695 s.append(row->r16.fTagsIdx);
4696 for (int32_t column = 0; column < numCharClasses; column++) {
4697 s.append(row->r16.fNextState[column]);
4698 }
4699 }
4700 rows.push_back(s);
4701 }
4702 for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
4703 for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
4704 if (rows.at(r1) == rows.at(r2)) {
4705 errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
4706 return;
4707 }
4708 }
4709 }
4710 }
4711
4712 // Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
4713 // even after next() has returned DONE.
4714
TestBug13447()4715 void RBBITest::TestBug13447() {
4716 UErrorCode status = U_ZERO_ERROR;
4717 LocalPointer<RuleBasedBreakIterator> bi(
4718 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4719 assertSuccess(WHERE, status);
4720 if (U_FAILURE(status)) return;
4721 UnicodeString data(u"1234");
4722 bi->setText(data);
4723 assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus());
4724 assertEquals(WHERE, 4, bi->next());
4725 assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4726 assertEquals(WHERE, UBRK_DONE, bi->next());
4727 assertEquals(WHERE, 4, bi->current());
4728 assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4729 }
4730
4731 // TestReverse exercises both the synthesized safe reverse rules and the logic
4732 // for filling the break iterator cache when starting from random positions
4733 // in the text.
4734 //
4735 // It's a monkey test, working on random data, with the expected data obtained
4736 // from forward iteration (no safe rules involved), comparing with results
4737 // when indexing into the interior of the string (safe rules needed).
4738
TestReverse()4739 void RBBITest::TestReverse() {
4740 UErrorCode status = U_ZERO_ERROR;
4741
4742 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4743 BreakIterator::createCharacterInstance(Locale::getEnglish(), status)));
4744 assertSuccess(WHERE, status, true);
4745 status = U_ZERO_ERROR;
4746 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4747 BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4748 assertSuccess(WHERE, status, true);
4749 status = U_ZERO_ERROR;
4750 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4751 BreakIterator::createLineInstance(Locale::getEnglish(), status)));
4752 assertSuccess(WHERE, status, true);
4753 status = U_ZERO_ERROR;
4754 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4755 BreakIterator::createSentenceInstance(Locale::getEnglish(), status)));
4756 assertSuccess(WHERE, status, true);
4757 }
4758
TestReverse(std::unique_ptr<RuleBasedBreakIterator> bi)4759 void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
4760 if (!bi) {
4761 return;
4762 }
4763
4764 // From the mapping trie in the break iterator's internal data, create a
4765 // vector of UnicodeStrings, one for each character category, containing
4766 // all of the code points that map to that category. Unicode planes 0 and 1 only,
4767 // to avoid an execess of unassigned code points.
4768
4769 RBBIDataWrapper *data = bi->fData;
4770 int32_t categoryCount = data->fHeader->fCatCount;
4771 UCPTrie *trie = data->fTrie;
4772 bool use8BitsTrie = ucptrie_getValueWidth(trie) == UCPTRIE_VALUE_BITS_8;
4773 uint32_t dictBit = use8BitsTrie ? 0x0080 : 0x4000;
4774
4775 std::vector<UnicodeString> strings(categoryCount, UnicodeString());
4776 for (int cp=0; cp<0x1fff0; ++cp) {
4777 int cat = ucptrie_get(trie, cp);
4778 cat &= ~dictBit; // And off the dictionary bit from the category.
4779 assertTrue(WHERE, cat < categoryCount && cat >= 0);
4780 if (cat < 0 || cat >= categoryCount) return;
4781 strings[cat].append(cp);
4782 }
4783
4784 icu_rand randomGen;
4785 const int testStringLength = 10000;
4786 UnicodeString testString;
4787
4788 for (int i=0; i<testStringLength; ++i) {
4789 int charClass = randomGen() % categoryCount;
4790 if (strings[charClass].length() > 0) {
4791 int cp = strings[charClass].char32At(randomGen() % strings[charClass].length());
4792 testString.append(cp);
4793 }
4794 }
4795
4796 typedef std::pair<UBool, int32_t> Result;
4797 std::vector<Result> expectedResults;
4798 bi->setText(testString);
4799 for (int i=0; i<testString.length(); ++i) {
4800 bool isboundary = bi->isBoundary(i);
4801 int ruleStatus = bi->getRuleStatus();
4802 expectedResults.push_back(std::make_pair(isboundary, ruleStatus));
4803 }
4804
4805 for (int i=testString.length()-1; i>=0; --i) {
4806 bi->setText(testString); // clears the internal break cache
4807 Result expected = expectedResults[i];
4808 assertEquals(WHERE, expected.first, bi->isBoundary(i));
4809 assertEquals(WHERE, expected.second, bi->getRuleStatus());
4810 }
4811 }
4812
4813
// Ticket 13692 - finding word boundaries in very large numbers or words could
//                be very time consuming. When the problem was present, this test
//                would run more than fifteen minutes, which is to say, the failure was noticeable.
4817
TestBug13692()4818 void RBBITest::TestBug13692() {
4819 UErrorCode status = U_ZERO_ERROR;
4820 LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4821 BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4822 if (!assertSuccess(WHERE, status, true)) {
4823 return;
4824 }
4825 constexpr int32_t LENGTH = 1000000;
4826 UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH);
4827 for (int i=0; i<20; i+=2) {
4828 longNumber.setCharAt(i, u' ');
4829 }
4830 bi->setText(longNumber);
4831 assertFalse(WHERE, bi->isBoundary(LENGTH-5));
4832 assertSuccess(WHERE, status);
4833 }
4834
4835
TestProperties()4836 void RBBITest::TestProperties() {
4837 UErrorCode errorCode = U_ZERO_ERROR;
4838 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4839 if (!prependSet.isEmpty()) {
4840 errln(
4841 "[:GCB=Prepend:] is not empty any more. "
4842 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4843 "change this test to the opposite condition.");
4844 }
4845 }
4846
4847
4848 //
4849 // TestDebug - A place-holder test for debugging purposes.
4850 // For putting in fragments of other tests that can be invoked
4851 // for tracing without a lot of unwanted extra stuff happening.
4852 //
TestDebug(void)4853 void RBBITest::TestDebug(void) {
4854 UErrorCode status = U_ZERO_ERROR;
4855 LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4856 BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4857 if (!assertSuccess(WHERE, status, true)) {
4858 return;
4859 }
4860 const UnicodeString &rules = bi->getRules();
4861 UParseError pe;
4862 LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
4863 assertSuccess(WHERE, status);
4864 }
4865
4866
4867 //
4868 // TestDebugRules A stub test for use in debugging rule compilation problems.
4869 // Can be freely altered as needed or convenient.
//    Leave disabled - #ifdef'ed out - when not actively debugging. The rule source
4871 // data files may not be available in all environments.
4872 // Any permanent test cases should be moved to rbbitst.txt
4873 // (see Bug 20303 in that file, for example), or to another test function in this file.
4874 //
TestDebugRules()4875 void RBBITest::TestDebugRules() {
4876 #if 0
4877 const char16_t *rules = u""
4878 "!!quoted_literals_only; \n"
4879 "!!chain; \n"
4880 "!!lookAheadHardBreak; \n"
4881 " \n"
4882 // "[a] / ; \n"
4883 "[a] [b] / [c] [d]; \n"
4884 "[a] [b] / [c] [d] {100}; \n"
4885 "[x] [a] [b] / [c] [d] {100}; \n"
4886 "[a] [b] [c] / [d] {100}; \n"
4887 //" [c] [d] / [e] [f]; \n"
4888 //"[a] [b] / [c]; \n"
4889 ;
4890
4891 UErrorCode status = U_ZERO_ERROR;
4892 CharString path(pathToDataDirectory(), status);
4893 path.appendPathPart("brkitr", status);
4894 path.appendPathPart("rules", status);
4895 path.appendPathPart("line.txt", status);
4896 int len;
4897 std::unique_ptr<UChar []> testFile(ReadAndConvertFile(path.data(), len, "UTF-8", status));
4898 if (!assertSuccess(WHERE, status)) {
4899 return;
4900 }
4901
4902 UParseError pe;
4903 // rules = testFile.get();
4904 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rules, pe, status);
4905
4906 if (!assertSuccess(WHERE, status)) {
4907 delete bi;
4908 return;
4909 }
4910 // bi->dumpTables();
4911
4912 delete bi;
4913 #endif
4914 }
4915
testTrieStateTable(int32_t numChar,bool expectedTrieWidthIn8Bits,bool expectedStateRowIn8Bits)4916 void RBBITest::testTrieStateTable(int32_t numChar, bool expectedTrieWidthIn8Bits, bool expectedStateRowIn8Bits) {
4917 UCPTrieValueWidth expectedTrieWidth = expectedTrieWidthIn8Bits ? UCPTRIE_VALUE_BITS_8 : UCPTRIE_VALUE_BITS_16;
4918 int32_t expectedStateRowBits = expectedStateRowIn8Bits ? RBBI_8BITS_ROWS : 0;
4919 // Text are duplicate characters from U+4E00 to U+4FFF
4920 UnicodeString text;
4921 for (UChar c = 0x4e00; c < 0x5000; c++) {
4922 text.append(c).append(c);
4923 }
4924 // Generate rule which will caused length+4 character classes and
4925 // length+3 states
4926 UnicodeString rules(u"!!quoted_literals_only;");
4927 for (UChar c = 0x4e00; c < 0x4e00 + numChar; c++) {
4928 rules.append(u'\'').append(c).append(c).append(u"';");
4929 }
4930 rules.append(u".;");
4931 UErrorCode status = U_ZERO_ERROR;
4932 UParseError parseError;
4933 RuleBasedBreakIterator bi(rules, parseError, status);
4934
4935 assertEquals(WHERE, numChar + 4, bi.fData->fHeader->fCatCount);
4936 assertEquals(WHERE, numChar + 3, bi.fData->fForwardTable->fNumStates);
4937 assertEquals(WHERE, expectedTrieWidth, ucptrie_getValueWidth(bi.fData->fTrie));
4938 assertEquals(WHERE, expectedStateRowBits, bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS);
4939 assertEquals(WHERE, expectedStateRowBits, bi.fData->fReverseTable->fFlags & RBBI_8BITS_ROWS);
4940
4941 bi.setText(text);
4942
4943 int32_t pos;
4944 int32_t i = 0;
4945 while ((pos = bi.next()) > 0) {
4946 // The first numChar should not break between the pair
4947 if (i++ < numChar) {
4948 assertEquals(WHERE, i * 2, pos);
4949 } else {
4950 // After the first numChar next(), break on each character.
4951 assertEquals(WHERE, i + numChar, pos);
4952 }
4953 }
4954 while ((pos = bi.previous()) > 0) {
4955 // The first numChar should not break between the pair
4956 if (--i < numChar) {
4957 assertEquals(WHERE, i * 2, pos);
4958 } else {
4959 // After the first numChar next(), break on each character.
4960 assertEquals(WHERE, i + numChar, pos);
4961 }
4962 }
4963 }
4964
Test8BitsTrieWith8BitStateTable()4965 void RBBITest::Test8BitsTrieWith8BitStateTable() {
4966 testTrieStateTable(251, true /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
4967 }
4968
Test16BitsTrieWith8BitStateTable()4969 void RBBITest::Test16BitsTrieWith8BitStateTable() {
4970 testTrieStateTable(252, false /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
4971 }
4972
Test16BitsTrieWith16BitStateTable()4973 void RBBITest::Test16BitsTrieWith16BitStateTable() {
4974 testTrieStateTable(253, false /* expectedTrieWidthIn8Bits */, false /* expectedStateRowIn8Bits */);
4975 }
4976
Test8BitsTrieWith16BitStateTable()4977 void RBBITest::Test8BitsTrieWith16BitStateTable() {
4978 // Test UCPTRIE_VALUE_BITS_8 with 16 bits rows. Use a different approach to
4979 // create state table in 16 bits.
4980
4981 // Generate 510 'a' as text
4982 UnicodeString text;
4983 for (int32_t i = 0; i < 510; i++) {
4984 text.append(u'a');
4985 }
4986
4987 UnicodeString rules(u"!!quoted_literals_only;'");
4988 // 254 'a' in the rule will cause 256 states
4989 for (int32_t i = 0; i < 254; i++) {
4990 rules.append(u'a');
4991 }
4992 rules.append(u"';.;");
4993
4994 UErrorCode status = U_ZERO_ERROR;
4995 UParseError parseError;
4996 LocalPointer<RuleBasedBreakIterator> bi(new RuleBasedBreakIterator(rules, parseError, status));
4997
4998 assertEquals(WHERE, 256, bi->fData->fForwardTable->fNumStates);
4999 assertEquals(WHERE, UCPTRIE_VALUE_BITS_8, ucptrie_getValueWidth(bi->fData->fTrie));
5000 assertEquals(WHERE,
5001 false, RBBI_8BITS_ROWS == (bi->fData->fForwardTable->fFlags & RBBI_8BITS_ROWS));
5002 bi->setText(text);
5003
5004 // break positions:
5005 // 254, 508, 509, ... 510
5006 assertEquals("next()", 254, bi->next());
5007 int32_t i = 0;
5008 int32_t pos;
5009 while ((pos = bi->next()) > 0) {
5010 assertEquals(WHERE, 508 + i , pos);
5011 i++;
5012 }
5013 i = 0;
5014 while ((pos = bi->previous()) > 0) {
5015 i++;
5016 if (pos >= 508) {
5017 assertEquals(WHERE, 510 - i , pos);
5018 } else {
5019 assertEquals(WHERE, 254 , pos);
5020 }
5021 }
5022 }
5023
5024 // Test that both compact (8 bit) and full sized (16 bit) rbbi tables work, and
5025 // that there are no problems with rules at the size that transitions between the two.
5026 //
5027 // A rule that matches a literal string, like 'abcdefghij', will require one state and
5028 // one character class per character in the string. So we can make a rule to tickle the
5029 // boundaries by using literal strings of various lengths.
5030 //
// For both the number of states and the number of character classes, the eight bit format
// only has 7 bits available, allowing for 128 values. For both, a few values are reserved,
// leaving 120 something available. This test runs the string over the range of 120 - 260
// (startLength to endLength below), which allows some margin for changes to the number of
// values reserved by the rule builder without breaking the test.
// NOTE(review): the "7 bits / 128 values" figure looks out of date. The sibling tests built on
// testTrieStateTable() expect 8 bit tables with up to 255 character classes. Please confirm.
5036
TestTable_8_16_Bits()5037 void RBBITest::TestTable_8_16_Bits() {
5038
5039 // testStr serves as both the source of the rule string (truncated to the desired length)
5040 // and as test data to check matching behavior. A break rule consisting of the first 120
5041 // characters of testStr will match the first 120 chars of the full-length testStr.
5042 UnicodeString testStr;
5043 for (UChar c=0x3000; c<0x3200; ++c) {
5044 testStr.append(c);
5045 }
5046
5047 const int32_t startLength = 120; // The shortest rule string to test.
5048 const int32_t endLength = 260; // The longest rule string to test
5049 const int32_t increment = this->quick ? endLength - startLength : 1;
5050
5051 for (int32_t ruleLen=startLength; ruleLen <= endLength; ruleLen += increment) {
5052 UParseError parseError;
5053 UErrorCode status = U_ZERO_ERROR;
5054
5055 UnicodeString ruleString{u"!!quoted_literals_only; '#';"};
5056 ruleString.findAndReplace(UnicodeString(u"#"), UnicodeString(testStr, 0, ruleLen));
5057 RuleBasedBreakIterator bi(ruleString, parseError, status);
5058 if (!assertSuccess(WHERE, status)) {
5059 errln(ruleString);
5060 break;
5061 }
5062 // bi.dumpTables();
5063
5064 // Verify that the break iterator is functioning - that the first boundary found
5065 // in testStr is at the length of the rule string.
5066 bi.setText(testStr);
5067 assertEquals(WHERE, ruleLen, bi.next());
5068
5069 // Reverse iteration. Do a setText() first, to flush the break iterator's internal cache
5070 // of previously detected boundaries, thus forcing the engine to run the safe reverse rules.
5071 bi.setText(testStr);
5072 int32_t result = bi.preceding(ruleLen);
5073 assertEquals(WHERE, 0, result);
5074
5075 // Verify that the range of rule lengths being tested cover the transations
5076 // from 8 to 16 bit data.
5077 bool has8BitRowData = bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS;
5078 bool has8BitsTrie = ucptrie_getValueWidth(bi.fData->fTrie) == UCPTRIE_VALUE_BITS_8;
5079
5080 if (ruleLen == startLength) {
5081 assertEquals(WHERE, true, has8BitRowData);
5082 assertEquals(WHERE, true, has8BitsTrie);
5083 }
5084 if (ruleLen == endLength) {
5085 assertEquals(WHERE, false, has8BitRowData);
5086 assertEquals(WHERE, false, has8BitsTrie);
5087 }
5088 }
5089 }
5090
5091 /* Test handling of a large number of look-ahead rules.
5092 * The number of rules in the test exceeds the implementation limits prior to the
5093 * improvements introduced with #13590.
5094 *
5095 * The test look-ahead rules have the form "AB / CE"; "CD / EG"; ...
5096 * The text being matched is sequential, "ABCDEFGHI..."
5097 *
5098 * The upshot is that the look-ahead rules all match on their preceding context,
5099 * and consequently must save a potential result, but then fail to match on their
5100 * trailing context, so that they don't actually cause a boundary.
5101 *
5102 * Additionally, add a ".*" rule, so there are no boundaries unless a
5103 * look-ahead hard-break rule forces one.
5104 */
TestBug13590()5105 void RBBITest::TestBug13590() {
5106 UnicodeString rules {u"!!quoted_literals_only; !!chain; .*;\n"};
5107
5108 const int NUM_LOOKAHEAD_RULES = 50;
5109 const char16_t STARTING_CHAR = u'\u5000';
5110 char16_t firstChar;
5111 for (int ruleNum = 0; ruleNum < NUM_LOOKAHEAD_RULES; ++ruleNum) {
5112 firstChar = STARTING_CHAR + ruleNum*2;
5113 rules.append(u'\'') .append(firstChar) .append(firstChar+1) .append(u'\'')
5114 .append(u' ') .append(u'/') .append(u' ')
5115 .append(u'\'') .append(firstChar+2) .append(firstChar+4) .append(u'\'')
5116 .append(u';') .append(u'\n');
5117 }
5118
5119 // Change the last rule added from the form "UV / WY" to "UV / WX".
5120 // Changes the rule so that it will match - all 4 chars are in ascending sequence.
5121 rules.findAndReplace(UnicodeString(firstChar+4), UnicodeString(firstChar+3));
5122
5123 UErrorCode status = U_ZERO_ERROR;
5124 UParseError parseError;
5125 RuleBasedBreakIterator bi(rules, parseError, status);
5126 if (!assertSuccess(WHERE, status)) {
5127 errln(rules);
5128 return;
5129 }
5130 // bi.dumpTables();
5131
5132 UnicodeString testString;
5133 for (char16_t c = STARTING_CHAR-200; c < STARTING_CHAR + NUM_LOOKAHEAD_RULES*4; ++c) {
5134 testString.append(c);
5135 }
5136 bi.setText(testString);
5137
5138 int breaksFound = 0;
5139 while (bi.next() != UBRK_DONE) {
5140 ++breaksFound;
5141 }
5142
5143 // Two matches are expected, one from the last rule that was explicitly modified,
5144 // and one at the end of the text.
5145 assertEquals(WHERE, 2, breaksFound);
5146 }
5147
5148
5149 #if U_ENABLE_TRACING
// Records made by the trace callbacks traceData(), traceEntry() and traceExit().
// SetupTestTrace() empties all four before a test starts.
static std::vector<std::string> gData;     // string argument captured by traceData()
static std::vector<int32_t> gEntryFn;      // function numbers recorded by traceEntry()
static std::vector<int32_t> gExitFn;       // function numbers recorded by traceExit()
static std::vector<int32_t> gDataFn;       // function numbers recorded by traceData()
5154
traceData(const void *,int32_t fnNumber,int32_t,const char *,va_list args)5155 static void U_CALLCONV traceData(
5156 const void*,
5157 int32_t fnNumber,
5158 int32_t,
5159 const char *,
5160 va_list args) {
5161 if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5162 const char* data = va_arg(args, const char*);
5163 gDataFn.push_back(fnNumber);
5164 gData.push_back(data);
5165 }
5166 }
5167
traceEntry(const void *,int32_t fnNumber)5168 static void traceEntry(const void *, int32_t fnNumber) {
5169 if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5170 gEntryFn.push_back(fnNumber);
5171 }
5172 }
5173
traceExit(const void *,int32_t fnNumber,const char *,va_list)5174 static void traceExit(const void *, int32_t fnNumber, const char *, va_list) {
5175 if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5176 gExitFn.push_back(fnNumber);
5177 }
5178 }
5179
5180
assertTestTraceResult(int32_t fnNumber,const char * expectedData)5181 void RBBITest::assertTestTraceResult(int32_t fnNumber, const char* expectedData) {
5182 assertEquals("utrace_entry should be called ", 1, gEntryFn.size());
5183 assertEquals("utrace_entry should be called with ", fnNumber, gEntryFn[0]);
5184 assertEquals("utrace_exit should be called ", 1, gExitFn.size());
5185 assertEquals("utrace_exit should be called with ", fnNumber, gExitFn[0]);
5186
5187 if (expectedData == nullptr) {
5188 assertEquals("utrace_data should not be called ", 0, gDataFn.size());
5189 assertEquals("utrace_data should not be called ", 0, gData.size());
5190 } else {
5191 assertEquals("utrace_data should be called ", 1, gDataFn.size());
5192 assertEquals("utrace_data should be called with ", fnNumber, gDataFn[0]);
5193 assertEquals("utrace_data should be called ", 1, gData.size());
5194 assertEquals("utrace_data should pass in ", expectedData, gData[0].c_str());
5195 }
5196 }
5197
SetupTestTrace()5198 void SetupTestTrace() {
5199 gEntryFn.clear();
5200 gExitFn.clear();
5201 gDataFn.clear();
5202 gData.clear();
5203
5204 const void* context = nullptr;
5205 utrace_setFunctions(context, traceEntry, traceExit, traceData);
5206 utrace_setLevel(UTRACE_INFO);
5207 }
5208
TestTraceCreateCharacter(void)5209 void RBBITest::TestTraceCreateCharacter(void) {
5210 SetupTestTrace();
5211 IcuTestErrorCode status(*this, "TestTraceCreateCharacter");
5212 LocalPointer<BreakIterator> brkitr(
5213 BreakIterator::createCharacterInstance("zh-CN", status));
5214 status.errIfFailureAndReset();
5215 assertTestTraceResult(UTRACE_UBRK_CREATE_CHARACTER, nullptr);
5216 }
5217
TestTraceCreateTitle(void)5218 void RBBITest::TestTraceCreateTitle(void) {
5219 SetupTestTrace();
5220 IcuTestErrorCode status(*this, "TestTraceCreateTitle");
5221 LocalPointer<BreakIterator> brkitr(
5222 BreakIterator::createTitleInstance("zh-CN", status));
5223 status.errIfFailureAndReset();
5224 assertTestTraceResult(UTRACE_UBRK_CREATE_TITLE, nullptr);
5225 }
5226
TestTraceCreateSentence(void)5227 void RBBITest::TestTraceCreateSentence(void) {
5228 SetupTestTrace();
5229 IcuTestErrorCode status(*this, "TestTraceCreateSentence");
5230 LocalPointer<BreakIterator> brkitr(
5231 BreakIterator::createSentenceInstance("zh-CN", status));
5232 status.errIfFailureAndReset();
5233 assertTestTraceResult(UTRACE_UBRK_CREATE_SENTENCE, nullptr);
5234 }
5235
TestTraceCreateWord(void)5236 void RBBITest::TestTraceCreateWord(void) {
5237 SetupTestTrace();
5238 IcuTestErrorCode status(*this, "TestTraceCreateWord");
5239 LocalPointer<BreakIterator> brkitr(
5240 BreakIterator::createWordInstance("zh-CN", status));
5241 status.errIfFailureAndReset();
5242 assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5243 }
5244
TestTraceCreateLine(void)5245 void RBBITest::TestTraceCreateLine(void) {
5246 SetupTestTrace();
5247 IcuTestErrorCode status(*this, "TestTraceCreateLine");
5248 LocalPointer<BreakIterator> brkitr(
5249 BreakIterator::createLineInstance("zh-CN", status));
5250 status.errIfFailureAndReset();
5251 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "");
5252 }
5253
TestTraceCreateLineStrict(void)5254 void RBBITest::TestTraceCreateLineStrict(void) {
5255 SetupTestTrace();
5256 IcuTestErrorCode status(*this, "TestTraceCreateLineStrict");
5257 LocalPointer<BreakIterator> brkitr(
5258 BreakIterator::createLineInstance("zh-CN-u-lb-strict", status));
5259 status.errIfFailureAndReset();
5260 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "strict");
5261 }
5262
TestTraceCreateLineNormal(void)5263 void RBBITest::TestTraceCreateLineNormal(void) {
5264 SetupTestTrace();
5265 IcuTestErrorCode status(*this, "TestTraceCreateLineNormal");
5266 LocalPointer<BreakIterator> brkitr(
5267 BreakIterator::createLineInstance("zh-CN-u-lb-normal", status));
5268 status.errIfFailureAndReset();
5269 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "normal");
5270 }
5271
TestTraceCreateLineLoose(void)5272 void RBBITest::TestTraceCreateLineLoose(void) {
5273 SetupTestTrace();
5274 IcuTestErrorCode status(*this, "TestTraceCreateLineLoose");
5275 LocalPointer<BreakIterator> brkitr(
5276 BreakIterator::createLineInstance("zh-CN-u-lb-loose", status));
5277 status.errIfFailureAndReset();
5278 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "loose");
5279 }
5280
TestTraceCreateBreakEngine(void)5281 void RBBITest::TestTraceCreateBreakEngine(void) {
5282 rbbi_cleanup();
5283 SetupTestTrace();
5284 IcuTestErrorCode status(*this, "TestTraceCreateBreakEngine");
5285 LocalPointer<BreakIterator> brkitr(
5286 BreakIterator::createWordInstance("zh-CN", status));
5287 status.errIfFailureAndReset();
5288 assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5289
5290 // To word break the following text, BreakIterator will create 5 dictionary
5291 // break engine internally.
5292 brkitr->setText(
5293 u"test "
5294 u"測試 " // Hani
5295 u"សាកល្បង " // Khmr
5296 u"ທົດສອບ " // Laoo
5297 u"စမ်းသပ်မှု " // Mymr
5298 u"ทดสอบ " // Thai
5299 u"test "
5300 );
5301
5302 // Loop through all the text.
5303 while (brkitr->next() > 0) ;
5304
5305 assertEquals("utrace_entry should be called ", 6, gEntryFn.size());
5306 assertEquals("utrace_exit should be called ", 6, gExitFn.size());
5307 assertEquals("utrace_data should be called ", 5, gDataFn.size());
5308
5309 for (std::vector<int>::size_type i = 0; i < gDataFn.size(); i++) {
5310 assertEquals("utrace_entry should be called ",
5311 UTRACE_UBRK_CREATE_BREAK_ENGINE, gEntryFn[i+1]);
5312 assertEquals("utrace_exit should be called ",
5313 UTRACE_UBRK_CREATE_BREAK_ENGINE, gExitFn[i+1]);
5314 assertEquals("utrace_data should be called ",
5315 UTRACE_UBRK_CREATE_BREAK_ENGINE, gDataFn[i]);
5316 }
5317
5318 assertEquals("utrace_data should pass ", "Hani", gData[0].c_str());
5319 assertEquals("utrace_data should pass ", "Khmr", gData[1].c_str());
5320 assertEquals("utrace_data should pass ", "Laoo", gData[2].c_str());
5321 assertEquals("utrace_data should pass ", "Mymr", gData[3].c_str());
5322 assertEquals("utrace_data should pass ", "Thai", gData[4].c_str());
5323
5324 }
5325 #endif
5326
TestUnpairedSurrogate()5327 void RBBITest::TestUnpairedSurrogate() {
5328 UnicodeString rules(u"ab;");
5329
5330 UErrorCode status = U_ZERO_ERROR;
5331 UParseError pe;
5332 RuleBasedBreakIterator bi1(rules, pe, status);
5333 assertSuccess(WHERE, status);
5334 UnicodeString rtRules = bi1.getRules();
5335 // make sure the simple one work first.
5336 assertEquals(WHERE, rules, rtRules);
5337
5338
5339 rules = UnicodeString(u"a\\ud800b;").unescape();
5340 pe.line = 0;
5341 pe.offset = 0;
5342 RuleBasedBreakIterator bi2(rules, pe, status);
5343 assertEquals(WHERE "unpaired lead surrogate", U_ILLEGAL_CHAR_FOUND , status);
5344 if (pe.line != 1 || pe.offset != 1) {
5345 errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
5346 }
5347
5348 status = U_ZERO_ERROR;
5349 rules = UnicodeString(u"a\\ude00b;").unescape();
5350 pe.line = 0;
5351 pe.offset = 0;
5352 RuleBasedBreakIterator bi3(rules, pe, status);
5353 assertEquals(WHERE "unpaired tail surrogate", U_ILLEGAL_CHAR_FOUND , status);
5354 if (pe.line != 1 || pe.offset != 1) {
5355 errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
5356 }
5357
5358 // make sure the surrogate one work too.
5359 status = U_ZERO_ERROR;
5360 rules = UnicodeString(u"ab;");
5361 RuleBasedBreakIterator bi4(rules, pe, status);
5362 rtRules = bi4.getRules();
5363 assertEquals(WHERE, rules, rtRules);
5364 }
5365
5366 #endif // #if !UCONFIG_NO_BREAK_ITERATION
5367