1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 1999-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8 /************************************************************************
9 * Date Name Description
10 * 12/15/99 Madhu Creation.
11 * 01/12/2000 Madhu Updated for changed API and added new tests
12 ************************************************************************/
13
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_BREAK_ITERATION
16
17 #include <sstream>
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #include <utility>
22 #include <vector>
23
24 #include "unicode/brkiter.h"
25 #include "unicode/localpointer.h"
26 #include "unicode/numfmt.h"
27 #include "unicode/rbbi.h"
28 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
29 #include "unicode/regex.h"
30 #endif
31 #include "unicode/schriter.h"
32 #include "unicode/uchar.h"
33 #include "unicode/utf16.h"
34 #include "unicode/ucnv.h"
35 #include "unicode/uniset.h"
36 #include "unicode/uscript.h"
37 #include "unicode/ustring.h"
38 #include "unicode/utext.h"
39 #include "unicode/utrace.h"
40
41 #include "charstr.h"
42 #include "cmemory.h"
43 #include "cstr.h"
44 #include "intltest.h"
45 #include "rbbitst.h"
46 #include "rbbidata.h"
47 #include "utypeinfo.h" // for 'typeid' to work
48 #include "uvector.h"
49 #include "uvectr32.h"
50
51
52 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
53 #include "unicode/filteredbrk.h"
54 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
55
// Reports a test failure, tagged with the current file and line, when
// `condition` evaluates to false. Expands to an unqualified errln() call, so
// it can only be used where such a function is in scope.
#define TEST_ASSERT(condition) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(condition)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
} UPRV_BLOCK_MACRO_END

// Reports a failure, tagged with file, line and the error name, when the
// UErrorCode `ec` holds a failure code. Expands to an unqualified
// errcheckln() call.
#define TEST_ASSERT_SUCCESS(ec) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(ec)) { \
        errcheckln(ec, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(ec)); \
    } \
} UPRV_BLOCK_MACRO_END

// Reports a monkey-test failure through IntlTest::gTest, including the
// parameters needed to reproduce it (rule file name and random seed).
// NOTE: unlike the two macros above this one expands to a bare { } block.
#define MONKEY_ERROR(message, ruleFileName, position, seedValue) { \
    IntlTest::gTest->errln("\n%s:%d %s at index %d. Parameters to reproduce: @\"type=%s seed=%u loop=1\"", \
                           __FILE__, __LINE__, message, position, ruleFileName, seedValue); \
}
72
73 //---------------------------------------------
74 // runIndexedTest
75 //---------------------------------------------
76
77
// Note: Before adding new tests to this file, check whether the desired test data can
// simply be added to the file testdata/rbbitst.txt. In most cases it can,
// it's much less work than writing a new test, diagnostic output in the event of failures
// is good, and the test data file is shared with ICU4J, so eventually the test
// will run there as well, without additional effort.
83
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)84 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
85 {
86 if (exec) logln("TestSuite RuleBasedBreakIterator: ");
87 fTestParams = params;
88
89 TESTCASE_AUTO_BEGIN;
90 #if !UCONFIG_NO_FILE_IO
91 TESTCASE_AUTO(TestBug4153072);
92 #endif
93 #if !UCONFIG_NO_FILE_IO
94 TESTCASE_AUTO(TestUnicodeFiles);
95 #endif
96 TESTCASE_AUTO(TestGetAvailableLocales);
97 TESTCASE_AUTO(TestGetDisplayName);
98 #if !UCONFIG_NO_FILE_IO
99 TESTCASE_AUTO(TestEndBehaviour);
100 TESTCASE_AUTO(TestWordBreaks);
101 TESTCASE_AUTO(TestWordBoundary);
102 TESTCASE_AUTO(TestLineBreaks);
103 TESTCASE_AUTO(TestSentBreaks);
104 TESTCASE_AUTO(TestExtended);
105 #endif
106 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
107 TESTCASE_AUTO(TestMonkey);
108 #endif
109 #if !UCONFIG_NO_FILE_IO
110 TESTCASE_AUTO(TestBug3818);
111 #endif
112 TESTCASE_AUTO(TestDebug);
113 #if !UCONFIG_NO_FILE_IO
114 TESTCASE_AUTO(TestBug5775);
115 #endif
116 TESTCASE_AUTO(TestBug9983);
117 TESTCASE_AUTO(TestDictRules);
118 TESTCASE_AUTO(TestBug5532);
119 TESTCASE_AUTO(TestBug7547);
120 TESTCASE_AUTO(TestBug12797);
121 TESTCASE_AUTO(TestBug12918);
122 TESTCASE_AUTO(TestBug12932);
123 TESTCASE_AUTO(TestEmoji);
124 TESTCASE_AUTO(TestBug12519);
125 TESTCASE_AUTO(TestBug12677);
126 TESTCASE_AUTO(TestTableRedundancies);
127 TESTCASE_AUTO(TestBug13447);
128 TESTCASE_AUTO(TestReverse);
129 TESTCASE_AUTO(TestBug13692);
130 TESTCASE_AUTO(TestDebugRules);
131
132 #if U_ENABLE_TRACING
133 TESTCASE_AUTO(TestTraceCreateCharacter);
134 TESTCASE_AUTO(TestTraceCreateWord);
135 TESTCASE_AUTO(TestTraceCreateSentence);
136 TESTCASE_AUTO(TestTraceCreateTitle);
137 TESTCASE_AUTO(TestTraceCreateLine);
138 TESTCASE_AUTO(TestTraceCreateLineNormal);
139 TESTCASE_AUTO(TestTraceCreateLineLoose);
140 TESTCASE_AUTO(TestTraceCreateLineStrict);
141 TESTCASE_AUTO(TestTraceCreateBreakEngine);
142 #endif
143
144 TESTCASE_AUTO_END;
145 }
146
147
148 //--------------------------------------------------------------------------------------
149 //
150 // RBBITest constructor and destructor
151 //
152 //--------------------------------------------------------------------------------------
153
RBBITest()154 RBBITest::RBBITest() {
155 fTestParams = NULL;
156 }
157
158
~RBBITest()159 RBBITest::~RBBITest() {
160 }
161
162
printStringBreaks(UText * tstr,int expected[],int expectedCount)163 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
164 UErrorCode status = U_ZERO_ERROR;
165 char name[100];
166 printf("code alpha extend alphanum type word sent line name\n");
167 int nextExpectedIndex = 0;
168 utext_setNativeIndex(tstr, 0);
169 for (int j = 0; j < static_cast<int>(utext_nativeLength(tstr)); j=static_cast<int>(utext_getNativeIndex(tstr))) {
170 if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
171 printf("------------------------------------------------ %d\n", j);
172 ++nextExpectedIndex;
173 }
174
175 UChar32 c = utext_next32(tstr);
176 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
177 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
178 u_isUAlphabetic(c),
179 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
180 u_isalnum(c),
181 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
182 u_charType(c),
183 U_SHORT_PROPERTY_NAME),
184 u_getPropertyValueName(UCHAR_WORD_BREAK,
185 u_getIntPropertyValue(c,
186 UCHAR_WORD_BREAK),
187 U_SHORT_PROPERTY_NAME),
188 u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
189 u_getIntPropertyValue(c,
190 UCHAR_SENTENCE_BREAK),
191 U_SHORT_PROPERTY_NAME),
192 u_getPropertyValueName(UCHAR_LINE_BREAK,
193 u_getIntPropertyValue(c,
194 UCHAR_LINE_BREAK),
195 U_SHORT_PROPERTY_NAME),
196 name);
197 }
198 }
199
200
printStringBreaks(const UnicodeString & ustr,int expected[],int expectedCount)201 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
202 UErrorCode status = U_ZERO_ERROR;
203 UText *tstr = NULL;
204 tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
205 if (U_FAILURE(status)) {
206 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
207 return;
208 }
209 printStringBreaks(tstr, expected, expectedCount);
210 utext_close(tstr);
211 }
212
213
TestBug3818()214 void RBBITest::TestBug3818() {
215 UErrorCode status = U_ZERO_ERROR;
216
217 // Four Thai words...
218 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
219 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
220 UnicodeString thaiStr(thaiWordData);
221
222 BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
223 if (U_FAILURE(status) || bi == NULL) {
224 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
225 return;
226 }
227 bi->setText(thaiStr);
228
229 int32_t startOfSecondWord = bi->following(1);
230 if (startOfSecondWord != 4) {
231 errln("Fail at file %s, line %d expected start of word at 4, got %d",
232 __FILE__, __LINE__, startOfSecondWord);
233 }
234 startOfSecondWord = bi->following(0);
235 if (startOfSecondWord != 4) {
236 errln("Fail at file %s, line %d expected start of word at 4, got %d",
237 __FILE__, __LINE__, startOfSecondWord);
238 }
239 delete bi;
240 }
241
242
243 //---------------------------------------------
244 //
245 // other tests
246 //
247 //---------------------------------------------
248
TestGetAvailableLocales()249 void RBBITest::TestGetAvailableLocales()
250 {
251 int32_t locCount = 0;
252 const Locale* locList = BreakIterator::getAvailableLocales(locCount);
253
254 if (locCount == 0)
255 dataerrln("getAvailableLocales() returned an empty list!");
256 // Just make sure that it's returning good memory.
257 int32_t i;
258 for (i = 0; i < locCount; ++i) {
259 logln(locList[i].getName());
260 }
261 }
262
263 //Testing the BreakIterator::getDisplayName() function
TestGetDisplayName()264 void RBBITest::TestGetDisplayName()
265 {
266 UnicodeString result;
267
268 BreakIterator::getDisplayName(Locale::getUS(), result);
269 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
270 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
271 + result);
272
273 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
274 if (result != "French (France)")
275 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
276 + result);
277 }
278 /**
279 * Test End Behaviour
280 * @bug 4068137
281 */
TestEndBehaviour()282 void RBBITest::TestEndBehaviour()
283 {
284 UErrorCode status = U_ZERO_ERROR;
285 UnicodeString testString("boo.");
286 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
287 if (U_FAILURE(status))
288 {
289 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
290 return;
291 }
292 wb->setText(testString);
293
294 if (wb->first() != 0)
295 errln("Didn't get break at beginning of string.");
296 if (wb->next() != 3)
297 errln("Didn't get break before period in \"boo.\"");
298 if (wb->current() != 4 && wb->next() != 4)
299 errln("Didn't get break at end of string.");
300 delete wb;
301 }
302 /*
303 * @bug 4153072
304 */
TestBug4153072()305 void RBBITest::TestBug4153072() {
306 UErrorCode status = U_ZERO_ERROR;
307 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
308 if (U_FAILURE(status))
309 {
310 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
311 return;
312 }
313 UnicodeString str("...Hello, World!...");
314 int32_t begin = 3;
315 int32_t end = str.length() - 3;
316 UBool onBoundary;
317
318 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
319 iter->adoptText(textIterator);
320 int index;
321 // Note: with the switch to UText, there is no way to restrict the
322 // iteration range to begin at an index other than zero.
323 // String character iterators created with a non-zero bound are
324 // treated by RBBI as being empty.
325 for (index = -1; index < begin + 1; ++index) {
326 onBoundary = iter->isBoundary(index);
327 if (index == 0? !onBoundary : onBoundary) {
328 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
329 " and begin index = " + begin);
330 }
331 }
332 delete iter;
333 }
334
335
336 //
337 // Test for problem reported by Ashok Matoria on 9 July 2007
338 // One.<kSoftHyphen><kSpace>Two.
339 //
340 // Sentence break at start (0) and then on calling next() it breaks at
341 // 'T' of "Two". Now, at this point if I do next() and
342 // then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
343 //
TestBug5775()344 void RBBITest::TestBug5775() {
345 UErrorCode status = U_ZERO_ERROR;
346 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
347 TEST_ASSERT_SUCCESS(status);
348 if (U_FAILURE(status)) {
349 return;
350 }
351 // Check for status first for better handling of no data errors.
352 TEST_ASSERT(bi != NULL);
353 if (bi == NULL) {
354 return;
355 }
356
357 UnicodeString s("One.\\u00ad Two.", -1, US_INV);
358 // 01234 56789
359 s = s.unescape();
360 bi->setText(s);
361 int pos = bi->next();
362 TEST_ASSERT(pos == 6);
363 pos = bi->next();
364 TEST_ASSERT(pos == 10);
365 pos = bi->previous();
366 TEST_ASSERT(pos == 6);
367 delete bi;
368 }
369
370
371
372 //------------------------------------------------------------------------------
373 //
374 // RBBITest::Extended Run RBBI Tests from an external test data file
375 //
376 //------------------------------------------------------------------------------
377
378 struct TestParams {
379 BreakIterator *bi; // Break iterator is set while parsing test source.
380 // Changed out whenever test data changes break type.
381
382 UnicodeString dataToBreak; // Data that is built up while parsing the test.
383 UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString.
384 UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak.
385 UVector32 *srcCol;
386
387 UText *textToBreak; // UText, could be UTF8 or UTF16.
388 UVector32 *textMap; // Map from UTF-16 dataToBreak offsets to UText offsets.
389 CharString utf8String; // UTF-8 form of text to break.
390
TestParamsTestParams391 TestParams(UErrorCode &status) : dataToBreak() {
392 bi = NULL;
393 expectedBreaks = new UVector32(status);
394 srcLine = new UVector32(status);
395 srcCol = new UVector32(status);
396 textToBreak = NULL;
397 textMap = new UVector32(status);
398 }
399
~TestParamsTestParams400 ~TestParams() {
401 delete bi;
402 delete expectedBreaks;
403 delete srcLine;
404 delete srcCol;
405 utext_close(textToBreak);
406 delete textMap;
407 }
408
409 int32_t getSrcLine(int32_t bp);
410 int32_t getExpectedBreak(int32_t bp);
411 int32_t getSrcCol(int32_t bp);
412
413 void setUTF16(UErrorCode &status);
414 void setUTF8(UErrorCode &status);
415 };
416
417 // Append a UnicodeString to a CharString with UTF-8 encoding.
418 // Substitute any invalid chars.
419 // Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
CharStringAppend(CharString & dest,const UnicodeString & src,UErrorCode & status)420 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
421 if (U_FAILURE(status)) {
422 return;
423 }
424 int32_t utf8Length;
425 u_strToUTF8WithSub(NULL, 0, &utf8Length, // Output Buffer, NULL for preflight.
426 src.getBuffer(), src.length(), // UTF-16 data
427 0xfffd, NULL, // Substitution char, number of subs.
428 &status);
429 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
430 return;
431 }
432 status = U_ZERO_ERROR;
433 int32_t capacity;
434 char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
435 u_strToUTF8WithSub(buffer, utf8Length, NULL,
436 src.getBuffer(), src.length(),
437 0xfffd, NULL, &status);
438 dest.append(buffer, utf8Length, status);
439 }
440
441
setUTF16(UErrorCode & status)442 void TestParams::setUTF16(UErrorCode &status) {
443 textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
444 textMap->removeAllElements();
445 for (int32_t i=0; i<dataToBreak.length(); i++) {
446 if (i == dataToBreak.getChar32Start(i)) {
447 textMap->addElement(i, status);
448 } else {
449 textMap->addElement(-1, status);
450 }
451 }
452 textMap->addElement(dataToBreak.length(), status);
453 U_ASSERT(dataToBreak.length() + 1 == textMap->size());
454 }
455
456
setUTF8(UErrorCode & status)457 void TestParams::setUTF8(UErrorCode &status) {
458 if (U_FAILURE(status)) {
459 return;
460 }
461 utf8String.clear();
462 CharStringAppend(utf8String, dataToBreak, status);
463 textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
464 if (U_FAILURE(status)) {
465 return;
466 }
467
468 textMap->removeAllElements();
469 int32_t utf16Index = 0;
470 for (;;) {
471 textMap->addElement(utf16Index, status);
472 UChar32 c32 = utext_current32(textToBreak);
473 if (c32 < 0) {
474 break;
475 }
476 utf16Index += U16_LENGTH(c32);
477 utext_next32(textToBreak);
478 while (textMap->size() < utext_getNativeIndex(textToBreak)) {
479 textMap->addElement(-1, status);
480 }
481 }
482 U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
483 }
484
485
getSrcLine(int32_t bp)486 int32_t TestParams::getSrcLine(int32_t bp) {
487 if (bp >= textMap->size()) {
488 bp = textMap->size() - 1;
489 }
490 int32_t i = 0;
491 for(; bp >= 0 ; --bp) {
492 // Move to a character boundary if we are not on one already.
493 i = textMap->elementAti(bp);
494 if (i >= 0) {
495 break;
496 }
497 }
498 return srcLine->elementAti(i);
499 }
500
501
getExpectedBreak(int32_t bp)502 int32_t TestParams::getExpectedBreak(int32_t bp) {
503 if (bp >= textMap->size()) {
504 return 0;
505 }
506 int32_t i = textMap->elementAti(bp);
507 int32_t retVal = 0;
508 if (i >= 0) {
509 retVal = expectedBreaks->elementAti(i);
510 }
511 return retVal;
512 }
513
514
getSrcCol(int32_t bp)515 int32_t TestParams::getSrcCol(int32_t bp) {
516 if (bp >= textMap->size()) {
517 bp = textMap->size() - 1;
518 }
519 int32_t i = 0;
520 for(; bp >= 0; --bp) {
521 // Move bp to a character boundary if we are not on one already.
522 i = textMap->elementAti(bp);
523 if (i >= 0) {
524 break;
525 }
526 }
527 return srcCol->elementAti(i);
528 }
529
530
executeTest(TestParams * t,UErrorCode & status)531 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
532 int32_t bp;
533 int32_t prevBP;
534 int32_t i;
535
536 TEST_ASSERT_SUCCESS(status);
537 if (U_FAILURE(status)) {
538 return;
539 }
540
541 if (t->bi == NULL) {
542 return;
543 }
544
545 t->bi->setText(t->textToBreak, status);
546 //
547 // Run the iterator forward
548 //
549 prevBP = -1;
550 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
551 if (prevBP == bp) {
552 // Fail for lack of forward progress.
553 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
554 bp, t->getSrcLine(bp), t->getSrcCol(bp));
555 break;
556 }
557
558 // Check that there we didn't miss an expected break between the last one
559 // and this one.
560 for (i=prevBP+1; i<bp; i++) {
561 if (t->getExpectedBreak(i) != 0) {
562 int expected[] = {0, i};
563 printStringBreaks(t->dataToBreak, expected, 2);
564 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
565 i, t->getSrcLine(i), t->getSrcCol(i));
566 }
567 }
568
569 // Check that the break we did find was expected
570 if (t->getExpectedBreak(bp) == 0) {
571 int expected[] = {0, bp};
572 printStringBreaks(t->textToBreak, expected, 2);
573 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
574 bp, t->getSrcLine(bp), t->getSrcCol(bp));
575 } else {
576 // The break was expected.
577 // Check that the {nnn} tag value is correct.
578 int32_t expectedTagVal = t->getExpectedBreak(bp);
579 if (expectedTagVal == -1) {
580 expectedTagVal = 0;
581 }
582 int32_t line = t->getSrcLine(bp);
583 int32_t rs = t->bi->getRuleStatus();
584 if (rs != expectedTagVal) {
585 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
586 " Actual, Expected status = %4d, %4d",
587 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
588 }
589 }
590
591 prevBP = bp;
592 }
593
594 // Verify that there were no missed expected breaks after the last one found
595 for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
596 if (t->getExpectedBreak(i) != 0) {
597 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
598 i, t->getSrcLine(i), t->getSrcCol(i));
599 }
600 }
601
602 //
603 // Run the iterator backwards, verify that the same breaks are found.
604 //
605 prevBP = static_cast<int32_t>(utext_nativeLength(t->textToBreak) + 2); // start with a phony value for the last break pos seen.
606 bp = t->bi->last();
607 while (bp != BreakIterator::DONE) {
608 if (prevBP == bp) {
609 // Fail for lack of progress.
610 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
611 bp, t->getSrcLine(bp), t->getSrcCol(bp));
612 break;
613 }
614
615 // Check that we didn't miss an expected break between the last one
616 // and this one. (UVector returns zeros for index out of bounds.)
617 for (i=prevBP-1; i>bp; i--) {
618 if (t->getExpectedBreak(i) != 0) {
619 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
620 i, t->getSrcLine(i), t->getSrcCol(i));
621 }
622 }
623
624 // Check that the break we did find was expected
625 if (t->getExpectedBreak(bp) == 0) {
626 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
627 bp, t->getSrcLine(bp), t->getSrcCol(bp));
628 } else {
629 // The break was expected.
630 // Check that the {nnn} tag value is correct.
631 int32_t expectedTagVal = t->getExpectedBreak(bp);
632 if (expectedTagVal == -1) {
633 expectedTagVal = 0;
634 }
635 int line = t->getSrcLine(bp);
636 int32_t rs = t->bi->getRuleStatus();
637 if (rs != expectedTagVal) {
638 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
639 " Actual, Expected status = %4d, %4d",
640 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
641 }
642 }
643
644 prevBP = bp;
645 bp = t->bi->previous();
646 }
647
648 // Verify that there were no missed breaks prior to the last one found
649 for (i=prevBP-1; i>=0; i--) {
650 if (t->getExpectedBreak(i) != 0) {
651 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
652 i, t->getSrcLine(i), t->getSrcCol(i));
653 }
654 }
655
656 // Check isBoundary()
657 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
658 UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
659 UBool boundaryFound = t->bi->isBoundary(i);
660 if (boundaryExpected != boundaryFound) {
661 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
662 " Expected, Actual= %s, %s",
663 i, t->getSrcLine(i), t->getSrcCol(i),
664 boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
665 }
666 }
667
668 // Check following()
669 for (i=0; i < static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i++) {
670 int32_t actualBreak = t->bi->following(i);
671 int32_t expectedBreak = BreakIterator::DONE;
672 for (int32_t j=i+1; j <= static_cast<int32_t>(utext_nativeLength(t->textToBreak)); j++) {
673 if (t->getExpectedBreak(j) != 0) {
674 expectedBreak = j;
675 break;
676 }
677 }
678 if (expectedBreak != actualBreak) {
679 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
680 " Expected, Actual= %d, %d",
681 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
682 }
683 }
684
685 // Check preceding()
686 for (i=static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i>=0; i--) {
687 int32_t actualBreak = t->bi->preceding(i);
688 int32_t expectedBreak = BreakIterator::DONE;
689
690 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
691 // preceding(trailing byte) will return the index of some preceding code point,
692 // not the lead byte of the current code point, even though that has a smaller index.
693 // Therefore, start looking at the expected break data not at i-1, but at
694 // the start of code point index - 1.
695 utext_setNativeIndex(t->textToBreak, i);
696 int32_t j = static_cast<int32_t>(utext_getNativeIndex(t->textToBreak) - 1);
697 for (; j >= 0; j--) {
698 if (t->getExpectedBreak(j) != 0) {
699 expectedBreak = j;
700 break;
701 }
702 }
703 if (expectedBreak != actualBreak) {
704 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
705 " Expected, Actual= %d, %d",
706 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
707 }
708 }
709 }
710
711
TestExtended()712 void RBBITest::TestExtended() {
713 // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
714 // data driven test closely entangles filtered and regular data.
715 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
716 UErrorCode status = U_ZERO_ERROR;
717 Locale locale("");
718
719 TestParams tp(status);
720
721 RegexMatcher localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
722 if (U_FAILURE(status)) {
723 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
724 }
725
726 //
727 // Open and read the test data file.
728 //
729 const char *testDataDirectory = IntlTest::getSourceTestData(status);
730 CharString testFileName(testDataDirectory, -1, status);
731 testFileName.append("rbbitst.txt", -1, status);
732
733 int len;
734 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
735 if (U_FAILURE(status)) {
736 errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
737 return;
738 }
739
740 bool skipTest = false; // Skip this test?
741
742 //
743 // Put the test data into a UnicodeString
744 //
745 UnicodeString testString(FALSE, testFile, len);
746
747 enum EParseState{
748 PARSE_COMMENT,
749 PARSE_TAG,
750 PARSE_DATA,
751 PARSE_NUM,
752 PARSE_RULES
753 }
754 parseState = PARSE_TAG;
755
756 EParseState savedState = PARSE_TAG;
757
758 int32_t lineNum = 1;
759 int32_t colStart = 0;
760 int32_t column = 0;
761 int32_t charIdx = 0;
762
763 int32_t tagValue = 0; // The numeric value of a <nnn> tag.
764
765 UnicodeString rules; // Holds rules from a <rules> ... </rules> block
766 int32_t rulesFirstLine = 0; // Line number of the start of current <rules> block
767
768 for (charIdx = 0; charIdx < len; ) {
769 status = U_ZERO_ERROR;
770 UChar c = testString.charAt(charIdx);
771 charIdx++;
772 if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
773 // treat CRLF as a unit
774 c = u'\n';
775 charIdx++;
776 }
777 if (c == u'\n' || c == u'\r') {
778 lineNum++;
779 colStart = charIdx;
780 }
781 column = charIdx - colStart + 1;
782
783 switch (parseState) {
784 case PARSE_COMMENT:
785 if (c == u'\n' || c == u'\r') {
786 parseState = savedState;
787 }
788 break;
789
790 case PARSE_TAG:
791 {
792 if (c == u'#') {
793 parseState = PARSE_COMMENT;
794 savedState = PARSE_TAG;
795 break;
796 }
797 if (u_isUWhiteSpace(c)) {
798 break;
799 }
800 if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
801 delete tp.bi;
802 tp.bi = BreakIterator::createWordInstance(locale, status);
803 skipTest = false;
804 charIdx += 5;
805 break;
806 }
807 if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
808 delete tp.bi;
809 tp.bi = BreakIterator::createCharacterInstance(locale, status);
810 skipTest = false;
811 charIdx += 5;
812 break;
813 }
814 if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
815 delete tp.bi;
816 tp.bi = BreakIterator::createLineInstance(locale, status);
817 skipTest = false;
818 charIdx += 5;
819 break;
820 }
821 if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
822 delete tp.bi;
823 tp.bi = BreakIterator::createSentenceInstance(locale, status);
824 skipTest = false;
825 charIdx += 5;
826 break;
827 }
828 if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
829 delete tp.bi;
830 tp.bi = BreakIterator::createTitleInstance(locale, status);
831 charIdx += 6;
832 break;
833 }
834
835 if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
836 testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
837 charIdx = testString.indexOf(u'>', charIdx) + 1;
838 parseState = PARSE_RULES;
839 rules.remove();
840 rulesFirstLine = lineNum;
841 break;
842 }
843
844 // <locale loc_name>
845 localeMatcher.reset(testString);
846 if (localeMatcher.lookingAt(charIdx-1, status)) {
847 UnicodeString localeName = localeMatcher.group(1, status);
848 char localeName8[100];
849 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
850 locale = Locale::createFromName(localeName8);
851 charIdx += localeMatcher.group(0, status).length() - 1;
852 TEST_ASSERT_SUCCESS(status);
853 break;
854 }
855 if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
856 parseState = PARSE_DATA;
857 charIdx += 5;
858 tp.dataToBreak = "";
859 tp.expectedBreaks->removeAllElements();
860 tp.srcCol ->removeAllElements();
861 tp.srcLine->removeAllElements();
862 break;
863 }
864
865 errln("line %d: Tag expected in test file.", lineNum);
866 parseState = PARSE_COMMENT;
867 savedState = PARSE_DATA;
868 goto end_test; // Stop the test.
869 }
870 break;
871
872 case PARSE_RULES:
873 if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
874 charIdx += 7;
875 parseState = PARSE_TAG;
876 delete tp.bi;
877 UParseError pe;
878 tp.bi = new RuleBasedBreakIterator(rules, pe, status);
879 skipTest = U_FAILURE(status);
880 if (U_FAILURE(status)) {
881 errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
882 rulesFirstLine + pe.line - 1, u_errorName(status));
883 }
884 } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
885 charIdx += 10;
886 parseState = PARSE_TAG;
887 UErrorCode ec = U_ZERO_ERROR;
888 UParseError pe;
889 RuleBasedBreakIterator bi(rules, pe, ec);
890 if (U_SUCCESS(ec)) {
891 errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
892 rulesFirstLine + pe.line - 1);
893 }
894 } else {
895 rules.append(c);
896 }
897 break;
898
899 case PARSE_DATA:
900 if (c == u'•') {
901 int32_t breakIdx = tp.dataToBreak.length();
902 tp.expectedBreaks->setSize(breakIdx+1);
903 tp.expectedBreaks->setElementAt(-1, breakIdx);
904 tp.srcLine->setSize(breakIdx+1);
905 tp.srcLine->setElementAt(lineNum, breakIdx);
906 tp.srcCol ->setSize(breakIdx+1);
907 tp.srcCol ->setElementAt(column, breakIdx);
908 break;
909 }
910
911 if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
912 // Add final entry to mappings from break location to source file position.
913 // Need one extra because last break position returned is after the
914 // last char in the data, not at the last char.
915 tp.srcLine->addElement(lineNum, status);
916 tp.srcCol ->addElement(column, status);
917
918 parseState = PARSE_TAG;
919 charIdx += 6;
920
921 if (!skipTest) {
922 // RUN THE TEST!
923 status = U_ZERO_ERROR;
924 tp.setUTF16(status);
925 executeTest(&tp, status);
926 TEST_ASSERT_SUCCESS(status);
927
928 // Run again, this time with UTF-8 text wrapped in a UText.
929 status = U_ZERO_ERROR;
930 tp.setUTF8(status);
931 TEST_ASSERT_SUCCESS(status);
932 executeTest(&tp, status);
933 }
934 break;
935 }
936
937 if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
938 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
939 // Get the code point from the name and insert it into the test data.
940 // (Damn, no API takes names in Unicode !!!
941 // we've got to take it back to char *)
942 int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
943 int32_t nameLength = nameEndIdx - (charIdx+2);
944 char charNameBuf[200];
945 UChar32 theChar = -1;
946 if (nameEndIdx != -1) {
947 UErrorCode status = U_ZERO_ERROR;
948 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
949 charNameBuf[sizeof(charNameBuf)-1] = 0;
950 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
951 if (U_FAILURE(status)) {
952 theChar = -1;
953 }
954 }
955 if (theChar == -1) {
956 errln("Error in named character in test file at line %d, col %d",
957 lineNum, column);
958 } else {
959 // Named code point was recognized. Insert it
960 // into the test data.
961 tp.dataToBreak.append(theChar);
962 while (tp.dataToBreak.length() > tp.srcLine->size()) {
963 tp.srcLine->addElement(lineNum, status);
964 tp.srcCol ->addElement(column, status);
965 }
966 }
967 if (nameEndIdx > charIdx) {
968 charIdx = nameEndIdx+1;
969
970 }
971 break;
972 }
973
974
975
976 if (testString.compare(charIdx-1, 2, u"<>") == 0) {
977 charIdx++;
978 int32_t breakIdx = tp.dataToBreak.length();
979 tp.expectedBreaks->setSize(breakIdx+1);
980 tp.expectedBreaks->setElementAt(-1, breakIdx);
981 tp.srcLine->setSize(breakIdx+1);
982 tp.srcLine->setElementAt(lineNum, breakIdx);
983 tp.srcCol ->setSize(breakIdx+1);
984 tp.srcCol ->setElementAt(column, breakIdx);
985 break;
986 }
987
988 if (c == u'<') {
989 tagValue = 0;
990 parseState = PARSE_NUM;
991 break;
992 }
993
994 if (c == u'#' && column==3) { // TODO: why is column off so far?
995 parseState = PARSE_COMMENT;
996 savedState = PARSE_DATA;
997 break;
998 }
999
1000 if (c == u'\\') {
1001 // Check for \ at end of line, a line continuation.
1002 // Advance over (discard) the newline
1003 UChar32 cp = testString.char32At(charIdx);
1004 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
1005 // We have a CR LF
1006 // Need an extra increment of the input ptr to move over both of them
1007 charIdx++;
1008 }
1009 if (cp == u'\n' || cp == u'\r') {
1010 lineNum++;
1011 colStart = charIdx;
1012 charIdx++;
1013 break;
1014 }
1015
1016 // Let unescape handle the back slash.
1017 cp = testString.unescapeAt(charIdx);
1018 if (cp != -1) {
1019 // Escape sequence was recognized. Insert the char
1020 // into the test data.
1021 tp.dataToBreak.append(cp);
1022 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1023 tp.srcLine->addElement(lineNum, status);
1024 tp.srcCol ->addElement(column, status);
1025 }
1026 break;
1027 }
1028
1029
1030 // Not a recognized backslash escape sequence.
1031 // Take the next char as a literal.
1032 // TODO: Should this be an error?
1033 c = testString.charAt(charIdx);
1034 charIdx = testString.moveIndex32(charIdx, 1);
1035 }
1036
1037 // Normal, non-escaped data char.
1038 tp.dataToBreak.append(c);
1039
1040 // Save the mapping from offset in the data to line/column numbers in
1041 // the original input file. Will be used for better error messages only.
1042 // If there's an expected break before this char, the slot in the mapping
1043 // vector will already be set for this char; don't overwrite it.
1044 if (tp.dataToBreak.length() > tp.srcLine->size()) {
1045 tp.srcLine->addElement(lineNum, status);
1046 tp.srcCol ->addElement(column, status);
1047 }
1048 break;
1049
1050
1051 case PARSE_NUM:
1052 // We are parsing an expected numeric tag value, like <1234>,
1053 // within a chunk of data.
1054 if (u_isUWhiteSpace(c)) {
1055 break;
1056 }
1057
1058 if (c == u'>') {
1059 // Finished the number. Add the info to the expected break data,
1060 // and switch parse state back to doing plain data.
1061 parseState = PARSE_DATA;
1062 if (tagValue == 0) {
1063 tagValue = -1;
1064 }
1065 int32_t breakIdx = tp.dataToBreak.length();
1066 tp.expectedBreaks->setSize(breakIdx+1);
1067 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1068 tp.srcLine->setSize(breakIdx+1);
1069 tp.srcLine->setElementAt(lineNum, breakIdx);
1070 tp.srcCol ->setSize(breakIdx+1);
1071 tp.srcCol ->setElementAt(column, breakIdx);
1072 break;
1073 }
1074
1075 if (u_isdigit(c)) {
1076 tagValue = tagValue*10 + u_charDigitValue(c);
1077 break;
1078 }
1079
1080 errln("Syntax Error in test file at line %d, col %d",
1081 lineNum, column);
1082 parseState = PARSE_COMMENT;
1083 goto end_test; // Stop the test
1084 break;
1085 }
1086
1087
1088 if (U_FAILURE(status)) {
1089 dataerrln("ICU Error %s while parsing test file at line %d.",
1090 u_errorName(status), lineNum);
1091 status = U_ZERO_ERROR;
1092 goto end_test; // Stop the test
1093 }
1094
1095 }
1096
1097 // Reached end of test file. Raise an error if parseState indicates that we are
1098 // within a block that should have been terminated.
1099
1100 if (parseState == PARSE_RULES) {
1101 errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
1102 lineNum, rulesFirstLine);
1103 }
1104 if (parseState == PARSE_DATA) {
1105 errln("rbbitst.txt:%d <data> block not closed.", lineNum);
1106 }
1107
1108
1109 end_test:
1110 delete [] testFile;
1111 #endif
1112 }
1113
1114
1115 //-------------------------------------------------------------------------------
1116 //
1117 // TestDictRules create a break iterator from source rules that includes a
1118 // dictionary range. Regression for bug #7130. Source rules
//                    do not declare a break iterator type (word, line, sentence, etc.),
1120 // but the dictionary code, without a type, would loop.
1121 //
1122 //-------------------------------------------------------------------------------
TestDictRules()1123 void RBBITest::TestDictRules() {
1124 const char *rules = "$dictionary = [a-z]; \n"
1125 "!!forward; \n"
1126 "$dictionary $dictionary; \n"
1127 "!!reverse; \n"
1128 "$dictionary $dictionary; \n";
1129 const char *text = "aa";
1130 UErrorCode status = U_ZERO_ERROR;
1131 UParseError parseError;
1132
1133 RuleBasedBreakIterator bi(rules, parseError, status);
1134 if (U_SUCCESS(status)) {
1135 UnicodeString utext = text;
1136 bi.setText(utext);
1137 int32_t position;
1138 int32_t loops;
1139 for (loops = 0; loops<10; loops++) {
1140 position = bi.next();
1141 if (position == RuleBasedBreakIterator::DONE) {
1142 break;
1143 }
1144 }
1145 TEST_ASSERT(loops == 1);
1146 } else {
1147 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1148 }
1149 }
1150
1151
1152
1153 //-------------------------------------------------------------------------------
1154 //
1155 // ReadAndConvertFile Read a text data file, convert it to UChars, and
1156 // return the data in one big UChar * buffer, which the caller must delete.
1157 //
1158 // parameters:
1159 // fileName: the name of the file, with no directory part. The test data directory
1160 // is assumed.
1161 // ulen an out parameter, receives the actual length (in UChars) of the file data.
1162 // encoding The file encoding. If the file contains a BOM, that will override the encoding
1163 // specified here. The BOM, if it exists, will be stripped from the returned data.
1164 // Pass NULL for the system default encoding.
1165 // status
1166 // returns:
1167 // The file data, converted to UChar.
1168 // The caller must delete this when done with
1169 // delete [] theBuffer;
1170 //
1171 // TODO: This is a clone of RegexTest::ReadAndConvertFile.
1172 // Move this function to some common place.
1173 //
1174 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int & ulen,const char * encoding,UErrorCode & status)1175 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1176 UChar *retPtr = NULL;
1177 char *fileBuf = NULL;
1178 UConverter* conv = NULL;
1179 FILE *f = NULL;
1180
1181 ulen = 0;
1182 if (U_FAILURE(status)) {
1183 return retPtr;
1184 }
1185
1186 //
1187 // Open the file.
1188 //
1189 f = fopen(fileName, "rb");
1190 if (f == 0) {
1191 dataerrln("Error opening test data file %s\n", fileName);
1192 status = U_FILE_ACCESS_ERROR;
1193 return NULL;
1194 }
1195 //
1196 // Read it in
1197 //
1198 int fileSize;
1199 int amt_read;
1200
1201 fseek( f, 0, SEEK_END);
1202 fileSize = ftell(f);
1203 fileBuf = new char[fileSize];
1204 fseek(f, 0, SEEK_SET);
1205 amt_read = static_cast<int>(fread(fileBuf, 1, fileSize, f));
1206 if (amt_read != fileSize || fileSize <= 0) {
1207 errln("Error reading test data file.");
1208 goto cleanUpAndReturn;
1209 }
1210
1211 //
1212 // Look for a Unicode Signature (BOM) on the data just read
1213 //
1214 int32_t signatureLength;
1215 const char * fileBufC;
1216 const char* bomEncoding;
1217
1218 fileBufC = fileBuf;
1219 bomEncoding = ucnv_detectUnicodeSignature(
1220 fileBuf, fileSize, &signatureLength, &status);
1221 if(bomEncoding!=NULL ){
1222 fileBufC += signatureLength;
1223 fileSize -= signatureLength;
1224 encoding = bomEncoding;
1225 }
1226
1227 //
1228 // Open a converter to take the rule file to UTF-16
1229 //
1230 conv = ucnv_open(encoding, &status);
1231 if (U_FAILURE(status)) {
1232 goto cleanUpAndReturn;
1233 }
1234
1235 //
1236 // Convert the rules to UChar.
1237 // Preflight first to determine required buffer size.
1238 //
1239 ulen = ucnv_toUChars(conv,
1240 NULL, // dest,
1241 0, // destCapacity,
1242 fileBufC,
1243 fileSize,
1244 &status);
1245 if (status == U_BUFFER_OVERFLOW_ERROR) {
1246 // Buffer Overflow is expected from the preflight operation.
1247 status = U_ZERO_ERROR;
1248
1249 retPtr = new UChar[ulen+1];
1250 ucnv_toUChars(conv,
1251 retPtr, // dest,
1252 ulen+1,
1253 fileBufC,
1254 fileSize,
1255 &status);
1256 }
1257
1258 cleanUpAndReturn:
1259 fclose(f);
1260 delete []fileBuf;
1261 ucnv_close(conv);
1262 if (U_FAILURE(status)) {
1263 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1264 delete []retPtr;
1265 retPtr = 0;
1266 ulen = 0;
1267 }
1268 return retPtr;
1269 }
1270
1271
1272
1273 //--------------------------------------------------------------------------------------------
1274 //
1275 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
1276 //
1277 //-------------------------------------------------------------------------------------------
TestUnicodeFiles()1278 void RBBITest::TestUnicodeFiles() {
1279 RuleBasedBreakIterator *bi;
1280 UErrorCode status = U_ZERO_ERROR;
1281
1282 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1283 TEST_ASSERT_SUCCESS(status);
1284 if (U_SUCCESS(status)) {
1285 runUnicodeTestData("GraphemeBreakTest.txt", bi);
1286 }
1287 delete bi;
1288
1289 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1290 TEST_ASSERT_SUCCESS(status);
1291 if (U_SUCCESS(status)) {
1292 runUnicodeTestData("WordBreakTest.txt", bi);
1293 }
1294 delete bi;
1295
1296 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1297 TEST_ASSERT_SUCCESS(status);
1298 if (U_SUCCESS(status)) {
1299 runUnicodeTestData("SentenceBreakTest.txt", bi);
1300 }
1301 delete bi;
1302
1303 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1304 TEST_ASSERT_SUCCESS(status);
1305 if (U_SUCCESS(status)) {
1306 runUnicodeTestData("LineBreakTest.txt", bi);
1307 }
1308 delete bi;
1309 }
1310
1311
1312 // Check for test cases from the Unicode test data files that are known to fail
1313 // and should be skipped as known issues because ICU does not fully implement
1314 // the Unicode specifications, or because ICU includes tailorings that differ from
1315 // the Unicode standard.
1316 //
1317 // Test cases are identified by the test data sequence, which tends to be more stable
1318 // across Unicode versions than the test file line numbers.
1319 //
1320 // The test case with ticket "10666" is a dummy, included as an example.
1321
testCaseIsKnownIssue(const UnicodeString & testCase,const char * fileName)1322 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1323 static struct TestCase {
1324 const char *fTicketNum;
1325 const char *fFileName;
1326 const UChar *fString;
1327 } badTestCases[] = {
1328 {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"}, // Fake example, for illustration.
1329 // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
1330 // This probably ultimately wants to be resolved by updating UAX-14, but in the mean time
1331 // ICU is out of sync with Unicode.
1332 {"8151", "LineBreakTest.txt", u"-#"},
1333 {"8151", "LineBreakTest.txt", u"\u002d\u0308\u0023"},
1334 {"8151", "LineBreakTest.txt", u"\u002d\u00a7"},
1335 {"8151", "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
1336 {"8151", "LineBreakTest.txt", u"\u002d\U00050005"},
1337 {"8151", "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
1338 {"8151", "LineBreakTest.txt", u"\u002d\u0e01"},
1339 {"8151", "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
1340
1341 // Issue ICU-12017 Improve line break around numbers
1342 {"12017", "LineBreakTest.txt", u"\u002C\u0030"}, // ",0"
1343 {"12017", "LineBreakTest.txt", u"\u002C\u0308\u0030"},
1344 {"12017", "LineBreakTest.txt", u"find .com"},
1345 {"12017", "LineBreakTest.txt", u"equals .35 cents"},
1346 {"12017", "LineBreakTest.txt", u"a.2 "},
1347 {"12017", "LineBreakTest.txt", u"a.2 \u0915"},
1348 {"12017", "LineBreakTest.txt", u"a.2 \u672C"},
1349 {"12017", "LineBreakTest.txt", u"a.2\u3000\u672C"},
1350 {"12017", "LineBreakTest.txt", u"a.2\u3000\u307E"},
1351 {"12017", "LineBreakTest.txt", u"a.2\u3000\u0033"},
1352 {"12017", "LineBreakTest.txt", u"A.1 \uBABB"},
1353 {"12017", "LineBreakTest.txt", u"\uBD24\uC5B4\u002E\u0020\u0041\u002E\u0032\u0020\uBCFC"},
1354 {"12017", "LineBreakTest.txt", u"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"},
1355 {"12017", "LineBreakTest.txt", u"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"},
1356 {"12017", "LineBreakTest.txt", u"a.2\u3000\u300C"},
1357 };
1358
1359 for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
1360 const TestCase &badCase = badTestCases[n];
1361 if (!strcmp(fileName, badCase.fFileName) &&
1362 testCase == UnicodeString(badCase.fString)) {
1363 return logKnownIssue(badCase.fTicketNum);
1364 }
1365 }
1366 return FALSE;
1367 }
1368
1369
1370 //--------------------------------------------------------------------------------------------
1371 //
1372 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
1373 //
1374 //-------------------------------------------------------------------------------------------
runUnicodeTestData(const char * fileName,RuleBasedBreakIterator * bi)1375 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1376 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1377 UErrorCode status = U_ZERO_ERROR;
1378
1379 //
1380 // Open and read the test data file, put it into a UnicodeString.
1381 //
1382 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1383 char testFileName[1000];
1384 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1385 dataerrln("Can't open test data. Path too long.");
1386 return;
1387 }
1388 strcpy(testFileName, testDataDirectory);
1389 strcat(testFileName, fileName);
1390
1391 logln("Opening data file %s\n", fileName);
1392
1393 int len;
1394 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1395 if (status != U_FILE_ACCESS_ERROR) {
1396 TEST_ASSERT_SUCCESS(status);
1397 TEST_ASSERT(testFile != NULL);
1398 }
1399 if (U_FAILURE(status) || testFile == NULL) {
1400 return; /* something went wrong, error already output */
1401 }
1402 UnicodeString testFileAsString(TRUE, testFile, len);
1403
1404 //
1405 // Parse the test data file using a regular expression.
1406 // Each kind of token is recognized in its own capture group; what type of item was scanned
1407 // is identified by which group had a match.
1408 //
1409 // Caputure Group # 1 2 3 4 5
1410 // Parses this item: divide x hex digits comment \n unrecognized \n
1411 //
1412 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1413 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1414 UnicodeString testString;
1415 UVector32 breakPositions(status);
1416 int lineNumber = 1;
1417 TEST_ASSERT_SUCCESS(status);
1418 if (U_FAILURE(status)) {
1419 return;
1420 }
1421
1422 //
1423 // Scan through each test case, building up the string to be broken in testString,
1424 // and the positions that should be boundaries in the breakPositions vector.
1425 //
1426 int spin = 0;
1427 while (tokenMatcher.find()) {
1428 if(tokenMatcher.hitEnd()) {
1429 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.
1430 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1431 and caused an infinite loop here on EBCDIC systems!
1432 */
1433 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1434 // return;
1435 }
1436 if (tokenMatcher.start(1, status) >= 0) {
1437 // Scanned a divide sign, indicating a break position in the test data.
1438 if (testString.length()>0) {
1439 breakPositions.addElement(testString.length(), status);
1440 }
1441 }
1442 else if (tokenMatcher.start(2, status) >= 0) {
1443 // Scanned an 'x', meaning no break at this position in the test data
1444 // Nothing to be done here.
1445 }
1446 else if (tokenMatcher.start(3, status) >= 0) {
1447 // Scanned Hex digits. Convert them to binary, append to the character data string.
1448 const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1449 int length = hexNumber.length();
1450 if (length<=8) {
1451 char buf[10];
1452 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1453 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1454 if (c<=0x10ffff) {
1455 testString.append(c);
1456 } else {
1457 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1458 fileName, lineNumber);
1459 }
1460 } else {
1461 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1462 fileName, lineNumber);
1463 }
1464 }
1465 else if (tokenMatcher.start(4, status) >= 0) {
1466 // Scanned to end of a line, possibly skipping over a comment in the process.
1467 // If the line from the file contained test data, run the test now.
1468 if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1469 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1470 }
1471
1472 // Clear out this test case.
1473 // The string and breakPositions vector will be refilled as the next
1474 // test case is parsed.
1475 testString.remove();
1476 breakPositions.removeAllElements();
1477 lineNumber++;
1478 } else {
1479 // Scanner catchall. Something unrecognized appeared on the line.
1480 char token[16];
1481 UnicodeString uToken = tokenMatcher.group(0, status);
1482 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1483 token[sizeof(token)-1] = 0;
1484 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token);
1485
1486 // Clean up, in preparation for continuing with the next line.
1487 testString.remove();
1488 breakPositions.removeAllElements();
1489 lineNumber++;
1490 }
1491 TEST_ASSERT_SUCCESS(status);
1492 if (U_FAILURE(status)) {
1493 break;
1494 }
1495 }
1496
1497 delete [] testFile;
1498 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1499 }
1500
1501 //--------------------------------------------------------------------------------------------
1502 //
1503 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1504 // test data files. Do only a simple, forward-only check -
1505 // this test is mostly to check that ICU and the Unicode
1506 // data agree with each other.
1507 //
1508 //--------------------------------------------------------------------------------------------
checkUnicodeTestCase(const char * testFileName,int lineNumber,const UnicodeString & testString,UVector32 * breakPositions,RuleBasedBreakIterator * bi)1509 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1510 const UnicodeString &testString, // Text data to be broken
1511 UVector32 *breakPositions, // Positions where breaks should be found.
1512 RuleBasedBreakIterator *bi) {
1513 int32_t pos; // Break Position in the test string
1514 int32_t expectedI = 0; // Index of expected break position in the vector of expected results.
1515 int32_t expectedPos; // Expected break position (index into test string)
1516
1517 bi->setText(testString);
1518 pos = bi->first();
1519 pos = bi->next();
1520
1521 while (pos != BreakIterator::DONE) {
1522 if (expectedI >= breakPositions->size()) {
1523 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1524 testFileName, lineNumber, pos);
1525 break;
1526 }
1527 expectedPos = breakPositions->elementAti(expectedI);
1528 if (pos < expectedPos) {
1529 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1530 testFileName, lineNumber, pos);
1531 break;
1532 }
1533 if (pos > expectedPos) {
1534 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1535 testFileName, lineNumber, expectedPos);
1536 break;
1537 }
1538 pos = bi->next();
1539 expectedI++;
1540 }
1541
1542 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1543 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1544 testFileName, lineNumber, breakPositions->elementAti(expectedI));
1545 }
1546 }
1547
1548
1549
1550 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1551 //---------------------------------------------------------------------------------------
1552 //
//   class RBBIMonkeyKind
1554 //
1555 // Monkey Test for Break Iteration
1556 // Abstract interface class. Concrete derived classes independently
1557 // implement the break rules for different iterator types.
1558 //
//      The Monkey Test itself doesn't know which type of break iterator it is
//      testing, but works purely in terms of the interface defined here.
1561 //
1562 //---------------------------------------------------------------------------------------
1563 class RBBIMonkeyKind {
1564 public:
1565 // Return a UVector of UnicodeSets, representing the character classes used
1566 // for this type of iterator.
1567 virtual UVector *charClasses() = 0;
1568
1569 // Set the test text on which subsequent calls to next() will operate
1570 virtual void setText(const UnicodeString &s) = 0;
1571
1572 // Find the next break postion, starting from the prev break position, or from zero.
1573 // Return -1 after reaching end of string.
1574 virtual int32_t next(int32_t i) = 0;
1575
1576 // Name of each character class, parallel with charClasses. Used for debugging output
1577 // of characters.
1578 virtual std::vector<std::string>& characterClassNames();
1579
1580 void setAppliedRule(int32_t position, const char* value);
1581
1582 std::string getAppliedRule(int32_t position);
1583
1584 virtual ~RBBIMonkeyKind();
1585 UErrorCode deferredStatus;
1586
1587 std::string classNameFromCodepoint(const UChar32 c);
1588 unsigned int maxClassNameSize();
1589
1590 protected:
1591 RBBIMonkeyKind();
1592 std::vector<std::string> classNames;
1593 std::vector<std::string> appliedRules;
1594
1595 // Clear `appliedRules` and fill it with empty strings in the size of test text.
1596 void prepareAppliedRules(int32_t size );
1597
1598 private:
1599
1600 };
1601
RBBIMonkeyKind()1602 RBBIMonkeyKind::RBBIMonkeyKind() {
1603 deferredStatus = U_ZERO_ERROR;
1604 }
1605
~RBBIMonkeyKind()1606 RBBIMonkeyKind::~RBBIMonkeyKind() {
1607 }
1608
characterClassNames()1609 std::vector<std::string>& RBBIMonkeyKind::characterClassNames() {
1610 return classNames;
1611 }
1612
prepareAppliedRules(int32_t size)1613 void RBBIMonkeyKind::prepareAppliedRules(int32_t size) {
1614 // Remove all the information in the `appliedRules`.
1615 appliedRules.clear();
1616 appliedRules.resize(size + 1);
1617 }
1618
setAppliedRule(int32_t position,const char * value)1619 void RBBIMonkeyKind::setAppliedRule(int32_t position, const char* value) {
1620 appliedRules[position] = value;
1621 }
1622
getAppliedRule(int32_t position)1623 std::string RBBIMonkeyKind::getAppliedRule(int32_t position){
1624 return appliedRules[position];
1625 }
1626
classNameFromCodepoint(const UChar32 c)1627 std::string RBBIMonkeyKind::classNameFromCodepoint(const UChar32 c) {
1628 // Simply iterate through charClasses to find character's class
1629 for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1630 UnicodeSet *classSet = (UnicodeSet *)charClasses()->elementAt(aClassNum);
1631 if (classSet->contains(c)) {
1632 return classNames[aClassNum];
1633 }
1634 }
1635 U_ASSERT(FALSE); // This should not happen.
1636 return "bad class name";
1637 }
1638
maxClassNameSize()1639 unsigned int RBBIMonkeyKind::maxClassNameSize() {
1640 unsigned int maxSize = 0;
1641 for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1642 auto aClassNumSize = static_cast<unsigned int>(classNames[aClassNum].size());
1643 if (aClassNumSize > maxSize) {
1644 maxSize = aClassNumSize;
1645 }
1646 }
1647 return maxSize;
1648 }
1649
1650 //----------------------------------------------------------------------------------------
1651 //
1652 // Random Numbers. Similar to standard lib rand() and srand()
1653 // Not using library to
1654 // 1. Get same results on all platforms.
1655 // 2. Get access to current seed, to more easily reproduce failures.
1656 //
1657 //---------------------------------------------------------------------------------------
// Current generator state.  Assign to it to reseed.
static uint32_t m_seed = 1;

// Advance the linear congruential generator by one step and return
// bits 16..30 of the new state, i.e. a value in the range 0..32767.
static uint32_t m_rand()
{
    m_seed = m_seed * 1103515245 + 12345;   // unsigned arithmetic, wraps modulo 2^32
    return (m_seed >> 16) & 0x7fffu;
}
1665
1666
1667 //------------------------------------------------------------------------------------------
1668 //
1669 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
1670 // of RBBIMonkeyKind.
1671 //
1672 //------------------------------------------------------------------------------------------
1673 class RBBICharMonkey: public RBBIMonkeyKind {
1674 public:
1675 RBBICharMonkey();
1676 virtual ~RBBICharMonkey();
1677 virtual UVector *charClasses();
1678 virtual void setText(const UnicodeString &s);
1679 virtual int32_t next(int32_t i);
1680 private:
1681 UVector *fSets;
1682
1683 UnicodeSet *fCRLFSet;
1684 UnicodeSet *fControlSet;
1685 UnicodeSet *fExtendSet;
1686 UnicodeSet *fZWJSet;
1687 UnicodeSet *fRegionalIndicatorSet;
1688 UnicodeSet *fPrependSet;
1689 UnicodeSet *fSpacingSet;
1690 UnicodeSet *fLSet;
1691 UnicodeSet *fVSet;
1692 UnicodeSet *fTSet;
1693 UnicodeSet *fLVSet;
1694 UnicodeSet *fLVTSet;
1695 UnicodeSet *fHangulSet;
1696 UnicodeSet *fExtendedPictSet;
1697 UnicodeSet *fViramaSet;
1698 UnicodeSet *fLinkingConsonantSet;
1699 UnicodeSet *fExtCccZwjSet;
1700 UnicodeSet *fAnySet;
1701
1702 const UnicodeString *fText;
1703 };
1704
1705
RBBICharMonkey()1706 RBBICharMonkey::RBBICharMonkey() {
1707 UErrorCode status = U_ZERO_ERROR;
1708
1709 fText = NULL;
1710
1711 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1712 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
1713 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
1714 fZWJSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
1715 fRegionalIndicatorSet =
1716 new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1717 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1718 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1719 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1720 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1721 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1722 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1723 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1724 fHangulSet = new UnicodeSet();
1725 fHangulSet->addAll(*fLSet);
1726 fHangulSet->addAll(*fVSet);
1727 fHangulSet->addAll(*fTSet);
1728 fHangulSet->addAll(*fLVSet);
1729 fHangulSet->addAll(*fLVTSet);
1730
1731 fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1732 fViramaSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1733 "\\p{Indic_Syllabic_Category=Virama}]", status);
1734 fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1735 "\\p{Indic_Syllabic_Category=Consonant}]", status);
1736 fExtCccZwjSet = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status);
1737 fAnySet = new UnicodeSet(0, 0x10ffff);
1738
1739 // Create sets of characters, and add the names of the above character sets.
1740 // In each new ICU release, add new names corresponding to the sets above.
1741 fSets = new UVector(status);
1742
1743 // Important: Keep class names the same as the class contents.
1744 fSets->addElement(fCRLFSet, status); classNames.push_back("CRLF");
1745 fSets->addElement(fControlSet, status); classNames.push_back("Control");
1746 fSets->addElement(fExtendSet, status); classNames.push_back("Extended");
1747 fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
1748 if (!fPrependSet->isEmpty()) {
1749 fSets->addElement(fPrependSet, status); classNames.push_back("Prepend");
1750 }
1751 fSets->addElement(fSpacingSet, status); classNames.push_back("Spacing");
1752 fSets->addElement(fHangulSet, status); classNames.push_back("Hangul");
1753 fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
1754 fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
1755 fSets->addElement(fViramaSet, status); classNames.push_back("Virama");
1756 fSets->addElement(fLinkingConsonantSet, status); classNames.push_back("LinkingConsonant");
1757 fSets->addElement(fExtCccZwjSet, status); classNames.push_back("ExtCcccZwj");
1758 fSets->addElement(fAnySet, status); classNames.push_back("Any");
1759
1760 if (U_FAILURE(status)) {
1761 deferredStatus = status;
1762 }
1763 }
1764
1765
setText(const UnicodeString & s)1766 void RBBICharMonkey::setText(const UnicodeString &s) {
1767 fText = &s;
1768 prepareAppliedRules(s.length());
1769 }
1770
1771
1772
next(int32_t prevPos)1773 int32_t RBBICharMonkey::next(int32_t prevPos) {
1774 int p0, p1, p2, p3; // Indices of the significant code points around the
1775 // break position being tested. The candidate break
1776 // location is before p2.
1777
1778 int breakPos = -1;
1779
1780 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
1781 UChar32 cBase; // for (X Extend*) patterns, the X character.
1782
1783 if (U_FAILURE(deferredStatus)) {
1784 return -1;
1785 }
1786
1787 // Previous break at end of string. return DONE.
1788 if (prevPos >= fText->length()) {
1789 return -1;
1790 }
1791
1792 p0 = p1 = p2 = p3 = prevPos;
1793 c3 = fText->char32At(prevPos);
1794 c0 = c1 = c2 = cBase = 0;
1795 (void)p0; // suppress set but not used warning.
1796 (void)c0;
1797
1798 // Loop runs once per "significant" character position in the input text.
1799 for (;;) {
1800 // Move all of the positions forward in the input string.
1801 p0 = p1; c0 = c1;
1802 p1 = p2; c1 = c2;
1803 p2 = p3; c2 = c3;
1804
1805 // Advance p3 by one codepoint
1806 p3 = fText->moveIndex32(p3, 1);
1807 c3 = fText->char32At(p3);
1808
1809 if (p1 == p2) {
1810 // Still warming up the loop. (won't work with zero length strings, but we don't care)
1811 continue;
1812 }
1813
1814 if (p2 == fText->length()) {
1815 setAppliedRule(p2, "End of String");
1816 break;
1817 }
1818
1819 // No Extend or Format characters may appear between the CR and LF,
1820 // which requires the additional check for p2 immediately following p1.
1821 //
1822 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
1823 setAppliedRule(p2, "GB3 CR x LF");
1824 continue;
1825 }
1826
1827 if (fControlSet->contains(c1) ||
1828 c1 == 0x0D ||
1829 c1 == 0x0A) {
1830 setAppliedRule(p2, "GB4 ( Control | CR | LF ) <break>");
1831 break;
1832 }
1833
1834 if (fControlSet->contains(c2) ||
1835 c2 == 0x0D ||
1836 c2 == 0x0A) {
1837 setAppliedRule(p2, "GB5 <break> ( Control | CR | LF )");
1838 break;
1839 }
1840
1841 if (fLSet->contains(c1) &&
1842 (fLSet->contains(c2) ||
1843 fVSet->contains(c2) ||
1844 fLVSet->contains(c2) ||
1845 fLVTSet->contains(c2))) {
1846 setAppliedRule(p2, "GB6 L x ( L | V | LV | LVT )");
1847 continue;
1848 }
1849
1850 if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1851 (fVSet->contains(c2) || fTSet->contains(c2))) {
1852 setAppliedRule(p2, "GB7 ( LV | V ) x ( V | T )");
1853 continue;
1854 }
1855
1856 if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1857 fTSet->contains(c2)) {
1858 setAppliedRule(p2, "GB8 ( LVT | T) x T");
1859 continue;
1860 }
1861
1862 if (fExtendSet->contains(c2) || fZWJSet->contains(c2)) {
1863 if (!fExtendSet->contains(c1)) {
1864 cBase = c1;
1865 }
1866 setAppliedRule(p2, "GB9 x (Extend | ZWJ)");
1867 continue;
1868 }
1869
1870 if (fSpacingSet->contains(c2)) {
1871 setAppliedRule(p2, "GB9a x SpacingMark");
1872 continue;
1873 }
1874
1875 if (fPrependSet->contains(c1)) {
1876 setAppliedRule(p2, "GB9b Prepend x");
1877 continue;
1878 }
1879
1880 // Note: Viramas are also included in the ExtCccZwj class.
1881 if (fLinkingConsonantSet->contains(c2)) {
1882 int pi = p1;
1883 bool sawVirama = false;
1884 while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) {
1885 if (fViramaSet->contains(fText->char32At(pi))) {
1886 sawVirama = true;
1887 }
1888 pi = fText->moveIndex32(pi, -1);
1889 }
1890 if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) {
1891 setAppliedRule(p2, "GB9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant");
1892 continue;
1893 }
1894 }
1895
1896 if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
1897 setAppliedRule(p2, "GB11 Extended_Pictographic Extend * ZWJ x Extended_Pictographic");
1898 continue;
1899 }
1900
1901 // Note: The first if condition is a little tricky. We only need to force
1902 // a break if there are three or more contiguous RIs. If there are
1903 // only two, a break following will occur via other rules, and will include
1904 // any trailing extend characters, which is needed behavior.
1905 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
1906 && fRegionalIndicatorSet->contains(c2)) {
1907 setAppliedRule(p2, "GB12-13 Regional_Indicator x Regional_Indicator");
1908 break;
1909 }
1910 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
1911 setAppliedRule(p2, "GB12-13 Regional_Indicator x Regional_Indicator");
1912 continue;
1913 }
1914
1915 setAppliedRule(p2, "GB999 Any <break> Any");
1916 break;
1917 }
1918
1919 breakPos = p2;
1920 return breakPos;
1921 }
1922
1923
1924
charClasses()1925 UVector *RBBICharMonkey::charClasses() {
1926 return fSets;
1927 }
1928
~RBBICharMonkey()1929 RBBICharMonkey::~RBBICharMonkey() {
1930 delete fSets;
1931 delete fCRLFSet;
1932 delete fControlSet;
1933 delete fExtendSet;
1934 delete fRegionalIndicatorSet;
1935 delete fPrependSet;
1936 delete fSpacingSet;
1937 delete fLSet;
1938 delete fVSet;
1939 delete fTSet;
1940 delete fLVSet;
1941 delete fLVTSet;
1942 delete fHangulSet;
1943 delete fAnySet;
1944 delete fZWJSet;
1945 delete fExtendedPictSet;
1946 delete fViramaSet;
1947 delete fLinkingConsonantSet;
1948 delete fExtCccZwjSet;
1949 }
1950
1951 //------------------------------------------------------------------------------------------
1952 //
1953 // class RBBIWordMonkey Word Break specific implementation
1954 // of RBBIMonkeyKind.
1955 //
1956 //------------------------------------------------------------------------------------------
1957 class RBBIWordMonkey: public RBBIMonkeyKind {
1958 public:
1959 RBBIWordMonkey();
1960 virtual ~RBBIWordMonkey();
1961 virtual UVector *charClasses();
1962 virtual void setText(const UnicodeString &s);
1963 virtual int32_t next(int32_t i);
1964 private:
1965 UVector *fSets;
1966
1967 UnicodeSet *fCRSet;
1968 UnicodeSet *fLFSet;
1969 UnicodeSet *fNewlineSet;
1970 UnicodeSet *fRegionalIndicatorSet;
1971 UnicodeSet *fKatakanaSet;
1972 UnicodeSet *fHebrew_LetterSet;
1973 UnicodeSet *fALetterSet;
1974 UnicodeSet *fSingle_QuoteSet;
1975 UnicodeSet *fDouble_QuoteSet;
1976 UnicodeSet *fMidNumLetSet;
1977 UnicodeSet *fMidLetterSet;
1978 UnicodeSet *fMidNumSet;
1979 UnicodeSet *fNumericSet;
1980 UnicodeSet *fFormatSet;
1981 UnicodeSet *fOtherSet;
1982 UnicodeSet *fExtendSet;
1983 UnicodeSet *fExtendNumLetSet;
1984 UnicodeSet *fWSegSpaceSet;
1985 UnicodeSet *fDictionarySet;
1986 UnicodeSet *fZWJSet;
1987 UnicodeSet *fExtendedPictSet;
1988
1989 const UnicodeString *fText;
1990 };
1991
1992
RBBIWordMonkey()1993 RBBIWordMonkey::RBBIWordMonkey()
1994 {
1995 UErrorCode status = U_ZERO_ERROR;
1996
1997 fSets = new UVector(status);
1998
1999 fCRSet = new UnicodeSet(u"[\\p{Word_Break = CR}]", status);
2000 fLFSet = new UnicodeSet(u"[\\p{Word_Break = LF}]", status);
2001 fNewlineSet = new UnicodeSet(u"[\\p{Word_Break = Newline}]", status);
2002 fKatakanaSet = new UnicodeSet(u"[\\p{Word_Break = Katakana}]", status);
2003 fRegionalIndicatorSet = new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
2004 fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
2005 fALetterSet = new UnicodeSet(u"[\\p{Word_Break = ALetter}]", status);
2006 fSingle_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]", status);
2007 fDouble_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]", status);
2008 fMidNumLetSet = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]", status);
2009 fMidLetterSet = new UnicodeSet(u"[\\p{Word_Break = MidLetter}]", status);
2010 fMidNumSet = new UnicodeSet(u"[\\p{Word_Break = MidNum}]", status);
2011 fNumericSet = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
2012 fFormatSet = new UnicodeSet(u"[\\p{Word_Break = Format}]", status);
2013 fExtendNumLetSet = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
2014 // There are some sc=Hani characters with WB=Extend.
2015 // The break rules need to pick one or the other because
2016 // Extend overlapping with something else is messy.
2017 // For Unicode 13, we chose to keep U+16FF0 & U+16FF1
2018 // in $Han (for $dictionary) and out of $Extend.
2019 fExtendSet = new UnicodeSet(u"[\\p{Word_Break = Extend}-[:Hani:]]", status);
2020 fWSegSpaceSet = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]", status);
2021
2022 fZWJSet = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]", status);
2023 fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
2024
2025 fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
2026 fDictionarySet->addAll(*fKatakanaSet);
2027 fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
2028
2029 fALetterSet->removeAll(*fDictionarySet);
2030
2031 fOtherSet = new UnicodeSet();
2032 if(U_FAILURE(status)) {
2033 IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
2034 deferredStatus = status;
2035 return;
2036 }
2037
2038 fOtherSet->complement();
2039 fOtherSet->removeAll(*fCRSet);
2040 fOtherSet->removeAll(*fLFSet);
2041 fOtherSet->removeAll(*fNewlineSet);
2042 fOtherSet->removeAll(*fKatakanaSet);
2043 fOtherSet->removeAll(*fHebrew_LetterSet);
2044 fOtherSet->removeAll(*fALetterSet);
2045 fOtherSet->removeAll(*fSingle_QuoteSet);
2046 fOtherSet->removeAll(*fDouble_QuoteSet);
2047 fOtherSet->removeAll(*fMidLetterSet);
2048 fOtherSet->removeAll(*fMidNumSet);
2049 fOtherSet->removeAll(*fNumericSet);
2050 fOtherSet->removeAll(*fExtendNumLetSet);
2051 fOtherSet->removeAll(*fWSegSpaceSet);
2052 fOtherSet->removeAll(*fFormatSet);
2053 fOtherSet->removeAll(*fExtendSet);
2054 fOtherSet->removeAll(*fRegionalIndicatorSet);
2055 fOtherSet->removeAll(*fZWJSet);
2056 fOtherSet->removeAll(*fExtendedPictSet);
2057
2058 // Inhibit dictionary characters from being tested at all.
2059 fOtherSet->removeAll(*fDictionarySet);
2060
2061 // Add classes and their names
2062 fSets->addElement(fCRSet, status); classNames.push_back("CR");
2063 fSets->addElement(fLFSet, status); classNames.push_back("LF");
2064 fSets->addElement(fNewlineSet, status); classNames.push_back("Newline");
2065 fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
2066 fSets->addElement(fHebrew_LetterSet, status); classNames.push_back("Hebrew");
2067 fSets->addElement(fALetterSet, status); classNames.push_back("ALetter");
2068 fSets->addElement(fSingle_QuoteSet, status); classNames.push_back("Single Quote");
2069 fSets->addElement(fDouble_QuoteSet, status); classNames.push_back("Double Quote");
2070 // Omit Katakana from fSets, which omits Katakana characters
2071 // from the test data. They are all in the dictionary set,
2072 // which this (old, to be retired) monkey test cannot handle.
2073 //fSets->addElement(fKatakanaSet, status);
2074
2075 fSets->addElement(fMidLetterSet, status); classNames.push_back("MidLetter");
2076 fSets->addElement(fMidNumLetSet, status); classNames.push_back("MidNumLet");
2077 fSets->addElement(fMidNumSet, status); classNames.push_back("MidNum");
2078 fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
2079 fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2080 fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2081 fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2082 fSets->addElement(fExtendNumLetSet, status); classNames.push_back("ExtendNumLet");
2083 fSets->addElement(fWSegSpaceSet, status); classNames.push_back("WSegSpace");
2084
2085 fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
2086 fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
2087
2088 if (U_FAILURE(status)) {
2089 deferredStatus = status;
2090 }
2091 }
2092
setText(const UnicodeString & s)2093 void RBBIWordMonkey::setText(const UnicodeString &s) {
2094 fText = &s;
2095 prepareAppliedRules(s.length());
2096 }
2097
2098
next(int32_t prevPos)2099 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2100 int p0, p1, p2, p3; // Indices of the significant code points around the
2101 // break position being tested. The candidate break
2102 // location is before p2.
2103
2104 int breakPos = -1;
2105
2106 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2107
2108 if (U_FAILURE(deferredStatus)) {
2109 return -1;
2110 }
2111
2112 // Prev break at end of string. return DONE.
2113 if (prevPos >= fText->length()) {
2114 return -1;
2115 }
2116 p0 = p1 = p2 = p3 = prevPos;
2117 c3 = fText->char32At(prevPos);
2118 c0 = c1 = c2 = 0;
2119 (void)p0; // Suppress set but not used warning.
2120
2121 // Loop runs once per "significant" character position in the input text.
2122 for (;;) {
2123 // Move all of the positions forward in the input string.
2124 p0 = p1; c0 = c1;
2125 p1 = p2; c1 = c2;
2126 p2 = p3; c2 = c3;
2127
2128 // Advance p3 by X(Extend | Format)* Rule 4
2129 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2130 do {
2131 p3 = fText->moveIndex32(p3, 1);
2132 c3 = fText->char32At(p3);
2133 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2134 break;
2135 }
2136 }
2137 while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
2138
2139
2140 if (p1 == p2) {
2141 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2142 continue;
2143 }
2144
2145 if (p2 == fText->length()) {
2146 // Reached end of string. Always a break position.
2147 break;
2148 }
2149
2150 // No Extend or Format characters may appear between the CR and LF,
2151 // which requires the additional check for p2 immediately following p1.
2152 //
2153 if (c1==0x0D && c2==0x0A) {
2154 setAppliedRule(p2, "WB3 CR x LF");
2155 continue;
2156 }
2157
2158 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2159 setAppliedRule(p2, "WB3a Break before and after newlines (including CR and LF)");
2160 break;
2161 }
2162 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2163 setAppliedRule(p2, "WB3a Break before and after newlines (including CR and LF)");
2164 break;
2165 }
2166
2167 // Not ignoring extend chars, so peek into input text to
2168 // get the potential ZWJ, the character immediately preceding c2.
2169 // Sloppy UChar32 indexing: p2-1 may reference trail half
2170 // but char32At will get the full code point.
2171 if (fZWJSet->contains(fText->char32At(p2 - 1)) && fExtendedPictSet->contains(c2)){
2172 setAppliedRule(p2, "WB3c ZWJ x Extended_Pictographic");
2173 continue;
2174 }
2175
2176 if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
2177 setAppliedRule(p2, "WB3d Keep horizontal whitespace together.");
2178 continue;
2179 }
2180
2181 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2182 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2183 setAppliedRule(p2, "WB4 (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)");
2184 continue;
2185 }
2186
2187 if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2188 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2189 (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2190 setAppliedRule(p2,
2191 "WB6 (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter _Letter)");
2192 continue;
2193 }
2194
2195 if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2196 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2197 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2198 setAppliedRule(p2,
2199 "WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)");
2200 continue;
2201 }
2202
2203 if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2204 setAppliedRule(p2, "WB7a Hebrew_Letter x Single_Quote");
2205 continue;
2206 }
2207
2208 if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2209 setAppliedRule(p2, "WB7b Hebrew_Letter x Double_Quote Hebrew_Letter");
2210 continue;
2211 }
2212
2213 if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2214 setAppliedRule(p2, "WB7c Hebrew_Letter Double_Quote x Hebrew_Letter");
2215 continue;
2216 }
2217
2218 if (fNumericSet->contains(c1) &&
2219 fNumericSet->contains(c2)) {
2220 setAppliedRule(p2, "WB8 Numeric x Numeric");
2221 continue;
2222 }
2223
2224 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2225 fNumericSet->contains(c2)) {
2226 setAppliedRule(p2, "WB9 (ALetter | Hebrew_Letter) x Numeric");
2227 continue;
2228 }
2229
2230 if (fNumericSet->contains(c1) &&
2231 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2232 setAppliedRule(p2, "WB10 Numeric x (ALetter | Hebrew_Letter)");
2233 continue;
2234 }
2235
2236 if (fNumericSet->contains(c0) &&
2237 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2238 fNumericSet->contains(c2)) {
2239 setAppliedRule(p2, "WB11 Numeric (MidNum | MidNumLet | Single_Quote) x Numeric");
2240 continue;
2241 }
2242
2243 if (fNumericSet->contains(c1) &&
2244 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2245 fNumericSet->contains(c3)) {
2246 setAppliedRule(p2, "WB12 Numeric x (MidNum | MidNumLet | SingleQuote) Numeric");
2247 continue;
2248 }
2249
2250 // Note: matches UAX 29 rules, but doesn't come into play for ICU because
2251 // all Katakana are handled by the dictionary breaker.
2252 if (fKatakanaSet->contains(c1) &&
2253 fKatakanaSet->contains(c2)) {
2254 setAppliedRule(p2, "WB13 Katakana x Katakana");
2255 continue;
2256 }
2257
2258 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2259 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2260 fExtendNumLetSet->contains(c2)) {
2261 setAppliedRule(p2,
2262 "WB13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet");
2263 continue;
2264 }
2265
2266 if (fExtendNumLetSet->contains(c1) &&
2267 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2268 fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) {
2269 setAppliedRule(p2, "WB13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)");
2270 continue;
2271 }
2272
2273 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2274 setAppliedRule(p2, "WB15 - WB17 Group pairs of Regional Indicators.");
2275 break;
2276 }
2277 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2278 setAppliedRule(p2, "WB15 - WB17 Group pairs of Regional Indicators.");
2279 continue;
2280 }
2281
2282 setAppliedRule(p2, "WB999");
2283 break;
2284 }
2285
2286 breakPos = p2;
2287 return breakPos;
2288 }
2289
2290
charClasses()2291 UVector *RBBIWordMonkey::charClasses() {
2292 return fSets;
2293 }
2294
~RBBIWordMonkey()2295 RBBIWordMonkey::~RBBIWordMonkey() {
2296 delete fSets;
2297 delete fCRSet;
2298 delete fLFSet;
2299 delete fNewlineSet;
2300 delete fKatakanaSet;
2301 delete fHebrew_LetterSet;
2302 delete fALetterSet;
2303 delete fSingle_QuoteSet;
2304 delete fDouble_QuoteSet;
2305 delete fMidNumLetSet;
2306 delete fMidLetterSet;
2307 delete fMidNumSet;
2308 delete fNumericSet;
2309 delete fFormatSet;
2310 delete fExtendSet;
2311 delete fExtendNumLetSet;
2312 delete fWSegSpaceSet;
2313 delete fRegionalIndicatorSet;
2314 delete fDictionarySet;
2315 delete fOtherSet;
2316 delete fZWJSet;
2317 delete fExtendedPictSet;
2318 }
2319
2320
2321
2322
2323 //------------------------------------------------------------------------------------------
2324 //
2325 // class RBBISentMonkey Sentence Break specific implementation
2326 // of RBBIMonkeyKind.
2327 //
2328 //------------------------------------------------------------------------------------------
2329 class RBBISentMonkey: public RBBIMonkeyKind {
2330 public:
2331 RBBISentMonkey();
2332 virtual ~RBBISentMonkey();
2333 virtual UVector *charClasses();
2334 virtual void setText(const UnicodeString &s);
2335 virtual int32_t next(int32_t i);
2336 private:
2337 int moveBack(int posFrom);
2338 int moveForward(int posFrom);
2339 UChar32 cAt(int pos);
2340
2341 UVector *fSets;
2342
2343 UnicodeSet *fSepSet;
2344 UnicodeSet *fFormatSet;
2345 UnicodeSet *fSpSet;
2346 UnicodeSet *fLowerSet;
2347 UnicodeSet *fUpperSet;
2348 UnicodeSet *fOLetterSet;
2349 UnicodeSet *fNumericSet;
2350 UnicodeSet *fATermSet;
2351 UnicodeSet *fSContinueSet;
2352 UnicodeSet *fSTermSet;
2353 UnicodeSet *fCloseSet;
2354 UnicodeSet *fOtherSet;
2355 UnicodeSet *fExtendSet;
2356
2357 const UnicodeString *fText;
2358 };
2359
RBBISentMonkey()2360 RBBISentMonkey::RBBISentMonkey()
2361 {
2362 UErrorCode status = U_ZERO_ERROR;
2363
2364 fSets = new UVector(status);
2365
2366 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2367 // set and made into character classes of their own. For the monkey impl,
2368 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2369 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status);
2370 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status);
2371 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status);
2372 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status);
2373 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status);
2374 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status);
2375 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status);
2376 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status);
2377 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2378 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status);
2379 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status);
2380 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status);
2381 fOtherSet = new UnicodeSet();
2382
2383 if(U_FAILURE(status)) {
2384 deferredStatus = status;
2385 return;
2386 }
2387
2388 fOtherSet->complement();
2389 fOtherSet->removeAll(*fSepSet);
2390 fOtherSet->removeAll(*fFormatSet);
2391 fOtherSet->removeAll(*fSpSet);
2392 fOtherSet->removeAll(*fLowerSet);
2393 fOtherSet->removeAll(*fUpperSet);
2394 fOtherSet->removeAll(*fOLetterSet);
2395 fOtherSet->removeAll(*fNumericSet);
2396 fOtherSet->removeAll(*fATermSet);
2397 fOtherSet->removeAll(*fSContinueSet);
2398 fOtherSet->removeAll(*fSTermSet);
2399 fOtherSet->removeAll(*fCloseSet);
2400 fOtherSet->removeAll(*fExtendSet);
2401
2402 fSets->addElement(fSepSet, status); classNames.push_back("Sep");
2403 fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2404 fSets->addElement(fSpSet, status); classNames.push_back("Sp");
2405 fSets->addElement(fLowerSet, status); classNames.push_back("Lower");
2406 fSets->addElement(fUpperSet, status); classNames.push_back("Upper");
2407 fSets->addElement(fOLetterSet, status); classNames.push_back("OLetter");
2408 fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
2409 fSets->addElement(fATermSet, status); classNames.push_back("ATerm");
2410 fSets->addElement(fSContinueSet, status); classNames.push_back("SContinue");
2411 fSets->addElement(fSTermSet, status); classNames.push_back("STerm");
2412 fSets->addElement(fCloseSet, status); classNames.push_back("Close");
2413 fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2414 fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2415
2416 if (U_FAILURE(status)) {
2417 deferredStatus = status;
2418 }
2419 }
2420
2421
2422
setText(const UnicodeString & s)2423 void RBBISentMonkey::setText(const UnicodeString &s) {
2424 fText = &s;
2425 prepareAppliedRules(s.length());
2426 }
2427
charClasses()2428 UVector *RBBISentMonkey::charClasses() {
2429 return fSets;
2430 }
2431
2432 // moveBack() Find the "significant" code point preceding the index i.
2433 // Skips over ($Extend | $Format)* .
2434 //
moveBack(int i)2435 int RBBISentMonkey::moveBack(int i) {
2436 if (i <= 0) {
2437 return -1;
2438 }
2439 UChar32 c;
2440 int32_t j = i;
2441 do {
2442 j = fText->moveIndex32(j, -1);
2443 c = fText->char32At(j);
2444 }
2445 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2446 return j;
2447
2448 }
2449
2450
moveForward(int i)2451 int RBBISentMonkey::moveForward(int i) {
2452 if (i>=fText->length()) {
2453 return fText->length();
2454 }
2455 UChar32 c;
2456 int32_t j = i;
2457 do {
2458 j = fText->moveIndex32(j, 1);
2459 c = cAt(j);
2460 }
2461 while (fFormatSet->contains(c) || fExtendSet->contains(c));
2462 return j;
2463 }
2464
cAt(int pos)2465 UChar32 RBBISentMonkey::cAt(int pos) {
2466 if (pos<0 || pos>=fText->length()) {
2467 return -1;
2468 } else {
2469 return fText->char32At(pos);
2470 }
2471 }
2472
next(int32_t prevPos)2473 int32_t RBBISentMonkey::next(int32_t prevPos) {
2474 int p0, p1, p2, p3; // Indices of the significant code points around the
2475 // break position being tested. The candidate break
2476 // location is before p2.
2477
2478 int breakPos = -1;
2479
2480 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2481 UChar32 c;
2482
2483 if (U_FAILURE(deferredStatus)) {
2484 return -1;
2485 }
2486
2487 // Prev break at end of string. return DONE.
2488 if (prevPos >= fText->length()) {
2489 return -1;
2490 }
2491 p0 = p1 = p2 = p3 = prevPos;
2492 c3 = fText->char32At(prevPos);
2493 c0 = c1 = c2 = 0;
2494 (void)p0; // Suppress set but not used warning.
2495
2496 // Loop runs once per "significant" character position in the input text.
2497 for (;;) {
2498 // Move all of the positions forward in the input string.
2499 p0 = p1; c0 = c1;
2500 p1 = p2; c1 = c2;
2501 p2 = p3; c2 = c3;
2502
2503 // Advance p3 by X(Extend | Format)* Rule 4
2504 p3 = moveForward(p3);
2505 c3 = cAt(p3);
2506
2507 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2508 setAppliedRule(p2, "SB3 CR x LF");
2509 continue;
2510 }
2511
2512 if (fSepSet->contains(c1)) {
2513 p2 = p1+1; // Separators don't combine with Extend or Format.
2514
2515 setAppliedRule(p2, "SB4 Sep <break>");
2516 break;
2517 }
2518
2519 if (p2 >= fText->length()) {
2520 // Reached end of string. Always a break position.
2521 setAppliedRule(p2, "SB4 Sep <break>");
2522 break;
2523 }
2524
2525 if (p2 == prevPos) {
2526 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2527 setAppliedRule(p2, "SB4 Sep <break>");
2528 continue;
2529 }
2530
2531 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) {
2532 setAppliedRule(p2, "SB6 ATerm x Numeric");
2533 continue;
2534 }
2535
2536 if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2537 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2538 setAppliedRule(p2, "SB7 (Upper | Lower) ATerm x Uppper");
2539 continue;
2540 }
2541
2542 // Note: STerm | ATerm are added to the negated part of the expression by a
2543 // note to the Unicode 5.0 documents.
2544 int p8 = p1;
2545 while (fSpSet->contains(cAt(p8))) {
2546 p8 = moveBack(p8);
2547 }
2548 while (fCloseSet->contains(cAt(p8))) {
2549 p8 = moveBack(p8);
2550 }
2551 if (fATermSet->contains(cAt(p8))) {
2552 p8=p2;
2553 for (;;) {
2554 c = cAt(p8);
2555 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2556 fLowerSet->contains(c) || fSepSet->contains(c) ||
2557 fATermSet->contains(c) || fSTermSet->contains(c)) {
2558
2559 setAppliedRule(p2,
2560 "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2561 break;
2562 }
2563 p8 = moveForward(p8);
2564 }
2565 if (fLowerSet->contains(cAt(p8))) {
2566
2567 setAppliedRule(p2,
2568 "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2569 continue;
2570 }
2571 }
2572
2573 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2574 p8 = p1;
2575 while (fSpSet->contains(cAt(p8))) {
2576 p8 = moveBack(p8);
2577 }
2578 while (fCloseSet->contains(cAt(p8))) {
2579 p8 = moveBack(p8);
2580 }
2581 c = cAt(p8);
2582 if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2583 setAppliedRule(p2, "SB8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm)");
2584 continue;
2585 }
2586 }
2587
2588 int p9 = p1;
2589 while (fCloseSet->contains(cAt(p9))) {
2590 p9 = moveBack(p9);
2591 }
2592 c = cAt(p9);
2593 if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2594 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2595
2596 setAppliedRule(p2, "SB9 (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)");
2597 continue;
2598 }
2599 }
2600
2601 int p10 = p1;
2602 while (fSpSet->contains(cAt(p10))) {
2603 p10 = moveBack(p10);
2604 }
2605 while (fCloseSet->contains(cAt(p10))) {
2606 p10 = moveBack(p10);
2607 }
2608 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2609 if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2610 setAppliedRule(p2, "SB10 (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)");
2611 continue;
2612 }
2613 }
2614
2615 int p11 = p1;
2616 if (fSepSet->contains(cAt(p11))) {
2617 p11 = moveBack(p11);
2618 }
2619 while (fSpSet->contains(cAt(p11))) {
2620 p11 = moveBack(p11);
2621 }
2622 while (fCloseSet->contains(cAt(p11))) {
2623 p11 = moveBack(p11);
2624 }
2625 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2626 setAppliedRule(p2, "SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>");
2627 break;
2628 }
2629
2630 setAppliedRule(p2, "SB12 Any x Any");
2631 continue;
2632 }
2633
2634 breakPos = p2;
2635 return breakPos;
2636 }
2637
~RBBISentMonkey()2638 RBBISentMonkey::~RBBISentMonkey() {
2639 delete fSets;
2640 delete fSepSet;
2641 delete fFormatSet;
2642 delete fSpSet;
2643 delete fLowerSet;
2644 delete fUpperSet;
2645 delete fOLetterSet;
2646 delete fNumericSet;
2647 delete fATermSet;
2648 delete fSContinueSet;
2649 delete fSTermSet;
2650 delete fCloseSet;
2651 delete fOtherSet;
2652 delete fExtendSet;
2653 }
2654
2655
2656
2657 //-------------------------------------------------------------------------------------------
2658 //
2659 // RBBILineMonkey
2660 //
2661 //-------------------------------------------------------------------------------------------
2662
2663 class RBBILineMonkey: public RBBIMonkeyKind {
2664 public:
2665 RBBILineMonkey();
2666 virtual ~RBBILineMonkey();
2667 virtual UVector *charClasses();
2668 virtual void setText(const UnicodeString &s);
2669 virtual int32_t next(int32_t i);
2670 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2671 private:
2672 UVector *fSets;
2673
2674 UnicodeSet *fBK;
2675 UnicodeSet *fCR;
2676 UnicodeSet *fLF;
2677 UnicodeSet *fCM;
2678 UnicodeSet *fNL;
2679 UnicodeSet *fSG;
2680 UnicodeSet *fWJ;
2681 UnicodeSet *fZW;
2682 UnicodeSet *fGL;
2683 UnicodeSet *fCB;
2684 UnicodeSet *fSP;
2685 UnicodeSet *fB2;
2686 UnicodeSet *fBA;
2687 UnicodeSet *fBB;
2688 UnicodeSet *fHH;
2689 UnicodeSet *fHY;
2690 UnicodeSet *fH2;
2691 UnicodeSet *fH3;
2692 UnicodeSet *fCL;
2693 UnicodeSet *fCP;
2694 UnicodeSet *fEX;
2695 UnicodeSet *fIN;
2696 UnicodeSet *fJL;
2697 UnicodeSet *fJV;
2698 UnicodeSet *fJT;
2699 UnicodeSet *fNS;
2700 UnicodeSet *fOP;
2701 UnicodeSet *fQU;
2702 UnicodeSet *fIS;
2703 UnicodeSet *fNU;
2704 UnicodeSet *fPO;
2705 UnicodeSet *fPR;
2706 UnicodeSet *fSY;
2707 UnicodeSet *fAI;
2708 UnicodeSet *fAL;
2709 UnicodeSet *fCJ;
2710 UnicodeSet *fHL;
2711 UnicodeSet *fID;
2712 UnicodeSet *fRI;
2713 UnicodeSet *fXX;
2714 UnicodeSet *fEB;
2715 UnicodeSet *fEM;
2716 UnicodeSet *fZWJ;
2717 UnicodeSet *fOP30;
2718 UnicodeSet *fCP30;
2719
2720 BreakIterator *fCharBI;
2721 const UnicodeString *fText;
2722 RegexMatcher *fNumberMatcher;
2723 };
2724
RBBILineMonkey()2725 RBBILineMonkey::RBBILineMonkey() :
2726 RBBIMonkeyKind(),
2727 fSets(NULL),
2728
2729 fCharBI(NULL),
2730 fText(NULL),
2731 fNumberMatcher(NULL)
2732
2733 {
2734 if (U_FAILURE(deferredStatus)) {
2735 return;
2736 }
2737
2738 UErrorCode status = U_ZERO_ERROR;
2739
2740 fSets = new UVector(status);
2741
2742 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2743 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2744 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2745 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2746 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2747 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2748 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2749 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2750 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2751 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2752 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2753 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2754 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2755 fHH = new UnicodeSet();
2756 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2757 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2758 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2759 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2760 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2761 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2762 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2763 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2764 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2765 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2766 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2767 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2768 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2769 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2770 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2771 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2772 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2773 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2774 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2775 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2776 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2777 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2778 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2779 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2780 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2781 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2782 fEB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status);
2783 fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
2784 fZWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
2785 fOP30 = new UnicodeSet(u"[\\p{Line_break=OP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2786 fCP30 = new UnicodeSet(u"[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2787
2788 if (U_FAILURE(status)) {
2789 deferredStatus = status;
2790 return;
2791 }
2792
2793 fAL->addAll(*fXX); // Default behavior for XX is identical to AL
2794 fAL->addAll(*fAI); // Default behavior for AI is identical to AL
2795 fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
2796
2797 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
2798 fCM->addAll(*fZWJ); // ZWJ behaves as a CM.
2799
2800 fHH->add(u'\u2010'); // Hyphen, '‐'
2801
2802 // Sets and names.
2803 fSets->addElement(fBK, status); classNames.push_back("fBK");
2804 fSets->addElement(fCR, status); classNames.push_back("fCR");
2805 fSets->addElement(fLF, status); classNames.push_back("fLF");
2806 fSets->addElement(fCM, status); classNames.push_back("fCM");
2807 fSets->addElement(fNL, status); classNames.push_back("fNL");
2808 fSets->addElement(fWJ, status); classNames.push_back("fWJ");
2809 fSets->addElement(fZW, status); classNames.push_back("fZW");
2810 fSets->addElement(fGL, status); classNames.push_back("fGL");
2811 fSets->addElement(fCB, status); classNames.push_back("fCB");
2812 fSets->addElement(fSP, status); classNames.push_back("fSP");
2813 fSets->addElement(fB2, status); classNames.push_back("fB2");
2814 fSets->addElement(fBA, status); classNames.push_back("fBA");
2815 fSets->addElement(fBB, status); classNames.push_back("fBB");
2816 fSets->addElement(fHY, status); classNames.push_back("fHY");
2817 fSets->addElement(fH2, status); classNames.push_back("fH2");
2818 fSets->addElement(fH3, status); classNames.push_back("fH3");
2819 fSets->addElement(fCL, status); classNames.push_back("fCL");
2820 fSets->addElement(fCP, status); classNames.push_back("fCP");
2821 fSets->addElement(fEX, status); classNames.push_back("fEX");
2822 fSets->addElement(fIN, status); classNames.push_back("fIN");
2823 fSets->addElement(fJL, status); classNames.push_back("fJL");
2824 fSets->addElement(fJT, status); classNames.push_back("fJT");
2825 fSets->addElement(fJV, status); classNames.push_back("fJV");
2826 fSets->addElement(fNS, status); classNames.push_back("fNS");
2827 fSets->addElement(fOP, status); classNames.push_back("fOP");
2828 fSets->addElement(fQU, status); classNames.push_back("fQU");
2829 fSets->addElement(fIS, status); classNames.push_back("fIS");
2830 fSets->addElement(fNU, status); classNames.push_back("fNU");
2831 fSets->addElement(fPO, status); classNames.push_back("fPO");
2832 fSets->addElement(fPR, status); classNames.push_back("fPR");
2833 fSets->addElement(fSY, status); classNames.push_back("fSY");
2834 fSets->addElement(fAI, status); classNames.push_back("fAI");
2835 fSets->addElement(fAL, status); classNames.push_back("fAL");
2836 fSets->addElement(fHL, status); classNames.push_back("fHL");
2837 fSets->addElement(fID, status); classNames.push_back("fID");
2838 fSets->addElement(fWJ, status); classNames.push_back("fWJ");
2839 fSets->addElement(fRI, status); classNames.push_back("fRI");
2840 fSets->addElement(fSG, status); classNames.push_back("fSG");
2841 fSets->addElement(fEB, status); classNames.push_back("fEB");
2842 fSets->addElement(fEM, status); classNames.push_back("fEM");
2843 fSets->addElement(fZWJ, status); classNames.push_back("fZWJ");
2844 // TODO: fOP30 & fCP30 overlap with plain fOP. Probably OK, but fOP/CP chars will be over-represented.
2845 fSets->addElement(fOP30, status); classNames.push_back("fOP30");
2846 fSets->addElement(fCP30, status); classNames.push_back("fCP30");
2847
2848 const char *rules =
2849 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
2850 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
2851 "((\\p{Line_Break=IS})(\\p{Line_Break=CM}|\\u200d)*)?"
2852 "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
2853 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
2854 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
2855 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
2856
2857 fNumberMatcher = new RegexMatcher(
2858 UnicodeString(rules, -1, US_INV), 0, status);
2859
2860 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2861
2862 if (U_FAILURE(status)) {
2863 deferredStatus = status;
2864 }
2865
2866 }
2867
2868
setText(const UnicodeString & s)2869 void RBBILineMonkey::setText(const UnicodeString &s) {
2870 fText = &s;
2871 fCharBI->setText(s);
2872 prepareAppliedRules(s.length());
2873 fNumberMatcher->reset(s);
2874 }
2875
2876 //
2877 // rule9Adjust
2878 // Line Break TR rules 9 and 10 implementation.
2879 // This deals with combining marks and other sequences that
2880 // that must be treated as if they were something other than what they actually are.
2881 //
2882 // This is factored out into a separate function because it must be applied twice for
2883 // each potential break, once to the chars before the position being checked, then
2884 // again to the text following the possible break.
2885 //
rule9Adjust(int32_t pos,UChar32 * posChar,int32_t * nextPos,UChar32 * nextChar)2886 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2887 if (pos == -1) {
2888 // Invalid initial position. Happens during the warmup iteration of the
2889 // main loop in next().
2890 return;
2891 }
2892
2893 int32_t nPos = *nextPos;
2894
2895 // LB 9 Keep combining sequences together.
2896 // advance over any CM class chars. Note that Line Break CM is different
2897 // from the normal Grapheme Extend property.
2898 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2899 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2900 for (;;) {
2901 *nextChar = fText->char32At(nPos);
2902 if (!fCM->contains(*nextChar)) {
2903 break;
2904 }
2905 nPos = fText->moveIndex32(nPos, 1);
2906 }
2907 }
2908
2909
2910 // LB 9 Treat X CM* as if it were x.
2911 // No explicit action required.
2912
2913 // LB 10 Treat any remaining combining mark as AL
2914 if (fCM->contains(*posChar)) {
2915 *posChar = u'A';
2916 }
2917
2918 // Push the updated nextPos and nextChar back to our caller.
2919 // This only makes a difference if posChar got bigger by consuming a
2920 // combining sequence.
2921 *nextPos = nPos;
2922 *nextChar = fText->char32At(nPos);
2923 }
2924
2925
2926
next(int32_t startPos)2927 int32_t RBBILineMonkey::next(int32_t startPos) {
2928 UErrorCode status = U_ZERO_ERROR;
2929 int32_t pos; // Index of the char following a potential break position
2930 UChar32 thisChar; // Character at above position "pos"
2931
2932 int32_t prevPos; // Index of the char preceding a potential break position
2933 UChar32 prevChar; // Character at above position. Note that prevChar
2934 // and thisChar may not be adjacent because combining
2935 // characters between them will be ignored.
2936
2937 int32_t prevPosX2; // Second previous character. Wider context for LB21a.
2938 UChar32 prevCharX2;
2939
2940 int32_t nextPos; // Index of the next character following pos.
2941 // Usually skips over combining marks.
2942 int32_t nextCPPos; // Index of the code point following "pos."
2943 // May point to a combining mark.
2944 int32_t tPos; // temp value.
2945 UChar32 c;
2946
2947 if (U_FAILURE(deferredStatus)) {
2948 return -1;
2949 }
2950
2951 if (startPos >= fText->length()) {
2952 return -1;
2953 }
2954
2955
2956 // Initial values for loop. Loop will run the first time without finding breaks,
2957 // while the invalid values shift out and the "this" and
2958 // "prev" positions are filled in with good values.
2959 pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration.
2960 thisChar = prevChar = prevCharX2 = 0;
2961 nextPos = nextCPPos = startPos;
2962
2963
2964 // Loop runs once per position in the test text, until a break position
2965 // is found.
2966 for (;;) {
2967 prevPosX2 = prevPos;
2968 prevCharX2 = prevChar;
2969
2970 prevPos = pos;
2971 prevChar = thisChar;
2972
2973 pos = nextPos;
2974 thisChar = fText->char32At(pos);
2975
2976 nextCPPos = fText->moveIndex32(pos, 1);
2977 nextPos = nextCPPos;
2978
2979
2980 if (pos >= fText->length()) {
2981 setAppliedRule(pos, "LB2 - Break at end of text.");
2982 break;
2983 }
2984
2985
2986 // We do this one out-of-order because the adjustment does not change anything
2987 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
2988 // be applied.
2989 rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
2990 nextCPPos = nextPos = fText->moveIndex32(pos, 1);
2991 c = fText->char32At(nextPos);
2992 rule9Adjust(pos, &thisChar, &nextPos, &c);
2993
2994 // If the loop is still warming up - if we haven't shifted the initial
2995 // -1 positions out of prevPos yet - loop back to advance the
2996 // position in the input without any further looking for breaks.
2997 if (prevPos == -1) {
2998 setAppliedRule(pos, "LB 9 - adjust for combining sequences.");
2999 continue;
3000 }
3001
3002
3003 if (fBK->contains(prevChar)) {
3004 setAppliedRule(pos, "LB 4 Always break after hard line breaks");
3005 break;
3006 }
3007
3008
3009 if (prevChar == 0x0d && thisChar == 0x0a) {
3010 setAppliedRule(pos, "LB 5 Break after CR, LF, NL, but not inside CR LF");
3011 continue;
3012 }
3013 if (prevChar == 0x0d ||
3014 prevChar == 0x0a ||
3015 prevChar == 0x85) {
3016 setAppliedRule(pos, "LB 5 Break after CR, LF, NL, but not inside CR LF");
3017 break;
3018 }
3019
3020
3021 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3022 fBK->contains(thisChar)) {
3023 setAppliedRule(pos, "LB 6 Don't break before hard line breaks");
3024 continue;
3025 }
3026
3027
3028 if (fSP->contains(thisChar)) {
3029 setAppliedRule(pos, "LB 7 Don't break before spaces or zero-width space.");
3030 continue;
3031 }
3032
3033 // !!! ??? Is this the right text for the applied rule?
3034 if (fZW->contains(thisChar)) {
3035 setAppliedRule(pos, "LB 7 Don't break before spaces or zero-width space.");
3036 continue;
3037 }
3038
3039
3040 // ZW SP* ÷
3041 // Scan backwards from prevChar for SP* ZW
3042 tPos = prevPos;
3043 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3044 tPos = fText->moveIndex32(tPos, -1);
3045 }
3046 if (fZW->contains(fText->char32At(tPos))) {
3047 setAppliedRule(pos, "LB 8 Break after zero width space");
3048 break;
3049 }
3050
3051
3052 // Move this test up, before LB8a, because numbers can match a longer sequence that would
3053 // also match 8a. e.g. NU ZWJ IS PO (ZWJ acts like CM)
3054 if (fNumberMatcher->lookingAt(prevPos, status)) {
3055 if (U_FAILURE(status)) {
3056 setAppliedRule(pos, "LB 25 Numbers");
3057 break;
3058 }
3059 // Matched a number. But could have been just a single digit, which would
3060 // not represent a "no break here" between prevChar and thisChar
3061 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
3062 if (numEndIdx > pos) {
3063 // Number match includes at least our two chars being checked
3064 if (numEndIdx > nextPos) {
3065 // Number match includes additional chars. Update pos and nextPos
3066 // so that next loop iteration will continue at the end of the number,
3067 // checking for breaks between last char in number & whatever follows.
3068 pos = nextPos = numEndIdx;
3069 do {
3070 pos = fText->moveIndex32(pos, -1);
3071 thisChar = fText->char32At(pos);
3072 } while (fCM->contains(thisChar));
3073 }
3074 setAppliedRule(pos, "LB 25 Numbers");
3075 continue;
3076 }
3077 }
3078
3079
3080 // The monkey test's way of ignoring combining characters doesn't work
3081 // for this rule. ZJ is also a CM. Need to get the actual character
3082 // preceding "thisChar", not ignoring combining marks, possibly ZJ.
3083 {
3084 int32_t prevIdx = fText->moveIndex32(pos, -1);
3085 UChar32 prevC = fText->char32At(prevIdx);
3086 if (fZWJ->contains(prevC)) {
3087 setAppliedRule(pos, "LB 8a ZWJ x");
3088 continue;
3089 }
3090 }
3091
3092
3093 // appliedRule: "LB 9, 10"; // Already done, at top of loop.";
3094 //
3095
3096
3097 // x WJ
3098 // WJ x
3099 //
3100 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3101 setAppliedRule(pos, "LB 11 Do not break before or after WORD JOINER and related characters.");
3102 continue;
3103 }
3104
3105
3106 if (fGL->contains(prevChar)) {
3107 setAppliedRule(pos, "LB 12 GL x");
3108 continue;
3109 }
3110
3111
3112 if (!(fSP->contains(prevChar) ||
3113 fBA->contains(prevChar) ||
3114 fHY->contains(prevChar) ) && fGL->contains(thisChar)) {
3115 setAppliedRule(pos, "LB 12a [^SP BA HY] x GL");
3116 continue;
3117 }
3118
3119
3120 if (fCL->contains(thisChar) ||
3121 fCP->contains(thisChar) ||
3122 fEX->contains(thisChar) ||
3123 fSY->contains(thisChar)) {
3124 setAppliedRule(pos, "LB 13 Don't break before closings.");
3125 continue;
3126 }
3127
3128
3129 // Scan backwards, checking for this sequence.
3130 // The OP char could include combining marks, so we actually check for
3131 // OP CM* SP*
3132 // Another Twist: The Rule 9 fixes may have changed a SP CM
3133 // sequence into a ID char, so before scanning back through spaces,
3134 // verify that prevChar is indeed a space. The prevChar variable
3135 // may differ from fText[prevPos]
3136 tPos = prevPos;
3137 if (fSP->contains(prevChar)) {
3138 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3139 tPos=fText->moveIndex32(tPos, -1);
3140 }
3141 }
3142 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3143 tPos=fText->moveIndex32(tPos, -1);
3144 }
3145 if (fOP->contains(fText->char32At(tPos))) {
3146 setAppliedRule(pos, "LB 14 Don't break after OP SP*");
3147 continue;
3148 }
3149
3150
3151 if (nextPos < fText->length()) {
3152 // note: UnicodeString::char32At(length) returns ffff, not distinguishable
3153 // from a legit ffff character. So test length separately.
3154 UChar32 nextChar = fText->char32At(nextPos);
3155 if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) {
3156 setAppliedRule(pos, "LB 14a Break before an IS that begins a number and follows a space");
3157 break;
3158 }
3159 }
3160
3161
3162 if (fIS->contains(thisChar)) {
3163 setAppliedRule(pos, "LB 14b Do not break before numeric separators, even after spaces.");
3164 continue;
3165 }
3166
3167
3168 if (fOP->contains(thisChar)) {
3169 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3170 int tPos = prevPos;
3171 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3172 tPos = fText->moveIndex32(tPos, -1);
3173 }
3174 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3175 tPos = fText->moveIndex32(tPos, -1);
3176 }
3177 if (fQU->contains(fText->char32At(tPos))) {
3178 setAppliedRule(pos, "LB 15 QU SP* x OP");
3179 continue;
3180 }
3181 }
3182
3183
3184 // Scan backwards for SP* CM* (CL | CP)
3185 if (fNS->contains(thisChar)) {
3186 int tPos = prevPos;
3187 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3188 tPos = fText->moveIndex32(tPos, -1);
3189 }
3190 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3191 tPos = fText->moveIndex32(tPos, -1);
3192 }
3193 if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3194 setAppliedRule(pos, "LB 16 (CL | CP) SP* x NS");
3195 continue;
3196 }
3197 }
3198
3199
3200 if (fB2->contains(thisChar)) {
3201 // Scan backwards, checking for the B2 CM* SP* sequence.
3202 tPos = prevPos;
3203 if (fSP->contains(prevChar)) {
3204 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3205 tPos=fText->moveIndex32(tPos, -1);
3206 }
3207 }
3208 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3209 tPos=fText->moveIndex32(tPos, -1);
3210 }
3211 if (fB2->contains(fText->char32At(tPos))) {
3212 setAppliedRule(pos, "LB 17 B2 SP* x B2");
3213 continue;
3214 }
3215 }
3216
3217
3218 if (fSP->contains(prevChar)) {
3219 setAppliedRule(pos, "LB 18 break after space");
3220 break;
3221 }
3222
3223 // x QU
3224 // QU x
3225 if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3226 setAppliedRule(pos, "LB 19");
3227 continue;
3228 }
3229
3230 if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3231 setAppliedRule(pos, "LB 20 Break around a CB");
3232 break;
3233 }
3234
3235 // Don't break between Hyphens and letters if a break precedes the hyphen.
3236 // Formerly this was a Finnish tailoring.
3237 // Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
3238 // ^($HY | $HH) $AL;
3239 if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
3240 prevPosX2 == -1) {
3241 setAppliedRule(pos, "LB 20.09");
3242 continue;
3243 }
3244
3245 if (fBA->contains(thisChar) ||
3246 fHY->contains(thisChar) ||
3247 fNS->contains(thisChar) ||
3248 fBB->contains(prevChar) ) {
3249 setAppliedRule(pos, "LB 21");
3250 continue;
3251 }
3252
3253 if (fHL->contains(prevCharX2) &&
3254 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3255 setAppliedRule(pos, "LB 21a HL (HY | BA) x");
3256 continue;
3257 }
3258
3259 if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3260 setAppliedRule(pos, "LB 21b SY x HL");
3261 continue;
3262 }
3263
3264 if (fIN->contains(thisChar)) {
3265 setAppliedRule(pos, "LB 22");
3266 continue;
3267 }
3268
3269
3270 // (AL | HL) x NU
3271 // NU x (AL | HL)
3272 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
3273 setAppliedRule(pos, "LB 23");
3274 continue;
3275 }
3276 if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3277 setAppliedRule(pos, "LB 23");
3278 continue;
3279 }
3280
3281 // Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3282 // PR x (ID | EB | EM)
3283 // (ID | EB | EM) x PO
3284 if (fPR->contains(prevChar) &&
3285 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar))) {
3286 setAppliedRule(pos, "LB 23a");
3287 continue;
3288 }
3289 if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
3290 fPO->contains(thisChar)) {
3291 setAppliedRule(pos, "LB 23a");
3292 continue;
3293 }
3294
3295 // Do not break between prefix and letters or ideographs.
3296 // (PR | PO) x (AL | HL)
3297 // (AL | HL) x (PR | PO)
3298 if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
3299 (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3300 setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3301 continue;
3302 }
3303 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
3304 (fPR->contains(thisChar) || fPO->contains(thisChar))) {
3305 setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3306 continue;
3307 }
3308
3309 // appliedRule: "LB 25 numbers match"; // moved up, before LB 8a,
3310
3311 if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3312 fJV->contains(thisChar) ||
3313 fH2->contains(thisChar) ||
3314 fH3->contains(thisChar))) {
3315 setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3316 continue;
3317 }
3318
3319 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) &&
3320 (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3321 setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3322 continue;
3323 }
3324
3325 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3326 fJT->contains(thisChar)) {
3327 setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3328 continue;
3329 }
3330
3331 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3332 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3333 fIN->contains(thisChar)) {
3334 setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3335 continue;
3336 }
3337 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3338 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3339 fPO->contains(thisChar)) {
3340 setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3341 continue;
3342 }
3343 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3344 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3345 setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3346 continue;
3347 }
3348
3349
3350
3351 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3352 setAppliedRule(pos, "LB 28 Do not break between alphabetics (\"at\").");
3353 continue;
3354 }
3355
3356 if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3357 setAppliedRule(pos, "LB 29 Do not break between numeric punctuation and alphabetics (\"e.g.\").");
3358 continue;
3359 }
3360
3361 // (AL | NU) x OP
3362 // CP x (AL | NU)
3363 if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP30->contains(thisChar)) {
3364 setAppliedRule(pos, "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3365 continue;
3366 }
3367 if (fCP30->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3368 setAppliedRule(pos, "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3369 continue;
3370 }
3371
3372 // RI x RI
3373 if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3374 setAppliedRule(pos, "LB30a RI RI ÷ RI");
3375 break;
3376 }
3377 if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3378 // Two Regional Indicators have been paired.
3379 // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
3380 // following RI. This is a hack.
3381 thisChar = -1;
3382 setAppliedRule(pos, "LB30a RI RI ÷ RI");
3383 continue;
3384 }
3385
3386 if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3387 setAppliedRule(pos, "LB30b Emoji Base x Emoji Modifier");
3388 continue;
3389 }
3390
3391 setAppliedRule(pos, "LB 31 Break everywhere else");
3392 break;
3393 }
3394
3395 return pos;
3396 }
3397
3398
charClasses()3399 UVector *RBBILineMonkey::charClasses() {
3400 return fSets;
3401 }
3402
3403
~RBBILineMonkey()3404 RBBILineMonkey::~RBBILineMonkey() {
3405 delete fSets;
3406
3407 delete fBK;
3408 delete fCR;
3409 delete fLF;
3410 delete fCM;
3411 delete fNL;
3412 delete fWJ;
3413 delete fZW;
3414 delete fGL;
3415 delete fCB;
3416 delete fSP;
3417 delete fB2;
3418 delete fBA;
3419 delete fBB;
3420 delete fHH;
3421 delete fHY;
3422 delete fH2;
3423 delete fH3;
3424 delete fCL;
3425 delete fCP;
3426 delete fEX;
3427 delete fIN;
3428 delete fJL;
3429 delete fJV;
3430 delete fJT;
3431 delete fNS;
3432 delete fOP;
3433 delete fQU;
3434 delete fIS;
3435 delete fNU;
3436 delete fPO;
3437 delete fPR;
3438 delete fSY;
3439 delete fAI;
3440 delete fAL;
3441 delete fCJ;
3442 delete fHL;
3443 delete fID;
3444 delete fRI;
3445 delete fSG;
3446 delete fXX;
3447 delete fEB;
3448 delete fEM;
3449 delete fZWJ;
3450 delete fOP30;
3451 delete fCP30;
3452
3453 delete fCharBI;
3454 delete fNumberMatcher;
3455 }
3456
3457
3458 //-------------------------------------------------------------------------------------------
3459 //
3460 // TestMonkey
3461 //
3462 // params
3463 // seed=nnnnn Random number starting seed.
3464 // Setting the seed allows errors to be reproduced.
3465 // loop=nnn Looping count. Controls running time.
3466 // -1: run forever.
3467 // 0 or greater: run length.
3468 //
3469 // type = char | word | line | sent | title
3470 //
3471 // Example:
3472 // intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3473 //
3474 //-------------------------------------------------------------------------------------------
3475
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)3476 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) {
3477 int32_t val = defaultVal;
3478 name.append(" *= *(-?\\d+)");
3479 UErrorCode status = U_ZERO_ERROR;
3480 RegexMatcher m(name, params, 0, status);
3481 if (m.find()) {
3482 // The param exists. Convert the string to an int.
3483 char valString[100];
3484 int32_t paramLength = m.end(1, status) - m.start(1, status);
3485 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3486 paramLength = (int32_t)(sizeof(valString)-2);
3487 }
3488 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3489 val = strtol(valString, NULL, 10);
3490
3491 // Delete this parameter from the params string.
3492 m.reset();
3493 params = m.replaceFirst("", status);
3494 }
3495 U_ASSERT(U_SUCCESS(status));
3496 return val;
3497 }
3498 #endif
3499
3500 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
testBreakBoundPreceding(RBBITest * test,UnicodeString ustr,BreakIterator * bi,int expected[],int expectedcount)3501 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3502 BreakIterator *bi,
3503 int expected[],
3504 int expectedcount)
3505 {
3506 int count = 0;
3507 int i = 0;
3508 int forward[50];
3509 bi->setText(ustr);
3510 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3511 forward[count] = i;
3512 if (count < expectedcount && expected[count] != i) {
3513 test->errln("%s:%d break forward test failed: expected %d but got %d",
3514 __FILE__, __LINE__, expected[count], i);
3515 break;
3516 }
3517 count ++;
3518 }
3519 if (count != expectedcount) {
3520 printStringBreaks(ustr, expected, expectedcount);
3521 test->errln("%s:%d break forward test failed: missed %d match",
3522 __FILE__, __LINE__, expectedcount - count);
3523 return;
3524 }
3525 // testing boundaries
3526 for (i = 1; i < expectedcount; i ++) {
3527 int j = expected[i - 1];
3528 if (!bi->isBoundary(j)) {
3529 printStringBreaks(ustr, expected, expectedcount);
3530 test->errln("%s:%d isBoundary() failed. Expected boundary at position %d",
3531 __FILE__, __LINE__, j);
3532 return;
3533 }
3534 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3535 if (bi->isBoundary(j)) {
3536 printStringBreaks(ustr, expected, expectedcount);
3537 test->errln("%s:%d isBoundary() failed. Not expecting boundary at position %d",
3538 __FILE__, __LINE__, j);
3539 return;
3540 }
3541 }
3542 }
3543
3544 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3545 count --;
3546 if (forward[count] != i) {
3547 printStringBreaks(ustr, expected, expectedcount);
3548 test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
3549 __FILE__, __LINE__, forward[count], i);
3550 break;
3551 }
3552 }
3553 if (count != 0) {
3554 printStringBreaks(ustr, expected, expectedcount);
3555 test->errln("break test previous() failed: missed a match");
3556 return;
3557 }
3558
3559 // testing preceding
3560 for (i = 0; i < expectedcount - 1; i ++) {
3561 // int j = expected[i] + 1;
3562 int j = ustr.moveIndex32(expected[i], 1);
3563 for (; j <= expected[i + 1]; j ++) {
3564 int32_t expectedPreceding = expected[i];
3565 int32_t actualPreceding = bi->preceding(j);
3566 if (actualPreceding != expectedPreceding) {
3567 printStringBreaks(ustr, expected, expectedcount);
3568 test->errln("%s:%d preceding(%d): expected %d, got %d",
3569 __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
3570 return;
3571 }
3572 }
3573 }
3574 }
3575 #endif
3576
TestWordBreaks(void)3577 void RBBITest::TestWordBreaks(void)
3578 {
3579 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3580
3581 Locale locale("en");
3582 UErrorCode status = U_ZERO_ERROR;
3583 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3584 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3585 // Replaced any C+J characters in a row with a random sequence of characters
3586 // of the same length to make our C+J segmentation not get in the way.
3587 static const char *strlist[] =
3588 {
3589 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3590 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3591 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3592 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3593 "\\uac00\\u3588\\u009c\\u0953\\u194b",
3594 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3595 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3596 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3597 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3598 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3599 "\\u2027\\U000e0067\\u0a47\\u00b7",
3600 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3601 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3602 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3603 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3604 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3605 "\\u0027\\u11af\\U000e0057\\u0602",
3606 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3607 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3608 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3609 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3610 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3611 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3612 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3613 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3614 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3615 "\\u18f4\\U000e0049\\u20e7\\u2027",
3616 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3617 "\\ua183\\u102d\\u0bec\\u003a",
3618 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3619 "\\u003a\\u0e57\\u0fad\\u002e",
3620 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3621 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3622 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3623 "\\u003a\\u0664\\u00b7\\u1fba",
3624 "\\u003b\\u0027\\u00b7\\u47a3",
3625 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3626 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3627 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3628 };
3629 int loop;
3630 if (U_FAILURE(status)) {
3631 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3632 return;
3633 }
3634 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3635 // printf("looping %d\n", loop);
3636 UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3637 // RBBICharMonkey monkey;
3638 RBBIWordMonkey monkey;
3639
3640 int expected[50];
3641 int expectedcount = 0;
3642
3643 monkey.setText(ustr);
3644 int i;
3645 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3646 expected[expectedcount ++] = i;
3647 }
3648
3649 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3650 }
3651 delete bi;
3652 #endif
3653 }
3654
TestWordBoundary(void)3655 void RBBITest::TestWordBoundary(void)
3656 {
3657 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3658 Locale locale("en");
3659 UErrorCode status = U_ZERO_ERROR;
3660 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3661 LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
3662 if (U_FAILURE(status)) {
3663 errcheckln(status, "%s:%d Creation of break iterator failed %s",
3664 __FILE__, __LINE__, u_errorName(status));
3665 return;
3666 }
3667 UChar str[50];
3668 static const char *strlist[] =
3669 {
3670 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3671 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3672 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3673 "\\u2027\\U000e0067\\u0a47\\u00b7",
3674 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3675 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3676 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3677 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3678 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3679 "\\u0027\\u11af\\U000e0057\\u0602",
3680 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3681 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3682 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3683 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3684 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3685 "\\U000e0065\\u302c\\u09ee\\U000e0068",
3686 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3687 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3688 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3689 "\\u58f4\\U000e0049\\u20e7\\u2027",
3690 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3691 "\\ua183\\u102d\\u0bec\\u003a",
3692 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3693 "\\u003a\\u0e57\\u0fad\\u002e",
3694 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3695 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3696 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3697 "\\u003a\\u0664\\u00b7\\u1fba",
3698 "\\u003b\\u0027\\u00b7\\u47a3",
3699 };
3700 int loop;
3701 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3702 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3703 UnicodeString ustr(str);
3704 int forward[50];
3705 int count = 0;
3706
3707 bi->setText(ustr);
3708 int prev = -1;
3709 for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
3710 ++count;
3711 if (count >= UPRV_LENGTHOF(forward)) {
3712 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
3713 __FILE__, __LINE__, loop, count, boundary);
3714 return;
3715 }
3716 forward[count] = boundary;
3717 if (boundary <= prev) {
3718 errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
3719 __FILE__, __LINE__, loop, prev, boundary);
3720 break;
3721 }
3722 for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
3723 if (bi->isBoundary(nonBoundary)) {
3724 printStringBreaks(ustr, forward, count);
3725 errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
3726 __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
3727 return;
3728 }
3729 }
3730 if (!bi->isBoundary(boundary)) {
3731 printStringBreaks(ustr, forward, count);
3732 errln("%s:%d happy boundary test failed: expected %d a boundary",
3733 __FILE__, __LINE__, boundary);
3734 return;
3735 }
3736 prev = boundary;
3737 }
3738 }
3739 }
3740
TestLineBreaks(void)3741 void RBBITest::TestLineBreaks(void)
3742 {
3743 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3744 Locale locale("en");
3745 UErrorCode status = U_ZERO_ERROR;
3746 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3747 const int32_t STRSIZE = 50;
3748 UChar str[STRSIZE];
3749 static const char *strlist[] =
3750 {
3751 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3752 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3753 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3754 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3755 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3756 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3757 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3758 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3759 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3760 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3761 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3762 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3763 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3764 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3765 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3766 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3767 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3768 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3769 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3770 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3771 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3772 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3773 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3774 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3775 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3776 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3777 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3778 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3779 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3780 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3781 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3782 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3783 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3784 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3785 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3786 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3787 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3788 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3789 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3790 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3791 };
3792 int loop;
3793 TEST_ASSERT_SUCCESS(status);
3794 if (U_FAILURE(status)) {
3795 return;
3796 }
3797 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3798 // printf("looping %d\n", loop);
3799 int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3800 if (t >= STRSIZE) {
3801 TEST_ASSERT(FALSE);
3802 continue;
3803 }
3804
3805
3806 UnicodeString ustr(str);
3807 RBBILineMonkey monkey;
3808 if (U_FAILURE(monkey.deferredStatus)) {
3809 continue;
3810 }
3811
3812 const int EXPECTEDSIZE = 50;
3813 int expected[EXPECTEDSIZE];
3814 int expectedcount = 0;
3815
3816 monkey.setText(ustr);
3817
3818 int i;
3819 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3820 if (expectedcount >= EXPECTEDSIZE) {
3821 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3822 return;
3823 }
3824 expected[expectedcount ++] = i;
3825 }
3826
3827 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3828 }
3829 delete bi;
3830 #endif
3831 }
3832
TestSentBreaks(void)3833 void RBBITest::TestSentBreaks(void)
3834 {
3835 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3836 Locale locale("en");
3837 UErrorCode status = U_ZERO_ERROR;
3838 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3839 UChar str[200];
3840 static const char *strlist[] =
3841 {
3842 "Now\ris\nthe\r\ntime\n\rfor\r\r",
3843 "This\n",
3844 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3845 "\"Sentence ending with a quote.\" Bye.",
3846 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
3847 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3848 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3849 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3850 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3851 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3852 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3853 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3854 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3855 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3856 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3857 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3858 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3859 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3860 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3861 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3862 };
3863 int loop;
3864 if (U_FAILURE(status)) {
3865 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3866 return;
3867 }
3868 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3869 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3870 UnicodeString ustr(str);
3871
3872 RBBISentMonkey monkey;
3873 if (U_FAILURE(monkey.deferredStatus)) {
3874 continue;
3875 }
3876
3877 const int EXPECTEDSIZE = 50;
3878 int expected[EXPECTEDSIZE];
3879 int expectedcount = 0;
3880
3881 monkey.setText(ustr);
3882
3883 int i;
3884 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3885 if (expectedcount >= EXPECTEDSIZE) {
3886 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3887 return;
3888 }
3889 expected[expectedcount ++] = i;
3890 }
3891
3892 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3893 }
3894 delete bi;
3895 #endif
3896 }
3897
TestMonkey()3898 void RBBITest::TestMonkey() {
3899 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3900
3901 UErrorCode status = U_ZERO_ERROR;
3902 int32_t loopCount = 500;
3903 int32_t seed = 1;
3904 UnicodeString breakType = "all";
3905 Locale locale("en");
3906 UBool useUText = FALSE;
3907
3908 if (quick == FALSE) {
3909 loopCount = 10000;
3910 }
3911
3912 if (fTestParams) {
3913 UnicodeString p(fTestParams);
3914 loopCount = getIntParam("loop", p, loopCount);
3915 seed = getIntParam("seed", p, seed);
3916
3917 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3918 if (m.find()) {
3919 breakType = m.group(1, status);
3920 m.reset();
3921 p = m.replaceFirst("", status);
3922 }
3923
3924 RegexMatcher u(" *utext", p, 0, status);
3925 if (u.find()) {
3926 useUText = TRUE;
3927 u.reset();
3928 p = u.replaceFirst("", status);
3929 }
3930
3931
3932 // m.reset(p);
3933 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3934 // Each option is stripped out of the option string as it is processed.
3935 // All options have been checked. The option string should have been completely emptied..
3936 char buf[100];
3937 p.extract(buf, sizeof(buf), NULL, status);
3938 buf[sizeof(buf)-1] = 0;
3939 errln("Unrecognized or extra parameter: %s\n", buf);
3940 return;
3941 }
3942
3943 }
3944
3945 if (breakType == "char" || breakType == "all") {
3946 RBBICharMonkey m;
3947 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3948 if (U_SUCCESS(status)) {
3949 RunMonkey(bi, m, "char", seed, loopCount, useUText);
3950 if (breakType == "all" && useUText==FALSE) {
3951 // Also run a quick test with UText when "all" is specified
3952 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3953 }
3954 }
3955 else {
3956 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3957 }
3958 delete bi;
3959 }
3960
3961 if (breakType == "word" || breakType == "all") {
3962 logln("Word Break Monkey Test");
3963 RBBIWordMonkey m;
3964 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3965 if (U_SUCCESS(status)) {
3966 RunMonkey(bi, m, "word", seed, loopCount, useUText);
3967 }
3968 else {
3969 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3970 }
3971 delete bi;
3972 }
3973
3974 if (breakType == "line" || breakType == "all") {
3975 logln("Line Break Monkey Test");
3976 RBBILineMonkey m;
3977 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3978 if (loopCount >= 10) {
3979 loopCount = loopCount / 5; // Line break runs slower than the others.
3980 }
3981 if (U_SUCCESS(status)) {
3982 RunMonkey(bi, m, "line", seed, loopCount, useUText);
3983 }
3984 else {
3985 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3986 }
3987 delete bi;
3988 }
3989
3990 if (breakType == "sent" || breakType == "all" ) {
3991 logln("Sentence Break Monkey Test");
3992 RBBISentMonkey m;
3993 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3994 if (loopCount >= 10) {
3995 loopCount = loopCount / 10; // Sentence runs slower than the other break types
3996 }
3997 if (U_SUCCESS(status)) {
3998 RunMonkey(bi, m, "sent", seed, loopCount, useUText);
3999 }
4000 else {
4001 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4002 }
4003 delete bi;
4004 }
4005
4006 #endif
4007 }
4008
4009 //
4010 // Run a RBBI monkey test. Common routine, for all break iterator types.
4011 // Parameters:
4012 // bi - the break iterator to use
4013 // mk - MonkeyKind, abstraction for obtaining expected results
4014 // name - Name of test (char, word, etc.) for use in error messages
4015 // seed - Seed for starting random number generator (parameter from user)
4016 // numIterations
4017 //
RunMonkey(BreakIterator * bi,RBBIMonkeyKind & mk,const char * name,uint32_t seed,int32_t numIterations,UBool useUText)4018 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
4019 int32_t numIterations, UBool useUText) {
4020
4021 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4022
4023 const int32_t TESTSTRINGLEN = 500;
4024 UnicodeString testText;
4025 int32_t numCharClasses;
4026 UVector *chClasses;
4027 int expectedCount = 0;
4028 char expectedBreaks[TESTSTRINGLEN*2 + 1];
4029 char forwardBreaks[TESTSTRINGLEN*2 + 1];
4030 char reverseBreaks[TESTSTRINGLEN*2+1];
4031 char isBoundaryBreaks[TESTSTRINGLEN*2+1];
4032 char followingBreaks[TESTSTRINGLEN*2+1];
4033 char precedingBreaks[TESTSTRINGLEN*2+1];
4034 int i;
4035 int loopCount = 0;
4036
4037
4038 m_seed = seed;
4039
4040 numCharClasses = mk.charClasses()->size();
4041 chClasses = mk.charClasses();
4042
4043 // Check for errors that occured during the construction of the MonkeyKind object.
4044 // Can't report them where they occured because errln() is a method coming from intlTest,
4045 // and is not visible outside of RBBITest :-(
4046 if (U_FAILURE(mk.deferredStatus)) {
4047 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4048 return;
4049 }
4050
4051 // Verify that the character classes all have at least one member.
4052 for (i=0; i<numCharClasses; i++) {
4053 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4054 if (s == NULL || s->size() == 0) {
4055 errln("Character Class #%d is null or of zero size.", i);
4056 return;
4057 }
4058 }
4059
4060 // For minimizing width of class name output.
4061 int classNameSize = mk.maxClassNameSize();
4062
4063 while (loopCount < numIterations || numIterations == -1) {
4064 if (numIterations == -1 && loopCount % 10 == 0) {
4065 // If test is running in an infinite loop, display a periodic tic so
4066 // we can tell that it is making progress.
4067 fprintf(stderr, ".");
4068 }
4069 // Save current random number seed, so that we can recreate the random numbers
4070 // for this loop iteration in event of an error.
4071 seed = m_seed;
4072
4073 // Populate a test string with data.
4074 testText.truncate(0);
4075 for (i=0; i<TESTSTRINGLEN; i++) {
4076 int32_t aClassNum = m_rand() % numCharClasses;
4077 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4078 int32_t charIdx = m_rand() % classSet->size();
4079 UChar32 c = classSet->charAt(charIdx);
4080 if (c < 0) { // TODO: deal with sets containing strings.
4081 errln("%s:%d c < 0", __FILE__, __LINE__);
4082 break;
4083 }
4084 // Do not assemble a supplementary character from randomly generated separate surrogates.
4085 // (It could be a dictionary character)
4086 if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4087 continue;
4088 }
4089
4090 testText.append(c);
4091 }
4092
4093 // Calculate the expected results for this test string and reset applied rules.
4094 mk.setText(testText);
4095
4096 memset(expectedBreaks, 0, sizeof(expectedBreaks));
4097 expectedBreaks[0] = 1;
4098 int32_t breakPos = 0;
4099 expectedCount = 0;
4100 for (;;) {
4101 breakPos = mk.next(breakPos);
4102 if (breakPos == -1) {
4103 break;
4104 }
4105 if (breakPos > testText.length()) {
4106 errln("breakPos > testText.length()");
4107 }
4108 expectedBreaks[breakPos] = 1;
4109 U_ASSERT(expectedCount<testText.length());
4110 }
4111
4112 // Find the break positions using forward iteration
4113 memset(forwardBreaks, 0, sizeof(forwardBreaks));
4114 if (useUText) {
4115 UErrorCode status = U_ZERO_ERROR;
4116 UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4117 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4118 bi->setText(testUText, status);
4119 TEST_ASSERT_SUCCESS(status);
4120 utext_close(testUText); // The break iterator does a shallow clone of the UText
4121 // This UText can be closed immediately, so long as the
4122 // testText string continues to exist.
4123 } else {
4124 bi->setText(testText);
4125 }
4126
4127 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4128 if (i < 0 || i > testText.length()) {
4129 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4130 break;
4131 }
4132 forwardBreaks[i] = 1;
4133 }
4134
4135 // Find the break positions using reverse iteration
4136 memset(reverseBreaks, 0, sizeof(reverseBreaks));
4137 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4138 if (i < 0 || i > testText.length()) {
4139 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4140 break;
4141 }
4142 reverseBreaks[i] = 1;
4143 }
4144
4145 // Find the break positions using isBoundary() tests.
4146 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4147 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4148 for (i=0; i<=testText.length(); i++) {
4149 isBoundaryBreaks[i] = bi->isBoundary(i);
4150 }
4151
4152
4153 // Find the break positions using the following() function.
4154 // printf(".");
4155 memset(followingBreaks, 0, sizeof(followingBreaks));
4156 int32_t lastBreakPos = 0;
4157 followingBreaks[0] = 1;
4158 for (i=0; i<testText.length(); i++) {
4159 breakPos = bi->following(i);
4160 if (breakPos <= i ||
4161 breakPos < lastBreakPos ||
4162 breakPos > testText.length() ||
4163 (breakPos > lastBreakPos && lastBreakPos > i)) {
4164 errln("%s break monkey test: "
4165 "Out of range value returned by BreakIterator::following().\n"
4166 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4167 name, seed, i, breakPos, lastBreakPos);
4168 break;
4169 }
4170 followingBreaks[breakPos] = 1;
4171 lastBreakPos = breakPos;
4172 }
4173
4174 // Find the break positions using the preceding() function.
4175 memset(precedingBreaks, 0, sizeof(precedingBreaks));
4176 lastBreakPos = testText.length();
4177 precedingBreaks[testText.length()] = 1;
4178 for (i=testText.length(); i>0; i--) {
4179 breakPos = bi->preceding(i);
4180 if (breakPos >= i ||
4181 breakPos > lastBreakPos ||
4182 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4183 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4184 errln("%s break monkey test: "
4185 "Out of range value returned by BreakIterator::preceding().\n"
4186 "index=%d; prev returned %d; lastBreak=%d" ,
4187 name, i, breakPos, lastBreakPos);
4188 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4189 precedingBreaks[i] = 2; // Forces an error.
4190 }
4191 } else {
4192 if (breakPos >= 0) {
4193 precedingBreaks[breakPos] = 1;
4194 }
4195 lastBreakPos = breakPos;
4196 }
4197 }
4198
4199 // Compare the expected and actual results.
4200 for (i=0; i<=testText.length(); i++) {
4201 const char *errorType = NULL;
4202 const char* currentBreakData = NULL;
4203 if (forwardBreaks[i] != expectedBreaks[i]) {
4204 errorType = "next()";
4205 currentBreakData = forwardBreaks;
4206 } else if (reverseBreaks[i] != forwardBreaks[i]) {
4207 errorType = "previous()";
4208 currentBreakData = reverseBreaks;
4209 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4210 errorType = "isBoundary()";
4211 currentBreakData = isBoundaryBreaks;
4212 } else if (followingBreaks[i] != expectedBreaks[i]) {
4213 errorType = "following()";
4214 currentBreakData = followingBreaks;
4215 } else if (precedingBreaks[i] != expectedBreaks[i]) {
4216 errorType = "preceding()";
4217 currentBreakData = precedingBreaks;
4218 }
4219
4220 if (errorType != NULL) {
4221 // Format a range of the test text that includes the failure as
4222 // a data item that can be included in the rbbi test data file.
4223
4224 // Start of the range is the last point where expected and actual results
4225 // both agreed that there was a break position.
4226
4227 int startContext = i;
4228 int32_t count = 0;
4229 for (;;) {
4230 if (startContext==0) { break; }
4231 startContext --;
4232 if (expectedBreaks[startContext] != 0) {
4233 if (count == 2) break;
4234 count ++;
4235 }
4236 }
4237
4238 // End of range is two expected breaks past the start position.
4239 int endContext = i + 1;
4240 int ci;
4241 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
4242 for (;;) {
4243 if (endContext >= testText.length()) {break;}
4244 if (expectedBreaks[endContext-1] != 0) {
4245 if (count == 0) break;
4246 count --;
4247 }
4248 endContext ++;
4249 }
4250 }
4251
4252 // Formatting of each line includes:
4253 // character code
4254 // reference break: '|' -> a break, '.' -> no break
4255 // actual break: '|' -> a break, '.' -> no break
4256 // (name of character clase)
4257 // Unicode name of character
4258 // '-->' indicates location of the difference.
4259
4260 MONKEY_ERROR(
4261 (expectedBreaks[i] ? "Break expected but not found" :
4262 "Break found but not expected"),
4263 name, i, seed);
4264
4265 for (ci=startContext; (ci = testText.moveIndex32(ci, 1));) {
4266 UChar32 c;
4267 c = testText.char32At(ci);
4268
4269 std::string currentLineFlag = " ";
4270 if (ci == i) {
4271 currentLineFlag = "-->"; // Error position
4272 }
4273
4274 // BMP or SMP character in hex
4275 char hexCodePoint[12];
4276 std::string format = " \\u%04x";
4277 if (c >= 0x10000) {
4278 format = "\\U%08x";
4279 }
4280 sprintf(hexCodePoint, format.c_str(), c);
4281
4282 // Get the class name and character name for the character.
4283 char cName[200];
4284 UErrorCode status = U_ZERO_ERROR;
4285 u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
4286
4287 char buffer[200];
4288 auto ret = snprintf(buffer, UPRV_LENGTHOF(buffer),
4289 "%4s %3i : %1s %1s %10s %-*s %-40s %-40s",
4290 currentLineFlag.c_str(),
4291 ci,
4292 expectedBreaks[ci] == 0 ? "." : "|", // Reference break
4293 currentBreakData[ci] == 0 ? "." : "|", // Actual break
4294 hexCodePoint,
4295 classNameSize,
4296 mk.classNameFromCodepoint(c).c_str(),
4297 mk.getAppliedRule(ci).c_str(), cName);
4298 (void)ret;
4299 U_ASSERT(0 <= ret && ret < UPRV_LENGTHOF(buffer));
4300
4301 // Output the error
4302 if (ci == i) {
4303 errln(buffer);
4304 } else {
4305 infoln(buffer);
4306 }
4307
4308 if (ci >= endContext) { break; }
4309 }
4310 break;
4311 }
4312 }
4313
4314 loopCount++;
4315 }
4316 #endif
4317 }
4318
4319
4320 // Bug 5532. UTF-8 based UText fails in dictionary code.
4321 // This test checks the initial patch,
4322 // which is to just keep it from crashing. Correct word boundaries
4323 // await a proper fix to the dictionary code.
4324 //
TestBug5532(void)4325 void RBBITest::TestBug5532(void) {
4326 // Text includes a mixture of Thai and Latin.
4327 const unsigned char utf8Data[] = {
4328 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4329 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4330 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4331 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4332 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4333 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4334 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4335 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4336 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4337 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4338 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4339
4340 UErrorCode status = U_ZERO_ERROR;
4341 UText utext=UTEXT_INITIALIZER;
4342 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4343 TEST_ASSERT_SUCCESS(status);
4344
4345 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4346 TEST_ASSERT_SUCCESS(status);
4347 if (U_SUCCESS(status)) {
4348 bi->setText(&utext, status);
4349 TEST_ASSERT_SUCCESS(status);
4350
4351 int32_t breakCount = 0;
4352 int32_t previousBreak = -1;
4353 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4354 // For now, just make sure that the break iterator doesn't hang.
4355 TEST_ASSERT(previousBreak < bi->current());
4356 previousBreak = bi->current();
4357 }
4358 TEST_ASSERT(breakCount > 0);
4359 }
4360 delete bi;
4361 utext_close(&utext);
4362 }
4363
4364
TestBug9983(void)4365 void RBBITest::TestBug9983(void) {
4366 UnicodeString text = UnicodeString("\\u002A" // * Other
4367 "\\uFF65" // Other
4368 "\\u309C" // Katakana
4369 "\\uFF9F" // Extend
4370 "\\uFF65" // Other
4371 "\\u0020" // Other
4372 "\\u0000").unescape();
4373
4374 UErrorCode status = U_ZERO_ERROR;
4375 LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4376 BreakIterator::createWordInstance(Locale::getRoot(), status)));
4377 TEST_ASSERT_SUCCESS(status);
4378 LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4379 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4380 TEST_ASSERT_SUCCESS(status);
4381 if (U_FAILURE(status)) {
4382 return;
4383 }
4384 int32_t offset, rstatus, iterationCount;
4385
4386 brkiter->setText(text);
4387 brkiter->last();
4388 iterationCount = 0;
4389 while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4390 iterationCount++;
4391 rstatus = brkiter->getRuleStatus();
4392 (void)rstatus; // Suppress set but not used warning.
4393 if (iterationCount >= 10) {
4394 break;
4395 }
4396 }
4397 TEST_ASSERT(iterationCount == 6);
4398
4399 brkiterPOSIX->setText(text);
4400 brkiterPOSIX->last();
4401 iterationCount = 0;
4402 while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4403 iterationCount++;
4404 rstatus = brkiterPOSIX->getRuleStatus();
4405 (void)rstatus; // Suppress set but not used warning.
4406 if (iterationCount >= 10) {
4407 break;
4408 }
4409 }
4410 TEST_ASSERT(iterationCount == 6);
4411 }
4412
4413 // Bug 7547 - verify that building a break itereator from empty rules produces an error.
4414 //
TestBug7547()4415 void RBBITest::TestBug7547() {
4416 UnicodeString rules;
4417 UErrorCode status = U_ZERO_ERROR;
4418 UParseError parseError;
4419 RuleBasedBreakIterator breakIterator(rules, parseError, status);
4420 if (status != U_BRK_RULE_SYNTAX) {
4421 errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4422 }
4423 if (parseError.line != 1 || parseError.offset != 0) {
4424 errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4425 }
4426 }
4427
4428
TestBug12797()4429 void RBBITest::TestBug12797() {
4430 UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4431 UErrorCode status = U_ZERO_ERROR;
4432 UParseError parseError;
4433 RuleBasedBreakIterator bi(rules, parseError, status);
4434 if (U_FAILURE(status)) {
4435 errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4436 return;
4437 }
4438 UnicodeString text = "abc";
4439 bi.setText(text);
4440 bi.first();
4441 int32_t boundary = bi.next();
4442 if (boundary != 3) {
4443 errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4444 }
4445 }
4446
TestBug12918()4447 void RBBITest::TestBug12918() {
4448 // This test triggers an assertion failure in dictbe.cpp
4449 const UChar *crasherString = u"\u3325\u4a16";
4450 UErrorCode status = U_ZERO_ERROR;
4451 UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
4452 if (U_FAILURE(status)) {
4453 dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4454 return;
4455 }
4456 ubrk_first(iter);
4457 int32_t pos = 0;
4458 int32_t lastPos = -1;
4459 while((pos = ubrk_next(iter)) != UBRK_DONE) {
4460 if (pos <= lastPos) {
4461 errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4462 break;
4463 }
4464 }
4465 ubrk_close(iter);
4466 }
4467
TestBug12932()4468 void RBBITest::TestBug12932() {
4469 // Node Stack overflow in the RBBI rule parser caused a seg fault.
4470 UnicodeString ruleStr(
4471 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4472 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4473 "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4474 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4475 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4476 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4477
4478 UErrorCode status = U_ZERO_ERROR;
4479 UParseError parseError;
4480 RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
4481 if (status != U_BRK_RULE_SYNTAX) {
4482 errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4483 __FILE__, __LINE__, u_errorName(status));
4484 }
4485 }
4486
4487
4488 // Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
4489 // remain undevided by ICU char, word and line break.
TestEmoji()4490 void RBBITest::TestEmoji() {
4491 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4492 UErrorCode status = U_ZERO_ERROR;
4493
4494 CharString testFileName;
4495 testFileName.append(IntlTest::getSourceTestData(status), status);
4496 testFileName.appendPathPart("emoji-test.txt", status);
4497 if (U_FAILURE(status)) {
4498 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4499 return;
4500 }
4501 logln("Opening data file %s\n", testFileName.data());
4502
4503 int len;
4504 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
4505 if (U_FAILURE(status) || testFile == NULL) {
4506 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4507 return;
4508 }
4509 UnicodeString testFileAsString(testFile, len);
4510 delete [] testFile;
4511
4512 RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
4513 RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
4514 // hexMatcher group(1) is a hex number, or empty string if no hex number present.
4515 int32_t lineNumber = 0;
4516
4517 LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4518 LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4519 LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
4520 if (U_FAILURE(status)) {
4521 dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
4522 return;
4523 }
4524
4525 while (lineMatcher.find()) {
4526 ++lineNumber;
4527 UnicodeString line = lineMatcher.group(status);
4528 hexMatcher.reset(line);
4529 UnicodeString testString; // accumulates the emoji sequence.
4530 while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
4531 UnicodeString hex = hexMatcher.group(1, status);
4532 if (hex.length() > 8) {
4533 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
4534 break;
4535 }
4536 CharString hex8;
4537 hex8.appendInvariantChars(hex, status);
4538 UChar32 c = (UChar32)strtol(hex8.data(), NULL, 16);
4539 if (c<=0x10ffff) {
4540 testString.append(c);
4541 } else {
4542 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4543 __FILE__, __LINE__, lineNumber, hex8.data());
4544 break;
4545 }
4546 }
4547
4548 if (testString.length() > 1) {
4549 charBreaks->setText(testString);
4550 charBreaks->first();
4551 int32_t firstBreak = charBreaks->next();
4552 if (testString.length() != firstBreak) {
4553 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4554 __FILE__, __LINE__, lineNumber, firstBreak);
4555 }
4556 wordBreaks->setText(testString);
4557 wordBreaks->first();
4558 firstBreak = wordBreaks->next();
4559 if (testString.length() != firstBreak) {
4560 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4561 __FILE__, __LINE__, lineNumber, firstBreak);
4562 }
4563 lineBreaks->setText(testString);
4564 lineBreaks->first();
4565 firstBreak = lineBreaks->next();
4566 if (testString.length() != firstBreak) {
4567 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4568 __FILE__, __LINE__, lineNumber, firstBreak);
4569 }
4570 }
4571 }
4572 #endif
4573 }
4574
4575
4576 // TestBug12519 - Correct handling of Locales by assignment / copy / clone
4577
TestBug12519()4578 void RBBITest::TestBug12519() {
4579 UErrorCode status = U_ZERO_ERROR;
4580 LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4581 LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status));
4582 if (!assertSuccess(WHERE, status)) {
4583 dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
4584 return;
4585 }
4586 assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
4587
4588 assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
4589 assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
4590
4591 LocalPointer<RuleBasedBreakIterator>cloneEn(biEn->clone());
4592 assertTrue(WHERE, *biEn == *cloneEn);
4593 assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
4594
4595 LocalPointer<RuleBasedBreakIterator>cloneFr(biFr->clone());
4596 assertTrue(WHERE, *biFr == *cloneFr);
4597 assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
4598
4599 LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status));
4600 UnicodeString text("Hallo Welt");
4601 biDe->setText(text);
4602 assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
4603 *biDe = *biFr;
4604 assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
4605 }
4606
TestBug12677()4607 void RBBITest::TestBug12677() {
4608 // Check that stripping of comments from rules for getRules() is not confused by
4609 // the presence of '#' characters in the rules that do not introduce comments.
4610 UnicodeString rules(u"!!forward; \n"
4611 "$x = [ab#]; # a set with a # literal. \n"
4612 " # .; # a comment that looks sort of like a rule. \n"
4613 " '#' '?'; # a rule with a quoted # \n"
4614 );
4615
4616 UErrorCode status = U_ZERO_ERROR;
4617 UParseError pe;
4618 RuleBasedBreakIterator bi(rules, pe, status);
4619 assertSuccess(WHERE, status);
4620 UnicodeString rtRules = bi.getRules();
4621 assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "), rtRules);
4622 }
4623
4624
TestTableRedundancies()4625 void RBBITest::TestTableRedundancies() {
4626 UErrorCode status = U_ZERO_ERROR;
4627
4628 LocalPointer<RuleBasedBreakIterator> bi (
4629 (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
4630 assertSuccess(WHERE, status);
4631 if (U_FAILURE(status)) return;
4632
4633 RBBIDataWrapper *dw = bi->fData;
4634 const RBBIStateTable *fwtbl = dw->fForwardTable;
4635 int32_t numCharClasses = dw->fHeader->fCatCount;
4636 // printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
4637
4638 // Check for duplicate columns (character categories)
4639
4640 std::vector<UnicodeString> columns;
4641 for (int32_t column = 0; column < numCharClasses; column++) {
4642 UnicodeString s;
4643 for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
4644 RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4645 s.append(row->fNextState[column]);
4646 }
4647 columns.push_back(s);
4648 }
4649 // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
4650 for (int c1=1; c1<numCharClasses; c1++) {
4651 for (int c2 = c1+1; c2 < numCharClasses; c2++) {
4652 if (columns.at(c1) == columns.at(c2)) {
4653 errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
4654 goto out;
4655 }
4656 }
4657 }
4658 out:
4659
4660 // Check for duplicate states
4661 std::vector<UnicodeString> rows;
4662 for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
4663 UnicodeString s;
4664 RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4665 assertTrue(WHERE, row->fAccepting >= -1);
4666 s.append(row->fAccepting + 1); // values of -1 are expected.
4667 s.append(row->fLookAhead);
4668 s.append(row->fTagIdx);
4669 for (int32_t column = 0; column < numCharClasses; column++) {
4670 s.append(row->fNextState[column]);
4671 }
4672 rows.push_back(s);
4673 }
4674 for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
4675 for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
4676 if (rows.at(r1) == rows.at(r2)) {
4677 errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
4678 return;
4679 }
4680 }
4681 }
4682 }
4683
4684 // Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
4685 // even after next() has returned DONE.
4686
TestBug13447()4687 void RBBITest::TestBug13447() {
4688 UErrorCode status = U_ZERO_ERROR;
4689 LocalPointer<RuleBasedBreakIterator> bi(
4690 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4691 assertSuccess(WHERE, status);
4692 if (U_FAILURE(status)) return;
4693 UnicodeString data(u"1234");
4694 bi->setText(data);
4695 assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus());
4696 assertEquals(WHERE, 4, bi->next());
4697 assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4698 assertEquals(WHERE, UBRK_DONE, bi->next());
4699 assertEquals(WHERE, 4, bi->current());
4700 assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4701 }
4702
4703 // TestReverse exercises both the synthesized safe reverse rules and the logic
4704 // for filling the break iterator cache when starting from random positions
4705 // in the text.
4706 //
4707 // It's a monkey test, working on random data, with the expected data obtained
4708 // from forward iteration (no safe rules involved), comparing with results
4709 // when indexing into the interior of the string (safe rules needed).
4710
TestReverse()4711 void RBBITest::TestReverse() {
4712 UErrorCode status = U_ZERO_ERROR;
4713
4714 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4715 BreakIterator::createCharacterInstance(Locale::getEnglish(), status)));
4716 assertSuccess(WHERE, status, true);
4717 status = U_ZERO_ERROR;
4718 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4719 BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4720 assertSuccess(WHERE, status, true);
4721 status = U_ZERO_ERROR;
4722 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4723 BreakIterator::createLineInstance(Locale::getEnglish(), status)));
4724 assertSuccess(WHERE, status, true);
4725 status = U_ZERO_ERROR;
4726 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4727 BreakIterator::createSentenceInstance(Locale::getEnglish(), status)));
4728 assertSuccess(WHERE, status, true);
4729 }
4730
TestReverse(std::unique_ptr<RuleBasedBreakIterator> bi)4731 void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
4732 if (!bi) {
4733 return;
4734 }
4735
4736 // From the mapping trie in the break iterator's internal data, create a
4737 // vector of UnicodeStrings, one for each character category, containing
4738 // all of the code points that map to that category. Unicode planes 0 and 1 only,
4739 // to avoid an execess of unassigned code points.
4740
4741 RBBIDataWrapper *data = bi->fData;
4742 int32_t categoryCount = data->fHeader->fCatCount;
4743 UTrie2 *trie = data->fTrie;
4744
4745 std::vector<UnicodeString> strings(categoryCount, UnicodeString());
4746 for (int cp=0; cp<0x1fff0; ++cp) {
4747 int cat = utrie2_get32(trie, cp);
4748 cat &= ~0x4000; // And off the dictionary bit from the category.
4749 assertTrue(WHERE, cat < categoryCount && cat >= 0);
4750 if (cat < 0 || cat >= categoryCount) return;
4751 strings[cat].append(cp);
4752 }
4753
4754 icu_rand randomGen;
4755 const int testStringLength = 10000;
4756 UnicodeString testString;
4757
4758 for (int i=0; i<testStringLength; ++i) {
4759 int charClass = randomGen() % categoryCount;
4760 if (strings[charClass].length() > 0) {
4761 int cp = strings[charClass].char32At(randomGen() % strings[charClass].length());
4762 testString.append(cp);
4763 }
4764 }
4765
4766 typedef std::pair<UBool, int32_t> Result;
4767 std::vector<Result> expectedResults;
4768 bi->setText(testString);
4769 for (int i=0; i<testString.length(); ++i) {
4770 bool isboundary = bi->isBoundary(i);
4771 int ruleStatus = bi->getRuleStatus();
4772 expectedResults.push_back(std::make_pair(isboundary, ruleStatus));
4773 }
4774
4775 for (int i=testString.length()-1; i>=0; --i) {
4776 bi->setText(testString); // clears the internal break cache
4777 Result expected = expectedResults[i];
4778 assertEquals(WHERE, expected.first, bi->isBoundary(i));
4779 assertEquals(WHERE, expected.second, bi->getRuleStatus());
4780 }
4781 }
4782
4783
4784 // Ticket 13692 - finding word boundaries in very large numbers or words could
4785 // be very time consuming. When the problem was present, this void test
4786 // would run more than fifteen minutes, which is to say, the failure was noticeale.
4787
TestBug13692()4788 void RBBITest::TestBug13692() {
4789 UErrorCode status = U_ZERO_ERROR;
4790 LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4791 BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4792 if (!assertSuccess(WHERE, status, true)) {
4793 return;
4794 }
4795 constexpr int32_t LENGTH = 1000000;
4796 UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH);
4797 for (int i=0; i<20; i+=2) {
4798 longNumber.setCharAt(i, u' ');
4799 }
4800 bi->setText(longNumber);
4801 assertFalse(WHERE, bi->isBoundary(LENGTH-5));
4802 assertSuccess(WHERE, status);
4803 }
4804
4805
TestProperties()4806 void RBBITest::TestProperties() {
4807 UErrorCode errorCode = U_ZERO_ERROR;
4808 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4809 if (!prependSet.isEmpty()) {
4810 errln(
4811 "[:GCB=Prepend:] is not empty any more. "
4812 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4813 "change this test to the opposite condition.");
4814 }
4815 }
4816
4817
4818 //
4819 // TestDebug - A place-holder test for debugging purposes.
4820 // For putting in fragments of other tests that can be invoked
4821 // for tracing without a lot of unwanted extra stuff happening.
4822 //
TestDebug(void)4823 void RBBITest::TestDebug(void) {
4824 UErrorCode status = U_ZERO_ERROR;
4825 LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4826 BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4827 if (!assertSuccess(WHERE, status, true)) {
4828 return;
4829 }
4830 const UnicodeString &rules = bi->getRules();
4831 UParseError pe;
4832 LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
4833 assertSuccess(WHERE, status);
4834 }
4835
4836
//
//  TestDebugRules   A stub test for use in debugging rule compilation problems.
//                   Can be freely altered as needed or convenient.
//                   Leave disabled - #ifdef'ed out - when not actively debugging. The rule source
//                   data files may not be available in all environments.
//                   Any permanent test cases should be moved to rbbitst.txt
//                   (see Bug 20303 in that file, for example), or to another test function in this file.
//
void RBBITest::TestDebugRules() {
#if 0
    // Inline rule source to compile. To compile the contents of line.txt
    // instead, re-enable the "rules = testFile.get()" line below.
    const char16_t *rules = u""
        "!!quoted_literals_only; \n"
        "!!chain; \n"
        "!!lookAheadHardBreak; \n"
        " \n"
        // "[a] / ; \n"
        "[a] [b] / [c] [d]; \n"
        "[a] [b] / [c] [d] {100}; \n"
        "[x] [a] [b] / [c] [d] {100}; \n"
        "[a] [b] [c] / [d] {100}; \n"
        //" [c] [d] / [e] [f]; \n"
        //"[a] [b] / [c]; \n"
        ;

    // Read <data directory>/brkitr/rules/line.txt. As written, the file
    // contents are loaded but not used unless the line below is re-enabled.
    UErrorCode status = U_ZERO_ERROR;
    CharString path(pathToDataDirectory(), status);
    path.appendPathPart("brkitr", status);
    path.appendPathPart("rules", status);
    path.appendPathPart("line.txt", status);
    int len;
    std::unique_ptr<UChar []> testFile(ReadAndConvertFile(path.data(), len, "UTF-8", status));
    if (!assertSuccess(WHERE, status)) {
        return;
    }

    UParseError pe;
    // rules = testFile.get();
    RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rules, pe, status);

    // Report a rule compilation failure; the iterator is released either way.
    if (!assertSuccess(WHERE, status)) {
        delete bi;
        return;
    }
    // bi->dumpTables();

    delete bi;
#endif
}
4885
4886 #if U_ENABLE_TRACING
// Records of the trace callbacks seen since the last SetupTestTrace() call.
// Only function numbers in the UTRACE_UBRK_* range are recorded.
static std::vector<std::string> gData;      // string argument passed to each traceData() call
static std::vector<int32_t> gEntryFn;       // function number of each traceEntry() call
static std::vector<int32_t> gExitFn;        // function number of each traceExit() call
static std::vector<int32_t> gDataFn;        // function number of each traceData() call
4891
traceData(const void *,int32_t fnNumber,int32_t,const char *,va_list args)4892 static void U_CALLCONV traceData(
4893 const void*,
4894 int32_t fnNumber,
4895 int32_t,
4896 const char *,
4897 va_list args) {
4898 if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
4899 const char* data = va_arg(args, const char*);
4900 gDataFn.push_back(fnNumber);
4901 gData.push_back(data);
4902 }
4903 }
4904
traceEntry(const void *,int32_t fnNumber)4905 static void traceEntry(const void *, int32_t fnNumber) {
4906 if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
4907 gEntryFn.push_back(fnNumber);
4908 }
4909 }
4910
traceExit(const void *,int32_t fnNumber,const char *,va_list)4911 static void traceExit(const void *, int32_t fnNumber, const char *, va_list) {
4912 if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
4913 gExitFn.push_back(fnNumber);
4914 }
4915 }
4916
4917
assertTestTraceResult(int32_t fnNumber,const char * expectedData)4918 void RBBITest::assertTestTraceResult(int32_t fnNumber, const char* expectedData) {
4919 assertEquals("utrace_entry should be called ", 1, gEntryFn.size());
4920 assertEquals("utrace_entry should be called with ", fnNumber, gEntryFn[0]);
4921 assertEquals("utrace_exit should be called ", 1, gExitFn.size());
4922 assertEquals("utrace_exit should be called with ", fnNumber, gExitFn[0]);
4923
4924 if (expectedData == nullptr) {
4925 assertEquals("utrace_data should not be called ", 0, gDataFn.size());
4926 assertEquals("utrace_data should not be called ", 0, gData.size());
4927 } else {
4928 assertEquals("utrace_data should be called ", 1, gDataFn.size());
4929 assertEquals("utrace_data should be called with ", fnNumber, gDataFn[0]);
4930 assertEquals("utrace_data should be called ", 1, gData.size());
4931 assertEquals("utrace_data should pass in ", expectedData, gData[0].c_str());
4932 }
4933 }
4934
SetupTestTrace()4935 void SetupTestTrace() {
4936 gEntryFn.clear();
4937 gExitFn.clear();
4938 gDataFn.clear();
4939 gData.clear();
4940
4941 const void* context = nullptr;
4942 utrace_setFunctions(context, traceEntry, traceExit, traceData);
4943 utrace_setLevel(UTRACE_INFO);
4944 }
4945
TestTraceCreateCharacter(void)4946 void RBBITest::TestTraceCreateCharacter(void) {
4947 SetupTestTrace();
4948 IcuTestErrorCode status(*this, "TestTraceCreateCharacter");
4949 LocalPointer<BreakIterator> brkitr(
4950 BreakIterator::createCharacterInstance("zh-CN", status));
4951 status.errIfFailureAndReset();
4952 assertTestTraceResult(UTRACE_UBRK_CREATE_CHARACTER, nullptr);
4953 }
4954
TestTraceCreateTitle(void)4955 void RBBITest::TestTraceCreateTitle(void) {
4956 SetupTestTrace();
4957 IcuTestErrorCode status(*this, "TestTraceCreateTitle");
4958 LocalPointer<BreakIterator> brkitr(
4959 BreakIterator::createTitleInstance("zh-CN", status));
4960 status.errIfFailureAndReset();
4961 assertTestTraceResult(UTRACE_UBRK_CREATE_TITLE, nullptr);
4962 }
4963
TestTraceCreateSentence(void)4964 void RBBITest::TestTraceCreateSentence(void) {
4965 SetupTestTrace();
4966 IcuTestErrorCode status(*this, "TestTraceCreateSentence");
4967 LocalPointer<BreakIterator> brkitr(
4968 BreakIterator::createSentenceInstance("zh-CN", status));
4969 status.errIfFailureAndReset();
4970 assertTestTraceResult(UTRACE_UBRK_CREATE_SENTENCE, nullptr);
4971 }
4972
TestTraceCreateWord(void)4973 void RBBITest::TestTraceCreateWord(void) {
4974 SetupTestTrace();
4975 IcuTestErrorCode status(*this, "TestTraceCreateWord");
4976 LocalPointer<BreakIterator> brkitr(
4977 BreakIterator::createWordInstance("zh-CN", status));
4978 status.errIfFailureAndReset();
4979 assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
4980 }
4981
TestTraceCreateLine(void)4982 void RBBITest::TestTraceCreateLine(void) {
4983 SetupTestTrace();
4984 IcuTestErrorCode status(*this, "TestTraceCreateLine");
4985 LocalPointer<BreakIterator> brkitr(
4986 BreakIterator::createLineInstance("zh-CN", status));
4987 status.errIfFailureAndReset();
4988 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "");
4989 }
4990
TestTraceCreateLineStrict(void)4991 void RBBITest::TestTraceCreateLineStrict(void) {
4992 SetupTestTrace();
4993 IcuTestErrorCode status(*this, "TestTraceCreateLineStrict");
4994 LocalPointer<BreakIterator> brkitr(
4995 BreakIterator::createLineInstance("zh-CN-u-lb-strict", status));
4996 status.errIfFailureAndReset();
4997 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "strict");
4998 }
4999
TestTraceCreateLineNormal(void)5000 void RBBITest::TestTraceCreateLineNormal(void) {
5001 SetupTestTrace();
5002 IcuTestErrorCode status(*this, "TestTraceCreateLineNormal");
5003 LocalPointer<BreakIterator> brkitr(
5004 BreakIterator::createLineInstance("zh-CN-u-lb-normal", status));
5005 status.errIfFailureAndReset();
5006 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "normal");
5007 }
5008
TestTraceCreateLineLoose(void)5009 void RBBITest::TestTraceCreateLineLoose(void) {
5010 SetupTestTrace();
5011 IcuTestErrorCode status(*this, "TestTraceCreateLineLoose");
5012 LocalPointer<BreakIterator> brkitr(
5013 BreakIterator::createLineInstance("zh-CN-u-lb-loose", status));
5014 status.errIfFailureAndReset();
5015 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "loose");
5016 }
5017
TestTraceCreateBreakEngine(void)5018 void RBBITest::TestTraceCreateBreakEngine(void) {
5019 rbbi_cleanup();
5020 SetupTestTrace();
5021 IcuTestErrorCode status(*this, "TestTraceCreateBreakEngine");
5022 LocalPointer<BreakIterator> brkitr(
5023 BreakIterator::createWordInstance("zh-CN", status));
5024 status.errIfFailureAndReset();
5025 assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5026
5027 // To word break the following text, BreakIterator will create 5 dictionary
5028 // break engine internally.
5029 brkitr->setText(
5030 u"test "
5031 u"測試 " // Hani
5032 u"សាកល្បង " // Khmr
5033 u"ທົດສອບ " // Laoo
5034 u"စမ်းသပ်မှု " // Mymr
5035 u"ทดสอบ " // Thai
5036 u"test "
5037 );
5038
5039 // Loop through all the text.
5040 while (brkitr->next() > 0) ;
5041
5042 assertEquals("utrace_entry should be called ", 6, gEntryFn.size());
5043 assertEquals("utrace_exit should be called ", 6, gExitFn.size());
5044 assertEquals("utrace_data should be called ", 5, gDataFn.size());
5045
5046 for (std::vector<int>::size_type i = 0; i < gDataFn.size(); i++) {
5047 assertEquals("utrace_entry should be called ",
5048 UTRACE_UBRK_CREATE_BREAK_ENGINE, gEntryFn[i+1]);
5049 assertEquals("utrace_exit should be called ",
5050 UTRACE_UBRK_CREATE_BREAK_ENGINE, gExitFn[i+1]);
5051 assertEquals("utrace_data should be called ",
5052 UTRACE_UBRK_CREATE_BREAK_ENGINE, gDataFn[i]);
5053 }
5054
5055 assertEquals("utrace_data should pass ", "Hani", gData[0].c_str());
5056 assertEquals("utrace_data should pass ", "Khmr", gData[1].c_str());
5057 assertEquals("utrace_data should pass ", "Laoo", gData[2].c_str());
5058 assertEquals("utrace_data should pass ", "Mymr", gData[3].c_str());
5059 assertEquals("utrace_data should pass ", "Thai", gData[4].c_str());
5060
5061 }
5062 #endif
5063
5064 #endif // #if !UCONFIG_NO_BREAK_ITERATION
5065