1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 1999-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8 /************************************************************************
9 * Date Name Description
10 * 12/15/99 Madhu Creation.
11 * 01/12/2000 Madhu Updated for changed API and added new tests
12 ************************************************************************/
13
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_BREAK_ITERATION
16
17 #include <stdio.h>
18 #include <stdlib.h>
19 #include <string.h>
20 #include <utility>
21 #include <vector>
22
23 #include "unicode/brkiter.h"
24 #include "unicode/localpointer.h"
25 #include "unicode/numfmt.h"
26 #include "unicode/rbbi.h"
27 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
28 #include "unicode/regex.h"
29 #endif
30 #include "unicode/schriter.h"
31 #include "unicode/uchar.h"
32 #include "unicode/utf16.h"
33 #include "unicode/ucnv.h"
34 #include "unicode/uniset.h"
35 #include "unicode/uscript.h"
36 #include "unicode/ustring.h"
37 #include "unicode/utext.h"
38
39 #include "charstr.h"
40 #include "cmemory.h"
41 #include "cstr.h"
42 #include "intltest.h"
43 #include "rbbitst.h"
44 #include "rbbidata.h"
45 #include "utypeinfo.h" // for 'typeid' to work
46 #include "uvector.h"
47 #include "uvectr32.h"
48
49
50 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
51 #include "unicode/filteredbrk.h"
52 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
53
// TEST_ASSERT(x): if the condition x evaluates to false, report a failure through errln(),
// including the source file and line of the macro invocation. Execution continues afterwards.
// The expansion is a plain brace-enclosed block and calls errln() unqualified, so errln()
// must be callable at the point of use.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d", __FILE__, __LINE__);}}

// TEST_ASSERT_SUCCESS(errcode): if errcode is a failing UErrorCode, report it through
// errcheckln() together with the source file, line and the error's name. Like TEST_ASSERT,
// it only reports; callers still have to test the status themselves if they want to bail out.
#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
    errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
59
60 //---------------------------------------------
61 // runIndexedTest
62 //---------------------------------------------
63
64
// Note: Before adding new tests to this file, check whether the desired test data can
//       simply be added to the file testdata/rbbitst.txt. In most cases it can;
//       that is much less work than writing a new test, the diagnostic output in the event
//       of failures is good, and the test data file is shared with ICU4J, so eventually the
//       test will run there as well, without additional effort.
70
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)71 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
72 {
73 if (exec) logln("TestSuite RuleBasedBreakIterator: ");
74 fTestParams = params;
75
76 TESTCASE_AUTO_BEGIN;
77 #if !UCONFIG_NO_FILE_IO
78 TESTCASE_AUTO(TestBug4153072);
79 #endif
80 #if !UCONFIG_NO_FILE_IO
81 TESTCASE_AUTO(TestUnicodeFiles);
82 #endif
83 TESTCASE_AUTO(TestGetAvailableLocales);
84 TESTCASE_AUTO(TestGetDisplayName);
85 #if !UCONFIG_NO_FILE_IO
86 TESTCASE_AUTO(TestEndBehaviour);
87 TESTCASE_AUTO(TestWordBreaks);
88 TESTCASE_AUTO(TestWordBoundary);
89 TESTCASE_AUTO(TestLineBreaks);
90 TESTCASE_AUTO(TestSentBreaks);
91 TESTCASE_AUTO(TestExtended);
92 #endif
93 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
94 TESTCASE_AUTO(TestMonkey);
95 #endif
96 #if !UCONFIG_NO_FILE_IO
97 TESTCASE_AUTO(TestBug3818);
98 #endif
99 TESTCASE_AUTO(TestDebug);
100 #if !UCONFIG_NO_FILE_IO
101 TESTCASE_AUTO(TestBug5775);
102 #endif
103 TESTCASE_AUTO(TestBug9983);
104 TESTCASE_AUTO(TestDictRules);
105 TESTCASE_AUTO(TestBug5532);
106 TESTCASE_AUTO(TestBug7547);
107 TESTCASE_AUTO(TestBug12797);
108 TESTCASE_AUTO(TestBug12918);
109 TESTCASE_AUTO(TestBug12932);
110 TESTCASE_AUTO(TestEmoji);
111 TESTCASE_AUTO(TestBug12519);
112 TESTCASE_AUTO(TestBug12677);
113 TESTCASE_AUTO(TestTableRedundancies);
114 TESTCASE_AUTO(TestBug13447);
115 TESTCASE_AUTO(TestReverse);
116 TESTCASE_AUTO(TestBug13692);
117 TESTCASE_AUTO_END;
118 }
119
120
121 //--------------------------------------------------------------------------------------
122 //
123 // RBBITest constructor and destructor
124 //
125 //--------------------------------------------------------------------------------------
126
RBBITest()127 RBBITest::RBBITest() {
128 fTestParams = NULL;
129 }
130
131
~RBBITest()132 RBBITest::~RBBITest() {
133 }
134
135
printStringBreaks(UText * tstr,int expected[],int expectedCount)136 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
137 UErrorCode status = U_ZERO_ERROR;
138 char name[100];
139 printf("code alpha extend alphanum type word sent line name\n");
140 int nextExpectedIndex = 0;
141 utext_setNativeIndex(tstr, 0);
142 for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
143 if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
144 printf("------------------------------------------------ %d\n", j);
145 ++nextExpectedIndex;
146 }
147
148 UChar32 c = utext_next32(tstr);
149 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
150 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
151 u_isUAlphabetic(c),
152 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
153 u_isalnum(c),
154 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
155 u_charType(c),
156 U_SHORT_PROPERTY_NAME),
157 u_getPropertyValueName(UCHAR_WORD_BREAK,
158 u_getIntPropertyValue(c,
159 UCHAR_WORD_BREAK),
160 U_SHORT_PROPERTY_NAME),
161 u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
162 u_getIntPropertyValue(c,
163 UCHAR_SENTENCE_BREAK),
164 U_SHORT_PROPERTY_NAME),
165 u_getPropertyValueName(UCHAR_LINE_BREAK,
166 u_getIntPropertyValue(c,
167 UCHAR_LINE_BREAK),
168 U_SHORT_PROPERTY_NAME),
169 name);
170 }
171 }
172
173
printStringBreaks(const UnicodeString & ustr,int expected[],int expectedCount)174 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
175 UErrorCode status = U_ZERO_ERROR;
176 UText *tstr = NULL;
177 tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
178 if (U_FAILURE(status)) {
179 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
180 return;
181 }
182 printStringBreaks(tstr, expected, expectedCount);
183 utext_close(tstr);
184 }
185
186
TestBug3818()187 void RBBITest::TestBug3818() {
188 UErrorCode status = U_ZERO_ERROR;
189
190 // Four Thai words...
191 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
192 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
193 UnicodeString thaiStr(thaiWordData);
194
195 BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
196 if (U_FAILURE(status) || bi == NULL) {
197 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
198 return;
199 }
200 bi->setText(thaiStr);
201
202 int32_t startOfSecondWord = bi->following(1);
203 if (startOfSecondWord != 4) {
204 errln("Fail at file %s, line %d expected start of word at 4, got %d",
205 __FILE__, __LINE__, startOfSecondWord);
206 }
207 startOfSecondWord = bi->following(0);
208 if (startOfSecondWord != 4) {
209 errln("Fail at file %s, line %d expected start of word at 4, got %d",
210 __FILE__, __LINE__, startOfSecondWord);
211 }
212 delete bi;
213 }
214
215
216 //---------------------------------------------
217 //
218 // other tests
219 //
220 //---------------------------------------------
221
TestGetAvailableLocales()222 void RBBITest::TestGetAvailableLocales()
223 {
224 int32_t locCount = 0;
225 const Locale* locList = BreakIterator::getAvailableLocales(locCount);
226
227 if (locCount == 0)
228 dataerrln("getAvailableLocales() returned an empty list!");
229 // Just make sure that it's returning good memory.
230 int32_t i;
231 for (i = 0; i < locCount; ++i) {
232 logln(locList[i].getName());
233 }
234 }
235
236 //Testing the BreakIterator::getDisplayName() function
TestGetDisplayName()237 void RBBITest::TestGetDisplayName()
238 {
239 UnicodeString result;
240
241 BreakIterator::getDisplayName(Locale::getUS(), result);
242 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
243 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
244 + result);
245
246 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
247 if (result != "French (France)")
248 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
249 + result);
250 }
251 /**
252 * Test End Behaviour
253 * @bug 4068137
254 */
TestEndBehaviour()255 void RBBITest::TestEndBehaviour()
256 {
257 UErrorCode status = U_ZERO_ERROR;
258 UnicodeString testString("boo.");
259 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
260 if (U_FAILURE(status))
261 {
262 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
263 return;
264 }
265 wb->setText(testString);
266
267 if (wb->first() != 0)
268 errln("Didn't get break at beginning of string.");
269 if (wb->next() != 3)
270 errln("Didn't get break before period in \"boo.\"");
271 if (wb->current() != 4 && wb->next() != 4)
272 errln("Didn't get break at end of string.");
273 delete wb;
274 }
275 /*
276 * @bug 4153072
277 */
TestBug4153072()278 void RBBITest::TestBug4153072() {
279 UErrorCode status = U_ZERO_ERROR;
280 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
281 if (U_FAILURE(status))
282 {
283 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
284 return;
285 }
286 UnicodeString str("...Hello, World!...");
287 int32_t begin = 3;
288 int32_t end = str.length() - 3;
289 UBool onBoundary;
290
291 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
292 iter->adoptText(textIterator);
293 int index;
294 // Note: with the switch to UText, there is no way to restrict the
295 // iteration range to begin at an index other than zero.
296 // String character iterators created with a non-zero bound are
297 // treated by RBBI as being empty.
298 for (index = -1; index < begin + 1; ++index) {
299 onBoundary = iter->isBoundary(index);
300 if (index == 0? !onBoundary : onBoundary) {
301 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
302 " and begin index = " + begin);
303 }
304 }
305 delete iter;
306 }
307
308
309 //
310 // Test for problem reported by Ashok Matoria on 9 July 2007
311 // One.<kSoftHyphen><kSpace>Two.
312 //
313 // Sentence break at start (0) and then on calling next() it breaks at
314 // 'T' of "Two". Now, at this point if I do next() and
315 // then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
316 //
TestBug5775()317 void RBBITest::TestBug5775() {
318 UErrorCode status = U_ZERO_ERROR;
319 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
320 TEST_ASSERT_SUCCESS(status);
321 if (U_FAILURE(status)) {
322 return;
323 }
324 // Check for status first for better handling of no data errors.
325 TEST_ASSERT(bi != NULL);
326 if (bi == NULL) {
327 return;
328 }
329
330 UnicodeString s("One.\\u00ad Two.", -1, US_INV);
331 // 01234 56789
332 s = s.unescape();
333 bi->setText(s);
334 int pos = bi->next();
335 TEST_ASSERT(pos == 6);
336 pos = bi->next();
337 TEST_ASSERT(pos == 10);
338 pos = bi->previous();
339 TEST_ASSERT(pos == 6);
340 delete bi;
341 }
342
343
344
345 //------------------------------------------------------------------------------
346 //
347 // RBBITest::Extended Run RBBI Tests from an external test data file
348 //
349 //------------------------------------------------------------------------------
350
351 struct TestParams {
352 BreakIterator *bi; // Break iterator is set while parsing test source.
353 // Changed out whenever test data changes break type.
354
355 UnicodeString dataToBreak; // Data that is built up while parsing the test.
356 UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString.
357 UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak.
358 UVector32 *srcCol;
359
360 UText *textToBreak; // UText, could be UTF8 or UTF16.
361 UVector32 *textMap; // Map from UTF-16 dataToBreak offsets to UText offsets.
362 CharString utf8String; // UTF-8 form of text to break.
363
TestParamsTestParams364 TestParams(UErrorCode &status) : dataToBreak() {
365 bi = NULL;
366 expectedBreaks = new UVector32(status);
367 srcLine = new UVector32(status);
368 srcCol = new UVector32(status);
369 textToBreak = NULL;
370 textMap = new UVector32(status);
371 }
372
~TestParamsTestParams373 ~TestParams() {
374 delete bi;
375 delete expectedBreaks;
376 delete srcLine;
377 delete srcCol;
378 utext_close(textToBreak);
379 delete textMap;
380 }
381
382 int32_t getSrcLine(int32_t bp);
383 int32_t getExpectedBreak(int32_t bp);
384 int32_t getSrcCol(int32_t bp);
385
386 void setUTF16(UErrorCode &status);
387 void setUTF8(UErrorCode &status);
388 };
389
390 // Append a UnicodeString to a CharString with UTF-8 encoding.
391 // Substitute any invalid chars.
392 // Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
CharStringAppend(CharString & dest,const UnicodeString & src,UErrorCode & status)393 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
394 if (U_FAILURE(status)) {
395 return;
396 }
397 int32_t utf8Length;
398 u_strToUTF8WithSub(NULL, 0, &utf8Length, // Output Buffer, NULL for preflight.
399 src.getBuffer(), src.length(), // UTF-16 data
400 0xfffd, NULL, // Substitution char, number of subs.
401 &status);
402 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
403 return;
404 }
405 status = U_ZERO_ERROR;
406 int32_t capacity;
407 char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
408 u_strToUTF8WithSub(buffer, utf8Length, NULL,
409 src.getBuffer(), src.length(),
410 0xfffd, NULL, &status);
411 dest.append(buffer, utf8Length, status);
412 }
413
414
setUTF16(UErrorCode & status)415 void TestParams::setUTF16(UErrorCode &status) {
416 textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
417 textMap->removeAllElements();
418 for (int32_t i=0; i<dataToBreak.length(); i++) {
419 if (i == dataToBreak.getChar32Start(i)) {
420 textMap->addElement(i, status);
421 } else {
422 textMap->addElement(-1, status);
423 }
424 }
425 textMap->addElement(dataToBreak.length(), status);
426 U_ASSERT(dataToBreak.length() + 1 == textMap->size());
427 }
428
429
setUTF8(UErrorCode & status)430 void TestParams::setUTF8(UErrorCode &status) {
431 if (U_FAILURE(status)) {
432 return;
433 }
434 utf8String.clear();
435 CharStringAppend(utf8String, dataToBreak, status);
436 textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
437 if (U_FAILURE(status)) {
438 return;
439 }
440
441 textMap->removeAllElements();
442 int32_t utf16Index = 0;
443 for (;;) {
444 textMap->addElement(utf16Index, status);
445 UChar32 c32 = utext_current32(textToBreak);
446 if (c32 < 0) {
447 break;
448 }
449 utf16Index += U16_LENGTH(c32);
450 utext_next32(textToBreak);
451 while (textMap->size() < utext_getNativeIndex(textToBreak)) {
452 textMap->addElement(-1, status);
453 }
454 }
455 U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
456 }
457
458
getSrcLine(int32_t bp)459 int32_t TestParams::getSrcLine(int32_t bp) {
460 if (bp >= textMap->size()) {
461 bp = textMap->size() - 1;
462 }
463 int32_t i = 0;
464 for(; bp >= 0 ; --bp) {
465 // Move to a character boundary if we are not on one already.
466 i = textMap->elementAti(bp);
467 if (i >= 0) {
468 break;
469 }
470 }
471 return srcLine->elementAti(i);
472 }
473
474
getExpectedBreak(int32_t bp)475 int32_t TestParams::getExpectedBreak(int32_t bp) {
476 if (bp >= textMap->size()) {
477 return 0;
478 }
479 int32_t i = textMap->elementAti(bp);
480 int32_t retVal = 0;
481 if (i >= 0) {
482 retVal = expectedBreaks->elementAti(i);
483 }
484 return retVal;
485 }
486
487
getSrcCol(int32_t bp)488 int32_t TestParams::getSrcCol(int32_t bp) {
489 if (bp >= textMap->size()) {
490 bp = textMap->size() - 1;
491 }
492 int32_t i = 0;
493 for(; bp >= 0; --bp) {
494 // Move bp to a character boundary if we are not on one already.
495 i = textMap->elementAti(bp);
496 if (i >= 0) {
497 break;
498 }
499 }
500 return srcCol->elementAti(i);
501 }
502
503
executeTest(TestParams * t,UErrorCode & status)504 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
505 int32_t bp;
506 int32_t prevBP;
507 int32_t i;
508
509 TEST_ASSERT_SUCCESS(status);
510 if (U_FAILURE(status)) {
511 return;
512 }
513
514 if (t->bi == NULL) {
515 return;
516 }
517
518 t->bi->setText(t->textToBreak, status);
519 //
520 // Run the iterator forward
521 //
522 prevBP = -1;
523 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
524 if (prevBP == bp) {
525 // Fail for lack of forward progress.
526 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
527 bp, t->getSrcLine(bp), t->getSrcCol(bp));
528 break;
529 }
530
531 // Check that there we didn't miss an expected break between the last one
532 // and this one.
533 for (i=prevBP+1; i<bp; i++) {
534 if (t->getExpectedBreak(i) != 0) {
535 int expected[] = {0, i};
536 printStringBreaks(t->dataToBreak, expected, 2);
537 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
538 i, t->getSrcLine(i), t->getSrcCol(i));
539 }
540 }
541
542 // Check that the break we did find was expected
543 if (t->getExpectedBreak(bp) == 0) {
544 int expected[] = {0, bp};
545 printStringBreaks(t->textToBreak, expected, 2);
546 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
547 bp, t->getSrcLine(bp), t->getSrcCol(bp));
548 } else {
549 // The break was expected.
550 // Check that the {nnn} tag value is correct.
551 int32_t expectedTagVal = t->getExpectedBreak(bp);
552 if (expectedTagVal == -1) {
553 expectedTagVal = 0;
554 }
555 int32_t line = t->getSrcLine(bp);
556 int32_t rs = t->bi->getRuleStatus();
557 if (rs != expectedTagVal) {
558 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
559 " Actual, Expected status = %4d, %4d",
560 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
561 }
562 }
563
564 prevBP = bp;
565 }
566
567 // Verify that there were no missed expected breaks after the last one found
568 for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
569 if (t->getExpectedBreak(i) != 0) {
570 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
571 i, t->getSrcLine(i), t->getSrcCol(i));
572 }
573 }
574
575 //
576 // Run the iterator backwards, verify that the same breaks are found.
577 //
578 prevBP = utext_nativeLength(t->textToBreak)+2; // start with a phony value for the last break pos seen.
579 bp = t->bi->last();
580 while (bp != BreakIterator::DONE) {
581 if (prevBP == bp) {
582 // Fail for lack of progress.
583 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
584 bp, t->getSrcLine(bp), t->getSrcCol(bp));
585 break;
586 }
587
588 // Check that we didn't miss an expected break between the last one
589 // and this one. (UVector returns zeros for index out of bounds.)
590 for (i=prevBP-1; i>bp; i--) {
591 if (t->getExpectedBreak(i) != 0) {
592 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
593 i, t->getSrcLine(i), t->getSrcCol(i));
594 }
595 }
596
597 // Check that the break we did find was expected
598 if (t->getExpectedBreak(bp) == 0) {
599 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
600 bp, t->getSrcLine(bp), t->getSrcCol(bp));
601 } else {
602 // The break was expected.
603 // Check that the {nnn} tag value is correct.
604 int32_t expectedTagVal = t->getExpectedBreak(bp);
605 if (expectedTagVal == -1) {
606 expectedTagVal = 0;
607 }
608 int line = t->getSrcLine(bp);
609 int32_t rs = t->bi->getRuleStatus();
610 if (rs != expectedTagVal) {
611 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
612 " Actual, Expected status = %4d, %4d",
613 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
614 }
615 }
616
617 prevBP = bp;
618 bp = t->bi->previous();
619 }
620
621 // Verify that there were no missed breaks prior to the last one found
622 for (i=prevBP-1; i>=0; i--) {
623 if (t->getExpectedBreak(i) != 0) {
624 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
625 i, t->getSrcLine(i), t->getSrcCol(i));
626 }
627 }
628
629 // Check isBoundary()
630 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
631 UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
632 UBool boundaryFound = t->bi->isBoundary(i);
633 if (boundaryExpected != boundaryFound) {
634 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
635 " Expected, Actual= %s, %s",
636 i, t->getSrcLine(i), t->getSrcCol(i),
637 boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
638 }
639 }
640
641 // Check following()
642 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
643 int32_t actualBreak = t->bi->following(i);
644 int32_t expectedBreak = BreakIterator::DONE;
645 for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
646 if (t->getExpectedBreak(j) != 0) {
647 expectedBreak = j;
648 break;
649 }
650 }
651 if (expectedBreak != actualBreak) {
652 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
653 " Expected, Actual= %d, %d",
654 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
655 }
656 }
657
658 // Check preceding()
659 for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
660 int32_t actualBreak = t->bi->preceding(i);
661 int32_t expectedBreak = BreakIterator::DONE;
662
663 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
664 // preceding(trailing byte) will return the index of some preceding code point,
665 // not the lead byte of the current code point, even though that has a smaller index.
666 // Therefore, start looking at the expected break data not at i-1, but at
667 // the start of code point index - 1.
668 utext_setNativeIndex(t->textToBreak, i);
669 int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
670 for (; j >= 0; j--) {
671 if (t->getExpectedBreak(j) != 0) {
672 expectedBreak = j;
673 break;
674 }
675 }
676 if (expectedBreak != actualBreak) {
677 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
678 " Expected, Actual= %d, %d",
679 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
680 }
681 }
682 }
683
684
TestExtended()685 void RBBITest::TestExtended() {
686 // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
687 // data driven test closely entangles filtered and regular data.
688 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
689 UErrorCode status = U_ZERO_ERROR;
690 Locale locale("");
691
692 TestParams tp(status);
693
694 RegexMatcher localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
695 if (U_FAILURE(status)) {
696 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
697 }
698
699 //
700 // Open and read the test data file.
701 //
702 const char *testDataDirectory = IntlTest::getSourceTestData(status);
703 CharString testFileName(testDataDirectory, -1, status);
704 testFileName.append("rbbitst.txt", -1, status);
705
706 int len;
707 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
708 if (U_FAILURE(status)) {
709 errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
710 return;
711 }
712
713 bool skipTest = false; // Skip this test?
714
715 //
716 // Put the test data into a UnicodeString
717 //
718 UnicodeString testString(FALSE, testFile, len);
719
720 enum EParseState{
721 PARSE_COMMENT,
722 PARSE_TAG,
723 PARSE_DATA,
724 PARSE_NUM,
725 PARSE_RULES
726 }
727 parseState = PARSE_TAG;
728
729 EParseState savedState = PARSE_TAG;
730
731 int32_t lineNum = 1;
732 int32_t colStart = 0;
733 int32_t column = 0;
734 int32_t charIdx = 0;
735
736 int32_t tagValue = 0; // The numeric value of a <nnn> tag.
737
738 UnicodeString rules; // Holds rules from a <rules> ... </rules> block
739 int32_t rulesFirstLine; // Line number of the start of current <rules> block
740
741 for (charIdx = 0; charIdx < len; ) {
742 status = U_ZERO_ERROR;
743 UChar c = testString.charAt(charIdx);
744 charIdx++;
745 if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
746 // treat CRLF as a unit
747 c = u'\n';
748 charIdx++;
749 }
750 if (c == u'\n' || c == u'\r') {
751 lineNum++;
752 colStart = charIdx;
753 }
754 column = charIdx - colStart + 1;
755
756 switch (parseState) {
757 case PARSE_COMMENT:
758 if (c == u'\n' || c == u'\r') {
759 parseState = savedState;
760 }
761 break;
762
763 case PARSE_TAG:
764 {
765 if (c == u'#') {
766 parseState = PARSE_COMMENT;
767 savedState = PARSE_TAG;
768 break;
769 }
770 if (u_isUWhiteSpace(c)) {
771 break;
772 }
773 if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
774 delete tp.bi;
775 tp.bi = BreakIterator::createWordInstance(locale, status);
776 skipTest = false;
777 charIdx += 5;
778 break;
779 }
780 if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
781 delete tp.bi;
782 tp.bi = BreakIterator::createCharacterInstance(locale, status);
783 skipTest = false;
784 charIdx += 5;
785 break;
786 }
787 if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
788 delete tp.bi;
789 tp.bi = BreakIterator::createLineInstance(locale, status);
790 skipTest = false;
791 charIdx += 5;
792 break;
793 }
794 if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
795 delete tp.bi;
796 tp.bi = BreakIterator::createSentenceInstance(locale, status);
797 skipTest = false;
798 charIdx += 5;
799 break;
800 }
801 if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
802 delete tp.bi;
803 tp.bi = BreakIterator::createTitleInstance(locale, status);
804 charIdx += 6;
805 break;
806 }
807
808 if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
809 testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
810 charIdx = testString.indexOf(u'>', charIdx) + 1;
811 parseState = PARSE_RULES;
812 rules.remove();
813 rulesFirstLine = lineNum;
814 break;
815 }
816
817 // <locale loc_name>
818 localeMatcher.reset(testString);
819 if (localeMatcher.lookingAt(charIdx-1, status)) {
820 UnicodeString localeName = localeMatcher.group(1, status);
821 char localeName8[100];
822 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
823 locale = Locale::createFromName(localeName8);
824 charIdx += localeMatcher.group(0, status).length() - 1;
825 TEST_ASSERT_SUCCESS(status);
826 break;
827 }
828 if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
829 parseState = PARSE_DATA;
830 charIdx += 5;
831 tp.dataToBreak = "";
832 tp.expectedBreaks->removeAllElements();
833 tp.srcCol ->removeAllElements();
834 tp.srcLine->removeAllElements();
835 break;
836 }
837
838 errln("line %d: Tag expected in test file.", lineNum);
839 parseState = PARSE_COMMENT;
840 savedState = PARSE_DATA;
841 goto end_test; // Stop the test.
842 }
843 break;
844
845 case PARSE_RULES:
846 if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
847 charIdx += 7;
848 parseState = PARSE_TAG;
849 delete tp.bi;
850 UParseError pe;
851 tp.bi = new RuleBasedBreakIterator(rules, pe, status);
852 skipTest = U_FAILURE(status);
853 if (U_FAILURE(status)) {
854 errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
855 rulesFirstLine + pe.line - 1, u_errorName(status));
856 }
857 } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
858 charIdx += 10;
859 parseState = PARSE_TAG;
860 UErrorCode ec = U_ZERO_ERROR;
861 UParseError pe;
862 RuleBasedBreakIterator bi(rules, pe, ec);
863 if (U_SUCCESS(ec)) {
864 errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
865 rulesFirstLine + pe.line - 1);
866 }
867 } else {
868 rules.append(c);
869 }
870 break;
871
872 case PARSE_DATA:
873 if (c == u'•') {
874 int32_t breakIdx = tp.dataToBreak.length();
875 tp.expectedBreaks->setSize(breakIdx+1);
876 tp.expectedBreaks->setElementAt(-1, breakIdx);
877 tp.srcLine->setSize(breakIdx+1);
878 tp.srcLine->setElementAt(lineNum, breakIdx);
879 tp.srcCol ->setSize(breakIdx+1);
880 tp.srcCol ->setElementAt(column, breakIdx);
881 break;
882 }
883
884 if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
885 // Add final entry to mappings from break location to source file position.
886 // Need one extra because last break position returned is after the
887 // last char in the data, not at the last char.
888 tp.srcLine->addElement(lineNum, status);
889 tp.srcCol ->addElement(column, status);
890
891 parseState = PARSE_TAG;
892 charIdx += 6;
893
894 if (!skipTest) {
895 // RUN THE TEST!
896 status = U_ZERO_ERROR;
897 tp.setUTF16(status);
898 executeTest(&tp, status);
899 TEST_ASSERT_SUCCESS(status);
900
901 // Run again, this time with UTF-8 text wrapped in a UText.
902 status = U_ZERO_ERROR;
903 tp.setUTF8(status);
904 TEST_ASSERT_SUCCESS(status);
905 executeTest(&tp, status);
906 }
907 break;
908 }
909
910 if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
911 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
912 // Get the code point from the name and insert it into the test data.
913 // (Damn, no API takes names in Unicode !!!
914 // we've got to take it back to char *)
915 int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
916 int32_t nameLength = nameEndIdx - (charIdx+2);
917 char charNameBuf[200];
918 UChar32 theChar = -1;
919 if (nameEndIdx != -1) {
920 UErrorCode status = U_ZERO_ERROR;
921 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
922 charNameBuf[sizeof(charNameBuf)-1] = 0;
923 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
924 if (U_FAILURE(status)) {
925 theChar = -1;
926 }
927 }
928 if (theChar == -1) {
929 errln("Error in named character in test file at line %d, col %d",
930 lineNum, column);
931 } else {
932 // Named code point was recognized. Insert it
933 // into the test data.
934 tp.dataToBreak.append(theChar);
935 while (tp.dataToBreak.length() > tp.srcLine->size()) {
936 tp.srcLine->addElement(lineNum, status);
937 tp.srcCol ->addElement(column, status);
938 }
939 }
940 if (nameEndIdx > charIdx) {
941 charIdx = nameEndIdx+1;
942
943 }
944 break;
945 }
946
947
948
949 if (testString.compare(charIdx-1, 2, u"<>") == 0) {
950 charIdx++;
951 int32_t breakIdx = tp.dataToBreak.length();
952 tp.expectedBreaks->setSize(breakIdx+1);
953 tp.expectedBreaks->setElementAt(-1, breakIdx);
954 tp.srcLine->setSize(breakIdx+1);
955 tp.srcLine->setElementAt(lineNum, breakIdx);
956 tp.srcCol ->setSize(breakIdx+1);
957 tp.srcCol ->setElementAt(column, breakIdx);
958 break;
959 }
960
961 if (c == u'<') {
962 tagValue = 0;
963 parseState = PARSE_NUM;
964 break;
965 }
966
967 if (c == u'#' && column==3) { // TODO: why is column off so far?
968 parseState = PARSE_COMMENT;
969 savedState = PARSE_DATA;
970 break;
971 }
972
973 if (c == u'\\') {
974 // Check for \ at end of line, a line continuation.
975 // Advance over (discard) the newline
976 UChar32 cp = testString.char32At(charIdx);
977 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
978 // We have a CR LF
979 // Need an extra increment of the input ptr to move over both of them
980 charIdx++;
981 }
982 if (cp == u'\n' || cp == u'\r') {
983 lineNum++;
984 colStart = charIdx;
985 charIdx++;
986 break;
987 }
988
989 // Let unescape handle the back slash.
990 cp = testString.unescapeAt(charIdx);
991 if (cp != -1) {
992 // Escape sequence was recognized. Insert the char
993 // into the test data.
994 tp.dataToBreak.append(cp);
995 while (tp.dataToBreak.length() > tp.srcLine->size()) {
996 tp.srcLine->addElement(lineNum, status);
997 tp.srcCol ->addElement(column, status);
998 }
999 break;
1000 }
1001
1002
1003 // Not a recognized backslash escape sequence.
1004 // Take the next char as a literal.
1005 // TODO: Should this be an error?
1006 c = testString.charAt(charIdx);
1007 charIdx = testString.moveIndex32(charIdx, 1);
1008 }
1009
1010 // Normal, non-escaped data char.
1011 tp.dataToBreak.append(c);
1012
1013 // Save the mapping from offset in the data to line/column numbers in
1014 // the original input file. Will be used for better error messages only.
1015 // If there's an expected break before this char, the slot in the mapping
1016 // vector will already be set for this char; don't overwrite it.
1017 if (tp.dataToBreak.length() > tp.srcLine->size()) {
1018 tp.srcLine->addElement(lineNum, status);
1019 tp.srcCol ->addElement(column, status);
1020 }
1021 break;
1022
1023
1024 case PARSE_NUM:
1025 // We are parsing an expected numeric tag value, like <1234>,
1026 // within a chunk of data.
1027 if (u_isUWhiteSpace(c)) {
1028 break;
1029 }
1030
1031 if (c == u'>') {
1032 // Finished the number. Add the info to the expected break data,
1033 // and switch parse state back to doing plain data.
1034 parseState = PARSE_DATA;
1035 if (tagValue == 0) {
1036 tagValue = -1;
1037 }
1038 int32_t breakIdx = tp.dataToBreak.length();
1039 tp.expectedBreaks->setSize(breakIdx+1);
1040 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1041 tp.srcLine->setSize(breakIdx+1);
1042 tp.srcLine->setElementAt(lineNum, breakIdx);
1043 tp.srcCol ->setSize(breakIdx+1);
1044 tp.srcCol ->setElementAt(column, breakIdx);
1045 break;
1046 }
1047
1048 if (u_isdigit(c)) {
1049 tagValue = tagValue*10 + u_charDigitValue(c);
1050 break;
1051 }
1052
1053 errln("Syntax Error in test file at line %d, col %d",
1054 lineNum, column);
1055 parseState = PARSE_COMMENT;
1056 goto end_test; // Stop the test
1057 break;
1058 }
1059
1060
1061 if (U_FAILURE(status)) {
1062 dataerrln("ICU Error %s while parsing test file at line %d.",
1063 u_errorName(status), lineNum);
1064 status = U_ZERO_ERROR;
1065 goto end_test; // Stop the test
1066 }
1067
1068 }
1069
1070 // Reached end of test file. Raise an error if parseState indicates that we are
1071 // within a block that should have been terminated.
1072
1073 if (parseState == PARSE_RULES) {
1074 errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
1075 lineNum, rulesFirstLine);
1076 }
1077 if (parseState == PARSE_DATA) {
1078 errln("rbbitst.txt:%d <data> block not closed.", lineNum);
1079 }
1080
1081
1082 end_test:
1083 delete [] testFile;
1084 #endif
1085 }
1086
1087
1088 //-------------------------------------------------------------------------------
1089 //
1090 // TestDictRules create a break iterator from source rules that includes a
1091 // dictionary range. Regression for bug #7130. Source rules
1092 // do not declare a break iterator type (word, line, sentence, etc.),
1093 // but the dictionary code, without a type, would loop.
1094 //
1095 //-------------------------------------------------------------------------------
TestDictRules()1096 void RBBITest::TestDictRules() {
1097 const char *rules = "$dictionary = [a-z]; \n"
1098 "!!forward; \n"
1099 "$dictionary $dictionary; \n"
1100 "!!reverse; \n"
1101 "$dictionary $dictionary; \n";
1102 const char *text = "aa";
1103 UErrorCode status = U_ZERO_ERROR;
1104 UParseError parseError;
1105
1106 RuleBasedBreakIterator bi(rules, parseError, status);
1107 if (U_SUCCESS(status)) {
1108 UnicodeString utext = text;
1109 bi.setText(utext);
1110 int32_t position;
1111 int32_t loops;
1112 for (loops = 0; loops<10; loops++) {
1113 position = bi.next();
1114 if (position == RuleBasedBreakIterator::DONE) {
1115 break;
1116 }
1117 }
1118 TEST_ASSERT(loops == 1);
1119 } else {
1120 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1121 }
1122 }
1123
1124
1125
1126 //-------------------------------------------------------------------------------
1127 //
1128 // ReadAndConvertFile Read a text data file, convert it to UChars, and
1129 // return the data in one big UChar * buffer, which the caller must delete.
1130 //
1131 // parameters:
1132 // fileName: the name of the file, with no directory part. The test data directory
1133 // is assumed.
1134 // ulen an out parameter, receives the actual length (in UChars) of the file data.
1135 // encoding The file encoding. If the file contains a BOM, that will override the encoding
1136 // specified here. The BOM, if it exists, will be stripped from the returned data.
1137 // Pass NULL for the system default encoding.
1138 // status
1139 // returns:
1140 // The file data, converted to UChar.
1141 // The caller must delete this when done with
1142 // delete [] theBuffer;
1143 //
1144 // TODO: This is a clone of RegexTest::ReadAndConvertFile.
1145 // Move this function to some common place.
1146 //
1147 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int & ulen,const char * encoding,UErrorCode & status)1148 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1149 UChar *retPtr = NULL;
1150 char *fileBuf = NULL;
1151 UConverter* conv = NULL;
1152 FILE *f = NULL;
1153
1154 ulen = 0;
1155 if (U_FAILURE(status)) {
1156 return retPtr;
1157 }
1158
1159 //
1160 // Open the file.
1161 //
1162 f = fopen(fileName, "rb");
1163 if (f == 0) {
1164 dataerrln("Error opening test data file %s\n", fileName);
1165 status = U_FILE_ACCESS_ERROR;
1166 return NULL;
1167 }
1168 //
1169 // Read it in
1170 //
1171 int fileSize;
1172 int amt_read;
1173
1174 fseek( f, 0, SEEK_END);
1175 fileSize = ftell(f);
1176 fileBuf = new char[fileSize];
1177 fseek(f, 0, SEEK_SET);
1178 amt_read = fread(fileBuf, 1, fileSize, f);
1179 if (amt_read != fileSize || fileSize <= 0) {
1180 errln("Error reading test data file.");
1181 goto cleanUpAndReturn;
1182 }
1183
1184 //
1185 // Look for a Unicode Signature (BOM) on the data just read
1186 //
1187 int32_t signatureLength;
1188 const char * fileBufC;
1189 const char* bomEncoding;
1190
1191 fileBufC = fileBuf;
1192 bomEncoding = ucnv_detectUnicodeSignature(
1193 fileBuf, fileSize, &signatureLength, &status);
1194 if(bomEncoding!=NULL ){
1195 fileBufC += signatureLength;
1196 fileSize -= signatureLength;
1197 encoding = bomEncoding;
1198 }
1199
1200 //
1201 // Open a converter to take the rule file to UTF-16
1202 //
1203 conv = ucnv_open(encoding, &status);
1204 if (U_FAILURE(status)) {
1205 goto cleanUpAndReturn;
1206 }
1207
1208 //
1209 // Convert the rules to UChar.
1210 // Preflight first to determine required buffer size.
1211 //
1212 ulen = ucnv_toUChars(conv,
1213 NULL, // dest,
1214 0, // destCapacity,
1215 fileBufC,
1216 fileSize,
1217 &status);
1218 if (status == U_BUFFER_OVERFLOW_ERROR) {
1219 // Buffer Overflow is expected from the preflight operation.
1220 status = U_ZERO_ERROR;
1221
1222 retPtr = new UChar[ulen+1];
1223 ucnv_toUChars(conv,
1224 retPtr, // dest,
1225 ulen+1,
1226 fileBufC,
1227 fileSize,
1228 &status);
1229 }
1230
1231 cleanUpAndReturn:
1232 fclose(f);
1233 delete []fileBuf;
1234 ucnv_close(conv);
1235 if (U_FAILURE(status)) {
1236 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1237 delete []retPtr;
1238 retPtr = 0;
1239 ulen = 0;
1240 };
1241 return retPtr;
1242 }
1243
1244
1245
1246 //--------------------------------------------------------------------------------------------
1247 //
1248 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
1249 //
1250 //-------------------------------------------------------------------------------------------
TestUnicodeFiles()1251 void RBBITest::TestUnicodeFiles() {
1252 RuleBasedBreakIterator *bi;
1253 UErrorCode status = U_ZERO_ERROR;
1254
1255 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1256 TEST_ASSERT_SUCCESS(status);
1257 if (U_SUCCESS(status)) {
1258 runUnicodeTestData("GraphemeBreakTest.txt", bi);
1259 }
1260 delete bi;
1261
1262 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1263 TEST_ASSERT_SUCCESS(status);
1264 if (U_SUCCESS(status)) {
1265 runUnicodeTestData("WordBreakTest.txt", bi);
1266 }
1267 delete bi;
1268
1269 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1270 TEST_ASSERT_SUCCESS(status);
1271 if (U_SUCCESS(status)) {
1272 runUnicodeTestData("SentenceBreakTest.txt", bi);
1273 }
1274 delete bi;
1275
1276 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1277 TEST_ASSERT_SUCCESS(status);
1278 if (U_SUCCESS(status)) {
1279 runUnicodeTestData("LineBreakTest.txt", bi);
1280 }
1281 delete bi;
1282 }
1283
1284
1285 // Check for test cases from the Unicode test data files that are known to fail
1286 // and should be skipped as known issues because ICU does not fully implement
1287 // the Unicode specifications, or because ICU includes tailorings that differ from
1288 // the Unicode standard.
1289 //
1290 // Test cases are identified by the test data sequence, which tends to be more stable
1291 // across Unicode versions than the test file line numbers.
1292 //
1293 // The test case with ticket "10666" is a dummy, included as an example.
1294
testCaseIsKnownIssue(const UnicodeString & testCase,const char * fileName)1295 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1296 static struct TestCase {
1297 const char *fTicketNum;
1298 const char *fFileName;
1299 const UChar *fString;
1300 } badTestCases[] = {
1301 {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"}, // Fake example, for illustration.
1302 // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
1303 // This probably ultimately wants to be resolved by updating UAX-14, but in the mean time
1304 // ICU is out of sync with Unicode.
1305 {"8151", "LineBreakTest.txt", u"-#"},
1306 {"8151", "LineBreakTest.txt", u"\u002d\u0308\u0023"},
1307 {"8151", "LineBreakTest.txt", u"\u002d\u00a7"},
1308 {"8151", "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
1309 {"8151", "LineBreakTest.txt", u"\u002d\U00050005"},
1310 {"8151", "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
1311 {"8151", "LineBreakTest.txt", u"\u002d\u0e01"},
1312 {"8151", "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
1313 };
1314
1315 for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
1316 const TestCase &badCase = badTestCases[n];
1317 if (!strcmp(fileName, badCase.fFileName) &&
1318 testCase == UnicodeString(badCase.fString)) {
1319 return logKnownIssue(badCase.fTicketNum);
1320 }
1321 }
1322 return FALSE;
1323 }
1324
1325
1326 //--------------------------------------------------------------------------------------------
1327 //
1328 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
1329 //
1330 //-------------------------------------------------------------------------------------------
runUnicodeTestData(const char * fileName,RuleBasedBreakIterator * bi)1331 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1332 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1333 UErrorCode status = U_ZERO_ERROR;
1334
1335 //
1336 // Open and read the test data file, put it into a UnicodeString.
1337 //
1338 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1339 char testFileName[1000];
1340 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1341 dataerrln("Can't open test data. Path too long.");
1342 return;
1343 }
1344 strcpy(testFileName, testDataDirectory);
1345 strcat(testFileName, fileName);
1346
1347 logln("Opening data file %s\n", fileName);
1348
1349 int len;
1350 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1351 if (status != U_FILE_ACCESS_ERROR) {
1352 TEST_ASSERT_SUCCESS(status);
1353 TEST_ASSERT(testFile != NULL);
1354 }
1355 if (U_FAILURE(status) || testFile == NULL) {
1356 return; /* something went wrong, error already output */
1357 }
1358 UnicodeString testFileAsString(TRUE, testFile, len);
1359
1360 //
1361 // Parse the test data file using a regular expression.
1362 // Each kind of token is recognized in its own capture group; what type of item was scanned
1363 // is identified by which group had a match.
1364 //
1365 // Caputure Group # 1 2 3 4 5
1366 // Parses this item: divide x hex digits comment \n unrecognized \n
1367 //
1368 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1369 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1370 UnicodeString testString;
1371 UVector32 breakPositions(status);
1372 int lineNumber = 1;
1373 TEST_ASSERT_SUCCESS(status);
1374 if (U_FAILURE(status)) {
1375 return;
1376 }
1377
1378 //
1379 // Scan through each test case, building up the string to be broken in testString,
1380 // and the positions that should be boundaries in the breakPositions vector.
1381 //
1382 int spin = 0;
1383 while (tokenMatcher.find()) {
1384 if(tokenMatcher.hitEnd()) {
1385 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.
1386 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1387 and caused an infinite loop here on EBCDIC systems!
1388 */
1389 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1390 // return;
1391 }
1392 if (tokenMatcher.start(1, status) >= 0) {
1393 // Scanned a divide sign, indicating a break position in the test data.
1394 if (testString.length()>0) {
1395 breakPositions.addElement(testString.length(), status);
1396 }
1397 }
1398 else if (tokenMatcher.start(2, status) >= 0) {
1399 // Scanned an 'x', meaning no break at this position in the test data
1400 // Nothing to be done here.
1401 }
1402 else if (tokenMatcher.start(3, status) >= 0) {
1403 // Scanned Hex digits. Convert them to binary, append to the character data string.
1404 const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1405 int length = hexNumber.length();
1406 if (length<=8) {
1407 char buf[10];
1408 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1409 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1410 if (c<=0x10ffff) {
1411 testString.append(c);
1412 } else {
1413 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1414 fileName, lineNumber);
1415 }
1416 } else {
1417 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1418 fileName, lineNumber);
1419 }
1420 }
1421 else if (tokenMatcher.start(4, status) >= 0) {
1422 // Scanned to end of a line, possibly skipping over a comment in the process.
1423 // If the line from the file contained test data, run the test now.
1424 if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1425 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1426 }
1427
1428 // Clear out this test case.
1429 // The string and breakPositions vector will be refilled as the next
1430 // test case is parsed.
1431 testString.remove();
1432 breakPositions.removeAllElements();
1433 lineNumber++;
1434 } else {
1435 // Scanner catchall. Something unrecognized appeared on the line.
1436 char token[16];
1437 UnicodeString uToken = tokenMatcher.group(0, status);
1438 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1439 token[sizeof(token)-1] = 0;
1440 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token);
1441
1442 // Clean up, in preparation for continuing with the next line.
1443 testString.remove();
1444 breakPositions.removeAllElements();
1445 lineNumber++;
1446 }
1447 TEST_ASSERT_SUCCESS(status);
1448 if (U_FAILURE(status)) {
1449 break;
1450 }
1451 }
1452
1453 delete [] testFile;
1454 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1455 }
1456
1457 //--------------------------------------------------------------------------------------------
1458 //
1459 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1460 // test data files. Do only a simple, forward-only check -
1461 // this test is mostly to check that ICU and the Unicode
1462 // data agree with each other.
1463 //
1464 //--------------------------------------------------------------------------------------------
checkUnicodeTestCase(const char * testFileName,int lineNumber,const UnicodeString & testString,UVector32 * breakPositions,RuleBasedBreakIterator * bi)1465 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1466 const UnicodeString &testString, // Text data to be broken
1467 UVector32 *breakPositions, // Positions where breaks should be found.
1468 RuleBasedBreakIterator *bi) {
1469 int32_t pos; // Break Position in the test string
1470 int32_t expectedI = 0; // Index of expected break position in the vector of expected results.
1471 int32_t expectedPos; // Expected break position (index into test string)
1472
1473 bi->setText(testString);
1474 pos = bi->first();
1475 pos = bi->next();
1476
1477 while (pos != BreakIterator::DONE) {
1478 if (expectedI >= breakPositions->size()) {
1479 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1480 testFileName, lineNumber, pos);
1481 break;
1482 }
1483 expectedPos = breakPositions->elementAti(expectedI);
1484 if (pos < expectedPos) {
1485 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1486 testFileName, lineNumber, pos);
1487 break;
1488 }
1489 if (pos > expectedPos) {
1490 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1491 testFileName, lineNumber, expectedPos);
1492 break;
1493 }
1494 pos = bi->next();
1495 expectedI++;
1496 }
1497
1498 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1499 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1500 testFileName, lineNumber, breakPositions->elementAti(expectedI));
1501 }
1502 }
1503
1504
1505
1506 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1507 //---------------------------------------------------------------------------------------
1508 //
1509 // class RBBIMonkeyKind
1510 //
1511 // Monkey Test for Break Iteration
1512 // Abstract interface class. Concrete derived classes independently
1513 // implement the break rules for different iterator types.
1514 //
1515 // The Monkey Test itself doesn't know which type of break iterator it is
1516 // testing, but works purely in terms of the interface defined here.
1517 //
1518 //---------------------------------------------------------------------------------------
1519 class RBBIMonkeyKind {
1520 public:
1521 // Return a UVector of UnicodeSets, representing the character classes used
1522 // for this type of iterator.
1523 virtual UVector *charClasses() = 0;
1524
1525 // Set the test text on which subsequent calls to next() will operate
1526 virtual void setText(const UnicodeString &s) = 0;
1527
1528 // Find the next break postion, starting from the prev break position, or from zero.
1529 // Return -1 after reaching end of string.
1530 virtual int32_t next(int32_t i) = 0;
1531
1532 virtual ~RBBIMonkeyKind();
1533 UErrorCode deferredStatus;
1534
1535
1536 protected:
1537 RBBIMonkeyKind();
1538
1539 private:
1540 };
1541
RBBIMonkeyKind()1542 RBBIMonkeyKind::RBBIMonkeyKind() {
1543 deferredStatus = U_ZERO_ERROR;
1544 }
1545
~RBBIMonkeyKind()1546 RBBIMonkeyKind::~RBBIMonkeyKind() {
1547 }
1548
1549
1550 //----------------------------------------------------------------------------------------
1551 //
1552 // Random Numbers. Similar to standard lib rand() and srand()
1553 // Not using library to
1554 // 1. Get same results on all platforms.
1555 // 2. Get access to current seed, to more easily reproduce failures.
1556 //
1557 //---------------------------------------------------------------------------------------
// Current state of the generator; also the seed to report when a failure
// needs to be reproduced.
static uint32_t m_seed = 1;

// Advance the generator one step and return a value in the range 0..32767.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    m_seed = m_seed * kMultiplier + kIncrement;

    // Bits 16..30 of the new state: the same as (m_seed / 65536) % 32768.
    return (m_seed >> 16) & 0x7fff;
}
1565
1566
1567 //------------------------------------------------------------------------------------------
1568 //
1569 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
1570 // of RBBIMonkeyKind.
1571 //
1572 //------------------------------------------------------------------------------------------
1573 class RBBICharMonkey: public RBBIMonkeyKind {
1574 public:
1575 RBBICharMonkey();
1576 virtual ~RBBICharMonkey();
1577 virtual UVector *charClasses();
1578 virtual void setText(const UnicodeString &s);
1579 virtual int32_t next(int32_t i);
1580 private:
1581 UVector *fSets;
1582
1583 UnicodeSet *fCRLFSet;
1584 UnicodeSet *fControlSet;
1585 UnicodeSet *fExtendSet;
1586 UnicodeSet *fZWJSet;
1587 UnicodeSet *fRegionalIndicatorSet;
1588 UnicodeSet *fPrependSet;
1589 UnicodeSet *fSpacingSet;
1590 UnicodeSet *fLSet;
1591 UnicodeSet *fVSet;
1592 UnicodeSet *fTSet;
1593 UnicodeSet *fLVSet;
1594 UnicodeSet *fLVTSet;
1595 UnicodeSet *fHangulSet;
1596 UnicodeSet *fExtendedPictSet;
1597 UnicodeSet *fAnySet;
1598
1599 const UnicodeString *fText;
1600 };
1601
1602
RBBICharMonkey()1603 RBBICharMonkey::RBBICharMonkey() {
1604 UErrorCode status = U_ZERO_ERROR;
1605
1606 fText = NULL;
1607
1608 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1609 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
1610 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
1611 fZWJSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
1612 fRegionalIndicatorSet =
1613 new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1614 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1615 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1616 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1617 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1618 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1619 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1620 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1621 fHangulSet = new UnicodeSet();
1622 fHangulSet->addAll(*fLSet);
1623 fHangulSet->addAll(*fVSet);
1624 fHangulSet->addAll(*fTSet);
1625 fHangulSet->addAll(*fLVSet);
1626 fHangulSet->addAll(*fLVTSet);
1627
1628 fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1629 fAnySet = new UnicodeSet(0, 0x10ffff);
1630
1631 fSets = new UVector(status);
1632 fSets->addElement(fCRLFSet, status);
1633 fSets->addElement(fControlSet, status);
1634 fSets->addElement(fExtendSet, status);
1635 fSets->addElement(fRegionalIndicatorSet, status);
1636 if (!fPrependSet->isEmpty()) {
1637 fSets->addElement(fPrependSet, status);
1638 }
1639 fSets->addElement(fSpacingSet, status);
1640 fSets->addElement(fHangulSet, status);
1641 fSets->addElement(fAnySet, status);
1642 fSets->addElement(fZWJSet, status);
1643 fSets->addElement(fExtendedPictSet, status);
1644 if (U_FAILURE(status)) {
1645 deferredStatus = status;
1646 }
1647 }
1648
1649
setText(const UnicodeString & s)1650 void RBBICharMonkey::setText(const UnicodeString &s) {
1651 fText = &s;
1652 }
1653
1654
1655
next(int32_t prevPos)1656 int32_t RBBICharMonkey::next(int32_t prevPos) {
1657 int p0, p1, p2, p3; // Indices of the significant code points around the
1658 // break position being tested. The candidate break
1659 // location is before p2.
1660
1661 int breakPos = -1;
1662
1663 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
1664 UChar32 cBase; // for (X Extend*) patterns, the X character.
1665
1666 if (U_FAILURE(deferredStatus)) {
1667 return -1;
1668 }
1669
1670 // Previous break at end of string. return DONE.
1671 if (prevPos >= fText->length()) {
1672 return -1;
1673 }
1674 p0 = p1 = p2 = p3 = prevPos;
1675 c3 = fText->char32At(prevPos);
1676 c0 = c1 = c2 = cBase = 0;
1677 (void)p0; // suppress set but not used warning.
1678 (void)c0;
1679
1680 // Loop runs once per "significant" character position in the input text.
1681 for (;;) {
1682 // Move all of the positions forward in the input string.
1683 p0 = p1; c0 = c1;
1684 p1 = p2; c1 = c2;
1685 p2 = p3; c2 = c3;
1686
1687 // Advancd p3 by one codepoint
1688 p3 = fText->moveIndex32(p3, 1);
1689 c3 = fText->char32At(p3);
1690
1691 if (p1 == p2) {
1692 // Still warming up the loop. (won't work with zero length strings, but we don't care)
1693 continue;
1694 }
1695 if (p2 == fText->length()) {
1696 // Reached end of string. Always a break position.
1697 break;
1698 }
1699
1700 // Rule GB3 CR x LF
1701 // No Extend or Format characters may appear between the CR and LF,
1702 // which requires the additional check for p2 immediately following p1.
1703 //
1704 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
1705 continue;
1706 }
1707
1708 // Rule (GB4). ( Control | CR | LF ) <break>
1709 if (fControlSet->contains(c1) ||
1710 c1 == 0x0D ||
1711 c1 == 0x0A) {
1712 break;
1713 }
1714
1715 // Rule (GB5) <break> ( Control | CR | LF )
1716 //
1717 if (fControlSet->contains(c2) ||
1718 c2 == 0x0D ||
1719 c2 == 0x0A) {
1720 break;
1721 }
1722
1723
1724 // Rule (GB6) L x ( L | V | LV | LVT )
1725 if (fLSet->contains(c1) &&
1726 (fLSet->contains(c2) ||
1727 fVSet->contains(c2) ||
1728 fLVSet->contains(c2) ||
1729 fLVTSet->contains(c2))) {
1730 continue;
1731 }
1732
1733 // Rule (GB7) ( LV | V ) x ( V | T )
1734 if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1735 (fVSet->contains(c2) || fTSet->contains(c2))) {
1736 continue;
1737 }
1738
1739 // Rule (GB8) ( LVT | T) x T
1740 if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1741 fTSet->contains(c2)) {
1742 continue;
1743 }
1744
1745 // Rule (GB9) x (Extend | ZWJ)
1746 if (fExtendSet->contains(c2) || fZWJSet->contains(c2)) {
1747 if (!fExtendSet->contains(c1)) {
1748 cBase = c1;
1749 }
1750 continue;
1751 }
1752
1753 // Rule (GB9a) x SpacingMark
1754 if (fSpacingSet->contains(c2)) {
1755 continue;
1756 }
1757
1758 // Rule (GB9b) Prepend x
1759 if (fPrependSet->contains(c1)) {
1760 continue;
1761 }
1762
1763 // Rule (GB11) Extended_Pictographic Extend * ZWJ x Extended_Pictographic
1764 if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
1765 continue;
1766 }
1767
1768 // Rule (GB12-13) Regional_Indicator x Regional_Indicator
1769 // Note: The first if condition is a little tricky. We only need to force
1770 // a break if there are three or more contiguous RIs. If there are
1771 // only two, a break following will occur via other rules, and will include
1772 // any trailing extend characters, which is needed behavior.
1773 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
1774 && fRegionalIndicatorSet->contains(c2)) {
1775 break;
1776 }
1777 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
1778 continue;
1779 }
1780
1781 // Rule (GB999) Any <break> Any
1782 break;
1783 }
1784
1785 breakPos = p2;
1786 return breakPos;
1787 }
1788
1789
1790
charClasses()1791 UVector *RBBICharMonkey::charClasses() {
1792 return fSets;
1793 }
1794
1795
~RBBICharMonkey()1796 RBBICharMonkey::~RBBICharMonkey() {
1797 delete fSets;
1798 delete fCRLFSet;
1799 delete fControlSet;
1800 delete fExtendSet;
1801 delete fRegionalIndicatorSet;
1802 delete fPrependSet;
1803 delete fSpacingSet;
1804 delete fLSet;
1805 delete fVSet;
1806 delete fTSet;
1807 delete fLVSet;
1808 delete fLVTSet;
1809 delete fHangulSet;
1810 delete fAnySet;
1811 delete fZWJSet;
1812 delete fExtendedPictSet;
1813 }
1814
1815 //------------------------------------------------------------------------------------------
1816 //
1817 // class RBBIWordMonkey Word Break specific implementation
1818 // of RBBIMonkeyKind.
1819 //
1820 //------------------------------------------------------------------------------------------
1821 class RBBIWordMonkey: public RBBIMonkeyKind {
1822 public:
1823 RBBIWordMonkey();
1824 virtual ~RBBIWordMonkey();
1825 virtual UVector *charClasses();
1826 virtual void setText(const UnicodeString &s);
1827 virtual int32_t next(int32_t i);
1828 private:
1829 UVector *fSets;
1830
1831 UnicodeSet *fCRSet;
1832 UnicodeSet *fLFSet;
1833 UnicodeSet *fNewlineSet;
1834 UnicodeSet *fRegionalIndicatorSet;
1835 UnicodeSet *fKatakanaSet;
1836 UnicodeSet *fHebrew_LetterSet;
1837 UnicodeSet *fALetterSet;
1838 UnicodeSet *fSingle_QuoteSet;
1839 UnicodeSet *fDouble_QuoteSet;
1840 UnicodeSet *fMidNumLetSet;
1841 UnicodeSet *fMidLetterSet;
1842 UnicodeSet *fMidNumSet;
1843 UnicodeSet *fNumericSet;
1844 UnicodeSet *fFormatSet;
1845 UnicodeSet *fOtherSet;
1846 UnicodeSet *fExtendSet;
1847 UnicodeSet *fExtendNumLetSet;
1848 UnicodeSet *fWSegSpaceSet;
1849 UnicodeSet *fDictionarySet;
1850 UnicodeSet *fZWJSet;
1851 UnicodeSet *fExtendedPictSet;
1852
1853 const UnicodeString *fText;
1854 };
1855
1856
RBBIWordMonkey()1857 RBBIWordMonkey::RBBIWordMonkey()
1858 {
1859 UErrorCode status = U_ZERO_ERROR;
1860
1861 fSets = new UVector(status);
1862
1863 fCRSet = new UnicodeSet(u"[\\p{Word_Break = CR}]", status);
1864 fLFSet = new UnicodeSet(u"[\\p{Word_Break = LF}]", status);
1865 fNewlineSet = new UnicodeSet(u"[\\p{Word_Break = Newline}]", status);
1866 fKatakanaSet = new UnicodeSet(u"[\\p{Word_Break = Katakana}]", status);
1867 fRegionalIndicatorSet = new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
1868 fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
1869 fALetterSet = new UnicodeSet(u"[\\p{Word_Break = ALetter}]", status);
1870 fSingle_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]", status);
1871 fDouble_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]", status);
1872 fMidNumLetSet = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]", status);
1873 fMidLetterSet = new UnicodeSet(u"[\\p{Word_Break = MidLetter}]", status);
1874 fMidNumSet = new UnicodeSet(u"[\\p{Word_Break = MidNum}]", status);
1875 fNumericSet = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
1876 fFormatSet = new UnicodeSet(u"[\\p{Word_Break = Format}]", status);
1877 fExtendNumLetSet = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
1878 fExtendSet = new UnicodeSet(u"[\\p{Word_Break = Extend}]", status);
1879 fWSegSpaceSet = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]", status);
1880
1881 fZWJSet = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]", status);
1882 fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1883
1884 fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
1885 fDictionarySet->addAll(*fKatakanaSet);
1886 fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
1887
1888 fALetterSet->removeAll(*fDictionarySet);
1889
1890 fOtherSet = new UnicodeSet();
1891 if(U_FAILURE(status)) {
1892 IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
1893 deferredStatus = status;
1894 return;
1895 }
1896
1897 fOtherSet->complement();
1898 fOtherSet->removeAll(*fCRSet);
1899 fOtherSet->removeAll(*fLFSet);
1900 fOtherSet->removeAll(*fNewlineSet);
1901 fOtherSet->removeAll(*fKatakanaSet);
1902 fOtherSet->removeAll(*fHebrew_LetterSet);
1903 fOtherSet->removeAll(*fALetterSet);
1904 fOtherSet->removeAll(*fSingle_QuoteSet);
1905 fOtherSet->removeAll(*fDouble_QuoteSet);
1906 fOtherSet->removeAll(*fMidLetterSet);
1907 fOtherSet->removeAll(*fMidNumSet);
1908 fOtherSet->removeAll(*fNumericSet);
1909 fOtherSet->removeAll(*fExtendNumLetSet);
1910 fOtherSet->removeAll(*fWSegSpaceSet);
1911 fOtherSet->removeAll(*fFormatSet);
1912 fOtherSet->removeAll(*fExtendSet);
1913 fOtherSet->removeAll(*fRegionalIndicatorSet);
1914 fOtherSet->removeAll(*fZWJSet);
1915 fOtherSet->removeAll(*fExtendedPictSet);
1916
1917 // Inhibit dictionary characters from being tested at all.
1918 fOtherSet->removeAll(*fDictionarySet);
1919
1920 fSets->addElement(fCRSet, status);
1921 fSets->addElement(fLFSet, status);
1922 fSets->addElement(fNewlineSet, status);
1923 fSets->addElement(fRegionalIndicatorSet, status);
1924 fSets->addElement(fHebrew_LetterSet, status);
1925 fSets->addElement(fALetterSet, status);
1926 fSets->addElement(fSingle_QuoteSet, status);
1927 fSets->addElement(fDouble_QuoteSet, status);
1928 //fSets->addElement(fKatakanaSet, status); // Omit Katakana from fSets, which omits Katakana characters
1929 // from the test data. They are all in the dictionary set,
1930 // which this (old, to be retired) monkey test cannot handle.
1931 fSets->addElement(fMidLetterSet, status);
1932 fSets->addElement(fMidNumLetSet, status);
1933 fSets->addElement(fMidNumSet, status);
1934 fSets->addElement(fNumericSet, status);
1935 fSets->addElement(fFormatSet, status);
1936 fSets->addElement(fExtendSet, status);
1937 fSets->addElement(fOtherSet, status);
1938 fSets->addElement(fExtendNumLetSet, status);
1939 fSets->addElement(fWSegSpaceSet, status);
1940
1941 fSets->addElement(fZWJSet, status);
1942 fSets->addElement(fExtendedPictSet, status);
1943
1944 if (U_FAILURE(status)) {
1945 deferredStatus = status;
1946 }
1947 }
1948
setText(const UnicodeString & s)1949 void RBBIWordMonkey::setText(const UnicodeString &s) {
1950 fText = &s;
1951 }
1952
1953
next(int32_t prevPos)1954 int32_t RBBIWordMonkey::next(int32_t prevPos) {
1955 int p0, p1, p2, p3; // Indices of the significant code points around the
1956 // break position being tested. The candidate break
1957 // location is before p2.
1958
1959 int breakPos = -1;
1960
1961 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
1962
1963 if (U_FAILURE(deferredStatus)) {
1964 return -1;
1965 }
1966
1967 // Prev break at end of string. return DONE.
1968 if (prevPos >= fText->length()) {
1969 return -1;
1970 }
1971 p0 = p1 = p2 = p3 = prevPos;
1972 c3 = fText->char32At(prevPos);
1973 c0 = c1 = c2 = 0;
1974 (void)p0; // Suppress set but not used warning.
1975
1976 // Loop runs once per "significant" character position in the input text.
1977 for (;;) {
1978 // Move all of the positions forward in the input string.
1979 p0 = p1; c0 = c1;
1980 p1 = p2; c1 = c2;
1981 p2 = p3; c2 = c3;
1982
1983 // Advancd p3 by X(Extend | Format)* Rule 4
1984 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
1985 do {
1986 p3 = fText->moveIndex32(p3, 1);
1987 c3 = fText->char32At(p3);
1988 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
1989 break;
1990 };
1991 }
1992 while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
1993
1994
1995 if (p1 == p2) {
1996 // Still warming up the loop. (won't work with zero length strings, but we don't care)
1997 continue;
1998 }
1999 if (p2 == fText->length()) {
2000 // Reached end of string. Always a break position.
2001 break;
2002 }
2003
2004 // Rule (3) CR x LF
2005 // No Extend or Format characters may appear between the CR and LF,
2006 // which requires the additional check for p2 immediately following p1.
2007 //
2008 if (c1==0x0D && c2==0x0A) {
2009 continue;
2010 }
2011
2012 // Rule (3a) Break before and after newlines (including CR and LF)
2013 //
2014 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2015 break;
2016 };
2017 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2018 break;
2019 };
2020
2021 // Rule (3c) ZWJ x Extended_Pictographic
2022 // Not ignoring extend chars, so peek into input text to
2023 // get the potential ZWJ, the character immediately preceding c2.
2024 // Sloppy UChar32 indexing: p2-1 may reference trail half
2025 // but char32At will get the full code point.
2026 if (fZWJSet->contains(fText->char32At(p2-1)) && fExtendedPictSet->contains(c2)) {
2027 continue;
2028 }
2029
2030 // Rule (3d) Keep horizontal whitespace together.
2031 if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
2032 continue;
2033 }
2034
2035 // Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
2036 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2037 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2038 continue;
2039 }
2040
2041 // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
2042 //
2043 if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2044 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2045 (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2046 continue;
2047 }
2048
2049 // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)
2050 if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2051 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2052 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2053 continue;
2054 }
2055
2056 // Rule (7a) Hebrew_Letter x Single_Quote
2057 if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2058 continue;
2059 }
2060
2061 // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter
2062 if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2063 continue;
2064 }
2065
2066 // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter
2067 if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2068 continue;
2069 }
2070
2071 // Rule (8) Numeric x Numeric
2072 if (fNumericSet->contains(c1) &&
2073 fNumericSet->contains(c2)) {
2074 continue;
2075 }
2076
2077 // Rule (9) (ALetter | Hebrew_Letter) x Numeric
2078 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2079 fNumericSet->contains(c2)) {
2080 continue;
2081 }
2082
2083 // Rule (10) Numeric x (ALetter | Hebrew_Letter)
2084 if (fNumericSet->contains(c1) &&
2085 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2086 continue;
2087 }
2088
2089 // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric
2090 if (fNumericSet->contains(c0) &&
2091 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2092 fNumericSet->contains(c2)) {
2093 continue;
2094 }
2095
2096 // Rule (12) Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
2097 if (fNumericSet->contains(c1) &&
2098 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2099 fNumericSet->contains(c3)) {
2100 continue;
2101 }
2102
2103 // Rule (13) Katakana x Katakana
2104 // Note: matches UAX 29 rules, but doesn't come into play for ICU because
2105 // all Katakana are handled by the dictionary breaker.
2106 if (fKatakanaSet->contains(c1) &&
2107 fKatakanaSet->contains(c2)) {
2108 continue;
2109 }
2110
2111 // Rule 13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
2112 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2113 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2114 fExtendNumLetSet->contains(c2)) {
2115 continue;
2116 }
2117
2118 // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
2119 if (fExtendNumLetSet->contains(c1) &&
2120 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2121 fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) {
2122 continue;
2123 }
2124
2125 // Rule 15 - 17 Group pairs of Regional Indicators.
2126 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2127 break;
2128 }
2129 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2130 continue;
2131 }
2132
2133 // Rule 999. Break found here.
2134 break;
2135 }
2136
2137 breakPos = p2;
2138 return breakPos;
2139 }
2140
2141
charClasses()2142 UVector *RBBIWordMonkey::charClasses() {
2143 return fSets;
2144 }
2145
2146
~RBBIWordMonkey()2147 RBBIWordMonkey::~RBBIWordMonkey() {
2148 delete fSets;
2149 delete fCRSet;
2150 delete fLFSet;
2151 delete fNewlineSet;
2152 delete fKatakanaSet;
2153 delete fHebrew_LetterSet;
2154 delete fALetterSet;
2155 delete fSingle_QuoteSet;
2156 delete fDouble_QuoteSet;
2157 delete fMidNumLetSet;
2158 delete fMidLetterSet;
2159 delete fMidNumSet;
2160 delete fNumericSet;
2161 delete fFormatSet;
2162 delete fExtendSet;
2163 delete fExtendNumLetSet;
2164 delete fWSegSpaceSet;
2165 delete fRegionalIndicatorSet;
2166 delete fDictionarySet;
2167 delete fOtherSet;
2168 delete fZWJSet;
2169 delete fExtendedPictSet;
2170 }
2171
2172
2173
2174
2175 //------------------------------------------------------------------------------------------
2176 //
2177 // class RBBISentMonkey Sentence Break specific implementation
2178 // of RBBIMonkeyKind.
2179 //
2180 //------------------------------------------------------------------------------------------
// Sentence-break flavor of RBBIMonkeyKind: a hand-written implementation of the
// sentence break rules that the monkey test compares against the rule-based
// break iterator.
class RBBISentMonkey: public RBBIMonkeyKind {
public:
    RBBISentMonkey();
    virtual ~RBBISentMonkey();
    // Returns fSets, the vector of character-class sets.
    virtual UVector *charClasses();
    // Records the address of the text to be scanned by next(); no copy is made.
    virtual void setText(const UnicodeString &s);
    // Returns the index of the next sentence boundary after position i, or -1.
    virtual int32_t next(int32_t i);
private:
    // Index of the significant code point preceding posFrom, skipping back
    // over Extend and Format characters; -1 when posFrom <= 0.
    int moveBack(int posFrom);
    // Index of the significant code point following posFrom, skipping forward
    // over Extend and Format characters; the text length at end of text.
    int moveForward(int posFrom);
    // Code point at pos, or -1 when pos is outside the text.
    UChar32 cAt(int pos);

    // Vector of pointers to the character-class sets below.
    UVector *fSets;

    // One set per Sentence_Break property value; fSepSet additionally
    // contains CR and LF, and fOtherSet is everything not in another set.
    UnicodeSet *fSepSet;
    UnicodeSet *fFormatSet;
    UnicodeSet *fSpSet;
    UnicodeSet *fLowerSet;
    UnicodeSet *fUpperSet;
    UnicodeSet *fOLetterSet;
    UnicodeSet *fNumericSet;
    UnicodeSet *fATermSet;
    UnicodeSet *fSContinueSet;
    UnicodeSet *fSTermSet;
    UnicodeSet *fCloseSet;
    UnicodeSet *fOtherSet;
    UnicodeSet *fExtendSet;

    // Text currently being scanned; set by setText(), not owned.
    const UnicodeString *fText;

};
2212
RBBISentMonkey()2213 RBBISentMonkey::RBBISentMonkey()
2214 {
2215 UErrorCode status = U_ZERO_ERROR;
2216
2217 fSets = new UVector(status);
2218
2219 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2220 // set and made into character classes of their own. For the monkey impl,
2221 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2222 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status);
2223 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status);
2224 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status);
2225 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status);
2226 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status);
2227 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status);
2228 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status);
2229 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status);
2230 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2231 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status);
2232 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status);
2233 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status);
2234 fOtherSet = new UnicodeSet();
2235
2236 if(U_FAILURE(status)) {
2237 deferredStatus = status;
2238 return;
2239 }
2240
2241 fOtherSet->complement();
2242 fOtherSet->removeAll(*fSepSet);
2243 fOtherSet->removeAll(*fFormatSet);
2244 fOtherSet->removeAll(*fSpSet);
2245 fOtherSet->removeAll(*fLowerSet);
2246 fOtherSet->removeAll(*fUpperSet);
2247 fOtherSet->removeAll(*fOLetterSet);
2248 fOtherSet->removeAll(*fNumericSet);
2249 fOtherSet->removeAll(*fATermSet);
2250 fOtherSet->removeAll(*fSContinueSet);
2251 fOtherSet->removeAll(*fSTermSet);
2252 fOtherSet->removeAll(*fCloseSet);
2253 fOtherSet->removeAll(*fExtendSet);
2254
2255 fSets->addElement(fSepSet, status);
2256 fSets->addElement(fFormatSet, status);
2257 fSets->addElement(fSpSet, status);
2258 fSets->addElement(fLowerSet, status);
2259 fSets->addElement(fUpperSet, status);
2260 fSets->addElement(fOLetterSet, status);
2261 fSets->addElement(fNumericSet, status);
2262 fSets->addElement(fATermSet, status);
2263 fSets->addElement(fSContinueSet, status);
2264 fSets->addElement(fSTermSet, status);
2265 fSets->addElement(fCloseSet, status);
2266 fSets->addElement(fOtherSet, status);
2267 fSets->addElement(fExtendSet, status);
2268
2269 if (U_FAILURE(status)) {
2270 deferredStatus = status;
2271 }
2272 }
2273
2274
2275
setText(const UnicodeString & s)2276 void RBBISentMonkey::setText(const UnicodeString &s) {
2277 fText = &s;
2278 }
2279
charClasses()2280 UVector *RBBISentMonkey::charClasses() {
2281 return fSets;
2282 }
2283
2284
2285 // moveBack() Find the "significant" code point preceding the index i.
2286 // Skips over ($Extend | $Format)* .
2287 //
moveBack(int i)2288 int RBBISentMonkey::moveBack(int i) {
2289 if (i <= 0) {
2290 return -1;
2291 }
2292 UChar32 c;
2293 int32_t j = i;
2294 do {
2295 j = fText->moveIndex32(j, -1);
2296 c = fText->char32At(j);
2297 }
2298 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2299 return j;
2300
2301 }
2302
2303
moveForward(int i)2304 int RBBISentMonkey::moveForward(int i) {
2305 if (i>=fText->length()) {
2306 return fText->length();
2307 }
2308 UChar32 c;
2309 int32_t j = i;
2310 do {
2311 j = fText->moveIndex32(j, 1);
2312 c = cAt(j);
2313 }
2314 while (fFormatSet->contains(c) || fExtendSet->contains(c));
2315 return j;
2316 }
2317
cAt(int pos)2318 UChar32 RBBISentMonkey::cAt(int pos) {
2319 if (pos<0 || pos>=fText->length()) {
2320 return -1;
2321 } else {
2322 return fText->char32At(pos);
2323 }
2324 }
2325
next(int32_t prevPos)2326 int32_t RBBISentMonkey::next(int32_t prevPos) {
2327 int p0, p1, p2, p3; // Indices of the significant code points around the
2328 // break position being tested. The candidate break
2329 // location is before p2.
2330
2331 int breakPos = -1;
2332
2333 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2334 UChar32 c;
2335
2336 if (U_FAILURE(deferredStatus)) {
2337 return -1;
2338 }
2339
2340 // Prev break at end of string. return DONE.
2341 if (prevPos >= fText->length()) {
2342 return -1;
2343 }
2344 p0 = p1 = p2 = p3 = prevPos;
2345 c3 = fText->char32At(prevPos);
2346 c0 = c1 = c2 = 0;
2347 (void)p0; // Suppress set but not used warning.
2348
2349 // Loop runs once per "significant" character position in the input text.
2350 for (;;) {
2351 // Move all of the positions forward in the input string.
2352 p0 = p1; c0 = c1;
2353 p1 = p2; c1 = c2;
2354 p2 = p3; c2 = c3;
2355
2356 // Advancd p3 by X(Extend | Format)* Rule 4
2357 p3 = moveForward(p3);
2358 c3 = cAt(p3);
2359
2360 // Rule (3) CR x LF
2361 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2362 continue;
2363 }
2364
2365 // Rule (4). Sep <break>
2366 if (fSepSet->contains(c1)) {
2367 p2 = p1+1; // Separators don't combine with Extend or Format.
2368 break;
2369 }
2370
2371 if (p2 >= fText->length()) {
2372 // Reached end of string. Always a break position.
2373 break;
2374 }
2375
2376 if (p2 == prevPos) {
2377 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2378 continue;
2379 }
2380
2381 // Rule (6). ATerm x Numeric
2382 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) {
2383 continue;
2384 }
2385
2386 // Rule (7). (Upper | Lower) ATerm x Uppper
2387 if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2388 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2389 continue;
2390 }
2391
2392 // Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
2393 // Note: STerm | ATerm are added to the negated part of the expression by a
2394 // note to the Unicode 5.0 documents.
2395 int p8 = p1;
2396 while (fSpSet->contains(cAt(p8))) {
2397 p8 = moveBack(p8);
2398 }
2399 while (fCloseSet->contains(cAt(p8))) {
2400 p8 = moveBack(p8);
2401 }
2402 if (fATermSet->contains(cAt(p8))) {
2403 p8=p2;
2404 for (;;) {
2405 c = cAt(p8);
2406 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2407 fLowerSet->contains(c) || fSepSet->contains(c) ||
2408 fATermSet->contains(c) || fSTermSet->contains(c)) {
2409 break;
2410 }
2411 p8 = moveForward(p8);
2412 }
2413 if (fLowerSet->contains(cAt(p8))) {
2414 continue;
2415 }
2416 }
2417
2418 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2419 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2420 p8 = p1;
2421 while (fSpSet->contains(cAt(p8))) {
2422 p8 = moveBack(p8);
2423 }
2424 while (fCloseSet->contains(cAt(p8))) {
2425 p8 = moveBack(p8);
2426 }
2427 c = cAt(p8);
2428 if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2429 continue;
2430 }
2431 }
2432
2433 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
2434 int p9 = p1;
2435 while (fCloseSet->contains(cAt(p9))) {
2436 p9 = moveBack(p9);
2437 }
2438 c = cAt(p9);
2439 if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2440 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2441 continue;
2442 }
2443 }
2444
2445 // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
2446 int p10 = p1;
2447 while (fSpSet->contains(cAt(p10))) {
2448 p10 = moveBack(p10);
2449 }
2450 while (fCloseSet->contains(cAt(p10))) {
2451 p10 = moveBack(p10);
2452 }
2453 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2454 if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2455 continue;
2456 }
2457 }
2458
2459 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>
2460 int p11 = p1;
2461 if (fSepSet->contains(cAt(p11))) {
2462 p11 = moveBack(p11);
2463 }
2464 while (fSpSet->contains(cAt(p11))) {
2465 p11 = moveBack(p11);
2466 }
2467 while (fCloseSet->contains(cAt(p11))) {
2468 p11 = moveBack(p11);
2469 }
2470 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2471 break;
2472 }
2473
2474 // Rule (12) Any x Any
2475 continue;
2476 }
2477 breakPos = p2;
2478 return breakPos;
2479 }
2480
~RBBISentMonkey()2481 RBBISentMonkey::~RBBISentMonkey() {
2482 delete fSets;
2483 delete fSepSet;
2484 delete fFormatSet;
2485 delete fSpSet;
2486 delete fLowerSet;
2487 delete fUpperSet;
2488 delete fOLetterSet;
2489 delete fNumericSet;
2490 delete fATermSet;
2491 delete fSContinueSet;
2492 delete fSTermSet;
2493 delete fCloseSet;
2494 delete fOtherSet;
2495 delete fExtendSet;
2496 }
2497
2498
2499
2500 //-------------------------------------------------------------------------------------------
2501 //
2502 // RBBILineMonkey
2503 //
2504 //-------------------------------------------------------------------------------------------
2505
// Line-break flavor of RBBIMonkeyKind: a hand-written implementation of the
// line break rules that the monkey test compares against the rule-based
// break iterator.
class RBBILineMonkey: public RBBIMonkeyKind {
public:
    RBBILineMonkey();
    virtual ~RBBILineMonkey();
    // Returns fSets, the vector of character-class sets.
    virtual UVector *charClasses();
    // Records the text to scan and hands it to fCharBI and fNumberMatcher.
    virtual void setText(const UnicodeString &s);
    // Returns the index of the next line break opportunity after position i, or -1.
    virtual int32_t next(int32_t i);
    // Applies line break rules LB 9 and LB 10 (combining sequences) to the
    // character at pos; may update *posChar, *nextPos and *nextChar.
    virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
private:
    // Vector of pointers to the character-class sets below.
    UVector *fSets;

    // Character-class sets, named after the Line_Break property value each
    // one is built from in the constructor. Exceptions set up there:
    //   fSG  is the surrogate range U+D800..U+DFFF,
    //   fHH  holds only U+2010 HYPHEN,
    //   fAL  also absorbs XX, AI and SG; fNS absorbs CJ; fCM absorbs ZWJ.
    UnicodeSet *fBK;
    UnicodeSet *fCR;
    UnicodeSet *fLF;
    UnicodeSet *fCM;
    UnicodeSet *fNL;
    UnicodeSet *fSG;
    UnicodeSet *fWJ;
    UnicodeSet *fZW;
    UnicodeSet *fGL;
    UnicodeSet *fCB;
    UnicodeSet *fSP;
    UnicodeSet *fB2;
    UnicodeSet *fBA;
    UnicodeSet *fBB;
    UnicodeSet *fHH;
    UnicodeSet *fHY;
    UnicodeSet *fH2;
    UnicodeSet *fH3;
    UnicodeSet *fCL;
    UnicodeSet *fCP;
    UnicodeSet *fEX;
    UnicodeSet *fIN;
    UnicodeSet *fJL;
    UnicodeSet *fJV;
    UnicodeSet *fJT;
    UnicodeSet *fNS;
    UnicodeSet *fOP;
    UnicodeSet *fQU;
    UnicodeSet *fIS;
    UnicodeSet *fNU;
    UnicodeSet *fPO;
    UnicodeSet *fPR;
    UnicodeSet *fSY;
    UnicodeSet *fAI;
    UnicodeSet *fAL;
    UnicodeSet *fCJ;
    UnicodeSet *fHL;
    UnicodeSet *fID;
    UnicodeSet *fRI;
    UnicodeSet *fXX;
    UnicodeSet *fEB;
    UnicodeSet *fEM;
    UnicodeSet *fZWJ;

    // Character (grapheme) break iterator created in the constructor.
    BreakIterator *fCharBI;
    // Text currently being scanned; set by setText(), not owned.
    const UnicodeString *fText;
    // Regular expression matcher for the number sequences of rule LB 25.
    RegexMatcher *fNumberMatcher;
};
2565
RBBILineMonkey()2566 RBBILineMonkey::RBBILineMonkey() :
2567 RBBIMonkeyKind(),
2568 fSets(NULL),
2569
2570 fCharBI(NULL),
2571 fText(NULL),
2572 fNumberMatcher(NULL)
2573
2574 {
2575 if (U_FAILURE(deferredStatus)) {
2576 return;
2577 }
2578
2579 UErrorCode status = U_ZERO_ERROR;
2580
2581 fSets = new UVector(status);
2582
2583 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2584 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2585 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2586 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2587 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2588 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2589 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2590 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2591 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2592 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2593 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2594 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2595 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2596 fHH = new UnicodeSet();
2597 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2598 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2599 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2600 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2601 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2602 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2603 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2604 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2605 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2606 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2607 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2608 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2609 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2610 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2611 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2612 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2613 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2614 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2615 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2616 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2617 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2618 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2619 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2620 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2621 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2622 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2623 fEB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status);
2624 fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
2625 fZWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
2626
2627 if (U_FAILURE(status)) {
2628 deferredStatus = status;
2629 return;
2630 }
2631
2632 fAL->addAll(*fXX); // Default behavior for XX is identical to AL
2633 fAL->addAll(*fAI); // Default behavior for AI is identical to AL
2634 fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
2635
2636 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
2637 fCM->addAll(*fZWJ); // ZWJ behaves as a CM.
2638
2639 fHH->add(u'\u2010'); // Hyphen, '‐'
2640
2641 fSets->addElement(fBK, status);
2642 fSets->addElement(fCR, status);
2643 fSets->addElement(fLF, status);
2644 fSets->addElement(fCM, status);
2645 fSets->addElement(fNL, status);
2646 fSets->addElement(fWJ, status);
2647 fSets->addElement(fZW, status);
2648 fSets->addElement(fGL, status);
2649 fSets->addElement(fCB, status);
2650 fSets->addElement(fSP, status);
2651 fSets->addElement(fB2, status);
2652 fSets->addElement(fBA, status);
2653 fSets->addElement(fBB, status);
2654 fSets->addElement(fHY, status);
2655 fSets->addElement(fH2, status);
2656 fSets->addElement(fH3, status);
2657 fSets->addElement(fCL, status);
2658 fSets->addElement(fCP, status);
2659 fSets->addElement(fEX, status);
2660 fSets->addElement(fIN, status);
2661 fSets->addElement(fJL, status);
2662 fSets->addElement(fJT, status);
2663 fSets->addElement(fJV, status);
2664 fSets->addElement(fNS, status);
2665 fSets->addElement(fOP, status);
2666 fSets->addElement(fQU, status);
2667 fSets->addElement(fIS, status);
2668 fSets->addElement(fNU, status);
2669 fSets->addElement(fPO, status);
2670 fSets->addElement(fPR, status);
2671 fSets->addElement(fSY, status);
2672 fSets->addElement(fAI, status);
2673 fSets->addElement(fAL, status);
2674 fSets->addElement(fHL, status);
2675 fSets->addElement(fID, status);
2676 fSets->addElement(fWJ, status);
2677 fSets->addElement(fRI, status);
2678 fSets->addElement(fSG, status);
2679 fSets->addElement(fEB, status);
2680 fSets->addElement(fEM, status);
2681 fSets->addElement(fZWJ, status);
2682
2683
2684 const char *rules =
2685 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
2686 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
2687 "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
2688 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
2689 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
2690 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
2691
2692 fNumberMatcher = new RegexMatcher(
2693 UnicodeString(rules, -1, US_INV), 0, status);
2694
2695 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2696
2697 if (U_FAILURE(status)) {
2698 deferredStatus = status;
2699 }
2700 }
2701
2702
setText(const UnicodeString & s)2703 void RBBILineMonkey::setText(const UnicodeString &s) {
2704 fText = &s;
2705 fCharBI->setText(s);
2706 fNumberMatcher->reset(s);
2707 }
2708
2709 //
2710 // rule9Adjust
2711 // Line Break TR rules 9 and 10 implementation.
2712 // This deals with combining marks and other sequences that
2713 // that must be treated as if they were something other than what they actually are.
2714 //
2715 // This is factored out into a separate function because it must be applied twice for
2716 // each potential break, once to the chars before the position being checked, then
2717 // again to the text following the possible break.
2718 //
rule9Adjust(int32_t pos,UChar32 * posChar,int32_t * nextPos,UChar32 * nextChar)2719 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2720 if (pos == -1) {
2721 // Invalid initial position. Happens during the warmup iteration of the
2722 // main loop in next().
2723 return;
2724 }
2725
2726 int32_t nPos = *nextPos;
2727
2728 // LB 9 Keep combining sequences together.
2729 // advance over any CM class chars. Note that Line Break CM is different
2730 // from the normal Grapheme Extend property.
2731 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2732 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2733 for (;;) {
2734 *nextChar = fText->char32At(nPos);
2735 if (!fCM->contains(*nextChar)) {
2736 break;
2737 }
2738 nPos = fText->moveIndex32(nPos, 1);
2739 }
2740 }
2741
2742
2743 // LB 9 Treat X CM* as if it were x.
2744 // No explicit action required.
2745
2746 // LB 10 Treat any remaining combining mark as AL
2747 if (fCM->contains(*posChar)) {
2748 *posChar = u'A';
2749 }
2750
2751 // Push the updated nextPos and nextChar back to our caller.
2752 // This only makes a difference if posChar got bigger by consuming a
2753 // combining sequence.
2754 *nextPos = nPos;
2755 *nextChar = fText->char32At(nPos);
2756 }
2757
2758
2759
next(int32_t startPos)2760 int32_t RBBILineMonkey::next(int32_t startPos) {
2761 UErrorCode status = U_ZERO_ERROR;
2762 int32_t pos; // Index of the char following a potential break position
2763 UChar32 thisChar; // Character at above position "pos"
2764
2765 int32_t prevPos; // Index of the char preceding a potential break position
2766 UChar32 prevChar; // Character at above position. Note that prevChar
2767 // and thisChar may not be adjacent because combining
2768 // characters between them will be ignored.
2769
2770 int32_t prevPosX2; // Second previous character. Wider context for LB21a.
2771 UChar32 prevCharX2;
2772
2773 int32_t nextPos; // Index of the next character following pos.
2774 // Usually skips over combining marks.
2775 int32_t nextCPPos; // Index of the code point following "pos."
2776 // May point to a combining mark.
2777 int32_t tPos; // temp value.
2778 UChar32 c;
2779
2780 if (U_FAILURE(deferredStatus)) {
2781 return -1;
2782 }
2783
2784 if (startPos >= fText->length()) {
2785 return -1;
2786 }
2787
2788
2789 // Initial values for loop. Loop will run the first time without finding breaks,
2790 // while the invalid values shift out and the "this" and
2791 // "prev" positions are filled in with good values.
2792 pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration.
2793 thisChar = prevChar = prevCharX2 = 0;
2794 nextPos = nextCPPos = startPos;
2795
2796
2797 // Loop runs once per position in the test text, until a break position
2798 // is found.
2799 for (;;) {
2800 prevPosX2 = prevPos;
2801 prevCharX2 = prevChar;
2802
2803 prevPos = pos;
2804 prevChar = thisChar;
2805
2806 pos = nextPos;
2807 thisChar = fText->char32At(pos);
2808
2809 nextCPPos = fText->moveIndex32(pos, 1);
2810 nextPos = nextCPPos;
2811
2812 // Rule LB2 - Break at end of text.
2813 if (pos >= fText->length()) {
2814 break;
2815 }
2816
2817 // Rule LB 9 - adjust for combining sequences.
2818 // We do this one out-of-order because the adjustment does not change anything
2819 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
2820 // be applied.
2821 rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
2822 nextCPPos = nextPos = fText->moveIndex32(pos, 1);
2823 c = fText->char32At(nextPos);
2824 rule9Adjust(pos, &thisChar, &nextPos, &c);
2825
2826 // If the loop is still warming up - if we haven't shifted the initial
2827 // -1 positions out of prevPos yet - loop back to advance the
2828 // position in the input without any further looking for breaks.
2829 if (prevPos == -1) {
2830 continue;
2831 }
2832
2833 // LB 4 Always break after hard line breaks,
2834 if (fBK->contains(prevChar)) {
2835 break;
2836 }
2837
2838 // LB 5 Break after CR, LF, NL, but not inside CR LF
2839 if (prevChar == 0x0d && thisChar == 0x0a) {
2840 continue;
2841 }
2842 if (prevChar == 0x0d ||
2843 prevChar == 0x0a ||
2844 prevChar == 0x85) {
2845 break;
2846 }
2847
2848 // LB 6 Don't break before hard line breaks
2849 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
2850 fBK->contains(thisChar)) {
2851 continue;
2852 }
2853
2854
2855 // LB 7 Don't break before spaces or zero-width space.
2856 if (fSP->contains(thisChar)) {
2857 continue;
2858 }
2859
2860 if (fZW->contains(thisChar)) {
2861 continue;
2862 }
2863
2864 // LB 8 Break after zero width space
2865 // ZW SP* ÷
2866 // Scan backwards from prevChar for SP* ZW
2867 tPos = prevPos;
2868 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
2869 tPos = fText->moveIndex32(tPos, -1);
2870 }
2871 if (fZW->contains(fText->char32At(tPos))) {
2872 break;
2873 }
2874
2875 // LB 25 Numbers
2876 // Move this test up, before LB8a, because numbers can match a longer sequence that would
2877 // also match 8a. e.g. NU ZWJ IS PO (ZWJ acts like CM)
2878 if (fNumberMatcher->lookingAt(prevPos, status)) {
2879 if (U_FAILURE(status)) {
2880 break;
2881 }
2882 // Matched a number. But could have been just a single digit, which would
2883 // not represent a "no break here" between prevChar and thisChar
2884 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
2885 if (numEndIdx > pos) {
2886 // Number match includes at least our two chars being checked
2887 if (numEndIdx > nextPos) {
2888 // Number match includes additional chars. Update pos and nextPos
2889 // so that next loop iteration will continue at the end of the number,
2890 // checking for breaks between last char in number & whatever follows.
2891 pos = nextPos = numEndIdx;
2892 do {
2893 pos = fText->moveIndex32(pos, -1);
2894 thisChar = fText->char32At(pos);
2895 } while (fCM->contains(thisChar));
2896 }
2897 continue;
2898 }
2899 }
2900
2901 // LB 8a ZWJ x
2902 // The monkey test's way of ignoring combining characters doesn't work
2903 // for this rule. ZJ is also a CM. Need to get the actual character
2904 // preceding "thisChar", not ignoring combining marks, possibly ZJ.
2905 {
2906 int32_t prevIdx = fText->moveIndex32(pos, -1);
2907 UChar32 prevC = fText->char32At(prevIdx);
2908 if (fZWJ->contains(prevC)) {
2909 continue;
2910 }
2911 }
2912
2913 // LB 9, 10 Already done, at top of loop.
2914 //
2915
2916
2917 // LB 11 Do not break before or after WORD JOINER and related characters.
2918 // x WJ
2919 // WJ x
2920 //
2921 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
2922 continue;
2923 }
2924
2925 // LB 12
2926 // GL x
2927 if (fGL->contains(prevChar)) {
2928 continue;
2929 }
2930
2931 // LB 12a
2932 // [^SP BA HY] x GL
2933 if (!(fSP->contains(prevChar) ||
2934 fBA->contains(prevChar) ||
2935 fHY->contains(prevChar) ) && fGL->contains(thisChar)) {
2936 continue;
2937 }
2938
2939
2940
2941 // LB 13 Don't break before closings.
2942 // NU x CL, NU x CP and NU x IS are not matched here so that they will
2943 // fall into LB 17 and the more general number regular expression.
2944 //
2945 if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
2946 (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
2947 fEX->contains(thisChar) ||
2948 (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
2949 (!fNU->contains(prevChar) && fSY->contains(thisChar))) {
2950 continue;
2951 }
2952
2953 // LB 14 Don't break after OP SP*
2954 // Scan backwards, checking for this sequence.
2955 // The OP char could include combining marks, so we actually check for
2956 // OP CM* SP*
2957 // Another Twist: The Rule 67 fixes may have changed a SP CM
2958 // sequence into a ID char, so before scanning back through spaces,
2959 // verify that prevChar is indeed a space. The prevChar variable
2960 // may differ from fText[prevPos]
2961 tPos = prevPos;
2962 if (fSP->contains(prevChar)) {
2963 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
2964 tPos=fText->moveIndex32(tPos, -1);
2965 }
2966 }
2967 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
2968 tPos=fText->moveIndex32(tPos, -1);
2969 }
2970 if (fOP->contains(fText->char32At(tPos))) {
2971 continue;
2972 }
2973
2974
2975 // LB 15 QU SP* x OP
2976 if (fOP->contains(thisChar)) {
2977 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
2978 int tPos = prevPos;
2979 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
2980 tPos = fText->moveIndex32(tPos, -1);
2981 }
2982 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
2983 tPos = fText->moveIndex32(tPos, -1);
2984 }
2985 if (fQU->contains(fText->char32At(tPos))) {
2986 continue;
2987 }
2988 }
2989
2990
2991
2992 // LB 16 (CL | CP) SP* x NS
2993 // Scan backwards for SP* CM* (CL | CP)
2994 if (fNS->contains(thisChar)) {
2995 int tPos = prevPos;
2996 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
2997 tPos = fText->moveIndex32(tPos, -1);
2998 }
2999 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3000 tPos = fText->moveIndex32(tPos, -1);
3001 }
3002 if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3003 continue;
3004 }
3005 }
3006
3007
3008 // LB 17 B2 SP* x B2
3009 if (fB2->contains(thisChar)) {
3010 // Scan backwards, checking for the B2 CM* SP* sequence.
3011 tPos = prevPos;
3012 if (fSP->contains(prevChar)) {
3013 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3014 tPos=fText->moveIndex32(tPos, -1);
3015 }
3016 }
3017 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3018 tPos=fText->moveIndex32(tPos, -1);
3019 }
3020 if (fB2->contains(fText->char32At(tPos))) {
3021 continue;
3022 }
3023 }
3024
3025
3026 // LB 18 break after space
3027 if (fSP->contains(prevChar)) {
3028 break;
3029 }
3030
3031 // LB 19
3032 // x QU
3033 // QU x
3034 if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3035 continue;
3036 }
3037
3038 // LB 20 Break around a CB
3039 if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3040 break;
3041 }
3042
3043 // LB 20.09 Don't break between Hyphens and letters if a break precedes the hyphen.
3044 // Formerly this was a Finnish tailoring.
3045 // Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
3046 // ^($HY | $HH) $AL;
3047 if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
3048 prevPosX2 == -1) {
3049 continue;
3050 }
3051
3052 // LB 21
3053 if (fBA->contains(thisChar) ||
3054 fHY->contains(thisChar) ||
3055 fNS->contains(thisChar) ||
3056 fBB->contains(prevChar) ) {
3057 continue;
3058 }
3059
3060 // LB 21a
3061 // HL (HY | BA) x
3062 if (fHL->contains(prevCharX2) &&
3063 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3064 continue;
3065 }
3066
3067 // LB 21b
3068 // SY x HL
3069 if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3070 continue;
3071 }
3072
3073 // LB 22
3074 if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3075 (fEX->contains(prevChar) && fIN->contains(thisChar)) ||
3076 (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
3077 ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && fIN->contains(thisChar)) ||
3078 (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3079 (fNU->contains(prevChar) && fIN->contains(thisChar)) ) {
3080 continue;
3081 }
3082
3083
3084 // LB 23 (AL | HL) x NU
3085 // NU x (AL | HL)
3086 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
3087 continue;
3088 }
3089 if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3090 continue;
3091 }
3092
3093 // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3094 // PR x (ID | EB | EM)
3095 // (ID | EB | EM) x PO
3096 if (fPR->contains(prevChar) &&
3097 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar))) {
3098 continue;
3099 }
3100 if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
3101 fPO->contains(thisChar)) {
3102 continue;
3103 }
3104
3105 // LB 24 Do not break between prefix and letters or ideographs.
3106 // (PR | PO) x (AL | HL)
3107 // (AL | HL) x (PR | PO)
3108 if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
3109 (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3110 continue;
3111 }
3112 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
3113 (fPR->contains(thisChar) || fPO->contains(thisChar))) {
3114 continue;
3115 }
3116
3117 // LB 25 numbers match, moved up, before LB 8a,
3118
3119 // LB 26 Do not break a Korean syllable.
3120 if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3121 fJV->contains(thisChar) ||
3122 fH2->contains(thisChar) ||
3123 fH3->contains(thisChar))) {
3124 continue;
3125 }
3126
3127 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) &&
3128 (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3129 continue;
3130 }
3131
3132 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3133 fJT->contains(thisChar)) {
3134 continue;
3135 }
3136
3137 // LB 27 Treat a Korean Syllable Block the same as ID.
3138 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3139 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3140 fIN->contains(thisChar)) {
3141 continue;
3142 }
3143 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3144 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3145 fPO->contains(thisChar)) {
3146 continue;
3147 }
3148 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3149 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3150 continue;
3151 }
3152
3153
3154
3155 // LB 28 Do not break between alphabetics ("at").
3156 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3157 continue;
3158 }
3159
3160 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
3161 if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3162 continue;
3163 }
3164
3165 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3166 // (AL | NU) x OP
3167 // CP x (AL | NU)
3168 if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3169 continue;
3170 }
3171 if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3172 continue;
3173 }
3174
3175 // LB30a RI RI ÷ RI
3176 // RI x RI
3177 if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3178 break;
3179 }
3180 if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3181 // Two Regional Indicators have been paired.
3182 // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
3183 // following RI. This is a hack.
3184 thisChar = -1;
3185 continue;
3186 }
3187
3188 // LB30b Emoji Base x Emoji Modifier
3189 if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3190 continue;
3191 }
3192
3193 // LB 31 Break everywhere else
3194 break;
3195
3196 }
3197
3198 return pos;
3199 }
3200
3201
charClasses()3202 UVector *RBBILineMonkey::charClasses() {
3203 return fSets;
3204 }
3205
3206
~RBBILineMonkey()3207 RBBILineMonkey::~RBBILineMonkey() {
3208 delete fSets;
3209
3210 delete fBK;
3211 delete fCR;
3212 delete fLF;
3213 delete fCM;
3214 delete fNL;
3215 delete fWJ;
3216 delete fZW;
3217 delete fGL;
3218 delete fCB;
3219 delete fSP;
3220 delete fB2;
3221 delete fBA;
3222 delete fBB;
3223 delete fHH;
3224 delete fHY;
3225 delete fH2;
3226 delete fH3;
3227 delete fCL;
3228 delete fCP;
3229 delete fEX;
3230 delete fIN;
3231 delete fJL;
3232 delete fJV;
3233 delete fJT;
3234 delete fNS;
3235 delete fOP;
3236 delete fQU;
3237 delete fIS;
3238 delete fNU;
3239 delete fPO;
3240 delete fPR;
3241 delete fSY;
3242 delete fAI;
3243 delete fAL;
3244 delete fCJ;
3245 delete fHL;
3246 delete fID;
3247 delete fRI;
3248 delete fSG;
3249 delete fXX;
3250 delete fEB;
3251 delete fEM;
3252 delete fZWJ;
3253
3254 delete fCharBI;
3255 delete fNumberMatcher;
3256 }
3257
3258
3259 //-------------------------------------------------------------------------------------------
3260 //
3261 // TestMonkey
3262 //
3263 // params
3264 // seed=nnnnn Random number starting seed.
3265 // Setting the seed allows errors to be reproduced.
3266 // loop=nnn Looping count. Controls running time.
3267 // -1: run forever.
3268 // 0 or greater: run length.
3269 //
//       type = char | word | line | sent | title
//       utext              Sets the useUText flag that TestMonkey passes to RunMonkey.
3271 //
3272 // Example:
3273 // intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3274 //
3275 //-------------------------------------------------------------------------------------------
3276
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)3277 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) {
3278 int32_t val = defaultVal;
3279 name.append(" *= *(-?\\d+)");
3280 UErrorCode status = U_ZERO_ERROR;
3281 RegexMatcher m(name, params, 0, status);
3282 if (m.find()) {
3283 // The param exists. Convert the string to an int.
3284 char valString[100];
3285 int32_t paramLength = m.end(1, status) - m.start(1, status);
3286 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3287 paramLength = (int32_t)(sizeof(valString)-2);
3288 }
3289 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3290 val = strtol(valString, NULL, 10);
3291
3292 // Delete this parameter from the params string.
3293 m.reset();
3294 params = m.replaceFirst("", status);
3295 }
3296 U_ASSERT(U_SUCCESS(status));
3297 return val;
3298 }
3299 #endif
3300
3301 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
testBreakBoundPreceding(RBBITest * test,UnicodeString ustr,BreakIterator * bi,int expected[],int expectedcount)3302 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3303 BreakIterator *bi,
3304 int expected[],
3305 int expectedcount)
3306 {
3307 int count = 0;
3308 int i = 0;
3309 int forward[50];
3310 bi->setText(ustr);
3311 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3312 forward[count] = i;
3313 if (count < expectedcount && expected[count] != i) {
3314 test->errln("%s:%d break forward test failed: expected %d but got %d",
3315 __FILE__, __LINE__, expected[count], i);
3316 break;
3317 }
3318 count ++;
3319 }
3320 if (count != expectedcount) {
3321 printStringBreaks(ustr, expected, expectedcount);
3322 test->errln("%s:%d break forward test failed: missed %d match",
3323 __FILE__, __LINE__, expectedcount - count);
3324 return;
3325 }
3326 // testing boundaries
3327 for (i = 1; i < expectedcount; i ++) {
3328 int j = expected[i - 1];
3329 if (!bi->isBoundary(j)) {
3330 printStringBreaks(ustr, expected, expectedcount);
3331 test->errln("%s:%d isBoundary() failed. Expected boundary at position %d",
3332 __FILE__, __LINE__, j);
3333 return;
3334 }
3335 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3336 if (bi->isBoundary(j)) {
3337 printStringBreaks(ustr, expected, expectedcount);
3338 test->errln("%s:%d isBoundary() failed. Not expecting boundary at position %d",
3339 __FILE__, __LINE__, j);
3340 return;
3341 }
3342 }
3343 }
3344
3345 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3346 count --;
3347 if (forward[count] != i) {
3348 printStringBreaks(ustr, expected, expectedcount);
3349 test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
3350 __FILE__, __LINE__, forward[count], i);
3351 break;
3352 }
3353 }
3354 if (count != 0) {
3355 printStringBreaks(ustr, expected, expectedcount);
3356 test->errln("break test previous() failed: missed a match");
3357 return;
3358 }
3359
3360 // testing preceding
3361 for (i = 0; i < expectedcount - 1; i ++) {
3362 // int j = expected[i] + 1;
3363 int j = ustr.moveIndex32(expected[i], 1);
3364 for (; j <= expected[i + 1]; j ++) {
3365 int32_t expectedPreceding = expected[i];
3366 int32_t actualPreceding = bi->preceding(j);
3367 if (actualPreceding != expectedPreceding) {
3368 printStringBreaks(ustr, expected, expectedcount);
3369 test->errln("%s:%d preceding(%d): expected %d, got %d",
3370 __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
3371 return;
3372 }
3373 }
3374 }
3375 }
3376 #endif
3377
TestWordBreaks(void)3378 void RBBITest::TestWordBreaks(void)
3379 {
3380 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3381
3382 Locale locale("en");
3383 UErrorCode status = U_ZERO_ERROR;
3384 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3385 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3386 // Replaced any C+J characters in a row with a random sequence of characters
3387 // of the same length to make our C+J segmentation not get in the way.
3388 static const char *strlist[] =
3389 {
3390 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3391 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3392 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3393 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3394 "\\uac00\\u3588\\u009c\\u0953\\u194b",
3395 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3396 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3397 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3398 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3399 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3400 "\\u2027\\U000e0067\\u0a47\\u00b7",
3401 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3402 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3403 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3404 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3405 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3406 "\\u0027\\u11af\\U000e0057\\u0602",
3407 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3408 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3409 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3410 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3411 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3412 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3413 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3414 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3415 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3416 "\\u18f4\\U000e0049\\u20e7\\u2027",
3417 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3418 "\\ua183\\u102d\\u0bec\\u003a",
3419 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3420 "\\u003a\\u0e57\\u0fad\\u002e",
3421 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3422 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3423 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3424 "\\u003a\\u0664\\u00b7\\u1fba",
3425 "\\u003b\\u0027\\u00b7\\u47a3",
3426 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3427 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3428 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3429 };
3430 int loop;
3431 if (U_FAILURE(status)) {
3432 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3433 return;
3434 }
3435 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3436 // printf("looping %d\n", loop);
3437 UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3438 // RBBICharMonkey monkey;
3439 RBBIWordMonkey monkey;
3440
3441 int expected[50];
3442 int expectedcount = 0;
3443
3444 monkey.setText(ustr);
3445 int i;
3446 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3447 expected[expectedcount ++] = i;
3448 }
3449
3450 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3451 }
3452 delete bi;
3453 #endif
3454 }
3455
TestWordBoundary(void)3456 void RBBITest::TestWordBoundary(void)
3457 {
3458 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3459 Locale locale("en");
3460 UErrorCode status = U_ZERO_ERROR;
3461 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3462 LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
3463 if (U_FAILURE(status)) {
3464 errcheckln(status, "%s:%d Creation of break iterator failed %s",
3465 __FILE__, __LINE__, u_errorName(status));
3466 return;
3467 }
3468 UChar str[50];
3469 static const char *strlist[] =
3470 {
3471 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3472 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3473 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3474 "\\u2027\\U000e0067\\u0a47\\u00b7",
3475 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3476 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3477 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3478 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3479 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3480 "\\u0027\\u11af\\U000e0057\\u0602",
3481 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3482 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3483 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3484 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3485 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3486 "\\U000e0065\\u302c\\u09ee\\U000e0068",
3487 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3488 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3489 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3490 "\\u58f4\\U000e0049\\u20e7\\u2027",
3491 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3492 "\\ua183\\u102d\\u0bec\\u003a",
3493 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3494 "\\u003a\\u0e57\\u0fad\\u002e",
3495 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3496 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3497 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3498 "\\u003a\\u0664\\u00b7\\u1fba",
3499 "\\u003b\\u0027\\u00b7\\u47a3",
3500 };
3501 int loop;
3502 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3503 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3504 UnicodeString ustr(str);
3505 int forward[50];
3506 int count = 0;
3507
3508 bi->setText(ustr);
3509 int prev = -1;
3510 for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
3511 ++count;
3512 if (count >= UPRV_LENGTHOF(forward)) {
3513 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
3514 __FILE__, __LINE__, loop, count, boundary);
3515 return;
3516 }
3517 forward[count] = boundary;
3518 if (boundary <= prev) {
3519 errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
3520 __FILE__, __LINE__, loop, prev, boundary);
3521 break;
3522 }
3523 for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
3524 if (bi->isBoundary(nonBoundary)) {
3525 printStringBreaks(ustr, forward, count);
3526 errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
3527 __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
3528 return;
3529 }
3530 }
3531 if (!bi->isBoundary(boundary)) {
3532 printStringBreaks(ustr, forward, count);
3533 errln("%s:%d happy boundary test failed: expected %d a boundary",
3534 __FILE__, __LINE__, boundary);
3535 return;
3536 }
3537 prev = boundary;
3538 }
3539 }
3540 }
3541
TestLineBreaks(void)3542 void RBBITest::TestLineBreaks(void)
3543 {
3544 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3545 Locale locale("en");
3546 UErrorCode status = U_ZERO_ERROR;
3547 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3548 const int32_t STRSIZE = 50;
3549 UChar str[STRSIZE];
3550 static const char *strlist[] =
3551 {
3552 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3553 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3554 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3555 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3556 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3557 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3558 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3559 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3560 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3561 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3562 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3563 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3564 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3565 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3566 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3567 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3568 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3569 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3570 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3571 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3572 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3573 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3574 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3575 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3576 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3577 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3578 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3579 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3580 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3581 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3582 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3583 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3584 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3585 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3586 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3587 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3588 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3589 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3590 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3591 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3592 };
3593 int loop;
3594 TEST_ASSERT_SUCCESS(status);
3595 if (U_FAILURE(status)) {
3596 return;
3597 }
3598 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3599 // printf("looping %d\n", loop);
3600 int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3601 if (t >= STRSIZE) {
3602 TEST_ASSERT(FALSE);
3603 continue;
3604 }
3605
3606
3607 UnicodeString ustr(str);
3608 RBBILineMonkey monkey;
3609 if (U_FAILURE(monkey.deferredStatus)) {
3610 continue;
3611 }
3612
3613 const int EXPECTEDSIZE = 50;
3614 int expected[EXPECTEDSIZE];
3615 int expectedcount = 0;
3616
3617 monkey.setText(ustr);
3618 int i;
3619 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3620 if (expectedcount >= EXPECTEDSIZE) {
3621 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3622 return;
3623 }
3624 expected[expectedcount ++] = i;
3625 }
3626
3627 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3628 }
3629 delete bi;
3630 #endif
3631 }
3632
TestSentBreaks(void)3633 void RBBITest::TestSentBreaks(void)
3634 {
3635 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3636 Locale locale("en");
3637 UErrorCode status = U_ZERO_ERROR;
3638 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3639 UChar str[200];
3640 static const char *strlist[] =
3641 {
3642 "Now\ris\nthe\r\ntime\n\rfor\r\r",
3643 "This\n",
3644 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3645 "\"Sentence ending with a quote.\" Bye.",
3646 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
3647 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3648 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3649 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3650 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3651 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3652 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3653 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3654 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3655 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3656 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3657 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3658 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3659 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3660 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3661 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3662 };
3663 int loop;
3664 if (U_FAILURE(status)) {
3665 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3666 return;
3667 }
3668 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3669 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3670 UnicodeString ustr(str);
3671
3672 RBBISentMonkey monkey;
3673 if (U_FAILURE(monkey.deferredStatus)) {
3674 continue;
3675 }
3676
3677 const int EXPECTEDSIZE = 50;
3678 int expected[EXPECTEDSIZE];
3679 int expectedcount = 0;
3680
3681 monkey.setText(ustr);
3682 int i;
3683 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3684 if (expectedcount >= EXPECTEDSIZE) {
3685 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3686 return;
3687 }
3688 expected[expectedcount ++] = i;
3689 }
3690
3691 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3692 }
3693 delete bi;
3694 #endif
3695 }
3696
TestMonkey()3697 void RBBITest::TestMonkey() {
3698 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3699
3700 UErrorCode status = U_ZERO_ERROR;
3701 int32_t loopCount = 500;
3702 int32_t seed = 1;
3703 UnicodeString breakType = "all";
3704 Locale locale("en");
3705 UBool useUText = FALSE;
3706
3707 if (quick == FALSE) {
3708 loopCount = 10000;
3709 }
3710
3711 if (fTestParams) {
3712 UnicodeString p(fTestParams);
3713 loopCount = getIntParam("loop", p, loopCount);
3714 seed = getIntParam("seed", p, seed);
3715
3716 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3717 if (m.find()) {
3718 breakType = m.group(1, status);
3719 m.reset();
3720 p = m.replaceFirst("", status);
3721 }
3722
3723 RegexMatcher u(" *utext", p, 0, status);
3724 if (u.find()) {
3725 useUText = TRUE;
3726 u.reset();
3727 p = u.replaceFirst("", status);
3728 }
3729
3730
3731 // m.reset(p);
3732 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3733 // Each option is stripped out of the option string as it is processed.
3734 // All options have been checked. The option string should have been completely emptied..
3735 char buf[100];
3736 p.extract(buf, sizeof(buf), NULL, status);
3737 buf[sizeof(buf)-1] = 0;
3738 errln("Unrecognized or extra parameter: %s\n", buf);
3739 return;
3740 }
3741
3742 }
3743
3744 if (breakType == "char" || breakType == "all") {
3745 RBBICharMonkey m;
3746 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3747 if (U_SUCCESS(status)) {
3748 RunMonkey(bi, m, "char", seed, loopCount, useUText);
3749 if (breakType == "all" && useUText==FALSE) {
3750 // Also run a quick test with UText when "all" is specified
3751 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3752 }
3753 }
3754 else {
3755 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3756 }
3757 delete bi;
3758 }
3759
3760 if (breakType == "word" || breakType == "all") {
3761 logln("Word Break Monkey Test");
3762 RBBIWordMonkey m;
3763 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3764 if (U_SUCCESS(status)) {
3765 RunMonkey(bi, m, "word", seed, loopCount, useUText);
3766 }
3767 else {
3768 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3769 }
3770 delete bi;
3771 }
3772
3773 if (breakType == "line" || breakType == "all") {
3774 logln("Line Break Monkey Test");
3775 RBBILineMonkey m;
3776 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3777 if (loopCount >= 10) {
3778 loopCount = loopCount / 5; // Line break runs slower than the others.
3779 }
3780 if (U_SUCCESS(status)) {
3781 RunMonkey(bi, m, "line", seed, loopCount, useUText);
3782 }
3783 else {
3784 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3785 }
3786 delete bi;
3787 }
3788
3789 if (breakType == "sent" || breakType == "all" ) {
3790 logln("Sentence Break Monkey Test");
3791 RBBISentMonkey m;
3792 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3793 if (loopCount >= 10) {
3794 loopCount = loopCount / 10; // Sentence runs slower than the other break types
3795 }
3796 if (U_SUCCESS(status)) {
3797 RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
3798 }
3799 else {
3800 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3801 }
3802 delete bi;
3803 }
3804
3805 #endif
3806 }
3807
//
//  Run an RBBI monkey test.  Common routine, for all break iterator types.
//    Parameters:
//       bi      - the break iterator to use
//       mk      - MonkeyKind, abstraction for obtaining expected results
//       name    - Name of test (char, word, etc.) for use in error messages
//       seed    - Seed for starting random number generator (parameter from user)
//       numIterations - Number of random test strings to check; -1 means run without end.
//       useUText - If true, the test text is given to the break iterator through a UText.
//
RunMonkey(BreakIterator * bi,RBBIMonkeyKind & mk,const char * name,uint32_t seed,int32_t numIterations,UBool useUText)3817 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
3818 int32_t numIterations, UBool useUText) {
3819
3820 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3821
3822 const int32_t TESTSTRINGLEN = 500;
3823 UnicodeString testText;
3824 int32_t numCharClasses;
3825 UVector *chClasses;
3826 int expected[TESTSTRINGLEN*2 + 1];
3827 int expectedCount = 0;
3828 char expectedBreaks[TESTSTRINGLEN*2 + 1];
3829 char forwardBreaks[TESTSTRINGLEN*2 + 1];
3830 char reverseBreaks[TESTSTRINGLEN*2+1];
3831 char isBoundaryBreaks[TESTSTRINGLEN*2+1];
3832 char followingBreaks[TESTSTRINGLEN*2+1];
3833 char precedingBreaks[TESTSTRINGLEN*2+1];
3834 int i;
3835 int loopCount = 0;
3836
3837 m_seed = seed;
3838
3839 numCharClasses = mk.charClasses()->size();
3840 chClasses = mk.charClasses();
3841
3842 // Check for errors that occured during the construction of the MonkeyKind object.
3843 // Can't report them where they occured because errln() is a method coming from intlTest,
3844 // and is not visible outside of RBBITest :-(
3845 if (U_FAILURE(mk.deferredStatus)) {
3846 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
3847 return;
3848 }
3849
3850 // Verify that the character classes all have at least one member.
3851 for (i=0; i<numCharClasses; i++) {
3852 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
3853 if (s == NULL || s->size() == 0) {
3854 errln("Character Class #%d is null or of zero size.", i);
3855 return;
3856 }
3857 }
3858
3859 while (loopCount < numIterations || numIterations == -1) {
3860 if (numIterations == -1 && loopCount % 10 == 0) {
3861 // If test is running in an infinite loop, display a periodic tic so
3862 // we can tell that it is making progress.
3863 fprintf(stderr, ".");
3864 }
3865 // Save current random number seed, so that we can recreate the random numbers
3866 // for this loop iteration in event of an error.
3867 seed = m_seed;
3868
3869 // Populate a test string with data.
3870 testText.truncate(0);
3871 for (i=0; i<TESTSTRINGLEN; i++) {
3872 int32_t aClassNum = m_rand() % numCharClasses;
3873 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
3874 int32_t charIdx = m_rand() % classSet->size();
3875 UChar32 c = classSet->charAt(charIdx);
3876 if (c < 0) { // TODO: deal with sets containing strings.
3877 errln("%s:%d c < 0", __FILE__, __LINE__);
3878 break;
3879 }
3880 // Do not assemble a supplementary character from randomly generated separate surrogates.
3881 // (It could be a dictionary character)
3882 if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
3883 continue;
3884 }
3885
3886 testText.append(c);
3887 }
3888
3889 // Calculate the expected results for this test string.
3890 mk.setText(testText);
3891 memset(expectedBreaks, 0, sizeof(expectedBreaks));
3892 expectedBreaks[0] = 1;
3893 int32_t breakPos = 0;
3894 expectedCount = 0;
3895 for (;;) {
3896 breakPos = mk.next(breakPos);
3897 if (breakPos == -1) {
3898 break;
3899 }
3900 if (breakPos > testText.length()) {
3901 errln("breakPos > testText.length()");
3902 }
3903 expectedBreaks[breakPos] = 1;
3904 U_ASSERT(expectedCount<testText.length());
3905 expected[expectedCount ++] = breakPos;
3906 (void)expected; // Set but not used warning.
3907 // TODO (andy): check it out.
3908 }
3909
3910 // Find the break positions using forward iteration
3911 memset(forwardBreaks, 0, sizeof(forwardBreaks));
3912 if (useUText) {
3913 UErrorCode status = U_ZERO_ERROR;
3914 UText *testUText = utext_openReplaceable(NULL, &testText, &status);
3915 // testUText = utext_openUnicodeString(testUText, &testText, &status);
3916 bi->setText(testUText, status);
3917 TEST_ASSERT_SUCCESS(status);
3918 utext_close(testUText); // The break iterator does a shallow clone of the UText
3919 // This UText can be closed immediately, so long as the
3920 // testText string continues to exist.
3921 } else {
3922 bi->setText(testText);
3923 }
3924
3925 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
3926 if (i < 0 || i > testText.length()) {
3927 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
3928 break;
3929 }
3930 forwardBreaks[i] = 1;
3931 }
3932
3933 // Find the break positions using reverse iteration
3934 memset(reverseBreaks, 0, sizeof(reverseBreaks));
3935 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
3936 if (i < 0 || i > testText.length()) {
3937 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
3938 break;
3939 }
3940 reverseBreaks[i] = 1;
3941 }
3942
3943 // Find the break positions using isBoundary() tests.
3944 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
3945 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
3946 for (i=0; i<=testText.length(); i++) {
3947 isBoundaryBreaks[i] = bi->isBoundary(i);
3948 }
3949
3950
3951 // Find the break positions using the following() function.
3952 // printf(".");
3953 memset(followingBreaks, 0, sizeof(followingBreaks));
3954 int32_t lastBreakPos = 0;
3955 followingBreaks[0] = 1;
3956 for (i=0; i<testText.length(); i++) {
3957 breakPos = bi->following(i);
3958 if (breakPos <= i ||
3959 breakPos < lastBreakPos ||
3960 breakPos > testText.length() ||
3961 (breakPos > lastBreakPos && lastBreakPos > i)) {
3962 errln("%s break monkey test: "
3963 "Out of range value returned by BreakIterator::following().\n"
3964 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
3965 name, seed, i, breakPos, lastBreakPos);
3966 break;
3967 }
3968 followingBreaks[breakPos] = 1;
3969 lastBreakPos = breakPos;
3970 }
3971
3972 // Find the break positions using the preceding() function.
3973 memset(precedingBreaks, 0, sizeof(precedingBreaks));
3974 lastBreakPos = testText.length();
3975 precedingBreaks[testText.length()] = 1;
3976 for (i=testText.length(); i>0; i--) {
3977 breakPos = bi->preceding(i);
3978 if (breakPos >= i ||
3979 breakPos > lastBreakPos ||
3980 (breakPos < 0 && testText.getChar32Start(i)>0) ||
3981 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
3982 errln("%s break monkey test: "
3983 "Out of range value returned by BreakIterator::preceding().\n"
3984 "index=%d; prev returned %d; lastBreak=%d" ,
3985 name, i, breakPos, lastBreakPos);
3986 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
3987 precedingBreaks[i] = 2; // Forces an error.
3988 }
3989 } else {
3990 if (breakPos >= 0) {
3991 precedingBreaks[breakPos] = 1;
3992 }
3993 lastBreakPos = breakPos;
3994 }
3995 }
3996
3997 // Compare the expected and actual results.
3998 for (i=0; i<=testText.length(); i++) {
3999 const char *errorType = NULL;
4000 if (forwardBreaks[i] != expectedBreaks[i]) {
4001 errorType = "next()";
4002 } else if (reverseBreaks[i] != forwardBreaks[i]) {
4003 errorType = "previous()";
4004 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4005 errorType = "isBoundary()";
4006 } else if (followingBreaks[i] != expectedBreaks[i]) {
4007 errorType = "following()";
4008 } else if (precedingBreaks[i] != expectedBreaks[i]) {
4009 errorType = "preceding()";
4010 }
4011
4012
4013 if (errorType != NULL) {
4014 // Format a range of the test text that includes the failure as
4015 // a data item that can be included in the rbbi test data file.
4016
4017 // Start of the range is the last point where expected and actual results
4018 // both agreed that there was a break position.
4019 int startContext = i;
4020 int32_t count = 0;
4021 for (;;) {
4022 if (startContext==0) { break; }
4023 startContext --;
4024 if (expectedBreaks[startContext] != 0) {
4025 if (count == 2) break;
4026 count ++;
4027 }
4028 }
4029
4030 // End of range is two expected breaks past the start position.
4031 int endContext = i + 1;
4032 int ci;
4033 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
4034 for (;;) {
4035 if (endContext >= testText.length()) {break;}
4036 if (expectedBreaks[endContext-1] != 0) {
4037 if (count == 0) break;
4038 count --;
4039 }
4040 endContext ++;
4041 }
4042 }
4043
4044 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4045 UnicodeString errorText = "<data>";
4046 /***if (strcmp(errorType, "next()") == 0) {
4047 startContext = 0;
4048 endContext = testText.length();
4049
4050 printStringBreaks(testText, expected, expectedCount);
4051 }***/
4052
4053 for (ci=startContext; ci<endContext;) {
4054 UnicodeString hexChars("0123456789abcdef");
4055 UChar32 c;
4056 int bn;
4057 c = testText.char32At(ci);
4058 if (ci == i) {
4059 // This is the location of the error.
4060 errorText.append("<?>");
4061 } else if (expectedBreaks[ci] != 0) {
4062 // This a non-error expected break position.
4063 errorText.append("\\");
4064 }
4065 if (c < 0x10000) {
4066 errorText.append("\\u");
4067 for (bn=12; bn>=0; bn-=4) {
4068 errorText.append(hexChars.charAt((c>>bn)&0xf));
4069 }
4070 } else {
4071 errorText.append("\\U");
4072 for (bn=28; bn>=0; bn-=4) {
4073 errorText.append(hexChars.charAt((c>>bn)&0xf));
4074 }
4075 }
4076 ci = testText.moveIndex32(ci, 1);
4077 }
4078 errorText.append("\\");
4079 errorText.append("</data>\n");
4080
4081 // Output the error
4082 char charErrorTxt[500];
4083 UErrorCode status = U_ZERO_ERROR;
4084 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4085 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4086 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4087
4088 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
4089 name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4090 errorType, seed, i, charErrorTxt);
4091 break;
4092 }
4093 }
4094
4095 loopCount++;
4096 }
4097 #endif
4098 }
4099
4100
4101 // Bug 5532. UTF-8 based UText fails in dictionary code.
4102 // This test checks the initial patch,
4103 // which is to just keep it from crashing. Correct word boundaries
4104 // await a proper fix to the dictionary code.
4105 //
TestBug5532(void)4106 void RBBITest::TestBug5532(void) {
4107 // Text includes a mixture of Thai and Latin.
4108 const unsigned char utf8Data[] = {
4109 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4110 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4111 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4112 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4113 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4114 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4115 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4116 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4117 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4118 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4119 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4120
4121 UErrorCode status = U_ZERO_ERROR;
4122 UText utext=UTEXT_INITIALIZER;
4123 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4124 TEST_ASSERT_SUCCESS(status);
4125
4126 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4127 TEST_ASSERT_SUCCESS(status);
4128 if (U_SUCCESS(status)) {
4129 bi->setText(&utext, status);
4130 TEST_ASSERT_SUCCESS(status);
4131
4132 int32_t breakCount = 0;
4133 int32_t previousBreak = -1;
4134 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4135 // For now, just make sure that the break iterator doesn't hang.
4136 TEST_ASSERT(previousBreak < bi->current());
4137 previousBreak = bi->current();
4138 }
4139 TEST_ASSERT(breakCount > 0);
4140 }
4141 delete bi;
4142 utext_close(&utext);
4143 }
4144
4145
TestBug9983(void)4146 void RBBITest::TestBug9983(void) {
4147 UnicodeString text = UnicodeString("\\u002A" // * Other
4148 "\\uFF65" // Other
4149 "\\u309C" // Katakana
4150 "\\uFF9F" // Extend
4151 "\\uFF65" // Other
4152 "\\u0020" // Other
4153 "\\u0000").unescape();
4154
4155 UErrorCode status = U_ZERO_ERROR;
4156 LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4157 BreakIterator::createWordInstance(Locale::getRoot(), status)));
4158 TEST_ASSERT_SUCCESS(status);
4159 LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4160 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4161 TEST_ASSERT_SUCCESS(status);
4162 if (U_FAILURE(status)) {
4163 return;
4164 }
4165 int32_t offset, rstatus, iterationCount;
4166
4167 brkiter->setText(text);
4168 brkiter->last();
4169 iterationCount = 0;
4170 while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4171 iterationCount++;
4172 rstatus = brkiter->getRuleStatus();
4173 (void)rstatus; // Suppress set but not used warning.
4174 if (iterationCount >= 10) {
4175 break;
4176 }
4177 }
4178 TEST_ASSERT(iterationCount == 6);
4179
4180 brkiterPOSIX->setText(text);
4181 brkiterPOSIX->last();
4182 iterationCount = 0;
4183 while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4184 iterationCount++;
4185 rstatus = brkiterPOSIX->getRuleStatus();
4186 (void)rstatus; // Suppress set but not used warning.
4187 if (iterationCount >= 10) {
4188 break;
4189 }
4190 }
4191 TEST_ASSERT(iterationCount == 6);
4192 }
4193
//  Bug 7547 - verify that building a break iterator from empty rules produces an error.
4195 //
TestBug7547()4196 void RBBITest::TestBug7547() {
4197 UnicodeString rules;
4198 UErrorCode status = U_ZERO_ERROR;
4199 UParseError parseError;
4200 RuleBasedBreakIterator breakIterator(rules, parseError, status);
4201 if (status != U_BRK_RULE_SYNTAX) {
4202 errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4203 }
4204 if (parseError.line != 1 || parseError.offset != 0) {
4205 errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4206 }
4207 }
4208
4209
TestBug12797()4210 void RBBITest::TestBug12797() {
4211 UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4212 UErrorCode status = U_ZERO_ERROR;
4213 UParseError parseError;
4214 RuleBasedBreakIterator bi(rules, parseError, status);
4215 if (U_FAILURE(status)) {
4216 errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4217 return;
4218 }
4219 UnicodeString text = "abc";
4220 bi.setText(text);
4221 bi.first();
4222 int32_t boundary = bi.next();
4223 if (boundary != 3) {
4224 errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4225 }
4226 }
4227
TestBug12918()4228 void RBBITest::TestBug12918() {
4229 // This test triggers an assertion failure in dictbe.cpp
4230 const UChar *crasherString = u"\u3325\u4a16";
4231 UErrorCode status = U_ZERO_ERROR;
4232 UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
4233 if (U_FAILURE(status)) {
4234 dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4235 return;
4236 }
4237 ubrk_first(iter);
4238 int32_t pos = 0;
4239 int32_t lastPos = -1;
4240 while((pos = ubrk_next(iter)) != UBRK_DONE) {
4241 if (pos <= lastPos) {
4242 errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4243 break;
4244 }
4245 }
4246 ubrk_close(iter);
4247 }
4248
TestBug12932()4249 void RBBITest::TestBug12932() {
4250 // Node Stack overflow in the RBBI rule parser caused a seg fault.
4251 UnicodeString ruleStr(
4252 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4253 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4254 "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4255 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4256 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4257 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4258
4259 UErrorCode status = U_ZERO_ERROR;
4260 UParseError parseError;
4261 RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
4262 if (status != U_BRK_RULE_SYNTAX) {
4263 errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4264 __FILE__, __LINE__, u_errorName(status));
4265 }
4266 }
4267
4268
// Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
//             remain undivided by ICU char, word and line break.
TestEmoji()4271 void RBBITest::TestEmoji() {
4272 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4273 UErrorCode status = U_ZERO_ERROR;
4274
4275 CharString testFileName;
4276 testFileName.append(IntlTest::getSourceTestData(status), status);
4277 testFileName.appendPathPart("emoji-test.txt", status);
4278 if (U_FAILURE(status)) {
4279 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4280 return;
4281 }
4282 logln("Opening data file %s\n", testFileName.data());
4283
4284 int len;
4285 UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
4286 if (U_FAILURE(status) || testFile == NULL) {
4287 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4288 return;
4289 }
4290 UnicodeString testFileAsString(testFile, len);
4291 delete [] testFile;
4292
4293 RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
4294 RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
4295 // hexMatcher group(1) is a hex number, or empty string if no hex number present.
4296 int32_t lineNumber = 0;
4297
4298 LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4299 LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4300 LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
4301 if (U_FAILURE(status)) {
4302 dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
4303 return;
4304 }
4305
4306 while (lineMatcher.find()) {
4307 ++lineNumber;
4308 UnicodeString line = lineMatcher.group(status);
4309 hexMatcher.reset(line);
4310 UnicodeString testString; // accumulates the emoji sequence.
4311 while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
4312 UnicodeString hex = hexMatcher.group(1, status);
4313 if (hex.length() > 8) {
4314 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
4315 break;
4316 }
4317 CharString hex8;
4318 hex8.appendInvariantChars(hex, status);
4319 UChar32 c = (UChar32)strtol(hex8.data(), NULL, 16);
4320 if (c<=0x10ffff) {
4321 testString.append(c);
4322 } else {
4323 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4324 __FILE__, __LINE__, lineNumber, hex8.data());
4325 break;
4326 }
4327 }
4328
4329 if (testString.length() > 1) {
4330 charBreaks->setText(testString);
4331 charBreaks->first();
4332 int32_t firstBreak = charBreaks->next();
4333 if (testString.length() != firstBreak) {
4334 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4335 __FILE__, __LINE__, lineNumber, firstBreak);
4336 }
4337 wordBreaks->setText(testString);
4338 wordBreaks->first();
4339 firstBreak = wordBreaks->next();
4340 if (testString.length() != firstBreak) {
4341 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4342 __FILE__, __LINE__, lineNumber, firstBreak);
4343 }
4344 lineBreaks->setText(testString);
4345 lineBreaks->first();
4346 firstBreak = lineBreaks->next();
4347 if (testString.length() != firstBreak) {
4348 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4349 __FILE__, __LINE__, lineNumber, firstBreak);
4350 }
4351 }
4352 }
4353 #endif
4354 }
4355
4356
4357 // TestBug12519 - Correct handling of Locales by assignment / copy / clone
4358
4359 // WHERE Macro yields a literal string of the form "source_file_name:line number "
4360 // TODO: propose something equivalent as a test framework addition.
4361
// WHERE concatenates __FILE__, ":", the stringized line number and a trailing
// space into one string literal, so it can also be joined with an adjacent
// literal message, e.g.  WHERE "some text".
#define WHERE __FILE__ ":" XLINE(__LINE__) " "
// XLINE adds one level of macro expansion so that __LINE__ is replaced by its
// numeric value before LINE applies the stringizing operator to it.
#define XLINE(s) LINE(s)
// LINE turns its (already expanded) argument into a string literal.
#define LINE(s) #s
4365
TestBug12519()4366 void RBBITest::TestBug12519() {
4367 UErrorCode status = U_ZERO_ERROR;
4368 LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4369 LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status));
4370 if (!assertSuccess(WHERE, status)) {
4371 dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
4372 return;
4373 }
4374 assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
4375
4376 assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
4377 assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
4378
4379 LocalPointer<RuleBasedBreakIterator>cloneEn((RuleBasedBreakIterator *)biEn->clone());
4380 assertTrue(WHERE, *biEn == *cloneEn);
4381 assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
4382
4383 LocalPointer<RuleBasedBreakIterator>cloneFr((RuleBasedBreakIterator *)biFr->clone());
4384 assertTrue(WHERE, *biFr == *cloneFr);
4385 assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
4386
4387 LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status));
4388 UnicodeString text("Hallo Welt");
4389 biDe->setText(text);
4390 assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
4391 *biDe = *biFr;
4392 assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
4393 }
4394
TestBug12677()4395 void RBBITest::TestBug12677() {
4396 // Check that stripping of comments from rules for getRules() is not confused by
4397 // the presence of '#' characters in the rules that do not introduce comments.
4398 UnicodeString rules(u"!!forward; \n"
4399 "$x = [ab#]; # a set with a # literal. \n"
4400 " # .; # a comment that looks sort of like a rule. \n"
4401 " '#' '?'; # a rule with a quoted # \n"
4402 );
4403
4404 UErrorCode status = U_ZERO_ERROR;
4405 UParseError pe;
4406 RuleBasedBreakIterator bi(rules, pe, status);
4407 assertSuccess(WHERE, status);
4408 UnicodeString rtRules = bi.getRules();
4409 assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "), rtRules);
4410 }
4411
4412
TestTableRedundancies()4413 void RBBITest::TestTableRedundancies() {
4414 UErrorCode status = U_ZERO_ERROR;
4415
4416 LocalPointer<RuleBasedBreakIterator> bi (
4417 (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
4418 assertSuccess(WHERE, status);
4419 if (U_FAILURE(status)) return;
4420
4421 RBBIDataWrapper *dw = bi->fData;
4422 const RBBIStateTable *fwtbl = dw->fForwardTable;
4423 int32_t numCharClasses = dw->fHeader->fCatCount;
4424 // printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
4425
4426 // Check for duplicate columns (character categories)
4427
4428 std::vector<UnicodeString> columns;
4429 for (int32_t column = 0; column < numCharClasses; column++) {
4430 UnicodeString s;
4431 for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
4432 RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4433 s.append(row->fNextState[column]);
4434 }
4435 columns.push_back(s);
4436 }
4437 // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
4438 for (int c1=1; c1<numCharClasses; c1++) {
4439 for (int c2 = c1+1; c2 < numCharClasses; c2++) {
4440 if (columns.at(c1) == columns.at(c2)) {
4441 errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
4442 goto out;
4443 }
4444 }
4445 }
4446 out:
4447
4448 // Check for duplicate states
4449 std::vector<UnicodeString> rows;
4450 for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
4451 UnicodeString s;
4452 RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4453 assertTrue(WHERE, row->fAccepting >= -1);
4454 s.append(row->fAccepting + 1); // values of -1 are expected.
4455 s.append(row->fLookAhead);
4456 s.append(row->fTagIdx);
4457 for (int32_t column = 0; column < numCharClasses; column++) {
4458 s.append(row->fNextState[column]);
4459 }
4460 rows.push_back(s);
4461 }
4462 for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
4463 for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
4464 if (rows.at(r1) == rows.at(r2)) {
4465 errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
4466 return;
4467 }
4468 }
4469 }
4470 }
4471
4472 // Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
4473 // even after next() has returned DONE.
4474
TestBug13447()4475 void RBBITest::TestBug13447() {
4476 UErrorCode status = U_ZERO_ERROR;
4477 LocalPointer<RuleBasedBreakIterator> bi(
4478 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4479 assertSuccess(WHERE, status);
4480 if (U_FAILURE(status)) return;
4481 UnicodeString data(u"1234");
4482 bi->setText(data);
4483 assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus());
4484 assertEquals(WHERE, 4, bi->next());
4485 assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4486 assertEquals(WHERE, UBRK_DONE, bi->next());
4487 assertEquals(WHERE, 4, bi->current());
4488 assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4489 }
4490
4491 // TestReverse exercises both the synthesized safe reverse rules and the logic
4492 // for filling the break iterator cache when starting from random positions
4493 // in the text.
4494 //
4495 // It's a monkey test, working on random data, with the expected data obtained
4496 // from forward iteration (no safe rules involved), comparing with results
4497 // when indexing into the interior of the string (safe rules needed).
4498
TestReverse()4499 void RBBITest::TestReverse() {
4500 UErrorCode status = U_ZERO_ERROR;
4501
4502 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4503 BreakIterator::createCharacterInstance(Locale::getEnglish(), status)));
4504 assertSuccess(WHERE, status, true);
4505 status = U_ZERO_ERROR;
4506 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4507 BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4508 assertSuccess(WHERE, status, true);
4509 status = U_ZERO_ERROR;
4510 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4511 BreakIterator::createLineInstance(Locale::getEnglish(), status)));
4512 assertSuccess(WHERE, status, true);
4513 status = U_ZERO_ERROR;
4514 TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4515 BreakIterator::createSentenceInstance(Locale::getEnglish(), status)));
4516 assertSuccess(WHERE, status, true);
4517 }
4518
TestReverse(std::unique_ptr<RuleBasedBreakIterator> bi)4519 void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
4520 if (!bi) {
4521 return;
4522 }
4523
4524 // From the mapping trie in the break iterator's internal data, create a
4525 // vector of UnicodeStrings, one for each character category, containing
4526 // all of the code points that map to that category. Unicode planes 0 and 1 only,
4527 // to avoid an execess of unassigned code points.
4528
4529 RBBIDataWrapper *data = bi->fData;
4530 int32_t categoryCount = data->fHeader->fCatCount;
4531 UTrie2 *trie = data->fTrie;
4532
4533 std::vector<UnicodeString> strings(categoryCount, UnicodeString());
4534 for (int cp=0; cp<0x1fff0; ++cp) {
4535 int cat = utrie2_get32(trie, cp);
4536 cat &= ~0x4000; // And off the dictionary bit from the category.
4537 assertTrue(WHERE, cat < categoryCount && cat >= 0);
4538 if (cat < 0 || cat >= categoryCount) return;
4539 strings[cat].append(cp);
4540 }
4541
4542 icu_rand randomGen;
4543 const int testStringLength = 10000;
4544 UnicodeString testString;
4545
4546 for (int i=0; i<testStringLength; ++i) {
4547 int charClass = randomGen() % categoryCount;
4548 if (strings[charClass].length() > 0) {
4549 int cp = strings[charClass].char32At(randomGen() % strings[charClass].length());
4550 testString.append(cp);
4551 }
4552 }
4553
4554 typedef std::pair<UBool, int32_t> Result;
4555 std::vector<Result> expectedResults;
4556 bi->setText(testString);
4557 for (int i=0; i<testString.length(); ++i) {
4558 bool isboundary = bi->isBoundary(i);
4559 int ruleStatus = bi->getRuleStatus();
4560 expectedResults.push_back(std::make_pair(isboundary, ruleStatus));
4561 }
4562
4563 for (int i=testString.length()-1; i>=0; --i) {
4564 bi->setText(testString); // clears the internal break cache
4565 Result expected = expectedResults[i];
4566 assertEquals(WHERE, expected.first, bi->isBoundary(i));
4567 assertEquals(WHERE, expected.second, bi->getRuleStatus());
4568 }
4569 }
4570
4571
// Ticket 13692 - finding word boundaries in very large numbers or words could
//                be very time consuming. When the problem was present, this test
//                would run for more than fifteen minutes, which is to say,
//                the failure was noticeable.
4575
TestBug13692()4576 void RBBITest::TestBug13692() {
4577 UErrorCode status = U_ZERO_ERROR;
4578 LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4579 BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4580 if (!assertSuccess(WHERE, status, true)) {
4581 return;
4582 }
4583 constexpr int32_t LENGTH = 1000000;
4584 UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH);
4585 for (int i=0; i<20; i+=2) {
4586 longNumber.setCharAt(i, u' ');
4587 }
4588 bi->setText(longNumber);
4589 assertFalse(WHERE, bi->isBoundary(LENGTH-5));
4590 assertSuccess(WHERE, status);
4591 }
4592
4593 //
4594 // TestDebug - A place-holder test for debugging purposes.
4595 // For putting in fragments of other tests that can be invoked
4596 // for tracing without a lot of unwanted extra stuff happening.
4597 //
TestDebug(void)4598 void RBBITest::TestDebug(void) {
4599 UErrorCode status = U_ZERO_ERROR;
4600 LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4601 BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4602 if (!assertSuccess(WHERE, status, true)) {
4603 return;
4604 }
4605 const UnicodeString &rules = bi->getRules();
4606 UParseError pe;
4607 LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
4608 assertSuccess(WHERE, status);
4609 }
4610
TestProperties()4611 void RBBITest::TestProperties() {
4612 UErrorCode errorCode = U_ZERO_ERROR;
4613 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4614 if (!prependSet.isEmpty()) {
4615 errln(
4616 "[:GCB=Prepend:] is not empty any more. "
4617 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4618 "change this test to the opposite condition.");
4619 }
4620 }
4621
4622 #endif // #if !UCONFIG_NO_BREAK_ITERATION
4623