1 /********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 1999-2011, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6 /************************************************************************
7 * Date Name Description
8 * 12/15/99 Madhu Creation.
9 * 01/12/2000 Madhu Updated for changed API and added new tests
10 ************************************************************************/
11
12 #include <typeinfo> // for 'typeid' to work
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_BREAK_ITERATION
17
18 #include "unicode/utypes.h"
19 #include "unicode/brkiter.h"
20 #include "unicode/rbbi.h"
21 #include "unicode/uchar.h"
22 #include "unicode/utf16.h"
23 #include "unicode/ucnv.h"
24 #include "unicode/schriter.h"
25 #include "unicode/uniset.h"
26 #include "unicode/regex.h" // TODO: make conditional on regexp being built.
27 #include "unicode/ustring.h"
28 #include "unicode/utext.h"
29 #include "intltest.h"
30 #include "rbbitst.h"
31 #include <string.h>
32 #include "uvector.h"
33 #include "uvectr32.h"
34 #include "triedict.h"
35 #include <string.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38
// TEST_ASSERT(x): report a failure, tagged with the source file and line of the
// macro use, when the condition x evaluates to false.
#define TEST_ASSERT(x) { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
}

// TEST_ASSERT_SUCCESS(errcode): report a failure, tagged with the source file, line
// and error name, when errcode satisfies U_FAILURE().
#define TEST_ASSERT_SUCCESS(errcode) { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", \
                   __FILE__, __LINE__, u_errorName(errcode)); \
    } \
}
44
45
46 //---------------------------------------------
47 // runIndexedTest
48 //---------------------------------------------
49
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)50 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
51 {
52 if (exec) logln("TestSuite RuleBasedBreakIterator: ");
53
54 switch (index) {
55 #if !UCONFIG_NO_FILE_IO
56 case 0: name = "TestBug4153072";
57 if(exec) TestBug4153072(); break;
58 #else
59 case 0: name = "skip";
60 break;
61 #endif
62
63 case 1: name = "TestJapaneseLineBreak";
64 if(exec) TestJapaneseLineBreak(); break;
65 case 2: name = "TestStatusReturn";
66 if(exec) TestStatusReturn(); break;
67
68 #if !UCONFIG_NO_FILE_IO
69 case 3: name = "TestUnicodeFiles";
70 if(exec) TestUnicodeFiles(); break;
71 case 4: name = "TestEmptyString";
72 if(exec) TestEmptyString(); break;
73 #else
74 case 3: case 4: name = "skip";
75 break;
76 #endif
77
78 case 5: name = "TestGetAvailableLocales";
79 if(exec) TestGetAvailableLocales(); break;
80
81 case 6: name = "TestGetDisplayName";
82 if(exec) TestGetDisplayName(); break;
83
84 #if !UCONFIG_NO_FILE_IO
85 case 7: name = "TestEndBehaviour";
86 if(exec) TestEndBehaviour(); break;
87 case 8: name = "TestMixedThaiLineBreak";
88 if(exec) TestMixedThaiLineBreak(); break;
89 case 9: name = "TestThaiLineBreak";
90 if(exec) TestThaiLineBreak(); break;
91 case 10: name = "TestMaiyamok";
92 if(exec) TestMaiyamok(); break;
93 case 11: name = "TestWordBreaks";
94 if(exec) TestWordBreaks(); break;
95 case 12: name = "TestWordBoundary";
96 if(exec) TestWordBoundary(); break;
97 case 13: name = "TestLineBreaks";
98 if(exec) TestLineBreaks(); break;
99 case 14: name = "TestSentBreaks";
100 if(exec) TestSentBreaks(); break;
101 case 15: name = "TestExtended";
102 if(exec) TestExtended(); break;
103 #else
104 case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
105 break;
106 #endif
107
108 case 16:
109 if(exec) {
110 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
111 name = "TestMonkey";
112 TestMonkey(params);
113 #else
114 name = "skip";
115 #endif
116 }
117 break;
118
119 #if !UCONFIG_NO_FILE_IO
120 case 17: name = "TestBug3818";
121 if(exec) TestBug3818(); break;
122 case 18: name = "TestJapaneseWordBreak";
123 if(exec) TestJapaneseWordBreak(); break;
124 #else
125 case 17: case 18: name = "skip";
126 break;
127 #endif
128
129 case 19: name = "TestDebug";
130 if(exec) TestDebug(); break;
131 case 20: name = "TestTrieDict";
132 if(exec) TestTrieDict(); break;
133
134 #if !UCONFIG_NO_FILE_IO
135 case 21: name = "TestBug5775";
136 if (exec) TestBug5775(); break;
137 case 22: name = "TestTailoredBreaks";
138 if (exec) TestTailoredBreaks(); break;
139 #else
140 case 21: case 22: name = "skip";
141 break;
142 #endif
143 case 23: name = "TestDictRules";
144 if (exec) TestDictRules(); break;
145 case 24: name = "TestBug5532";
146 if (exec) TestBug5532(); break;
147 default: name = ""; break; //needed to end loop
148 }
149 }
150
151
152 //---------------------------------------------------------------------------
153 //
154 // class BITestData Holds a set of Break iterator test data and results
155 // Includes
156 // - the string data to be broken
157 // - a vector of the expected break positions.
158 // - a vector of source line numbers for the data,
//                     (to help see where errors occurred.)
160 // - The expected break tag values.
161 // - Vectors of actual break positions and tag values.
162 // - Functions for comparing actual with expected and
163 // reporting errors.
164 //
165 //----------------------------------------------------------------------------
166 class BITestData {
167 public:
168 UnicodeString fDataToBreak;
169 UVector fExpectedBreakPositions;
170 UVector fExpectedTags;
171 UVector fLineNum;
172 UVector fActualBreakPositions; // Test Results.
173 UVector fActualTags;
174
175 BITestData(UErrorCode &status);
176 void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
177 void checkResults(const char *heading, RBBITest *test);
178 void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
179 void clearResults();
180 };
181
182 //
183 // Constructor.
184 //
BITestData(UErrorCode & status)185 BITestData::BITestData(UErrorCode &status)
186 : fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status),
187 fActualTags(status)
188 {
189 }
190
191 //
//  addDataChunk.     Add a section (non-breaking) piece of data to the test data.
193 // The macro form collects the line number, which is helpful
194 // when tracking down failures.
195 //
196 // A null data item is inserted at the start of each test's data
197 // to put the starting zero into the data list. The position saved for
198 // each non-null item is its ending position.
199 //
200 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);
addDataChunk(const char * data,int32_t tag,int32_t lineNum,UErrorCode status)201 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
202 if (U_FAILURE(status)) {return;}
203 if (data != NULL) {
204 fDataToBreak.append(CharsToUnicodeString(data));
205 }
206 fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
207 fExpectedTags.addElement(tag, status);
208 fLineNum.addElement(lineNum, status);
209 }
210
211
212 //
213 // checkResults. Compare the actual and expected break positions, report any differences.
214 //
checkResults(const char * heading,RBBITest * test)215 void BITestData::checkResults(const char *heading, RBBITest *test) {
216 int32_t expectedIndex = 0;
217 int32_t actualIndex = 0;
218
219 for (;;) {
220 // If we've run through both the expected and actual results vectors, we're done.
221 // break out of the loop.
222 if (expectedIndex >= fExpectedBreakPositions.size() &&
223 actualIndex >= fActualBreakPositions.size()) {
224 break;
225 }
226
227
228 if (expectedIndex >= fExpectedBreakPositions.size()) {
229 err(heading, test, expectedIndex-1, actualIndex);
230 actualIndex++;
231 continue;
232 }
233
234 if (actualIndex >= fActualBreakPositions.size()) {
235 err(heading, test, expectedIndex, actualIndex-1);
236 expectedIndex++;
237 continue;
238 }
239
240 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
241 err(heading, test, expectedIndex, actualIndex);
242 // Try to resync the positions of the indices, to avoid a rash of spurious erros.
243 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
244 actualIndex++;
245 } else {
246 expectedIndex++;
247 }
248 continue;
249 }
250
251 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
252 test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",
253 heading, fLineNum.elementAt(expectedIndex),
254 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
255 }
256
257 actualIndex++;
258 expectedIndex++;
259 }
260 }
261
262 //
263 // err - An error was found. Report it, along with information about where the
264 // incorrectly broken test data appeared in the source file.
265 //
err(const char * heading,RBBITest * test,int32_t expectedIdx,int32_t actualIdx)266 void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
267 {
268 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx);
269 int32_t actual = fActualBreakPositions.elementAti(actualIdx);
270 int32_t o = 0;
271 int32_t line = fLineNum.elementAti(expectedIdx);
272 if (expectedIdx > 0) {
273 // The line numbers are off by one because a premature break occurs somewhere
274 // within the previous item, rather than at the start of the current (expected) item.
275 // We want to report the offset of the unexpected break from the start of
276 // this previous item.
277 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
278 }
279 if (actual < expected) {
280 test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected);
281 } else {
282 test->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading, line, actual, expected);
283 }
284 }
285
286
clearResults()287 void BITestData::clearResults() {
288 fActualBreakPositions.removeAllElements();
289 fActualTags.removeAllElements();
290 }
291
292
293 //-----------------------------------------------------------------------------------
294 //
//    Canned Test Characters
296 //
297 //-----------------------------------------------------------------------------------
298
299 static const UChar cannedTestArray[] = {
300 0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,
301 0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
302 0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,
303 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,
304 0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,
305 0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,
306 0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,
307 0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
308 };
309
310 static UnicodeString* cannedTestChars = 0;
311
// Devanagari sequences spelled as \uXXXX escape text, one adjacent literal per code
// point (the compiler joins them into a single string).
//   half forms: consonant + virama (U+094D) + zero width joiner (U+200D)
//   dead form:  consonant + virama (U+094D)
#define halfNA     "\\u0928" "\\u094d" "\\u200d"
#define halfSA     "\\u0938" "\\u094d" "\\u200d"
#define halfCHA    "\\u091a" "\\u094d" "\\u200d"
#define halfKA     "\\u0915" "\\u094d" "\\u200d"
#define deadTA     "\\u0924" "\\u094d"
317
318 //--------------------------------------------------------------------------------------
319 //
320 // RBBITest constructor and destructor
321 //
322 //--------------------------------------------------------------------------------------
323
RBBITest()324 RBBITest::RBBITest() {
325 UnicodeString temp(cannedTestArray);
326 cannedTestChars = new UnicodeString();
327 *cannedTestChars += (UChar)0x0000;
328 *cannedTestChars += temp;
329 }
330
331
~RBBITest()332 RBBITest::~RBBITest() {
333 delete cannedTestChars;
334 }
335
336
// Tag values for the kinds of word a break can follow.
// NOTE(review): presumably these mirror the rule status values returned by the
// word break rules -- confirm against the break rule sources.
static const int T_NUMBER = 100;    // number
static const int T_LETTER = 200;    // letter
static const int T_H_OR_K = 300;    // NOTE(review): presumably Hiragana or Katakana
static const int T_IDEO   = 400;    // NOTE(review): presumably ideographic
341
342
343
344
345
346
347 //--------------------------------------------------------------------
348 //Testing the BreakIterator for devanagari script
349 //--------------------------------------------------------------------
350
// Devanagari "dead" consonants (consonant + virama U+094D) and the visarga, spelled
// as \uXXXX escape text with one adjacent literal per code point.
#define deadRA   "\\u0930" "\\u094d"        /*deadform RA = devanagari RA + virama*/
#define deadPHA  "\\u092b" "\\u094d"        /*deadform PHA = devanagari PHA + virama*/
#define deadTTHA "\\u0920" "\\u094d"        /*deadform TTHA = devanagari TTHA + virama*/
#define deadPA   "\\u092a" "\\u094d"        /*deadform PA = devanagari PA + virama*/
#define deadSA   "\\u0938" "\\u094d"        /*deadform SA = devanagari SA + virama*/
#define visarga  "\\u0903"                  /*devanagari visarga looks like a english colon*/
357
358
359
360
361
362
363 //-----------------------------------------------------------------------------------
364 //
365 // Test for status {tag} return value from break rules.
366 // TODO: a more thorough test.
367 //
368 //-----------------------------------------------------------------------------------
TestStatusReturn()369 void RBBITest::TestStatusReturn() {
370 UnicodeString rulesString1("$Letters = [:L:];\n"
371 "$Numbers = [:N:];\n"
372 "$Letters+{1};\n"
373 "$Numbers+{2};\n"
374 "Help\\ {4}/me\\!;\n"
375 "[^$Letters $Numbers];\n"
376 "!.*;\n", -1, US_INV);
377 UnicodeString testString1 = "abc123..abc Help me Help me!";
378 // 01234567890123456789012345678
379 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
380 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
381
382 UErrorCode status=U_ZERO_ERROR;
383 UParseError parseError;
384
385 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
386 if(U_FAILURE(status)) {
387 dataerrln("FAIL : in construction - %s", u_errorName(status));
388 } else {
389 int32_t pos;
390 int32_t i = 0;
391 bi->setText(testString1);
392 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
393 if (pos != bounds1[i]) {
394 errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos);
395 break;
396 }
397
398 int tag = bi->getRuleStatus();
399 if (tag != brkStatus[i]) {
400 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
401 break;
402 }
403 i++;
404 }
405 }
406 delete bi;
407 }
408
409
printStringBreaks(UnicodeString ustr,int expected[],int expectedcount)410 static void printStringBreaks(UnicodeString ustr, int expected[],
411 int expectedcount)
412 {
413 UErrorCode status = U_ZERO_ERROR;
414 char name[100];
415 printf("code alpha extend alphanum type word sent line name\n");
416 int j;
417 for (j = 0; j < ustr.length(); j ++) {
418 if (expectedcount > 0) {
419 int k;
420 for (k = 0; k < expectedcount; k ++) {
421 if (j == expected[k]) {
422 printf("------------------------------------------------ %d\n",
423 j);
424 }
425 }
426 }
427 UChar32 c = ustr.char32At(j);
428 if (c > 0xffff) {
429 j ++;
430 }
431 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
432 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
433 u_isUAlphabetic(c),
434 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
435 u_isalnum(c),
436 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
437 u_charType(c),
438 U_SHORT_PROPERTY_NAME),
439 u_getPropertyValueName(UCHAR_WORD_BREAK,
440 u_getIntPropertyValue(c,
441 UCHAR_WORD_BREAK),
442 U_SHORT_PROPERTY_NAME),
443 u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
444 u_getIntPropertyValue(c,
445 UCHAR_SENTENCE_BREAK),
446 U_SHORT_PROPERTY_NAME),
447 u_getPropertyValueName(UCHAR_LINE_BREAK,
448 u_getIntPropertyValue(c,
449 UCHAR_LINE_BREAK),
450 U_SHORT_PROPERTY_NAME),
451 name);
452 }
453 }
454
TestThaiLineBreak()455 void RBBITest::TestThaiLineBreak() {
456 UErrorCode status = U_ZERO_ERROR;
457 BITestData thaiLineSelection(status);
458
459 // \u0e2f-- the Thai paiyannoi character-- isn't a letter. It's a symbol that
460 // represents elided letters at the end of a long word. It should be bound to
461 // the end of the word and not treated as an independent punctuation mark.
462
463
464 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data
465 ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status);
466 ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status);
467 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status);
468 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status);
469 // ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status);
470 // ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
471 ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status);
472 // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us
473 ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status);
474 ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status);
475 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status);
476 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status);
477 ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status);
478 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status);
479
480 // the one time where the paiyannoi occurs somewhere other than at the end
481 // of a word is in the Thai abbrevation for "etc.", which both begins and
482 // ends with a paiyannoi
483 ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status);
484 ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
485 ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status);
486
487 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
488 Locale("th"), status);
489 if (U_FAILURE(status))
490 {
491 errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestThaiLineBreak. - %s", u_errorName(status));
492 return;
493 }
494
495 generalIteratorTest(*e, thaiLineSelection);
496 delete e;
497 }
498
499
500
TestMixedThaiLineBreak()501 void RBBITest::TestMixedThaiLineBreak()
502 {
503 UErrorCode status = U_ZERO_ERROR;
504 BITestData thaiLineSelection(status);
505
506 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data
507
508
509 // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters
510 // start
511
512 ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
513 ADD_DATACHUNK(thaiLineSelection, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status);
514 ADD_DATACHUNK(thaiLineSelection, "2545 ", 0, status);
515 ADD_DATACHUNK(thaiLineSelection, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status);
516 ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
517 ADD_DATACHUNK(thaiLineSelection, "\\u0E09\\u0E25\\u0E2D\\u0E07", 0, status);
518 ADD_DATACHUNK(thaiLineSelection, "\\u0E04\\u0E23\\u0E1A", 0, status);
519 ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E2D\\u0E1A ", 0, status);
520 ADD_DATACHUNK(thaiLineSelection, "\"\\u0E52\\u0E52\\u0E50 ", 0, status);
521 ADD_DATACHUNK(thaiLineSelection, "\\u0E1b\\u0E35\" ", 0, status);
522 ADD_DATACHUNK(thaiLineSelection, "\\u0E02\\u0E2d\\u0E07", 0, status);
523 ADD_DATACHUNK(thaiLineSelection, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status);
524 ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status);
525 ADD_DATACHUNK(thaiLineSelection, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status);
526 ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status);
527 ADD_DATACHUNK(thaiLineSelection, "Bangkok)", 0, status);
528
529 // @suwit - end of changes
530
531
532 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status);
533 if (U_FAILURE(status))
534 {
535 errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak. - %s", u_errorName(status));
536 return;
537 }
538
539
540 generalIteratorTest(*e, thaiLineSelection);
541 delete e;
542 }
543
544
TestMaiyamok()545 void RBBITest::TestMaiyamok()
546 {
547 UErrorCode status = U_ZERO_ERROR;
548 BITestData thaiLineSelection(status);
549 ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data
550 // the Thai maiyamok character is a shorthand symbol that means "repeat the previous
551 // word". Instead of appearing as a word unto itself, however, it's kept together
552 // with the word before it
553 ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status);
554 ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status);
555 ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status);
556 ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status);
557 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e1e", 0, status);
558 ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status);
559 ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35", 0, status);
560 ADD_DATACHUNK(thaiLineSelection, "\\u0e22\\u0e07", 0, status);
561 ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
562
563 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
564 Locale("th"), status);
565
566 if (U_FAILURE(status))
567 {
568 errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMaiyamok. - %s", u_errorName(status));
569 return;
570 }
571 generalIteratorTest(*e, thaiLineSelection);
572 delete e;
573 }
574
575
576
TestBug3818()577 void RBBITest::TestBug3818() {
578 UErrorCode status = U_ZERO_ERROR;
579
580 // Four Thai words...
581 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
582 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
583 UnicodeString thaiStr(thaiWordData);
584
585 RuleBasedBreakIterator* bi =
586 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
587 if (U_FAILURE(status) || bi == NULL) {
588 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
589 return;
590 }
591 bi->setText(thaiStr);
592
593 int32_t startOfSecondWord = bi->following(1);
594 if (startOfSecondWord != 4) {
595 errln("Fail at file %s, line %d expected start of word at 4, got %d",
596 __FILE__, __LINE__, startOfSecondWord);
597 }
598 startOfSecondWord = bi->following(0);
599 if (startOfSecondWord != 4) {
600 errln("Fail at file %s, line %d expected start of word at 4, got %d",
601 __FILE__, __LINE__, startOfSecondWord);
602 }
603 delete bi;
604 }
605
606
TestJapaneseWordBreak()607 void RBBITest::TestJapaneseWordBreak() {
608 UErrorCode status = U_ZERO_ERROR;
609 BITestData japaneseWordSelection(status);
610
611 ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status); // Break at start of data
612 ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2
613 ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5
614 ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7
615 ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10
616 ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11
617 ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12
618
619 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(
620 Locale("ja"), status);
621 if (U_FAILURE(status))
622 {
623 errcheckln(status, "Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
624 return;
625 }
626
627 generalIteratorTest(*e, japaneseWordSelection);
628 delete e;
629 }
630
TestTrieDict()631 void RBBITest::TestTrieDict() {
632 UErrorCode status = U_ZERO_ERROR;
633
634 //
635 // Open and read the test data file.
636 //
637 const char *testDataDirectory = IntlTest::getSourceTestData(status);
638 char testFileName[1000];
639 if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) {
640 errln("Can't open test data. Path too long.");
641 return;
642 }
643 strcpy(testFileName, testDataDirectory);
644 strcat(testFileName, "riwords.txt");
645
646 // Items needing deleting at the end
647 MutableTrieDictionary *mutableDict = NULL;
648 CompactTrieDictionary *compactDict = NULL;
649 UnicodeSet *breaks = NULL;
650 UChar *testFile = NULL;
651 StringEnumeration *enumer1 = NULL;
652 StringEnumeration *enumer2 = NULL;
653 MutableTrieDictionary *mutable2 = NULL;
654 StringEnumeration *cloneEnum = NULL;
655 CompactTrieDictionary *compact2 = NULL;
656
657
658 const UnicodeString *originalWord = NULL;
659 const UnicodeString *cloneWord = NULL;
660 UChar *current;
661 UChar *word;
662 UChar uc;
663 int32_t wordLen;
664 int32_t wordCount;
665 int32_t testCount;
666
667 int len;
668 testFile = ReadAndConvertFile(testFileName, len, NULL, status);
669 if (U_FAILURE(status)) {
670 goto cleanup; /* something went wrong, error already output */
671 }
672
673 mutableDict = new MutableTrieDictionary(0x0E1C, status);
674 if (U_FAILURE(status)) {
675 errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
676 goto cleanup;
677 }
678
679 breaks = new UnicodeSet;
680 breaks->add(0x000A); // Line Feed
681 breaks->add(0x000D); // Carriage Return
682 breaks->add(0x2028); // Line Separator
683 breaks->add(0x2029); // Paragraph Separator
684
685 // Now add each non-comment line of the file as a word.
686 current = testFile;
687 word = current;
688 uc = *current++;
689 wordLen = 0;
690 wordCount = 0;
691
692 while (uc) {
693 if (uc == 0x0023) { // #comment line, skip
694 while (uc && !breaks->contains(uc)) {
695 uc = *current++;
696 }
697 }
698 else while (uc && !breaks->contains(uc)) {
699 ++wordLen;
700 uc = *current++;
701 }
702 if (wordLen > 0) {
703 mutableDict->addWord(word, wordLen, status);
704 if (U_FAILURE(status)) {
705 errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
706 goto cleanup;
707 }
708 wordCount += 1;
709 }
710
711 // Find beginning of next line
712 while (uc && breaks->contains(uc)) {
713 uc = *current++;
714 }
715 word = current-1;
716 wordLen = 0;
717 }
718
719 if (wordCount < 50) {
720 errln("Word count (%d) unreasonably small\n", wordCount);
721 goto cleanup;
722 }
723
724 enumer1 = mutableDict->openWords(status);
725 if (U_FAILURE(status)) {
726 errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
727 goto cleanup;
728 }
729
730 testCount = 0;
731 if (wordCount != (testCount = enumer1->count(status))) {
732 errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
733 testCount, wordCount, u_errorName(status));
734 goto cleanup;
735 }
736
737 // Now compact it
738 compactDict = new CompactTrieDictionary(*mutableDict, status);
739 if (U_FAILURE(status)) {
740 errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
741 goto cleanup;
742 }
743
744 enumer2 = compactDict->openWords(status);
745 if (U_FAILURE(status)) {
746 errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
747 goto cleanup;
748 }
749
750 if (wordCount != (testCount = enumer2->count(status))) {
751 errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
752 testCount, wordCount, u_errorName(status));
753 goto cleanup;
754 }
755
756 if (typeid(*enumer1) == typeid(*enumer2)) {
757 errln("CompactTrieEnumeration and MutableTrieEnumeration typeids are the same");
758 }
759 delete enumer1;
760 enumer1 = NULL;
761 delete enumer2;
762 enumer2 = NULL;
763
764 // Now un-compact it
765 mutable2 = compactDict->cloneMutable(status);
766 if (U_FAILURE(status)) {
767 errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
768 goto cleanup;
769 }
770
771 cloneEnum = mutable2->openWords(status);
772 if (U_FAILURE(status)) {
773 errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
774 goto cleanup;
775 }
776
777 if (wordCount != (testCount = cloneEnum->count(status))) {
778 errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
779 testCount, wordCount, u_errorName(status));
780 goto cleanup;
781 }
782
783 // Compact original dictionary to clone. Note that we can only compare the same kind of
784 // dictionary as the order of the enumerators is not guaranteed to be the same between
785 // different kinds
786 enumer1 = mutableDict->openWords(status);
787 if (U_FAILURE(status)) {
788 errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
789 goto cleanup;
790 }
791
792 originalWord = enumer1->snext(status);
793 cloneWord = cloneEnum->snext(status);
794 while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
795 if (*originalWord != *cloneWord) {
796 errln("Original and cloned MutableTrieDictionary word mismatch\n");
797 goto cleanup;
798 }
799 originalWord = enumer1->snext(status);
800 cloneWord = cloneEnum->snext(status);
801 }
802
803 if (U_FAILURE(status)) {
804 errln("Enumeration failed: %s\n", u_errorName(status));
805 goto cleanup;
806 }
807
808 if (originalWord != cloneWord) {
809 errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
810 goto cleanup;
811 }
812
813 // Test the data copying constructor for CompactTrieDict, and the data access APIs.
814 compact2 = new CompactTrieDictionary(compactDict->data(), status);
815 if (U_FAILURE(status)) {
816 errln("CompactTrieDictionary(const void *,...) failed\n");
817 goto cleanup;
818 }
819
820 if (compact2->dataSize() == 0) {
821 errln("CompactTrieDictionary->dataSize() == 0\n");
822 goto cleanup;
823 }
824
825 // Now count the words via the second dictionary
826 delete enumer1;
827 enumer1 = compact2->openWords(status);
828 if (U_FAILURE(status)) {
829 errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
830 goto cleanup;
831 }
832
833 if (wordCount != (testCount = enumer1->count(status))) {
834 errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
835 testCount, wordCount, u_errorName(status));
836 goto cleanup;
837 }
838
839 cleanup:
840 delete compactDict;
841 delete mutableDict;
842 delete breaks;
843 delete[] testFile;
844 delete enumer1;
845 delete mutable2;
846 delete cloneEnum;
847 delete compact2;
848 }
849
850
851 //----------------------------------------------------------------------------
852 //
853 // generalIteratorTest Given a break iterator and a set of test data,
854 // Run the tests and report the results.
855 //
856 //----------------------------------------------------------------------------
generalIteratorTest(RuleBasedBreakIterator & bi,BITestData & td)857 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
858 {
859
860 bi.setText(td.fDataToBreak);
861
862 testFirstAndNext(bi, td);
863
864 testLastAndPrevious(bi, td);
865
866 testFollowing(bi, td);
867 testPreceding(bi, td);
868 testIsBoundary(bi, td);
869 doMultipleSelectionTest(bi, td);
870 }
871
872
873 //
874 // testFirstAndNext. Run the iterator forwards in the obvious first(), next()
875 // kind of loop.
876 //
testFirstAndNext(RuleBasedBreakIterator & bi,BITestData & td)877 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
878 {
879 UErrorCode status = U_ZERO_ERROR;
880 int32_t p;
881 int32_t lastP = -1;
882 int32_t tag;
883
884 logln("Test first and next");
885 bi.setText(td.fDataToBreak);
886 td.clearResults();
887
888 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
889 td.fActualBreakPositions.addElement(p, status); // Save result.
890 tag = bi.getRuleStatus();
891 td.fActualTags.addElement(tag, status);
892 if (p <= lastP) {
893 // If the iterator is not making forward progress, stop.
894 // No need to raise an error here, it'll be detected in the normal check of results.
895 break;
896 }
897 lastP = p;
898 }
899 td.checkResults("testFirstAndNext", this);
900 }
901
902
903 //
904 // TestLastAndPrevious. Run the iterator backwards, starting with last().
905 //
testLastAndPrevious(RuleBasedBreakIterator & bi,BITestData & td)906 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td)
907 {
908 UErrorCode status = U_ZERO_ERROR;
909 int32_t p;
910 int32_t lastP = 0x7ffffffe;
911 int32_t tag;
912
913 logln("Test last and previous");
914 bi.setText(td.fDataToBreak);
915 td.clearResults();
916
917 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
918 // Save break position. Insert it at start of vector of results, shoving
919 // already-saved results further towards the end.
920 td.fActualBreakPositions.insertElementAt(p, 0, status);
921 // bi.previous(); // TODO: Why does this fix things up????
922 // bi.next();
923 tag = bi.getRuleStatus();
924 td.fActualTags.insertElementAt(tag, 0, status);
925 if (p >= lastP) {
926 // If the iterator is not making progress, stop.
927 // No need to raise an error here, it'll be detected in the normal check of results.
928 break;
929 }
930 lastP = p;
931 }
932 td.checkResults("testLastAndPrevious", this);
933 }
934
935
testFollowing(RuleBasedBreakIterator & bi,BITestData & td)936 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
937 {
938 UErrorCode status = U_ZERO_ERROR;
939 int32_t p;
940 int32_t tag;
941 int32_t lastP = -2; // A value that will never be returned as a break position.
942 // cannot be -1; that is returned for DONE.
943 int i;
944
945 logln("testFollowing():");
946 bi.setText(td.fDataToBreak);
947 td.clearResults();
948
949 // Save the starting point, since we won't get that out of following.
950 p = bi.first();
951 td.fActualBreakPositions.addElement(p, status); // Save result.
952 tag = bi.getRuleStatus();
953 td.fActualTags.addElement(tag, status);
954
955 for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
956 p = bi.following(i);
957 if (p != lastP) {
958 if (p == RuleBasedBreakIterator::DONE) {
959 break;
960 }
961 // We've reached a new break position. Save it.
962 td.fActualBreakPositions.addElement(p, status); // Save result.
963 tag = bi.getRuleStatus();
964 td.fActualTags.addElement(tag, status);
965 lastP = p;
966 }
967 }
968 // The loop normally exits by means of the break in the middle.
969 // Make sure that the index was at the correct position for the break iterator to have
970 // returned DONE.
971 if (i != td.fDataToBreak.length()) {
972 errln("testFollowing(): iterator returned DONE prematurely.");
973 }
974
975 // Full check of all results.
976 td.checkResults("testFollowing", this);
977 }
978
979
980
testPreceding(RuleBasedBreakIterator & bi,BITestData & td)981 void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) {
982 UErrorCode status = U_ZERO_ERROR;
983 int32_t p;
984 int32_t tag;
985 int32_t lastP = 0x7ffffffe;
986 int i;
987
988 logln("testPreceding():");
989 bi.setText(td.fDataToBreak);
990 td.clearResults();
991
992 p = bi.last();
993 td.fActualBreakPositions.addElement(p, status);
994 tag = bi.getRuleStatus();
995 td.fActualTags.addElement(tag, status);
996
997 for (i = td.fDataToBreak.length(); i>=-1; i--) {
998 p = bi.preceding(i);
999 if (p != lastP) {
1000 if (p == RuleBasedBreakIterator::DONE) {
1001 break;
1002 }
1003 // We've reached a new break position. Save it.
1004 td.fActualBreakPositions.insertElementAt(p, 0, status);
1005 lastP = p;
1006 tag = bi.getRuleStatus();
1007 td.fActualTags.insertElementAt(tag, 0, status);
1008 }
1009 }
1010 // The loop normally exits by means of the break in the middle.
1011 // Make sure that the index was at the correct position for the break iterator to have
1012 // returned DONE.
1013 if (i != 0) {
1014 errln("testPreceding(): iterator returned DONE prematurely.");
1015 }
1016
1017 // Full check of all results.
1018 td.checkResults("testPreceding", this);
1019 }
1020
1021
1022
testIsBoundary(RuleBasedBreakIterator & bi,BITestData & td)1023 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) {
1024 UErrorCode status = U_ZERO_ERROR;
1025 int i;
1026 int32_t tag;
1027
1028 logln("testIsBoundary():");
1029 bi.setText(td.fDataToBreak);
1030 td.clearResults();
1031
1032 for (i = 0; i <= td.fDataToBreak.length(); i++) {
1033 if (bi.isBoundary(i)) {
1034 td.fActualBreakPositions.addElement(i, status); // Save result.
1035 tag = bi.getRuleStatus();
1036 td.fActualTags.addElement(tag, status);
1037 }
1038 }
1039 td.checkResults("testIsBoundary: ", this);
1040 }
1041
1042
1043
doMultipleSelectionTest(RuleBasedBreakIterator & iterator,BITestData & td)1044 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
1045 {
1046 iterator.setText(td.fDataToBreak);
1047
1048 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
1049 int32_t offset = iterator.first();
1050 int32_t testOffset;
1051 int32_t count = 0;
1052
1053 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
1054
1055 if (*testIterator != iterator)
1056 errln("clone() or operator!= failed: two clones compared unequal");
1057
1058 do {
1059 testOffset = testIterator->first();
1060 testOffset = testIterator->next(count);
1061 if (offset != testOffset)
1062 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
1063
1064 if (offset != RuleBasedBreakIterator::DONE) {
1065 count++;
1066 offset = iterator.next();
1067
1068 if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
1069 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
1070 if (count > 10000 || offset == -1) {
1071 errln("operator== failed too many times. Stopping test.");
1072 if (offset == -1) {
1073 errln("Does (RuleBasedBreakIterator::DONE == -1)?");
1074 }
1075 return;
1076 }
1077 }
1078 }
1079 } while (offset != RuleBasedBreakIterator::DONE);
1080
1081 // now do it backwards...
1082 offset = iterator.last();
1083 count = 0;
1084
1085 do {
1086 testOffset = testIterator->last();
1087 testOffset = testIterator->next(count); // next() with a negative arg is same as previous
1088 if (offset != testOffset)
1089 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
1090
1091 if (offset != RuleBasedBreakIterator::DONE) {
1092 count--;
1093 offset = iterator.previous();
1094 }
1095 } while (offset != RuleBasedBreakIterator::DONE);
1096
1097 delete testIterator;
1098 }
1099
1100
1101 //---------------------------------------------
1102 //
1103 // other tests
1104 //
1105 //---------------------------------------------
TestEmptyString()1106 void RBBITest::TestEmptyString()
1107 {
1108 UnicodeString text = "";
1109 UErrorCode status = U_ZERO_ERROR;
1110
1111 BITestData x(status);
1112 ADD_DATACHUNK(x, "", 0, status); // Break at start of data
1113 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
1114 if (U_FAILURE(status))
1115 {
1116 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
1117 return;
1118 }
1119 generalIteratorTest(*bi, x);
1120 delete bi;
1121 }
1122
TestGetAvailableLocales()1123 void RBBITest::TestGetAvailableLocales()
1124 {
1125 int32_t locCount = 0;
1126 const Locale* locList = BreakIterator::getAvailableLocales(locCount);
1127
1128 if (locCount == 0)
1129 dataerrln("getAvailableLocales() returned an empty list!");
1130 // Just make sure that it's returning good memory.
1131 int32_t i;
1132 for (i = 0; i < locCount; ++i) {
1133 logln(locList[i].getName());
1134 }
1135 }
1136
1137 //Testing the BreakIterator::getDisplayName() function
TestGetDisplayName()1138 void RBBITest::TestGetDisplayName()
1139 {
1140 UnicodeString result;
1141
1142 BreakIterator::getDisplayName(Locale::getUS(), result);
1143 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
1144 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
1145 + result);
1146
1147 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
1148 if (result != "French (France)")
1149 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
1150 + result);
1151 }
1152 /**
1153 * Test End Behaviour
1154 * @bug 4068137
1155 */
TestEndBehaviour()1156 void RBBITest::TestEndBehaviour()
1157 {
1158 UErrorCode status = U_ZERO_ERROR;
1159 UnicodeString testString("boo.");
1160 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
1161 if (U_FAILURE(status))
1162 {
1163 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
1164 return;
1165 }
1166 wb->setText(testString);
1167
1168 if (wb->first() != 0)
1169 errln("Didn't get break at beginning of string.");
1170 if (wb->next() != 3)
1171 errln("Didn't get break before period in \"boo.\"");
1172 if (wb->current() != 4 && wb->next() != 4)
1173 errln("Didn't get break at end of string.");
1174 delete wb;
1175 }
1176 /*
1177 * @bug 4153072
1178 */
TestBug4153072()1179 void RBBITest::TestBug4153072() {
1180 UErrorCode status = U_ZERO_ERROR;
1181 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
1182 if (U_FAILURE(status))
1183 {
1184 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
1185 return;
1186 }
1187 UnicodeString str("...Hello, World!...");
1188 int32_t begin = 3;
1189 int32_t end = str.length() - 3;
1190 UBool onBoundary;
1191
1192 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
1193 iter->adoptText(textIterator);
1194 int index;
1195 // Note: with the switch to UText, there is no way to restrict the
1196 // iteration range to begin at an index other than zero.
1197 // String character iterators created with a non-zero bound are
1198 // treated by RBBI as being empty.
1199 for (index = -1; index < begin + 1; ++index) {
1200 onBoundary = iter->isBoundary(index);
1201 if (index == 0? !onBoundary : onBoundary) {
1202 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
1203 " and begin index = " + begin);
1204 }
1205 }
1206 delete iter;
1207 }
1208
1209
1210 //
1211 // Test for problem reported by Ashok Matoria on 9 July 2007
1212 // One.<kSoftHyphen><kSpace>Two.
1213 //
1214 // Sentence break at start (0) and then on calling next() it breaks at
1215 // 'T' of "Two". Now, at this point if I do next() and
1216 // then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
1217 //
TestBug5775()1218 void RBBITest::TestBug5775() {
1219 UErrorCode status = U_ZERO_ERROR;
1220 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1221 TEST_ASSERT_SUCCESS(status);
1222 if (U_FAILURE(status)) {
1223 return;
1224 }
1225 // Check for status first for better handling of no data errors.
1226 TEST_ASSERT(bi != NULL);
1227 if (bi == NULL) {
1228 return;
1229 }
1230
1231 UnicodeString s("One.\\u00ad Two.", -1, US_INV);
1232 // 01234 56789
1233 s = s.unescape();
1234 bi->setText(s);
1235 int pos = bi->next();
1236 TEST_ASSERT(pos == 6);
1237 pos = bi->next();
1238 TEST_ASSERT(pos == 10);
1239 pos = bi->previous();
1240 TEST_ASSERT(pos == 6);
1241 delete bi;
1242 }
1243
1244
1245
1246 /**
1247 * Test Japanese Line Break
1248 * @bug 4095322
1249 */
TestJapaneseLineBreak()1250 void RBBITest::TestJapaneseLineBreak()
1251 {
1252 #if 0
1253 // Test needs updating some more... Dump it for now.
1254
1255
1256 // Change for Unicode TR 14: Punctuation characters with categories Pi and Pf do not count
1257 // as opening and closing punctuation for line breaking.
1258 // Also, \u30fc and \u30fe are not counted as hyphens. Remove these chars
1259 // from these tests. 6-13-2002
1260 //
1261 UErrorCode status = U_ZERO_ERROR;
1262 UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c");
1263 UnicodeString precedingChars = CharsToUnicodeString(
1264 //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
1265 "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");
1266 UnicodeString followingChars = CharsToUnicodeString(
1267 // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"
1268 ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"
1269 // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"
1270 ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"
1271 "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
1272 BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status);
1273
1274 int32_t i;
1275 if (U_FAILURE(status))
1276 {
1277 errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");
1278 return;
1279 }
1280
1281 for (i = 0; i < precedingChars.length(); i++) {
1282 testString.setCharAt(1, precedingChars[i]);
1283 iter->setText(testString);
1284 int32_t j = iter->first();
1285 if (j != 0)
1286 errln("ja line break failure: failed to start at 0");
1287 j = iter->next();
1288 if (j != 1)
1289 errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i])
1290 + "' (" + ((int)(precedingChars[i])) + ")");
1291 j = iter->next();
1292 if (j != 3)
1293 errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i])
1294 + "' (" + ((int)(precedingChars[i])) + ")");
1295 }
1296
1297 for (i = 0; i < followingChars.length(); i++) {
1298 testString.setCharAt(1, followingChars[i]);
1299 iter->setText(testString);
1300 int j = iter->first();
1301 if (j != 0)
1302 errln("ja line break failure: failed to start at 0");
1303 j = iter->next();
1304 if (j != 2)
1305 errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i])
1306 + "' (" + ((int)(followingChars[i])) + ")");
1307 j = iter->next();
1308 if (j != 3)
1309 errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i])
1310 + "' (" + ((int)(followingChars[i])) + ")");
1311 }
1312 delete iter;
1313 #endif
1314 }
1315
1316
1317 //------------------------------------------------------------------------------
1318 //
1319 // RBBITest::Extended Run RBBI Tests from an external test data file
1320 //
1321 //------------------------------------------------------------------------------
1322
1323 struct TestParams {
1324 BreakIterator *bi;
1325 UnicodeString dataToBreak;
1326 UVector32 *expectedBreaks;
1327 UVector32 *srcLine;
1328 UVector32 *srcCol;
1329 };
1330
executeTest(TestParams * t)1331 void RBBITest::executeTest(TestParams *t) {
1332 int32_t bp;
1333 int32_t prevBP;
1334 int32_t i;
1335
1336 if (t->bi == NULL) {
1337 return;
1338 }
1339
1340 t->bi->setText(t->dataToBreak);
1341 //
1342 // Run the iterator forward
1343 //
1344 prevBP = -1;
1345 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
1346 if (prevBP == bp) {
1347 // Fail for lack of forward progress.
1348 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
1349 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1350 break;
1351 }
1352
1353 // Check that there were we didn't miss an expected break between the last one
1354 // and this one.
1355 for (i=prevBP+1; i<bp; i++) {
1356 if (t->expectedBreaks->elementAti(i) != 0) {
1357 int expected[] = {0, i};
1358 printStringBreaks(t->dataToBreak, expected, 2);
1359 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1360 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1361 }
1362 }
1363
1364 // Check that the break we did find was expected
1365 if (t->expectedBreaks->elementAti(bp) == 0) {
1366 int expected[] = {0, bp};
1367 printStringBreaks(t->dataToBreak, expected, 2);
1368 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1369 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1370 } else {
1371 // The break was expected.
1372 // Check that the {nnn} tag value is correct.
1373 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
1374 if (expectedTagVal == -1) {
1375 expectedTagVal = 0;
1376 }
1377 int32_t line = t->srcLine->elementAti(bp);
1378 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1379 if (rs != expectedTagVal) {
1380 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
1381 " Actual, Expected status = %4d, %4d",
1382 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
1383 }
1384 }
1385
1386
1387 prevBP = bp;
1388 }
1389
1390 // Verify that there were no missed expected breaks after the last one found
1391 for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
1392 if (t->expectedBreaks->elementAti(i) != 0) {
1393 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1394 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1395 }
1396 }
1397
1398 //
1399 // Run the iterator backwards, verify that the same breaks are found.
1400 //
1401 prevBP = t->dataToBreak.length()+2; // start with a phony value for the last break pos seen.
1402 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1403 if (prevBP == bp) {
1404 // Fail for lack of progress.
1405 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
1406 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1407 break;
1408 }
1409
1410 // Check that there were we didn't miss an expected break between the last one
1411 // and this one. (UVector returns zeros for index out of bounds.)
1412 for (i=prevBP-1; i>bp; i--) {
1413 if (t->expectedBreaks->elementAti(i) != 0) {
1414 errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1415 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1416 }
1417 }
1418
1419 // Check that the break we did find was expected
1420 if (t->expectedBreaks->elementAti(bp) == 0) {
1421 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1422 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1423 } else {
1424 // The break was expected.
1425 // Check that the {nnn} tag value is correct.
1426 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
1427 if (expectedTagVal == -1) {
1428 expectedTagVal = 0;
1429 }
1430 int line = t->srcLine->elementAti(bp);
1431 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1432 if (rs != expectedTagVal) {
1433 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
1434 " Actual, Expected status = %4d, %4d",
1435 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
1436 }
1437 }
1438
1439 prevBP = bp;
1440 }
1441
1442 // Verify that there were no missed breaks prior to the last one found
1443 for (i=prevBP-1; i>=0; i--) {
1444 if (t->expectedBreaks->elementAti(i) != 0) {
1445 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1446 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1447 }
1448 }
1449 }
1450
1451
TestExtended()1452 void RBBITest::TestExtended() {
1453 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1454 UErrorCode status = U_ZERO_ERROR;
1455 Locale locale("");
1456
1457 UnicodeString rules;
1458 TestParams tp;
1459 tp.bi = NULL;
1460 tp.expectedBreaks = new UVector32(status);
1461 tp.srcLine = new UVector32(status);
1462 tp.srcCol = new UVector32(status);
1463
1464 RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
1465 if (U_FAILURE(status)) {
1466 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1467 }
1468
1469
1470 //
1471 // Open and read the test data file.
1472 //
1473 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1474 char testFileName[1000];
1475 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1476 errln("Can't open test data. Path too long.");
1477 return;
1478 }
1479 strcpy(testFileName, testDataDirectory);
1480 strcat(testFileName, "rbbitst.txt");
1481
1482 int len;
1483 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1484 if (U_FAILURE(status)) {
1485 return; /* something went wrong, error already output */
1486 }
1487
1488
1489
1490
1491 //
1492 // Put the test data into a UnicodeString
1493 //
1494 UnicodeString testString(FALSE, testFile, len);
1495
1496 enum EParseState{
1497 PARSE_COMMENT,
1498 PARSE_TAG,
1499 PARSE_DATA,
1500 PARSE_NUM
1501 }
1502 parseState = PARSE_TAG;
1503
1504 EParseState savedState = PARSE_TAG;
1505
1506 static const UChar CH_LF = 0x0a;
1507 static const UChar CH_CR = 0x0d;
1508 static const UChar CH_HASH = 0x23;
1509 /*static const UChar CH_PERIOD = 0x2e;*/
1510 static const UChar CH_LT = 0x3c;
1511 static const UChar CH_GT = 0x3e;
1512 static const UChar CH_BACKSLASH = 0x5c;
1513 static const UChar CH_BULLET = 0x2022;
1514
1515 int32_t lineNum = 1;
1516 int32_t colStart = 0;
1517 int32_t column = 0;
1518 int32_t charIdx = 0;
1519
1520 int32_t tagValue = 0; // The numeric value of a <nnn> tag.
1521
1522 for (charIdx = 0; charIdx < len; ) {
1523 status = U_ZERO_ERROR;
1524 UChar c = testString.charAt(charIdx);
1525 charIdx++;
1526 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1527 // treat CRLF as a unit
1528 c = CH_LF;
1529 charIdx++;
1530 }
1531 if (c == CH_LF || c == CH_CR) {
1532 lineNum++;
1533 colStart = charIdx;
1534 }
1535 column = charIdx - colStart + 1;
1536
1537 switch (parseState) {
1538 case PARSE_COMMENT:
1539 if (c == 0x0a || c == 0x0d) {
1540 parseState = savedState;
1541 }
1542 break;
1543
1544 case PARSE_TAG:
1545 {
1546 if (c == CH_HASH) {
1547 parseState = PARSE_COMMENT;
1548 savedState = PARSE_TAG;
1549 break;
1550 }
1551 if (u_isUWhiteSpace(c)) {
1552 break;
1553 }
1554 if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1555 delete tp.bi;
1556 tp.bi = BreakIterator::createWordInstance(locale, status);
1557 charIdx += 5;
1558 break;
1559 }
1560 if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1561 delete tp.bi;
1562 tp.bi = BreakIterator::createCharacterInstance(locale, status);
1563 charIdx += 5;
1564 break;
1565 }
1566 if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1567 delete tp.bi;
1568 tp.bi = BreakIterator::createLineInstance(locale, status);
1569 charIdx += 5;
1570 break;
1571 }
1572 if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1573 delete tp.bi;
1574 tp.bi = NULL;
1575 tp.bi = BreakIterator::createSentenceInstance(locale, status);
1576 charIdx += 5;
1577 break;
1578 }
1579 if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1580 delete tp.bi;
1581 tp.bi = BreakIterator::createTitleInstance(locale, status);
1582 charIdx += 6;
1583 break;
1584 }
1585
1586 // <locale loc_name>
1587 localeMatcher.reset(testString);
1588 if (localeMatcher.lookingAt(charIdx-1, status)) {
1589 UnicodeString localeName = localeMatcher.group(1, status);
1590 char localeName8[100];
1591 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1592 locale = Locale::createFromName(localeName8);
1593 charIdx += localeMatcher.group(0, status).length();
1594 TEST_ASSERT_SUCCESS(status);
1595 break;
1596 }
1597 if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1598 parseState = PARSE_DATA;
1599 charIdx += 5;
1600 tp.dataToBreak = "";
1601 tp.expectedBreaks->removeAllElements();
1602 tp.srcCol ->removeAllElements();
1603 tp.srcLine->removeAllElements();
1604 break;
1605 }
1606
1607 errln("line %d: Tag expected in test file.", lineNum);
1608 parseState = PARSE_COMMENT;
1609 savedState = PARSE_DATA;
1610 goto end_test; // Stop the test.
1611 }
1612 break;
1613
1614 case PARSE_DATA:
1615 if (c == CH_BULLET) {
1616 int32_t breakIdx = tp.dataToBreak.length();
1617 tp.expectedBreaks->setSize(breakIdx+1);
1618 tp.expectedBreaks->setElementAt(-1, breakIdx);
1619 tp.srcLine->setSize(breakIdx+1);
1620 tp.srcLine->setElementAt(lineNum, breakIdx);
1621 tp.srcCol ->setSize(breakIdx+1);
1622 tp.srcCol ->setElementAt(column, breakIdx);
1623 break;
1624 }
1625
1626 if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1627 // Add final entry to mappings from break location to source file position.
1628 // Need one extra because last break position returned is after the
1629 // last char in the data, not at the last char.
1630 tp.srcLine->addElement(lineNum, status);
1631 tp.srcCol ->addElement(column, status);
1632
1633 parseState = PARSE_TAG;
1634 charIdx += 6;
1635
1636 // RUN THE TEST!
1637 executeTest(&tp);
1638 break;
1639 }
1640
1641 if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1642 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1643 // Get the code point from the name and insert it into the test data.
1644 // (Damn, no API takes names in Unicode !!!
1645 // we've got to take it back to char *)
1646 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1647 int32_t nameLength = nameEndIdx - (charIdx+2);
1648 char charNameBuf[200];
1649 UChar32 theChar = -1;
1650 if (nameEndIdx != -1) {
1651 UErrorCode status = U_ZERO_ERROR;
1652 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1653 charNameBuf[sizeof(charNameBuf)-1] = 0;
1654 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1655 if (U_FAILURE(status)) {
1656 theChar = -1;
1657 }
1658 }
1659 if (theChar == -1) {
1660 errln("Error in named character in test file at line %d, col %d",
1661 lineNum, column);
1662 } else {
1663 // Named code point was recognized. Insert it
1664 // into the test data.
1665 tp.dataToBreak.append(theChar);
1666 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1667 tp.srcLine->addElement(lineNum, status);
1668 tp.srcCol ->addElement(column, status);
1669 }
1670 }
1671 if (nameEndIdx > charIdx) {
1672 charIdx = nameEndIdx+1;
1673
1674 }
1675 break;
1676 }
1677
1678
1679
1680
1681 if (testString.compare(charIdx-1, 2, "<>") == 0) {
1682 charIdx++;
1683 int32_t breakIdx = tp.dataToBreak.length();
1684 tp.expectedBreaks->setSize(breakIdx+1);
1685 tp.expectedBreaks->setElementAt(-1, breakIdx);
1686 tp.srcLine->setSize(breakIdx+1);
1687 tp.srcLine->setElementAt(lineNum, breakIdx);
1688 tp.srcCol ->setSize(breakIdx+1);
1689 tp.srcCol ->setElementAt(column, breakIdx);
1690 break;
1691 }
1692
1693 if (c == CH_LT) {
1694 tagValue = 0;
1695 parseState = PARSE_NUM;
1696 break;
1697 }
1698
1699 if (c == CH_HASH && column==3) { // TODO: why is column off so far?
1700 parseState = PARSE_COMMENT;
1701 savedState = PARSE_DATA;
1702 break;
1703 }
1704
1705 if (c == CH_BACKSLASH) {
1706 // Check for \ at end of line, a line continuation.
1707 // Advance over (discard) the newline
1708 UChar32 cp = testString.char32At(charIdx);
1709 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1710 // We have a CR LF
1711 // Need an extra increment of the input ptr to move over both of them
1712 charIdx++;
1713 }
1714 if (cp == CH_LF || cp == CH_CR) {
1715 lineNum++;
1716 colStart = charIdx;
1717 charIdx++;
1718 break;
1719 }
1720
1721 // Let unescape handle the back slash.
1722 cp = testString.unescapeAt(charIdx);
1723 if (cp != -1) {
1724 // Escape sequence was recognized. Insert the char
1725 // into the test data.
1726 tp.dataToBreak.append(cp);
1727 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1728 tp.srcLine->addElement(lineNum, status);
1729 tp.srcCol ->addElement(column, status);
1730 }
1731 break;
1732 }
1733
1734
1735 // Not a recognized backslash escape sequence.
1736 // Take the next char as a literal.
1737 // TODO: Should this be an error?
1738 c = testString.charAt(charIdx);
1739 charIdx = testString.moveIndex32(charIdx, 1);
1740 }
1741
1742 // Normal, non-escaped data char.
1743 tp.dataToBreak.append(c);
1744
1745 // Save the mapping from offset in the data to line/column numbers in
1746 // the original input file. Will be used for better error messages only.
1747 // If there's an expected break before this char, the slot in the mapping
1748 // vector will already be set for this char; don't overwrite it.
1749 if (tp.dataToBreak.length() > tp.srcLine->size()) {
1750 tp.srcLine->addElement(lineNum, status);
1751 tp.srcCol ->addElement(column, status);
1752 }
1753 break;
1754
1755
1756 case PARSE_NUM:
1757 // We are parsing an expected numeric tag value, like <1234>,
1758 // within a chunk of data.
1759 if (u_isUWhiteSpace(c)) {
1760 break;
1761 }
1762
1763 if (c == CH_GT) {
1764 // Finished the number. Add the info to the expected break data,
1765 // and switch parse state back to doing plain data.
1766 parseState = PARSE_DATA;
1767 if (tagValue == 0) {
1768 tagValue = -1;
1769 }
1770 int32_t breakIdx = tp.dataToBreak.length();
1771 tp.expectedBreaks->setSize(breakIdx+1);
1772 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1773 tp.srcLine->setSize(breakIdx+1);
1774 tp.srcLine->setElementAt(lineNum, breakIdx);
1775 tp.srcCol ->setSize(breakIdx+1);
1776 tp.srcCol ->setElementAt(column, breakIdx);
1777 break;
1778 }
1779
1780 if (u_isdigit(c)) {
1781 tagValue = tagValue*10 + u_charDigitValue(c);
1782 break;
1783 }
1784
1785 errln("Syntax Error in test file at line %d, col %d",
1786 lineNum, column);
1787 parseState = PARSE_COMMENT;
1788 goto end_test; // Stop the test
1789 break;
1790 }
1791
1792
1793 if (U_FAILURE(status)) {
1794 dataerrln("ICU Error %s while parsing test file at line %d.",
1795 u_errorName(status), lineNum);
1796 status = U_ZERO_ERROR;
1797 goto end_test; // Stop the test
1798 }
1799
1800 }
1801
1802 end_test:
1803 delete tp.bi;
1804 delete tp.expectedBreaks;
1805 delete tp.srcLine;
1806 delete tp.srcCol;
1807 delete [] testFile;
1808 #endif
1809 }
1810
// Test data for the tailored break iterator test.
// Each case consists of a text (kept in escaped form in the C string), an
// xxxTOffsets array that the table below uses as the tailored-locale offsets,
// and an xxxROffsets array that it uses as the root-locale offsets.

// Case: UBRK_WORD, locale "en_US_POSIX".
// Words don't include colon or period (cldrbug #1969).
static const char posxWordText[] =
    "Can't have breaks in xx:yy or struct.field for CS-types.";
static const int32_t posxWordTOffsets[] = {
    5, 6, 10, 11, 17, 18, 20, 21, 23, 24, 26, 27,
    29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56
};
static const int32_t posxWordROffsets[] = {
    5, 6, 10, 11, 17, 18, 20, 21, 26, 27,
    29, 30, 42, 43, 46, 47, 49, 50, 55, 56
};

// Case: UBRK_WORD, locale "ja".
// Don't break in runs of hiragana or runs of ideograph, where the latter
// includes U+3005, U+3007 and U+303B (cldrbug #2009).
static const char jaWordText[] =
    "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"
    "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";
static const int32_t jaWordTOffsets[] = {
    2, 3, 7, 8, 14, 17, 18, 20, 21, 24, 27, 28
};
static const int32_t jaWordROffsets[] = {
    1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16,
    17, 18, 19, 20, 21, 24, 25, 26, 27, 28
};

// Case: UBRK_SENTENCE, locale "el".
// Add break after Greek question mark (cldrbug #2069).
static const char elSentText[] =
    "\\u0391\\u03B2, \\u03B3\\u03B4; \\u0395 \\u03B6\\u03B7\\u037E \\u0398 \\u03B9\\u03BA. "
    "\\u039B\\u03BC \\u03BD\\u03BE! \\u039F\\u03C0, \\u03A1\\u03C2? \\u03A3";
static const int32_t elSentTOffsets[] = { 8, 14, 20, 27, 35, 36 };
static const int32_t elSentROffsets[] = { 20, 27, 35, 36 };

// Case: UBRK_CHARACTER, locale "th".
// Clusters should not include spacing Thai/Lao vowels (prefix or postfix),
// except for [SARA] AM (cldrbug #2161).
static const char thCharText[] =
    "\\u0E01\\u0E23\\u0E30\\u0E17\\u0E48\\u0E2D\\u0E21\\u0E23\\u0E08\\u0E19\\u0E32 "
    "(\\u0E2A\\u0E38\\u0E0A\\u0E32\\u0E15\\u0E34-\\u0E08\\u0E38\\u0E11\\u0E32\\u0E21\\u0E32\\u0E28) "
    "\\u0E40\\u0E14\\u0E47\\u0E01\\u0E21\\u0E35\\u0E1B\\u0E31\\u0E0D\\u0E2B\\u0E32 ";
static const int32_t thCharTOffsets[] = {
    1, 2, 3, 5, 6, 7, 8, 9, 10, 11,
    12, 13, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 28,
    29, 30, 32, 33, 35, 37, 38, 39, 40, 41
};
static const int32_t thCharROffsets[] = {
    1, 3, 5, 6, 7, 8, 9, 11,
    12, 13, 15, 17, 19, 20, 22, 24, 26, 27, 28,
    29, 32, 33, 35, 37, 38, 40, 41
};
1842
1843 typedef struct {
1844 UBreakIteratorType type;
1845 const char * locale;
1846 const char * escapedText;
1847 const int32_t * tailoredOffsets;
1848 int32_t tailoredOffsetsCount;
1849 const int32_t * rootOffsets;
1850 int32_t rootOffsetsCount;
1851 } TailoredBreakItem;
1852
1853 #define ARRAY_PTR_LEN(array) (array),(sizeof(array)/sizeof(array[0]))
1854
1855 static const TailoredBreakItem tbItems[] = {
1856 { UBRK_WORD, "en_US_POSIX", posxWordText, ARRAY_PTR_LEN(posxWordTOffsets), ARRAY_PTR_LEN(posxWordROffsets) },
1857 { UBRK_WORD, "ja", jaWordText, ARRAY_PTR_LEN(jaWordTOffsets), ARRAY_PTR_LEN(jaWordROffsets) },
1858 { UBRK_SENTENCE, "el", elSentText, ARRAY_PTR_LEN(elSentTOffsets), ARRAY_PTR_LEN(elSentROffsets) },
1859 { UBRK_CHARACTER, "th", thCharText, ARRAY_PTR_LEN(thCharTOffsets), ARRAY_PTR_LEN(thCharROffsets) },
1860 { UBRK_CHARACTER, NULL, NULL, NULL,0, NULL,0 } // terminator
1861 };
1862
// Format a list of offsets as text, each one preceded by a single space
// (for example " 5 6 10"), for use in failure messages.
//
//   buffer   receives the NUL-terminated result
//   buflen   capacity of buffer in chars, including room for the terminating NUL
//   count    number of entries in offsets
//   offsets  the values to format
//
// The result is always NUL-terminated when buflen > 0, even for count == 0.
// If the buffer is too small the output stops after the last offset that fits
// completely; a partially written number is never produced.
static void formatOffsets(char* buffer, int32_t buflen, int32_t count, const int32_t* offsets) {
    if (buffer == NULL || buflen <= 0) {
        return;
    }
    buffer[0] = 0;
    while (count-- > 0) {
        // A space, a sign, ten digits and the NUL need 13 chars at most.
        char item[16];
        int written = sprintf(item, " %d", (int)*offsets++);
        if (written < 0 || written >= buflen) {
            break;      // formatting failed, or no room left for this item plus the NUL
        }
        memcpy(buffer, item, (size_t)written + 1);
        buffer += written;
        buflen -= written;
    }
}
1871
1872 enum { kMaxOffsetCount = 128 };
1873
TBTest(BreakIterator * brkitr,int type,const char * locale,const char * escapedText,const int32_t * expectOffsets,int32_t expectOffsetsCount)1874 void RBBITest::TBTest(BreakIterator* brkitr, int type, const char *locale, const char* escapedText, const int32_t *expectOffsets, int32_t expectOffsetsCount) {
1875 brkitr->setText( CharsToUnicodeString(escapedText) );
1876 int32_t foundOffsets[kMaxOffsetCount];
1877 int32_t offset, foundOffsetsCount = 0;
1878 // do forwards iteration test
1879 while ( foundOffsetsCount < kMaxOffsetCount && (offset = brkitr->next()) != BreakIterator::DONE ) {
1880 foundOffsets[foundOffsetsCount++] = offset;
1881 }
1882 if ( foundOffsetsCount != expectOffsetsCount || memcmp(expectOffsets, foundOffsets, foundOffsetsCount*sizeof(foundOffsets[0])) != 0 ) {
1883 // log error for forwards test
1884 char formatExpect[512], formatFound[512];
1885 formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets);
1886 formatOffsets(formatFound, sizeof(formatFound), foundOffsetsCount, foundOffsets);
1887 errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found %d offsets fwd:%s\n",
1888 type, locale, escapedText, expectOffsetsCount, formatExpect, foundOffsetsCount, formatFound);
1889 } else {
1890 // do backwards iteration test
1891 --foundOffsetsCount; // back off one from the end offset
1892 while ( foundOffsetsCount > 0 ) {
1893 offset = brkitr->previous();
1894 if ( offset != foundOffsets[--foundOffsetsCount] ) {
1895 // log error for backwards test
1896 char formatExpect[512];
1897 formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets);
1898 errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found rev offset %d where expect %d\n",
1899 type, locale, escapedText, expectOffsetsCount, formatExpect, offset, foundOffsets[foundOffsetsCount]);
1900 break;
1901 }
1902 }
1903 }
1904 }
1905
TestTailoredBreaks()1906 void RBBITest::TestTailoredBreaks() {
1907 const TailoredBreakItem * tbItemPtr;
1908 Locale rootLocale = Locale("root");
1909 for (tbItemPtr = tbItems; tbItemPtr->escapedText != NULL; ++tbItemPtr) {
1910 Locale testLocale = Locale(tbItemPtr->locale);
1911 BreakIterator * tailoredBrkiter = NULL;
1912 BreakIterator * rootBrkiter = NULL;
1913 UErrorCode status = U_ZERO_ERROR;
1914 switch (tbItemPtr->type) {
1915 case UBRK_CHARACTER:
1916 tailoredBrkiter = BreakIterator::createCharacterInstance(testLocale, status);
1917 rootBrkiter = BreakIterator::createCharacterInstance(rootLocale, status);
1918 break;
1919 case UBRK_WORD:
1920 tailoredBrkiter = BreakIterator::createWordInstance(testLocale, status);
1921 rootBrkiter = BreakIterator::createWordInstance(rootLocale, status);
1922 break;
1923 case UBRK_LINE:
1924 tailoredBrkiter = BreakIterator::createLineInstance(testLocale, status);
1925 rootBrkiter = BreakIterator::createLineInstance(rootLocale, status);
1926 break;
1927 case UBRK_SENTENCE:
1928 tailoredBrkiter = BreakIterator::createSentenceInstance(testLocale, status);
1929 rootBrkiter = BreakIterator::createSentenceInstance(rootLocale, status);
1930 break;
1931 default:
1932 status = U_UNSUPPORTED_ERROR;
1933 break;
1934 }
1935 if (U_FAILURE(status)) {
1936 errcheckln(status, "BreakIterator create failed for type %d, locales root or %s - Error: %s", (int)(tbItemPtr->type), tbItemPtr->locale, u_errorName(status));
1937 continue;
1938 }
1939 TBTest(tailoredBrkiter, (int)(tbItemPtr->type), tbItemPtr->locale, tbItemPtr->escapedText, tbItemPtr->tailoredOffsets, tbItemPtr->tailoredOffsetsCount);
1940 TBTest(rootBrkiter, (int)(tbItemPtr->type), "root", tbItemPtr->escapedText, tbItemPtr->rootOffsets, tbItemPtr->rootOffsetsCount);
1941
1942 delete rootBrkiter;
1943 delete tailoredBrkiter;
1944 }
1945 }
1946
1947
1948 //-------------------------------------------------------------------------------
1949 //
// TestDictRules   Create a break iterator from source rules that include a
//                 dictionary range. Regression test for bug #7130. Source rules
//                 do not declare a break iterator type (word, line, sentence, etc.),
//                 and the dictionary code, without a type, would loop.
1954 //
1955 //-------------------------------------------------------------------------------
TestDictRules()1956 void RBBITest::TestDictRules() {
1957 const char *rules = "$dictionary = [a-z]; \n"
1958 "!!forward; \n"
1959 "$dictionary $dictionary; \n"
1960 "!!reverse; \n"
1961 "$dictionary $dictionary; \n";
1962 const char *text = "aa";
1963 UErrorCode status = U_ZERO_ERROR;
1964 UParseError parseError;
1965
1966 RuleBasedBreakIterator bi(rules, parseError, status);
1967 if (U_SUCCESS(status)) {
1968 UnicodeString utext = text;
1969 bi.setText(utext);
1970 int32_t position;
1971 int32_t loops;
1972 for (loops = 0; loops<10; loops++) {
1973 position = bi.next();
1974 if (position == RuleBasedBreakIterator::DONE) {
1975 break;
1976 }
1977 }
1978 TEST_ASSERT(loops == 1);
1979 } else {
1980 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1981 }
1982 }
1983
1984
1985
1986 //-------------------------------------------------------------------------------
1987 //
// ReadAndConvertFile Read a text data file, convert it to UChars, and
// return the data in one big UChar * buffer, which the caller must delete.
1990 //
1991 // parameters:
1992 // fileName: the name of the file, with no directory part. The test data directory
1993 // is assumed.
1994 // ulen an out parameter, receives the actual length (in UChars) of the file data.
1995 // encoding The file encoding. If the file contains a BOM, that will override the encoding
1996 // specified here. The BOM, if it exists, will be stripped from the returned data.
1997 // Pass NULL for the system default encoding.
1998 // status
1999 // returns:
2000 // The file data, converted to UChar.
2001 // The caller must delete this when done with
2002 // delete [] theBuffer;
2003 //
2004 // TODO: This is a clone of RegexTest::ReadAndConvertFile.
2005 // Move this function to some common place.
2006 //
2007 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int & ulen,const char * encoding,UErrorCode & status)2008 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
2009 UChar *retPtr = NULL;
2010 char *fileBuf = NULL;
2011 UConverter* conv = NULL;
2012 FILE *f = NULL;
2013
2014 ulen = 0;
2015 if (U_FAILURE(status)) {
2016 return retPtr;
2017 }
2018
2019 //
2020 // Open the file.
2021 //
2022 f = fopen(fileName, "rb");
2023 if (f == 0) {
2024 dataerrln("Error opening test data file %s\n", fileName);
2025 status = U_FILE_ACCESS_ERROR;
2026 return NULL;
2027 }
2028 //
2029 // Read it in
2030 //
2031 int fileSize;
2032 int amt_read;
2033
2034 fseek( f, 0, SEEK_END);
2035 fileSize = ftell(f);
2036 fileBuf = new char[fileSize];
2037 fseek(f, 0, SEEK_SET);
2038 amt_read = fread(fileBuf, 1, fileSize, f);
2039 if (amt_read != fileSize || fileSize <= 0) {
2040 errln("Error reading test data file.");
2041 goto cleanUpAndReturn;
2042 }
2043
2044 //
2045 // Look for a Unicode Signature (BOM) on the data just read
2046 //
2047 int32_t signatureLength;
2048 const char * fileBufC;
2049 const char* bomEncoding;
2050
2051 fileBufC = fileBuf;
2052 bomEncoding = ucnv_detectUnicodeSignature(
2053 fileBuf, fileSize, &signatureLength, &status);
2054 if(bomEncoding!=NULL ){
2055 fileBufC += signatureLength;
2056 fileSize -= signatureLength;
2057 encoding = bomEncoding;
2058 }
2059
2060 //
2061 // Open a converter to take the rule file to UTF-16
2062 //
2063 conv = ucnv_open(encoding, &status);
2064 if (U_FAILURE(status)) {
2065 goto cleanUpAndReturn;
2066 }
2067
2068 //
2069 // Convert the rules to UChar.
2070 // Preflight first to determine required buffer size.
2071 //
2072 ulen = ucnv_toUChars(conv,
2073 NULL, // dest,
2074 0, // destCapacity,
2075 fileBufC,
2076 fileSize,
2077 &status);
2078 if (status == U_BUFFER_OVERFLOW_ERROR) {
2079 // Buffer Overflow is expected from the preflight operation.
2080 status = U_ZERO_ERROR;
2081
2082 retPtr = new UChar[ulen+1];
2083 ucnv_toUChars(conv,
2084 retPtr, // dest,
2085 ulen+1,
2086 fileBufC,
2087 fileSize,
2088 &status);
2089 }
2090
2091 cleanUpAndReturn:
2092 fclose(f);
2093 delete []fileBuf;
2094 ucnv_close(conv);
2095 if (U_FAILURE(status)) {
2096 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
2097 delete []retPtr;
2098 retPtr = 0;
2099 ulen = 0;
2100 };
2101 return retPtr;
2102 }
2103
2104
2105
2106 //--------------------------------------------------------------------------------------------
2107 //
2108 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
2109 //
2110 //-------------------------------------------------------------------------------------------
TestUnicodeFiles()2111 void RBBITest::TestUnicodeFiles() {
2112 RuleBasedBreakIterator *bi;
2113 UErrorCode status = U_ZERO_ERROR;
2114
2115 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2116 TEST_ASSERT_SUCCESS(status);
2117 if (U_SUCCESS(status)) {
2118 runUnicodeTestData("GraphemeBreakTest.txt", bi);
2119 }
2120 delete bi;
2121
2122 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
2123 TEST_ASSERT_SUCCESS(status);
2124 if (U_SUCCESS(status)) {
2125 runUnicodeTestData("WordBreakTest.txt", bi);
2126 }
2127 delete bi;
2128
2129 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
2130 TEST_ASSERT_SUCCESS(status);
2131 if (U_SUCCESS(status)) {
2132 runUnicodeTestData("SentenceBreakTest.txt", bi);
2133 }
2134 delete bi;
2135
2136 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
2137 TEST_ASSERT_SUCCESS(status);
2138 if (U_SUCCESS(status)) {
2139 runUnicodeTestData("LineBreakTest.txt", bi);
2140 }
2141 delete bi;
2142 }
2143
2144
2145 //--------------------------------------------------------------------------------------------
2146 //
2147 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
2148 //
2149 //-------------------------------------------------------------------------------------------
runUnicodeTestData(const char * fileName,RuleBasedBreakIterator * bi)2150 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
2151 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2152 // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb.
2153 UVersionInfo icu49 = { 4, 9, 0, 0 };
2154 UBool isICUVersionPast48 = isICUVersionAtLeast(icu49);
2155 UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt");
2156 UErrorCode status = U_ZERO_ERROR;
2157
2158 //
2159 // Open and read the test data file, put it into a UnicodeString.
2160 //
2161 const char *testDataDirectory = IntlTest::getSourceTestData(status);
2162 char testFileName[1000];
2163 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
2164 dataerrln("Can't open test data. Path too long.");
2165 return;
2166 }
2167 strcpy(testFileName, testDataDirectory);
2168 strcat(testFileName, fileName);
2169
2170 logln("Opening data file %s\n", fileName);
2171
2172 int len;
2173 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
2174 if (status != U_FILE_ACCESS_ERROR) {
2175 TEST_ASSERT_SUCCESS(status);
2176 TEST_ASSERT(testFile != NULL);
2177 }
2178 if (U_FAILURE(status) || testFile == NULL) {
2179 return; /* something went wrong, error already output */
2180 }
2181 UnicodeString testFileAsString(TRUE, testFile, len);
2182
2183 //
2184 // Parse the test data file using a regular expression.
2185 // Each kind of token is recognized in its own capture group; what type of item was scanned
2186 // is identified by which group had a match.
2187 //
2188 // Caputure Group # 1 2 3 4 5
2189 // Parses this item: divide x hex digits comment \n unrecognized \n
2190 //
2191 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
2192 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
2193 UnicodeString testString;
2194 UVector32 breakPositions(status);
2195 int lineNumber = 1;
2196 TEST_ASSERT_SUCCESS(status);
2197 if (U_FAILURE(status)) {
2198 return;
2199 }
2200
2201 //
2202 // Scan through each test case, building up the string to be broken in testString,
2203 // and the positions that should be boundaries in the breakPositions vector.
2204 //
2205 int spin = 0;
2206 while (tokenMatcher.find()) {
2207 if(tokenMatcher.hitEnd()) {
2208 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.
2209 This occurred when the text file was corrupt (wasn't marked as UTF-8)
2210 and caused an infinite loop here on EBCDIC systems!
2211 */
2212 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
2213 // return;
2214 }
2215 if (tokenMatcher.start(1, status) >= 0) {
2216 // Scanned a divide sign, indicating a break position in the test data.
2217 if (testString.length()>0) {
2218 breakPositions.addElement(testString.length(), status);
2219 }
2220 }
2221 else if (tokenMatcher.start(2, status) >= 0) {
2222 // Scanned an 'x', meaning no break at this position in the test data
2223 // Nothing to be done here.
2224 }
2225 else if (tokenMatcher.start(3, status) >= 0) {
2226 // Scanned Hex digits. Convert them to binary, append to the character data string.
2227 const UnicodeString &hexNumber = tokenMatcher.group(3, status);
2228 int length = hexNumber.length();
2229 if (length<=8) {
2230 char buf[10];
2231 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
2232 UChar32 c = (UChar32)strtol(buf, NULL, 16);
2233 if (c<=0x10ffff) {
2234 testString.append(c);
2235 } else {
2236 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
2237 fileName, lineNumber);
2238 }
2239 } else {
2240 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
2241 fileName, lineNumber);
2242 }
2243 }
2244 else if (tokenMatcher.start(4, status) >= 0) {
2245 // Scanned to end of a line, possibly skipping over a comment in the process.
2246 // If the line from the file contained test data, run the test now.
2247 //
2248 if (testString.length() > 0) {
2249 // TODO(andy): Remove this time bomb code.
2250 if (!isLineBreak || isICUVersionPast48 || !(4658 <= lineNumber && lineNumber <= 4758)) {
2251 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
2252 }
2253 }
2254
2255 // Clear out this test case.
2256 // The string and breakPositions vector will be refilled as the next
2257 // test case is parsed.
2258 testString.remove();
2259 breakPositions.removeAllElements();
2260 lineNumber++;
2261 } else {
2262 // Scanner catchall. Something unrecognized appeared on the line.
2263 char token[16];
2264 UnicodeString uToken = tokenMatcher.group(0, status);
2265 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
2266 token[sizeof(token)-1] = 0;
2267 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token);
2268
2269 // Clean up, in preparation for continuing with the next line.
2270 testString.remove();
2271 breakPositions.removeAllElements();
2272 lineNumber++;
2273 }
2274 TEST_ASSERT_SUCCESS(status);
2275 if (U_FAILURE(status)) {
2276 break;
2277 }
2278 }
2279
2280 delete [] testFile;
2281 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
2282 }
2283
2284 //--------------------------------------------------------------------------------------------
2285 //
2286 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
2287 // test data files. Do only a simple, forward-only check -
2288 // this test is mostly to check that ICU and the Unicode
2289 // data agree with each other.
2290 //
2291 //--------------------------------------------------------------------------------------------
checkUnicodeTestCase(const char * testFileName,int lineNumber,const UnicodeString & testString,UVector32 * breakPositions,RuleBasedBreakIterator * bi)2292 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
2293 const UnicodeString &testString, // Text data to be broken
2294 UVector32 *breakPositions, // Positions where breaks should be found.
2295 RuleBasedBreakIterator *bi) {
2296 int32_t pos; // Break Position in the test string
2297 int32_t expectedI = 0; // Index of expected break position in the vector of expected results.
2298 int32_t expectedPos; // Expected break position (index into test string)
2299
2300 bi->setText(testString);
2301 pos = bi->first();
2302 pos = bi->next();
2303
2304 while (pos != BreakIterator::DONE) {
2305 if (expectedI >= breakPositions->size()) {
2306 errln("Test file \"%s\", line %d, unexpected break found at position %d",
2307 testFileName, lineNumber, pos);
2308 break;
2309 }
2310 expectedPos = breakPositions->elementAti(expectedI);
2311 if (pos < expectedPos) {
2312 errln("Test file \"%s\", line %d, unexpected break found at position %d",
2313 testFileName, lineNumber, pos);
2314 break;
2315 }
2316 if (pos > expectedPos) {
2317 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
2318 testFileName, lineNumber, expectedPos);
2319 break;
2320 }
2321 pos = bi->next();
2322 expectedI++;
2323 }
2324
2325 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
2326 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
2327 testFileName, lineNumber, breakPositions->elementAti(expectedI));
2328 }
2329 }
2330
2331
2332
2333 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2334 //---------------------------------------------------------------------------------------
2335 //
// class RBBIMonkeyKind
2337 //
2338 // Monkey Test for Break Iteration
2339 // Abstract interface class. Concrete derived classes independently
2340 // implement the break rules for different iterator types.
2341 //
// The Monkey Test itself doesn't know which type of break iterator it is
// testing, but works purely in terms of the interface defined here.
2344 //
2345 //---------------------------------------------------------------------------------------
2346 class RBBIMonkeyKind {
2347 public:
2348 // Return a UVector of UnicodeSets, representing the character classes used
2349 // for this type of iterator.
2350 virtual UVector *charClasses() = 0;
2351
2352 // Set the test text on which subsequent calls to next() will operate
2353 virtual void setText(const UnicodeString &s) = 0;
2354
2355 // Find the next break postion, starting from the prev break position, or from zero.
2356 // Return -1 after reaching end of string.
2357 virtual int32_t next(int32_t i) = 0;
2358
2359 virtual ~RBBIMonkeyKind();
2360 UErrorCode deferredStatus;
2361
2362
2363 protected:
2364 RBBIMonkeyKind();
2365
2366 private:
2367 };
2368
RBBIMonkeyKind()2369 RBBIMonkeyKind::RBBIMonkeyKind() {
2370 deferredStatus = U_ZERO_ERROR;
2371 }
2372
~RBBIMonkeyKind()2373 RBBIMonkeyKind::~RBBIMonkeyKind() {
2374 }
2375
2376
2377 //----------------------------------------------------------------------------------------
2378 //
2379 // Random Numbers. Similar to standard lib rand() and srand()
2380 // Not using library to
2381 // 1. Get same results on all platforms.
2382 // 2. Get access to current seed, to more easily reproduce failures.
2383 //
2384 //---------------------------------------------------------------------------------------
// Current generator state; assign to it to replay a particular sequence.
static uint32_t m_seed = 1;

// Advance the linear congruential generator by one step and return a value
// in the range 0..32767 taken from bits 16..30 of the new state.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    m_seed = kMultiplier * m_seed + kIncrement;   // wraps modulo 2^32
    return (m_seed >> 16) & 0x7FFF;
}
2392
2393
2394 //------------------------------------------------------------------------------------------
2395 //
2396 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
2397 // of RBBIMonkeyKind.
2398 //
2399 //------------------------------------------------------------------------------------------
2400 class RBBICharMonkey: public RBBIMonkeyKind {
2401 public:
2402 RBBICharMonkey();
2403 virtual ~RBBICharMonkey();
2404 virtual UVector *charClasses();
2405 virtual void setText(const UnicodeString &s);
2406 virtual int32_t next(int32_t i);
2407 private:
2408 UVector *fSets;
2409
2410 UnicodeSet *fCRLFSet;
2411 UnicodeSet *fControlSet;
2412 UnicodeSet *fExtendSet;
2413 UnicodeSet *fPrependSet;
2414 UnicodeSet *fSpacingSet;
2415 UnicodeSet *fLSet;
2416 UnicodeSet *fVSet;
2417 UnicodeSet *fTSet;
2418 UnicodeSet *fLVSet;
2419 UnicodeSet *fLVTSet;
2420 UnicodeSet *fHangulSet;
2421 UnicodeSet *fAnySet;
2422
2423 const UnicodeString *fText;
2424 };
2425
2426
RBBICharMonkey()2427 RBBICharMonkey::RBBICharMonkey() {
2428 UErrorCode status = U_ZERO_ERROR;
2429
2430 fText = NULL;
2431
2432 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
2433 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
2434 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
2435 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
2436 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
2437 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
2438 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
2439 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
2440 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
2441 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
2442 fHangulSet = new UnicodeSet();
2443 fHangulSet->addAll(*fLSet);
2444 fHangulSet->addAll(*fVSet);
2445 fHangulSet->addAll(*fTSet);
2446 fHangulSet->addAll(*fLVSet);
2447 fHangulSet->addAll(*fLVTSet);
2448 fAnySet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0000-\\U0010ffff]"), status);
2449
2450 fSets = new UVector(status);
2451 fSets->addElement(fCRLFSet, status);
2452 fSets->addElement(fControlSet, status);
2453 fSets->addElement(fExtendSet, status);
2454 fSets->addElement(fPrependSet, status);
2455 fSets->addElement(fSpacingSet, status);
2456 fSets->addElement(fHangulSet, status);
2457 fSets->addElement(fAnySet, status);
2458 if (U_FAILURE(status)) {
2459 deferredStatus = status;
2460 }
2461 }
2462
2463
setText(const UnicodeString & s)2464 void RBBICharMonkey::setText(const UnicodeString &s) {
2465 fText = &s;
2466 }
2467
2468
2469
next(int32_t prevPos)2470 int32_t RBBICharMonkey::next(int32_t prevPos) {
2471 int p0, p1, p2, p3; // Indices of the significant code points around the
2472 // break position being tested. The candidate break
2473 // location is before p2.
2474
2475 int breakPos = -1;
2476
2477 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2478
2479 if (U_FAILURE(deferredStatus)) {
2480 return -1;
2481 }
2482
2483 // Previous break at end of string. return DONE.
2484 if (prevPos >= fText->length()) {
2485 return -1;
2486 }
2487 p0 = p1 = p2 = p3 = prevPos;
2488 c3 = fText->char32At(prevPos);
2489 c0 = c1 = c2 = 0;
2490
2491 // Loop runs once per "significant" character position in the input text.
2492 for (;;) {
2493 // Move all of the positions forward in the input string.
2494 p0 = p1; c0 = c1;
2495 p1 = p2; c1 = c2;
2496 p2 = p3; c2 = c3;
2497
2498 // Advancd p3 by one codepoint
2499 p3 = fText->moveIndex32(p3, 1);
2500 c3 = fText->char32At(p3);
2501
2502 if (p1 == p2) {
2503 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2504 continue;
2505 }
2506 if (p2 == fText->length()) {
2507 // Reached end of string. Always a break position.
2508 break;
2509 }
2510
2511 // Rule GB3 CR x LF
2512 // No Extend or Format characters may appear between the CR and LF,
2513 // which requires the additional check for p2 immediately following p1.
2514 //
2515 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
2516 continue;
2517 }
2518
2519 // Rule (GB4). ( Control | CR | LF ) <break>
2520 if (fControlSet->contains(c1) ||
2521 c1 == 0x0D ||
2522 c1 == 0x0A) {
2523 break;
2524 }
2525
2526 // Rule (GB5) <break> ( Control | CR | LF )
2527 //
2528 if (fControlSet->contains(c2) ||
2529 c2 == 0x0D ||
2530 c2 == 0x0A) {
2531 break;
2532 }
2533
2534
2535 // Rule (GB6) L x ( L | V | LV | LVT )
2536 if (fLSet->contains(c1) &&
2537 (fLSet->contains(c2) ||
2538 fVSet->contains(c2) ||
2539 fLVSet->contains(c2) ||
2540 fLVTSet->contains(c2))) {
2541 continue;
2542 }
2543
2544 // Rule (GB7) ( LV | V ) x ( V | T )
2545 if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
2546 (fVSet->contains(c2) || fTSet->contains(c2))) {
2547 continue;
2548 }
2549
2550 // Rule (GB8) ( LVT | T) x T
2551 if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
2552 fTSet->contains(c2)) {
2553 continue;
2554 }
2555
2556 // Rule (GB9) Numeric x ALetter
2557 if (fExtendSet->contains(c2)) {
2558 continue;
2559 }
2560
2561 // Rule (GB9a) x SpacingMark
2562 if (fSpacingSet->contains(c2)) {
2563 continue;
2564 }
2565
2566 // Rule (GB9b) Prepend x
2567 if (fPrependSet->contains(c1)) {
2568 continue;
2569 }
2570
2571 // Rule (GB10) Any <break> Any
2572 break;
2573 }
2574
2575 breakPos = p2;
2576 return breakPos;
2577 }
2578
2579
2580
charClasses()2581 UVector *RBBICharMonkey::charClasses() {
2582 return fSets;
2583 }
2584
2585
~RBBICharMonkey()2586 RBBICharMonkey::~RBBICharMonkey() {
2587 delete fSets;
2588 delete fCRLFSet;
2589 delete fControlSet;
2590 delete fExtendSet;
2591 delete fPrependSet;
2592 delete fSpacingSet;
2593 delete fLSet;
2594 delete fVSet;
2595 delete fTSet;
2596 delete fLVSet;
2597 delete fLVTSet;
2598 delete fHangulSet;
2599 delete fAnySet;
2600 }
2601
2602 //------------------------------------------------------------------------------------------
2603 //
2604 // class RBBIWordMonkey Word Break specific implementation
2605 // of RBBIMonkeyKind.
2606 //
2607 //------------------------------------------------------------------------------------------
2608 class RBBIWordMonkey: public RBBIMonkeyKind {
2609 public:
2610 RBBIWordMonkey();
2611 virtual ~RBBIWordMonkey();
2612 virtual UVector *charClasses();
2613 virtual void setText(const UnicodeString &s);
2614 virtual int32_t next(int32_t i);
2615 private:
2616 UVector *fSets;
2617
2618 UnicodeSet *fCRSet;
2619 UnicodeSet *fLFSet;
2620 UnicodeSet *fNewlineSet;
2621 UnicodeSet *fKatakanaSet;
2622 UnicodeSet *fALetterSet;
2623 UnicodeSet *fMidNumLetSet;
2624 UnicodeSet *fMidLetterSet;
2625 UnicodeSet *fMidNumSet;
2626 UnicodeSet *fNumericSet;
2627 UnicodeSet *fFormatSet;
2628 UnicodeSet *fOtherSet;
2629 UnicodeSet *fExtendSet;
2630 UnicodeSet *fExtendNumLetSet;
2631
2632 RegexMatcher *fMatcher;
2633
2634 const UnicodeString *fText;
2635 };
2636
2637
RBBIWordMonkey()2638 RBBIWordMonkey::RBBIWordMonkey()
2639 {
2640 UErrorCode status = U_ZERO_ERROR;
2641
2642 fSets = new UVector(status);
2643
2644 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status);
2645 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status);
2646 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status);
2647 fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
2648 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status);
2649 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status);
2650 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status);
2651 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status);
2652 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status);
2653 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status);
2654 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2655 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);
2656
2657 fOtherSet = new UnicodeSet();
2658 if(U_FAILURE(status)) {
2659 deferredStatus = status;
2660 return;
2661 }
2662
2663 fOtherSet->complement();
2664 fOtherSet->removeAll(*fCRSet);
2665 fOtherSet->removeAll(*fLFSet);
2666 fOtherSet->removeAll(*fNewlineSet);
2667 fOtherSet->removeAll(*fKatakanaSet);
2668 fOtherSet->removeAll(*fALetterSet);
2669 fOtherSet->removeAll(*fMidLetterSet);
2670 fOtherSet->removeAll(*fMidNumSet);
2671 fOtherSet->removeAll(*fNumericSet);
2672 fOtherSet->removeAll(*fExtendNumLetSet);
2673 fOtherSet->removeAll(*fFormatSet);
2674 fOtherSet->removeAll(*fExtendSet);
2675 // Inhibit dictionary characters from being tested at all.
2676 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
2677
2678 fSets->addElement(fCRSet, status);
2679 fSets->addElement(fLFSet, status);
2680 fSets->addElement(fNewlineSet, status);
2681 fSets->addElement(fALetterSet, status);
2682 fSets->addElement(fKatakanaSet, status);
2683 fSets->addElement(fMidLetterSet, status);
2684 fSets->addElement(fMidNumLetSet, status);
2685 fSets->addElement(fMidNumSet, status);
2686 fSets->addElement(fNumericSet, status);
2687 fSets->addElement(fFormatSet, status);
2688 fSets->addElement(fExtendSet, status);
2689 fSets->addElement(fOtherSet, status);
2690 fSets->addElement(fExtendNumLetSet, status);
2691
2692 if (U_FAILURE(status)) {
2693 deferredStatus = status;
2694 }
2695 }
2696
setText(const UnicodeString & s)2697 void RBBIWordMonkey::setText(const UnicodeString &s) {
2698 fText = &s;
2699 }
2700
2701
next(int32_t prevPos)2702 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2703 int p0, p1, p2, p3; // Indices of the significant code points around the
2704 // break position being tested. The candidate break
2705 // location is before p2.
2706
2707 int breakPos = -1;
2708
2709 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2710
2711 if (U_FAILURE(deferredStatus)) {
2712 return -1;
2713 }
2714
2715 // Prev break at end of string. return DONE.
2716 if (prevPos >= fText->length()) {
2717 return -1;
2718 }
2719 p0 = p1 = p2 = p3 = prevPos;
2720 c3 = fText->char32At(prevPos);
2721 c0 = c1 = c2 = 0;
2722
2723 // Loop runs once per "significant" character position in the input text.
2724 for (;;) {
2725 // Move all of the positions forward in the input string.
2726 p0 = p1; c0 = c1;
2727 p1 = p2; c1 = c2;
2728 p2 = p3; c2 = c3;
2729
2730 // Advancd p3 by X(Extend | Format)* Rule 4
2731 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2732 do {
2733 p3 = fText->moveIndex32(p3, 1);
2734 c3 = fText->char32At(p3);
2735 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2736 break;
2737 };
2738 }
2739 while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
2740
2741
2742 if (p1 == p2) {
2743 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2744 continue;
2745 }
2746 if (p2 == fText->length()) {
2747 // Reached end of string. Always a break position.
2748 break;
2749 }
2750
2751 // Rule (3) CR x LF
2752 // No Extend or Format characters may appear between the CR and LF,
2753 // which requires the additional check for p2 immediately following p1.
2754 //
2755 if (c1==0x0D && c2==0x0A) {
2756 continue;
2757 }
2758
2759 // Rule (3a) Break before and after newlines (including CR and LF)
2760 //
2761 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2762 break;
2763 };
2764 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2765 break;
2766 };
2767
2768 // Rule (5). ALetter x ALetter
2769 if (fALetterSet->contains(c1) &&
2770 fALetterSet->contains(c2)) {
2771 continue;
2772 }
2773
2774 // Rule (6) ALetter x (MidLetter | MidNumLet) ALetter
2775 //
2776 if ( fALetterSet->contains(c1) &&
2777 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
2778 fALetterSet->contains(c3)) {
2779 continue;
2780 }
2781
2782
2783 // Rule (7) ALetter (MidLetter | MidNumLet) x ALetter
2784 if (fALetterSet->contains(c0) &&
2785 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1)) &&
2786 fALetterSet->contains(c2)) {
2787 continue;
2788 }
2789
2790 // Rule (8) Numeric x Numeric
2791 if (fNumericSet->contains(c1) &&
2792 fNumericSet->contains(c2)) {
2793 continue;
2794 }
2795
2796 // Rule (9) ALetter x Numeric
2797 if (fALetterSet->contains(c1) &&
2798 fNumericSet->contains(c2)) {
2799 continue;
2800 }
2801
2802 // Rule (10) Numeric x ALetter
2803 if (fNumericSet->contains(c1) &&
2804 fALetterSet->contains(c2)) {
2805 continue;
2806 }
2807
2808 // Rule (11) Numeric (MidNum | MidNumLet) x Numeric
2809 if (fNumericSet->contains(c0) &&
2810 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1)) &&
2811 fNumericSet->contains(c2)) {
2812 continue;
2813 }
2814
2815 // Rule (12) Numeric x (MidNum | MidNumLet) Numeric
2816 if (fNumericSet->contains(c1) &&
2817 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
2818 fNumericSet->contains(c3)) {
2819 continue;
2820 }
2821
2822 // Rule (13) Katakana x Katakana
2823 if (fKatakanaSet->contains(c1) &&
2824 fKatakanaSet->contains(c2)) {
2825 continue;
2826 }
2827
2828 // Rule 13a
2829 if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
2830 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2831 fExtendNumLetSet->contains(c2)) {
2832 continue;
2833 }
2834
2835 // Rule 13b
2836 if (fExtendNumLetSet->contains(c1) &&
2837 (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
2838 fKatakanaSet->contains(c2))) {
2839 continue;
2840 }
2841
2842 // Rule 14. Break found here.
2843 break;
2844 }
2845
2846 breakPos = p2;
2847 return breakPos;
2848 }
2849
2850
charClasses()2851 UVector *RBBIWordMonkey::charClasses() {
2852 return fSets;
2853 }
2854
2855
~RBBIWordMonkey()2856 RBBIWordMonkey::~RBBIWordMonkey() {
2857 delete fSets;
2858 delete fCRSet;
2859 delete fLFSet;
2860 delete fNewlineSet;
2861 delete fKatakanaSet;
2862 delete fALetterSet;
2863 delete fMidNumLetSet;
2864 delete fMidLetterSet;
2865 delete fMidNumSet;
2866 delete fNumericSet;
2867 delete fFormatSet;
2868 delete fExtendSet;
2869 delete fExtendNumLetSet;
2870 delete fOtherSet;
2871 }
2872
2873
2874
2875
2876 //------------------------------------------------------------------------------------------
2877 //
2878 // class RBBISentMonkey Sentence Break specific implementation
2879 // of RBBIMonkeyKind.
2880 //
2881 //------------------------------------------------------------------------------------------
2882 class RBBISentMonkey: public RBBIMonkeyKind {
2883 public:
2884 RBBISentMonkey();
2885 virtual ~RBBISentMonkey();
2886 virtual UVector *charClasses();
2887 virtual void setText(const UnicodeString &s);
2888 virtual int32_t next(int32_t i);
2889 private:
2890 int moveBack(int posFrom);
2891 int moveForward(int posFrom);
2892 UChar32 cAt(int pos);
2893
2894 UVector *fSets;
2895
2896 UnicodeSet *fSepSet;
2897 UnicodeSet *fFormatSet;
2898 UnicodeSet *fSpSet;
2899 UnicodeSet *fLowerSet;
2900 UnicodeSet *fUpperSet;
2901 UnicodeSet *fOLetterSet;
2902 UnicodeSet *fNumericSet;
2903 UnicodeSet *fATermSet;
2904 UnicodeSet *fSContinueSet;
2905 UnicodeSet *fSTermSet;
2906 UnicodeSet *fCloseSet;
2907 UnicodeSet *fOtherSet;
2908 UnicodeSet *fExtendSet;
2909
2910 const UnicodeString *fText;
2911
2912 };
2913
RBBISentMonkey()2914 RBBISentMonkey::RBBISentMonkey()
2915 {
2916 UErrorCode status = U_ZERO_ERROR;
2917
2918 fSets = new UVector(status);
2919
2920 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2921 // set and made into character classes of their own. For the monkey impl,
2922 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2923 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status);
2924 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status);
2925 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status);
2926 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status);
2927 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status);
2928 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status);
2929 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status);
2930 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status);
2931 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2932 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status);
2933 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status);
2934 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status);
2935 fOtherSet = new UnicodeSet();
2936
2937 if(U_FAILURE(status)) {
2938 deferredStatus = status;
2939 return;
2940 }
2941
2942 fOtherSet->complement();
2943 fOtherSet->removeAll(*fSepSet);
2944 fOtherSet->removeAll(*fFormatSet);
2945 fOtherSet->removeAll(*fSpSet);
2946 fOtherSet->removeAll(*fLowerSet);
2947 fOtherSet->removeAll(*fUpperSet);
2948 fOtherSet->removeAll(*fOLetterSet);
2949 fOtherSet->removeAll(*fNumericSet);
2950 fOtherSet->removeAll(*fATermSet);
2951 fOtherSet->removeAll(*fSContinueSet);
2952 fOtherSet->removeAll(*fSTermSet);
2953 fOtherSet->removeAll(*fCloseSet);
2954 fOtherSet->removeAll(*fExtendSet);
2955
2956 fSets->addElement(fSepSet, status);
2957 fSets->addElement(fFormatSet, status);
2958 fSets->addElement(fSpSet, status);
2959 fSets->addElement(fLowerSet, status);
2960 fSets->addElement(fUpperSet, status);
2961 fSets->addElement(fOLetterSet, status);
2962 fSets->addElement(fNumericSet, status);
2963 fSets->addElement(fATermSet, status);
2964 fSets->addElement(fSContinueSet, status);
2965 fSets->addElement(fSTermSet, status);
2966 fSets->addElement(fCloseSet, status);
2967 fSets->addElement(fOtherSet, status);
2968 fSets->addElement(fExtendSet, status);
2969
2970 if (U_FAILURE(status)) {
2971 deferredStatus = status;
2972 }
2973 }
2974
2975
2976
setText(const UnicodeString & s)2977 void RBBISentMonkey::setText(const UnicodeString &s) {
2978 fText = &s;
2979 }
2980
charClasses()2981 UVector *RBBISentMonkey::charClasses() {
2982 return fSets;
2983 }
2984
2985
2986 // moveBack() Find the "significant" code point preceding the index i.
2987 // Skips over ($Extend | $Format)* .
2988 //
moveBack(int i)2989 int RBBISentMonkey::moveBack(int i) {
2990 if (i <= 0) {
2991 return -1;
2992 }
2993 UChar32 c;
2994 int32_t j = i;
2995 do {
2996 j = fText->moveIndex32(j, -1);
2997 c = fText->char32At(j);
2998 }
2999 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
3000 return j;
3001
3002 }
3003
3004
moveForward(int i)3005 int RBBISentMonkey::moveForward(int i) {
3006 if (i>=fText->length()) {
3007 return fText->length();
3008 }
3009 UChar32 c;
3010 int32_t j = i;
3011 do {
3012 j = fText->moveIndex32(j, 1);
3013 c = cAt(j);
3014 }
3015 while (fFormatSet->contains(c) || fExtendSet->contains(c));
3016 return j;
3017 }
3018
cAt(int pos)3019 UChar32 RBBISentMonkey::cAt(int pos) {
3020 if (pos<0 || pos>=fText->length()) {
3021 return -1;
3022 } else {
3023 return fText->char32At(pos);
3024 }
3025 }
3026
next(int32_t prevPos)3027 int32_t RBBISentMonkey::next(int32_t prevPos) {
3028 int p0, p1, p2, p3; // Indices of the significant code points around the
3029 // break position being tested. The candidate break
3030 // location is before p2.
3031
3032 int breakPos = -1;
3033
3034 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
3035 UChar32 c;
3036
3037 if (U_FAILURE(deferredStatus)) {
3038 return -1;
3039 }
3040
3041 // Prev break at end of string. return DONE.
3042 if (prevPos >= fText->length()) {
3043 return -1;
3044 }
3045 p0 = p1 = p2 = p3 = prevPos;
3046 c3 = fText->char32At(prevPos);
3047 c0 = c1 = c2 = 0;
3048
3049 // Loop runs once per "significant" character position in the input text.
3050 for (;;) {
3051 // Move all of the positions forward in the input string.
3052 p0 = p1; c0 = c1;
3053 p1 = p2; c1 = c2;
3054 p2 = p3; c2 = c3;
3055
3056 // Advancd p3 by X(Extend | Format)* Rule 4
3057 p3 = moveForward(p3);
3058 c3 = cAt(p3);
3059
3060 // Rule (3) CR x LF
3061 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
3062 continue;
3063 }
3064
3065 // Rule (4). Sep <break>
3066 if (fSepSet->contains(c1)) {
3067 p2 = p1+1; // Separators don't combine with Extend or Format.
3068 break;
3069 }
3070
3071 if (p2 >= fText->length()) {
3072 // Reached end of string. Always a break position.
3073 break;
3074 }
3075
3076 if (p2 == prevPos) {
3077 // Still warming up the loop. (won't work with zero length strings, but we don't care)
3078 continue;
3079 }
3080
3081 // Rule (6). ATerm x Numeric
3082 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) {
3083 continue;
3084 }
3085
3086 // Rule (7). Upper ATerm x Uppper
3087 if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
3088 continue;
3089 }
3090
3091 // Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
3092 // Note: STerm | ATerm are added to the negated part of the expression by a
3093 // note to the Unicode 5.0 documents.
3094 int p8 = p1;
3095 while (fSpSet->contains(cAt(p8))) {
3096 p8 = moveBack(p8);
3097 }
3098 while (fCloseSet->contains(cAt(p8))) {
3099 p8 = moveBack(p8);
3100 }
3101 if (fATermSet->contains(cAt(p8))) {
3102 p8=p2;
3103 for (;;) {
3104 c = cAt(p8);
3105 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
3106 fLowerSet->contains(c) || fSepSet->contains(c) ||
3107 fATermSet->contains(c) || fSTermSet->contains(c)) {
3108 break;
3109 }
3110 p8 = moveForward(p8);
3111 }
3112 if (fLowerSet->contains(cAt(p8))) {
3113 continue;
3114 }
3115 }
3116
3117 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
3118 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
3119 p8 = p1;
3120 while (fSpSet->contains(cAt(p8))) {
3121 p8 = moveBack(p8);
3122 }
3123 while (fCloseSet->contains(cAt(p8))) {
3124 p8 = moveBack(p8);
3125 }
3126 c = cAt(p8);
3127 if (fSTermSet->contains(c) || fATermSet->contains(c)) {
3128 continue;
3129 }
3130 }
3131
3132 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
3133 int p9 = p1;
3134 while (fCloseSet->contains(cAt(p9))) {
3135 p9 = moveBack(p9);
3136 }
3137 c = cAt(p9);
3138 if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
3139 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
3140 continue;
3141 }
3142 }
3143
3144 // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
3145 int p10 = p1;
3146 while (fSpSet->contains(cAt(p10))) {
3147 p10 = moveBack(p10);
3148 }
3149 while (fCloseSet->contains(cAt(p10))) {
3150 p10 = moveBack(p10);
3151 }
3152 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
3153 if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
3154 continue;
3155 }
3156 }
3157
3158 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>
3159 int p11 = p1;
3160 if (fSepSet->contains(cAt(p11))) {
3161 p11 = moveBack(p11);
3162 }
3163 while (fSpSet->contains(cAt(p11))) {
3164 p11 = moveBack(p11);
3165 }
3166 while (fCloseSet->contains(cAt(p11))) {
3167 p11 = moveBack(p11);
3168 }
3169 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
3170 break;
3171 }
3172
3173 // Rule (12) Any x Any
3174 continue;
3175 }
3176 breakPos = p2;
3177 return breakPos;
3178 }
3179
~RBBISentMonkey()3180 RBBISentMonkey::~RBBISentMonkey() {
3181 delete fSets;
3182 delete fSepSet;
3183 delete fFormatSet;
3184 delete fSpSet;
3185 delete fLowerSet;
3186 delete fUpperSet;
3187 delete fOLetterSet;
3188 delete fNumericSet;
3189 delete fATermSet;
3190 delete fSContinueSet;
3191 delete fSTermSet;
3192 delete fCloseSet;
3193 delete fOtherSet;
3194 delete fExtendSet;
3195 }
3196
3197
3198
3199 //-------------------------------------------------------------------------------------------
3200 //
3201 // RBBILineMonkey
3202 //
3203 //-------------------------------------------------------------------------------------------
3204
3205 class RBBILineMonkey: public RBBIMonkeyKind {
3206 public:
3207 RBBILineMonkey();
3208 virtual ~RBBILineMonkey();
3209 virtual UVector *charClasses();
3210 virtual void setText(const UnicodeString &s);
3211 virtual int32_t next(int32_t i);
3212 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
3213 private:
3214 UVector *fSets;
3215
3216 UnicodeSet *fBK;
3217 UnicodeSet *fCR;
3218 UnicodeSet *fLF;
3219 UnicodeSet *fCM;
3220 UnicodeSet *fNL;
3221 UnicodeSet *fSG;
3222 UnicodeSet *fWJ;
3223 UnicodeSet *fZW;
3224 UnicodeSet *fGL;
3225 UnicodeSet *fCB;
3226 UnicodeSet *fSP;
3227 UnicodeSet *fB2;
3228 UnicodeSet *fBA;
3229 UnicodeSet *fBB;
3230 UnicodeSet *fHY;
3231 UnicodeSet *fH2;
3232 UnicodeSet *fH3;
3233 UnicodeSet *fCL;
3234 UnicodeSet *fCP;
3235 UnicodeSet *fEX;
3236 UnicodeSet *fIN;
3237 UnicodeSet *fJL;
3238 UnicodeSet *fJV;
3239 UnicodeSet *fJT;
3240 UnicodeSet *fNS;
3241 UnicodeSet *fOP;
3242 UnicodeSet *fQU;
3243 UnicodeSet *fIS;
3244 UnicodeSet *fNU;
3245 UnicodeSet *fPO;
3246 UnicodeSet *fPR;
3247 UnicodeSet *fSY;
3248 UnicodeSet *fAI;
3249 UnicodeSet *fAL;
3250 UnicodeSet *fID;
3251 UnicodeSet *fSA;
3252 UnicodeSet *fXX;
3253
3254 BreakIterator *fCharBI;
3255
3256 const UnicodeString *fText;
3257 int32_t *fOrigPositions;
3258
3259 RegexMatcher *fNumberMatcher;
3260 RegexMatcher *fLB11Matcher;
3261 };
3262
3263
RBBILineMonkey()3264 RBBILineMonkey::RBBILineMonkey()
3265 {
3266 UErrorCode status = U_ZERO_ERROR;
3267
3268 fSets = new UVector(status);
3269
3270 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
3271 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
3272 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
3273 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
3274 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
3275 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
3276 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
3277 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
3278 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
3279 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
3280 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
3281 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
3282 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
3283 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
3284 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
3285 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
3286 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
3287 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
3288 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
3289 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
3290 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
3291 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
3292 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
3293 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
3294 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
3295 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
3296 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
3297 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
3298 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
3299 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
3300 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
3301 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
3302 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
3303 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
3304 fSA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
3305 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
3306 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
3307
3308 if (U_FAILURE(status)) {
3309 deferredStatus = status;
3310 fCharBI = NULL;
3311 fNumberMatcher = NULL;
3312 return;
3313 }
3314
3315 fAL->addAll(*fXX); // Default behavior for XX is identical to AL
3316 fAL->addAll(*fAI); // Default behavior for AI is identical to AL
3317 fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to AL
3318 fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
3319
3320 fSets->addElement(fBK, status);
3321 fSets->addElement(fCR, status);
3322 fSets->addElement(fLF, status);
3323 fSets->addElement(fCM, status);
3324 fSets->addElement(fNL, status);
3325 fSets->addElement(fWJ, status);
3326 fSets->addElement(fZW, status);
3327 fSets->addElement(fGL, status);
3328 fSets->addElement(fCB, status);
3329 fSets->addElement(fSP, status);
3330 fSets->addElement(fB2, status);
3331 fSets->addElement(fBA, status);
3332 fSets->addElement(fBB, status);
3333 fSets->addElement(fHY, status);
3334 fSets->addElement(fH2, status);
3335 fSets->addElement(fH3, status);
3336 fSets->addElement(fCL, status);
3337 fSets->addElement(fCP, status);
3338 fSets->addElement(fEX, status);
3339 fSets->addElement(fIN, status);
3340 fSets->addElement(fJL, status);
3341 fSets->addElement(fJT, status);
3342 fSets->addElement(fJV, status);
3343 fSets->addElement(fNS, status);
3344 fSets->addElement(fOP, status);
3345 fSets->addElement(fQU, status);
3346 fSets->addElement(fIS, status);
3347 fSets->addElement(fNU, status);
3348 fSets->addElement(fPO, status);
3349 fSets->addElement(fPR, status);
3350 fSets->addElement(fSY, status);
3351 fSets->addElement(fAI, status);
3352 fSets->addElement(fAL, status);
3353 fSets->addElement(fID, status);
3354 fSets->addElement(fWJ, status);
3355 fSets->addElement(fSA, status);
3356 fSets->addElement(fSG, status);
3357
3358 const char *rules =
3359 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
3360 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
3361 "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
3362 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
3363 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
3364 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
3365
3366 fNumberMatcher = new RegexMatcher(
3367 UnicodeString(rules, -1, US_INV), 0, status);
3368
3369 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
3370
3371 if (U_FAILURE(status)) {
3372 deferredStatus = status;
3373 }
3374 }
3375
3376
setText(const UnicodeString & s)3377 void RBBILineMonkey::setText(const UnicodeString &s) {
3378 fText = &s;
3379 fCharBI->setText(s);
3380 fNumberMatcher->reset(s);
3381 }
3382
3383 //
3384 // rule9Adjust
3385 // Line Break TR rules 9 and 10 implementation.
3386 // This deals with combining marks and other sequences that
3387 // that must be treated as if they were something other than what they actually are.
3388 //
3389 // This is factored out into a separate function because it must be applied twice for
3390 // each potential break, once to the chars before the position being checked, then
3391 // again to the text following the possible break.
3392 //
rule9Adjust(int32_t pos,UChar32 * posChar,int32_t * nextPos,UChar32 * nextChar)3393 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
3394 if (pos == -1) {
3395 // Invalid initial position. Happens during the warmup iteration of the
3396 // main loop in next().
3397 return;
3398 }
3399
3400 int32_t nPos = *nextPos;
3401
3402 // LB 9 Keep combining sequences together.
3403 // advance over any CM class chars. Note that Line Break CM is different
3404 // from the normal Grapheme Extend property.
3405 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
3406 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
3407 for (;;) {
3408 *nextChar = fText->char32At(nPos);
3409 if (!fCM->contains(*nextChar)) {
3410 break;
3411 }
3412 nPos = fText->moveIndex32(nPos, 1);
3413 }
3414 }
3415
3416
3417 // LB 9 Treat X CM* as if it were x.
3418 // No explicit action required.
3419
3420 // LB 10 Treat any remaining combining mark as AL
3421 if (fCM->contains(*posChar)) {
3422 *posChar = 0x41; // thisChar = 'A';
3423 }
3424
3425 // Push the updated nextPos and nextChar back to our caller.
3426 // This only makes a difference if posChar got bigger by consuming a
3427 // combining sequence.
3428 *nextPos = nPos;
3429 *nextChar = fText->char32At(nPos);
3430 }
3431
3432
3433
next(int32_t startPos)3434 int32_t RBBILineMonkey::next(int32_t startPos) {
3435 UErrorCode status = U_ZERO_ERROR;
3436 int32_t pos; // Index of the char following a potential break position
3437 UChar32 thisChar; // Character at above position "pos"
3438
3439 int32_t prevPos; // Index of the char preceding a potential break position
3440 UChar32 prevChar; // Character at above position. Note that prevChar
3441 // and thisChar may not be adjacent because combining
3442 // characters between them will be ignored.
3443
3444 int32_t nextPos; // Index of the next character following pos.
3445 // Usually skips over combining marks.
3446 int32_t nextCPPos; // Index of the code point following "pos."
3447 // May point to a combining mark.
3448 int32_t tPos; // temp value.
3449 UChar32 c;
3450
3451 if (U_FAILURE(deferredStatus)) {
3452 return -1;
3453 }
3454
3455 if (startPos >= fText->length()) {
3456 return -1;
3457 }
3458
3459
3460 // Initial values for loop. Loop will run the first time without finding breaks,
3461 // while the invalid values shift out and the "this" and
3462 // "prev" positions are filled in with good values.
3463 pos = prevPos = -1; // Invalid value, serves as flag for initial loop iteration.
3464 thisChar = prevChar = 0;
3465 nextPos = nextCPPos = startPos;
3466
3467
3468 // Loop runs once per position in the test text, until a break position
3469 // is found.
3470 for (;;) {
3471 prevPos = pos;
3472 prevChar = thisChar;
3473
3474 pos = nextPos;
3475 thisChar = fText->char32At(pos);
3476
3477 nextCPPos = fText->moveIndex32(pos, 1);
3478 nextPos = nextCPPos;
3479
3480 // Rule LB2 - Break at end of text.
3481 if (pos >= fText->length()) {
3482 break;
3483 }
3484
3485 // Rule LB 9 - adjust for combining sequences.
3486 // We do this one out-of-order because the adjustment does not change anything
3487 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3488 // be applied.
3489 rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
3490 nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3491 c = fText->char32At(nextPos);
3492 rule9Adjust(pos, &thisChar, &nextPos, &c);
3493
3494 // If the loop is still warming up - if we haven't shifted the initial
3495 // -1 positions out of prevPos yet - loop back to advance the
3496 // position in the input without any further looking for breaks.
3497 if (prevPos == -1) {
3498 continue;
3499 }
3500
3501 // LB 4 Always break after hard line breaks,
3502 if (fBK->contains(prevChar)) {
3503 break;
3504 }
3505
3506 // LB 5 Break after CR, LF, NL, but not inside CR LF
3507 if (prevChar == 0x0d && thisChar == 0x0a) {
3508 continue;
3509 }
3510 if (prevChar == 0x0d ||
3511 prevChar == 0x0a ||
3512 prevChar == 0x85) {
3513 break;
3514 }
3515
3516 // LB 6 Don't break before hard line breaks
3517 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3518 fBK->contains(thisChar)) {
3519 continue;
3520 }
3521
3522
3523 // LB 7 Don't break before spaces or zero-width space.
3524 if (fSP->contains(thisChar)) {
3525 continue;
3526 }
3527
3528 if (fZW->contains(thisChar)) {
3529 continue;
3530 }
3531
3532 // LB 8 Break after zero width space
3533 if (fZW->contains(prevChar)) {
3534 break;
3535 }
3536
3537 // LB 9, 10 Already done, at top of loop.
3538 //
3539
3540
3541 // LB 11 Do not break before or after WORD JOINER and related characters.
3542 // x WJ
3543 // WJ x
3544 //
3545 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3546 continue;
3547 }
3548
3549 // LB 12
3550 // GL x
3551 if (fGL->contains(prevChar)) {
3552 continue;
3553 }
3554
3555 // LB 12a
3556 // [^SP BA HY] x GL
3557 if (!(fSP->contains(prevChar) ||
3558 fBA->contains(prevChar) ||
3559 fHY->contains(prevChar) ) && fGL->contains(thisChar)) {
3560 continue;
3561 }
3562
3563
3564
3565 // LB 13 Don't break before closings.
3566 // NU x CL, NU x CP and NU x IS are not matched here so that they will
3567 // fall into LB 17 and the more general number regular expression.
3568 //
3569 if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
3570 (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
3571 fEX->contains(thisChar) ||
3572 (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
3573 (!fNU->contains(prevChar) && fSY->contains(thisChar))) {
3574 continue;
3575 }
3576
3577 // LB 14 Don't break after OP SP*
3578 // Scan backwards, checking for this sequence.
3579 // The OP char could include combining marks, so we actually check for
3580 // OP CM* SP*
3581 // Another Twist: The Rule 67 fixes may have changed a SP CM
3582 // sequence into a ID char, so before scanning back through spaces,
3583 // verify that prevChar is indeed a space. The prevChar variable
3584 // may differ from fText[prevPos]
3585 tPos = prevPos;
3586 if (fSP->contains(prevChar)) {
3587 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3588 tPos=fText->moveIndex32(tPos, -1);
3589 }
3590 }
3591 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3592 tPos=fText->moveIndex32(tPos, -1);
3593 }
3594 if (fOP->contains(fText->char32At(tPos))) {
3595 continue;
3596 }
3597
3598
3599 // LB 15 QU SP* x OP
3600 if (fOP->contains(thisChar)) {
3601 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3602 int tPos = prevPos;
3603 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3604 tPos = fText->moveIndex32(tPos, -1);
3605 }
3606 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3607 tPos = fText->moveIndex32(tPos, -1);
3608 }
3609 if (fQU->contains(fText->char32At(tPos))) {
3610 continue;
3611 }
3612 }
3613
3614
3615
3616 // LB 16 (CL | CP) SP* x NS
3617 // Scan backwards for SP* CM* (CL | CP)
3618 if (fNS->contains(thisChar)) {
3619 int tPos = prevPos;
3620 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3621 tPos = fText->moveIndex32(tPos, -1);
3622 }
3623 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3624 tPos = fText->moveIndex32(tPos, -1);
3625 }
3626 if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3627 continue;
3628 }
3629 }
3630
3631
3632 // LB 17 B2 SP* x B2
3633 if (fB2->contains(thisChar)) {
3634 // Scan backwards, checking for the B2 CM* SP* sequence.
3635 tPos = prevPos;
3636 if (fSP->contains(prevChar)) {
3637 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3638 tPos=fText->moveIndex32(tPos, -1);
3639 }
3640 }
3641 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3642 tPos=fText->moveIndex32(tPos, -1);
3643 }
3644 if (fB2->contains(fText->char32At(tPos))) {
3645 continue;
3646 }
3647 }
3648
3649
3650 // LB 18 break after space
3651 if (fSP->contains(prevChar)) {
3652 break;
3653 }
3654
3655 // LB 19
3656 // x QU
3657 // QU x
3658 if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3659 continue;
3660 }
3661
3662 // LB 20 Break around a CB
3663 if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3664 break;
3665 }
3666
3667 // LB 21
3668 if (fBA->contains(thisChar) ||
3669 fHY->contains(thisChar) ||
3670 fNS->contains(thisChar) ||
3671 fBB->contains(prevChar) ) {
3672 continue;
3673 }
3674
3675 // LB 22
3676 if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3677 (fID->contains(prevChar) && fIN->contains(thisChar)) ||
3678 (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3679 (fNU->contains(prevChar) && fIN->contains(thisChar)) ) {
3680 continue;
3681 }
3682
3683
3684 // LB 23 ID x PO
3685 // AL x NU
3686 // NU x AL
3687 if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
3688 (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
3689 (fNU->contains(prevChar) && fAL->contains(thisChar)) ) {
3690 continue;
3691 }
3692
3693 // LB 24 Do not break between prefix and letters or ideographs.
3694 // PR x ID
3695 // PR x AL
3696 // PO x AL
3697 if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
3698 (fPR->contains(prevChar) && fAL->contains(thisChar)) ||
3699 (fPO->contains(prevChar) && fAL->contains(thisChar)) ) {
3700 continue;
3701 }
3702
3703
3704
3705 // LB 25 Numbers
3706 if (fNumberMatcher->lookingAt(prevPos, status)) {
3707 if (U_FAILURE(status)) {
3708 break;
3709 }
3710 // Matched a number. But could have been just a single digit, which would
3711 // not represent a "no break here" between prevChar and thisChar
3712 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
3713 if (numEndIdx > pos) {
3714 // Number match includes at least our two chars being checked
3715 if (numEndIdx > nextPos) {
3716 // Number match includes additional chars. Update pos and nextPos
3717 // so that next loop iteration will continue at the end of the number,
3718 // checking for breaks between last char in number & whatever follows.
3719 pos = nextPos = numEndIdx;
3720 do {
3721 pos = fText->moveIndex32(pos, -1);
3722 thisChar = fText->char32At(pos);
3723 } while (fCM->contains(thisChar));
3724 }
3725 continue;
3726 }
3727 }
3728
3729
3730 // LB 26 Do not break a Korean syllable.
3731 if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3732 fJV->contains(thisChar) ||
3733 fH2->contains(thisChar) ||
3734 fH3->contains(thisChar))) {
3735 continue;
3736 }
3737
3738 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) &&
3739 (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3740 continue;
3741 }
3742
3743 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3744 fJT->contains(thisChar)) {
3745 continue;
3746 }
3747
3748 // LB 27 Treat a Korean Syllable Block the same as ID.
3749 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3750 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3751 fIN->contains(thisChar)) {
3752 continue;
3753 }
3754 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3755 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3756 fPO->contains(thisChar)) {
3757 continue;
3758 }
3759 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3760 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3761 continue;
3762 }
3763
3764
3765
3766 // LB 28 Do not break between alphabetics ("at").
3767 if (fAL->contains(prevChar) && fAL->contains(thisChar)) {
3768 continue;
3769 }
3770
3771 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
3772 if (fIS->contains(prevChar) && fAL->contains(thisChar)) {
3773 continue;
3774 }
3775
3776 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3777 // (AL | NU) x OP
3778 // CP x (AL | NU)
3779 if ((fAL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3780 continue;
3781 }
3782 if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fNU->contains(thisChar))) {
3783 continue;
3784 }
3785
3786 // LB 31 Break everywhere else
3787 break;
3788
3789 }
3790
3791 return pos;
3792 }
3793
3794
charClasses()3795 UVector *RBBILineMonkey::charClasses() {
3796 return fSets;
3797 }
3798
3799
~RBBILineMonkey()3800 RBBILineMonkey::~RBBILineMonkey() {
3801 delete fSets;
3802
3803 delete fBK;
3804 delete fCR;
3805 delete fLF;
3806 delete fCM;
3807 delete fNL;
3808 delete fWJ;
3809 delete fZW;
3810 delete fGL;
3811 delete fCB;
3812 delete fSP;
3813 delete fB2;
3814 delete fBA;
3815 delete fBB;
3816 delete fHY;
3817 delete fH2;
3818 delete fH3;
3819 delete fCL;
3820 delete fCP;
3821 delete fEX;
3822 delete fIN;
3823 delete fJL;
3824 delete fJV;
3825 delete fJT;
3826 delete fNS;
3827 delete fOP;
3828 delete fQU;
3829 delete fIS;
3830 delete fNU;
3831 delete fPO;
3832 delete fPR;
3833 delete fSY;
3834 delete fAI;
3835 delete fAL;
3836 delete fID;
3837 delete fSA;
3838 delete fSG;
3839 delete fXX;
3840
3841 delete fCharBI;
3842 delete fNumberMatcher;
3843 }
3844
3845
//-------------------------------------------------------------------------------------------
//
//  TestMonkey
//
//     params
//       seed=nnnnn        Random number starting seed.
//                         Setting the seed allows errors to be reproduced.
//       loop=nnn          Looping count.  Controls running time.
//                         -1:  run forever.
//                          0 or greater:  run length.
//
//       type = char | word | line | sent | title
//                         Break iterator type to test.  When omitted, the char, word,
//                         line and sent tests are all run.  "title" is accepted by the
//                         parameter parser, but no title test is currently run.
//
//       utext             Feed the test text to the break iterator through a UText
//                         instead of directly as a UnicodeString.
//
//-------------------------------------------------------------------------------------------
3860
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)3861 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) {
3862 int32_t val = defaultVal;
3863 name.append(" *= *(-?\\d+)");
3864 UErrorCode status = U_ZERO_ERROR;
3865 RegexMatcher m(name, params, 0, status);
3866 if (m.find()) {
3867 // The param exists. Convert the string to an int.
3868 char valString[100];
3869 int32_t paramLength = m.end(1, status) - m.start(1, status);
3870 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3871 paramLength = (int32_t)(sizeof(valString)-2);
3872 }
3873 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3874 val = strtol(valString, NULL, 10);
3875
3876 // Delete this parameter from the params string.
3877 m.reset();
3878 params = m.replaceFirst("", status);
3879 }
3880 U_ASSERT(U_SUCCESS(status));
3881 return val;
3882 }
3883 #endif
3884
testBreakBoundPreceding(RBBITest * test,UnicodeString ustr,BreakIterator * bi,int expected[],int expectedcount)3885 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3886 BreakIterator *bi,
3887 int expected[],
3888 int expectedcount)
3889 {
3890 int count = 0;
3891 int i = 0;
3892 int forward[50];
3893 bi->setText(ustr);
3894 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3895 forward[count] = i;
3896 if (count < expectedcount && expected[count] != i) {
3897 test->errln("break forward test failed: expected %d but got %d",
3898 expected[count], i);
3899 break;
3900 }
3901 count ++;
3902 }
3903 if (count != expectedcount) {
3904 printStringBreaks(ustr, expected, expectedcount);
3905 test->errln("break forward test failed: missed %d match",
3906 expectedcount - count);
3907 return;
3908 }
3909 // testing boundaries
3910 for (i = 1; i < expectedcount; i ++) {
3911 int j = expected[i - 1];
3912 if (!bi->isBoundary(j)) {
3913 printStringBreaks(ustr, expected, expectedcount);
3914 test->errln("isBoundary() failed. Expected boundary at position %d", j);
3915 return;
3916 }
3917 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3918 if (bi->isBoundary(j)) {
3919 printStringBreaks(ustr, expected, expectedcount);
3920 test->errln("isBoundary() failed. Not expecting boundary at position %d", j);
3921 return;
3922 }
3923 }
3924 }
3925
3926 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3927 count --;
3928 if (forward[count] != i) {
3929 test->errln("happy break test previous() failed: expected %d but got %d",
3930 forward[count], i);
3931 break;
3932 }
3933 }
3934 if (count != 0) {
3935 printStringBreaks(ustr, expected, expectedcount);
3936 test->errln("break test previous() failed: missed a match");
3937 return;
3938 }
3939
3940 // testing preceding
3941 for (i = 0; i < expectedcount - 1; i ++) {
3942 // int j = expected[i] + 1;
3943 int j = ustr.moveIndex32(expected[i], 1);
3944 for (; j <= expected[i + 1]; j ++) {
3945 if (bi->preceding(j) != expected[i]) {
3946 printStringBreaks(ustr, expected, expectedcount);
3947 test->errln("preceding(): Not expecting boundary at position %d", j);
3948 return;
3949 }
3950 }
3951 }
3952 }
3953
TestWordBreaks(void)3954 void RBBITest::TestWordBreaks(void)
3955 {
3956 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3957
3958 Locale locale("en");
3959 UErrorCode status = U_ZERO_ERROR;
3960 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3961 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3962 static const char *strlist[] =
3963 {
3964 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3965 "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
3966 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3967 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3968 "\\u90ca\\u3588\\u009c\\u0953\\u194b",
3969 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3970 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3971 "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
3972 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3973 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3974 "\\u2027\\U000e0067\\u0a47\\u00b7",
3975 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3976 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3977 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3978 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3979 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3980 "\\u0027\\u11af\\U000e0057\\u0602",
3981 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3982 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3983 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3984 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3985 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3986 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3987 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3988 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3989 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3990 "\\u58f4\\U000e0049\\u20e7\\u2027",
3991 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3992 "\\ua183\\u102d\\u0bec\\u003a",
3993 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3994 "\\u003a\\u0e57\\u0fad\\u002e",
3995 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3996 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3997 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3998 "\\u003a\\u0664\\u00b7\\u1fba",
3999 "\\u003b\\u0027\\u00b7\\u47a3",
4000 "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
4001 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
4002 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
4003 };
4004 int loop;
4005 if (U_FAILURE(status)) {
4006 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
4007 return;
4008 }
4009 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
4010 // printf("looping %d\n", loop);
4011 UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
4012 // RBBICharMonkey monkey;
4013 RBBIWordMonkey monkey;
4014
4015 int expected[50];
4016 int expectedcount = 0;
4017
4018 monkey.setText(ustr);
4019 int i;
4020 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4021 expected[expectedcount ++] = i;
4022 }
4023
4024 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4025 }
4026 delete bi;
4027 #endif
4028 }
4029
TestWordBoundary(void)4030 void RBBITest::TestWordBoundary(void)
4031 {
4032 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
4033 Locale locale("en");
4034 UErrorCode status = U_ZERO_ERROR;
4035 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
4036 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
4037 UChar str[50];
4038 static const char *strlist[] =
4039 {
4040 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
4041 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
4042 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
4043 "\\u2027\\U000e0067\\u0a47\\u00b7",
4044 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
4045 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
4046 "\\u0589\\U000e006e\\u0a42\\U000104a5",
4047 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
4048 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
4049 "\\u0027\\u11af\\U000e0057\\u0602",
4050 "\\U0001d7f2\\U000e007\\u0004\\u0589",
4051 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
4052 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
4053 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
4054 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
4055 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
4056 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
4057 "\\u0233\\U000e0020\\u0a69\\u0d6a",
4058 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
4059 "\\u58f4\\U000e0049\\u20e7\\u2027",
4060 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
4061 "\\ua183\\u102d\\u0bec\\u003a",
4062 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
4063 "\\u003a\\u0e57\\u0fad\\u002e",
4064 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
4065 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
4066 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
4067 "\\u003a\\u0664\\u00b7\\u1fba",
4068 "\\u003b\\u0027\\u00b7\\u47a3",
4069 };
4070 int loop;
4071 if (U_FAILURE(status)) {
4072 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
4073 return;
4074 }
4075 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
4076 // printf("looping %d\n", loop);
4077 u_unescape(strlist[loop], str, 20);
4078 UnicodeString ustr(str);
4079 int forward[50];
4080 int count = 0;
4081
4082 bi->setText(ustr);
4083 int prev = 0;
4084 int i;
4085 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
4086 forward[count ++] = i;
4087 if (i > prev) {
4088 int j;
4089 for (j = prev + 1; j < i; j ++) {
4090 if (bi->isBoundary(j)) {
4091 printStringBreaks(ustr, forward, count);
4092 errln("happy boundary test failed: expected %d not a boundary",
4093 j);
4094 return;
4095 }
4096 }
4097 }
4098 if (!bi->isBoundary(i)) {
4099 printStringBreaks(ustr, forward, count);
4100 errln("happy boundary test failed: expected %d a boundary",
4101 i);
4102 return;
4103 }
4104 prev = i;
4105 }
4106 }
4107 delete bi;
4108 }
4109
TestLineBreaks(void)4110 void RBBITest::TestLineBreaks(void)
4111 {
4112 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4113 Locale locale("en");
4114 UErrorCode status = U_ZERO_ERROR;
4115 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
4116 const int32_t STRSIZE = 50;
4117 UChar str[STRSIZE];
4118 static const char *strlist[] =
4119 {
4120 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
4121 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
4122 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
4123 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
4124 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
4125 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
4126 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
4127 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
4128 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
4129 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
4130 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
4131 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
4132 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
4133 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
4134 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
4135 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
4136 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
4137 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
4138 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
4139 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
4140 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
4141 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
4142 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
4143 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
4144 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
4145 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
4146 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
4147 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
4148 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
4149 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
4150 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
4151 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
4152 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
4153 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
4154 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
4155 "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
4156 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
4157 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
4158 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
4159 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
4160 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
4161 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
4162 "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
4163 "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
4164 "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
4165 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
4166 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
4167 };
4168 int loop;
4169 TEST_ASSERT_SUCCESS(status);
4170 if (U_FAILURE(status)) {
4171 return;
4172 }
4173 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
4174 // printf("looping %d\n", loop);
4175 int32_t t = u_unescape(strlist[loop], str, STRSIZE);
4176 if (t >= STRSIZE) {
4177 TEST_ASSERT(FALSE);
4178 continue;
4179 }
4180
4181
4182 UnicodeString ustr(str);
4183 RBBILineMonkey monkey;
4184 if (U_FAILURE(monkey.deferredStatus)) {
4185 continue;
4186 }
4187
4188 const int EXPECTEDSIZE = 50;
4189 int expected[EXPECTEDSIZE];
4190 int expectedcount = 0;
4191
4192 monkey.setText(ustr);
4193 int i;
4194 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4195 if (expectedcount >= EXPECTEDSIZE) {
4196 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4197 return;
4198 }
4199 expected[expectedcount ++] = i;
4200 }
4201
4202 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4203 }
4204 delete bi;
4205 #endif
4206 }
4207
TestSentBreaks(void)4208 void RBBITest::TestSentBreaks(void)
4209 {
4210 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4211 Locale locale("en");
4212 UErrorCode status = U_ZERO_ERROR;
4213 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4214 UChar str[200];
4215 static const char *strlist[] =
4216 {
4217 "Now\ris\nthe\r\ntime\n\rfor\r\r",
4218 "This\n",
4219 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
4220 "\"Sentence ending with a quote.\" Bye.",
4221 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
4222 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
4223 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
4224 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
4225 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
4226 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
4227 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
4228 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
4229 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
4230 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
4231 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
4232 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
4233 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
4234 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
4235 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
4236 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
4237 };
4238 int loop;
4239 if (U_FAILURE(status)) {
4240 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
4241 return;
4242 }
4243 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
4244 u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
4245 UnicodeString ustr(str);
4246
4247 RBBISentMonkey monkey;
4248 if (U_FAILURE(monkey.deferredStatus)) {
4249 continue;
4250 }
4251
4252 const int EXPECTEDSIZE = 50;
4253 int expected[EXPECTEDSIZE];
4254 int expectedcount = 0;
4255
4256 monkey.setText(ustr);
4257 int i;
4258 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4259 if (expectedcount >= EXPECTEDSIZE) {
4260 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4261 return;
4262 }
4263 expected[expectedcount ++] = i;
4264 }
4265
4266 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4267 }
4268 delete bi;
4269 #endif
4270 }
4271
TestMonkey(char * params)4272 void RBBITest::TestMonkey(char *params) {
4273 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4274
4275 UErrorCode status = U_ZERO_ERROR;
4276 int32_t loopCount = 500;
4277 int32_t seed = 1;
4278 UnicodeString breakType = "all";
4279 Locale locale("en");
4280 UBool useUText = FALSE;
4281
4282 if (quick == FALSE) {
4283 loopCount = 10000;
4284 }
4285
4286 if (params) {
4287 UnicodeString p(params);
4288 loopCount = getIntParam("loop", p, loopCount);
4289 seed = getIntParam("seed", p, seed);
4290
4291 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
4292 if (m.find()) {
4293 breakType = m.group(1, status);
4294 m.reset();
4295 p = m.replaceFirst("", status);
4296 }
4297
4298 RegexMatcher u(" *utext", p, 0, status);
4299 if (u.find()) {
4300 useUText = TRUE;
4301 u.reset();
4302 p = u.replaceFirst("", status);
4303 }
4304
4305
4306 // m.reset(p);
4307 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
4308 // Each option is stripped out of the option string as it is processed.
4309 // All options have been checked. The option string should have been completely emptied..
4310 char buf[100];
4311 p.extract(buf, sizeof(buf), NULL, status);
4312 buf[sizeof(buf)-1] = 0;
4313 errln("Unrecognized or extra parameter: %s\n", buf);
4314 return;
4315 }
4316
4317 }
4318
4319 if (breakType == "char" || breakType == "all") {
4320 RBBICharMonkey m;
4321 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
4322 if (U_SUCCESS(status)) {
4323 RunMonkey(bi, m, "char", seed, loopCount, useUText);
4324 if (breakType == "all" && useUText==FALSE) {
4325 // Also run a quick test with UText when "all" is specified
4326 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
4327 }
4328 }
4329 else {
4330 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
4331 }
4332 delete bi;
4333 }
4334
4335 if (breakType == "word" || breakType == "all") {
4336 logln("Word Break Monkey Test");
4337 RBBIWordMonkey m;
4338 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
4339 if (U_SUCCESS(status)) {
4340 RunMonkey(bi, m, "word", seed, loopCount, useUText);
4341 }
4342 else {
4343 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
4344 }
4345 delete bi;
4346 }
4347
4348 if (breakType == "line" || breakType == "all") {
4349 logln("Line Break Monkey Test");
4350 RBBILineMonkey m;
4351 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
4352 if (loopCount >= 10) {
4353 loopCount = loopCount / 5; // Line break runs slower than the others.
4354 }
4355 if (U_SUCCESS(status)) {
4356 RunMonkey(bi, m, "line", seed, loopCount, useUText);
4357 }
4358 else {
4359 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4360 }
4361 delete bi;
4362 }
4363
4364 if (breakType == "sent" || breakType == "all" ) {
4365 logln("Sentence Break Monkey Test");
4366 RBBISentMonkey m;
4367 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4368 if (loopCount >= 10) {
4369 loopCount = loopCount / 10; // Sentence runs slower than the other break types
4370 }
4371 if (U_SUCCESS(status)) {
4372 RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
4373 }
4374 else {
4375 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4376 }
4377 delete bi;
4378 }
4379
4380 #endif
4381 }
4382
4383 //
4384 // Run a RBBI monkey test. Common routine, for all break iterator types.
4385 // Parameters:
4386 // bi - the break iterator to use
4387 // mk - MonkeyKind, abstraction for obtaining expected results
4388 // name - Name of test (char, word, etc.) for use in error messages
4389 // seed - Seed for starting random number generator (parameter from user)
4390 // numIterations
4391 //
RunMonkey(BreakIterator * bi,RBBIMonkeyKind & mk,const char * name,uint32_t seed,int32_t numIterations,UBool useUText)4392 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
4393 int32_t numIterations, UBool useUText) {
4394
4395 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4396
4397 const int32_t TESTSTRINGLEN = 500;
4398 UnicodeString testText;
4399 int32_t numCharClasses;
4400 UVector *chClasses;
4401 int expected[TESTSTRINGLEN*2 + 1];
4402 int expectedCount = 0;
4403 char expectedBreaks[TESTSTRINGLEN*2 + 1];
4404 char forwardBreaks[TESTSTRINGLEN*2 + 1];
4405 char reverseBreaks[TESTSTRINGLEN*2+1];
4406 char isBoundaryBreaks[TESTSTRINGLEN*2+1];
4407 char followingBreaks[TESTSTRINGLEN*2+1];
4408 char precedingBreaks[TESTSTRINGLEN*2+1];
4409 int i;
4410 int loopCount = 0;
4411
4412 m_seed = seed;
4413
4414 numCharClasses = mk.charClasses()->size();
4415 chClasses = mk.charClasses();
4416
4417 // Check for errors that occured during the construction of the MonkeyKind object.
4418 // Can't report them where they occured because errln() is a method coming from intlTest,
4419 // and is not visible outside of RBBITest :-(
4420 if (U_FAILURE(mk.deferredStatus)) {
4421 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4422 return;
4423 }
4424
4425 // Verify that the character classes all have at least one member.
4426 for (i=0; i<numCharClasses; i++) {
4427 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4428 if (s == NULL || s->size() == 0) {
4429 errln("Character Class #%d is null or of zero size.", i);
4430 return;
4431 }
4432 }
4433
4434 while (loopCount < numIterations || numIterations == -1) {
4435 if (numIterations == -1 && loopCount % 10 == 0) {
4436 // If test is running in an infinite loop, display a periodic tic so
4437 // we can tell that it is making progress.
4438 fprintf(stderr, ".");
4439 }
4440 // Save current random number seed, so that we can recreate the random numbers
4441 // for this loop iteration in event of an error.
4442 seed = m_seed;
4443
4444 // Populate a test string with data.
4445 testText.truncate(0);
4446 for (i=0; i<TESTSTRINGLEN; i++) {
4447 int32_t aClassNum = m_rand() % numCharClasses;
4448 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4449 int32_t charIdx = m_rand() % classSet->size();
4450 UChar32 c = classSet->charAt(charIdx);
4451 if (c < 0) { // TODO: deal with sets containing strings.
4452 errln("c < 0");
4453 break;
4454 }
4455 testText.append(c);
4456 }
4457
4458 // Calculate the expected results for this test string.
4459 mk.setText(testText);
4460 memset(expectedBreaks, 0, sizeof(expectedBreaks));
4461 expectedBreaks[0] = 1;
4462 int32_t breakPos = 0;
4463 expectedCount = 0;
4464 for (;;) {
4465 breakPos = mk.next(breakPos);
4466 if (breakPos == -1) {
4467 break;
4468 }
4469 if (breakPos > testText.length()) {
4470 errln("breakPos > testText.length()");
4471 }
4472 expectedBreaks[breakPos] = 1;
4473 U_ASSERT(expectedCount<testText.length());
4474 expected[expectedCount ++] = breakPos;
4475 }
4476
4477 // Find the break positions using forward iteration
4478 memset(forwardBreaks, 0, sizeof(forwardBreaks));
4479 if (useUText) {
4480 UErrorCode status = U_ZERO_ERROR;
4481 UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4482 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4483 bi->setText(testUText, status);
4484 TEST_ASSERT_SUCCESS(status);
4485 utext_close(testUText); // The break iterator does a shallow clone of the UText
4486 // This UText can be closed immediately, so long as the
4487 // testText string continues to exist.
4488 } else {
4489 bi->setText(testText);
4490 }
4491
4492 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4493 if (i < 0 || i > testText.length()) {
4494 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4495 break;
4496 }
4497 forwardBreaks[i] = 1;
4498 }
4499
4500 // Find the break positions using reverse iteration
4501 memset(reverseBreaks, 0, sizeof(reverseBreaks));
4502 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4503 if (i < 0 || i > testText.length()) {
4504 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4505 break;
4506 }
4507 reverseBreaks[i] = 1;
4508 }
4509
4510 // Find the break positions using isBoundary() tests.
4511 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4512 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4513 for (i=0; i<=testText.length(); i++) {
4514 isBoundaryBreaks[i] = bi->isBoundary(i);
4515 }
4516
4517
4518 // Find the break positions using the following() function.
4519 // printf(".");
4520 memset(followingBreaks, 0, sizeof(followingBreaks));
4521 int32_t lastBreakPos = 0;
4522 followingBreaks[0] = 1;
4523 for (i=0; i<testText.length(); i++) {
4524 breakPos = bi->following(i);
4525 if (breakPos <= i ||
4526 breakPos < lastBreakPos ||
4527 breakPos > testText.length() ||
4528 (breakPos > lastBreakPos && lastBreakPos > i)) {
4529 errln("%s break monkey test: "
4530 "Out of range value returned by BreakIterator::following().\n"
4531 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4532 name, seed, i, breakPos, lastBreakPos);
4533 break;
4534 }
4535 followingBreaks[breakPos] = 1;
4536 lastBreakPos = breakPos;
4537 }
4538
4539 // Find the break positions using the preceding() function.
4540 memset(precedingBreaks, 0, sizeof(precedingBreaks));
4541 lastBreakPos = testText.length();
4542 precedingBreaks[testText.length()] = 1;
4543 for (i=testText.length(); i>0; i--) {
4544 breakPos = bi->preceding(i);
4545 if (breakPos >= i ||
4546 breakPos > lastBreakPos ||
4547 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4548 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4549 errln("%s break monkey test: "
4550 "Out of range value returned by BreakIterator::preceding().\n"
4551 "index=%d; prev returned %d; lastBreak=%d" ,
4552 name, i, breakPos, lastBreakPos);
4553 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4554 precedingBreaks[i] = 2; // Forces an error.
4555 }
4556 } else {
4557 if (breakPos >= 0) {
4558 precedingBreaks[breakPos] = 1;
4559 }
4560 lastBreakPos = breakPos;
4561 }
4562 }
4563
4564 // Compare the expected and actual results.
4565 for (i=0; i<=testText.length(); i++) {
4566 const char *errorType = NULL;
4567 if (forwardBreaks[i] != expectedBreaks[i]) {
4568 errorType = "next()";
4569 } else if (reverseBreaks[i] != forwardBreaks[i]) {
4570 errorType = "previous()";
4571 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4572 errorType = "isBoundary()";
4573 } else if (followingBreaks[i] != expectedBreaks[i]) {
4574 errorType = "following()";
4575 } else if (precedingBreaks[i] != expectedBreaks[i]) {
4576 errorType = "preceding()";
4577 }
4578
4579
4580 if (errorType != NULL) {
4581 // Format a range of the test text that includes the failure as
4582 // a data item that can be included in the rbbi test data file.
4583
4584 // Start of the range is the last point where expected and actual results
4585 // both agreed that there was a break position.
4586 int startContext = i;
4587 int32_t count = 0;
4588 for (;;) {
4589 if (startContext==0) { break; }
4590 startContext --;
4591 if (expectedBreaks[startContext] != 0) {
4592 if (count == 2) break;
4593 count ++;
4594 }
4595 }
4596
4597 // End of range is two expected breaks past the start position.
4598 int endContext = i + 1;
4599 int ci;
4600 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
4601 for (;;) {
4602 if (endContext >= testText.length()) {break;}
4603 if (expectedBreaks[endContext-1] != 0) {
4604 if (count == 0) break;
4605 count --;
4606 }
4607 endContext ++;
4608 }
4609 }
4610
4611 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4612 UnicodeString errorText = "<data>";
4613 /***if (strcmp(errorType, "next()") == 0) {
4614 startContext = 0;
4615 endContext = testText.length();
4616
4617 printStringBreaks(testText, expected, expectedCount);
4618 }***/
4619
4620 for (ci=startContext; ci<endContext;) {
4621 UnicodeString hexChars("0123456789abcdef");
4622 UChar32 c;
4623 int bn;
4624 c = testText.char32At(ci);
4625 if (ci == i) {
4626 // This is the location of the error.
4627 errorText.append("<?>");
4628 } else if (expectedBreaks[ci] != 0) {
4629 // This a non-error expected break position.
4630 errorText.append("\\");
4631 }
4632 if (c < 0x10000) {
4633 errorText.append("\\u");
4634 for (bn=12; bn>=0; bn-=4) {
4635 errorText.append(hexChars.charAt((c>>bn)&0xf));
4636 }
4637 } else {
4638 errorText.append("\\U");
4639 for (bn=28; bn>=0; bn-=4) {
4640 errorText.append(hexChars.charAt((c>>bn)&0xf));
4641 }
4642 }
4643 ci = testText.moveIndex32(ci, 1);
4644 }
4645 errorText.append("\\");
4646 errorText.append("</data>\n");
4647
4648 // Output the error
4649 char charErrorTxt[500];
4650 UErrorCode status = U_ZERO_ERROR;
4651 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4652 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4653 errln("%s break monkey test error. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
4654 name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4655 errorType, seed, i, charErrorTxt);
4656 break;
4657 }
4658 }
4659
4660 loopCount++;
4661 }
4662 #endif
4663 }
4664
4665
4666 // Bug 5532. UTF-8 based UText fails in dictionary code.
4667 // This test checks the initial patch,
4668 // which is to just keep it from crashing. Correct word boundaries
4669 // await a proper fix to the dictionary code.
4670 //
TestBug5532(void)4671 void RBBITest::TestBug5532(void) {
4672 // Text includes a mixture of Thai and Latin.
4673 const unsigned char utf8Data[] = {
4674 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4675 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4676 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4677 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4678 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4679 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4680 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4681 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4682 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4683 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4684 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4685
4686 UErrorCode status = U_ZERO_ERROR;
4687 UText utext=UTEXT_INITIALIZER;
4688 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4689 TEST_ASSERT_SUCCESS(status);
4690
4691 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4692 TEST_ASSERT_SUCCESS(status);
4693 if (U_SUCCESS(status)) {
4694 bi->setText(&utext, status);
4695 TEST_ASSERT_SUCCESS(status);
4696
4697 int32_t breakCount = 0;
4698 int32_t previousBreak = -1;
4699 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4700 // For now, just make sure that the break iterator doesn't hang.
4701 TEST_ASSERT(previousBreak < bi->current());
4702 previousBreak = bi->current();
4703 }
4704 TEST_ASSERT(breakCount > 0);
4705 }
4706 delete bi;
4707 utext_close(&utext);
4708 }
4709
4710
4711 //
4712 // TestDebug - A place-holder test for debugging purposes.
4713 // For putting in fragments of other tests that can be invoked
4714 // for tracing without a lot of unwanted extra stuff happening.
4715 //
TestDebug(void)4716 void RBBITest::TestDebug(void) {
4717 #if 0
4718 UErrorCode status = U_ZERO_ERROR;
4719 int pos = 0;
4720 int ruleStatus = 0;
4721
4722 RuleBasedBreakIterator* bi =
4723 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4724 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4725 (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
4726 UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4727 // UnicodeString s("Aaa. Bcd");
4728 s = s.unescape();
4729 bi->setText(s);
4730 UBool r = bi->isBoundary(8);
4731 printf("%s", r?"true":"false");
4732 return;
4733 pos = bi->last();
4734 do {
4735 // ruleStatus = bi->getRuleStatus();
4736 printf("%d\t%d\n", pos, ruleStatus);
4737 pos = bi->previous();
4738 } while (pos != BreakIterator::DONE);
4739 #endif
4740 }
4741
4742 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
4743