1 /********************************************************************
2 * Copyright (c) 1999-2007, International Business Machines
3 * Corporation and others. All Rights Reserved.
4 ********************************************************************
5 * Date Name Description
6 * 12/14/99 Madhu Creation.
7 * 01/12/2000 Madhu updated for changed API
8 ********************************************************************/
9
10 #include "unicode/utypes.h"
11
12 #if !UCONFIG_NO_BREAK_ITERATION
13
14 #include "unicode/uchar.h"
15 #include "intltest.h"
16 #include "unicode/rbbi.h"
17 #include "unicode/schriter.h"
18 #include "rbbiapts.h"
19 #include "rbbidata.h"
20 #include "cstring.h"
21 #include "ubrkimpl.h"
22 #include "unicode/ustring.h"
23 #include "unicode/utext.h"
24
25 /**
26 * API Test the RuleBasedBreakIterator class
27 */
28
29
// Report a test failure through errln(), giving the source file, the line
// number and the ICU error name, when `status` is a failing UErrorCode.
#define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) {\
errln("Failure at file %s, line %d, error = %s", __FILE__, __LINE__, u_errorName(status));}}

// Report a test failure through errln(), giving the source file and the line
// number, when `expr` compares equal to FALSE.
#define TEST_ASSERT(expr) {if ((expr)==FALSE) { \
errln("Test Failure at file %s, line %d", __FILE__, __LINE__);}}
35
TestCloneEquals()36 void RBBIAPITest::TestCloneEquals()
37 {
38
39 UErrorCode status=U_ZERO_ERROR;
40 RuleBasedBreakIterator* bi1 = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status);
41 RuleBasedBreakIterator* biequal = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status);
42 RuleBasedBreakIterator* bi3 = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status);
43 RuleBasedBreakIterator* bi2 = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status);
44 if(U_FAILURE(status)){
45 errln((UnicodeString)"FAIL : in construction");
46 return;
47 }
48
49
50 UnicodeString testString="Testing word break iterators's clone() and equals()";
51 bi1->setText(testString);
52 bi2->setText(testString);
53 biequal->setText(testString);
54
55 bi3->setText("hello");
56
57 logln((UnicodeString)"Testing equals()");
58
59 logln((UnicodeString)"Testing == and !=");
60 UBool b = (*bi1 != *biequal);
61 b |= *bi1 == *bi2;
62 b |= *bi1 == *bi3;
63 if (b) {
64 errln((UnicodeString)"ERROR:1 RBBI's == and != operator failed.");
65 }
66
67 if(*bi2 == *biequal || *bi2 == *bi1 || *biequal == *bi3)
68 errln((UnicodeString)"ERROR:2 RBBI's == and != operator failed.");
69
70
71 // Quick test of RulesBasedBreakIterator assignment -
72 // Check that
73 // two different iterators are !=
74 // they are == after assignment
75 // source and dest iterator produce the same next() after assignment.
76 // deleting one doesn't disable the other.
77 logln("Testing assignment");
78 RuleBasedBreakIterator *bix = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
79 if(U_FAILURE(status)){
80 errln((UnicodeString)"FAIL : in construction");
81 return;
82 }
83
84 RuleBasedBreakIterator biDefault, biDefault2;
85 if(U_FAILURE(status)){
86 errln((UnicodeString)"FAIL : in construction of default iterator");
87 return;
88 }
89 if (biDefault == *bix) {
90 errln((UnicodeString)"ERROR: iterators should not compare ==");
91 return;
92 }
93 if (biDefault != biDefault2) {
94 errln((UnicodeString)"ERROR: iterators should compare ==");
95 return;
96 }
97
98
99 UnicodeString HelloString("Hello Kitty");
100 bix->setText(HelloString);
101 if (*bix == *bi2) {
102 errln(UnicodeString("ERROR: strings should not be equal before assignment."));
103 }
104 *bix = *bi2;
105 if (*bix != *bi2) {
106 errln(UnicodeString("ERROR: strings should be equal before assignment."));
107 }
108
109 int bixnext = bix->next();
110 int bi2next = bi2->next();
111 if (! (bixnext == bi2next && bixnext == 7)) {
112 errln(UnicodeString("ERROR: iterators behaved differently after assignment."));
113 }
114 delete bix;
115 if (bi2->next() != 8) {
116 errln(UnicodeString("ERROR: iterator.next() failed after deleting copy."));
117 }
118
119
120
121 logln((UnicodeString)"Testing clone()");
122 RuleBasedBreakIterator* bi1clone=(RuleBasedBreakIterator*)bi1->clone();
123 RuleBasedBreakIterator* bi2clone=(RuleBasedBreakIterator*)bi2->clone();
124
125 if(*bi1clone != *bi1 || *bi1clone != *biequal ||
126 *bi1clone == *bi3 || *bi1clone == *bi2)
127 errln((UnicodeString)"ERROR:1 RBBI's clone() method failed");
128
129 if(*bi2clone == *bi1 || *bi2clone == *biequal ||
130 *bi2clone == *bi3 || *bi2clone != *bi2)
131 errln((UnicodeString)"ERROR:2 RBBI's clone() method failed");
132
133 if(bi1->getText() != bi1clone->getText() ||
134 bi2clone->getText() != bi2->getText() ||
135 *bi2clone == *bi1clone )
136 errln((UnicodeString)"ERROR: RBBI's clone() method failed");
137
138 delete bi1clone;
139 delete bi2clone;
140 delete bi1;
141 delete bi3;
142 delete bi2;
143 delete biequal;
144 }
145
TestBoilerPlate()146 void RBBIAPITest::TestBoilerPlate()
147 {
148 UErrorCode status = U_ZERO_ERROR;
149 BreakIterator* a = BreakIterator::createWordInstance(Locale("hi"), status);
150 BreakIterator* b = BreakIterator::createWordInstance(Locale("hi_IN"),status);
151 if (U_FAILURE(status)) {
152 errln("Creation of break iterator failed %s", u_errorName(status));
153 return;
154 }
155 if(*a!=*b){
156 errln("Failed: boilerplate method operator!= does not return correct results");
157 }
158 BreakIterator* c = BreakIterator::createWordInstance(Locale("ja"),status);
159 if(a && c){
160 if(*c==*a){
161 errln("Failed: boilerplate method opertator== does not return correct results");
162 }
163 }else{
164 errln("creation of break iterator failed");
165 }
166 delete a;
167 delete b;
168 delete c;
169 }
170
TestgetRules()171 void RBBIAPITest::TestgetRules()
172 {
173 UErrorCode status=U_ZERO_ERROR;
174
175 RuleBasedBreakIterator* bi1=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status);
176 RuleBasedBreakIterator* bi2=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status);
177 if(U_FAILURE(status)){
178 errln((UnicodeString)"FAIL: in construction");
179 delete bi1;
180 delete bi2;
181 return;
182 }
183
184
185
186 logln((UnicodeString)"Testing toString()");
187
188 bi1->setText((UnicodeString)"Hello there");
189
190 RuleBasedBreakIterator* bi3 =(RuleBasedBreakIterator*)bi1->clone();
191
192 UnicodeString temp=bi1->getRules();
193 UnicodeString temp2=bi2->getRules();
194 UnicodeString temp3=bi3->getRules();
195 if( temp2.compare(temp3) ==0 || temp.compare(temp2) == 0 || temp.compare(temp3) != 0)
196 errln((UnicodeString)"ERROR: error in getRules() method");
197
198 delete bi1;
199 delete bi2;
200 delete bi3;
201 }
TestHashCode()202 void RBBIAPITest::TestHashCode()
203 {
204 UErrorCode status=U_ZERO_ERROR;
205 RuleBasedBreakIterator* bi1 = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status);
206 RuleBasedBreakIterator* bi3 = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status);
207 RuleBasedBreakIterator* bi2 = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status);
208 if(U_FAILURE(status)){
209 errln((UnicodeString)"FAIL : in construction");
210 delete bi1;
211 delete bi2;
212 delete bi3;
213 return;
214 }
215
216
217 logln((UnicodeString)"Testing hashCode()");
218
219 bi1->setText((UnicodeString)"Hash code");
220 bi2->setText((UnicodeString)"Hash code");
221 bi3->setText((UnicodeString)"Hash code");
222
223 RuleBasedBreakIterator* bi1clone= (RuleBasedBreakIterator*)bi1->clone();
224 RuleBasedBreakIterator* bi2clone= (RuleBasedBreakIterator*)bi2->clone();
225
226 if(bi1->hashCode() != bi1clone->hashCode() || bi1->hashCode() != bi3->hashCode() ||
227 bi1clone->hashCode() != bi3->hashCode() || bi2->hashCode() != bi2clone->hashCode())
228 errln((UnicodeString)"ERROR: identical objects have different hashcodes");
229
230 if(bi1->hashCode() == bi2->hashCode() || bi2->hashCode() == bi3->hashCode() ||
231 bi1clone->hashCode() == bi2clone->hashCode() || bi1clone->hashCode() == bi2->hashCode())
232 errln((UnicodeString)"ERROR: different objects have same hashcodes");
233
234 delete bi1clone;
235 delete bi2clone;
236 delete bi1;
237 delete bi2;
238 delete bi3;
239
240 }
TestGetSetAdoptText()241 void RBBIAPITest::TestGetSetAdoptText()
242 {
243 logln((UnicodeString)"Testing getText setText ");
244 UErrorCode status=U_ZERO_ERROR;
245 UnicodeString str1="first string.";
246 UnicodeString str2="Second string.";
247 RuleBasedBreakIterator* charIter1 = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status);
248 RuleBasedBreakIterator* wordIter1 = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status);
249 if(U_FAILURE(status)){
250 errln((UnicodeString)"FAIL : in construction");
251 return;
252 }
253
254
255 CharacterIterator* text1= new StringCharacterIterator(str1);
256 CharacterIterator* text1Clone = text1->clone();
257 CharacterIterator* text2= new StringCharacterIterator(str2);
258 CharacterIterator* text3= new StringCharacterIterator(str2, 3, 10, 3); // "ond str"
259
260 wordIter1->setText(str1);
261 CharacterIterator *tci = &wordIter1->getText();
262 UnicodeString tstr;
263 tci->getText(tstr);
264 TEST_ASSERT(tstr == str1);
265 if(wordIter1->current() != 0)
266 errln((UnicodeString)"ERROR:1 setText did not set the iteration position to the beginning of the text, it is" + wordIter1->current() + (UnicodeString)"\n");
267
268 wordIter1->next(2);
269
270 wordIter1->setText(str2);
271 if(wordIter1->current() != 0)
272 errln((UnicodeString)"ERROR:2 setText did not reset the iteration position to the beginning of the text, it is" + wordIter1->current() + (UnicodeString)"\n");
273
274
275 charIter1->adoptText(text1Clone);
276 TEST_ASSERT(wordIter1->getText() != charIter1->getText());
277 tci = &wordIter1->getText();
278 tci->getText(tstr);
279 TEST_ASSERT(tstr == str2);
280 tci = &charIter1->getText();
281 tci->getText(tstr);
282 TEST_ASSERT(tstr == str1);
283
284
285 RuleBasedBreakIterator* rb=(RuleBasedBreakIterator*)wordIter1->clone();
286 rb->adoptText(text1);
287 if(rb->getText() != *text1)
288 errln((UnicodeString)"ERROR:1 error in adoptText ");
289 rb->adoptText(text2);
290 if(rb->getText() != *text2)
291 errln((UnicodeString)"ERROR:2 error in adoptText ");
292
293 // Adopt where iterator range is less than the entire orignal source string.
294 // (With the change of the break engine to working with UText internally,
295 // CharacterIterators starting at positions other than zero are not supported)
296 rb->adoptText(text3);
297 TEST_ASSERT(rb->preceding(2) == 0);
298 TEST_ASSERT(rb->following(11) == BreakIterator::DONE);
299 //if(rb->preceding(2) != 3) {
300 // errln((UnicodeString)"ERROR:3 error in adoptText ");
301 //}
302 //if(rb->following(11) != BreakIterator::DONE) {
303 // errln((UnicodeString)"ERROR:4 error in adoptText ");
304 //}
305
306 // UText API
307 //
308 // Quick test to see if UText is working at all.
309 //
310 const char *s1 = "\x68\x65\x6C\x6C\x6F\x20\x77\x6F\x72\x6C\x64"; /* "hello world" in UTF-8 */
311 const char *s2 = "\x73\x65\x65\x20\x79\x61"; /* "see ya" in UTF-8 */
312 // 012345678901
313
314 status = U_ZERO_ERROR;
315 UText *ut = utext_openUTF8(NULL, s1, -1, &status);
316 wordIter1->setText(ut, status);
317 TEST_ASSERT_SUCCESS(status);
318
319 int32_t pos;
320 pos = wordIter1->first();
321 TEST_ASSERT(pos==0);
322 pos = wordIter1->next();
323 TEST_ASSERT(pos==5);
324 pos = wordIter1->next();
325 TEST_ASSERT(pos==6);
326 pos = wordIter1->next();
327 TEST_ASSERT(pos==11);
328 pos = wordIter1->next();
329 TEST_ASSERT(pos==UBRK_DONE);
330
331 status = U_ZERO_ERROR;
332 UText *ut2 = utext_openUTF8(NULL, s2, -1, &status);
333 TEST_ASSERT_SUCCESS(status);
334 wordIter1->setText(ut2, status);
335 TEST_ASSERT_SUCCESS(status);
336
337 pos = wordIter1->first();
338 TEST_ASSERT(pos==0);
339 pos = wordIter1->next();
340 TEST_ASSERT(pos==3);
341 pos = wordIter1->next();
342 TEST_ASSERT(pos==4);
343
344 pos = wordIter1->last();
345 TEST_ASSERT(pos==6);
346 pos = wordIter1->previous();
347 TEST_ASSERT(pos==4);
348 pos = wordIter1->previous();
349 TEST_ASSERT(pos==3);
350 pos = wordIter1->previous();
351 TEST_ASSERT(pos==0);
352 pos = wordIter1->previous();
353 TEST_ASSERT(pos==UBRK_DONE);
354
355 status = U_ZERO_ERROR;
356 UnicodeString sEmpty;
357 UText *gut2 = utext_openUnicodeString(NULL, &sEmpty, &status);
358 wordIter1->getUText(gut2, status);
359 TEST_ASSERT_SUCCESS(status);
360 utext_close(gut2);
361
362 utext_close(ut);
363 utext_close(ut2);
364
365 delete wordIter1;
366 delete charIter1;
367 delete rb;
368
369 }
370
371
TestIteration()372 void RBBIAPITest::TestIteration()
373 {
374 // This test just verifies that the API is present.
375 // Testing for correct operation of the break rules happens elsewhere.
376
377 UErrorCode status=U_ZERO_ERROR;
378 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status);
379 if (U_FAILURE(status) || bi == NULL) {
380 errln("Failure creating character break iterator. Status = %s", u_errorName(status));
381 }
382 delete bi;
383
384 status=U_ZERO_ERROR;
385 bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status);
386 if (U_FAILURE(status) || bi == NULL) {
387 errln("Failure creating Word break iterator. Status = %s", u_errorName(status));
388 }
389 delete bi;
390
391 status=U_ZERO_ERROR;
392 bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createLineInstance(Locale::getDefault(), status);
393 if (U_FAILURE(status) || bi == NULL) {
394 errln("Failure creating Line break iterator. Status = %s", u_errorName(status));
395 }
396 delete bi;
397
398 status=U_ZERO_ERROR;
399 bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createSentenceInstance(Locale::getDefault(), status);
400 if (U_FAILURE(status) || bi == NULL) {
401 errln("Failure creating Sentence break iterator. Status = %s", u_errorName(status));
402 }
403 delete bi;
404
405 status=U_ZERO_ERROR;
406 bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createTitleInstance(Locale::getDefault(), status);
407 if (U_FAILURE(status) || bi == NULL) {
408 errln("Failure creating Title break iterator. Status = %s", u_errorName(status));
409 }
410 delete bi;
411
412 status=U_ZERO_ERROR;
413 bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status);
414 if (U_FAILURE(status) || bi == NULL) {
415 errln("Failure creating character break iterator. Status = %s", u_errorName(status));
416 return; // Skip the rest of these tests.
417 }
418
419
420 UnicodeString testString="0123456789";
421 bi->setText(testString);
422
423 int32_t i;
424 i = bi->first();
425 if (i != 0) {
426 errln("Incorrect value from bi->first(). Expected 0, got %d.", i);
427 }
428
429 i = bi->last();
430 if (i != 10) {
431 errln("Incorrect value from bi->last(). Expected 10, got %d", i);
432 }
433
434 //
435 // Previous
436 //
437 bi->last();
438 i = bi->previous();
439 if (i != 9) {
440 errln("Incorrect value from bi->last() at line %d. Expected 9, got %d", __LINE__, i);
441 }
442
443
444 bi->first();
445 i = bi->previous();
446 if (i != BreakIterator::DONE) {
447 errln("Incorrect value from bi->previous() at line %d. Expected DONE, got %d", __LINE__, i);
448 }
449
450 //
451 // next()
452 //
453 bi->first();
454 i = bi->next();
455 if (i != 1) {
456 errln("Incorrect value from bi->next() at line %d. Expected 1, got %d", __LINE__, i);
457 }
458
459 bi->last();
460 i = bi->next();
461 if (i != BreakIterator::DONE) {
462 errln("Incorrect value from bi->next() at line %d. Expected DONE, got %d", __LINE__, i);
463 }
464
465
466 //
467 // current()
468 //
469 bi->first();
470 i = bi->current();
471 if (i != 0) {
472 errln("Incorrect value from bi->previous() at line %d. Expected 0, got %d", __LINE__, i);
473 }
474
475 bi->next();
476 i = bi->current();
477 if (i != 1) {
478 errln("Incorrect value from bi->previous() at line %d. Expected 1, got %d", __LINE__, i);
479 }
480
481 bi->last();
482 bi->next();
483 i = bi->current();
484 if (i != 10) {
485 errln("Incorrect value from bi->previous() at line %d. Expected 10, got %d", __LINE__, i);
486 }
487
488 bi->first();
489 bi->previous();
490 i = bi->current();
491 if (i != 0) {
492 errln("Incorrect value from bi->previous() at line %d. Expected 0, got %d", __LINE__, i);
493 }
494
495
496 //
497 // Following()
498 //
499 i = bi->following(4);
500 if (i != 5) {
501 errln("Incorrect value from bi->following() at line %d. Expected 5, got %d", __LINE__, i);
502 }
503
504 i = bi->following(9);
505 if (i != 10) {
506 errln("Incorrect value from bi->following() at line %d. Expected 10, got %d", __LINE__, i);
507 }
508
509 i = bi->following(10);
510 if (i != BreakIterator::DONE) {
511 errln("Incorrect value from bi->following() at line %d. Expected DONE, got %d", __LINE__, i);
512 }
513
514
515 //
516 // Preceding
517 //
518 i = bi->preceding(4);
519 if (i != 3) {
520 errln("Incorrect value from bi->preceding() at line %d. Expected 3, got %d", __LINE__, i);
521 }
522
523 i = bi->preceding(10);
524 if (i != 9) {
525 errln("Incorrect value from bi->preceding() at line %d. Expected 9, got %d", __LINE__, i);
526 }
527
528 i = bi->preceding(1);
529 if (i != 0) {
530 errln("Incorrect value from bi->preceding() at line %d. Expected 0, got %d", __LINE__, i);
531 }
532
533 i = bi->preceding(0);
534 if (i != BreakIterator::DONE) {
535 errln("Incorrect value from bi->preceding() at line %d. Expected DONE, got %d", __LINE__, i);
536 }
537
538
539 //
540 // isBoundary()
541 //
542 bi->first();
543 if (bi->isBoundary(3) != TRUE) {
544 errln("Incorrect value from bi->isBoudary() at line %d. Expected TRUE, got FALSE", __LINE__, i);
545 }
546 i = bi->current();
547 if (i != 3) {
548 errln("Incorrect value from bi->current() at line %d. Expected 3, got %d", __LINE__, i);
549 }
550
551
552 if (bi->isBoundary(11) != FALSE) {
553 errln("Incorrect value from bi->isBoudary() at line %d. Expected FALSE, got TRUE", __LINE__, i);
554 }
555 i = bi->current();
556 if (i != 10) {
557 errln("Incorrect value from bi->current() at line %d. Expected 10, got %d", __LINE__, i);
558 }
559
560 //
561 // next(n)
562 //
563 bi->first();
564 i = bi->next(4);
565 if (i != 4) {
566 errln("Incorrect value from bi->next() at line %d. Expected 4, got %d", __LINE__, i);
567 }
568
569 i = bi->next(6);
570 if (i != 10) {
571 errln("Incorrect value from bi->next() at line %d. Expected 10, got %d", __LINE__, i);
572 }
573
574 bi->first();
575 i = bi->next(11);
576 if (i != BreakIterator::DONE) {
577 errln("Incorrect value from bi->next() at line %d. Expected BreakIterator::DONE, got %d", __LINE__, i);
578 }
579
580 delete bi;
581
582 }
583
584
585
586
587
588
TestBuilder()589 void RBBIAPITest::TestBuilder() {
590 UnicodeString rulesString1 = "$Letters = [:L:];\n"
591 "$Numbers = [:N:];\n"
592 "$Letters+;\n"
593 "$Numbers+;\n"
594 "[^$Letters $Numbers];\n"
595 "!.*;\n";
596 UnicodeString testString1 = "abc123..abc";
597 // 01234567890
598 int32_t bounds1[] = {0, 3, 6, 7, 8, 11};
599 UErrorCode status=U_ZERO_ERROR;
600 UParseError parseError;
601
602 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
603 if(U_FAILURE(status)) {
604 errln("FAIL : in construction");
605 } else {
606 bi->setText(testString1);
607 doBoundaryTest(*bi, testString1, bounds1);
608 }
609 delete bi;
610 }
611
612
613 //
614 // TestQuoteGrouping
615 // Single quotes within rules imply a grouping, so that a modifier
616 // following the quoted text (* or +) applies to all of the quoted chars.
617 //
TestQuoteGrouping()618 void RBBIAPITest::TestQuoteGrouping() {
619 UnicodeString rulesString1 = "#Here comes the rule...\n"
620 "'$@!'*;\n" // (\$\@\!)*
621 ".;\n";
622
623 UnicodeString testString1 = "$@!$@!X$@!!X";
624 // 0123456789012
625 int32_t bounds1[] = {0, 6, 7, 10, 11, 12};
626 UErrorCode status=U_ZERO_ERROR;
627 UParseError parseError;
628
629 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
630 if(U_FAILURE(status)) {
631 errln("FAIL : in construction");
632 } else {
633 bi->setText(testString1);
634 doBoundaryTest(*bi, testString1, bounds1);
635 }
636 delete bi;
637 }
638
639 //
640 // TestRuleStatus
641 // Test word break rule status constants.
642 //
TestRuleStatus()643 void RBBIAPITest::TestRuleStatus() {
644 UChar str[30];
645 u_unescape("plain word 123.45 \\u9160\\u9161 \\u30a1\\u30a2 \\u3041\\u3094",
646 // 012345678901234567 8 9 0 1 2 3 4 5 6
647 // Ideographic Katakana Hiragana
648 str, 30);
649 UnicodeString testString1(str);
650 int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 19, 20, 21, 23, 24, 25, 26};
651 int32_t tag_lo[] = {UBRK_WORD_NONE, UBRK_WORD_LETTER, UBRK_WORD_NONE, UBRK_WORD_LETTER,
652 UBRK_WORD_NONE, UBRK_WORD_NUMBER, UBRK_WORD_NONE,
653 UBRK_WORD_IDEO, UBRK_WORD_IDEO, UBRK_WORD_NONE,
654 UBRK_WORD_KANA, UBRK_WORD_NONE, UBRK_WORD_KANA, UBRK_WORD_KANA};
655
656 int32_t tag_hi[] = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT,
657 UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WORD_NONE_LIMIT,
658 UBRK_WORD_IDEO_LIMIT, UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT,
659 UBRK_WORD_KANA_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_KANA_LIMIT, UBRK_WORD_KANA_LIMIT};
660
661 UErrorCode status=U_ZERO_ERROR;
662
663 RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
664 if(U_FAILURE(status)) {
665 errln("FAIL : in construction");
666 } else {
667 bi->setText(testString1);
668 // First test that the breaks are in the right spots.
669 doBoundaryTest(*bi, testString1, bounds1);
670
671 // Then go back and check tag values
672 int32_t i = 0;
673 int32_t pos, tag;
674 for (pos = bi->first(); pos != BreakIterator::DONE; pos = bi->next(), i++) {
675 if (pos != bounds1[i]) {
676 errln("FAIL: unexpected word break at postion %d", pos);
677 break;
678 }
679 tag = bi->getRuleStatus();
680 if (tag < tag_lo[i] || tag >= tag_hi[i]) {
681 errln("FAIL: incorrect tag value %d at position %d", tag, pos);
682 break;
683 }
684
685 // Check that we get the same tag values from getRuleStatusVec()
686 int32_t vec[10];
687 int t = bi->getRuleStatusVec(vec, 10, status);
688 TEST_ASSERT_SUCCESS(status);
689 TEST_ASSERT(t==1);
690 TEST_ASSERT(vec[0] == tag);
691 }
692 }
693 delete bi;
694
695 // Now test line break status. This test mostly is to confirm that the status constants
696 // are correctly declared in the header.
697 testString1 = "test line. \n";
698 // break type s s h
699
700 bi = (RuleBasedBreakIterator *)
701 BreakIterator::createLineInstance(Locale::getEnglish(), status);
702 if(U_FAILURE(status)) {
703 errln("failed to create word break iterator.");
704 } else {
705 int32_t i = 0;
706 int32_t pos, tag;
707 UBool success;
708
709 bi->setText(testString1);
710 pos = bi->current();
711 tag = bi->getRuleStatus();
712 for (i=0; i<3; i++) {
713 switch (i) {
714 case 0:
715 success = pos==0 && tag==UBRK_LINE_SOFT; break;
716 case 1:
717 success = pos==5 && tag==UBRK_LINE_SOFT; break;
718 case 2:
719 success = pos==12 && tag==UBRK_LINE_HARD; break;
720 default:
721 success = FALSE; break;
722 }
723 if (success == FALSE) {
724 errln("Fail: incorrect word break status or position. i=%d, pos=%d, tag=%d",
725 i, pos, tag);
726 break;
727 }
728 pos = bi->next();
729 tag = bi->getRuleStatus();
730 }
731 if (UBRK_LINE_SOFT >= UBRK_LINE_SOFT_LIMIT ||
732 UBRK_LINE_HARD >= UBRK_LINE_HARD_LIMIT ||
733 UBRK_LINE_HARD > UBRK_LINE_SOFT && UBRK_LINE_HARD < UBRK_LINE_SOFT_LIMIT ) {
734 errln("UBRK_LINE_* constants from header are inconsistent.");
735 }
736 }
737 delete bi;
738
739 }
740
741
742 //
743 // TestRuleStatusVec
744 // Test the vector form of break rule status.
745 //
TestRuleStatusVec()746 void RBBIAPITest::TestRuleStatusVec() {
747 UnicodeString rulesString = "[A-N]{100}; \n"
748 "[a-w]{200}; \n"
749 "[\\p{L}]{300}; \n"
750 "[\\p{N}]{400}; \n"
751 "[0-5]{500}; \n"
752 "!.*;\n";
753 UnicodeString testString1 = "Aapz5?";
754 int32_t statusVals[10];
755 int32_t numStatuses;
756 int32_t pos;
757
758 UErrorCode status=U_ZERO_ERROR;
759 UParseError parseError;
760
761 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString, parseError, status);
762 TEST_ASSERT_SUCCESS(status);
763 if (U_SUCCESS(status)) {
764 bi->setText(testString1);
765
766 // A
767 pos = bi->next();
768 TEST_ASSERT(pos==1);
769 numStatuses = bi->getRuleStatusVec(statusVals, 10, status);
770 TEST_ASSERT_SUCCESS(status);
771 TEST_ASSERT(numStatuses == 2);
772 TEST_ASSERT(statusVals[0] == 100);
773 TEST_ASSERT(statusVals[1] == 300);
774
775 // a
776 pos = bi->next();
777 TEST_ASSERT(pos==2);
778 numStatuses = bi->getRuleStatusVec(statusVals, 10, status);
779 TEST_ASSERT_SUCCESS(status);
780 TEST_ASSERT(numStatuses == 2);
781 TEST_ASSERT(statusVals[0] == 200);
782 TEST_ASSERT(statusVals[1] == 300);
783
784 // p
785 pos = bi->next();
786 TEST_ASSERT(pos==3);
787 numStatuses = bi->getRuleStatusVec(statusVals, 10, status);
788 TEST_ASSERT_SUCCESS(status);
789 TEST_ASSERT(numStatuses == 2);
790 TEST_ASSERT(statusVals[0] == 200);
791 TEST_ASSERT(statusVals[1] == 300);
792
793 // z
794 pos = bi->next();
795 TEST_ASSERT(pos==4);
796 numStatuses = bi->getRuleStatusVec(statusVals, 10, status);
797 TEST_ASSERT_SUCCESS(status);
798 TEST_ASSERT(numStatuses == 1);
799 TEST_ASSERT(statusVals[0] == 300);
800
801 // 5
802 pos = bi->next();
803 TEST_ASSERT(pos==5);
804 numStatuses = bi->getRuleStatusVec(statusVals, 10, status);
805 TEST_ASSERT_SUCCESS(status);
806 TEST_ASSERT(numStatuses == 2);
807 TEST_ASSERT(statusVals[0] == 400);
808 TEST_ASSERT(statusVals[1] == 500);
809
810 // ?
811 pos = bi->next();
812 TEST_ASSERT(pos==6);
813 numStatuses = bi->getRuleStatusVec(statusVals, 10, status);
814 TEST_ASSERT_SUCCESS(status);
815 TEST_ASSERT(numStatuses == 1);
816 TEST_ASSERT(statusVals[0] == 0);
817
818 //
819 // Check buffer overflow error handling. Char == A
820 //
821 bi->first();
822 pos = bi->next();
823 TEST_ASSERT(pos==1);
824 memset(statusVals, -1, sizeof(statusVals));
825 numStatuses = bi->getRuleStatusVec(statusVals, 0, status);
826 TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR);
827 TEST_ASSERT(numStatuses == 2);
828 TEST_ASSERT(statusVals[0] == -1);
829
830 status = U_ZERO_ERROR;
831 memset(statusVals, -1, sizeof(statusVals));
832 numStatuses = bi->getRuleStatusVec(statusVals, 1, status);
833 TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR);
834 TEST_ASSERT(numStatuses == 2);
835 TEST_ASSERT(statusVals[0] == 100);
836 TEST_ASSERT(statusVals[1] == -1);
837
838 status = U_ZERO_ERROR;
839 memset(statusVals, -1, sizeof(statusVals));
840 numStatuses = bi->getRuleStatusVec(statusVals, 2, status);
841 TEST_ASSERT_SUCCESS(status);
842 TEST_ASSERT(numStatuses == 2);
843 TEST_ASSERT(statusVals[0] == 100);
844 TEST_ASSERT(statusVals[1] == 300);
845 TEST_ASSERT(statusVals[2] == -1);
846 }
847 delete bi;
848
849 }
850
851 //
852 // Bug 2190 Regression test. Builder crash on rule consisting of only a
853 // $variable reference
TestBug2190()854 void RBBIAPITest::TestBug2190() {
855 UnicodeString rulesString1 = "$aaa = abcd;\n"
856 "$bbb = $aaa;\n"
857 "$bbb;\n";
858 UnicodeString testString1 = "abcdabcd";
859 // 01234567890
860 int32_t bounds1[] = {0, 4, 8};
861 UErrorCode status=U_ZERO_ERROR;
862 UParseError parseError;
863
864 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
865 if(U_FAILURE(status)) {
866 errln("FAIL : in construction");
867 } else {
868 bi->setText(testString1);
869 doBoundaryTest(*bi, testString1, bounds1);
870 }
871 delete bi;
872 }
873
874
TestRegistration()875 void RBBIAPITest::TestRegistration() {
876 #if !UCONFIG_NO_SERVICE
877 UErrorCode status = U_ZERO_ERROR;
878 BreakIterator* ja_word = BreakIterator::createWordInstance("ja_JP", status);
879
880 // ok to not delete these if we exit because of error?
881 BreakIterator* ja_char = BreakIterator::createCharacterInstance("ja_JP", status);
882 BreakIterator* root_word = BreakIterator::createWordInstance("", status);
883 BreakIterator* root_char = BreakIterator::createCharacterInstance("", status);
884
885 URegistryKey key = BreakIterator::registerInstance(ja_word, "xx", UBRK_WORD, status);
886 {
887 if (ja_word && *ja_word == *root_word) {
888 errln("japan not different from root");
889 }
890 }
891
892 {
893 BreakIterator* result = BreakIterator::createWordInstance("xx_XX", status);
894 UBool fail = TRUE;
895 if(result){
896 fail = *result != *ja_word;
897 }
898 delete result;
899 if (fail) {
900 errln("bad result for xx_XX/word");
901 }
902 }
903
904 {
905 BreakIterator* result = BreakIterator::createCharacterInstance("ja_JP", status);
906 UBool fail = TRUE;
907 if(result){
908 fail = *result != *ja_char;
909 }
910 delete result;
911 if (fail) {
912 errln("bad result for ja_JP/char");
913 }
914 }
915
916 {
917 BreakIterator* result = BreakIterator::createCharacterInstance("xx_XX", status);
918 UBool fail = TRUE;
919 if(result){
920 fail = *result != *root_char;
921 }
922 delete result;
923 if (fail) {
924 errln("bad result for xx_XX/char");
925 }
926 }
927
928 {
929 StringEnumeration* avail = BreakIterator::getAvailableLocales();
930 UBool found = FALSE;
931 const UnicodeString* p;
932 while ((p = avail->snext(status))) {
933 if (p->compare("xx") == 0) {
934 found = TRUE;
935 break;
936 }
937 }
938 delete avail;
939 if (!found) {
940 errln("did not find test locale");
941 }
942 }
943
944 {
945 UBool unreg = BreakIterator::unregister(key, status);
946 if (!unreg) {
947 errln("unable to unregister");
948 }
949 }
950
951 {
952 BreakIterator* result = BreakIterator::createWordInstance("en_US", status);
953 BreakIterator* root = BreakIterator::createWordInstance("", status);
954 UBool fail = TRUE;
955 if(root){
956 fail = *root != *result;
957 }
958 delete root;
959 delete result;
960 if (fail) {
961 errln("did not get root break");
962 }
963 }
964
965 {
966 StringEnumeration* avail = BreakIterator::getAvailableLocales();
967 UBool found = FALSE;
968 const UnicodeString* p;
969 while ((p = avail->snext(status))) {
970 if (p->compare("xx") == 0) {
971 found = TRUE;
972 break;
973 }
974 }
975 delete avail;
976 if (found) {
977 errln("found test locale");
978 }
979 }
980
981 {
982 int32_t count;
983 UBool foundLocale = FALSE;
984 const Locale *avail = BreakIterator::getAvailableLocales(count);
985 for (int i=0; i<count; i++) {
986 if (avail[i] == Locale::getEnglish()) {
987 foundLocale = TRUE;
988 break;
989 }
990 }
991 if (foundLocale == FALSE) {
992 errln("BreakIterator::getAvailableLocales(&count), failed to find EN.");
993 }
994 }
995
996
997 // ja_word was adopted by factory
998 delete ja_char;
999 delete root_word;
1000 delete root_char;
1001 #endif
1002 }
1003
RoundtripRule(const char * dataFile)1004 void RBBIAPITest::RoundtripRule(const char *dataFile) {
1005 UErrorCode status = U_ZERO_ERROR;
1006 UParseError parseError;
1007 parseError.line = 0;
1008 parseError.offset = 0;
1009 UDataMemory *data = udata_open(U_ICUDATA_BRKITR, "brk", dataFile, &status);
1010 uint32_t length;
1011 const UChar *builtSource;
1012 const uint8_t *rbbiRules;
1013 const uint8_t *builtRules;
1014
1015 if (U_FAILURE(status)) {
1016 errln("Can't open \"%s\"", dataFile);
1017 return;
1018 }
1019
1020 builtRules = (const uint8_t *)udata_getMemory(data);
1021 builtSource = (const UChar *)(builtRules + ((RBBIDataHeader*)builtRules)->fRuleSource);
1022 RuleBasedBreakIterator *brkItr = new RuleBasedBreakIterator(builtSource, parseError, status);
1023 if (U_FAILURE(status)) {
1024 errln("createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n",
1025 u_errorName(status), parseError.line, parseError.offset);
1026 return;
1027 };
1028 rbbiRules = brkItr->getBinaryRules(length);
1029 logln("Comparing \"%s\" len=%d", dataFile, length);
1030 if (memcmp(builtRules, rbbiRules, (int32_t)length) != 0) {
1031 errln("Built rules and rebuilt rules are different %s", dataFile);
1032 return;
1033 }
1034 delete brkItr;
1035 udata_close(data);
1036 }
1037
TestRoundtripRules()1038 void RBBIAPITest::TestRoundtripRules() {
1039 RoundtripRule("word");
1040 RoundtripRule("title");
1041 RoundtripRule("sent");
1042 RoundtripRule("line");
1043 RoundtripRule("char");
1044 if (!quick) {
1045 RoundtripRule("word_ja");
1046 RoundtripRule("word_POSIX");
1047 }
1048 }
1049
1050 //---------------------------------------------
1051 // runIndexedTest
1052 //---------------------------------------------
1053
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)1054 void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
1055 {
1056 if (exec) logln((UnicodeString)"TestSuite RuleBasedBreakIterator API ");
1057 switch (index) {
1058 // case 0: name = "TestConstruction"; if (exec) TestConstruction(); break;
1059 case 0: name = "TestCloneEquals"; if (exec) TestCloneEquals(); break;
1060 case 1: name = "TestgetRules"; if (exec) TestgetRules(); break;
1061 case 2: name = "TestHashCode"; if (exec) TestHashCode(); break;
1062 case 3: name = "TestGetSetAdoptText"; if (exec) TestGetSetAdoptText(); break;
1063 case 4: name = "TestIteration"; if (exec) TestIteration(); break;
1064 case 5: name = "TestBuilder"; if (exec) TestBuilder(); break;
1065 case 6: name = "TestQuoteGrouping"; if (exec) TestQuoteGrouping(); break;
1066 case 7: name = "TestRuleStatus"; if (exec) TestRuleStatus(); break;
1067 case 8: name = "TestRuleStatusVec"; if (exec) TestRuleStatusVec(); break;
1068 case 9: name = "TestBug2190"; if (exec) TestBug2190(); break;
1069 case 10: name = "TestRegistration"; if (exec) TestRegistration(); break;
1070 case 11: name = "TestBoilerPlate"; if (exec) TestBoilerPlate(); break;
1071 case 12: name = "TestRoundtripRules"; if (exec) TestRoundtripRules(); break;
1072
1073 default: name = ""; break; // needed to end loop
1074 }
1075 }
1076
1077 //---------------------------------------------
1078 //Internal subroutines
1079 //---------------------------------------------
1080
doBoundaryTest(RuleBasedBreakIterator & bi,UnicodeString & text,int32_t * boundaries)1081 void RBBIAPITest::doBoundaryTest(RuleBasedBreakIterator& bi, UnicodeString& text, int32_t *boundaries){
1082 logln((UnicodeString)"testIsBoundary():");
1083 int32_t p = 0;
1084 UBool isB;
1085 for (int32_t i = 0; i < text.length(); i++) {
1086 isB = bi.isBoundary(i);
1087 logln((UnicodeString)"bi.isBoundary(" + i + ") -> " + isB);
1088
1089 if (i == boundaries[p]) {
1090 if (!isB)
1091 errln((UnicodeString)"Wrong result from isBoundary() for " + i + (UnicodeString)": expected true, got false");
1092 p++;
1093 }
1094 else {
1095 if (isB)
1096 errln((UnicodeString)"Wrong result from isBoundary() for " + i + (UnicodeString)": expected false, got true");
1097 }
1098 }
1099 }
doTest(UnicodeString & testString,int32_t start,int32_t gotoffset,int32_t expectedOffset,const char * expectedString)1100 void RBBIAPITest::doTest(UnicodeString& testString, int32_t start, int32_t gotoffset, int32_t expectedOffset, const char* expectedString){
1101 UnicodeString selected;
1102 UnicodeString expected=CharsToUnicodeString(expectedString);
1103
1104 if(gotoffset != expectedOffset)
1105 errln((UnicodeString)"ERROR:****returned #" + gotoffset + (UnicodeString)" instead of #" + expectedOffset);
1106 if(start <= gotoffset){
1107 testString.extractBetween(start, gotoffset, selected);
1108 }
1109 else{
1110 testString.extractBetween(gotoffset, start, selected);
1111 }
1112 if(selected.compare(expected) != 0)
1113 errln(prettify((UnicodeString)"ERROR:****selected \"" + selected + "\" instead of \"" + expected + "\""));
1114 else
1115 logln(prettify("****selected \"" + selected + "\""));
1116 }
1117
1118 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1119