1 /*
2 **********************************************************************
3 * Copyright (C) 2005-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8
9 #include "unicode/utypes.h"
10
11 #if !UCONFIG_NO_COLLATION
12
13 #include "unicode/unistr.h"
14 #include "unicode/putil.h"
15 #include "unicode/usearch.h"
16
17 #include "cmemory.h"
18 #include "unicode/coll.h"
19 #include "unicode/tblcoll.h"
20 #include "unicode/coleitr.h"
21 #include "unicode/ucoleitr.h"
22
23 #include "unicode/regex.h" // TODO: make conditional on regexp being built.
24
25 #include "unicode/uniset.h"
26 #include "unicode/uset.h"
27 #include "unicode/ustring.h"
28 #include "hash.h"
29 #include "uhash.h"
30 #include "ucol_imp.h"
31
32 #include "intltest.h"
33 #include "ssearch.h"
34
35 #include "unicode/colldata.h"
36 #include "unicode/bmsearch.h"
37 #include "unicode/bms.h"
38
39 #include "xmlparser.h"
40 #include "ucbuf.h"
41
42 #include <stdlib.h>
43 #include <string.h>
44 #include <stdio.h>
45
46 char testId[100];
47
48 #define TEST_ASSERT(x) {if (!(x)) { \
49 errln("Failure in file %s, line %d, test ID = \"%s\"", __FILE__, __LINE__, testId);}}
50
51 #define TEST_ASSERT_M(x, m) {if (!(x)) { \
52 errln("Failure in file %s, line %d. \"%s\"", __FILE__, __LINE__, m);return;}}
53
54 #define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \
55 dataerrln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \
56 __FILE__, __LINE__, testId, u_errorName(errcode));}}
57
58 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
59 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
60 #define DELETE_ARRAY(array) uprv_free((void *) (array))
61
62 //---------------------------------------------------------------------------
63 //
64 // Test class boilerplate
65 //
66 //---------------------------------------------------------------------------
SSearchTest()67 SSearchTest::SSearchTest()
68 {
69 }
70
~SSearchTest()71 SSearchTest::~SSearchTest()
72 {
73 }
74
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)75 void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char *params )
76 {
77 if (exec) logln("TestSuite SSearchTest: ");
78 switch (index) {
79 #if !UCONFIG_NO_BREAK_ITERATION
80 case 0: name = "searchTest";
81 if (exec) searchTest();
82 break;
83
84 case 1: name = "offsetTest";
85 if (exec) offsetTest();
86 break;
87
88 case 2: name = "monkeyTest";
89 if (exec) monkeyTest(params);
90 break;
91
92 case 3: name = "bmMonkeyTest";
93 if (exec) bmMonkeyTest(params);
94 break;
95
96 case 4: name = "boyerMooreTest";
97 if (exec) boyerMooreTest();
98 break;
99
100 case 5: name = "goodSuffixTest";
101 if (exec) goodSuffixTest();
102 break;
103
104 case 6: name = "searchTime";
105 if (exec) searchTime();
106 break;
107
108 case 7: name = "bmsTest";
109 if (exec) bmsTest();
110 break;
111
112 case 8: name = "bmSearchTest";
113 if (exec) bmSearchTest();
114 break;
115
116 case 9: name = "udhrTest";
117 if (exec) udhrTest();
118 break;
119 case 10: name = "stringListTest";
120 if (exec) stringListTest();
121 break;
122 #endif
123 default: name = "";
124 break; //needed to end loop
125 }
126 }
127
128
129 #if !UCONFIG_NO_BREAK_ITERATION
130
131 #define PATH_BUFFER_SIZE 2048
getPath(char buffer[2048],const char * filename)132 const char *SSearchTest::getPath(char buffer[2048], const char *filename) {
133 UErrorCode status = U_ZERO_ERROR;
134 const char *testDataDirectory = IntlTest::getSourceTestData(status);
135
136 if (U_FAILURE(status) || strlen(testDataDirectory) + strlen(filename) + 1 >= PATH_BUFFER_SIZE) {
137 errln("ERROR: getPath() failed - %s", u_errorName(status));
138 return NULL;
139 }
140
141 strcpy(buffer, testDataDirectory);
142 strcat(buffer, filename);
143 return buffer;
144 }
145
146
searchTest()147 void SSearchTest::searchTest()
148 {
149 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
150 UErrorCode status = U_ZERO_ERROR;
151 char path[PATH_BUFFER_SIZE];
152 const char *testFilePath = getPath(path, "ssearch.xml");
153
154 if (testFilePath == NULL) {
155 return; /* Couldn't get path: error message already output. */
156 }
157
158 LocalPointer<UXMLParser> parser(UXMLParser::createParser(status));
159 TEST_ASSERT_SUCCESS(status);
160 LocalPointer<UXMLElement> root(parser->parseFile(testFilePath, status));
161 TEST_ASSERT_SUCCESS(status);
162 if (U_FAILURE(status)) {
163 return;
164 }
165
166 const UnicodeString *debugTestCase = root->getAttribute("debug");
167 if (debugTestCase != NULL) {
168 // setenv("USEARCH_DEBUG", "1", 1);
169 }
170
171
172 const UXMLElement *testCase;
173 int32_t tc = 0;
174
175 while((testCase = root->nextChildElement(tc)) != NULL) {
176
177 if (testCase->getTagName().compare("test-case") != 0) {
178 errln("ssearch, unrecognized XML Element in test file");
179 continue;
180 }
181 const UnicodeString *id = testCase->getAttribute("id");
182 *testId = 0;
183 if (id != NULL) {
184 id->extract(0, id->length(), testId, sizeof(testId), US_INV);
185 }
186
187 // If debugging test case has been specified and this is not it, skip to next.
188 if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) {
189 continue;
190 }
191 //
192 // Get the requested collation strength.
193 // Default is tertiary if the XML attribute is missing from the test case.
194 //
195 const UnicodeString *strength = testCase->getAttribute("strength");
196 UColAttributeValue collatorStrength = UCOL_PRIMARY;
197 if (strength==NULL) { collatorStrength = UCOL_TERTIARY;}
198 else if (*strength=="PRIMARY") { collatorStrength = UCOL_PRIMARY;}
199 else if (*strength=="SECONDARY") { collatorStrength = UCOL_SECONDARY;}
200 else if (*strength=="TERTIARY") { collatorStrength = UCOL_TERTIARY;}
201 else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;}
202 else if (*strength=="IDENTICAL") { collatorStrength = UCOL_IDENTICAL;}
203 else {
204 // Bogus value supplied for strength. Shouldn't happen, even from
205 // typos, if the XML source has been validated.
206 // This assert is a little deceiving in that strength can be
207 // any of the allowed values, not just TERTIARY, but it will
208 // do the job of getting the error output.
209 TEST_ASSERT(*strength=="TERTIARY")
210 }
211
212 //
213 // Get the collator normalization flag. Default is UCOL_OFF.
214 //
215 UColAttributeValue normalize = UCOL_OFF;
216 const UnicodeString *norm = testCase->getAttribute("norm");
217 TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF");
218 if (norm!=NULL && *norm=="ON") {
219 normalize = UCOL_ON;
220 }
221
222 //
223 // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE.
224 //
225 UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE;
226 const UnicodeString *alt = testCase->getAttribute("alternate_handling");
227 TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE");
228 if (alt != NULL && *alt == "SHIFTED") {
229 alternateHandling = UCOL_SHIFTED;
230 }
231
232 const UnicodeString defLocale("en");
233 char clocale[100];
234 const UnicodeString *locale = testCase->getAttribute("locale");
235 if (locale == NULL || locale->length()==0) {
236 locale = &defLocale;
237 };
238 locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL);
239
240
241 UnicodeString text;
242 UnicodeString target;
243 UnicodeString pattern;
244 int32_t expectedMatchStart = -1;
245 int32_t expectedMatchLimit = -1;
246 const UXMLElement *n;
247 int32_t nodeCount = 0;
248
249 n = testCase->getChildElement("pattern");
250 TEST_ASSERT(n != NULL);
251 if (n==NULL) {
252 continue;
253 }
254 text = n->getText(FALSE);
255 text = text.unescape();
256 pattern.append(text);
257 nodeCount++;
258
259 n = testCase->getChildElement("pre");
260 if (n!=NULL) {
261 text = n->getText(FALSE);
262 text = text.unescape();
263 target.append(text);
264 nodeCount++;
265 }
266
267 n = testCase->getChildElement("m");
268 if (n!=NULL) {
269 expectedMatchStart = target.length();
270 text = n->getText(FALSE);
271 text = text.unescape();
272 target.append(text);
273 expectedMatchLimit = target.length();
274 nodeCount++;
275 }
276
277 n = testCase->getChildElement("post");
278 if (n!=NULL) {
279 text = n->getText(FALSE);
280 text = text.unescape();
281 target.append(text);
282 nodeCount++;
283 }
284
285 // Check that there weren't extra things in the XML
286 TEST_ASSERT(nodeCount == testCase->countChildren());
287
288 // Open a collator and StringSearch based on the parameters
289 // obtained from the XML.
290 //
291 status = U_ZERO_ERROR;
292 LocalUCollatorPointer collator(ucol_open(clocale, &status));
293 ucol_setStrength(collator.getAlias(), collatorStrength);
294 ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status);
295 ucol_setAttribute(collator.getAlias(), UCOL_ALTERNATE_HANDLING, alternateHandling, &status);
296 LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
297 target.getBuffer(), target.length(),
298 collator.getAlias(),
299 NULL, // the break iterator
300 &status));
301
302 TEST_ASSERT_SUCCESS(status);
303 if (U_FAILURE(status)) {
304 continue;
305 }
306
307 int32_t foundStart = 0;
308 int32_t foundLimit = 0;
309 UBool foundMatch;
310
311 //
312 // Do the search, check the match result against the expected results.
313 //
314 foundMatch= usearch_search(uss.getAlias(), 0, &foundStart, &foundLimit, &status);
315 TEST_ASSERT_SUCCESS(status);
316 if ((foundMatch && expectedMatchStart<0) ||
317 (foundStart != expectedMatchStart) ||
318 (foundLimit != expectedMatchLimit)) {
319 TEST_ASSERT(FALSE); // ouput generic error position
320 infoln("Found, expected match start = %d, %d \n"
321 "Found, expected match limit = %d, %d",
322 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
323 }
324
325 // In case there are other matches...
326 // (should we only do this if the test case passed?)
327 while (foundMatch) {
328 expectedMatchStart = foundStart;
329 expectedMatchLimit = foundLimit;
330
331 foundMatch = usearch_search(uss.getAlias(), foundLimit, &foundStart, &foundLimit, &status);
332 }
333
334 uss.adoptInstead(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
335 target.getBuffer(), target.length(),
336 collator.getAlias(),
337 NULL,
338 &status));
339
340 //
341 // Do the backwards search, check the match result against the expected results.
342 //
343 foundMatch= usearch_searchBackwards(uss.getAlias(), target.length(), &foundStart, &foundLimit, &status);
344 TEST_ASSERT_SUCCESS(status);
345 if ((foundMatch && expectedMatchStart<0) ||
346 (foundStart != expectedMatchStart) ||
347 (foundLimit != expectedMatchLimit)) {
348 TEST_ASSERT(FALSE); // ouput generic error position
349 infoln("Found, expected backwards match start = %d, %d \n"
350 "Found, expected backwards match limit = %d, %d",
351 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
352 }
353 }
354 #endif
355 }
356
357 struct UdhrTestCase
358 {
359 const char *locale;
360 const char *file;
361 };
362
udhrTest()363 void SSearchTest::udhrTest()
364 {
365 UErrorCode status = U_ZERO_ERROR;
366 char path[PATH_BUFFER_SIZE];
367 const char *udhrPath = getPath(path, "udhr");
368
369 if (udhrPath == NULL) {
370 // couldn't get path: error message already output...
371 return;
372 }
373
374 UdhrTestCase testCases[] = {
375 {"en", "udhr_eng.txt"},
376 {"de", "udhr_deu_1996.txt"},
377 {"fr", "udhr_fra.txt"},
378 {"ru", "udhr_rus.txt"},
379 {"th", "udhr_tha.txt"},
380 {"ja", "udhr_jpn.txt"},
381 {"ko", "udhr_kor.txt"},
382 {"zh", "udhr_cmn_hans.txt"},
383 {"zh_Hant", "udhr_cmn_hant.txt"}
384 };
385
386 int32_t testCount = ARRAY_SIZE(testCases);
387
388 for (int32_t t = 0; t < testCount; t += 1) {
389 int32_t len = 0;
390 char *resolvedFileName = NULL;
391 const char *encoding = NULL;
392 UCHARBUF *ucharBuf = NULL;
393
394 ucbuf_resolveFileName(udhrPath, testCases[t].file, NULL, &len, &status);
395 resolvedFileName = NEW_ARRAY(char, len);
396
397 if(resolvedFileName == NULL){
398 continue;
399 }
400
401 if(status == U_BUFFER_OVERFLOW_ERROR){
402 status = U_ZERO_ERROR;
403 }
404
405 ucbuf_resolveFileName(udhrPath, testCases[t].file, resolvedFileName, &len, &status);
406 ucharBuf = ucbuf_open(resolvedFileName, &encoding, TRUE, FALSE, &status);
407
408 DELETE_ARRAY(resolvedFileName);
409
410 if(U_FAILURE(status)){
411 infoln("Could not open the input file %s. Test skipped\n", testCases[t].file);
412 continue;
413 }
414
415 int32_t targetLen = 0;
416 const UChar *target = ucbuf_getBuffer(ucharBuf, &targetLen, &status);
417
418 /* The first line of the file contains the pattern */
419 int32_t start = 0, end = 0, plen = 0;
420
421 for(end = start; ; end += 1) {
422 UChar ch = target[end];
423
424 if (ch == 0x000A || ch == 0x000D || ch == 0x2028) {
425 break;
426 }
427 }
428
429 plen = end - start;
430
431 UChar *pattern = NEW_ARRAY(UChar, plen);
432 for (int32_t i = 0; i < plen; i += 1) {
433 pattern[i] = target[start++];
434 }
435
436 int32_t offset = 0;
437 UCollator *coll = ucol_open(testCases[t].locale, &status);
438 UCD *ucd = NULL;
439 BMS *bms = NULL;
440
441 if (U_FAILURE(status)) {
442 errln("Could not open collator for %s", testCases[t].locale);
443 goto delete_collator;
444 }
445
446 ucd = ucd_open(coll, &status);
447
448 if (U_FAILURE(status)) {
449 errln("Could not open CollData object for %s", testCases[t].locale);
450 goto delete_ucd;
451 }
452
453 bms = bms_open(ucd, pattern, plen, target, targetLen, &status);
454
455 if (U_FAILURE(status)) {
456 errln("Could not open search object for %s", testCases[t].locale);
457 goto delete_bms;
458 }
459
460 start = end = -1;
461 while (bms_search(bms, offset, &start, &end)) {
462 offset = end;
463 }
464
465 if (offset == 0) {
466 errln("Could not find pattern - locale: %s, file: %s ", testCases[t].locale, testCases[t].file);
467 }
468
469 delete_bms:
470 bms_close(bms);
471
472 delete_ucd:
473 ucd_close(ucd);
474
475 delete_collator:
476 ucol_close(coll);
477
478 DELETE_ARRAY(pattern);
479 ucbuf_close(ucharBuf);
480 }
481
482 ucd_flushCache();
483 }
484
bmSearchTest()485 void SSearchTest::bmSearchTest()
486 {
487 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
488 UErrorCode status = U_ZERO_ERROR;
489 char path[PATH_BUFFER_SIZE];
490 const char *testFilePath = getPath(path, "ssearch.xml");
491
492 if (testFilePath == NULL) {
493 return; /* Couldn't get path: error message already output. */
494 }
495
496 UXMLParser *parser = UXMLParser::createParser(status);
497 TEST_ASSERT_SUCCESS(status);
498 UXMLElement *root = parser->parseFile(testFilePath, status);
499 TEST_ASSERT_SUCCESS(status);
500 if (U_FAILURE(status)) {
501 return;
502 }
503
504 const UnicodeString *debugTestCase = root->getAttribute("debug");
505 if (debugTestCase != NULL) {
506 // setenv("USEARCH_DEBUG", "1", 1);
507 }
508
509
510 const UXMLElement *testCase;
511 int32_t tc = 0;
512
513 while((testCase = root->nextChildElement(tc)) != NULL) {
514
515 if (testCase->getTagName().compare("test-case") != 0) {
516 errln("ssearch, unrecognized XML Element in test file");
517 continue;
518 }
519 const UnicodeString *id = testCase->getAttribute("id");
520 *testId = 0;
521 if (id != NULL) {
522 id->extract(0, id->length(), testId, sizeof(testId), US_INV);
523 }
524
525 // If debugging test case has been specified and this is not it, skip to next.
526 if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) {
527 continue;
528 }
529 //
530 // Get the requested collation strength.
531 // Default is tertiary if the XML attribute is missing from the test case.
532 //
533 const UnicodeString *strength = testCase->getAttribute("strength");
534 UColAttributeValue collatorStrength = UCOL_PRIMARY;
535 if (strength==NULL) { collatorStrength = UCOL_TERTIARY;}
536 else if (*strength=="PRIMARY") { collatorStrength = UCOL_PRIMARY;}
537 else if (*strength=="SECONDARY") { collatorStrength = UCOL_SECONDARY;}
538 else if (*strength=="TERTIARY") { collatorStrength = UCOL_TERTIARY;}
539 else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;}
540 else if (*strength=="IDENTICAL") { collatorStrength = UCOL_IDENTICAL;}
541 else {
542 // Bogus value supplied for strength. Shouldn't happen, even from
543 // typos, if the XML source has been validated.
544 // This assert is a little deceiving in that strength can be
545 // any of the allowed values, not just TERTIARY, but it will
546 // do the job of getting the error output.
547 TEST_ASSERT(*strength=="TERTIARY")
548 }
549
550 //
551 // Get the collator normalization flag. Default is UCOL_OFF.
552 //
553 UColAttributeValue normalize = UCOL_OFF;
554 const UnicodeString *norm = testCase->getAttribute("norm");
555 TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF");
556 if (norm!=NULL && *norm=="ON") {
557 normalize = UCOL_ON;
558 }
559
560 //
561 // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE.
562 //
563 UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE;
564 const UnicodeString *alt = testCase->getAttribute("alternate_handling");
565 TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE");
566 if (alt != NULL && *alt == "SHIFTED") {
567 alternateHandling = UCOL_SHIFTED;
568 }
569
570 const UnicodeString defLocale("en");
571 char clocale[100];
572 const UnicodeString *locale = testCase->getAttribute("locale");
573 if (locale == NULL || locale->length()==0) {
574 locale = &defLocale;
575 };
576 locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL);
577
578
579 UnicodeString text;
580 UnicodeString target;
581 UnicodeString pattern;
582 int32_t expectedMatchStart = -1;
583 int32_t expectedMatchLimit = -1;
584 const UXMLElement *n;
585 int32_t nodeCount = 0;
586
587 n = testCase->getChildElement("pattern");
588 TEST_ASSERT(n != NULL);
589 if (n==NULL) {
590 continue;
591 }
592 text = n->getText(FALSE);
593 text = text.unescape();
594 pattern.append(text);
595 nodeCount++;
596
597 n = testCase->getChildElement("pre");
598 if (n!=NULL) {
599 text = n->getText(FALSE);
600 text = text.unescape();
601 target.append(text);
602 nodeCount++;
603 }
604
605 n = testCase->getChildElement("m");
606 if (n!=NULL) {
607 expectedMatchStart = target.length();
608 text = n->getText(FALSE);
609 text = text.unescape();
610 target.append(text);
611 expectedMatchLimit = target.length();
612 nodeCount++;
613 }
614
615 n = testCase->getChildElement("post");
616 if (n!=NULL) {
617 text = n->getText(FALSE);
618 text = text.unescape();
619 target.append(text);
620 nodeCount++;
621 }
622
623 // Check that there weren't extra things in the XML
624 TEST_ASSERT(nodeCount == testCase->countChildren());
625
626 // Open a collator and StringSearch based on the parameters
627 // obtained from the XML.
628 //
629 status = U_ZERO_ERROR;
630 UCollator *collator = ucol_open(clocale, &status);
631 ucol_setStrength(collator, collatorStrength);
632 ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, normalize, &status);
633 ucol_setAttribute(collator, UCOL_ALTERNATE_HANDLING, alternateHandling, &status);
634 UCD *ucd = ucd_open(collator, &status);
635 BMS *bms = bms_open(ucd, pattern.getBuffer(), pattern.length(), target.getBuffer(), target.length(), &status);
636
637 TEST_ASSERT_SUCCESS(status);
638 if (U_FAILURE(status)) {
639 bms_close(bms);
640 ucd_close(ucd);
641 ucol_close(collator);
642 continue;
643 }
644
645 int32_t foundStart = 0;
646 int32_t foundLimit = 0;
647 UBool foundMatch;
648
649 //
650 // Do the search, check the match result against the expected results.
651 //
652 foundMatch = bms_search(bms, 0, &foundStart, &foundLimit);
653 //TEST_ASSERT_SUCCESS(status);
654 if ((foundMatch && expectedMatchStart < 0) ||
655 (foundStart != expectedMatchStart) ||
656 (foundLimit != expectedMatchLimit)) {
657 TEST_ASSERT(FALSE); // ouput generic error position
658 infoln("Found, expected match start = %d, %d \n"
659 "Found, expected match limit = %d, %d",
660 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
661 }
662
663 bms_close(bms);
664 ucd_close(ucd);
665 ucol_close(collator);
666 }
667
668 ucd_flushCache();
669 delete root;
670 delete parser;
671 #endif
672 }
673
674 struct Order
675 {
676 int32_t order;
677 int32_t lowOffset;
678 int32_t highOffset;
679 };
680
681 class OrderList
682 {
683 public:
684 OrderList();
685 OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset = 0);
686 ~OrderList();
687
688 int32_t size(void) const;
689 void add(int32_t order, int32_t low, int32_t high);
690 const Order *get(int32_t index) const;
691 int32_t getLowOffset(int32_t index) const;
692 int32_t getHighOffset(int32_t index) const;
693 int32_t getOrder(int32_t index) const;
694 void reverse(void);
695 UBool compare(const OrderList &other) const;
696 UBool matchesAt(int32_t offset, const OrderList &other) const;
697
698 private:
699 Order *list;
700 int32_t listMax;
701 int32_t listSize;
702 };
703
OrderList()704 OrderList::OrderList()
705 : list(NULL), listMax(16), listSize(0)
706 {
707 list = new Order[listMax];
708 }
709
OrderList(UCollator * coll,const UnicodeString & string,int32_t stringOffset)710 OrderList::OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset)
711 : list(NULL), listMax(16), listSize(0)
712 {
713 UErrorCode status = U_ZERO_ERROR;
714 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
715 uint32_t strengthMask = 0;
716 int32_t order, low, high;
717
718 switch (ucol_getStrength(coll))
719 {
720 default:
721 strengthMask |= UCOL_TERTIARYORDERMASK;
722 /* fall through */
723
724 case UCOL_SECONDARY:
725 strengthMask |= UCOL_SECONDARYORDERMASK;
726 /* fall through */
727
728 case UCOL_PRIMARY:
729 strengthMask |= UCOL_PRIMARYORDERMASK;
730 }
731
732 list = new Order[listMax];
733
734 ucol_setOffset(elems, stringOffset, &status);
735
736 do {
737 low = ucol_getOffset(elems);
738 order = ucol_next(elems, &status);
739 high = ucol_getOffset(elems);
740
741 if (order != UCOL_NULLORDER) {
742 order &= strengthMask;
743 }
744
745 if (order != UCOL_IGNORABLE) {
746 add(order, low, high);
747 }
748 } while (order != UCOL_NULLORDER);
749
750 ucol_closeElements(elems);
751 }
752
~OrderList()753 OrderList::~OrderList()
754 {
755 delete[] list;
756 }
757
add(int32_t order,int32_t low,int32_t high)758 void OrderList::add(int32_t order, int32_t low, int32_t high)
759 {
760 if (listSize >= listMax) {
761 listMax *= 2;
762
763 Order *newList = new Order[listMax];
764
765 uprv_memcpy(newList, list, listSize * sizeof(Order));
766 delete[] list;
767 list = newList;
768 }
769
770 list[listSize].order = order;
771 list[listSize].lowOffset = low;
772 list[listSize].highOffset = high;
773
774 listSize += 1;
775 }
776
get(int32_t index) const777 const Order *OrderList::get(int32_t index) const
778 {
779 if (index >= listSize) {
780 return NULL;
781 }
782
783 return &list[index];
784 }
785
getLowOffset(int32_t index) const786 int32_t OrderList::getLowOffset(int32_t index) const
787 {
788 const Order *order = get(index);
789
790 if (order != NULL) {
791 return order->lowOffset;
792 }
793
794 return -1;
795 }
796
getHighOffset(int32_t index) const797 int32_t OrderList::getHighOffset(int32_t index) const
798 {
799 const Order *order = get(index);
800
801 if (order != NULL) {
802 return order->highOffset;
803 }
804
805 return -1;
806 }
807
getOrder(int32_t index) const808 int32_t OrderList::getOrder(int32_t index) const
809 {
810 const Order *order = get(index);
811
812 if (order != NULL) {
813 return order->order;
814 }
815
816 return UCOL_NULLORDER;
817 }
818
size() const819 int32_t OrderList::size() const
820 {
821 return listSize;
822 }
823
reverse()824 void OrderList::reverse()
825 {
826 for(int32_t f = 0, b = listSize - 1; f < b; f += 1, b -= 1) {
827 Order swap = list[b];
828
829 list[b] = list[f];
830 list[f] = swap;
831 }
832 }
833
compare(const OrderList & other) const834 UBool OrderList::compare(const OrderList &other) const
835 {
836 if (listSize != other.listSize) {
837 return FALSE;
838 }
839
840 for(int32_t i = 0; i < listSize; i += 1) {
841 if (list[i].order != other.list[i].order ||
842 list[i].lowOffset != other.list[i].lowOffset ||
843 list[i].highOffset != other.list[i].highOffset) {
844 return FALSE;
845 }
846 }
847
848 return TRUE;
849 }
850
matchesAt(int32_t offset,const OrderList & other) const851 UBool OrderList::matchesAt(int32_t offset, const OrderList &other) const
852 {
853 // NOTE: sizes include the NULLORDER, which we don't want to compare.
854 int32_t otherSize = other.size() - 1;
855
856 if (listSize - 1 - offset < otherSize) {
857 return FALSE;
858 }
859
860 for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) {
861 if (getOrder(i) != other.getOrder(j)) {
862 return FALSE;
863 }
864 }
865
866 return TRUE;
867 }
868
printOffsets(char * buffer,OrderList & list)869 static char *printOffsets(char *buffer, OrderList &list)
870 {
871 int32_t size = list.size();
872 char *s = buffer;
873
874 for(int32_t i = 0; i < size; i += 1) {
875 const Order *order = list.get(i);
876
877 if (i != 0) {
878 s += sprintf(s, ", ");
879 }
880
881 s += sprintf(s, "(%d, %d)", order->lowOffset, order->highOffset);
882 }
883
884 return buffer;
885 }
886
printOrders(char * buffer,OrderList & list)887 static char *printOrders(char *buffer, OrderList &list)
888 {
889 int32_t size = list.size();
890 char *s = buffer;
891
892 for(int32_t i = 0; i < size; i += 1) {
893 const Order *order = list.get(i);
894
895 if (i != 0) {
896 s += sprintf(s, ", ");
897 }
898
899 s += sprintf(s, "%8.8X", order->order);
900 }
901
902 return buffer;
903 }
904
offsetTest()905 void SSearchTest::offsetTest()
906 {
907 static const UVersionInfo icu49 = { 4, 9, 0, 0 };
908 const char *test[] = {
909 // The sequence \u0FB3\u0F71\u0F71\u0F80 contains a discontiguous
910 // contraction (\u0FB3\u0F71\u0F80) logically followed by \u0F71.
911 "\\u1E33\\u0FB3\\u0F71\\u0F71\\u0F80\\uD835\\uDF6C\\u01B0",
912
913 "\\ua191\\u16ef\\u2036\\u017a",
914
915 #if 0
916 // This results in a complex interaction between contraction,
917 // expansion and normalization that confuses the backwards offset fixups.
918 "\\u0F7F\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
919 #endif
920
921 "\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
922 "\\u07E9\\u07EA\\u07F1\\u07F2\\u07F3",
923
924 "\\u02FE\\u02FF"
925 "\\u0300\\u0301\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u0309\\u030A\\u030B\\u030C\\u030D\\u030E\\u030F"
926 "\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\\u031A\\u031B\\u031C\\u031D\\u031E\\u031F"
927 "\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F"
928 "\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\\u033A\\u033B\\u033C\\u033D\\u033E\\u033F"
929 "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\\u034A\\u034B\\u034C\\u034D\\u034E", // currently not working, see #8081
930
931 "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318", // currently not working, see #8081
932 "a\\u02FF\\u0301\\u0316", // currently not working, see #8081
933 "a\\u02FF\\u0316\\u0301",
934 "a\\u0430\\u0301\\u0316",
935 "a\\u0430\\u0316\\u0301",
936 "abc\\u0E41\\u0301\\u0316",
937 "abc\\u0E41\\u0316\\u0301",
938 "\\u0E41\\u0301\\u0316",
939 "\\u0E41\\u0316\\u0301",
940 "a\\u0301\\u0316",
941 "a\\u0316\\u0301",
942 "\\uAC52\\uAC53",
943 "\\u34CA\\u34CB",
944 "\\u11ED\\u11EE",
945 "\\u30C3\\u30D0",
946 "p\\u00E9ch\\u00E9",
947 "a\\u0301\\u0325",
948 "a\\u0300\\u0325",
949 "a\\u0325\\u0300",
950 "A\\u0323\\u0300B",
951 "A\\u0300\\u0323B",
952 "A\\u0301\\u0323B",
953 "A\\u0302\\u0301\\u0323B",
954 "abc",
955 "ab\\u0300c",
956 "ab\\u0300\\u0323c",
957 " \\uD800\\uDC00\\uDC00",
958 "a\\uD800\\uDC00\\uDC00",
959 "A\\u0301\\u0301",
960 "A\\u0301\\u0323",
961 "A\\u0301\\u0323B",
962 "B\\u0301\\u0323C",
963 "A\\u0300\\u0323B",
964 "\\u0301A\\u0301\\u0301",
965 "abcd\\r\\u0301",
966 "p\\u00EAche",
967 "pe\\u0302che",
968 };
969
970 int32_t testCount = ARRAY_SIZE(test);
971 UErrorCode status = U_ZERO_ERROR;
972 RuleBasedCollator *col = (RuleBasedCollator *) Collator::createInstance(Locale::getEnglish(), status);
973 if (U_FAILURE(status)) {
974 errcheckln(status, "Failed to create collator in offsetTest! - %s", u_errorName(status));
975 return;
976 }
977 char buffer[4096]; // A bit of a hack... just happens to be long enough for all the test cases...
978 // We could allocate one that's the right size by (CE_count * 10) + 2
979 // 10 chars is enough room for 8 hex digits plus ", ". 2 extra chars for "[" and "]"
980
981 col->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
982
983 for(int32_t i = 0; i < testCount; i += 1) {
984 if (!isICUVersionAtLeast(icu49) && i>=4 && i<=6) {
985 continue; // timebomb until ticket #8080 is resolved
986 }
987 UnicodeString ts = CharsToUnicodeString(test[i]);
988 CollationElementIterator *iter = col->createCollationElementIterator(ts);
989 OrderList forwardList;
990 OrderList backwardList;
991 int32_t order, low, high;
992
993 do {
994 low = iter->getOffset();
995 order = iter->next(status);
996 high = iter->getOffset();
997
998 forwardList.add(order, low, high);
999 } while (order != CollationElementIterator::NULLORDER);
1000
1001 iter->reset();
1002 iter->setOffset(ts.length(), status);
1003
1004 backwardList.add(CollationElementIterator::NULLORDER, iter->getOffset(), iter->getOffset());
1005
1006 do {
1007 high = iter->getOffset();
1008 order = iter->previous(status);
1009 low = iter->getOffset();
1010
1011 if (order == CollationElementIterator::NULLORDER) {
1012 break;
1013 }
1014
1015 backwardList.add(order, low, high);
1016 } while (TRUE);
1017
1018 backwardList.reverse();
1019
1020 if (forwardList.compare(backwardList)) {
1021 logln("Works with \"%s\"", test[i]);
1022 logln("Forward offsets: [%s]", printOffsets(buffer, forwardList));
1023 // logln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
1024
1025 logln("Forward CEs: [%s]", printOrders(buffer, forwardList));
1026 // logln("Backward CEs: [%s]", printOrders(buffer, backwardList));
1027
1028 logln();
1029 } else {
1030 errln("Fails with \"%s\"", test[i]);
1031 infoln("Forward offsets: [%s]", printOffsets(buffer, forwardList));
1032 infoln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
1033
1034 infoln("Forward CEs: [%s]", printOrders(buffer, forwardList));
1035 infoln("Backward CEs: [%s]", printOrders(buffer, backwardList));
1036
1037 infoln();
1038 }
1039 delete iter;
1040 }
1041 delete col;
1042 }
1043
1044 #if 0
1045 static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer)
1046 {
1047 for(int32_t i = 0; i < string.length(); i += 1) {
1048 UChar32 ch = string.char32At(i);
1049
1050 if (ch >= 0x0020 && ch <= 0x007F) {
1051 if (ch == 0x005C) {
1052 buffer.append("\\\\");
1053 } else {
1054 buffer.append(ch);
1055 }
1056 } else {
1057 char cbuffer[12];
1058
1059 if (ch <= 0xFFFFL) {
1060 sprintf(cbuffer, "\\u%4.4X", ch);
1061 } else {
1062 sprintf(cbuffer, "\\U%8.8X", ch);
1063 }
1064
1065 buffer.append(cbuffer);
1066 }
1067
1068 if (ch >= 0x10000L) {
1069 i += 1;
1070 }
1071 }
1072
1073 return buffer;
1074 }
1075 #endif
1076
1077 #if 1
1078
1079 struct PCE
1080 {
1081 uint64_t ce;
1082 int32_t lowOffset;
1083 int32_t highOffset;
1084 };
1085
1086 class PCEList
1087 {
1088 public:
1089 PCEList(UCollator *coll, const UnicodeString &string);
1090 ~PCEList();
1091
1092 int32_t size() const;
1093
1094 const PCE *get(int32_t index) const;
1095
1096 int32_t getLowOffset(int32_t index) const;
1097 int32_t getHighOffset(int32_t index) const;
1098 uint64_t getOrder(int32_t index) const;
1099
1100 UBool matchesAt(int32_t offset, const PCEList &other) const;
1101
1102 uint64_t operator[](int32_t index) const;
1103
1104 private:
1105 void add(uint64_t ce, int32_t low, int32_t high);
1106
1107 PCE *list;
1108 int32_t listMax;
1109 int32_t listSize;
1110 };
1111
PCEList(UCollator * coll,const UnicodeString & string)1112 PCEList::PCEList(UCollator *coll, const UnicodeString &string)
1113 {
1114 UErrorCode status = U_ZERO_ERROR;
1115 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
1116 uint64_t order;
1117 int32_t low, high;
1118
1119 list = new PCE[listMax];
1120
1121 ucol_setOffset(elems, 0, &status);
1122
1123 do {
1124 order = ucol_nextProcessed(elems, &low, &high, &status);
1125 add(order, low, high);
1126 } while (order != UCOL_PROCESSED_NULLORDER);
1127
1128 ucol_closeElements(elems);
1129 }
1130
~PCEList()1131 PCEList::~PCEList()
1132 {
1133 delete[] list;
1134 }
1135
add(uint64_t order,int32_t low,int32_t high)1136 void PCEList::add(uint64_t order, int32_t low, int32_t high)
1137 {
1138 if (listSize >= listMax) {
1139 listMax *= 2;
1140
1141 PCE *newList = new PCE[listMax];
1142
1143 uprv_memcpy(newList, list, listSize * sizeof(Order));
1144 delete[] list;
1145 list = newList;
1146 }
1147
1148 list[listSize].ce = order;
1149 list[listSize].lowOffset = low;
1150 list[listSize].highOffset = high;
1151
1152 listSize += 1;
1153 }
1154
get(int32_t index) const1155 const PCE *PCEList::get(int32_t index) const
1156 {
1157 if (index >= listSize) {
1158 return NULL;
1159 }
1160
1161 return &list[index];
1162 }
1163
getLowOffset(int32_t index) const1164 int32_t PCEList::getLowOffset(int32_t index) const
1165 {
1166 const PCE *pce = get(index);
1167
1168 if (pce != NULL) {
1169 return pce->lowOffset;
1170 }
1171
1172 return -1;
1173 }
1174
getHighOffset(int32_t index) const1175 int32_t PCEList::getHighOffset(int32_t index) const
1176 {
1177 const PCE *pce = get(index);
1178
1179 if (pce != NULL) {
1180 return pce->highOffset;
1181 }
1182
1183 return -1;
1184 }
1185
getOrder(int32_t index) const1186 uint64_t PCEList::getOrder(int32_t index) const
1187 {
1188 const PCE *pce = get(index);
1189
1190 if (pce != NULL) {
1191 return pce->ce;
1192 }
1193
1194 return UCOL_PROCESSED_NULLORDER;
1195 }
1196
size() const1197 int32_t PCEList::size() const
1198 {
1199 return listSize;
1200 }
1201
matchesAt(int32_t offset,const PCEList & other) const1202 UBool PCEList::matchesAt(int32_t offset, const PCEList &other) const
1203 {
1204 // NOTE: sizes include the NULLORDER, which we don't want to compare.
1205 int32_t otherSize = other.size() - 1;
1206
1207 if (listSize - 1 - offset < otherSize) {
1208 return FALSE;
1209 }
1210
1211 for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) {
1212 if (getOrder(i) != other.getOrder(j)) {
1213 return FALSE;
1214 }
1215 }
1216
1217 return TRUE;
1218 }
1219
operator [](int32_t index) const1220 uint64_t PCEList::operator[](int32_t index) const
1221 {
1222 return getOrder(index);
1223 }
1224
boyerMooreTest()1225 void SSearchTest::boyerMooreTest()
1226 {
1227 UErrorCode status = U_ZERO_ERROR;
1228 UCollator *coll = NULL;
1229 CollData *data = NULL;
1230 const CEList* ce = NULL;
1231 const CEList* ce1 = NULL;
1232 UnicodeString lp = "fuss";
1233 UnicodeString sp = "fu\\u00DF";
1234 BoyerMooreSearch *longPattern = NULL;
1235 BoyerMooreSearch *shortPattern = NULL;
1236 UnicodeString targets[] = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball",
1237 "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF",
1238 "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"};
1239 int32_t start = -1, end = -1;
1240
1241 coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status);
1242 if (U_FAILURE(status)) {
1243 errcheckln(status, "Could not open collator. - %s", u_errorName(status));
1244 return;
1245 }
1246
1247 data = CollData::open(coll, status);
1248 if (U_FAILURE(status)) {
1249 errln("Could not open CollData object.");
1250 goto close_data;
1251 }
1252
1253 data->getDynamicClassID();
1254 if (U_FAILURE(status)) {
1255 errln("Could not get dynamic class ID of CollData.");
1256 goto close_patterns;
1257 }
1258
1259 data->getStaticClassID();
1260 if (U_FAILURE(status)) {
1261 errln("Could not get static class ID of CollData.");
1262 goto close_patterns;
1263 }
1264
1265 longPattern = new BoyerMooreSearch(data, lp.unescape(), NULL, status);
1266 shortPattern = new BoyerMooreSearch(data, sp.unescape(), NULL, status);
1267 if (U_FAILURE(status)) {
1268 errln("Could not create pattern objects.");
1269 goto close_patterns;
1270 }
1271
1272 longPattern->getBadCharacterTable();
1273 shortPattern->getBadCharacterTable();
1274 if (U_FAILURE(status)) {
1275 errln("Could not get bad character table.");
1276 goto close_patterns;
1277 }
1278
1279 longPattern->getGoodSuffixTable();
1280 shortPattern->getGoodSuffixTable();
1281 if (U_FAILURE(status)) {
1282 errln("Could not get good suffix table.");
1283 goto close_patterns;
1284 }
1285
1286 longPattern->getDynamicClassID();
1287 shortPattern->getDynamicClassID();
1288 if (U_FAILURE(status)) {
1289 errln("Could not get dynamic class ID of BoyerMooreSearch.");
1290 goto close_patterns;
1291 }
1292
1293 longPattern->getStaticClassID();
1294 shortPattern->getStaticClassID();
1295 if (U_FAILURE(status)) {
1296 errln("Could not get static class ID of BoyerMooreSearch.");
1297 goto close_patterns;
1298 }
1299
1300 longPattern->getData();
1301 shortPattern->getData();
1302 if (U_FAILURE(status)) {
1303 errln("Could not get collate data.");
1304 goto close_patterns;
1305 }
1306
1307 ce = longPattern->getPatternCEs();
1308 ce1 = shortPattern->getPatternCEs();
1309 if (U_FAILURE(status)) {
1310 errln("Could not get pattern CEs.");
1311 goto close_patterns;
1312 }
1313
1314 ce->getDynamicClassID();
1315 ce1->getDynamicClassID();
1316 if (U_FAILURE(status)) {
1317 errln("Could not get dynamic class ID of CEList.");
1318 goto close_patterns;
1319 }
1320
1321 ce->getStaticClassID();
1322 ce1->getStaticClassID();
1323 if (U_FAILURE(status)) {
1324 errln("Could not get static class ID of CEList.");
1325 goto close_patterns;
1326 }
1327
1328 if(data->minLengthInChars(ce,0) != 3){
1329 errln("Minimal Length in Characters for 'data' with 'ce' was suppose to give 3.");
1330 goto close_patterns;
1331 }
1332
1333 if(data->minLengthInChars(ce1,0) != 3){
1334 errln("Minimal Length in Characters for 'data' with 'ce1' was suppose to give 3.");
1335 goto close_patterns;
1336 }
1337
1338 for (uint32_t t = 0; t < (sizeof(targets)/sizeof(targets[0])); t += 1) {
1339 UnicodeString target = targets[t].unescape();
1340
1341 longPattern->setTargetString(&target, status);
1342 if (longPattern->search(0, start, end)) {
1343 logln("Test %d: found long pattern at [%d, %d].", t, start, end);
1344 } else {
1345 errln("Test %d: did not find long pattern.", t);
1346 }
1347
1348 shortPattern->setTargetString(&target, status);
1349 if (shortPattern->search(0, start, end)) {
1350 logln("Test %d: found short pattern at [%d, %d].", t, start, end);
1351 } else {
1352 errln("Test %d: did not find short pattern.", t);
1353 }
1354
1355 if(longPattern->empty()){
1356 errln("Test %d: Long pattern should not have been empty.");
1357 }
1358
1359 if(shortPattern->empty()){
1360 errln("Test %d: Short pattern should not have been empty.");
1361 }
1362 }
1363
1364 close_patterns:
1365 delete shortPattern;
1366 delete longPattern;
1367
1368 close_data:
1369 CollData::close(data);
1370 ucol_close(coll);
1371 }
1372
bmsTest()1373 void SSearchTest::bmsTest()
1374 {
1375 UErrorCode status = U_ZERO_ERROR;
1376 UCollator *coll = NULL;
1377 UCD *data = NULL;
1378 UnicodeString lp = "fuss";
1379 UnicodeString lpu = lp.unescape();
1380 UnicodeString sp = "fu\\u00DF";
1381 UnicodeString spu = sp.unescape();
1382 BMS *longPattern = NULL;
1383 BMS *shortPattern = NULL;
1384 UnicodeString targets[] = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball",
1385 "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF",
1386 "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"};
1387 int32_t start = -1, end = -1;
1388
1389 coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status);
1390 if (U_FAILURE(status)) {
1391 errcheckln(status, "Could not open collator. - %s", u_errorName(status));
1392 return;
1393 }
1394
1395 data = ucd_open(coll, &status);
1396 if (U_FAILURE(status)) {
1397 errln("Could not open CollData object.");
1398 goto close_data;
1399 }
1400
1401 longPattern = bms_open(data, lpu.getBuffer(), lpu.length(), NULL, 0, &status);
1402 shortPattern = bms_open(data, spu.getBuffer(), spu.length(), NULL, 0, &status);
1403 if (U_FAILURE(status)) {
1404 errln("Couldn't open pattern objects.");
1405 goto close_patterns;
1406 }
1407
1408 for (uint32_t t = 0; t < (sizeof(targets)/sizeof(targets[0])); t += 1) {
1409 UnicodeString target = targets[t].unescape();
1410
1411 bms_setTargetString(longPattern, target.getBuffer(), target.length(), &status);
1412 if (bms_search(longPattern, 0, &start, &end)) {
1413 logln("Test %d: found long pattern at [%d, %d].", t, start, end);
1414 } else {
1415 errln("Test %d: did not find long pattern.", t);
1416 }
1417
1418 bms_setTargetString(shortPattern, target.getBuffer(), target.length(), &status);
1419 if (bms_search(shortPattern, 0, &start, &end)) {
1420 logln("Test %d: found short pattern at [%d, %d].", t, start, end);
1421 } else {
1422 errln("Test %d: did not find short pattern.", t);
1423 }
1424 }
1425
1426 /* Add better coverage for bms code. */
1427 if(bms_empty(longPattern)) {
1428 errln("FAIL: longgPattern is empty.");
1429 }
1430
1431 if (!bms_getData(longPattern)) {
1432 errln("FAIL: bms_getData returned NULL.");
1433 }
1434
1435 if (!ucd_getCollator(data)) {
1436 errln("FAIL: ucd_getCollator returned NULL.");
1437 }
1438
1439 close_patterns:
1440 bms_close(shortPattern);
1441 bms_close(longPattern);
1442
1443 close_data:
1444 ucd_close(data);
1445 ucd_freeCache();
1446 ucol_close(coll);
1447 }
1448
goodSuffixTest()1449 void SSearchTest::goodSuffixTest()
1450 {
1451 UErrorCode status = U_ZERO_ERROR;
1452 UCollator *coll = NULL;
1453 CollData *data = NULL;
1454 UnicodeString pat = /*"gcagagag"*/ "fxeld";
1455 UnicodeString target = /*"gcatcgcagagagtatacagtacg"*/ "cloveldfxeld";
1456 BoyerMooreSearch *pattern = NULL;
1457 int32_t start = -1, end = -1;
1458
1459 coll = ucol_open(NULL, &status);
1460 if (U_FAILURE(status)) {
1461 errcheckln(status, "Couldn't open collator. - %s", u_errorName(status));
1462 return;
1463 }
1464
1465 data = CollData::open(coll, status);
1466 if (U_FAILURE(status)) {
1467 errln("Couldn't open CollData object.");
1468 goto close_data;
1469 }
1470
1471 pattern = new BoyerMooreSearch(data, pat, &target, status);
1472 if (U_FAILURE(status)) {
1473 errln("Couldn't open pattern object.");
1474 goto close_pattern;
1475 }
1476
1477 if (pattern->search(0, start, end)) {
1478 logln("Found pattern at [%d, %d].", start, end);
1479 } else {
1480 errln("Did not find pattern.");
1481 }
1482
1483 close_pattern:
1484 delete pattern;
1485
1486 close_data:
1487 CollData::close(data);
1488 ucol_close(coll);
1489 }
1490
1491 //
1492 // searchTime() A quick and dirty performance test for string search.
1493 // Probably doesn't really belong as part of intltest, but it
1494 // does check that the search succeeds, and gets the right result,
1495 // so it serves as a functionality test also.
1496 //
1497 // To run as a perf test, up the loop count, select by commenting
1498 // and uncommenting in the code the operation to be measured,
1499 // rebuild, and measure the running time of this test alone.
1500 //
1501 // time LD_LIBRARY_PATH=whatever ./intltest collate/SSearchTest/searchTime
1502 //
searchTime()1503 void SSearchTest::searchTime() {
1504 static const char *longishText =
1505 "Whylom, as olde stories tellen us,\n"
1506 "Ther was a duk that highte Theseus:\n"
1507 "Of Athenes he was lord and governour,\n"
1508 "And in his tyme swich a conquerour,\n"
1509 "That gretter was ther noon under the sonne.\n"
1510 "Ful many a riche contree hadde he wonne;\n"
1511 "What with his wisdom and his chivalrye,\n"
1512 "He conquered al the regne of Femenye,\n"
1513 "That whylom was y-cleped Scithia;\n"
1514 "And weddede the quene Ipolita,\n"
1515 "And broghte hir hoom with him in his contree\n"
1516 "With muchel glorie and greet solempnitee,\n"
1517 "And eek hir yonge suster Emelye.\n"
1518 "And thus with victorie and with melodye\n"
1519 "Lete I this noble duk to Athenes ryde,\n"
1520 "And al his hoost, in armes, him bisyde.\n"
1521 "And certes, if it nere to long to here,\n"
1522 "I wolde han told yow fully the manere,\n"
1523 "How wonnen was the regne of Femenye\n"
1524 "By Theseus, and by his chivalrye;\n"
1525 "And of the grete bataille for the nones\n"
1526 "Bitwixen Athen's and Amazones;\n"
1527 "And how asseged was Ipolita,\n"
1528 "The faire hardy quene of Scithia;\n"
1529 "And of the feste that was at hir weddinge,\n"
1530 "And of the tempest at hir hoom-cominge;\n"
1531 "But al that thing I moot as now forbere.\n"
1532 "I have, God woot, a large feeld to ere,\n"
1533 "And wayke been the oxen in my plough.\n"
1534 "The remenant of the tale is long y-nough.\n"
1535 "I wol nat letten eek noon of this route;\n"
1536 "Lat every felawe telle his tale aboute,\n"
1537 "And lat see now who shal the soper winne;\n"
1538 "And ther I lefte, I wol ageyn biginne.\n"
1539 "This duk, of whom I make mencioun,\n"
1540 "When he was come almost unto the toun,\n"
1541 "In al his wele and in his moste pryde,\n"
1542 "He was war, as he caste his eye asyde,\n"
1543 "Wher that ther kneled in the hye weye\n"
1544 "A companye of ladies, tweye and tweye,\n"
1545 "Ech after other, clad in clothes blake; \n"
1546 "But swich a cry and swich a wo they make,\n"
1547 "That in this world nis creature livinge,\n"
1548 "That herde swich another weymentinge;\n"
1549 "And of this cry they nolde never stenten,\n"
1550 "Til they the reynes of his brydel henten.\n"
1551 "'What folk ben ye, that at myn hoomcominge\n"
1552 "Perturben so my feste with cryinge'?\n"
1553 "Quod Theseus, 'have ye so greet envye\n"
1554 "Of myn honour, that thus compleyne and crye? \n"
1555 "Or who hath yow misboden, or offended?\n"
1556 "And telleth me if it may been amended;\n"
1557 "And why that ye ben clothed thus in blak'?\n"
1558 "The eldest lady of hem alle spak,\n"
1559 "When she hadde swowned with a deedly chere,\n"
1560 "That it was routhe for to seen and here,\n"
1561 "And seyde: 'Lord, to whom Fortune hath yiven\n"
1562 "Victorie, and as a conquerour to liven,\n"
1563 "Noght greveth us your glorie and your honour;\n"
1564 "But we biseken mercy and socour.\n"
1565 "Have mercy on our wo and our distresse.\n"
1566 "Som drope of pitee, thurgh thy gentilesse,\n"
1567 "Up-on us wrecched wommen lat thou falle.\n"
1568 "For certes, lord, ther nis noon of us alle,\n"
1569 "That she nath been a duchesse or a quene;\n"
1570 "Now be we caitifs, as it is wel sene:\n"
1571 "Thanked be Fortune, and hir false wheel,\n"
1572 "That noon estat assureth to be weel.\n"
1573 "And certes, lord, t'abyden your presence,\n"
1574 "Here in the temple of the goddesse Clemence\n"
1575 "We han ben waytinge al this fourtenight;\n"
1576 "Now help us, lord, sith it is in thy might.\n"
1577 "I wrecche, which that wepe and waille thus,\n"
1578 "Was whylom wyf to king Capaneus,\n"
1579 "That starf at Thebes, cursed be that day!\n"
1580 "And alle we, that been in this array,\n"
1581 "And maken al this lamentacioun,\n"
1582 "We losten alle our housbondes at that toun,\n"
1583 "Whyl that the sege ther-aboute lay.\n"
1584 "And yet now th'olde Creon, weylaway!\n"
1585 "The lord is now of Thebes the citee, \n"
1586 "Fulfild of ire and of iniquitee,\n"
1587 "He, for despyt, and for his tirannye,\n"
1588 "To do the dede bodyes vileinye,\n"
1589 "Of alle our lordes, whiche that ben slawe,\n"
1590 "Hath alle the bodyes on an heep y-drawe,\n"
1591 "And wol nat suffren hem, by noon assent,\n"
1592 "Neither to been y-buried nor y-brent,\n"
1593 "But maketh houndes ete hem in despyt. zet'\n";
1594
1595 #define TEST_BOYER_MOORE 1
1596 const char *cPattern = "maketh houndes ete hem";
1597 //const char *cPattern = "Whylom";
1598 //const char *cPattern = "zet";
1599 const char *testId = "searchTime()"; // for error macros.
1600 UnicodeString target = longishText;
1601 UErrorCode status = U_ZERO_ERROR;
1602
1603
1604 LocalUCollatorPointer collator(ucol_open("en", &status));
1605 CollData *data = CollData::open(collator.getAlias(), status);
1606 if (U_FAILURE(status) || collator.isNull() || data == NULL) {
1607 errcheckln(status, "Unable to open UCollator or CollData. - %s", u_errorName(status));
1608 return;
1609 }
1610 //ucol_setStrength(collator.getAlias(), collatorStrength);
1611 //ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status);
1612 UnicodeString uPattern = cPattern;
1613 #ifndef TEST_BOYER_MOORE
1614 LocalUStringSearchPointer uss(usearch_openFromCollator(uPattern.getBuffer(), uPattern.length(),
1615 target.getBuffer(), target.length(),
1616 collator.getAlias(),
1617 NULL, // the break iterator
1618 &status));
1619 TEST_ASSERT_SUCCESS(status);
1620 #else
1621 BoyerMooreSearch bms(data, uPattern, &target, status);
1622 TEST_ASSERT_SUCCESS(status);
1623 #endif
1624
1625 // int32_t foundStart;
1626 // int32_t foundEnd;
1627 UBool found;
1628
1629 // Find the match position usgin strstr
1630 const char *pm = strstr(longishText, cPattern);
1631 TEST_ASSERT_M(pm!=NULL, "No pattern match with strstr");
1632 int32_t refMatchPos = (int32_t)(pm - longishText);
1633 int32_t icuMatchPos;
1634 int32_t icuMatchEnd;
1635 #ifndef TEST_BOYER_MOORE
1636 usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status);
1637 TEST_ASSERT_SUCCESS(status);
1638 #else
1639 found = bms.search(0, icuMatchPos, icuMatchEnd);
1640 #endif
1641 TEST_ASSERT_M(refMatchPos == icuMatchPos, "strstr and icu give different match positions.");
1642
1643 int32_t i;
1644 // int32_t j=0;
1645
1646 // Try loopcounts around 100000 to some millions, depending on the operation,
1647 // to get runtimes of at least several seconds.
1648 for (i=0; i<10000; i++) {
1649 #ifndef TEST_BOYER_MOORE
1650 found = usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status);
1651 #else
1652 found = bms.search(0, icuMatchPos, icuMatchEnd);
1653 #endif
1654 //TEST_ASSERT_SUCCESS(status);
1655 //TEST_ASSERT(found);
1656
1657 // usearch_setOffset(uss.getAlias(), 0, &status);
1658 // icuMatchPos = usearch_next(uss.getAlias(), &status);
1659
1660 // The i+j stuff is to confuse the optimizer and get it to actually leave the
1661 // call to strstr in place.
1662 //pm = strstr(longishText+j, cPattern);
1663 //j = (j + i)%5;
1664 }
1665
1666 //printf("%ld, %d\n", pm-longishText, j);
1667 #ifdef TEST_BOYER_MOORE
1668 CollData::close(data);
1669 #endif
1670 }
1671 #endif
1672
1673 //----------------------------------------------------------------------------------------
1674 //
1675 // Random Numbers. Similar to standard lib rand() and srand()
1676 // Not using library to
1677 // 1. Get same results on all platforms.
1678 // 2. Get access to current seed, to more easily reproduce failures.
1679 //
1680 //---------------------------------------------------------------------------------------
1681 static uint32_t m_seed = 1;
1682
m_rand()1683 static uint32_t m_rand()
1684 {
1685 m_seed = m_seed * 1103515245 + 12345;
1686 return (uint32_t)(m_seed/65536) % 32768;
1687 }
1688
1689 class Monkey
1690 {
1691 public:
1692 virtual void append(UnicodeString &test, UnicodeString &alternate) = 0;
1693
1694 protected:
1695 Monkey();
1696 virtual ~Monkey();
1697 };
1698
Monkey()1699 Monkey::Monkey()
1700 {
1701 // ook?
1702 }
1703
~Monkey()1704 Monkey::~Monkey()
1705 {
1706 // ook?
1707 }
1708
1709 class SetMonkey : public Monkey
1710 {
1711 public:
1712 SetMonkey(const USet *theSet);
1713 ~SetMonkey();
1714
1715 virtual void append(UnicodeString &test, UnicodeString &alternate);
1716
1717 private:
1718 const USet *set;
1719 };
1720
SetMonkey(const USet * theSet)1721 SetMonkey::SetMonkey(const USet *theSet)
1722 : Monkey(), set(theSet)
1723 {
1724 // ook?
1725 }
1726
~SetMonkey()1727 SetMonkey::~SetMonkey()
1728 {
1729 //ook...
1730 }
1731
append(UnicodeString & test,UnicodeString & alternate)1732 void SetMonkey::append(UnicodeString &test, UnicodeString &alternate)
1733 {
1734 int32_t size = uset_size(set);
1735 int32_t index = m_rand() % size;
1736 UChar32 ch = uset_charAt(set, index);
1737 UnicodeString str(ch);
1738
1739 test.append(str);
1740 alternate.append(str); // flip case, or some junk?
1741 }
1742
1743 class StringSetMonkey : public Monkey
1744 {
1745 public:
1746 StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData);
1747 ~StringSetMonkey();
1748
1749 void append(UnicodeString &testCase, UnicodeString &alternate);
1750
1751 private:
1752 UnicodeString &generateAlternative(const UnicodeString &testCase, UnicodeString &alternate);
1753
1754 const USet *set;
1755 UCollator *coll;
1756 CollData *collData;
1757 };
1758
StringSetMonkey(const USet * theSet,UCollator * theCollator,CollData * theCollData)1759 StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData)
1760 : Monkey(), set(theSet), coll(theCollator), collData(theCollData)
1761 {
1762 // ook.
1763 }
1764
~StringSetMonkey()1765 StringSetMonkey::~StringSetMonkey()
1766 {
1767 // ook?
1768 }
1769
append(UnicodeString & testCase,UnicodeString & alternate)1770 void StringSetMonkey::append(UnicodeString &testCase, UnicodeString &alternate)
1771 {
1772 int32_t itemCount = uset_getItemCount(set), len = 0;
1773 int32_t index = m_rand() % itemCount;
1774 UChar32 rangeStart = 0, rangeEnd = 0;
1775 UChar buffer[16];
1776 UErrorCode err = U_ZERO_ERROR;
1777
1778 len = uset_getItem(set, index, &rangeStart, &rangeEnd, buffer, 16, &err);
1779
1780 if (len == 0) {
1781 int32_t offset = m_rand() % (rangeEnd - rangeStart + 1);
1782 UChar32 ch = rangeStart + offset;
1783 UnicodeString str(ch);
1784
1785 testCase.append(str);
1786 generateAlternative(str, alternate);
1787 } else if (len > 0) {
1788 // should check that len < 16...
1789 UnicodeString str(buffer, len);
1790
1791 testCase.append(str);
1792 generateAlternative(str, alternate);
1793 } else {
1794 // shouldn't happen...
1795 }
1796 }
1797
generateAlternative(const UnicodeString & testCase,UnicodeString & alternate)1798 UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCase, UnicodeString &alternate)
1799 {
1800 // find out shortest string for the longest sequence of ces.
1801 // needs to be refined to use dynamic programming, but will be roughly right
1802 UErrorCode status = U_ZERO_ERROR;
1803 CEList ceList(coll, testCase, status);
1804 UnicodeString alt;
1805 int32_t offset = 0;
1806
1807 if (ceList.size() == 0) {
1808 return alternate.append(testCase);
1809 }
1810
1811 while (offset < ceList.size()) {
1812 int32_t ce = ceList.get(offset);
1813 const StringList *strings = collData->getStringList(ce);
1814
1815 if (strings == NULL) {
1816 return alternate.append(testCase);
1817 }
1818
1819 int32_t stringCount = strings->size();
1820 int32_t tries = 0;
1821
1822 // find random string that generates the same CEList
1823 const CEList *ceList2 = NULL;
1824 const UnicodeString *string = NULL;
1825 UBool matches = FALSE;
1826
1827 do {
1828 int32_t s = m_rand() % stringCount;
1829
1830 if (tries++ > stringCount) {
1831 alternate.append(testCase);
1832 return alternate;
1833 }
1834
1835 string = strings->get(s);
1836 ceList2 = collData->getCEList(string);
1837 matches = ceList.matchesAt(offset, ceList2);
1838
1839 if (! matches) {
1840 collData->freeCEList((CEList *) ceList2);
1841 }
1842 } while (! matches);
1843
1844 alt.append(*string);
1845 offset += ceList2->size();
1846 collData->freeCEList(ceList2);
1847 }
1848
1849 const CEList altCEs(coll, alt, status);
1850
1851 if (ceList.matchesAt(0, &altCEs)) {
1852 return alternate.append(alt);
1853 }
1854
1855 return alternate.append(testCase);
1856 }
1857
generateTestCase(UCollator * coll,Monkey * monkeys[],int32_t monkeyCount,UnicodeString & testCase,UnicodeString & alternate)1858 static void generateTestCase(UCollator *coll, Monkey *monkeys[], int32_t monkeyCount, UnicodeString &testCase, UnicodeString &alternate)
1859 {
1860 int32_t pieces = (m_rand() % 4) + 1;
1861 UErrorCode status = U_ZERO_ERROR;
1862 UBool matches;
1863
1864 do {
1865 testCase.remove();
1866 alternate.remove();
1867 monkeys[0]->append(testCase, alternate);
1868
1869 for(int32_t piece = 0; piece < pieces; piece += 1) {
1870 int32_t monkey = m_rand() % monkeyCount;
1871
1872 monkeys[monkey]->append(testCase, alternate);
1873 }
1874
1875 const CEList ceTest(coll, testCase, status);
1876 const CEList ceAlt(coll, alternate, status);
1877
1878 matches = ceTest.matchesAt(0, &ceAlt);
1879 } while (! matches);
1880 }
1881
1882 //
1883 // Find the next acceptable boundary following the specified starting index
1884 // in the target text being searched.
1885 // TODO: refine what is an acceptable boundary. For the moment,
1886 // choose the next position not within a combining sequence.
1887 //
1888 #if 0
1889 static int32_t nextBoundaryAfter(const UnicodeString &string, int32_t startIndex) {
1890 const UChar *text = string.getBuffer();
1891 int32_t textLen = string.length();
1892
1893 if (startIndex >= textLen) {
1894 return startIndex;
1895 }
1896
1897 UChar32 c;
1898 int32_t i = startIndex;
1899
1900 U16_NEXT(text, i, textLen, c);
1901
1902 // If we are on a control character, stop without looking for combining marks.
1903 // Control characters do not combine.
1904 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1905 if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
1906 return i;
1907 }
1908
1909 // The initial character was not a control, and can thus accept trailing
1910 // combining characters. Advance over however many of them there are.
1911 int32_t indexOfLastCharChecked;
1912
1913 for (;;) {
1914 indexOfLastCharChecked = i;
1915
1916 if (i>=textLen) {
1917 break;
1918 }
1919
1920 U16_NEXT(text, i, textLen, c);
1921 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1922
1923 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
1924 break;
1925 }
1926 }
1927
1928 return indexOfLastCharChecked;
1929 }
1930 #endif
1931
1932 #if 0
1933 static UBool isInCombiningSequence(const UnicodeString &string, int32_t index) {
1934 const UChar *text = string.getBuffer();
1935 int32_t textLen = string.length();
1936
1937 if (index>=textLen || index<=0) {
1938 return FALSE;
1939 }
1940
1941 // If the character at the current index is not a GRAPHEME_EXTEND
1942 // then we can not be within a combining sequence.
1943 UChar32 c;
1944 U16_GET(text, 0, index, textLen, c);
1945 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1946 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
1947 return FALSE;
1948 }
1949
1950 // We are at a combining mark. If the preceding character is anything
1951 // except a CONTROL, CR or LF, we are in a combining sequence.
1952 U16_PREV(text, 0, index, c);
1953 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1954
1955 return !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
1956 }
1957 #endif
1958
simpleSearch(UCollator * coll,const UnicodeString & target,int32_t offset,const UnicodeString & pattern,int32_t & matchStart,int32_t & matchEnd)1959 static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd)
1960 {
1961 UErrorCode status = U_ZERO_ERROR;
1962 OrderList targetOrders(coll, target, offset);
1963 OrderList patternOrders(coll, pattern);
1964 int32_t targetSize = targetOrders.size() - 1;
1965 int32_t patternSize = patternOrders.size() - 1;
1966 UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
1967 target.getBuffer(), target.length(), &status);
1968
1969 if (patternSize == 0) {
1970 // Searching for an empty pattern always fails
1971 matchStart = matchEnd = -1;
1972 ubrk_close(charBreakIterator);
1973 return FALSE;
1974 }
1975
1976 matchStart = matchEnd = -1;
1977
1978 for(int32_t i = 0; i < targetSize; i += 1) {
1979 if (targetOrders.matchesAt(i, patternOrders)) {
1980 int32_t start = targetOrders.getLowOffset(i);
1981 int32_t maxLimit = targetOrders.getLowOffset(i + patternSize);
1982 int32_t minLimit = targetOrders.getLowOffset(i + patternSize - 1);
1983
1984 // if the low and high offsets of the first CE in
1985 // the match are the same, it means that the match
1986 // starts in the middle of an expansion - all but
1987 // the first CE of the expansion will have the offset
1988 // of the following character.
1989 if (start == targetOrders.getHighOffset(i)) {
1990 continue;
1991 }
1992
1993 // Make sure match starts on a grapheme boundary
1994 if (! ubrk_isBoundary(charBreakIterator, start)) {
1995 continue;
1996 }
1997
1998 // If the low and high offsets of the CE after the match
1999 // are the same, it means that the match ends in the middle
2000 // of an expansion sequence.
2001 if (maxLimit == targetOrders.getHighOffset(i + patternSize) &&
2002 targetOrders.getOrder(i + patternSize) != UCOL_NULLORDER) {
2003 continue;
2004 }
2005
2006 int32_t mend = maxLimit;
2007
2008 // Find the first grapheme break after the character index
2009 // of the last CE in the match. If it's after character index
2010 // that's after the last CE in the match, use that index
2011 // as the end of the match.
2012 if (minLimit < maxLimit) {
2013 // When the last CE's low index is same with its high index, the CE is likely
2014 // a part of expansion. In this case, the index is located just after the
2015 // character corresponding to the CEs compared above. If the index is right
2016 // at the break boundary, move the position to the next boundary will result
2017 // incorrect match length when there are ignorable characters exist between
2018 // the position and the next character produces CE(s). See ticket#8482.
2019 if (minLimit == targetOrders.getHighOffset(i + patternSize - 1) && ubrk_isBoundary(charBreakIterator, minLimit)) {
2020 mend = minLimit;
2021 } else {
2022 int32_t nba = ubrk_following(charBreakIterator, minLimit);
2023
2024 if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) {
2025 mend = nba;
2026 }
2027 }
2028 }
2029
2030 if (mend > maxLimit) {
2031 continue;
2032 }
2033
2034 if (! ubrk_isBoundary(charBreakIterator, mend)) {
2035 continue;
2036 }
2037
2038 matchStart = start;
2039 matchEnd = mend;
2040
2041 ubrk_close(charBreakIterator);
2042 return TRUE;
2043 }
2044 }
2045
2046 ubrk_close(charBreakIterator);
2047 return FALSE;
2048 }
2049
2050 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)2051 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) {
2052 int32_t val = defaultVal;
2053
2054 name.append(" *= *(-?\\d+)");
2055
2056 UErrorCode status = U_ZERO_ERROR;
2057 RegexMatcher m(name, params, 0, status);
2058
2059 if (m.find()) {
2060 // The param exists. Convert the string to an int.
2061 char valString[100];
2062 int32_t paramLength = m.end(1, status) - m.start(1, status);
2063
2064 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
2065 paramLength = (int32_t)(sizeof(valString)-2);
2066 }
2067
2068 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
2069 val = strtol(valString, NULL, 10);
2070
2071 // Delete this parameter from the params string.
2072 m.reset();
2073 params = m.replaceFirst("", status);
2074 }
2075
2076 //U_ASSERT(U_SUCCESS(status));
2077 if (! U_SUCCESS(status)) {
2078 val = defaultVal;
2079 }
2080
2081 return val;
2082 }
2083 #endif
2084
2085 #if !UCONFIG_NO_COLLATION
monkeyTestCase(UCollator * coll,const UnicodeString & testCase,const UnicodeString & pattern,const UnicodeString & altPattern,const char * name,const char * strength,uint32_t seed)2086 int32_t SSearchTest::monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
2087 const char *name, const char *strength, uint32_t seed)
2088 {
2089 UErrorCode status = U_ZERO_ERROR;
2090 int32_t actualStart = -1, actualEnd = -1;
2091 //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length();
2092 int32_t expectedStart = -1, expectedEnd = -1;
2093 int32_t notFoundCount = 0;
2094 LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
2095 testCase.getBuffer(), testCase.length(),
2096 coll,
2097 NULL, // the break iterator
2098 &status));
2099
2100 // **** TODO: find *all* matches, not just first one ****
2101 simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd);
2102
2103 usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status);
2104
2105 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
2106 errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
2107 " strength=%s seed=%d",
2108 name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
2109 }
2110
2111 if (expectedStart == -1 && actualStart == -1) {
2112 notFoundCount += 1;
2113 }
2114
2115 // **** TODO: find *all* matches, not just first one ****
2116 simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd);
2117
2118 usearch_setPattern(uss.getAlias(), altPattern.getBuffer(), altPattern.length(), &status);
2119
2120 usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status);
2121
2122 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
2123 errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
2124 " strength=%s seed=%d",
2125 name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
2126 }
2127
2128 if (expectedStart == -1 && actualStart == -1) {
2129 notFoundCount += 1;
2130 }
2131
2132 return notFoundCount;
2133 }
2134
hexForUnicodeString(const UnicodeString & ustr,char * cbuf,int32_t cbuflen)2135 static void hexForUnicodeString(const UnicodeString &ustr, char * cbuf, int32_t cbuflen)
2136 {
2137 int32_t ustri, ustrlen = ustr.length();
2138
2139 for (ustri = 0; ustri < ustrlen; ++ustri) {
2140 if (cbuflen >= 9 /* format width for single code unit(5) + terminating ellipsis(3) + null(1) */) {
2141 int len = sprintf(cbuf, " %04X", ustr.charAt(ustri));
2142 cbuflen -= len;
2143 cbuf += len;
2144 } else {
2145 if (cbuflen >= 4 /* terminating ellipsis(3) + null(1) */) {
2146 sprintf(cbuf, "...");
2147 } else if (cbuflen >= 1) {
2148 cbuf = 0;
2149 }
2150 break;
2151 }
2152 }
2153 }
2154
bmMonkeyTestCase(UCollator * coll,const UnicodeString & testCase,const UnicodeString & pattern,const UnicodeString & altPattern,BoyerMooreSearch * bms,BoyerMooreSearch * abms,const char * name,const char * strength,uint32_t seed)2155 int32_t SSearchTest::bmMonkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
2156 BoyerMooreSearch *bms, BoyerMooreSearch *abms,
2157 const char *name, const char *strength, uint32_t seed)
2158 {
2159 UErrorCode status = U_ZERO_ERROR;
2160 int32_t actualStart = -1, actualEnd = -1;
2161 //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length();
2162 int32_t expectedStart = -1, expectedEnd = -1;
2163 int32_t notFoundCount = 0;
2164 char hexbuf[128];
2165
2166 // **** TODO: find *all* matches, not just first one ****
2167 simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd);
2168
2169 bms->setTargetString(&testCase, status);
2170 bms->search(0, actualStart, actualEnd);
2171
2172 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
2173 hexForUnicodeString(pattern, hexbuf, sizeof(hexbuf));
2174 errln("Boyer-Moore Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
2175 " strength=%s seed=%d <pattern>: %s",
2176 name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed, hexbuf);
2177 }
2178
2179 if (expectedStart == -1 && actualStart == -1) {
2180 notFoundCount += 1;
2181 }
2182
2183 // **** TODO: find *all* matches, not just first one ****
2184 simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd);
2185
2186 abms->setTargetString(&testCase, status);
2187 abms->search(0, actualStart, actualEnd);
2188
2189 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
2190 hexForUnicodeString(altPattern, hexbuf, sizeof(hexbuf));
2191 errln("Boyer-Moore Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
2192 " strength=%s seed=%d <pattern>: %s",
2193 name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed, hexbuf);
2194 }
2195
2196 if (expectedStart == -1 && actualStart == -1) {
2197 notFoundCount += 1;
2198 }
2199
2200
2201 return notFoundCount;
2202 }
2203 #endif
2204
monkeyTest(char * params)2205 void SSearchTest::monkeyTest(char *params)
2206 {
2207 // ook!
2208 UErrorCode status = U_ZERO_ERROR;
2209 //UCollator *coll = ucol_open(NULL, &status);
2210 UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status);
2211
2212 if (U_FAILURE(status)) {
2213 errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status));
2214 return;
2215 }
2216
2217 CollData *monkeyData = CollData::open(coll, status);
2218
2219 USet *expansions = uset_openEmpty();
2220 USet *contractions = uset_openEmpty();
2221
2222 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
2223
2224 U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
2225 U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
2226 USet *letters = uset_openPattern(letter_pattern, 39, &status);
2227 SetMonkey letterMonkey(letters);
2228 StringSetMonkey contractionMonkey(contractions, coll, monkeyData);
2229 StringSetMonkey expansionMonkey(expansions, coll, monkeyData);
2230 UnicodeString testCase;
2231 UnicodeString alternate;
2232 UnicodeString pattern, altPattern;
2233 UnicodeString prefix, altPrefix;
2234 UnicodeString suffix, altSuffix;
2235
2236 Monkey *monkeys[] = {
2237 &letterMonkey,
2238 &contractionMonkey,
2239 &expansionMonkey,
2240 &contractionMonkey,
2241 &expansionMonkey,
2242 &contractionMonkey,
2243 &expansionMonkey,
2244 &contractionMonkey,
2245 &expansionMonkey};
2246 int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]);
2247 // int32_t nonMatchCount = 0;
2248
2249 UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY};
2250 const char *strengthNames[] = {"primary", "secondary", "tertiary"};
2251 int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]);
2252 int32_t loopCount = quick? 1000 : 10000;
2253 int32_t firstStrength = 0;
2254 int32_t lastStrength = strengthCount - 1; //*/ 0;
2255
2256 if (params != NULL) {
2257 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2258 UnicodeString p(params);
2259
2260 loopCount = getIntParam("loop", p, loopCount);
2261 m_seed = getIntParam("seed", p, m_seed);
2262
2263 RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status);
2264 if (m.find()) {
2265 UnicodeString breakType = m.group(1, status);
2266
2267 for (int32_t s = 0; s < strengthCount; s += 1) {
2268 if (breakType == strengthNames[s]) {
2269 firstStrength = lastStrength = s;
2270 break;
2271 }
2272 }
2273
2274 m.reset();
2275 p = m.replaceFirst("", status);
2276 }
2277
2278 if (RegexMatcher("\\S", p, 0, status).find()) {
2279 // Each option is stripped out of the option string as it is processed.
2280 // All options have been checked. The option string should have been completely emptied..
2281 char buf[100];
2282 p.extract(buf, sizeof(buf), NULL, status);
2283 buf[sizeof(buf)-1] = 0;
2284 errln("Unrecognized or extra parameter: %s\n", buf);
2285 return;
2286 }
2287 #else
2288 infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
2289 #endif
2290 }
2291
2292 for(int32_t s = firstStrength; s <= lastStrength; s += 1) {
2293 int32_t notFoundCount = 0;
2294
2295 logln("Setting strength to %s.", strengthNames[s]);
2296 ucol_setStrength(coll, strengths[s]);
2297
2298 // TODO: try alternate prefix and suffix too?
2299 // TODO: alterntaes are only equal at primary strength. Is this OK?
2300 for(int32_t t = 0; t < loopCount; t += 1) {
2301 uint32_t seed = m_seed;
2302 // int32_t nmc = 0;
2303
2304 generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern);
2305 generateTestCase(coll, monkeys, monkeyCount, prefix, altPrefix);
2306 generateTestCase(coll, monkeys, monkeyCount, suffix, altSuffix);
2307
2308 // pattern
2309 notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern, "pattern", strengthNames[s], seed);
2310
2311 testCase.remove();
2312 testCase.append(prefix);
2313 testCase.append(/*alt*/pattern);
2314
2315 // prefix + pattern
2316 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern", strengthNames[s], seed);
2317
2318 testCase.append(suffix);
2319
2320 // prefix + pattern + suffix
2321 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern + suffix", strengthNames[s], seed);
2322
2323 testCase.remove();
2324 testCase.append(pattern);
2325 testCase.append(suffix);
2326
2327 // pattern + suffix
2328 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed);
2329 }
2330
2331 logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount);
2332 }
2333
2334 uset_close(contractions);
2335 uset_close(expansions);
2336 uset_close(letters);
2337
2338 CollData::close(monkeyData);
2339
2340 ucol_close(coll);
2341 }
2342
bmMonkeyTest(char * params)2343 void SSearchTest::bmMonkeyTest(char *params)
2344 {
2345 static const UVersionInfo icu49 = { 4, 9, 0, 0 }; // for timebomb
2346 static const UChar skipChars[] = { 0x0E40, 0x0E41, 0x0E42, 0x0E43, 0x0E44, 0xAAB5, 0xAAB6, 0xAAB9, 0xAABB, 0xAABC, 0 }; // for timebomb
2347 // ook!
2348 UErrorCode status = U_ZERO_ERROR;
2349 UCollator *coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status);
2350
2351 if (U_FAILURE(status)) {
2352 errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status));
2353 return;
2354 }
2355
2356 CollData *monkeyData = CollData::open(coll, status);
2357
2358 USet *expansions = uset_openEmpty();
2359 USet *contractions = uset_openEmpty();
2360
2361 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
2362
2363 U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
2364 U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
2365 USet *letters = uset_openPattern(letter_pattern, 39, &status);
2366 SetMonkey letterMonkey(letters);
2367 StringSetMonkey contractionMonkey(contractions, coll, monkeyData);
2368 StringSetMonkey expansionMonkey(expansions, coll, monkeyData);
2369 UnicodeString testCase;
2370 UnicodeString alternate;
2371 UnicodeString pattern, altPattern;
2372 UnicodeString prefix, altPrefix;
2373 UnicodeString suffix, altSuffix;
2374
2375 Monkey *monkeys[] = {
2376 &letterMonkey,
2377 &contractionMonkey,
2378 &expansionMonkey,
2379 &contractionMonkey,
2380 &expansionMonkey,
2381 &contractionMonkey,
2382 &expansionMonkey,
2383 &contractionMonkey,
2384 &expansionMonkey};
2385 int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]);
2386 // int32_t nonMatchCount = 0;
2387
2388 UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY};
2389 const char *strengthNames[] = {"primary", "secondary", "tertiary"};
2390 int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]);
2391 int32_t loopCount = quick? 1000 : 10000;
2392 int32_t firstStrength = 0;
2393 int32_t lastStrength = strengthCount - 1; //*/ 0;
2394
2395 if (params != NULL) {
2396 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2397 UnicodeString p(params);
2398
2399 loopCount = getIntParam("loop", p, loopCount);
2400 m_seed = getIntParam("seed", p, m_seed);
2401
2402 RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status);
2403 if (m.find()) {
2404 UnicodeString breakType = m.group(1, status);
2405
2406 for (int32_t s = 0; s < strengthCount; s += 1) {
2407 if (breakType == strengthNames[s]) {
2408 firstStrength = lastStrength = s;
2409 break;
2410 }
2411 }
2412
2413 m.reset();
2414 p = m.replaceFirst("", status);
2415 }
2416
2417 if (RegexMatcher("\\S", p, 0, status).find()) {
2418 // Each option is stripped out of the option string as it is processed.
2419 // All options have been checked. The option string should have been completely emptied..
2420 char buf[100];
2421 p.extract(buf, sizeof(buf), NULL, status);
2422 buf[sizeof(buf)-1] = 0;
2423 errln("Unrecognized or extra parameter: %s\n", buf);
2424 return;
2425 }
2426 #else
2427 infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
2428 #endif
2429 }
2430
2431 for(int32_t s = firstStrength; s <= lastStrength; s += 1) {
2432 int32_t notFoundCount = 0;
2433
2434 logln("Setting strength to %s.", strengthNames[s]);
2435 ucol_setStrength(coll, strengths[s]);
2436
2437 CollData *data = CollData::open(coll, status);
2438
2439 UnicodeString skipString(skipChars); // for timebomb
2440 UnicodeSet* skipSet = UnicodeSet::createFromAll(skipString); // for timebomb
2441 // TODO: try alternate prefix and suffix too?
2442 // TODO: alterntaes are only equal at primary strength. Is this OK?
2443 for(int32_t t = 0; t < loopCount; t += 1) {
2444 uint32_t seed = m_seed;
2445 // int32_t nmc = 0;
2446
2447 generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern);
2448 generateTestCase(coll, monkeys, monkeyCount, prefix, altPrefix);
2449 generateTestCase(coll, monkeys, monkeyCount, suffix, altSuffix);
2450
2451 if (!isICUVersionAtLeast(icu49) && skipSet->containsSome(pattern)) {
2452 continue; // timebomb until ticket #8080 is resolved
2453 }
2454
2455 BoyerMooreSearch pat(data, pattern, NULL, status);
2456 BoyerMooreSearch alt(data, altPattern, NULL, status);
2457
2458 // **** need a better way to deal with this ****
2459 #if 0
2460 if (pat.empty() ||
2461 alt.empty()) {
2462 continue;
2463 }
2464 #endif
2465
2466 // pattern
2467 notFoundCount += bmMonkeyTestCase(coll, pattern, pattern, altPattern, &pat, &alt, "pattern", strengthNames[s], seed);
2468
2469 testCase.remove();
2470 testCase.append(prefix);
2471 testCase.append(/*alt*/pattern);
2472
2473 // prefix + pattern
2474 notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "prefix + pattern", strengthNames[s], seed);
2475
2476 testCase.append(suffix);
2477
2478 // prefix + pattern + suffix
2479 notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "prefix + pattern + suffix", strengthNames[s], seed);
2480
2481 testCase.remove();
2482 testCase.append(pattern);
2483 testCase.append(suffix);
2484
2485 // pattern + suffix
2486 notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "pattern + suffix", strengthNames[s], seed);
2487 }
2488 delete skipSet; // for timebomb
2489
2490 CollData::close(data);
2491
2492 logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount);
2493 }
2494
2495 uset_close(contractions);
2496 uset_close(expansions);
2497 uset_close(letters);
2498
2499 CollData::close(monkeyData);
2500
2501 ucol_close(coll);
2502 }
2503
stringListTest()2504 void SSearchTest::stringListTest(){
2505 UErrorCode status = U_ZERO_ERROR;
2506 StringList *sl = new StringList(status);
2507 if(U_FAILURE(status)){
2508 errln("ERROR: stringListTest: Could not start StringList");
2509 }
2510
2511 const UChar chars[] = {
2512 0x0000
2513 };
2514 sl->add(chars, (int32_t) 0, status);
2515 if(U_FAILURE(status)){
2516 errln("ERROR: stringListTest: StringList::add");
2517 }
2518
2519 if(sl->getDynamicClassID() != StringList::getStaticClassID()){
2520 errln("ERROR: stringListTest: getDynamicClassID and getStaticClassID does not match");
2521 }
2522 delete sl;
2523 }
2524
2525 #endif
2526
2527 #endif
2528