1 /*
2 **********************************************************************
3 * Copyright (C) 2005-2009, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8
9 #include "unicode/utypes.h"
10
11 #if !UCONFIG_NO_COLLATION
12
13 #include "unicode/unistr.h"
14 #include "unicode/putil.h"
15 #include "unicode/usearch.h"
16
17 #include "cmemory.h"
18 #include "unicode/coll.h"
19 #include "unicode/tblcoll.h"
20 #include "unicode/coleitr.h"
21 #include "unicode/ucoleitr.h"
22
23 #include "unicode/regex.h" // TODO: make conditional on regexp being built.
24
25 #include "unicode/uniset.h"
26 #include "unicode/uset.h"
27 #include "unicode/ustring.h"
28 #include "hash.h"
29 #include "uhash.h"
30 #include "ucol_imp.h"
31
32 #include "intltest.h"
33 #include "ssearch.h"
34
35 #include "unicode/colldata.h"
36 #include "unicode/bmsearch.h"
37 #include "unicode/bms.h"
38
39 #include "xmlparser.h"
40 #include "ucbuf.h"
41
42 #include <stdlib.h>
43 #include <string.h>
44 #include <stdio.h>
45
46 char testId[100];
47
48 #define TEST_ASSERT(x) {if (!(x)) { \
49 errln("Failure in file %s, line %d, test ID = \"%s\"", __FILE__, __LINE__, testId);}}
50
51 #define TEST_ASSERT_M(x, m) {if (!(x)) { \
52 errln("Failure in file %s, line %d. \"%s\"", __FILE__, __LINE__, m);return;}}
53
54 #define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \
55 dataerrln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \
56 __FILE__, __LINE__, testId, u_errorName(errcode));}}
57
58 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
59 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
60 #define DELETE_ARRAY(array) uprv_free((void *) (array))
61
62 //---------------------------------------------------------------------------
63 //
64 // Test class boilerplate
65 //
66 //---------------------------------------------------------------------------
SSearchTest()67 SSearchTest::SSearchTest()
68 {
69 }
70
~SSearchTest()71 SSearchTest::~SSearchTest()
72 {
73 }
74
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)75 void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char *params )
76 {
77 if (exec) logln("TestSuite SSearchTest: ");
78 switch (index) {
79 #if !UCONFIG_NO_BREAK_ITERATION
80 case 0: name = "searchTest";
81 if (exec) searchTest();
82 break;
83
84 case 1: name = "offsetTest";
85 if (exec) offsetTest();
86 break;
87
88 case 2: name = "monkeyTest";
89 if (exec) monkeyTest(params);
90 break;
91
92 case 3: name = "bmMonkeyTest";
93 if (exec) bmMonkeyTest(params);
94 break;
95
96 case 4: name = "boyerMooreTest";
97 if (exec) boyerMooreTest();
98 break;
99
100 case 5: name = "goodSuffixTest";
101 if (exec) goodSuffixTest();
102 break;
103
104 case 6: name = "searchTime";
105 if (exec) searchTime();
106 break;
107
108 case 7: name = "bmsTest";
109 if (exec) bmsTest();
110 break;
111
112 case 8: name = "bmSearchTest";
113 if (exec) bmSearchTest();
114 break;
115
116 case 9: name = "udhrTest";
117 if (exec) udhrTest();
118 break;
119 #endif
120 default: name = "";
121 break; //needed to end loop
122 }
123 }
124
125
126 #if !UCONFIG_NO_BREAK_ITERATION
127
128 #define PATH_BUFFER_SIZE 2048
getPath(char buffer[2048],const char * filename)129 const char *SSearchTest::getPath(char buffer[2048], const char *filename) {
130 UErrorCode status = U_ZERO_ERROR;
131 const char *testDataDirectory = IntlTest::getSourceTestData(status);
132
133 if (U_FAILURE(status) || strlen(testDataDirectory) + strlen(filename) + 1 >= PATH_BUFFER_SIZE) {
134 errln("ERROR: getPath() failed - %s", u_errorName(status));
135 return NULL;
136 }
137
138 strcpy(buffer, testDataDirectory);
139 strcat(buffer, filename);
140 return buffer;
141 }
142
143
searchTest()144 void SSearchTest::searchTest()
145 {
146 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
147 UErrorCode status = U_ZERO_ERROR;
148 char path[PATH_BUFFER_SIZE];
149 const char *testFilePath = getPath(path, "ssearch.xml");
150
151 if (testFilePath == NULL) {
152 return; /* Couldn't get path: error message already output. */
153 }
154
155 UXMLParser *parser = UXMLParser::createParser(status);
156 TEST_ASSERT_SUCCESS(status);
157 UXMLElement *root = parser->parseFile(testFilePath, status);
158 TEST_ASSERT_SUCCESS(status);
159 if (U_FAILURE(status)) {
160 return;
161 }
162
163 const UnicodeString *debugTestCase = root->getAttribute("debug");
164 if (debugTestCase != NULL) {
165 // setenv("USEARCH_DEBUG", "1", 1);
166 }
167
168
169 const UXMLElement *testCase;
170 int32_t tc = 0;
171
172 while((testCase = root->nextChildElement(tc)) != NULL) {
173
174 if (testCase->getTagName().compare("test-case") != 0) {
175 errln("ssearch, unrecognized XML Element in test file");
176 continue;
177 }
178 const UnicodeString *id = testCase->getAttribute("id");
179 *testId = 0;
180 if (id != NULL) {
181 id->extract(0, id->length(), testId, sizeof(testId), US_INV);
182 }
183
184 // If debugging test case has been specified and this is not it, skip to next.
185 if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) {
186 continue;
187 }
188 //
189 // Get the requested collation strength.
190 // Default is tertiary if the XML attribute is missing from the test case.
191 //
192 const UnicodeString *strength = testCase->getAttribute("strength");
193 UColAttributeValue collatorStrength;
194 if (strength==NULL) { collatorStrength = UCOL_TERTIARY;}
195 else if (*strength=="PRIMARY") { collatorStrength = UCOL_PRIMARY;}
196 else if (*strength=="SECONDARY") { collatorStrength = UCOL_SECONDARY;}
197 else if (*strength=="TERTIARY") { collatorStrength = UCOL_TERTIARY;}
198 else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;}
199 else if (*strength=="IDENTICAL") { collatorStrength = UCOL_IDENTICAL;}
200 else {
201 // Bogus value supplied for strength. Shouldn't happen, even from
202 // typos, if the XML source has been validated.
203 // This assert is a little deceiving in that strength can be
204 // any of the allowed values, not just TERTIARY, but it will
205 // do the job of getting the error output.
206 TEST_ASSERT(*strength=="TERTIARY")
207 }
208
209 //
210 // Get the collator normalization flag. Default is UCOL_OFF.
211 //
212 UColAttributeValue normalize = UCOL_OFF;
213 const UnicodeString *norm = testCase->getAttribute("norm");
214 TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF");
215 if (norm!=NULL && *norm=="ON") {
216 normalize = UCOL_ON;
217 }
218
219 //
220 // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE.
221 //
222 UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE;
223 const UnicodeString *alt = testCase->getAttribute("alternate_handling");
224 TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE");
225 if (alt != NULL && *alt == "SHIFTED") {
226 alternateHandling = UCOL_SHIFTED;
227 }
228
229 const UnicodeString defLocale("en");
230 char clocale[100];
231 const UnicodeString *locale = testCase->getAttribute("locale");
232 if (locale == NULL || locale->length()==0) {
233 locale = &defLocale;
234 };
235 locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL);
236
237
238 UnicodeString text;
239 UnicodeString target;
240 UnicodeString pattern;
241 int32_t expectedMatchStart = -1;
242 int32_t expectedMatchLimit = -1;
243 const UXMLElement *n;
244 int32_t nodeCount = 0;
245
246 n = testCase->getChildElement("pattern");
247 TEST_ASSERT(n != NULL);
248 if (n==NULL) {
249 continue;
250 }
251 text = n->getText(FALSE);
252 text = text.unescape();
253 pattern.append(text);
254 nodeCount++;
255
256 n = testCase->getChildElement("pre");
257 if (n!=NULL) {
258 text = n->getText(FALSE);
259 text = text.unescape();
260 target.append(text);
261 nodeCount++;
262 }
263
264 n = testCase->getChildElement("m");
265 if (n!=NULL) {
266 expectedMatchStart = target.length();
267 text = n->getText(FALSE);
268 text = text.unescape();
269 target.append(text);
270 expectedMatchLimit = target.length();
271 nodeCount++;
272 }
273
274 n = testCase->getChildElement("post");
275 if (n!=NULL) {
276 text = n->getText(FALSE);
277 text = text.unescape();
278 target.append(text);
279 nodeCount++;
280 }
281
282 // Check that there weren't extra things in the XML
283 TEST_ASSERT(nodeCount == testCase->countChildren());
284
285 // Open a collator and StringSearch based on the parameters
286 // obtained from the XML.
287 //
288 status = U_ZERO_ERROR;
289 UCollator *collator = ucol_open(clocale, &status);
290 ucol_setStrength(collator, collatorStrength);
291 ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, normalize, &status);
292 ucol_setAttribute(collator, UCOL_ALTERNATE_HANDLING, alternateHandling, &status);
293 UStringSearch *uss = usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
294 target.getBuffer(), target.length(),
295 collator,
296 NULL, // the break iterator
297 &status);
298
299 TEST_ASSERT_SUCCESS(status);
300 if (U_FAILURE(status)) {
301 usearch_close(uss);
302 ucol_close(collator);
303 continue;
304 }
305
306 int32_t foundStart = 0;
307 int32_t foundLimit = 0;
308 UBool foundMatch;
309
310 //
311 // Do the search, check the match result against the expected results.
312 //
313 foundMatch= usearch_search(uss, 0, &foundStart, &foundLimit, &status);
314 TEST_ASSERT_SUCCESS(status);
315 if (foundMatch && expectedMatchStart<0 ||
316 foundStart != expectedMatchStart ||
317 foundLimit != expectedMatchLimit) {
318 TEST_ASSERT(FALSE); // ouput generic error position
319 infoln("Found, expected match start = %d, %d \n"
320 "Found, expected match limit = %d, %d",
321 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
322 }
323
324 // In case there are other matches...
325 // (should we only do this if the test case passed?)
326 while (foundMatch) {
327 expectedMatchStart = foundStart;
328 expectedMatchLimit = foundLimit;
329
330 foundMatch = usearch_search(uss, foundLimit, &foundStart, &foundLimit, &status);
331 }
332
333 usearch_close(uss);
334
335 uss = usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
336 target.getBuffer(), target.length(),
337 collator,
338 NULL,
339 &status);
340
341 //
342 // Do the backwards search, check the match result against the expected results.
343 //
344 foundMatch= usearch_searchBackwards(uss, target.length(), &foundStart, &foundLimit, &status);
345 TEST_ASSERT_SUCCESS(status);
346 if (foundMatch && expectedMatchStart<0 ||
347 foundStart != expectedMatchStart ||
348 foundLimit != expectedMatchLimit) {
349 TEST_ASSERT(FALSE); // ouput generic error position
350 infoln("Found, expected backwards match start = %d, %d \n"
351 "Found, expected backwards match limit = %d, %d",
352 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
353 }
354
355 usearch_close(uss);
356 ucol_close(collator);
357 }
358
359 delete root;
360 delete parser;
361 #endif
362 }
363
364 struct UdhrTestCase
365 {
366 char *locale;
367 char *file;
368 };
369
udhrTest()370 void SSearchTest::udhrTest()
371 {
372 UErrorCode status = U_ZERO_ERROR;
373 char path[PATH_BUFFER_SIZE];
374 const char *udhrPath = getPath(path, "udhr");
375
376 if (udhrPath == NULL) {
377 // couldn't get path: error message already output...
378 return;
379 }
380
381 UdhrTestCase testCases[] = {
382 {"en", "udhr_eng.txt"},
383 {"de", "udhr_deu_1996.txt"},
384 {"fr", "udhr_fra.txt"},
385 {"ru", "udhr_rus.txt"},
386 {"th", "udhr_tha.txt"},
387 {"ja", "udhr_jpn.txt"},
388 {"ko", "udhr_kor.txt"},
389 {"zh", "udhr_cmn_hans.txt"},
390 {"zh_Hant", "udhr_cmn_hant.txt"}
391 };
392
393 int32_t testCount = ARRAY_SIZE(testCases);
394
395 for (int32_t t = 0; t < testCount; t += 1) {
396 int32_t len = 0;
397 char *resolvedFileName = NULL;
398 const char *encoding = NULL;
399 UCHARBUF *ucharBuf = NULL;
400
401 ucbuf_resolveFileName(udhrPath, testCases[t].file, NULL, &len, &status);
402 resolvedFileName = NEW_ARRAY(char, len);
403
404 if(resolvedFileName == NULL){
405 continue;
406 }
407
408 if(status == U_BUFFER_OVERFLOW_ERROR){
409 status = U_ZERO_ERROR;
410 }
411
412 ucbuf_resolveFileName(udhrPath, testCases[t].file, resolvedFileName, &len, &status);
413 ucharBuf = ucbuf_open(resolvedFileName, &encoding, TRUE, FALSE, &status);
414
415 DELETE_ARRAY(resolvedFileName);
416
417 if(U_FAILURE(status)){
418 infoln("Could not open the input file %s. Test skipped\n", testCases[t].file);
419 continue;
420 }
421
422 int32_t targetLen = 0;
423 const UChar *target = ucbuf_getBuffer(ucharBuf, &targetLen, &status);
424
425 /* The first line of the file contains the pattern */
426 int32_t start = 0, end = 0, plen = 0;
427
428 for(end = start; ; end += 1) {
429 UChar ch = target[end];
430
431 if (ch == 0x000A || ch == 0x000D || ch == 0x2028) {
432 break;
433 }
434 }
435
436 plen = end - start;
437
438 UChar *pattern = NEW_ARRAY(UChar, plen);
439 for (int32_t i = 0; i < plen; i += 1) {
440 pattern[i] = target[start++];
441 }
442
443 int32_t offset = 0;
444 UCollator *coll = ucol_open(testCases[t].locale, &status);
445 UCD *ucd = NULL;
446 BMS *bms = NULL;
447
448 if (U_FAILURE(status)) {
449 errln("Could not open collator for %s", testCases[t].locale);
450 goto delete_collator;
451 }
452
453 ucd = ucd_open(coll, &status);
454
455 if (U_FAILURE(status)) {
456 errln("Could not open CollData object for %s", testCases[t].locale);
457 goto delete_ucd;
458 }
459
460 bms = bms_open(ucd, pattern, plen, target, targetLen, &status);
461
462 if (U_FAILURE(status)) {
463 errln("Could not open search object for %s", testCases[t].locale);
464 goto delete_bms;
465 }
466
467 start = end = -1;
468 while (bms_search(bms, offset, &start, &end)) {
469 offset = end;
470 }
471
472 if (offset == 0) {
473 errln("Could not find pattern - locale: %s, file: %s ", testCases[t].locale, testCases[t].file);
474 }
475
476 delete_bms:
477 bms_close(bms);
478
479 delete_ucd:
480 ucd_close(ucd);
481
482 delete_collator:
483 ucol_close(coll);
484
485 DELETE_ARRAY(pattern);
486 ucbuf_close(ucharBuf);
487 }
488
489 ucd_flushCache();
490 }
491
bmSearchTest()492 void SSearchTest::bmSearchTest()
493 {
494 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
495 UErrorCode status = U_ZERO_ERROR;
496 char path[PATH_BUFFER_SIZE];
497 const char *testFilePath = getPath(path, "ssearch.xml");
498
499 if (testFilePath == NULL) {
500 return; /* Couldn't get path: error message already output. */
501 }
502
503 UXMLParser *parser = UXMLParser::createParser(status);
504 TEST_ASSERT_SUCCESS(status);
505 UXMLElement *root = parser->parseFile(testFilePath, status);
506 TEST_ASSERT_SUCCESS(status);
507 if (U_FAILURE(status)) {
508 return;
509 }
510
511 const UnicodeString *debugTestCase = root->getAttribute("debug");
512 if (debugTestCase != NULL) {
513 // setenv("USEARCH_DEBUG", "1", 1);
514 }
515
516
517 const UXMLElement *testCase;
518 int32_t tc = 0;
519
520 while((testCase = root->nextChildElement(tc)) != NULL) {
521
522 if (testCase->getTagName().compare("test-case") != 0) {
523 errln("ssearch, unrecognized XML Element in test file");
524 continue;
525 }
526 const UnicodeString *id = testCase->getAttribute("id");
527 *testId = 0;
528 if (id != NULL) {
529 id->extract(0, id->length(), testId, sizeof(testId), US_INV);
530 }
531
532 // If debugging test case has been specified and this is not it, skip to next.
533 if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) {
534 continue;
535 }
536 //
537 // Get the requested collation strength.
538 // Default is tertiary if the XML attribute is missing from the test case.
539 //
540 const UnicodeString *strength = testCase->getAttribute("strength");
541 UColAttributeValue collatorStrength;
542 if (strength==NULL) { collatorStrength = UCOL_TERTIARY;}
543 else if (*strength=="PRIMARY") { collatorStrength = UCOL_PRIMARY;}
544 else if (*strength=="SECONDARY") { collatorStrength = UCOL_SECONDARY;}
545 else if (*strength=="TERTIARY") { collatorStrength = UCOL_TERTIARY;}
546 else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;}
547 else if (*strength=="IDENTICAL") { collatorStrength = UCOL_IDENTICAL;}
548 else {
549 // Bogus value supplied for strength. Shouldn't happen, even from
550 // typos, if the XML source has been validated.
551 // This assert is a little deceiving in that strength can be
552 // any of the allowed values, not just TERTIARY, but it will
553 // do the job of getting the error output.
554 TEST_ASSERT(*strength=="TERTIARY")
555 }
556
557 //
558 // Get the collator normalization flag. Default is UCOL_OFF.
559 //
560 UColAttributeValue normalize = UCOL_OFF;
561 const UnicodeString *norm = testCase->getAttribute("norm");
562 TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF");
563 if (norm!=NULL && *norm=="ON") {
564 normalize = UCOL_ON;
565 }
566
567 //
568 // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE.
569 //
570 UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE;
571 const UnicodeString *alt = testCase->getAttribute("alternate_handling");
572 TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE");
573 if (alt != NULL && *alt == "SHIFTED") {
574 alternateHandling = UCOL_SHIFTED;
575 }
576
577 const UnicodeString defLocale("en");
578 char clocale[100];
579 const UnicodeString *locale = testCase->getAttribute("locale");
580 if (locale == NULL || locale->length()==0) {
581 locale = &defLocale;
582 };
583 locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL);
584
585
586 UnicodeString text;
587 UnicodeString target;
588 UnicodeString pattern;
589 int32_t expectedMatchStart = -1;
590 int32_t expectedMatchLimit = -1;
591 const UXMLElement *n;
592 int32_t nodeCount = 0;
593
594 n = testCase->getChildElement("pattern");
595 TEST_ASSERT(n != NULL);
596 if (n==NULL) {
597 continue;
598 }
599 text = n->getText(FALSE);
600 text = text.unescape();
601 pattern.append(text);
602 nodeCount++;
603
604 n = testCase->getChildElement("pre");
605 if (n!=NULL) {
606 text = n->getText(FALSE);
607 text = text.unescape();
608 target.append(text);
609 nodeCount++;
610 }
611
612 n = testCase->getChildElement("m");
613 if (n!=NULL) {
614 expectedMatchStart = target.length();
615 text = n->getText(FALSE);
616 text = text.unescape();
617 target.append(text);
618 expectedMatchLimit = target.length();
619 nodeCount++;
620 }
621
622 n = testCase->getChildElement("post");
623 if (n!=NULL) {
624 text = n->getText(FALSE);
625 text = text.unescape();
626 target.append(text);
627 nodeCount++;
628 }
629
630 // Check that there weren't extra things in the XML
631 TEST_ASSERT(nodeCount == testCase->countChildren());
632
633 // Open a collator and StringSearch based on the parameters
634 // obtained from the XML.
635 //
636 status = U_ZERO_ERROR;
637 UCollator *collator = ucol_open(clocale, &status);
638 ucol_setStrength(collator, collatorStrength);
639 ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, normalize, &status);
640 ucol_setAttribute(collator, UCOL_ALTERNATE_HANDLING, alternateHandling, &status);
641 UCD *ucd = ucd_open(collator, &status);
642 BMS *bms = bms_open(ucd, pattern.getBuffer(), pattern.length(), target.getBuffer(), target.length(), &status);
643
644 TEST_ASSERT_SUCCESS(status);
645 if (U_FAILURE(status)) {
646 bms_close(bms);
647 ucd_close(ucd);
648 ucol_close(collator);
649 continue;
650 }
651
652 int32_t foundStart = 0;
653 int32_t foundLimit = 0;
654 UBool foundMatch;
655
656 //
657 // Do the search, check the match result against the expected results.
658 //
659 foundMatch = bms_search(bms, 0, &foundStart, &foundLimit);
660 //TEST_ASSERT_SUCCESS(status);
661 if (foundMatch && expectedMatchStart < 0 ||
662 foundStart != expectedMatchStart ||
663 foundLimit != expectedMatchLimit) {
664 TEST_ASSERT(FALSE); // ouput generic error position
665 infoln("Found, expected match start = %d, %d \n"
666 "Found, expected match limit = %d, %d",
667 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
668 }
669
670 bms_close(bms);
671 ucd_close(ucd);
672 ucol_close(collator);
673 }
674
675 ucd_flushCache();
676 delete root;
677 delete parser;
678 #endif
679 }
680
681 struct Order
682 {
683 int32_t order;
684 int32_t lowOffset;
685 int32_t highOffset;
686 };
687
688 class OrderList
689 {
690 public:
691 OrderList();
692 OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset = 0);
693 ~OrderList();
694
695 int32_t size(void) const;
696 void add(int32_t order, int32_t low, int32_t high);
697 const Order *get(int32_t index) const;
698 int32_t getLowOffset(int32_t index) const;
699 int32_t getHighOffset(int32_t index) const;
700 int32_t getOrder(int32_t index) const;
701 void reverse(void);
702 UBool compare(const OrderList &other) const;
703 UBool matchesAt(int32_t offset, const OrderList &other) const;
704
705 private:
706 Order *list;
707 int32_t listMax;
708 int32_t listSize;
709 };
710
OrderList()711 OrderList::OrderList()
712 : list(NULL), listSize(0), listMax(16)
713 {
714 list = new Order[listMax];
715 }
716
OrderList(UCollator * coll,const UnicodeString & string,int32_t stringOffset)717 OrderList::OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset)
718 : list(NULL), listMax(16), listSize(0)
719 {
720 UErrorCode status = U_ZERO_ERROR;
721 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
722 uint32_t strengthMask = 0;
723 int32_t order, low, high;
724
725 switch (ucol_getStrength(coll))
726 {
727 default:
728 strengthMask |= UCOL_TERTIARYORDERMASK;
729 /* fall through */
730
731 case UCOL_SECONDARY:
732 strengthMask |= UCOL_SECONDARYORDERMASK;
733 /* fall through */
734
735 case UCOL_PRIMARY:
736 strengthMask |= UCOL_PRIMARYORDERMASK;
737 }
738
739 list = new Order[listMax];
740
741 ucol_setOffset(elems, stringOffset, &status);
742
743 do {
744 low = ucol_getOffset(elems);
745 order = ucol_next(elems, &status);
746 high = ucol_getOffset(elems);
747
748 if (order != UCOL_NULLORDER) {
749 order &= strengthMask;
750 }
751
752 if (order != UCOL_IGNORABLE) {
753 add(order, low, high);
754 }
755 } while (order != UCOL_NULLORDER);
756
757 ucol_closeElements(elems);
758 }
759
~OrderList()760 OrderList::~OrderList()
761 {
762 delete[] list;
763 }
764
add(int32_t order,int32_t low,int32_t high)765 void OrderList::add(int32_t order, int32_t low, int32_t high)
766 {
767 if (listSize >= listMax) {
768 listMax *= 2;
769
770 Order *newList = new Order[listMax];
771
772 uprv_memcpy(newList, list, listSize * sizeof(Order));
773 delete[] list;
774 list = newList;
775 }
776
777 list[listSize].order = order;
778 list[listSize].lowOffset = low;
779 list[listSize].highOffset = high;
780
781 listSize += 1;
782 }
783
get(int32_t index) const784 const Order *OrderList::get(int32_t index) const
785 {
786 if (index >= listSize) {
787 return NULL;
788 }
789
790 return &list[index];
791 }
792
getLowOffset(int32_t index) const793 int32_t OrderList::getLowOffset(int32_t index) const
794 {
795 const Order *order = get(index);
796
797 if (order != NULL) {
798 return order->lowOffset;
799 }
800
801 return -1;
802 }
803
getHighOffset(int32_t index) const804 int32_t OrderList::getHighOffset(int32_t index) const
805 {
806 const Order *order = get(index);
807
808 if (order != NULL) {
809 return order->highOffset;
810 }
811
812 return -1;
813 }
814
getOrder(int32_t index) const815 int32_t OrderList::getOrder(int32_t index) const
816 {
817 const Order *order = get(index);
818
819 if (order != NULL) {
820 return order->order;
821 }
822
823 return UCOL_NULLORDER;
824 }
825
size() const826 int32_t OrderList::size() const
827 {
828 return listSize;
829 }
830
reverse()831 void OrderList::reverse()
832 {
833 for(int32_t f = 0, b = listSize - 1; f < b; f += 1, b -= 1) {
834 Order swap = list[b];
835
836 list[b] = list[f];
837 list[f] = swap;
838 }
839 }
840
compare(const OrderList & other) const841 UBool OrderList::compare(const OrderList &other) const
842 {
843 if (listSize != other.listSize) {
844 return FALSE;
845 }
846
847 for(int32_t i = 0; i < listSize; i += 1) {
848 if (list[i].order != other.list[i].order ||
849 list[i].lowOffset != other.list[i].lowOffset ||
850 list[i].highOffset != other.list[i].highOffset) {
851 return FALSE;
852 }
853 }
854
855 return TRUE;
856 }
857
matchesAt(int32_t offset,const OrderList & other) const858 UBool OrderList::matchesAt(int32_t offset, const OrderList &other) const
859 {
860 // NOTE: sizes include the NULLORDER, which we don't want to compare.
861 int32_t otherSize = other.size() - 1;
862
863 if (listSize - 1 - offset < otherSize) {
864 return FALSE;
865 }
866
867 for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) {
868 if (getOrder(i) != other.getOrder(j)) {
869 return FALSE;
870 }
871 }
872
873 return TRUE;
874 }
875
printOffsets(char * buffer,OrderList & list)876 static char *printOffsets(char *buffer, OrderList &list)
877 {
878 int32_t size = list.size();
879 char *s = buffer;
880
881 for(int32_t i = 0; i < size; i += 1) {
882 const Order *order = list.get(i);
883
884 if (i != 0) {
885 s += sprintf(s, ", ");
886 }
887
888 s += sprintf(s, "(%d, %d)", order->lowOffset, order->highOffset);
889 }
890
891 return buffer;
892 }
893
printOrders(char * buffer,OrderList & list)894 static char *printOrders(char *buffer, OrderList &list)
895 {
896 int32_t size = list.size();
897 char *s = buffer;
898
899 for(int32_t i = 0; i < size; i += 1) {
900 const Order *order = list.get(i);
901
902 if (i != 0) {
903 s += sprintf(s, ", ");
904 }
905
906 s += sprintf(s, "%8.8X", order->order);
907 }
908
909 return buffer;
910 }
911
offsetTest()912 void SSearchTest::offsetTest()
913 {
914 const char *test[] = {
915 // The sequence \u0FB3\u0F71\u0F71\u0F80 contains a discontiguous
916 // contraction (\u0FB3\u0F71\u0F80) logically followed by \u0F71.
917 "\\u1E33\\u0FB3\\u0F71\\u0F71\\u0F80\\uD835\\uDF6C\\u01B0",
918
919 "\\ua191\\u16ef\\u2036\\u017a",
920
921 #if 0
922 // This results in a complex interaction between contraction,
923 // expansion and normalization that confuses the backwards offset fixups.
924 "\\u0F7F\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
925 #endif
926
927 "\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
928 "\\u07E9\\u07EA\\u07F1\\u07F2\\u07F3",
929
930 "\\u02FE\\u02FF"
931 "\\u0300\\u0301\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u0309\\u030A\\u030B\\u030C\\u030D\\u030E\\u030F"
932 "\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\\u031A\\u031B\\u031C\\u031D\\u031E\\u031F"
933 "\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F"
934 "\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\\u033A\\u033B\\u033C\\u033D\\u033E\\u033F"
935 "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\\u034A\\u034B\\u034C\\u034D\\u034E",
936
937 "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318",
938 "abc\\u0E41\\u0301\\u0316",
939 "abc\\u0E41\\u0316\\u0301",
940 "\\u0E41\\u0301\\u0316",
941 "\\u0E41\\u0316\\u0301",
942 "a\\u0301\\u0316",
943 "a\\u0316\\u0301",
944 "\\uAC52\\uAC53",
945 "\\u34CA\\u34CB",
946 "\\u11ED\\u11EE",
947 "\\u30C3\\u30D0",
948 "p\\u00E9ch\\u00E9",
949 "a\\u0301\\u0325",
950 "a\\u0300\\u0325",
951 "a\\u0325\\u0300",
952 "A\\u0323\\u0300B",
953 "A\\u0300\\u0323B",
954 "A\\u0301\\u0323B",
955 "A\\u0302\\u0301\\u0323B",
956 "abc",
957 "ab\\u0300c",
958 "ab\\u0300\\u0323c",
959 " \\uD800\\uDC00\\uDC00",
960 "a\\uD800\\uDC00\\uDC00",
961 "A\\u0301\\u0301",
962 "A\\u0301\\u0323",
963 "A\\u0301\\u0323B",
964 "B\\u0301\\u0323C",
965 "A\\u0300\\u0323B",
966 "\\u0301A\\u0301\\u0301",
967 "abcd\\r\\u0301",
968 "p\\u00EAche",
969 "pe\\u0302che",
970 };
971
972 int32_t testCount = ARRAY_SIZE(test);
973 UErrorCode status = U_ZERO_ERROR;
974 RuleBasedCollator *col = (RuleBasedCollator *) Collator::createInstance(Locale::getEnglish(), status);
975 if (U_FAILURE(status)) {
976 errcheckln(status, "Failed to create collator in offsetTest! - %s", u_errorName(status));
977 return;
978 }
979 char buffer[4096]; // A bit of a hack... just happens to be long enough for all the test cases...
980 // We could allocate one that's the right size by (CE_count * 10) + 2
981 // 10 chars is enough room for 8 hex digits plus ", ". 2 extra chars for "[" and "]"
982
983 col->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
984
985 for(int32_t i = 0; i < testCount; i += 1) {
986 UnicodeString ts = CharsToUnicodeString(test[i]);
987 CollationElementIterator *iter = col->createCollationElementIterator(ts);
988 OrderList forwardList;
989 OrderList backwardList;
990 int32_t order, low, high;
991
992 do {
993 low = iter->getOffset();
994 order = iter->next(status);
995 high = iter->getOffset();
996
997 forwardList.add(order, low, high);
998 } while (order != CollationElementIterator::NULLORDER);
999
1000 iter->reset();
1001 iter->setOffset(ts.length(), status);
1002
1003 backwardList.add(CollationElementIterator::NULLORDER, iter->getOffset(), iter->getOffset());
1004
1005 do {
1006 high = iter->getOffset();
1007 order = iter->previous(status);
1008 low = iter->getOffset();
1009
1010 if (order == CollationElementIterator::NULLORDER) {
1011 break;
1012 }
1013
1014 backwardList.add(order, low, high);
1015 } while (TRUE);
1016
1017 backwardList.reverse();
1018
1019 if (forwardList.compare(backwardList)) {
1020 logln("Works with \"%s\"", test[i]);
1021 logln("Forward offsets: [%s]", printOffsets(buffer, forwardList));
1022 // logln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
1023
1024 logln("Forward CEs: [%s]", printOrders(buffer, forwardList));
1025 // logln("Backward CEs: [%s]", printOrders(buffer, backwardList));
1026
1027 logln();
1028 } else {
1029 errln("Fails with \"%s\"", test[i]);
1030 infoln("Forward offsets: [%s]", printOffsets(buffer, forwardList));
1031 infoln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
1032
1033 infoln("Forward CEs: [%s]", printOrders(buffer, forwardList));
1034 infoln("Backward CEs: [%s]", printOrders(buffer, backwardList));
1035
1036 infoln();
1037 }
1038 delete iter;
1039 }
1040 delete col;
1041 }
1042
escape(const UnicodeString & string,UnicodeString & buffer)1043 static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer)
1044 {
1045 for(int32_t i = 0; i < string.length(); i += 1) {
1046 UChar32 ch = string.char32At(i);
1047
1048 if (ch >= 0x0020 && ch <= 0x007F) {
1049 if (ch == 0x005C) {
1050 buffer.append("\\\\");
1051 } else {
1052 buffer.append(ch);
1053 }
1054 } else {
1055 char cbuffer[12];
1056
1057 if (ch <= 0xFFFFL) {
1058 sprintf(cbuffer, "\\u%4.4X", ch);
1059 } else {
1060 sprintf(cbuffer, "\\U%8.8X", ch);
1061 }
1062
1063 buffer.append(cbuffer);
1064 }
1065
1066 if (ch >= 0x10000L) {
1067 i += 1;
1068 }
1069 }
1070
1071 return buffer;
1072 }
1073
1074 #if 1
1075
1076 struct PCE
1077 {
1078 uint64_t ce;
1079 int32_t lowOffset;
1080 int32_t highOffset;
1081 };
1082
1083 class PCEList
1084 {
1085 public:
1086 PCEList(UCollator *coll, const UnicodeString &string);
1087 ~PCEList();
1088
1089 int32_t size() const;
1090
1091 const PCE *get(int32_t index) const;
1092
1093 int32_t getLowOffset(int32_t index) const;
1094 int32_t getHighOffset(int32_t index) const;
1095 uint64_t getOrder(int32_t index) const;
1096
1097 UBool matchesAt(int32_t offset, const PCEList &other) const;
1098
1099 uint64_t operator[](int32_t index) const;
1100
1101 private:
1102 void add(uint64_t ce, int32_t low, int32_t high);
1103
1104 PCE *list;
1105 int32_t listMax;
1106 int32_t listSize;
1107 };
1108
PCEList(UCollator * coll,const UnicodeString & string)1109 PCEList::PCEList(UCollator *coll, const UnicodeString &string)
1110 {
1111 UErrorCode status = U_ZERO_ERROR;
1112 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
1113 uint64_t order;
1114 int32_t low, high;
1115
1116 list = new PCE[listMax];
1117
1118 ucol_setOffset(elems, 0, &status);
1119
1120 do {
1121 order = ucol_nextProcessed(elems, &low, &high, &status);
1122 add(order, low, high);
1123 } while (order != UCOL_PROCESSED_NULLORDER);
1124
1125 ucol_closeElements(elems);
1126 }
1127
~PCEList()1128 PCEList::~PCEList()
1129 {
1130 delete[] list;
1131 }
1132
add(uint64_t order,int32_t low,int32_t high)1133 void PCEList::add(uint64_t order, int32_t low, int32_t high)
1134 {
1135 if (listSize >= listMax) {
1136 listMax *= 2;
1137
1138 PCE *newList = new PCE[listMax];
1139
1140 uprv_memcpy(newList, list, listSize * sizeof(Order));
1141 delete[] list;
1142 list = newList;
1143 }
1144
1145 list[listSize].ce = order;
1146 list[listSize].lowOffset = low;
1147 list[listSize].highOffset = high;
1148
1149 listSize += 1;
1150 }
1151
get(int32_t index) const1152 const PCE *PCEList::get(int32_t index) const
1153 {
1154 if (index >= listSize) {
1155 return NULL;
1156 }
1157
1158 return &list[index];
1159 }
1160
getLowOffset(int32_t index) const1161 int32_t PCEList::getLowOffset(int32_t index) const
1162 {
1163 const PCE *pce = get(index);
1164
1165 if (pce != NULL) {
1166 return pce->lowOffset;
1167 }
1168
1169 return -1;
1170 }
1171
getHighOffset(int32_t index) const1172 int32_t PCEList::getHighOffset(int32_t index) const
1173 {
1174 const PCE *pce = get(index);
1175
1176 if (pce != NULL) {
1177 return pce->highOffset;
1178 }
1179
1180 return -1;
1181 }
1182
getOrder(int32_t index) const1183 uint64_t PCEList::getOrder(int32_t index) const
1184 {
1185 const PCE *pce = get(index);
1186
1187 if (pce != NULL) {
1188 return pce->ce;
1189 }
1190
1191 return UCOL_PROCESSED_NULLORDER;
1192 }
1193
size() const1194 int32_t PCEList::size() const
1195 {
1196 return listSize;
1197 }
1198
matchesAt(int32_t offset,const PCEList & other) const1199 UBool PCEList::matchesAt(int32_t offset, const PCEList &other) const
1200 {
1201 // NOTE: sizes include the NULLORDER, which we don't want to compare.
1202 int32_t otherSize = other.size() - 1;
1203
1204 if (listSize - 1 - offset < otherSize) {
1205 return FALSE;
1206 }
1207
1208 for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) {
1209 if (getOrder(i) != other.getOrder(j)) {
1210 return FALSE;
1211 }
1212 }
1213
1214 return TRUE;
1215 }
1216
operator [](int32_t index) const1217 uint64_t PCEList::operator[](int32_t index) const
1218 {
1219 return getOrder(index);
1220 }
1221
boyerMooreTest()1222 void SSearchTest::boyerMooreTest()
1223 {
1224 UErrorCode status = U_ZERO_ERROR;
1225 UCollator *coll = NULL;
1226 CollData *data = NULL;
1227 UnicodeString lp = "fuss";
1228 UnicodeString sp = "fu\\u00DF";
1229 BoyerMooreSearch *longPattern = NULL;
1230 BoyerMooreSearch *shortPattern = NULL;
1231 UnicodeString targets[] = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball",
1232 "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF",
1233 "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"};
1234 int32_t start = -1, end = -1;
1235
1236 coll = ucol_openFromShortString("S1", FALSE, NULL, &status);
1237 if (U_FAILURE(status)) {
1238 errcheckln(status, "Could not open collator. - %s", u_errorName(status));
1239 return;
1240 }
1241
1242 data = CollData::open(coll, status);
1243 if (U_FAILURE(status)) {
1244 errln("Could not open CollData object.");
1245 goto close_data;
1246 }
1247
1248
1249 longPattern = new BoyerMooreSearch(data, lp.unescape(), NULL, status);
1250 shortPattern = new BoyerMooreSearch(data, sp.unescape(), NULL, status);
1251 if (U_FAILURE(status)) {
1252 errln("Could not create pattern objects.");
1253 goto close_patterns;
1254 }
1255
1256 for (int32_t t = 0; t < (sizeof(targets)/sizeof(targets[0])); t += 1) {
1257 UnicodeString target = targets[t].unescape();
1258
1259 longPattern->setTargetString(&target, status);
1260 if (longPattern->search(0, start, end)) {
1261 logln("Test %d: found long pattern at [%d, %d].", t, start, end);
1262 } else {
1263 errln("Test %d: did not find long pattern.", t);
1264 }
1265
1266 shortPattern->setTargetString(&target, status);
1267 if (shortPattern->search(0, start, end)) {
1268 logln("Test %d: found short pattern at [%d, %d].", t, start, end);
1269 } else {
1270 errln("Test %d: did not find short pattern.", t);
1271 }
1272 }
1273
1274 close_patterns:
1275 delete shortPattern;
1276 delete longPattern;
1277
1278 close_data:
1279 CollData::close(data);
1280 ucol_close(coll);
1281 }
1282
bmsTest()1283 void SSearchTest::bmsTest()
1284 {
1285 UErrorCode status = U_ZERO_ERROR;
1286 UCollator *coll = NULL;
1287 UCD *data = NULL;
1288 UnicodeString lp = "fuss";
1289 UnicodeString lpu = lp.unescape();
1290 UnicodeString sp = "fu\\u00DF";
1291 UnicodeString spu = sp.unescape();
1292 BMS *longPattern = NULL;
1293 BMS *shortPattern = NULL;
1294 UnicodeString targets[] = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball",
1295 "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF",
1296 "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"};
1297 int32_t start = -1, end = -1;
1298
1299 coll = ucol_openFromShortString("S1", FALSE, NULL, &status);
1300 if (U_FAILURE(status)) {
1301 errcheckln(status, "Could not open collator. - %s", u_errorName(status));
1302 return;
1303 }
1304
1305 data = ucd_open(coll, &status);
1306 if (U_FAILURE(status)) {
1307 errln("Could not open CollData object.");
1308 goto close_data;
1309 }
1310
1311 longPattern = bms_open(data, lpu.getBuffer(), lpu.length(), NULL, 0, &status);
1312 shortPattern = bms_open(data, spu.getBuffer(), spu.length(), NULL, 0, &status);
1313 if (U_FAILURE(status)) {
1314 errln("Couldn't open pattern objects.");
1315 goto close_patterns;
1316 }
1317
1318 for (int32_t t = 0; t < (sizeof(targets)/sizeof(targets[0])); t += 1) {
1319 UnicodeString target = targets[t].unescape();
1320
1321 bms_setTargetString(longPattern, target.getBuffer(), target.length(), &status);
1322 if (bms_search(longPattern, 0, &start, &end)) {
1323 logln("Test %d: found long pattern at [%d, %d].", t, start, end);
1324 } else {
1325 errln("Test %d: did not find long pattern.", t);
1326 }
1327
1328 bms_setTargetString(shortPattern, target.getBuffer(), target.length(), &status);
1329 if (bms_search(shortPattern, 0, &start, &end)) {
1330 logln("Test %d: found short pattern at [%d, %d].", t, start, end);
1331 } else {
1332 errln("Test %d: did not find short pattern.", t);
1333 }
1334 }
1335
1336 close_patterns:
1337 bms_close(shortPattern);
1338 bms_close(longPattern);
1339
1340 close_data:
1341 ucd_close(data);
1342 ucol_close(coll);
1343 }
1344
goodSuffixTest()1345 void SSearchTest::goodSuffixTest()
1346 {
1347 UErrorCode status = U_ZERO_ERROR;
1348 UCollator *coll = NULL;
1349 CollData *data = NULL;
1350 UnicodeString pat = /*"gcagagag"*/ "fxeld";
1351 UnicodeString target = /*"gcatcgcagagagtatacagtacg"*/ "cloveldfxeld";
1352 BoyerMooreSearch *pattern = NULL;
1353 int32_t start = -1, end = -1;
1354
1355 coll = ucol_open(NULL, &status);
1356 if (U_FAILURE(status)) {
1357 errcheckln(status, "Couldn't open collator. - %s", u_errorName(status));
1358 return;
1359 }
1360
1361 data = CollData::open(coll, status);
1362 if (U_FAILURE(status)) {
1363 errln("Couldn't open CollData object.");
1364 goto close_data;
1365 }
1366
1367 pattern = new BoyerMooreSearch(data, pat, &target, status);
1368 if (U_FAILURE(status)) {
1369 errln("Couldn't open pattern object.");
1370 goto close_pattern;
1371 }
1372
1373 if (pattern->search(0, start, end)) {
1374 logln("Found pattern at [%d, %d].", start, end);
1375 } else {
1376 errln("Did not find pattern.");
1377 }
1378
1379 close_pattern:
1380 delete pattern;
1381
1382 close_data:
1383 CollData::close(data);
1384 ucol_close(coll);
1385 }
1386
1387 //
1388 // searchTime() A quick and dirty performance test for string search.
1389 // Probably doesn't really belong as part of intltest, but it
1390 // does check that the search succeeds, and gets the right result,
1391 // so it serves as a functionality test also.
1392 //
1393 // To run as a perf test, up the loop count, select by commenting
1394 // and uncommenting in the code the operation to be measured,
1395 // rebuild, and measure the running time of this test alone.
1396 //
1397 // time LD_LIBRARY_PATH=whatever ./intltest collate/SSearchTest/searchTime
1398 //
searchTime()1399 void SSearchTest::searchTime() {
1400 static const char *longishText =
1401 "Whylom, as olde stories tellen us,\n"
1402 "Ther was a duk that highte Theseus:\n"
1403 "Of Athenes he was lord and governour,\n"
1404 "And in his tyme swich a conquerour,\n"
1405 "That gretter was ther noon under the sonne.\n"
1406 "Ful many a riche contree hadde he wonne;\n"
1407 "What with his wisdom and his chivalrye,\n"
1408 "He conquered al the regne of Femenye,\n"
1409 "That whylom was y-cleped Scithia;\n"
1410 "And weddede the quene Ipolita,\n"
1411 "And broghte hir hoom with him in his contree\n"
1412 "With muchel glorie and greet solempnitee,\n"
1413 "And eek hir yonge suster Emelye.\n"
1414 "And thus with victorie and with melodye\n"
1415 "Lete I this noble duk to Athenes ryde,\n"
1416 "And al his hoost, in armes, him bisyde.\n"
1417 "And certes, if it nere to long to here,\n"
1418 "I wolde han told yow fully the manere,\n"
1419 "How wonnen was the regne of Femenye\n"
1420 "By Theseus, and by his chivalrye;\n"
1421 "And of the grete bataille for the nones\n"
1422 "Bitwixen Athen's and Amazones;\n"
1423 "And how asseged was Ipolita,\n"
1424 "The faire hardy quene of Scithia;\n"
1425 "And of the feste that was at hir weddinge,\n"
1426 "And of the tempest at hir hoom-cominge;\n"
1427 "But al that thing I moot as now forbere.\n"
1428 "I have, God woot, a large feeld to ere,\n"
1429 "And wayke been the oxen in my plough.\n"
1430 "The remenant of the tale is long y-nough.\n"
1431 "I wol nat letten eek noon of this route;\n"
1432 "Lat every felawe telle his tale aboute,\n"
1433 "And lat see now who shal the soper winne;\n"
1434 "And ther I lefte, I wol ageyn biginne.\n"
1435 "This duk, of whom I make mencioun,\n"
1436 "When he was come almost unto the toun,\n"
1437 "In al his wele and in his moste pryde,\n"
1438 "He was war, as he caste his eye asyde,\n"
1439 "Wher that ther kneled in the hye weye\n"
1440 "A companye of ladies, tweye and tweye,\n"
1441 "Ech after other, clad in clothes blake; \n"
1442 "But swich a cry and swich a wo they make,\n"
1443 "That in this world nis creature livinge,\n"
1444 "That herde swich another weymentinge;\n"
1445 "And of this cry they nolde never stenten,\n"
1446 "Til they the reynes of his brydel henten.\n"
1447 "'What folk ben ye, that at myn hoomcominge\n"
1448 "Perturben so my feste with cryinge'?\n"
1449 "Quod Theseus, 'have ye so greet envye\n"
1450 "Of myn honour, that thus compleyne and crye? \n"
1451 "Or who hath yow misboden, or offended?\n"
1452 "And telleth me if it may been amended;\n"
1453 "And why that ye ben clothed thus in blak'?\n"
1454 "The eldest lady of hem alle spak,\n"
1455 "When she hadde swowned with a deedly chere,\n"
1456 "That it was routhe for to seen and here,\n"
1457 "And seyde: 'Lord, to whom Fortune hath yiven\n"
1458 "Victorie, and as a conquerour to liven,\n"
1459 "Noght greveth us your glorie and your honour;\n"
1460 "But we biseken mercy and socour.\n"
1461 "Have mercy on our wo and our distresse.\n"
1462 "Som drope of pitee, thurgh thy gentilesse,\n"
1463 "Up-on us wrecched wommen lat thou falle.\n"
1464 "For certes, lord, ther nis noon of us alle,\n"
1465 "That she nath been a duchesse or a quene;\n"
1466 "Now be we caitifs, as it is wel sene:\n"
1467 "Thanked be Fortune, and hir false wheel,\n"
1468 "That noon estat assureth to be weel.\n"
1469 "And certes, lord, t'abyden your presence,\n"
1470 "Here in the temple of the goddesse Clemence\n"
1471 "We han ben waytinge al this fourtenight;\n"
1472 "Now help us, lord, sith it is in thy might.\n"
1473 "I wrecche, which that wepe and waille thus,\n"
1474 "Was whylom wyf to king Capaneus,\n"
1475 "That starf at Thebes, cursed be that day!\n"
1476 "And alle we, that been in this array,\n"
1477 "And maken al this lamentacioun,\n"
1478 "We losten alle our housbondes at that toun,\n"
1479 "Whyl that the sege ther-aboute lay.\n"
1480 "And yet now th'olde Creon, weylaway!\n"
1481 "The lord is now of Thebes the citee, \n"
1482 "Fulfild of ire and of iniquitee,\n"
1483 "He, for despyt, and for his tirannye,\n"
1484 "To do the dede bodyes vileinye,\n"
1485 "Of alle our lordes, whiche that ben slawe,\n"
1486 "Hath alle the bodyes on an heep y-drawe,\n"
1487 "And wol nat suffren hem, by noon assent,\n"
1488 "Neither to been y-buried nor y-brent,\n"
1489 "But maketh houndes ete hem in despyt. zet'\n";
1490
1491 #define TEST_BOYER_MOORE 1
1492 const char *cPattern = "maketh houndes ete hem";
1493 //const char *cPattern = "Whylom";
1494 //const char *cPattern = "zet";
1495 const char *testId = "searchTime()"; // for error macros.
1496 UnicodeString target = longishText;
1497 UErrorCode status = U_ZERO_ERROR;
1498
1499
1500 UCollator *collator = ucol_open("en", &status);
1501 CollData *data = CollData::open(collator, status);
1502 if (U_FAILURE(status) || collator == NULL || data == NULL) {
1503 errcheckln(status, "Unable to open UCollator or CollData. - %s", u_errorName(status));
1504 return;
1505 }
1506 //ucol_setStrength(collator, collatorStrength);
1507 //ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, normalize, &status);
1508 UnicodeString uPattern = cPattern;
1509 #ifndef TEST_BOYER_MOORE
1510 UStringSearch *uss = usearch_openFromCollator(uPattern.getBuffer(), uPattern.length(),
1511 target.getBuffer(), target.length(),
1512 collator,
1513 NULL, // the break iterator
1514 &status);
1515 TEST_ASSERT_SUCCESS(status);
1516 #else
1517 BoyerMooreSearch bms(data, uPattern, &target, status);
1518 TEST_ASSERT_SUCCESS(status);
1519 #endif
1520
1521 // int32_t foundStart;
1522 // int32_t foundEnd;
1523 UBool found;
1524
1525 // Find the match position usgin strstr
1526 const char *pm = strstr(longishText, cPattern);
1527 TEST_ASSERT_M(pm!=NULL, "No pattern match with strstr");
1528 int32_t refMatchPos = (int32_t)(pm - longishText);
1529 int32_t icuMatchPos;
1530 int32_t icuMatchEnd;
1531 #ifndef TEST_BOYER_MOORE
1532 usearch_search(uss, 0, &icuMatchPos, &icuMatchEnd, &status);
1533 TEST_ASSERT_SUCCESS(status);
1534 #else
1535 found = bms.search(0, icuMatchPos, icuMatchEnd);
1536 #endif
1537 TEST_ASSERT_M(refMatchPos == icuMatchPos, "strstr and icu give different match positions.");
1538
1539 int32_t i;
1540 int32_t j=0;
1541
1542 // Try loopcounts around 100000 to some millions, depending on the operation,
1543 // to get runtimes of at least several seconds.
1544 for (i=0; i<10000; i++) {
1545 #ifndef TEST_BOYER_MOORE
1546 found = usearch_search(uss, 0, &icuMatchPos, &icuMatchEnd, &status);
1547 #else
1548 found = bms.search(0, icuMatchPos, icuMatchEnd);
1549 #endif
1550 //TEST_ASSERT_SUCCESS(status);
1551 //TEST_ASSERT(found);
1552
1553 // usearch_setOffset(uss, 0, &status);
1554 // icuMatchPos = usearch_next(uss, &status);
1555
1556 // The i+j stuff is to confuse the optimizer and get it to actually leave the
1557 // call to strstr in place.
1558 //pm = strstr(longishText+j, cPattern);
1559 //j = (j + i)%5;
1560 }
1561
1562 printf("%d\n", pm-longishText, j);
1563 #ifndef TEST_BOYER_MOORE
1564 usearch_close(uss);
1565 #else
1566 CollData::close(data);
1567 #endif
1568 ucol_close(collator);
1569 }
1570 #endif
1571
1572 //----------------------------------------------------------------------------------------
1573 //
1574 // Random Numbers. Similar to standard lib rand() and srand()
1575 // Not using library to
1576 // 1. Get same results on all platforms.
1577 // 2. Get access to current seed, to more easily reproduce failures.
1578 //
1579 //---------------------------------------------------------------------------------------
1580 static uint32_t m_seed = 1;
1581
m_rand()1582 static uint32_t m_rand()
1583 {
1584 m_seed = m_seed * 1103515245 + 12345;
1585 return (uint32_t)(m_seed/65536) % 32768;
1586 }
1587
1588 class Monkey
1589 {
1590 public:
1591 virtual void append(UnicodeString &test, UnicodeString &alternate) = 0;
1592
1593 protected:
1594 Monkey();
1595 virtual ~Monkey();
1596 };
1597
Monkey()1598 Monkey::Monkey()
1599 {
1600 // ook?
1601 }
1602
~Monkey()1603 Monkey::~Monkey()
1604 {
1605 // ook?
1606 }
1607
1608 class SetMonkey : public Monkey
1609 {
1610 public:
1611 SetMonkey(const USet *theSet);
1612 ~SetMonkey();
1613
1614 virtual void append(UnicodeString &test, UnicodeString &alternate);
1615
1616 private:
1617 const USet *set;
1618 };
1619
SetMonkey(const USet * theSet)1620 SetMonkey::SetMonkey(const USet *theSet)
1621 : Monkey(), set(theSet)
1622 {
1623 // ook?
1624 }
1625
~SetMonkey()1626 SetMonkey::~SetMonkey()
1627 {
1628 //ook...
1629 }
1630
append(UnicodeString & test,UnicodeString & alternate)1631 void SetMonkey::append(UnicodeString &test, UnicodeString &alternate)
1632 {
1633 int32_t size = uset_size(set);
1634 int32_t index = m_rand() % size;
1635 UChar32 ch = uset_charAt(set, index);
1636 UnicodeString str(ch);
1637
1638 test.append(str);
1639 alternate.append(str); // flip case, or some junk?
1640 }
1641
1642 class StringSetMonkey : public Monkey
1643 {
1644 public:
1645 StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData);
1646 ~StringSetMonkey();
1647
1648 void append(UnicodeString &testCase, UnicodeString &alternate);
1649
1650 private:
1651 UnicodeString &generateAlternative(const UnicodeString &testCase, UnicodeString &alternate);
1652
1653 const USet *set;
1654 UCollator *coll;
1655 CollData *collData;
1656 };
1657
StringSetMonkey(const USet * theSet,UCollator * theCollator,CollData * theCollData)1658 StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData)
1659 : Monkey(), set(theSet), coll(theCollator), collData(theCollData)
1660 {
1661 // ook.
1662 }
1663
~StringSetMonkey()1664 StringSetMonkey::~StringSetMonkey()
1665 {
1666 // ook?
1667 }
1668
append(UnicodeString & testCase,UnicodeString & alternate)1669 void StringSetMonkey::append(UnicodeString &testCase, UnicodeString &alternate)
1670 {
1671 int32_t itemCount = uset_getItemCount(set), len = 0;
1672 int32_t index = m_rand() % itemCount;
1673 UChar32 rangeStart = 0, rangeEnd = 0;
1674 UChar buffer[16];
1675 UErrorCode err = U_ZERO_ERROR;
1676
1677 len = uset_getItem(set, index, &rangeStart, &rangeEnd, buffer, 16, &err);
1678
1679 if (len == 0) {
1680 int32_t offset = m_rand() % (rangeEnd - rangeStart + 1);
1681 UChar32 ch = rangeStart + offset;
1682 UnicodeString str(ch);
1683
1684 testCase.append(str);
1685 generateAlternative(str, alternate);
1686 } else if (len > 0) {
1687 // should check that len < 16...
1688 UnicodeString str(buffer, len);
1689
1690 testCase.append(str);
1691 generateAlternative(str, alternate);
1692 } else {
1693 // shouldn't happen...
1694 }
1695 }
1696
generateAlternative(const UnicodeString & testCase,UnicodeString & alternate)1697 UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCase, UnicodeString &alternate)
1698 {
1699 // find out shortest string for the longest sequence of ces.
1700 // needs to be refined to use dynamic programming, but will be roughly right
1701 UErrorCode status = U_ZERO_ERROR;
1702 CEList ceList(coll, testCase, status);
1703 UnicodeString alt;
1704 int32_t offset = 0;
1705
1706 if (ceList.size() == 0) {
1707 return alternate.append(testCase);
1708 }
1709
1710 while (offset < ceList.size()) {
1711 int32_t ce = ceList.get(offset);
1712 const StringList *strings = collData->getStringList(ce);
1713
1714 if (strings == NULL) {
1715 return alternate.append(testCase);
1716 }
1717
1718 int32_t stringCount = strings->size();
1719 int32_t tries = 0;
1720
1721 // find random string that generates the same CEList
1722 const CEList *ceList2 = NULL;
1723 const UnicodeString *string = NULL;
1724 UBool matches = FALSE;
1725
1726 do {
1727 int32_t s = m_rand() % stringCount;
1728
1729 if (tries++ > stringCount) {
1730 alternate.append(testCase);
1731 return alternate;
1732 }
1733
1734 string = strings->get(s);
1735 ceList2 = collData->getCEList(string);
1736 matches = ceList.matchesAt(offset, ceList2);
1737
1738 if (! matches) {
1739 collData->freeCEList((CEList *) ceList2);
1740 }
1741 } while (! matches);
1742
1743 alt.append(*string);
1744 offset += ceList2->size();
1745 collData->freeCEList(ceList2);
1746 }
1747
1748 const CEList altCEs(coll, alt, status);
1749
1750 if (ceList.matchesAt(0, &altCEs)) {
1751 return alternate.append(alt);
1752 }
1753
1754 return alternate.append(testCase);
1755 }
1756
generateTestCase(UCollator * coll,Monkey * monkeys[],int32_t monkeyCount,UnicodeString & testCase,UnicodeString & alternate)1757 static void generateTestCase(UCollator *coll, Monkey *monkeys[], int32_t monkeyCount, UnicodeString &testCase, UnicodeString &alternate)
1758 {
1759 int32_t pieces = (m_rand() % 4) + 1;
1760 UErrorCode status = U_ZERO_ERROR;
1761 UBool matches;
1762
1763 do {
1764 testCase.remove();
1765 alternate.remove();
1766 monkeys[0]->append(testCase, alternate);
1767
1768 for(int32_t piece = 0; piece < pieces; piece += 1) {
1769 int32_t monkey = m_rand() % monkeyCount;
1770
1771 monkeys[monkey]->append(testCase, alternate);
1772 }
1773
1774 const CEList ceTest(coll, testCase, status);
1775 const CEList ceAlt(coll, alternate, status);
1776
1777 matches = ceTest.matchesAt(0, &ceAlt);
1778 } while (! matches);
1779 }
1780
1781 //
1782 // Find the next acceptable boundary following the specified starting index
1783 // in the target text being searched.
1784 // TODO: refine what is an acceptable boundary. For the moment,
1785 // choose the next position not within a combining sequence.
1786 //
nextBoundaryAfter(const UnicodeString & string,int32_t startIndex)1787 static int32_t nextBoundaryAfter(const UnicodeString &string, int32_t startIndex) {
1788 const UChar *text = string.getBuffer();
1789 int32_t textLen = string.length();
1790
1791 if (startIndex >= textLen) {
1792 return startIndex;
1793 }
1794
1795 UChar32 c;
1796 int32_t i = startIndex;
1797
1798 U16_NEXT(text, i, textLen, c);
1799
1800 // If we are on a control character, stop without looking for combining marks.
1801 // Control characters do not combine.
1802 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1803 if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
1804 return i;
1805 }
1806
1807 // The initial character was not a control, and can thus accept trailing
1808 // combining characters. Advance over however many of them there are.
1809 int32_t indexOfLastCharChecked;
1810
1811 for (;;) {
1812 indexOfLastCharChecked = i;
1813
1814 if (i>=textLen) {
1815 break;
1816 }
1817
1818 U16_NEXT(text, i, textLen, c);
1819 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1820
1821 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
1822 break;
1823 }
1824 }
1825
1826 return indexOfLastCharChecked;
1827 }
1828
isInCombiningSequence(const UnicodeString & string,int32_t index)1829 static UBool isInCombiningSequence(const UnicodeString &string, int32_t index) {
1830 const UChar *text = string.getBuffer();
1831 int32_t textLen = string.length();
1832
1833 if (index>=textLen || index<=0) {
1834 return FALSE;
1835 }
1836
1837 // If the character at the current index is not a GRAPHEME_EXTEND
1838 // then we can not be within a combining sequence.
1839 UChar32 c;
1840 U16_GET(text, 0, index, textLen, c);
1841 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1842 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
1843 return FALSE;
1844 }
1845
1846 // We are at a combining mark. If the preceding character is anything
1847 // except a CONTROL, CR or LF, we are in a combining sequence.
1848 U16_PREV(text, 0, index, c);
1849 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1850
1851 return !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
1852 }
1853
simpleSearch(UCollator * coll,const UnicodeString & target,int32_t offset,const UnicodeString & pattern,int32_t & matchStart,int32_t & matchEnd)1854 static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd)
1855 {
1856 UErrorCode status = U_ZERO_ERROR;
1857 OrderList targetOrders(coll, target, offset);
1858 OrderList patternOrders(coll, pattern);
1859 int32_t targetSize = targetOrders.size() - 1;
1860 int32_t patternSize = patternOrders.size() - 1;
1861 UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
1862 target.getBuffer(), target.length(), &status);
1863
1864 if (patternSize == 0) {
1865 // Searching for an empty pattern always fails
1866 matchStart = matchEnd = -1;
1867 ubrk_close(charBreakIterator);
1868 return FALSE;
1869 }
1870
1871 matchStart = matchEnd = -1;
1872
1873 for(int32_t i = 0; i < targetSize; i += 1) {
1874 if (targetOrders.matchesAt(i, patternOrders)) {
1875 int32_t start = targetOrders.getLowOffset(i);
1876 int32_t maxLimit = targetOrders.getLowOffset(i + patternSize);
1877 int32_t minLimit = targetOrders.getLowOffset(i + patternSize - 1);
1878
1879 // if the low and high offsets of the first CE in
1880 // the match are the same, it means that the match
1881 // starts in the middle of an expansion - all but
1882 // the first CE of the expansion will have the offset
1883 // of the following character.
1884 if (start == targetOrders.getHighOffset(i)) {
1885 continue;
1886 }
1887
1888 // Make sure match starts on a grapheme boundary
1889 if (! ubrk_isBoundary(charBreakIterator, start)) {
1890 continue;
1891 }
1892
1893 // If the low and high offsets of the CE after the match
1894 // are the same, it means that the match ends in the middle
1895 // of an expansion sequence.
1896 if (maxLimit == targetOrders.getHighOffset(i + patternSize) &&
1897 targetOrders.getOrder(i + patternSize) != UCOL_NULLORDER) {
1898 continue;
1899 }
1900
1901 int32_t mend = maxLimit;
1902
1903 // Find the first grapheme break after the character index
1904 // of the last CE in the match. If it's after character index
1905 // that's after the last CE in the match, use that index
1906 // as the end of the match.
1907 if (minLimit < maxLimit) {
1908 int32_t nba = ubrk_following(charBreakIterator, minLimit);
1909
1910 if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) {
1911 mend = nba;
1912 }
1913 }
1914
1915 if (mend > maxLimit) {
1916 continue;
1917 }
1918
1919 if (! ubrk_isBoundary(charBreakIterator, mend)) {
1920 continue;
1921 }
1922
1923 matchStart = start;
1924 matchEnd = mend;
1925
1926 ubrk_close(charBreakIterator);
1927 return TRUE;
1928 }
1929 }
1930
1931 ubrk_close(charBreakIterator);
1932 return FALSE;
1933 }
1934
1935 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)1936 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) {
1937 int32_t val = defaultVal;
1938
1939 name.append(" *= *(-?\\d+)");
1940
1941 UErrorCode status = U_ZERO_ERROR;
1942 RegexMatcher m(name, params, 0, status);
1943
1944 if (m.find()) {
1945 // The param exists. Convert the string to an int.
1946 char valString[100];
1947 int32_t paramLength = m.end(1, status) - m.start(1, status);
1948
1949 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
1950 paramLength = (int32_t)(sizeof(valString)-2);
1951 }
1952
1953 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
1954 val = strtol(valString, NULL, 10);
1955
1956 // Delete this parameter from the params string.
1957 m.reset();
1958 params = m.replaceFirst("", status);
1959 }
1960
1961 //U_ASSERT(U_SUCCESS(status));
1962 if (! U_SUCCESS(status)) {
1963 val = defaultVal;
1964 }
1965
1966 return val;
1967 }
1968 #endif
1969
1970 #if !UCONFIG_NO_COLLATION
monkeyTestCase(UCollator * coll,const UnicodeString & testCase,const UnicodeString & pattern,const UnicodeString & altPattern,const char * name,const char * strength,uint32_t seed)1971 int32_t SSearchTest::monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
1972 const char *name, const char *strength, uint32_t seed)
1973 {
1974 UErrorCode status = U_ZERO_ERROR;
1975 int32_t actualStart = -1, actualEnd = -1;
1976 //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length();
1977 int32_t expectedStart = -1, expectedEnd = -1;
1978 int32_t notFoundCount = 0;
1979 UStringSearch *uss = usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
1980 testCase.getBuffer(), testCase.length(),
1981 coll,
1982 NULL, // the break iterator
1983 &status);
1984
1985 // **** TODO: find *all* matches, not just first one ****
1986 simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd);
1987
1988 usearch_search(uss, 0, &actualStart, &actualEnd, &status);
1989
1990 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
1991 errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
1992 " strength=%s seed=%d",
1993 name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
1994 }
1995
1996 if (expectedStart == -1 && actualStart == -1) {
1997 notFoundCount += 1;
1998 }
1999
2000 // **** TODO: find *all* matches, not just first one ****
2001 simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd);
2002
2003 usearch_setPattern(uss, altPattern.getBuffer(), altPattern.length(), &status);
2004
2005 usearch_search(uss, 0, &actualStart, &actualEnd, &status);
2006
2007 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
2008 errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
2009 " strength=%s seed=%d",
2010 name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
2011 }
2012
2013 if (expectedStart == -1 && actualStart == -1) {
2014 notFoundCount += 1;
2015 }
2016
2017 usearch_close(uss);
2018
2019 return notFoundCount;
2020 }
2021
bmMonkeyTestCase(UCollator * coll,const UnicodeString & testCase,const UnicodeString & pattern,const UnicodeString & altPattern,BoyerMooreSearch * bms,BoyerMooreSearch * abms,const char * name,const char * strength,uint32_t seed)2022 int32_t SSearchTest::bmMonkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
2023 BoyerMooreSearch *bms, BoyerMooreSearch *abms,
2024 const char *name, const char *strength, uint32_t seed)
2025 {
2026 UErrorCode status = U_ZERO_ERROR;
2027 int32_t actualStart = -1, actualEnd = -1;
2028 //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length();
2029 int32_t expectedStart = -1, expectedEnd = -1;
2030 int32_t notFoundCount = 0;
2031
2032 // **** TODO: find *all* matches, not just first one ****
2033 simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd);
2034
2035 bms->setTargetString(&testCase, status);
2036 bms->search(0, actualStart, actualEnd);
2037
2038 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
2039 errln("Boyer-Moore Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
2040 " strength=%s seed=%d",
2041 name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
2042 }
2043
2044 if (expectedStart == -1 && actualStart == -1) {
2045 notFoundCount += 1;
2046 }
2047
2048 // **** TODO: find *all* matches, not just first one ****
2049 simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd);
2050
2051 abms->setTargetString(&testCase, status);
2052 abms->search(0, actualStart, actualEnd);
2053
2054 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
2055 errln("Boyer-Moore Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
2056 " strength=%s seed=%d",
2057 name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
2058 }
2059
2060 if (expectedStart == -1 && actualStart == -1) {
2061 notFoundCount += 1;
2062 }
2063
2064
2065 return notFoundCount;
2066 }
2067 #endif
2068
monkeyTest(char * params)2069 void SSearchTest::monkeyTest(char *params)
2070 {
2071 // ook!
2072 UErrorCode status = U_ZERO_ERROR;
2073 //UCollator *coll = ucol_open(NULL, &status);
2074 UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status);
2075
2076 if (U_FAILURE(status)) {
2077 errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status));
2078 return;
2079 }
2080
2081 CollData *monkeyData = CollData::open(coll, status);
2082
2083 USet *expansions = uset_openEmpty();
2084 USet *contractions = uset_openEmpty();
2085
2086 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
2087
2088 U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
2089 U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
2090 USet *letters = uset_openPattern(letter_pattern, 39, &status);
2091 SetMonkey letterMonkey(letters);
2092 StringSetMonkey contractionMonkey(contractions, coll, monkeyData);
2093 StringSetMonkey expansionMonkey(expansions, coll, monkeyData);
2094 UnicodeString testCase;
2095 UnicodeString alternate;
2096 UnicodeString pattern, altPattern;
2097 UnicodeString prefix, altPrefix;
2098 UnicodeString suffix, altSuffix;
2099
2100 Monkey *monkeys[] = {
2101 &letterMonkey,
2102 &contractionMonkey,
2103 &expansionMonkey,
2104 &contractionMonkey,
2105 &expansionMonkey,
2106 &contractionMonkey,
2107 &expansionMonkey,
2108 &contractionMonkey,
2109 &expansionMonkey};
2110 int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]);
2111 int32_t nonMatchCount = 0;
2112
2113 UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY};
2114 const char *strengthNames[] = {"primary", "secondary", "tertiary"};
2115 int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]);
2116 int32_t loopCount = quick? 1000 : 10000;
2117 int32_t firstStrength = 0;
2118 int32_t lastStrength = strengthCount - 1; //*/ 0;
2119
2120 if (params != NULL) {
2121 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2122 UnicodeString p(params);
2123
2124 loopCount = getIntParam("loop", p, loopCount);
2125 m_seed = getIntParam("seed", p, m_seed);
2126
2127 RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status);
2128 if (m.find()) {
2129 UnicodeString breakType = m.group(1, status);
2130
2131 for (int32_t s = 0; s < strengthCount; s += 1) {
2132 if (breakType == strengthNames[s]) {
2133 firstStrength = lastStrength = s;
2134 break;
2135 }
2136 }
2137
2138 m.reset();
2139 p = m.replaceFirst("", status);
2140 }
2141
2142 if (RegexMatcher("\\S", p, 0, status).find()) {
2143 // Each option is stripped out of the option string as it is processed.
2144 // All options have been checked. The option string should have been completely emptied..
2145 char buf[100];
2146 p.extract(buf, sizeof(buf), NULL, status);
2147 buf[sizeof(buf)-1] = 0;
2148 errln("Unrecognized or extra parameter: %s\n", buf);
2149 return;
2150 }
2151 #else
2152 infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
2153 #endif
2154 }
2155
2156 for(int32_t s = firstStrength; s <= lastStrength; s += 1) {
2157 int32_t notFoundCount = 0;
2158
2159 logln("Setting strength to %s.", strengthNames[s]);
2160 ucol_setStrength(coll, strengths[s]);
2161
2162 // TODO: try alternate prefix and suffix too?
2163 // TODO: alterntaes are only equal at primary strength. Is this OK?
2164 for(int32_t t = 0; t < loopCount; t += 1) {
2165 uint32_t seed = m_seed;
2166 int32_t nmc = 0;
2167
2168 generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern);
2169 generateTestCase(coll, monkeys, monkeyCount, prefix, altPrefix);
2170 generateTestCase(coll, monkeys, monkeyCount, suffix, altSuffix);
2171
2172 // pattern
2173 notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern, "pattern", strengthNames[s], seed);
2174
2175 testCase.remove();
2176 testCase.append(prefix);
2177 testCase.append(/*alt*/pattern);
2178
2179 // prefix + pattern
2180 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern", strengthNames[s], seed);
2181
2182 testCase.append(suffix);
2183
2184 // prefix + pattern + suffix
2185 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern + suffix", strengthNames[s], seed);
2186
2187 testCase.remove();
2188 testCase.append(pattern);
2189 testCase.append(suffix);
2190
2191 // pattern + suffix
2192 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed);
2193 }
2194
2195 logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount);
2196 }
2197
2198 uset_close(contractions);
2199 uset_close(expansions);
2200 uset_close(letters);
2201
2202 CollData::close(monkeyData);
2203
2204 ucol_close(coll);
2205 }
2206
bmMonkeyTest(char * params)2207 void SSearchTest::bmMonkeyTest(char *params)
2208 {
2209 // ook!
2210 UErrorCode status = U_ZERO_ERROR;
2211 UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status);
2212
2213 if (U_FAILURE(status)) {
2214 errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status));
2215 return;
2216 }
2217
2218 CollData *monkeyData = CollData::open(coll, status);
2219
2220 USet *expansions = uset_openEmpty();
2221 USet *contractions = uset_openEmpty();
2222
2223 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
2224
2225 U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
2226 U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
2227 USet *letters = uset_openPattern(letter_pattern, 39, &status);
2228 SetMonkey letterMonkey(letters);
2229 StringSetMonkey contractionMonkey(contractions, coll, monkeyData);
2230 StringSetMonkey expansionMonkey(expansions, coll, monkeyData);
2231 UnicodeString testCase;
2232 UnicodeString alternate;
2233 UnicodeString pattern, altPattern;
2234 UnicodeString prefix, altPrefix;
2235 UnicodeString suffix, altSuffix;
2236
2237 Monkey *monkeys[] = {
2238 &letterMonkey,
2239 &contractionMonkey,
2240 &expansionMonkey,
2241 &contractionMonkey,
2242 &expansionMonkey,
2243 &contractionMonkey,
2244 &expansionMonkey,
2245 &contractionMonkey,
2246 &expansionMonkey};
2247 int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]);
2248 int32_t nonMatchCount = 0;
2249
2250 UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY};
2251 const char *strengthNames[] = {"primary", "secondary", "tertiary"};
2252 int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]);
2253 int32_t loopCount = quick? 1000 : 10000;
2254 int32_t firstStrength = 0;
2255 int32_t lastStrength = strengthCount - 1; //*/ 0;
2256
2257 if (params != NULL) {
2258 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2259 UnicodeString p(params);
2260
2261 loopCount = getIntParam("loop", p, loopCount);
2262 m_seed = getIntParam("seed", p, m_seed);
2263
2264 RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status);
2265 if (m.find()) {
2266 UnicodeString breakType = m.group(1, status);
2267
2268 for (int32_t s = 0; s < strengthCount; s += 1) {
2269 if (breakType == strengthNames[s]) {
2270 firstStrength = lastStrength = s;
2271 break;
2272 }
2273 }
2274
2275 m.reset();
2276 p = m.replaceFirst("", status);
2277 }
2278
2279 if (RegexMatcher("\\S", p, 0, status).find()) {
2280 // Each option is stripped out of the option string as it is processed.
2281 // All options have been checked. The option string should have been completely emptied..
2282 char buf[100];
2283 p.extract(buf, sizeof(buf), NULL, status);
2284 buf[sizeof(buf)-1] = 0;
2285 errln("Unrecognized or extra parameter: %s\n", buf);
2286 return;
2287 }
2288 #else
2289 infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
2290 #endif
2291 }
2292
2293 for(int32_t s = firstStrength; s <= lastStrength; s += 1) {
2294 int32_t notFoundCount = 0;
2295
2296 logln("Setting strength to %s.", strengthNames[s]);
2297 ucol_setStrength(coll, strengths[s]);
2298
2299 CollData *data = CollData::open(coll, status);
2300
2301 // TODO: try alternate prefix and suffix too?
2302 // TODO: alterntaes are only equal at primary strength. Is this OK?
2303 for(int32_t t = 0; t < loopCount; t += 1) {
2304 uint32_t seed = m_seed;
2305 int32_t nmc = 0;
2306
2307 generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern);
2308 generateTestCase(coll, monkeys, monkeyCount, prefix, altPrefix);
2309 generateTestCase(coll, monkeys, monkeyCount, suffix, altSuffix);
2310
2311 BoyerMooreSearch pat(data, pattern, NULL, status);
2312 BoyerMooreSearch alt(data, altPattern, NULL, status);
2313
2314 // **** need a better way to deal with this ****
2315 #if 0
2316 if (pat.empty() ||
2317 alt.empty()) {
2318 continue;
2319 }
2320 #endif
2321
2322 // pattern
2323 notFoundCount += bmMonkeyTestCase(coll, pattern, pattern, altPattern, &pat, &alt, "pattern", strengthNames[s], seed);
2324
2325 testCase.remove();
2326 testCase.append(prefix);
2327 testCase.append(/*alt*/pattern);
2328
2329 // prefix + pattern
2330 notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "prefix + pattern", strengthNames[s], seed);
2331
2332 testCase.append(suffix);
2333
2334 // prefix + pattern + suffix
2335 notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "prefix + pattern + suffix", strengthNames[s], seed);
2336
2337 testCase.remove();
2338 testCase.append(pattern);
2339 testCase.append(suffix);
2340
2341 // pattern + suffix
2342 notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "pattern + suffix", strengthNames[s], seed);
2343 }
2344
2345 CollData::close(data);
2346
2347 logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount);
2348 }
2349
2350 uset_close(contractions);
2351 uset_close(expansions);
2352 uset_close(letters);
2353
2354 CollData::close(monkeyData);
2355
2356 ucol_close(coll);
2357 }
2358
2359 #endif
2360
2361 #endif
2362