• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  **********************************************************************
3  *   Copyright (C) 2005-2009, International Business Machines
4  *   Corporation and others.  All Rights Reserved.
5  **********************************************************************
6  */
7 
8 
9 #include "unicode/utypes.h"
10 
11 #if !UCONFIG_NO_COLLATION
12 
13 #include "unicode/unistr.h"
14 #include "unicode/putil.h"
15 #include "unicode/usearch.h"
16 
17 #include "cmemory.h"
18 #include "unicode/coll.h"
19 #include "unicode/tblcoll.h"
20 #include "unicode/coleitr.h"
21 #include "unicode/ucoleitr.h"
22 
23 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
24 
25 #include "unicode/uniset.h"
26 #include "unicode/uset.h"
27 #include "unicode/ustring.h"
28 #include "hash.h"
29 #include "uhash.h"
30 #include "ucol_imp.h"
31 
32 #include "intltest.h"
33 #include "ssearch.h"
34 
35 #include "unicode/colldata.h"
36 #include "unicode/bmsearch.h"
37 #include "unicode/bms.h"
38 
39 #include "xmlparser.h"
40 #include "ucbuf.h"
41 
42 #include <stdlib.h>
43 #include <string.h>
44 #include <stdio.h>
45 
46 char testId[100];
47 
48 #define TEST_ASSERT(x) {if (!(x)) { \
49     errln("Failure in file %s, line %d, test ID = \"%s\"", __FILE__, __LINE__, testId);}}
50 
51 #define TEST_ASSERT_M(x, m) {if (!(x)) { \
52     errln("Failure in file %s, line %d.   \"%s\"", __FILE__, __LINE__, m);return;}}
53 
54 #define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \
55     dataerrln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \
56           __FILE__, __LINE__, testId, u_errorName(errcode));}}
57 
58 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
59 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
60 #define DELETE_ARRAY(array) uprv_free((void *) (array))
61 
62 //---------------------------------------------------------------------------
63 //
64 //  Test class boilerplate
65 //
66 //---------------------------------------------------------------------------
SSearchTest()67 SSearchTest::SSearchTest()
68 {
69 }
70 
~SSearchTest()71 SSearchTest::~SSearchTest()
72 {
73 }
74 
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)75 void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char *params )
76 {
77     if (exec) logln("TestSuite SSearchTest: ");
78     switch (index) {
79 #if !UCONFIG_NO_BREAK_ITERATION
80        case 0: name = "searchTest";
81             if (exec) searchTest();
82             break;
83 
84         case 1: name = "offsetTest";
85             if (exec) offsetTest();
86             break;
87 
88         case 2: name = "monkeyTest";
89             if (exec) monkeyTest(params);
90             break;
91 
92         case 3: name = "bmMonkeyTest";
93             if (exec) bmMonkeyTest(params);
94             break;
95 
96         case 4: name = "boyerMooreTest";
97             if (exec) boyerMooreTest();
98             break;
99 
100         case 5: name = "goodSuffixTest";
101             if (exec) goodSuffixTest();
102             break;
103 
104         case 6: name = "searchTime";
105             if (exec) searchTime();
106             break;
107 
108         case 7: name = "bmsTest";
109             if (exec) bmsTest();
110             break;
111 
112         case 8: name = "bmSearchTest";
113             if (exec) bmSearchTest();
114             break;
115 
116         case 9: name = "udhrTest";
117             if (exec) udhrTest();
118             break;
119 #endif
120         default: name = "";
121             break; //needed to end loop
122     }
123 }
124 
125 
126 #if !UCONFIG_NO_BREAK_ITERATION
127 
128 #define PATH_BUFFER_SIZE 2048
getPath(char buffer[2048],const char * filename)129 const char *SSearchTest::getPath(char buffer[2048], const char *filename) {
130     UErrorCode status = U_ZERO_ERROR;
131     const char *testDataDirectory = IntlTest::getSourceTestData(status);
132 
133     if (U_FAILURE(status) || strlen(testDataDirectory) + strlen(filename) + 1 >= PATH_BUFFER_SIZE) {
134         errln("ERROR: getPath() failed - %s", u_errorName(status));
135         return NULL;
136     }
137 
138     strcpy(buffer, testDataDirectory);
139     strcat(buffer, filename);
140     return buffer;
141 }
142 
143 
searchTest()144 void SSearchTest::searchTest()
145 {
146 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
147     UErrorCode status = U_ZERO_ERROR;
148     char path[PATH_BUFFER_SIZE];
149     const char *testFilePath = getPath(path, "ssearch.xml");
150 
151     if (testFilePath == NULL) {
152         return; /* Couldn't get path: error message already output. */
153     }
154 
155     UXMLParser  *parser = UXMLParser::createParser(status);
156     TEST_ASSERT_SUCCESS(status);
157     UXMLElement *root   = parser->parseFile(testFilePath, status);
158     TEST_ASSERT_SUCCESS(status);
159     if (U_FAILURE(status)) {
160         return;
161     }
162 
163     const UnicodeString *debugTestCase = root->getAttribute("debug");
164     if (debugTestCase != NULL) {
165 //       setenv("USEARCH_DEBUG", "1", 1);
166     }
167 
168 
169     const UXMLElement *testCase;
170     int32_t tc = 0;
171 
172     while((testCase = root->nextChildElement(tc)) != NULL) {
173 
174         if (testCase->getTagName().compare("test-case") != 0) {
175             errln("ssearch, unrecognized XML Element in test file");
176             continue;
177         }
178         const UnicodeString *id       = testCase->getAttribute("id");
179         *testId = 0;
180         if (id != NULL) {
181             id->extract(0, id->length(), testId,  sizeof(testId), US_INV);
182         }
183 
184         // If debugging test case has been specified and this is not it, skip to next.
185         if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) {
186             continue;
187         }
188         //
189         //  Get the requested collation strength.
190         //    Default is tertiary if the XML attribute is missing from the test case.
191         //
192         const UnicodeString *strength = testCase->getAttribute("strength");
193         UColAttributeValue collatorStrength;
194         if      (strength==NULL)          { collatorStrength = UCOL_TERTIARY;}
195         else if (*strength=="PRIMARY")    { collatorStrength = UCOL_PRIMARY;}
196         else if (*strength=="SECONDARY")  { collatorStrength = UCOL_SECONDARY;}
197         else if (*strength=="TERTIARY")   { collatorStrength = UCOL_TERTIARY;}
198         else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;}
199         else if (*strength=="IDENTICAL")  { collatorStrength = UCOL_IDENTICAL;}
200         else {
201             // Bogus value supplied for strength.  Shouldn't happen, even from
202             //  typos, if the  XML source has been validated.
203             //  This assert is a little deceiving in that strength can be
204             //   any of the allowed values, not just TERTIARY, but it will
205             //   do the job of getting the error output.
206             TEST_ASSERT(*strength=="TERTIARY")
207         }
208 
209         //
210         // Get the collator normalization flag.  Default is UCOL_OFF.
211         //
212         UColAttributeValue normalize = UCOL_OFF;
213         const UnicodeString *norm = testCase->getAttribute("norm");
214         TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF");
215         if (norm!=NULL && *norm=="ON") {
216             normalize = UCOL_ON;
217         }
218 
219         //
220         // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE.
221         //
222         UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE;
223         const UnicodeString *alt = testCase->getAttribute("alternate_handling");
224         TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE");
225         if (alt != NULL && *alt == "SHIFTED") {
226             alternateHandling = UCOL_SHIFTED;
227         }
228 
229         const UnicodeString defLocale("en");
230         char  clocale[100];
231         const UnicodeString *locale   = testCase->getAttribute("locale");
232         if (locale == NULL || locale->length()==0) {
233             locale = &defLocale;
234         };
235         locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL);
236 
237 
238         UnicodeString  text;
239         UnicodeString  target;
240         UnicodeString  pattern;
241         int32_t        expectedMatchStart = -1;
242         int32_t        expectedMatchLimit = -1;
243         const UXMLElement  *n;
244         int32_t                nodeCount = 0;
245 
246         n = testCase->getChildElement("pattern");
247         TEST_ASSERT(n != NULL);
248         if (n==NULL) {
249             continue;
250         }
251         text = n->getText(FALSE);
252         text = text.unescape();
253         pattern.append(text);
254         nodeCount++;
255 
256         n = testCase->getChildElement("pre");
257         if (n!=NULL) {
258             text = n->getText(FALSE);
259             text = text.unescape();
260             target.append(text);
261             nodeCount++;
262         }
263 
264         n = testCase->getChildElement("m");
265         if (n!=NULL) {
266             expectedMatchStart = target.length();
267             text = n->getText(FALSE);
268             text = text.unescape();
269             target.append(text);
270             expectedMatchLimit = target.length();
271             nodeCount++;
272         }
273 
274         n = testCase->getChildElement("post");
275         if (n!=NULL) {
276             text = n->getText(FALSE);
277             text = text.unescape();
278             target.append(text);
279             nodeCount++;
280         }
281 
282         //  Check that there weren't extra things in the XML
283         TEST_ASSERT(nodeCount == testCase->countChildren());
284 
285         // Open a collator and StringSearch based on the parameters
286         //   obtained from the XML.
287         //
288         status = U_ZERO_ERROR;
289         UCollator *collator = ucol_open(clocale, &status);
290         ucol_setStrength(collator, collatorStrength);
291         ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, normalize, &status);
292         ucol_setAttribute(collator, UCOL_ALTERNATE_HANDLING, alternateHandling, &status);
293         UStringSearch *uss = usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
294                                          target.getBuffer(), target.length(),
295                                          collator,
296                                          NULL,     // the break iterator
297                                          &status);
298 
299         TEST_ASSERT_SUCCESS(status);
300         if (U_FAILURE(status)) {
301             usearch_close(uss);
302             ucol_close(collator);
303             continue;
304         }
305 
306         int32_t foundStart = 0;
307         int32_t foundLimit = 0;
308         UBool   foundMatch;
309 
310         //
311         // Do the search, check the match result against the expected results.
312         //
313         foundMatch= usearch_search(uss, 0, &foundStart, &foundLimit, &status);
314         TEST_ASSERT_SUCCESS(status);
315         if (foundMatch && expectedMatchStart<0 ||
316             foundStart != expectedMatchStart   ||
317             foundLimit != expectedMatchLimit) {
318                 TEST_ASSERT(FALSE);   //  ouput generic error position
319                 infoln("Found, expected match start = %d, %d \n"
320                        "Found, expected match limit = %d, %d",
321                 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
322         }
323 
324         // In case there are other matches...
325         // (should we only do this if the test case passed?)
326         while (foundMatch) {
327             expectedMatchStart = foundStart;
328             expectedMatchLimit = foundLimit;
329 
330             foundMatch = usearch_search(uss, foundLimit, &foundStart, &foundLimit, &status);
331         }
332 
333         usearch_close(uss);
334 
335         uss = usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
336             target.getBuffer(), target.length(),
337             collator,
338             NULL,
339             &status);
340 
341         //
342         // Do the backwards search, check the match result against the expected results.
343         //
344         foundMatch= usearch_searchBackwards(uss, target.length(), &foundStart, &foundLimit, &status);
345         TEST_ASSERT_SUCCESS(status);
346         if (foundMatch && expectedMatchStart<0 ||
347             foundStart != expectedMatchStart   ||
348             foundLimit != expectedMatchLimit) {
349                 TEST_ASSERT(FALSE);   //  ouput generic error position
350                 infoln("Found, expected backwards match start = %d, %d \n"
351                        "Found, expected backwards match limit = %d, %d",
352                 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
353         }
354 
355         usearch_close(uss);
356         ucol_close(collator);
357     }
358 
359     delete root;
360     delete parser;
361 #endif
362 }
363 
364 struct UdhrTestCase
365 {
366     char *locale;
367     char *file;
368 };
369 
udhrTest()370 void SSearchTest::udhrTest()
371 {
372     UErrorCode status = U_ZERO_ERROR;
373     char path[PATH_BUFFER_SIZE];
374     const char *udhrPath = getPath(path, "udhr");
375 
376     if (udhrPath == NULL) {
377         // couldn't get path: error message already output...
378         return;
379     }
380 
381     UdhrTestCase testCases[] = {
382         {"en", "udhr_eng.txt"},
383         {"de", "udhr_deu_1996.txt"},
384         {"fr", "udhr_fra.txt"},
385         {"ru", "udhr_rus.txt"},
386         {"th", "udhr_tha.txt"},
387         {"ja", "udhr_jpn.txt"},
388         {"ko", "udhr_kor.txt"},
389         {"zh", "udhr_cmn_hans.txt"},
390         {"zh_Hant", "udhr_cmn_hant.txt"}
391     };
392 
393     int32_t testCount = ARRAY_SIZE(testCases);
394 
395     for (int32_t t = 0; t < testCount; t += 1) {
396         int32_t len = 0;
397         char *resolvedFileName = NULL;
398         const char *encoding = NULL;
399         UCHARBUF *ucharBuf = NULL;
400 
401         ucbuf_resolveFileName(udhrPath, testCases[t].file, NULL, &len, &status);
402         resolvedFileName = NEW_ARRAY(char, len);
403 
404         if(resolvedFileName == NULL){
405             continue;
406         }
407 
408         if(status == U_BUFFER_OVERFLOW_ERROR){
409             status = U_ZERO_ERROR;
410         }
411 
412         ucbuf_resolveFileName(udhrPath, testCases[t].file, resolvedFileName, &len, &status);
413         ucharBuf = ucbuf_open(resolvedFileName, &encoding, TRUE, FALSE, &status);
414 
415         DELETE_ARRAY(resolvedFileName);
416 
417         if(U_FAILURE(status)){
418             infoln("Could not open the input file %s. Test skipped\n", testCases[t].file);
419             continue;
420         }
421 
422         int32_t targetLen = 0;
423         const UChar *target = ucbuf_getBuffer(ucharBuf, &targetLen, &status);
424 
425         /* The first line of the file contains the pattern */
426         int32_t start = 0, end = 0, plen = 0;
427 
428         for(end = start; ; end += 1) {
429             UChar ch = target[end];
430 
431             if (ch == 0x000A || ch == 0x000D || ch == 0x2028) {
432                 break;
433             }
434         }
435 
436         plen = end - start;
437 
438         UChar *pattern = NEW_ARRAY(UChar, plen);
439         for (int32_t i = 0; i < plen; i += 1) {
440             pattern[i] =  target[start++];
441         }
442 
443         int32_t offset = 0;
444         UCollator *coll = ucol_open(testCases[t].locale, &status);
445         UCD *ucd = NULL;
446         BMS *bms = NULL;
447 
448         if (U_FAILURE(status)) {
449             errln("Could not open collator for %s", testCases[t].locale);
450             goto delete_collator;
451         }
452 
453         ucd = ucd_open(coll, &status);
454 
455         if (U_FAILURE(status)) {
456             errln("Could not open CollData object for %s", testCases[t].locale);
457             goto delete_ucd;
458         }
459 
460         bms = bms_open(ucd, pattern, plen, target, targetLen, &status);
461 
462         if (U_FAILURE(status)) {
463             errln("Could not open search object for %s", testCases[t].locale);
464             goto delete_bms;
465         }
466 
467         start = end = -1;
468         while (bms_search(bms, offset, &start, &end)) {
469             offset = end;
470         }
471 
472         if (offset == 0) {
473             errln("Could not find pattern - locale: %s, file: %s ", testCases[t].locale, testCases[t].file);
474         }
475 
476 delete_bms:
477         bms_close(bms);
478 
479 delete_ucd:
480         ucd_close(ucd);
481 
482 delete_collator:
483         ucol_close(coll);
484 
485         DELETE_ARRAY(pattern);
486         ucbuf_close(ucharBuf);
487     }
488 
489     ucd_flushCache();
490 }
491 
bmSearchTest()492 void SSearchTest::bmSearchTest()
493 {
494 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
495     UErrorCode status = U_ZERO_ERROR;
496     char path[PATH_BUFFER_SIZE];
497     const char *testFilePath = getPath(path, "ssearch.xml");
498 
499     if (testFilePath == NULL) {
500         return; /* Couldn't get path: error message already output. */
501     }
502 
503     UXMLParser  *parser = UXMLParser::createParser(status);
504     TEST_ASSERT_SUCCESS(status);
505     UXMLElement *root   = parser->parseFile(testFilePath, status);
506     TEST_ASSERT_SUCCESS(status);
507     if (U_FAILURE(status)) {
508         return;
509     }
510 
511     const UnicodeString *debugTestCase = root->getAttribute("debug");
512     if (debugTestCase != NULL) {
513 //       setenv("USEARCH_DEBUG", "1", 1);
514     }
515 
516 
517     const UXMLElement *testCase;
518     int32_t tc = 0;
519 
520     while((testCase = root->nextChildElement(tc)) != NULL) {
521 
522         if (testCase->getTagName().compare("test-case") != 0) {
523             errln("ssearch, unrecognized XML Element in test file");
524             continue;
525         }
526         const UnicodeString *id       = testCase->getAttribute("id");
527         *testId = 0;
528         if (id != NULL) {
529             id->extract(0, id->length(), testId,  sizeof(testId), US_INV);
530         }
531 
532         // If debugging test case has been specified and this is not it, skip to next.
533         if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) {
534             continue;
535         }
536         //
537         //  Get the requested collation strength.
538         //    Default is tertiary if the XML attribute is missing from the test case.
539         //
540         const UnicodeString *strength = testCase->getAttribute("strength");
541         UColAttributeValue collatorStrength;
542         if      (strength==NULL)          { collatorStrength = UCOL_TERTIARY;}
543         else if (*strength=="PRIMARY")    { collatorStrength = UCOL_PRIMARY;}
544         else if (*strength=="SECONDARY")  { collatorStrength = UCOL_SECONDARY;}
545         else if (*strength=="TERTIARY")   { collatorStrength = UCOL_TERTIARY;}
546         else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;}
547         else if (*strength=="IDENTICAL")  { collatorStrength = UCOL_IDENTICAL;}
548         else {
549             // Bogus value supplied for strength.  Shouldn't happen, even from
550             //  typos, if the  XML source has been validated.
551             //  This assert is a little deceiving in that strength can be
552             //   any of the allowed values, not just TERTIARY, but it will
553             //   do the job of getting the error output.
554             TEST_ASSERT(*strength=="TERTIARY")
555         }
556 
557         //
558         // Get the collator normalization flag.  Default is UCOL_OFF.
559         //
560         UColAttributeValue normalize = UCOL_OFF;
561         const UnicodeString *norm = testCase->getAttribute("norm");
562         TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF");
563         if (norm!=NULL && *norm=="ON") {
564             normalize = UCOL_ON;
565         }
566 
567         //
568         // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE.
569         //
570         UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE;
571         const UnicodeString *alt = testCase->getAttribute("alternate_handling");
572         TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE");
573         if (alt != NULL && *alt == "SHIFTED") {
574             alternateHandling = UCOL_SHIFTED;
575         }
576 
577         const UnicodeString defLocale("en");
578         char  clocale[100];
579         const UnicodeString *locale   = testCase->getAttribute("locale");
580         if (locale == NULL || locale->length()==0) {
581             locale = &defLocale;
582         };
583         locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL);
584 
585 
586         UnicodeString  text;
587         UnicodeString  target;
588         UnicodeString  pattern;
589         int32_t        expectedMatchStart = -1;
590         int32_t        expectedMatchLimit = -1;
591         const UXMLElement  *n;
592         int32_t                nodeCount = 0;
593 
594         n = testCase->getChildElement("pattern");
595         TEST_ASSERT(n != NULL);
596         if (n==NULL) {
597             continue;
598         }
599         text = n->getText(FALSE);
600         text = text.unescape();
601         pattern.append(text);
602         nodeCount++;
603 
604         n = testCase->getChildElement("pre");
605         if (n!=NULL) {
606             text = n->getText(FALSE);
607             text = text.unescape();
608             target.append(text);
609             nodeCount++;
610         }
611 
612         n = testCase->getChildElement("m");
613         if (n!=NULL) {
614             expectedMatchStart = target.length();
615             text = n->getText(FALSE);
616             text = text.unescape();
617             target.append(text);
618             expectedMatchLimit = target.length();
619             nodeCount++;
620         }
621 
622         n = testCase->getChildElement("post");
623         if (n!=NULL) {
624             text = n->getText(FALSE);
625             text = text.unescape();
626             target.append(text);
627             nodeCount++;
628         }
629 
630         //  Check that there weren't extra things in the XML
631         TEST_ASSERT(nodeCount == testCase->countChildren());
632 
633         // Open a collator and StringSearch based on the parameters
634         //   obtained from the XML.
635         //
636         status = U_ZERO_ERROR;
637         UCollator *collator = ucol_open(clocale, &status);
638         ucol_setStrength(collator, collatorStrength);
639         ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, normalize, &status);
640         ucol_setAttribute(collator, UCOL_ALTERNATE_HANDLING, alternateHandling, &status);
641         UCD *ucd = ucd_open(collator, &status);
642         BMS *bms = bms_open(ucd, pattern.getBuffer(), pattern.length(), target.getBuffer(), target.length(), &status);
643 
644         TEST_ASSERT_SUCCESS(status);
645         if (U_FAILURE(status)) {
646             bms_close(bms);
647             ucd_close(ucd);
648             ucol_close(collator);
649             continue;
650         }
651 
652         int32_t foundStart = 0;
653         int32_t foundLimit = 0;
654         UBool   foundMatch;
655 
656         //
657         // Do the search, check the match result against the expected results.
658         //
659         foundMatch = bms_search(bms, 0, &foundStart, &foundLimit);
660       //TEST_ASSERT_SUCCESS(status);
661         if (foundMatch && expectedMatchStart < 0 ||
662             foundStart != expectedMatchStart   ||
663             foundLimit != expectedMatchLimit) {
664                 TEST_ASSERT(FALSE);   //  ouput generic error position
665                 infoln("Found, expected match start = %d, %d \n"
666                        "Found, expected match limit = %d, %d",
667                 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
668         }
669 
670         bms_close(bms);
671         ucd_close(ucd);
672         ucol_close(collator);
673     }
674 
675     ucd_flushCache();
676     delete root;
677     delete parser;
678 #endif
679 }
680 
681 struct Order
682 {
683     int32_t order;
684     int32_t lowOffset;
685     int32_t highOffset;
686 };
687 
688 class OrderList
689 {
690 public:
691     OrderList();
692     OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset = 0);
693     ~OrderList();
694 
695     int32_t size(void) const;
696     void add(int32_t order, int32_t low, int32_t high);
697     const Order *get(int32_t index) const;
698     int32_t getLowOffset(int32_t index) const;
699     int32_t getHighOffset(int32_t index) const;
700     int32_t getOrder(int32_t index) const;
701     void reverse(void);
702     UBool compare(const OrderList &other) const;
703     UBool matchesAt(int32_t offset, const OrderList &other) const;
704 
705 private:
706     Order *list;
707     int32_t listMax;
708     int32_t listSize;
709 };
710 
OrderList()711 OrderList::OrderList()
712   : list(NULL), listSize(0), listMax(16)
713 {
714     list = new Order[listMax];
715 }
716 
OrderList(UCollator * coll,const UnicodeString & string,int32_t stringOffset)717 OrderList::OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset)
718     : list(NULL), listMax(16), listSize(0)
719 {
720     UErrorCode status = U_ZERO_ERROR;
721     UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
722     uint32_t strengthMask = 0;
723     int32_t order, low, high;
724 
725     switch (ucol_getStrength(coll))
726     {
727     default:
728         strengthMask |= UCOL_TERTIARYORDERMASK;
729         /* fall through */
730 
731     case UCOL_SECONDARY:
732         strengthMask |= UCOL_SECONDARYORDERMASK;
733         /* fall through */
734 
735     case UCOL_PRIMARY:
736         strengthMask |= UCOL_PRIMARYORDERMASK;
737     }
738 
739     list = new Order[listMax];
740 
741     ucol_setOffset(elems, stringOffset, &status);
742 
743     do {
744         low   = ucol_getOffset(elems);
745         order = ucol_next(elems, &status);
746         high  = ucol_getOffset(elems);
747 
748         if (order != UCOL_NULLORDER) {
749             order &= strengthMask;
750         }
751 
752         if (order != UCOL_IGNORABLE) {
753             add(order, low, high);
754         }
755     } while (order != UCOL_NULLORDER);
756 
757     ucol_closeElements(elems);
758 }
759 
~OrderList()760 OrderList::~OrderList()
761 {
762     delete[] list;
763 }
764 
add(int32_t order,int32_t low,int32_t high)765 void OrderList::add(int32_t order, int32_t low, int32_t high)
766 {
767     if (listSize >= listMax) {
768         listMax *= 2;
769 
770         Order *newList = new Order[listMax];
771 
772         uprv_memcpy(newList, list, listSize * sizeof(Order));
773         delete[] list;
774         list = newList;
775     }
776 
777     list[listSize].order      = order;
778     list[listSize].lowOffset  = low;
779     list[listSize].highOffset = high;
780 
781     listSize += 1;
782 }
783 
get(int32_t index) const784 const Order *OrderList::get(int32_t index) const
785 {
786     if (index >= listSize) {
787         return NULL;
788     }
789 
790     return &list[index];
791 }
792 
getLowOffset(int32_t index) const793 int32_t OrderList::getLowOffset(int32_t index) const
794 {
795     const Order *order = get(index);
796 
797     if (order != NULL) {
798         return order->lowOffset;
799     }
800 
801     return -1;
802 }
803 
getHighOffset(int32_t index) const804 int32_t OrderList::getHighOffset(int32_t index) const
805 {
806     const Order *order = get(index);
807 
808     if (order != NULL) {
809         return order->highOffset;
810     }
811 
812     return -1;
813 }
814 
getOrder(int32_t index) const815 int32_t OrderList::getOrder(int32_t index) const
816 {
817     const Order *order = get(index);
818 
819     if (order != NULL) {
820         return order->order;
821     }
822 
823     return UCOL_NULLORDER;
824 }
825 
size() const826 int32_t OrderList::size() const
827 {
828     return listSize;
829 }
830 
reverse()831 void OrderList::reverse()
832 {
833     for(int32_t f = 0, b = listSize - 1; f < b; f += 1, b -= 1) {
834         Order swap = list[b];
835 
836         list[b] = list[f];
837         list[f] = swap;
838     }
839 }
840 
compare(const OrderList & other) const841 UBool OrderList::compare(const OrderList &other) const
842 {
843     if (listSize != other.listSize) {
844         return FALSE;
845     }
846 
847     for(int32_t i = 0; i < listSize; i += 1) {
848         if (list[i].order  != other.list[i].order ||
849             list[i].lowOffset != other.list[i].lowOffset ||
850             list[i].highOffset != other.list[i].highOffset) {
851                 return FALSE;
852         }
853     }
854 
855     return TRUE;
856 }
857 
matchesAt(int32_t offset,const OrderList & other) const858 UBool OrderList::matchesAt(int32_t offset, const OrderList &other) const
859 {
860     // NOTE: sizes include the NULLORDER, which we don't want to compare.
861     int32_t otherSize = other.size() - 1;
862 
863     if (listSize - 1 - offset < otherSize) {
864         return FALSE;
865     }
866 
867     for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) {
868         if (getOrder(i) != other.getOrder(j)) {
869             return FALSE;
870         }
871     }
872 
873     return TRUE;
874 }
875 
printOffsets(char * buffer,OrderList & list)876 static char *printOffsets(char *buffer, OrderList &list)
877 {
878     int32_t size = list.size();
879     char *s = buffer;
880 
881     for(int32_t i = 0; i < size; i += 1) {
882         const Order *order = list.get(i);
883 
884         if (i != 0) {
885             s += sprintf(s, ", ");
886         }
887 
888         s += sprintf(s, "(%d, %d)", order->lowOffset, order->highOffset);
889     }
890 
891     return buffer;
892 }
893 
printOrders(char * buffer,OrderList & list)894 static char *printOrders(char *buffer, OrderList &list)
895 {
896     int32_t size = list.size();
897     char *s = buffer;
898 
899     for(int32_t i = 0; i < size; i += 1) {
900         const Order *order = list.get(i);
901 
902         if (i != 0) {
903             s += sprintf(s, ", ");
904         }
905 
906         s += sprintf(s, "%8.8X", order->order);
907     }
908 
909     return buffer;
910 }
911 
offsetTest()912 void SSearchTest::offsetTest()
913 {
914     const char *test[] = {
915         // The sequence \u0FB3\u0F71\u0F71\u0F80 contains a discontiguous
916         // contraction (\u0FB3\u0F71\u0F80) logically followed by \u0F71.
917         "\\u1E33\\u0FB3\\u0F71\\u0F71\\u0F80\\uD835\\uDF6C\\u01B0",
918 
919         "\\ua191\\u16ef\\u2036\\u017a",
920 
921 #if 0
922         // This results in a complex interaction between contraction,
923         // expansion and normalization that confuses the backwards offset fixups.
924         "\\u0F7F\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
925 #endif
926 
927         "\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
928         "\\u07E9\\u07EA\\u07F1\\u07F2\\u07F3",
929 
930         "\\u02FE\\u02FF"
931         "\\u0300\\u0301\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u0309\\u030A\\u030B\\u030C\\u030D\\u030E\\u030F"
932         "\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\\u031A\\u031B\\u031C\\u031D\\u031E\\u031F"
933         "\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F"
934         "\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\\u033A\\u033B\\u033C\\u033D\\u033E\\u033F"
935         "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\\u034A\\u034B\\u034C\\u034D\\u034E",
936 
937         "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318",
938         "abc\\u0E41\\u0301\\u0316",
939 		"abc\\u0E41\\u0316\\u0301",
940 		"\\u0E41\\u0301\\u0316",
941 		"\\u0E41\\u0316\\u0301",
942 		"a\\u0301\\u0316",
943 		"a\\u0316\\u0301",
944 		"\\uAC52\\uAC53",
945 		"\\u34CA\\u34CB",
946 		"\\u11ED\\u11EE",
947 		"\\u30C3\\u30D0",
948 		"p\\u00E9ch\\u00E9",
949         "a\\u0301\\u0325",
950         "a\\u0300\\u0325",
951         "a\\u0325\\u0300",
952         "A\\u0323\\u0300B",
953         "A\\u0300\\u0323B",
954         "A\\u0301\\u0323B",
955         "A\\u0302\\u0301\\u0323B",
956         "abc",
957         "ab\\u0300c",
958         "ab\\u0300\\u0323c",
959         " \\uD800\\uDC00\\uDC00",
960         "a\\uD800\\uDC00\\uDC00",
961         "A\\u0301\\u0301",
962         "A\\u0301\\u0323",
963         "A\\u0301\\u0323B",
964         "B\\u0301\\u0323C",
965         "A\\u0300\\u0323B",
966         "\\u0301A\\u0301\\u0301",
967         "abcd\\r\\u0301",
968         "p\\u00EAche",
969         "pe\\u0302che",
970     };
971 
972     int32_t testCount = ARRAY_SIZE(test);
973     UErrorCode status = U_ZERO_ERROR;
974     RuleBasedCollator *col = (RuleBasedCollator *) Collator::createInstance(Locale::getEnglish(), status);
975     if (U_FAILURE(status)) {
976         errcheckln(status, "Failed to create collator in offsetTest! - %s", u_errorName(status));
977         return;
978     }
979     char buffer[4096];  // A bit of a hack... just happens to be long enough for all the test cases...
980                         // We could allocate one that's the right size by (CE_count * 10) + 2
981                         // 10 chars is enough room for 8 hex digits plus ", ". 2 extra chars for "[" and "]"
982 
983     col->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
984 
985     for(int32_t i = 0; i < testCount; i += 1) {
986         UnicodeString ts = CharsToUnicodeString(test[i]);
987         CollationElementIterator *iter = col->createCollationElementIterator(ts);
988         OrderList forwardList;
989         OrderList backwardList;
990         int32_t order, low, high;
991 
992         do {
993             low   = iter->getOffset();
994             order = iter->next(status);
995             high  = iter->getOffset();
996 
997             forwardList.add(order, low, high);
998         } while (order != CollationElementIterator::NULLORDER);
999 
1000         iter->reset();
1001         iter->setOffset(ts.length(), status);
1002 
1003         backwardList.add(CollationElementIterator::NULLORDER, iter->getOffset(), iter->getOffset());
1004 
1005         do {
1006             high  = iter->getOffset();
1007             order = iter->previous(status);
1008             low   = iter->getOffset();
1009 
1010             if (order == CollationElementIterator::NULLORDER) {
1011                 break;
1012             }
1013 
1014             backwardList.add(order, low, high);
1015         } while (TRUE);
1016 
1017         backwardList.reverse();
1018 
1019         if (forwardList.compare(backwardList)) {
1020             logln("Works with \"%s\"", test[i]);
1021             logln("Forward offsets:  [%s]", printOffsets(buffer, forwardList));
1022 //          logln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
1023 
1024             logln("Forward CEs:  [%s]", printOrders(buffer, forwardList));
1025 //          logln("Backward CEs: [%s]", printOrders(buffer, backwardList));
1026 
1027             logln();
1028         } else {
1029             errln("Fails with \"%s\"", test[i]);
1030             infoln("Forward offsets:  [%s]", printOffsets(buffer, forwardList));
1031             infoln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
1032 
1033             infoln("Forward CEs:  [%s]", printOrders(buffer, forwardList));
1034             infoln("Backward CEs: [%s]", printOrders(buffer, backwardList));
1035 
1036             infoln();
1037         }
1038         delete iter;
1039     }
1040     delete col;
1041 }
1042 
escape(const UnicodeString & string,UnicodeString & buffer)1043 static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer)
1044 {
1045     for(int32_t i = 0; i < string.length(); i += 1) {
1046         UChar32 ch = string.char32At(i);
1047 
1048         if (ch >= 0x0020 && ch <= 0x007F) {
1049             if (ch == 0x005C) {
1050                 buffer.append("\\\\");
1051             } else {
1052                 buffer.append(ch);
1053             }
1054         } else {
1055             char cbuffer[12];
1056 
1057             if (ch <= 0xFFFFL) {
1058                 sprintf(cbuffer, "\\u%4.4X", ch);
1059             } else {
1060                 sprintf(cbuffer, "\\U%8.8X", ch);
1061             }
1062 
1063             buffer.append(cbuffer);
1064         }
1065 
1066         if (ch >= 0x10000L) {
1067             i += 1;
1068         }
1069     }
1070 
1071     return buffer;
1072 }
1073 
1074 #if 1
1075 
1076 struct PCE
1077 {
1078     uint64_t ce;
1079     int32_t  lowOffset;
1080     int32_t  highOffset;
1081 };
1082 
1083 class PCEList
1084 {
1085 public:
1086     PCEList(UCollator *coll, const UnicodeString &string);
1087     ~PCEList();
1088 
1089     int32_t size() const;
1090 
1091     const PCE *get(int32_t index) const;
1092 
1093     int32_t getLowOffset(int32_t index) const;
1094     int32_t getHighOffset(int32_t index) const;
1095     uint64_t getOrder(int32_t index) const;
1096 
1097     UBool matchesAt(int32_t offset, const PCEList &other) const;
1098 
1099     uint64_t operator[](int32_t index) const;
1100 
1101 private:
1102     void add(uint64_t ce, int32_t low, int32_t high);
1103 
1104     PCE *list;
1105     int32_t listMax;
1106     int32_t listSize;
1107 };
1108 
PCEList(UCollator * coll,const UnicodeString & string)1109 PCEList::PCEList(UCollator *coll, const UnicodeString &string)
1110 {
1111     UErrorCode status = U_ZERO_ERROR;
1112     UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
1113     uint64_t order;
1114     int32_t low, high;
1115 
1116     list = new PCE[listMax];
1117 
1118     ucol_setOffset(elems, 0, &status);
1119 
1120     do {
1121         order = ucol_nextProcessed(elems, &low, &high, &status);
1122         add(order, low, high);
1123     } while (order != UCOL_PROCESSED_NULLORDER);
1124 
1125     ucol_closeElements(elems);
1126 }
1127 
~PCEList()1128 PCEList::~PCEList()
1129 {
1130     delete[] list;
1131 }
1132 
add(uint64_t order,int32_t low,int32_t high)1133 void PCEList::add(uint64_t order, int32_t low, int32_t high)
1134 {
1135     if (listSize >= listMax) {
1136         listMax *= 2;
1137 
1138         PCE *newList = new PCE[listMax];
1139 
1140         uprv_memcpy(newList, list, listSize * sizeof(Order));
1141         delete[] list;
1142         list = newList;
1143     }
1144 
1145     list[listSize].ce         = order;
1146     list[listSize].lowOffset  = low;
1147     list[listSize].highOffset = high;
1148 
1149     listSize += 1;
1150 }
1151 
get(int32_t index) const1152 const PCE *PCEList::get(int32_t index) const
1153 {
1154     if (index >= listSize) {
1155         return NULL;
1156     }
1157 
1158     return &list[index];
1159 }
1160 
getLowOffset(int32_t index) const1161 int32_t PCEList::getLowOffset(int32_t index) const
1162 {
1163     const PCE *pce = get(index);
1164 
1165     if (pce != NULL) {
1166         return pce->lowOffset;
1167     }
1168 
1169     return -1;
1170 }
1171 
getHighOffset(int32_t index) const1172 int32_t PCEList::getHighOffset(int32_t index) const
1173 {
1174     const PCE *pce = get(index);
1175 
1176     if (pce != NULL) {
1177         return pce->highOffset;
1178     }
1179 
1180     return -1;
1181 }
1182 
getOrder(int32_t index) const1183 uint64_t PCEList::getOrder(int32_t index) const
1184 {
1185     const PCE *pce = get(index);
1186 
1187     if (pce != NULL) {
1188         return pce->ce;
1189     }
1190 
1191     return UCOL_PROCESSED_NULLORDER;
1192 }
1193 
size() const1194 int32_t PCEList::size() const
1195 {
1196     return listSize;
1197 }
1198 
matchesAt(int32_t offset,const PCEList & other) const1199 UBool PCEList::matchesAt(int32_t offset, const PCEList &other) const
1200 {
1201     // NOTE: sizes include the NULLORDER, which we don't want to compare.
1202     int32_t otherSize = other.size() - 1;
1203 
1204     if (listSize - 1 - offset < otherSize) {
1205         return FALSE;
1206     }
1207 
1208     for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) {
1209         if (getOrder(i) != other.getOrder(j)) {
1210             return FALSE;
1211         }
1212     }
1213 
1214     return TRUE;
1215 }
1216 
operator [](int32_t index) const1217 uint64_t PCEList::operator[](int32_t index) const
1218 {
1219     return getOrder(index);
1220 }
1221 
boyerMooreTest()1222 void SSearchTest::boyerMooreTest()
1223 {
1224     UErrorCode status = U_ZERO_ERROR;
1225     UCollator *coll = NULL;
1226     CollData *data = NULL;
1227     UnicodeString lp  = "fuss";
1228     UnicodeString sp = "fu\\u00DF";
1229     BoyerMooreSearch *longPattern = NULL;
1230     BoyerMooreSearch *shortPattern = NULL;
1231     UnicodeString targets[]  = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball",
1232                                 "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF",
1233                                 "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"};
1234     int32_t start = -1, end = -1;
1235 
1236     coll = ucol_openFromShortString("S1", FALSE, NULL, &status);
1237     if (U_FAILURE(status)) {
1238         errcheckln(status, "Could not open collator. - %s", u_errorName(status));
1239         return;
1240     }
1241 
1242     data = CollData::open(coll, status);
1243     if (U_FAILURE(status)) {
1244         errln("Could not open CollData object.");
1245         goto close_data;
1246     }
1247 
1248 
1249     longPattern = new BoyerMooreSearch(data, lp.unescape(), NULL, status);
1250     shortPattern = new BoyerMooreSearch(data, sp.unescape(), NULL, status);
1251     if (U_FAILURE(status)) {
1252         errln("Could not create pattern objects.");
1253         goto close_patterns;
1254     }
1255 
1256     for (int32_t t = 0; t < (sizeof(targets)/sizeof(targets[0])); t += 1) {
1257         UnicodeString target = targets[t].unescape();
1258 
1259         longPattern->setTargetString(&target, status);
1260         if (longPattern->search(0, start, end)) {
1261             logln("Test %d: found long pattern at [%d, %d].", t, start, end);
1262         } else {
1263             errln("Test %d: did not find long pattern.", t);
1264         }
1265 
1266         shortPattern->setTargetString(&target, status);
1267         if (shortPattern->search(0, start, end)) {
1268             logln("Test %d: found short pattern at [%d, %d].", t, start, end);
1269         } else {
1270             errln("Test %d: did not find short pattern.", t);
1271         }
1272     }
1273 
1274 close_patterns:
1275     delete shortPattern;
1276     delete longPattern;
1277 
1278 close_data:
1279     CollData::close(data);
1280     ucol_close(coll);
1281 }
1282 
bmsTest()1283 void SSearchTest::bmsTest()
1284 {
1285     UErrorCode status = U_ZERO_ERROR;
1286     UCollator *coll = NULL;
1287     UCD *data = NULL;
1288     UnicodeString lp  = "fuss";
1289     UnicodeString lpu = lp.unescape();
1290     UnicodeString sp  = "fu\\u00DF";
1291     UnicodeString spu = sp.unescape();
1292     BMS *longPattern = NULL;
1293     BMS *shortPattern = NULL;
1294     UnicodeString targets[]  = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball",
1295                                 "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF",
1296                                 "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"};
1297     int32_t start = -1, end = -1;
1298 
1299     coll = ucol_openFromShortString("S1", FALSE, NULL, &status);
1300     if (U_FAILURE(status)) {
1301         errcheckln(status, "Could not open collator. - %s", u_errorName(status));
1302         return;
1303     }
1304 
1305     data = ucd_open(coll, &status);
1306     if (U_FAILURE(status)) {
1307         errln("Could not open CollData object.");
1308         goto close_data;
1309     }
1310 
1311     longPattern = bms_open(data, lpu.getBuffer(), lpu.length(), NULL, 0, &status);
1312     shortPattern = bms_open(data, spu.getBuffer(), spu.length(), NULL, 0, &status);
1313     if (U_FAILURE(status)) {
1314         errln("Couldn't open pattern objects.");
1315         goto close_patterns;
1316     }
1317 
1318     for (int32_t t = 0; t < (sizeof(targets)/sizeof(targets[0])); t += 1) {
1319         UnicodeString target = targets[t].unescape();
1320 
1321         bms_setTargetString(longPattern, target.getBuffer(), target.length(), &status);
1322         if (bms_search(longPattern, 0, &start, &end)) {
1323             logln("Test %d: found long pattern at [%d, %d].", t, start, end);
1324         } else {
1325             errln("Test %d: did not find long pattern.", t);
1326         }
1327 
1328         bms_setTargetString(shortPattern, target.getBuffer(), target.length(), &status);
1329         if (bms_search(shortPattern, 0, &start, &end)) {
1330             logln("Test %d: found short pattern at [%d, %d].", t, start, end);
1331         } else {
1332             errln("Test %d: did not find short pattern.", t);
1333         }
1334     }
1335 
1336 close_patterns:
1337     bms_close(shortPattern);
1338     bms_close(longPattern);
1339 
1340 close_data:
1341     ucd_close(data);
1342     ucol_close(coll);
1343 }
1344 
goodSuffixTest()1345 void SSearchTest::goodSuffixTest()
1346 {
1347     UErrorCode status = U_ZERO_ERROR;
1348     UCollator *coll = NULL;
1349     CollData *data = NULL;
1350     UnicodeString pat = /*"gcagagag"*/ "fxeld";
1351     UnicodeString target = /*"gcatcgcagagagtatacagtacg"*/ "cloveldfxeld";
1352     BoyerMooreSearch *pattern = NULL;
1353     int32_t start = -1, end = -1;
1354 
1355     coll = ucol_open(NULL, &status);
1356     if (U_FAILURE(status)) {
1357         errcheckln(status, "Couldn't open collator. - %s", u_errorName(status));
1358         return;
1359     }
1360 
1361     data = CollData::open(coll, status);
1362     if (U_FAILURE(status)) {
1363         errln("Couldn't open CollData object.");
1364         goto close_data;
1365     }
1366 
1367     pattern = new BoyerMooreSearch(data, pat, &target, status);
1368     if (U_FAILURE(status)) {
1369         errln("Couldn't open pattern object.");
1370         goto close_pattern;
1371     }
1372 
1373     if (pattern->search(0, start, end)) {
1374         logln("Found pattern at [%d, %d].", start, end);
1375     } else {
1376         errln("Did not find pattern.");
1377     }
1378 
1379 close_pattern:
1380     delete pattern;
1381 
1382 close_data:
1383     CollData::close(data);
1384     ucol_close(coll);
1385 }
1386 
1387 //
1388 //  searchTime()    A quick and dirty performance test for string search.
1389 //                  Probably  doesn't really belong as part of intltest, but it
1390 //                  does check that the search succeeds, and gets the right result,
1391 //                  so it serves as a functionality test also.
1392 //
1393 //                  To run as a perf test, up the loop count, select by commenting
1394 //                  and uncommenting in the code the operation to be measured,
1395 //                  rebuild, and measure the running time of this test alone.
1396 //
1397 //                     time LD_LIBRARY_PATH=whatever  ./intltest  collate/SSearchTest/searchTime
1398 //
searchTime()1399 void SSearchTest::searchTime() {
1400     static const char *longishText =
1401 "Whylom, as olde stories tellen us,\n"
1402 "Ther was a duk that highte Theseus:\n"
1403 "Of Athenes he was lord and governour,\n"
1404 "And in his tyme swich a conquerour,\n"
1405 "That gretter was ther noon under the sonne.\n"
1406 "Ful many a riche contree hadde he wonne;\n"
1407 "What with his wisdom and his chivalrye,\n"
1408 "He conquered al the regne of Femenye,\n"
1409 "That whylom was y-cleped Scithia;\n"
1410 "And weddede the quene Ipolita,\n"
1411 "And broghte hir hoom with him in his contree\n"
1412 "With muchel glorie and greet solempnitee,\n"
1413 "And eek hir yonge suster Emelye.\n"
1414 "And thus with victorie and with melodye\n"
1415 "Lete I this noble duk to Athenes ryde,\n"
1416 "And al his hoost, in armes, him bisyde.\n"
1417 "And certes, if it nere to long to here,\n"
1418 "I wolde han told yow fully the manere,\n"
1419 "How wonnen was the regne of Femenye\n"
1420 "By Theseus, and by his chivalrye;\n"
1421 "And of the grete bataille for the nones\n"
1422 "Bitwixen Athen's and Amazones;\n"
1423 "And how asseged was Ipolita,\n"
1424 "The faire hardy quene of Scithia;\n"
1425 "And of the feste that was at hir weddinge,\n"
1426 "And of the tempest at hir hoom-cominge;\n"
1427 "But al that thing I moot as now forbere.\n"
1428 "I have, God woot, a large feeld to ere,\n"
1429 "And wayke been the oxen in my plough.\n"
1430 "The remenant of the tale is long y-nough.\n"
1431 "I wol nat letten eek noon of this route;\n"
1432 "Lat every felawe telle his tale aboute,\n"
1433 "And lat see now who shal the soper winne;\n"
1434 "And ther I lefte, I wol ageyn biginne.\n"
1435 "This duk, of whom I make mencioun,\n"
1436 "When he was come almost unto the toun,\n"
1437 "In al his wele and in his moste pryde,\n"
1438 "He was war, as he caste his eye asyde,\n"
1439 "Wher that ther kneled in the hye weye\n"
1440 "A companye of ladies, tweye and tweye,\n"
1441 "Ech after other, clad in clothes blake; \n"
1442 "But swich a cry and swich a wo they make,\n"
1443 "That in this world nis creature livinge,\n"
1444 "That herde swich another weymentinge;\n"
1445 "And of this cry they nolde never stenten,\n"
1446 "Til they the reynes of his brydel henten.\n"
1447 "'What folk ben ye, that at myn hoomcominge\n"
1448 "Perturben so my feste with cryinge'?\n"
1449 "Quod Theseus, 'have ye so greet envye\n"
1450 "Of myn honour, that thus compleyne and crye? \n"
1451 "Or who hath yow misboden, or offended?\n"
1452 "And telleth me if it may been amended;\n"
1453 "And why that ye ben clothed thus in blak'?\n"
1454 "The eldest lady of hem alle spak,\n"
1455 "When she hadde swowned with a deedly chere,\n"
1456 "That it was routhe for to seen and here,\n"
1457 "And seyde: 'Lord, to whom Fortune hath yiven\n"
1458 "Victorie, and as a conquerour to liven,\n"
1459 "Noght greveth us your glorie and your honour;\n"
1460 "But we biseken mercy and socour.\n"
1461 "Have mercy on our wo and our distresse.\n"
1462 "Som drope of pitee, thurgh thy gentilesse,\n"
1463 "Up-on us wrecched wommen lat thou falle.\n"
1464 "For certes, lord, ther nis noon of us alle,\n"
1465 "That she nath been a duchesse or a quene;\n"
1466 "Now be we caitifs, as it is wel sene:\n"
1467 "Thanked be Fortune, and hir false wheel,\n"
1468 "That noon estat assureth to be weel.\n"
1469 "And certes, lord, t'abyden your presence,\n"
1470 "Here in the temple of the goddesse Clemence\n"
1471 "We han ben waytinge al this fourtenight;\n"
1472 "Now help us, lord, sith it is in thy might.\n"
1473 "I wrecche, which that wepe and waille thus,\n"
1474 "Was whylom wyf to king Capaneus,\n"
1475 "That starf at Thebes, cursed be that day!\n"
1476 "And alle we, that been in this array,\n"
1477 "And maken al this lamentacioun,\n"
1478 "We losten alle our housbondes at that toun,\n"
1479 "Whyl that the sege ther-aboute lay.\n"
1480 "And yet now th'olde Creon, weylaway!\n"
1481 "The lord is now of Thebes the citee, \n"
1482 "Fulfild of ire and of iniquitee,\n"
1483 "He, for despyt, and for his tirannye,\n"
1484 "To do the dede bodyes vileinye,\n"
1485 "Of alle our lordes, whiche that ben slawe,\n"
1486 "Hath alle the bodyes on an heep y-drawe,\n"
1487 "And wol nat suffren hem, by noon assent,\n"
1488 "Neither to been y-buried nor y-brent,\n"
1489 "But maketh houndes ete hem in despyt. zet'\n";
1490 
1491 #define TEST_BOYER_MOORE 1
1492 const char *cPattern = "maketh houndes ete hem";
1493 //const char *cPattern = "Whylom";
1494 //const char *cPattern = "zet";
1495     const char *testId = "searchTime()";   // for error macros.
1496     UnicodeString target = longishText;
1497     UErrorCode status = U_ZERO_ERROR;
1498 
1499 
1500     UCollator *collator = ucol_open("en", &status);
1501     CollData *data = CollData::open(collator, status);
1502     if (U_FAILURE(status) || collator == NULL || data == NULL) {
1503         errcheckln(status, "Unable to open UCollator or CollData. - %s", u_errorName(status));
1504         return;
1505     }
1506     //ucol_setStrength(collator, collatorStrength);
1507     //ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, normalize, &status);
1508     UnicodeString uPattern = cPattern;
1509 #ifndef TEST_BOYER_MOORE
1510     UStringSearch *uss = usearch_openFromCollator(uPattern.getBuffer(), uPattern.length(),
1511                                         target.getBuffer(), target.length(),
1512                                         collator,
1513                                         NULL,     // the break iterator
1514                                         &status);
1515     TEST_ASSERT_SUCCESS(status);
1516 #else
1517     BoyerMooreSearch bms(data, uPattern, &target, status);
1518     TEST_ASSERT_SUCCESS(status);
1519 #endif
1520 
1521 //  int32_t foundStart;
1522 //  int32_t foundEnd;
1523     UBool   found;
1524 
1525     // Find the match position usgin strstr
1526     const char *pm = strstr(longishText, cPattern);
1527     TEST_ASSERT_M(pm!=NULL, "No pattern match with strstr");
1528     int32_t  refMatchPos = (int32_t)(pm - longishText);
1529     int32_t  icuMatchPos;
1530     int32_t  icuMatchEnd;
1531 #ifndef TEST_BOYER_MOORE
1532     usearch_search(uss, 0, &icuMatchPos, &icuMatchEnd, &status);
1533     TEST_ASSERT_SUCCESS(status);
1534 #else
1535     found = bms.search(0, icuMatchPos, icuMatchEnd);
1536 #endif
1537     TEST_ASSERT_M(refMatchPos == icuMatchPos, "strstr and icu give different match positions.");
1538 
1539     int32_t i;
1540     int32_t j=0;
1541 
1542     // Try loopcounts around 100000 to some millions, depending on the operation,
1543     //   to get runtimes of at least several seconds.
1544     for (i=0; i<10000; i++) {
1545 #ifndef TEST_BOYER_MOORE
1546         found = usearch_search(uss, 0, &icuMatchPos, &icuMatchEnd, &status);
1547 #else
1548         found = bms.search(0, icuMatchPos, icuMatchEnd);
1549 #endif
1550         //TEST_ASSERT_SUCCESS(status);
1551         //TEST_ASSERT(found);
1552 
1553         // usearch_setOffset(uss, 0, &status);
1554         // icuMatchPos = usearch_next(uss, &status);
1555 
1556          // The i+j stuff is to confuse the optimizer and get it to actually leave the
1557          //   call to strstr in place.
1558          //pm = strstr(longishText+j, cPattern);
1559          //j = (j + i)%5;
1560     }
1561 
1562     printf("%d\n", pm-longishText, j);
1563 #ifndef TEST_BOYER_MOORE
1564     usearch_close(uss);
1565 #else
1566     CollData::close(data);
1567 #endif
1568     ucol_close(collator);
1569 }
1570 #endif
1571 
1572 //----------------------------------------------------------------------------------------
1573 //
1574 //   Random Numbers.  Similar to standard lib rand() and srand()
1575 //                    Not using library to
1576 //                      1.  Get same results on all platforms.
1577 //                      2.  Get access to current seed, to more easily reproduce failures.
1578 //
1579 //---------------------------------------------------------------------------------------
1580 static uint32_t m_seed = 1;
1581 
m_rand()1582 static uint32_t m_rand()
1583 {
1584     m_seed = m_seed * 1103515245 + 12345;
1585     return (uint32_t)(m_seed/65536) % 32768;
1586 }
1587 
1588 class Monkey
1589 {
1590 public:
1591     virtual void append(UnicodeString &test, UnicodeString &alternate) = 0;
1592 
1593 protected:
1594     Monkey();
1595     virtual ~Monkey();
1596 };
1597 
Monkey()1598 Monkey::Monkey()
1599 {
1600     // ook?
1601 }
1602 
~Monkey()1603 Monkey::~Monkey()
1604 {
1605     // ook?
1606 }
1607 
1608 class SetMonkey : public Monkey
1609 {
1610 public:
1611     SetMonkey(const USet *theSet);
1612     ~SetMonkey();
1613 
1614     virtual void append(UnicodeString &test, UnicodeString &alternate);
1615 
1616 private:
1617     const USet *set;
1618 };
1619 
SetMonkey(const USet * theSet)1620 SetMonkey::SetMonkey(const USet *theSet)
1621     : Monkey(), set(theSet)
1622 {
1623     // ook?
1624 }
1625 
~SetMonkey()1626 SetMonkey::~SetMonkey()
1627 {
1628     //ook...
1629 }
1630 
append(UnicodeString & test,UnicodeString & alternate)1631 void SetMonkey::append(UnicodeString &test, UnicodeString &alternate)
1632 {
1633     int32_t size = uset_size(set);
1634     int32_t index = m_rand() % size;
1635     UChar32 ch = uset_charAt(set, index);
1636     UnicodeString str(ch);
1637 
1638     test.append(str);
1639     alternate.append(str); // flip case, or some junk?
1640 }
1641 
1642 class StringSetMonkey : public Monkey
1643 {
1644 public:
1645     StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData);
1646     ~StringSetMonkey();
1647 
1648     void append(UnicodeString &testCase, UnicodeString &alternate);
1649 
1650 private:
1651     UnicodeString &generateAlternative(const UnicodeString &testCase, UnicodeString &alternate);
1652 
1653     const USet *set;
1654     UCollator  *coll;
1655     CollData   *collData;
1656 };
1657 
StringSetMonkey(const USet * theSet,UCollator * theCollator,CollData * theCollData)1658 StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData)
1659 : Monkey(), set(theSet), coll(theCollator), collData(theCollData)
1660 {
1661     // ook.
1662 }
1663 
~StringSetMonkey()1664 StringSetMonkey::~StringSetMonkey()
1665 {
1666     // ook?
1667 }
1668 
append(UnicodeString & testCase,UnicodeString & alternate)1669 void StringSetMonkey::append(UnicodeString &testCase, UnicodeString &alternate)
1670 {
1671     int32_t itemCount = uset_getItemCount(set), len = 0;
1672     int32_t index = m_rand() % itemCount;
1673     UChar32 rangeStart = 0, rangeEnd = 0;
1674     UChar buffer[16];
1675     UErrorCode err = U_ZERO_ERROR;
1676 
1677     len = uset_getItem(set, index, &rangeStart, &rangeEnd, buffer, 16, &err);
1678 
1679     if (len == 0) {
1680         int32_t offset = m_rand() % (rangeEnd - rangeStart + 1);
1681         UChar32 ch = rangeStart + offset;
1682         UnicodeString str(ch);
1683 
1684         testCase.append(str);
1685         generateAlternative(str, alternate);
1686     } else if (len > 0) {
1687         // should check that len < 16...
1688         UnicodeString str(buffer, len);
1689 
1690         testCase.append(str);
1691         generateAlternative(str, alternate);
1692     } else {
1693         // shouldn't happen...
1694     }
1695 }
1696 
generateAlternative(const UnicodeString & testCase,UnicodeString & alternate)1697 UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCase, UnicodeString &alternate)
1698 {
1699     // find out shortest string for the longest sequence of ces.
1700     // needs to be refined to use dynamic programming, but will be roughly right
1701     UErrorCode status = U_ZERO_ERROR;
1702     CEList ceList(coll, testCase, status);
1703     UnicodeString alt;
1704     int32_t offset = 0;
1705 
1706     if (ceList.size() == 0) {
1707         return alternate.append(testCase);
1708     }
1709 
1710     while (offset < ceList.size()) {
1711         int32_t ce = ceList.get(offset);
1712         const StringList *strings = collData->getStringList(ce);
1713 
1714         if (strings == NULL) {
1715             return alternate.append(testCase);
1716         }
1717 
1718         int32_t stringCount = strings->size();
1719         int32_t tries = 0;
1720 
1721         // find random string that generates the same CEList
1722         const CEList *ceList2 = NULL;
1723         const UnicodeString *string = NULL;
1724               UBool matches = FALSE;
1725 
1726         do {
1727             int32_t s = m_rand() % stringCount;
1728 
1729             if (tries++ > stringCount) {
1730                 alternate.append(testCase);
1731                 return alternate;
1732             }
1733 
1734             string = strings->get(s);
1735             ceList2 = collData->getCEList(string);
1736             matches = ceList.matchesAt(offset, ceList2);
1737 
1738             if (! matches) {
1739                 collData->freeCEList((CEList *) ceList2);
1740             }
1741         } while (! matches);
1742 
1743         alt.append(*string);
1744         offset += ceList2->size();
1745         collData->freeCEList(ceList2);
1746     }
1747 
1748     const CEList altCEs(coll, alt, status);
1749 
1750     if (ceList.matchesAt(0, &altCEs)) {
1751         return alternate.append(alt);
1752     }
1753 
1754     return alternate.append(testCase);
1755 }
1756 
generateTestCase(UCollator * coll,Monkey * monkeys[],int32_t monkeyCount,UnicodeString & testCase,UnicodeString & alternate)1757 static void generateTestCase(UCollator *coll, Monkey *monkeys[], int32_t monkeyCount, UnicodeString &testCase, UnicodeString &alternate)
1758 {
1759     int32_t pieces = (m_rand() % 4) + 1;
1760     UErrorCode status = U_ZERO_ERROR;
1761     UBool matches;
1762 
1763     do {
1764         testCase.remove();
1765         alternate.remove();
1766         monkeys[0]->append(testCase, alternate);
1767 
1768         for(int32_t piece = 0; piece < pieces; piece += 1) {
1769             int32_t monkey = m_rand() % monkeyCount;
1770 
1771             monkeys[monkey]->append(testCase, alternate);
1772         }
1773 
1774         const CEList ceTest(coll, testCase, status);
1775         const CEList ceAlt(coll, alternate, status);
1776 
1777         matches = ceTest.matchesAt(0, &ceAlt);
1778     } while (! matches);
1779 }
1780 
1781 //
1782 //  Find the next acceptable boundary following the specified starting index
1783 //     in the target text being searched.
1784 //      TODO:  refine what is an acceptable boundary.  For the moment,
1785 //             choose the next position not within a combining sequence.
1786 //
nextBoundaryAfter(const UnicodeString & string,int32_t startIndex)1787 static int32_t nextBoundaryAfter(const UnicodeString &string, int32_t startIndex) {
1788     const UChar *text = string.getBuffer();
1789     int32_t textLen   = string.length();
1790 
1791     if (startIndex >= textLen) {
1792         return startIndex;
1793     }
1794 
1795     UChar32  c;
1796     int32_t  i = startIndex;
1797 
1798     U16_NEXT(text, i, textLen, c);
1799 
1800     // If we are on a control character, stop without looking for combining marks.
1801     //    Control characters do not combine.
1802     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1803     if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
1804         return i;
1805     }
1806 
1807     // The initial character was not a control, and can thus accept trailing
1808     //   combining characters.  Advance over however many of them there are.
1809     int32_t  indexOfLastCharChecked;
1810 
1811     for (;;) {
1812         indexOfLastCharChecked = i;
1813 
1814         if (i>=textLen) {
1815             break;
1816         }
1817 
1818         U16_NEXT(text, i, textLen, c);
1819         gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1820 
1821         if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
1822             break;
1823         }
1824     }
1825 
1826     return indexOfLastCharChecked;
1827 }
1828 
isInCombiningSequence(const UnicodeString & string,int32_t index)1829 static UBool isInCombiningSequence(const UnicodeString &string, int32_t index) {
1830     const UChar *text = string.getBuffer();
1831     int32_t textLen   = string.length();
1832 
1833     if (index>=textLen || index<=0) {
1834         return FALSE;
1835     }
1836 
1837     // If the character at the current index is not a GRAPHEME_EXTEND
1838     //    then we can not be within a combining sequence.
1839     UChar32  c;
1840     U16_GET(text, 0, index, textLen, c);
1841     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1842     if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
1843         return FALSE;
1844     }
1845 
1846     // We are at a combining mark.  If the preceding character is anything
1847     //   except a CONTROL, CR or LF, we are in a combining sequence.
1848     U16_PREV(text, 0, index, c);
1849     gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
1850 
1851     return !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
1852 }
1853 
simpleSearch(UCollator * coll,const UnicodeString & target,int32_t offset,const UnicodeString & pattern,int32_t & matchStart,int32_t & matchEnd)1854 static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd)
1855 {
1856     UErrorCode      status = U_ZERO_ERROR;
1857     OrderList       targetOrders(coll, target, offset);
1858     OrderList       patternOrders(coll, pattern);
1859     int32_t         targetSize  = targetOrders.size() - 1;
1860     int32_t         patternSize = patternOrders.size() - 1;
1861     UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
1862                                                   target.getBuffer(), target.length(), &status);
1863 
1864     if (patternSize == 0) {
1865         // Searching for an empty pattern always fails
1866         matchStart = matchEnd = -1;
1867         ubrk_close(charBreakIterator);
1868         return FALSE;
1869     }
1870 
1871     matchStart = matchEnd = -1;
1872 
1873     for(int32_t i = 0; i < targetSize; i += 1) {
1874         if (targetOrders.matchesAt(i, patternOrders)) {
1875             int32_t start    = targetOrders.getLowOffset(i);
1876             int32_t maxLimit = targetOrders.getLowOffset(i + patternSize);
1877             int32_t minLimit = targetOrders.getLowOffset(i + patternSize - 1);
1878 
1879             // if the low and high offsets of the first CE in
1880             // the match are the same, it means that the match
1881             // starts in the middle of an expansion - all but
1882             // the first CE of the expansion will have the offset
1883             // of the following character.
1884             if (start == targetOrders.getHighOffset(i)) {
1885                 continue;
1886             }
1887 
1888             // Make sure match starts on a grapheme boundary
1889             if (! ubrk_isBoundary(charBreakIterator, start)) {
1890                 continue;
1891             }
1892 
1893             // If the low and high offsets of the CE after the match
1894             // are the same, it means that the match ends in the middle
1895             // of an expansion sequence.
1896             if (maxLimit == targetOrders.getHighOffset(i + patternSize) &&
1897                 targetOrders.getOrder(i + patternSize) != UCOL_NULLORDER) {
1898                 continue;
1899             }
1900 
1901             int32_t mend = maxLimit;
1902 
1903             // Find the first grapheme break after the character index
1904             // of the last CE in the match. If it's after character index
1905             // that's after the last CE in the match, use that index
1906             // as the end of the match.
1907             if (minLimit < maxLimit) {
1908                 int32_t nba = ubrk_following(charBreakIterator, minLimit);
1909 
1910                 if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) {
1911                     mend = nba;
1912                 }
1913             }
1914 
1915             if (mend > maxLimit) {
1916                 continue;
1917             }
1918 
1919             if (! ubrk_isBoundary(charBreakIterator, mend)) {
1920                 continue;
1921             }
1922 
1923             matchStart = start;
1924             matchEnd   = mend;
1925 
1926             ubrk_close(charBreakIterator);
1927             return TRUE;
1928         }
1929     }
1930 
1931     ubrk_close(charBreakIterator);
1932     return FALSE;
1933 }
1934 
1935 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)1936 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
1937     int32_t val = defaultVal;
1938 
1939     name.append(" *= *(-?\\d+)");
1940 
1941     UErrorCode status = U_ZERO_ERROR;
1942     RegexMatcher m(name, params, 0, status);
1943 
1944     if (m.find()) {
1945         // The param exists.  Convert the string to an int.
1946         char valString[100];
1947         int32_t paramLength = m.end(1, status) - m.start(1, status);
1948 
1949         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
1950             paramLength = (int32_t)(sizeof(valString)-2);
1951         }
1952 
1953         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
1954         val = strtol(valString,  NULL, 10);
1955 
1956         // Delete this parameter from the params string.
1957         m.reset();
1958         params = m.replaceFirst("", status);
1959     }
1960 
1961   //U_ASSERT(U_SUCCESS(status));
1962     if (! U_SUCCESS(status)) {
1963         val = defaultVal;
1964     }
1965 
1966     return val;
1967 }
1968 #endif
1969 
1970 #if !UCONFIG_NO_COLLATION
monkeyTestCase(UCollator * coll,const UnicodeString & testCase,const UnicodeString & pattern,const UnicodeString & altPattern,const char * name,const char * strength,uint32_t seed)1971 int32_t SSearchTest::monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
1972                                     const char *name, const char *strength, uint32_t seed)
1973 {
1974     UErrorCode status = U_ZERO_ERROR;
1975     int32_t actualStart = -1, actualEnd = -1;
1976   //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length();
1977     int32_t expectedStart = -1, expectedEnd = -1;
1978     int32_t notFoundCount = 0;
1979     UStringSearch *uss = usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
1980                                 testCase.getBuffer(), testCase.length(),
1981                                 coll,
1982                                 NULL,     // the break iterator
1983                                 &status);
1984 
1985     // **** TODO: find *all* matches, not just first one ****
1986     simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd);
1987 
1988     usearch_search(uss, 0, &actualStart, &actualEnd, &status);
1989 
1990     if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
1991         errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
1992               "    strength=%s seed=%d",
1993               name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
1994     }
1995 
1996     if (expectedStart == -1 && actualStart == -1) {
1997         notFoundCount += 1;
1998     }
1999 
2000     // **** TODO: find *all* matches, not just first one ****
2001     simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd);
2002 
2003     usearch_setPattern(uss, altPattern.getBuffer(), altPattern.length(), &status);
2004 
2005     usearch_search(uss, 0, &actualStart, &actualEnd, &status);
2006 
2007     if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
2008         errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
2009               "    strength=%s seed=%d",
2010               name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
2011     }
2012 
2013     if (expectedStart == -1 && actualStart == -1) {
2014         notFoundCount += 1;
2015     }
2016 
2017     usearch_close(uss);
2018 
2019     return notFoundCount;
2020 }
2021 
bmMonkeyTestCase(UCollator * coll,const UnicodeString & testCase,const UnicodeString & pattern,const UnicodeString & altPattern,BoyerMooreSearch * bms,BoyerMooreSearch * abms,const char * name,const char * strength,uint32_t seed)2022 int32_t SSearchTest::bmMonkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
2023                                     BoyerMooreSearch *bms, BoyerMooreSearch *abms,
2024                                     const char *name, const char *strength, uint32_t seed)
2025 {
2026     UErrorCode status = U_ZERO_ERROR;
2027     int32_t actualStart = -1, actualEnd = -1;
2028   //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length();
2029     int32_t expectedStart = -1, expectedEnd = -1;
2030     int32_t notFoundCount = 0;
2031 
2032     // **** TODO: find *all* matches, not just first one ****
2033     simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd);
2034 
2035     bms->setTargetString(&testCase, status);
2036     bms->search(0, actualStart, actualEnd);
2037 
2038     if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
2039         errln("Boyer-Moore Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
2040               "    strength=%s seed=%d",
2041               name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
2042     }
2043 
2044     if (expectedStart == -1 && actualStart == -1) {
2045         notFoundCount += 1;
2046     }
2047 
2048     // **** TODO: find *all* matches, not just first one ****
2049     simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd);
2050 
2051     abms->setTargetString(&testCase, status);
2052     abms->search(0, actualStart, actualEnd);
2053 
2054     if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
2055         errln("Boyer-Moore Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
2056               "    strength=%s seed=%d",
2057               name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
2058     }
2059 
2060     if (expectedStart == -1 && actualStart == -1) {
2061         notFoundCount += 1;
2062     }
2063 
2064 
2065     return notFoundCount;
2066 }
2067 #endif
2068 
monkeyTest(char * params)2069 void SSearchTest::monkeyTest(char *params)
2070 {
2071     // ook!
2072     UErrorCode status = U_ZERO_ERROR;
2073   //UCollator *coll = ucol_open(NULL, &status);
2074     UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status);
2075 
2076     if (U_FAILURE(status)) {
2077         errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status));
2078         return;
2079     }
2080 
2081     CollData  *monkeyData = CollData::open(coll, status);
2082 
2083     USet *expansions   = uset_openEmpty();
2084     USet *contractions = uset_openEmpty();
2085 
2086     ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
2087 
2088     U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
2089     U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
2090     USet *letters = uset_openPattern(letter_pattern, 39, &status);
2091     SetMonkey letterMonkey(letters);
2092     StringSetMonkey contractionMonkey(contractions, coll, monkeyData);
2093     StringSetMonkey expansionMonkey(expansions, coll, monkeyData);
2094     UnicodeString testCase;
2095     UnicodeString alternate;
2096     UnicodeString pattern, altPattern;
2097     UnicodeString prefix, altPrefix;
2098     UnicodeString suffix, altSuffix;
2099 
2100     Monkey *monkeys[] = {
2101         &letterMonkey,
2102         &contractionMonkey,
2103         &expansionMonkey,
2104         &contractionMonkey,
2105         &expansionMonkey,
2106         &contractionMonkey,
2107         &expansionMonkey,
2108         &contractionMonkey,
2109         &expansionMonkey};
2110     int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]);
2111     int32_t nonMatchCount = 0;
2112 
2113     UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY};
2114     const char *strengthNames[] = {"primary", "secondary", "tertiary"};
2115     int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]);
2116     int32_t loopCount = quick? 1000 : 10000;
2117     int32_t firstStrength = 0;
2118     int32_t lastStrength  = strengthCount - 1; //*/ 0;
2119 
2120     if (params != NULL) {
2121 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2122         UnicodeString p(params);
2123 
2124         loopCount = getIntParam("loop", p, loopCount);
2125         m_seed    = getIntParam("seed", p, m_seed);
2126 
2127         RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status);
2128         if (m.find()) {
2129             UnicodeString breakType = m.group(1, status);
2130 
2131             for (int32_t s = 0; s < strengthCount; s += 1) {
2132                 if (breakType == strengthNames[s]) {
2133                     firstStrength = lastStrength = s;
2134                     break;
2135                 }
2136             }
2137 
2138             m.reset();
2139             p = m.replaceFirst("", status);
2140         }
2141 
2142         if (RegexMatcher("\\S", p, 0, status).find()) {
2143             // Each option is stripped out of the option string as it is processed.
2144             // All options have been checked.  The option string should have been completely emptied..
2145             char buf[100];
2146             p.extract(buf, sizeof(buf), NULL, status);
2147             buf[sizeof(buf)-1] = 0;
2148             errln("Unrecognized or extra parameter:  %s\n", buf);
2149             return;
2150         }
2151 #else
2152         infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
2153 #endif
2154     }
2155 
2156     for(int32_t s = firstStrength; s <= lastStrength; s += 1) {
2157         int32_t notFoundCount = 0;
2158 
2159         logln("Setting strength to %s.", strengthNames[s]);
2160         ucol_setStrength(coll, strengths[s]);
2161 
2162         // TODO: try alternate prefix and suffix too?
2163         // TODO: alterntaes are only equal at primary strength. Is this OK?
2164         for(int32_t t = 0; t < loopCount; t += 1) {
2165             uint32_t seed = m_seed;
2166             int32_t  nmc = 0;
2167 
2168             generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern);
2169             generateTestCase(coll, monkeys, monkeyCount, prefix,  altPrefix);
2170             generateTestCase(coll, monkeys, monkeyCount, suffix,  altSuffix);
2171 
2172             // pattern
2173             notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern, "pattern", strengthNames[s], seed);
2174 
2175             testCase.remove();
2176             testCase.append(prefix);
2177             testCase.append(/*alt*/pattern);
2178 
2179             // prefix + pattern
2180             notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern", strengthNames[s], seed);
2181 
2182             testCase.append(suffix);
2183 
2184             // prefix + pattern + suffix
2185             notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern + suffix", strengthNames[s], seed);
2186 
2187             testCase.remove();
2188             testCase.append(pattern);
2189             testCase.append(suffix);
2190 
2191             // pattern + suffix
2192             notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed);
2193         }
2194 
2195        logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount);
2196     }
2197 
2198     uset_close(contractions);
2199     uset_close(expansions);
2200     uset_close(letters);
2201 
2202     CollData::close(monkeyData);
2203 
2204     ucol_close(coll);
2205 }
2206 
bmMonkeyTest(char * params)2207 void SSearchTest::bmMonkeyTest(char *params)
2208 {
2209     // ook!
2210     UErrorCode status = U_ZERO_ERROR;
2211     UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status);
2212 
2213     if (U_FAILURE(status)) {
2214         errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status));
2215         return;
2216     }
2217 
2218     CollData  *monkeyData = CollData::open(coll, status);
2219 
2220     USet *expansions   = uset_openEmpty();
2221     USet *contractions = uset_openEmpty();
2222 
2223     ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
2224 
2225     U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
2226     U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
2227     USet *letters = uset_openPattern(letter_pattern, 39, &status);
2228     SetMonkey letterMonkey(letters);
2229     StringSetMonkey contractionMonkey(contractions, coll, monkeyData);
2230     StringSetMonkey expansionMonkey(expansions, coll, monkeyData);
2231     UnicodeString testCase;
2232     UnicodeString alternate;
2233     UnicodeString pattern, altPattern;
2234     UnicodeString prefix, altPrefix;
2235     UnicodeString suffix, altSuffix;
2236 
2237     Monkey *monkeys[] = {
2238         &letterMonkey,
2239         &contractionMonkey,
2240         &expansionMonkey,
2241         &contractionMonkey,
2242         &expansionMonkey,
2243         &contractionMonkey,
2244         &expansionMonkey,
2245         &contractionMonkey,
2246         &expansionMonkey};
2247     int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]);
2248     int32_t nonMatchCount = 0;
2249 
2250     UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY};
2251     const char *strengthNames[] = {"primary", "secondary", "tertiary"};
2252     int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]);
2253     int32_t loopCount = quick? 1000 : 10000;
2254     int32_t firstStrength = 0;
2255     int32_t lastStrength  = strengthCount - 1; //*/ 0;
2256 
2257     if (params != NULL) {
2258 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2259         UnicodeString p(params);
2260 
2261         loopCount = getIntParam("loop", p, loopCount);
2262         m_seed    = getIntParam("seed", p, m_seed);
2263 
2264         RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status);
2265         if (m.find()) {
2266             UnicodeString breakType = m.group(1, status);
2267 
2268             for (int32_t s = 0; s < strengthCount; s += 1) {
2269                 if (breakType == strengthNames[s]) {
2270                     firstStrength = lastStrength = s;
2271                     break;
2272                 }
2273             }
2274 
2275             m.reset();
2276             p = m.replaceFirst("", status);
2277         }
2278 
2279         if (RegexMatcher("\\S", p, 0, status).find()) {
2280             // Each option is stripped out of the option string as it is processed.
2281             // All options have been checked.  The option string should have been completely emptied..
2282             char buf[100];
2283             p.extract(buf, sizeof(buf), NULL, status);
2284             buf[sizeof(buf)-1] = 0;
2285             errln("Unrecognized or extra parameter:  %s\n", buf);
2286             return;
2287         }
2288 #else
2289         infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
2290 #endif
2291     }
2292 
2293     for(int32_t s = firstStrength; s <= lastStrength; s += 1) {
2294         int32_t notFoundCount = 0;
2295 
2296         logln("Setting strength to %s.", strengthNames[s]);
2297         ucol_setStrength(coll, strengths[s]);
2298 
2299         CollData *data = CollData::open(coll, status);
2300 
2301         // TODO: try alternate prefix and suffix too?
2302         // TODO: alterntaes are only equal at primary strength. Is this OK?
2303         for(int32_t t = 0; t < loopCount; t += 1) {
2304             uint32_t seed = m_seed;
2305             int32_t  nmc = 0;
2306 
2307             generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern);
2308             generateTestCase(coll, monkeys, monkeyCount, prefix,  altPrefix);
2309             generateTestCase(coll, monkeys, monkeyCount, suffix,  altSuffix);
2310 
2311             BoyerMooreSearch pat(data, pattern, NULL, status);
2312             BoyerMooreSearch alt(data, altPattern, NULL, status);
2313 
2314             // **** need a better way to deal with this ****
2315 #if 0
2316             if (pat.empty() ||
2317                 alt.empty()) {
2318                     continue;
2319             }
2320 #endif
2321 
2322             // pattern
2323             notFoundCount += bmMonkeyTestCase(coll, pattern, pattern, altPattern, &pat, &alt, "pattern", strengthNames[s], seed);
2324 
2325             testCase.remove();
2326             testCase.append(prefix);
2327             testCase.append(/*alt*/pattern);
2328 
2329             // prefix + pattern
2330             notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "prefix + pattern", strengthNames[s], seed);
2331 
2332             testCase.append(suffix);
2333 
2334             // prefix + pattern + suffix
2335             notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "prefix + pattern + suffix", strengthNames[s], seed);
2336 
2337             testCase.remove();
2338             testCase.append(pattern);
2339             testCase.append(suffix);
2340 
2341             // pattern + suffix
2342             notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPattern, &pat, &alt, "pattern + suffix", strengthNames[s], seed);
2343         }
2344 
2345         CollData::close(data);
2346 
2347         logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount);
2348     }
2349 
2350     uset_close(contractions);
2351     uset_close(expansions);
2352     uset_close(letters);
2353 
2354     CollData::close(monkeyData);
2355 
2356     ucol_close(coll);
2357 }
2358 
2359 #endif
2360 
2361 #endif
2362