1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 2005-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 */
9
10 #include "unicode/utypes.h"
11
12 #if !UCONFIG_NO_COLLATION
13
14 #include "cmemory.h"
15 #include "cstring.h"
16 #include "usrchimp.h"
17
18 #include "unicode/coll.h"
19 #include "unicode/tblcoll.h"
20 #include "unicode/usearch.h"
21 #include "unicode/uset.h"
22 #include "unicode/ustring.h"
23
24 #include "unicode/coleitr.h"
25 #include "unicode/regex.h" // TODO: make conditional on regexp being built.
26
27 #include "colldata.h"
28 #include "ssearch.h"
29 #include "xmlparser.h"
30
31 #include <stdio.h> // for sprintf
32
// Identifier of the test case currently running; the assertion macros below
// include it in their failure messages.
char testId[100];

// Reports a failure (file, line, current test ID) through errln() when x is false.
// Expands to a braced block, so a trailing semicolon at the call site is optional.
#define TEST_ASSERT(x) { \
    if (!(x)) { \
        errln("Failure in file %s, line %d, test ID = \"%s\"", __FILE__, __LINE__, testId); \
    } \
}

// Reports a failure with the caller-supplied message m through dataerrln() when
// x is false, and then returns from the enclosing (void) function.
#define TEST_ASSERT_M(x, m) { \
    if (!(x)) { \
        dataerrln("Failure in file %s, line %d.   \"%s\"", __FILE__, __LINE__, m); \
        return; \
    } \
}

// Reports a failing UErrorCode (with its symbolic name) through dataerrln().
// Does not return; callers check the status themselves when they need to bail out.
#define TEST_ASSERT_SUCCESS(errcode) { \
    if (U_FAILURE(errcode)) { \
        dataerrln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \
                  __FILE__, __LINE__, testId, u_errorName(errcode)); \
    } \
}

// Raw array allocation helpers built on ICU's uprv_malloc()/uprv_free().
#define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
#define DELETE_ARRAY(array) uprv_free((void *) (array))
47
48 //---------------------------------------------------------------------------
49 //
50 // Test class boilerplate
51 //
52 //---------------------------------------------------------------------------
SSearchTest()53 SSearchTest::SSearchTest()
54 {
55 }
56
~SSearchTest()57 SSearchTest::~SSearchTest()
58 {
59 }
60
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)61 void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char *params )
62 {
63 if (exec) logln("TestSuite SSearchTest: ");
64 switch (index) {
65 #if !UCONFIG_NO_BREAK_ITERATION
66 case 0: name = "searchTest";
67 if (exec) searchTest();
68 break;
69
70 case 1: name = "offsetTest";
71 if (exec) offsetTest();
72 break;
73
74 case 2: name = "monkeyTest";
75 if (exec) monkeyTest(params);
76 break;
77
78 case 3: name = "sharpSTest";
79 if (exec) sharpSTest();
80 break;
81
82 case 4: name = "goodSuffixTest";
83 if (exec) goodSuffixTest();
84 break;
85
86 case 5: name = "searchTime";
87 if (exec) searchTime();
88 break;
89 #endif
90 default: name = "";
91 break; //needed to end loop
92 }
93 }
94
95
96 #if !UCONFIG_NO_BREAK_ITERATION
97
98 #define PATH_BUFFER_SIZE 2048
getPath(char buffer[2048],const char * filename)99 const char *SSearchTest::getPath(char buffer[2048], const char *filename) {
100 UErrorCode status = U_ZERO_ERROR;
101 const char *testDataDirectory = IntlTest::getSourceTestData(status);
102
103 if (U_FAILURE(status) || strlen(testDataDirectory) + strlen(filename) + 1 >= PATH_BUFFER_SIZE) {
104 errln("ERROR: getPath() failed - %s", u_errorName(status));
105 return NULL;
106 }
107
108 strcpy(buffer, testDataDirectory);
109 strcat(buffer, filename);
110 return buffer;
111 }
112
113
searchTest()114 void SSearchTest::searchTest()
115 {
116 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
117 UErrorCode status = U_ZERO_ERROR;
118 char path[PATH_BUFFER_SIZE];
119 const char *testFilePath = getPath(path, "ssearch.xml");
120
121 if (testFilePath == NULL) {
122 return; /* Couldn't get path: error message already output. */
123 }
124
125 LocalPointer<UXMLParser> parser(UXMLParser::createParser(status));
126 TEST_ASSERT_SUCCESS(status);
127 LocalPointer<UXMLElement> root(parser->parseFile(testFilePath, status));
128 TEST_ASSERT_SUCCESS(status);
129 if (U_FAILURE(status)) {
130 return;
131 }
132
133 const UnicodeString *debugTestCase = root->getAttribute("debug");
134 if (debugTestCase != NULL) {
135 // setenv("USEARCH_DEBUG", "1", 1);
136 }
137
138
139 const UXMLElement *testCase;
140 int32_t tc = 0;
141
142 while((testCase = root->nextChildElement(tc)) != NULL) {
143
144 if (testCase->getTagName().compare("test-case") != 0) {
145 errln("ssearch, unrecognized XML Element in test file");
146 continue;
147 }
148 const UnicodeString *id = testCase->getAttribute("id");
149 *testId = 0;
150 if (id != NULL) {
151 id->extract(0, id->length(), testId, sizeof(testId), US_INV);
152 }
153
154 // If debugging test case has been specified and this is not it, skip to next.
155 if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) {
156 continue;
157 }
158 //
159 // Get the requested collation strength.
160 // Default is tertiary if the XML attribute is missing from the test case.
161 //
162 const UnicodeString *strength = testCase->getAttribute("strength");
163 UColAttributeValue collatorStrength = UCOL_PRIMARY;
164 if (strength==NULL) { collatorStrength = UCOL_TERTIARY;}
165 else if (*strength=="PRIMARY") { collatorStrength = UCOL_PRIMARY;}
166 else if (*strength=="SECONDARY") { collatorStrength = UCOL_SECONDARY;}
167 else if (*strength=="TERTIARY") { collatorStrength = UCOL_TERTIARY;}
168 else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;}
169 else if (*strength=="IDENTICAL") { collatorStrength = UCOL_IDENTICAL;}
170 else {
171 // Bogus value supplied for strength. Shouldn't happen, even from
172 // typos, if the XML source has been validated.
173 // This assert is a little deceiving in that strength can be
174 // any of the allowed values, not just TERTIARY, but it will
175 // do the job of getting the error output.
176 TEST_ASSERT(*strength=="TERTIARY")
177 }
178
179 //
180 // Get the collator normalization flag. Default is UCOL_OFF.
181 //
182 UColAttributeValue normalize = UCOL_OFF;
183 const UnicodeString *norm = testCase->getAttribute("norm");
184 TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF");
185 if (norm!=NULL && *norm=="ON") {
186 normalize = UCOL_ON;
187 }
188
189 //
190 // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE.
191 //
192 UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE;
193 const UnicodeString *alt = testCase->getAttribute("alternate_handling");
194 TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE");
195 if (alt != NULL && *alt == "SHIFTED") {
196 alternateHandling = UCOL_SHIFTED;
197 }
198
199 const UnicodeString defLocale("en");
200 char clocale[100];
201 const UnicodeString *locale = testCase->getAttribute("locale");
202 if (locale == NULL || locale->length()==0) {
203 locale = &defLocale;
204 };
205 locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL);
206
207
208 UnicodeString text;
209 UnicodeString target;
210 UnicodeString pattern;
211 int32_t expectedMatchStart = -1;
212 int32_t expectedMatchLimit = -1;
213 const UXMLElement *n;
214 int32_t nodeCount = 0;
215
216 n = testCase->getChildElement("pattern");
217 TEST_ASSERT(n != NULL);
218 if (n==NULL) {
219 continue;
220 }
221 text = n->getText(FALSE);
222 text = text.unescape();
223 pattern.append(text);
224 nodeCount++;
225
226 n = testCase->getChildElement("pre");
227 if (n!=NULL) {
228 text = n->getText(FALSE);
229 text = text.unescape();
230 target.append(text);
231 nodeCount++;
232 }
233
234 n = testCase->getChildElement("m");
235 if (n!=NULL) {
236 expectedMatchStart = target.length();
237 text = n->getText(FALSE);
238 text = text.unescape();
239 target.append(text);
240 expectedMatchLimit = target.length();
241 nodeCount++;
242 }
243
244 n = testCase->getChildElement("post");
245 if (n!=NULL) {
246 text = n->getText(FALSE);
247 text = text.unescape();
248 target.append(text);
249 nodeCount++;
250 }
251
252 // Check that there weren't extra things in the XML
253 TEST_ASSERT(nodeCount == testCase->countChildren());
254
255 // Open a collator and StringSearch based on the parameters
256 // obtained from the XML.
257 //
258 status = U_ZERO_ERROR;
259 LocalUCollatorPointer collator(ucol_open(clocale, &status));
260 ucol_setStrength(collator.getAlias(), collatorStrength);
261 ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status);
262 ucol_setAttribute(collator.getAlias(), UCOL_ALTERNATE_HANDLING, alternateHandling, &status);
263 LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
264 target.getBuffer(), target.length(),
265 collator.getAlias(),
266 NULL, // the break iterator
267 &status));
268
269 TEST_ASSERT_SUCCESS(status);
270 if (U_FAILURE(status)) {
271 continue;
272 }
273
274 int32_t foundStart = 0;
275 int32_t foundLimit = 0;
276 UBool foundMatch;
277
278 //
279 // Do the search, check the match result against the expected results.
280 //
281 foundMatch= usearch_search(uss.getAlias(), 0, &foundStart, &foundLimit, &status);
282 TEST_ASSERT_SUCCESS(status);
283 if ((foundMatch && expectedMatchStart<0) ||
284 (foundStart != expectedMatchStart) ||
285 (foundLimit != expectedMatchLimit)) {
286 TEST_ASSERT(FALSE); // ouput generic error position
287 infoln("Found, expected match start = %d, %d \n"
288 "Found, expected match limit = %d, %d",
289 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
290 }
291
292 // In case there are other matches...
293 // (should we only do this if the test case passed?)
294 while (foundMatch) {
295 expectedMatchStart = foundStart;
296 expectedMatchLimit = foundLimit;
297
298 foundMatch = usearch_search(uss.getAlias(), foundLimit, &foundStart, &foundLimit, &status);
299 }
300
301 uss.adoptInstead(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
302 target.getBuffer(), target.length(),
303 collator.getAlias(),
304 NULL,
305 &status));
306
307 //
308 // Do the backwards search, check the match result against the expected results.
309 //
310 foundMatch= usearch_searchBackwards(uss.getAlias(), target.length(), &foundStart, &foundLimit, &status);
311 TEST_ASSERT_SUCCESS(status);
312 if ((foundMatch && expectedMatchStart<0) ||
313 (foundStart != expectedMatchStart) ||
314 (foundLimit != expectedMatchLimit)) {
315 TEST_ASSERT(FALSE); // ouput generic error position
316 infoln("Found, expected backwards match start = %d, %d \n"
317 "Found, expected backwards match limit = %d, %d",
318 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
319 }
320 }
321 #endif
322 }
323
324 struct Order
325 {
326 int32_t order;
327 int32_t lowOffset;
328 int32_t highOffset;
329 };
330
331 class OrderList
332 {
333 public:
334 OrderList();
335 OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset = 0);
336 ~OrderList();
337
338 int32_t size(void) const;
339 void add(int32_t order, int32_t low, int32_t high);
340 const Order *get(int32_t index) const;
341 int32_t getLowOffset(int32_t index) const;
342 int32_t getHighOffset(int32_t index) const;
343 int32_t getOrder(int32_t index) const;
344 void reverse(void);
345 UBool compare(const OrderList &other) const;
346 UBool matchesAt(int32_t offset, const OrderList &other) const;
347
348 private:
349 Order *list;
350 int32_t listMax;
351 int32_t listSize;
352 };
353
OrderList()354 OrderList::OrderList()
355 : list(NULL), listMax(16), listSize(0)
356 {
357 list = new Order[listMax];
358 }
359
OrderList(UCollator * coll,const UnicodeString & string,int32_t stringOffset)360 OrderList::OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset)
361 : list(NULL), listMax(16), listSize(0)
362 {
363 UErrorCode status = U_ZERO_ERROR;
364 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
365 uint32_t strengthMask = 0;
366 int32_t order, low, high;
367
368 switch (ucol_getStrength(coll))
369 {
370 default:
371 strengthMask |= UCOL_TERTIARYORDERMASK;
372 U_FALLTHROUGH;
373 case UCOL_SECONDARY:
374 strengthMask |= UCOL_SECONDARYORDERMASK;
375 U_FALLTHROUGH;
376 case UCOL_PRIMARY:
377 strengthMask |= UCOL_PRIMARYORDERMASK;
378 }
379
380 list = new Order[listMax];
381
382 ucol_setOffset(elems, stringOffset, &status);
383
384 do {
385 low = ucol_getOffset(elems);
386 order = ucol_next(elems, &status);
387 high = ucol_getOffset(elems);
388
389 if (order != UCOL_NULLORDER) {
390 order &= strengthMask;
391 }
392
393 if (order != UCOL_IGNORABLE) {
394 add(order, low, high);
395 }
396 } while (order != UCOL_NULLORDER);
397
398 ucol_closeElements(elems);
399 }
400
~OrderList()401 OrderList::~OrderList()
402 {
403 delete[] list;
404 }
405
add(int32_t order,int32_t low,int32_t high)406 void OrderList::add(int32_t order, int32_t low, int32_t high)
407 {
408 if (listSize >= listMax) {
409 listMax *= 2;
410
411 Order *newList = new Order[listMax];
412
413 uprv_memcpy(newList, list, listSize * sizeof(Order));
414 delete[] list;
415 list = newList;
416 }
417
418 list[listSize].order = order;
419 list[listSize].lowOffset = low;
420 list[listSize].highOffset = high;
421
422 listSize += 1;
423 }
424
get(int32_t index) const425 const Order *OrderList::get(int32_t index) const
426 {
427 if (index >= listSize) {
428 return NULL;
429 }
430
431 return &list[index];
432 }
433
getLowOffset(int32_t index) const434 int32_t OrderList::getLowOffset(int32_t index) const
435 {
436 const Order *order = get(index);
437
438 if (order != NULL) {
439 return order->lowOffset;
440 }
441
442 return -1;
443 }
444
getHighOffset(int32_t index) const445 int32_t OrderList::getHighOffset(int32_t index) const
446 {
447 const Order *order = get(index);
448
449 if (order != NULL) {
450 return order->highOffset;
451 }
452
453 return -1;
454 }
455
getOrder(int32_t index) const456 int32_t OrderList::getOrder(int32_t index) const
457 {
458 const Order *order = get(index);
459
460 if (order != NULL) {
461 return order->order;
462 }
463
464 return UCOL_NULLORDER;
465 }
466
size() const467 int32_t OrderList::size() const
468 {
469 return listSize;
470 }
471
reverse()472 void OrderList::reverse()
473 {
474 for(int32_t f = 0, b = listSize - 1; f < b; f += 1, b -= 1) {
475 Order swap = list[b];
476
477 list[b] = list[f];
478 list[f] = swap;
479 }
480 }
481
compare(const OrderList & other) const482 UBool OrderList::compare(const OrderList &other) const
483 {
484 if (listSize != other.listSize) {
485 return FALSE;
486 }
487
488 for(int32_t i = 0; i < listSize; i += 1) {
489 if (list[i].order != other.list[i].order ||
490 list[i].lowOffset != other.list[i].lowOffset ||
491 list[i].highOffset != other.list[i].highOffset) {
492 return FALSE;
493 }
494 }
495
496 return TRUE;
497 }
498
matchesAt(int32_t offset,const OrderList & other) const499 UBool OrderList::matchesAt(int32_t offset, const OrderList &other) const
500 {
501 // NOTE: sizes include the NULLORDER, which we don't want to compare.
502 int32_t otherSize = other.size() - 1;
503
504 if (listSize - 1 - offset < otherSize) {
505 return FALSE;
506 }
507
508 for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) {
509 if (getOrder(i) != other.getOrder(j)) {
510 return FALSE;
511 }
512 }
513
514 return TRUE;
515 }
516
printOffsets(char * buffer,OrderList & list)517 static char *printOffsets(char *buffer, OrderList &list)
518 {
519 int32_t size = list.size();
520 char *s = buffer;
521
522 for(int32_t i = 0; i < size; i += 1) {
523 const Order *order = list.get(i);
524
525 if (i != 0) {
526 s += sprintf(s, ", ");
527 }
528
529 s += sprintf(s, "(%d, %d)", order->lowOffset, order->highOffset);
530 }
531
532 return buffer;
533 }
534
printOrders(char * buffer,OrderList & list)535 static char *printOrders(char *buffer, OrderList &list)
536 {
537 int32_t size = list.size();
538 char *s = buffer;
539
540 for(int32_t i = 0; i < size; i += 1) {
541 const Order *order = list.get(i);
542
543 if (i != 0) {
544 s += sprintf(s, ", ");
545 }
546
547 s += sprintf(s, "%8.8X", order->order);
548 }
549
550 return buffer;
551 }
552
offsetTest()553 void SSearchTest::offsetTest()
554 {
555 const char *test[] = {
556 // The sequence \u0FB3\u0F71\u0F71\u0F80 contains a discontiguous
557 // contraction (\u0FB3\u0F71\u0F80) logically followed by \u0F71.
558 "\\u1E33\\u0FB3\\u0F71\\u0F71\\u0F80\\uD835\\uDF6C\\u01B0",
559
560 "\\ua191\\u16ef\\u2036\\u017a",
561
562 #if 0
563 // This results in a complex interaction between contraction,
564 // expansion and normalization that confuses the backwards offset fixups.
565 "\\u0F7F\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
566 #endif
567
568 "\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
569 "\\u07E9\\u07EA\\u07F1\\u07F2\\u07F3",
570
571 "\\u02FE\\u02FF"
572 "\\u0300\\u0301\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u0309\\u030A\\u030B\\u030C\\u030D\\u030E\\u030F"
573 "\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\\u031A\\u031B\\u031C\\u031D\\u031E\\u031F"
574 "\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F"
575 "\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\\u033A\\u033B\\u033C\\u033D\\u033E\\u033F"
576 "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\\u034A\\u034B\\u034C\\u034D\\u034E", // currently not working, see #8081
577
578 "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318", // currently not working, see #8081
579 "a\\u02FF\\u0301\\u0316", // currently not working, see #8081
580 "a\\u02FF\\u0316\\u0301",
581 "a\\u0430\\u0301\\u0316",
582 "a\\u0430\\u0316\\u0301",
583 "abc\\u0E41\\u0301\\u0316",
584 "abc\\u0E41\\u0316\\u0301",
585 "\\u0E41\\u0301\\u0316",
586 "\\u0E41\\u0316\\u0301",
587 "a\\u0301\\u0316",
588 "a\\u0316\\u0301",
589 "\\uAC52\\uAC53",
590 "\\u34CA\\u34CB",
591 "\\u11ED\\u11EE",
592 "\\u30C3\\u30D0",
593 "p\\u00E9ch\\u00E9",
594 "a\\u0301\\u0325",
595 "a\\u0300\\u0325",
596 "a\\u0325\\u0300",
597 "A\\u0323\\u0300B",
598 "A\\u0300\\u0323B",
599 "A\\u0301\\u0323B",
600 "A\\u0302\\u0301\\u0323B",
601 "abc",
602 "ab\\u0300c",
603 "ab\\u0300\\u0323c",
604 " \\uD800\\uDC00\\uDC00",
605 "a\\uD800\\uDC00\\uDC00",
606 "A\\u0301\\u0301",
607 "A\\u0301\\u0323",
608 "A\\u0301\\u0323B",
609 "B\\u0301\\u0323C",
610 "A\\u0300\\u0323B",
611 "\\u0301A\\u0301\\u0301",
612 "abcd\\r\\u0301",
613 "p\\u00EAche",
614 "pe\\u0302che",
615 };
616
617 int32_t testCount = UPRV_LENGTHOF(test);
618 UErrorCode status = U_ZERO_ERROR;
619 RuleBasedCollator *col = (RuleBasedCollator *) Collator::createInstance(Locale::getEnglish(), status);
620 if (U_FAILURE(status)) {
621 errcheckln(status, "Failed to create collator in offsetTest! - %s", u_errorName(status));
622 return;
623 }
624 char buffer[4096]; // A bit of a hack... just happens to be long enough for all the test cases...
625 // We could allocate one that's the right size by (CE_count * 10) + 2
626 // 10 chars is enough room for 8 hex digits plus ", ". 2 extra chars for "[" and "]"
627
628 col->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
629
630 for(int32_t i = 0; i < testCount; i += 1) {
631 UnicodeString ts = CharsToUnicodeString(test[i]);
632 CollationElementIterator *iter = col->createCollationElementIterator(ts);
633 OrderList forwardList;
634 OrderList backwardList;
635 int32_t order, low, high;
636
637 do {
638 low = iter->getOffset();
639 order = iter->next(status);
640 high = iter->getOffset();
641
642 forwardList.add(order, low, high);
643 } while (order != CollationElementIterator::NULLORDER);
644
645 iter->reset();
646 iter->setOffset(ts.length(), status);
647
648 backwardList.add(CollationElementIterator::NULLORDER, iter->getOffset(), iter->getOffset());
649
650 do {
651 high = iter->getOffset();
652 order = iter->previous(status);
653 low = iter->getOffset();
654
655 if (order == CollationElementIterator::NULLORDER) {
656 break;
657 }
658
659 backwardList.add(order, low, high);
660 } while (TRUE);
661
662 backwardList.reverse();
663
664 if (forwardList.compare(backwardList)) {
665 logln("Works with \"%s\"", test[i]);
666 logln("Forward offsets: [%s]", printOffsets(buffer, forwardList));
667 // logln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
668
669 logln("Forward CEs: [%s]", printOrders(buffer, forwardList));
670 // logln("Backward CEs: [%s]", printOrders(buffer, backwardList));
671
672 logln();
673 } else {
674 errln("Fails with \"%s\"", test[i]);
675 infoln("Forward offsets: [%s]", printOffsets(buffer, forwardList));
676 infoln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
677
678 infoln("Forward CEs: [%s]", printOrders(buffer, forwardList));
679 infoln("Backward CEs: [%s]", printOrders(buffer, backwardList));
680
681 infoln();
682 }
683 delete iter;
684 }
685 delete col;
686 }
687
#if 0
// Currently compiled out.
// Appends a printable form of string to buffer: characters U+0020..U+007F are
// copied as they are (a backslash is doubled), everything else is written as
// a \uXXXX or \UXXXXXXXX escape. Returns buffer.
static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer)
{
    for(int32_t pos = 0; pos < string.length(); pos += 1) {
        UChar32 ch = string.char32At(pos);
        const UBool printable = (ch >= 0x0020 && ch <= 0x007F);

        if (!printable) {
            char cbuffer[12];

            if (ch <= 0xFFFFL) {
                sprintf(cbuffer, "\\u%4.4X", ch);
            } else {
                sprintf(cbuffer, "\\U%8.8X", ch);
            }

            buffer.append(cbuffer);
        } else if (ch == 0x005C) {
            buffer.append("\\\\");
        } else {
            buffer.append(ch);
        }

        // A supplementary code point occupies two code units; skip the second.
        if (ch >= 0x10000L) {
            pos += 1;
        }
    }

    return buffer;
}
#endif
720
sharpSTest()721 void SSearchTest::sharpSTest()
722 {
723 UErrorCode status = U_ZERO_ERROR;
724 UCollator *coll = NULL;
725 UnicodeString lp = "fuss";
726 UnicodeString sp = "fu\\u00DF";
727 UnicodeString targets[] = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball",
728 "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF",
729 "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"};
730 int32_t start = -1, end = -1;
731
732 coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status);
733 TEST_ASSERT_SUCCESS(status);
734
735 UnicodeString lpUnescaped = lp.unescape();
736 UnicodeString spUnescaped = sp.unescape();
737
738 LocalUStringSearchPointer ussLong(usearch_openFromCollator(lpUnescaped.getBuffer(), lpUnescaped.length(),
739 lpUnescaped.getBuffer(), lpUnescaped.length(), // actual test data will be set later
740 coll,
741 NULL, // the break iterator
742 &status));
743
744 LocalUStringSearchPointer ussShort(usearch_openFromCollator(spUnescaped.getBuffer(), spUnescaped.length(),
745 spUnescaped.getBuffer(), spUnescaped.length(), // actual test data will be set later
746 coll,
747 NULL, // the break iterator
748 &status));
749 TEST_ASSERT_SUCCESS(status);
750
751 for (uint32_t t = 0; t < UPRV_LENGTHOF(targets); t += 1) {
752 UBool bFound;
753 UnicodeString target = targets[t].unescape();
754
755 start = end = -1;
756 usearch_setText(ussLong.getAlias(), target.getBuffer(), target.length(), &status);
757 bFound = usearch_search(ussLong.getAlias(), 0, &start, &end, &status);
758 TEST_ASSERT_SUCCESS(status);
759 if (bFound) {
760 logln("Test %d: found long pattern at [%d, %d].", t, start, end);
761 } else {
762 dataerrln("Test %d: did not find long pattern.", t);
763 }
764
765 usearch_setText(ussShort.getAlias(), target.getBuffer(), target.length(), &status);
766 bFound = usearch_search(ussShort.getAlias(), 0, &start, &end, &status);
767 TEST_ASSERT_SUCCESS(status);
768 if (bFound) {
769 logln("Test %d: found long pattern at [%d, %d].", t, start, end);
770 } else {
771 dataerrln("Test %d: did not find long pattern.", t);
772 }
773 }
774
775 ucol_close(coll);
776 }
777
goodSuffixTest()778 void SSearchTest::goodSuffixTest()
779 {
780 UErrorCode status = U_ZERO_ERROR;
781 UCollator *coll = NULL;
782 UnicodeString pat = /*"gcagagag"*/ "fxeld";
783 UnicodeString target = /*"gcatcgcagagagtatacagtacg"*/ "cloveldfxeld";
784 int32_t start = -1, end = -1;
785 UBool bFound;
786
787 coll = ucol_open(NULL, &status);
788 TEST_ASSERT_SUCCESS(status);
789
790 LocalUStringSearchPointer ss(usearch_openFromCollator(pat.getBuffer(), pat.length(),
791 target.getBuffer(), target.length(),
792 coll,
793 NULL, // the break iterator
794 &status));
795 TEST_ASSERT_SUCCESS(status);
796
797 bFound = usearch_search(ss.getAlias(), 0, &start, &end, &status);
798 TEST_ASSERT_SUCCESS(status);
799 if (bFound) {
800 logln("Found pattern at [%d, %d].", start, end);
801 } else {
802 dataerrln("Did not find pattern.");
803 }
804
805 ucol_close(coll);
806 }
807
808 //
809 // searchTime() A quick and dirty performance test for string search.
810 // Probably doesn't really belong as part of intltest, but it
811 // does check that the search succeeds, and gets the right result,
812 // so it serves as a functionality test also.
813 //
814 // To run as a perf test, up the loop count, select by commenting
815 // and uncommenting in the code the operation to be measured,
816 // rebuild, and measure the running time of this test alone.
817 //
818 // time LD_LIBRARY_PATH=whatever ./intltest collate/SSearchTest/searchTime
819 //
searchTime()820 void SSearchTest::searchTime() {
821 static const char *longishText =
822 "Whylom, as olde stories tellen us,\n"
823 "Ther was a duk that highte Theseus:\n"
824 "Of Athenes he was lord and governour,\n"
825 "And in his tyme swich a conquerour,\n"
826 "That gretter was ther noon under the sonne.\n"
827 "Ful many a riche contree hadde he wonne;\n"
828 "What with his wisdom and his chivalrye,\n"
829 "He conquered al the regne of Femenye,\n"
830 "That whylom was y-cleped Scithia;\n"
831 "And weddede the quene Ipolita,\n"
832 "And broghte hir hoom with him in his contree\n"
833 "With muchel glorie and greet solempnitee,\n"
834 "And eek hir yonge suster Emelye.\n"
835 "And thus with victorie and with melodye\n"
836 "Lete I this noble duk to Athenes ryde,\n"
837 "And al his hoost, in armes, him bisyde.\n"
838 "And certes, if it nere to long to here,\n"
839 "I wolde han told yow fully the manere,\n"
840 "How wonnen was the regne of Femenye\n"
841 "By Theseus, and by his chivalrye;\n"
842 "And of the grete bataille for the nones\n"
843 "Bitwixen Athen's and Amazones;\n"
844 "And how asseged was Ipolita,\n"
845 "The faire hardy quene of Scithia;\n"
846 "And of the feste that was at hir weddinge,\n"
847 "And of the tempest at hir hoom-cominge;\n"
848 "But al that thing I moot as now forbere.\n"
849 "I have, God woot, a large feeld to ere,\n"
850 "And wayke been the oxen in my plough.\n"
851 "The remenant of the tale is long y-nough.\n"
852 "I wol nat letten eek noon of this route;\n"
853 "Lat every felawe telle his tale aboute,\n"
854 "And lat see now who shal the soper winne;\n"
855 "And ther I lefte, I wol ageyn biginne.\n"
856 "This duk, of whom I make mencioun,\n"
857 "When he was come almost unto the toun,\n"
858 "In al his wele and in his moste pryde,\n"
859 "He was war, as he caste his eye asyde,\n"
860 "Wher that ther kneled in the hye weye\n"
861 "A companye of ladies, tweye and tweye,\n"
862 "Ech after other, clad in clothes blake; \n"
863 "But swich a cry and swich a wo they make,\n"
864 "That in this world nis creature livinge,\n"
865 "That herde swich another weymentinge;\n"
866 "And of this cry they nolde never stenten,\n"
867 "Til they the reynes of his brydel henten.\n"
868 "'What folk ben ye, that at myn hoomcominge\n"
869 "Perturben so my feste with cryinge'?\n"
870 "Quod Theseus, 'have ye so greet envye\n"
871 "Of myn honour, that thus compleyne and crye? \n"
872 "Or who hath yow misboden, or offended?\n"
873 "And telleth me if it may been amended;\n"
874 "And why that ye ben clothed thus in blak'?\n"
875 "The eldest lady of hem alle spak,\n"
876 "When she hadde swowned with a deedly chere,\n"
877 "That it was routhe for to seen and here,\n"
878 "And seyde: 'Lord, to whom Fortune hath yiven\n"
879 "Victorie, and as a conquerour to liven,\n"
880 "Noght greveth us your glorie and your honour;\n"
881 "But we biseken mercy and socour.\n"
882 "Have mercy on our wo and our distresse.\n"
883 "Som drope of pitee, thurgh thy gentilesse,\n"
884 "Up-on us wrecched wommen lat thou falle.\n"
885 "For certes, lord, ther nis noon of us alle,\n"
886 "That she nath been a duchesse or a quene;\n"
887 "Now be we caitifs, as it is wel sene:\n"
888 "Thanked be Fortune, and hir false wheel,\n"
889 "That noon estat assureth to be weel.\n"
890 "And certes, lord, t'abyden your presence,\n"
891 "Here in the temple of the goddesse Clemence\n"
892 "We han ben waytinge al this fourtenight;\n"
893 "Now help us, lord, sith it is in thy might.\n"
894 "I wrecche, which that wepe and waille thus,\n"
895 "Was whylom wyf to king Capaneus,\n"
896 "That starf at Thebes, cursed be that day!\n"
897 "And alle we, that been in this array,\n"
898 "And maken al this lamentacioun,\n"
899 "We losten alle our housbondes at that toun,\n"
900 "Whyl that the sege ther-aboute lay.\n"
901 "And yet now th'olde Creon, weylaway!\n"
902 "The lord is now of Thebes the citee, \n"
903 "Fulfild of ire and of iniquitee,\n"
904 "He, for despyt, and for his tirannye,\n"
905 "To do the dede bodyes vileinye,\n"
906 "Of alle our lordes, whiche that ben slawe,\n"
907 "Hath alle the bodyes on an heep y-drawe,\n"
908 "And wol nat suffren hem, by noon assent,\n"
909 "Neither to been y-buried nor y-brent,\n"
910 "But maketh houndes ete hem in despyt. zet'\n";
911
912 const char *cPattern = "maketh houndes ete hem";
913 //const char *cPattern = "Whylom";
914 //const char *cPattern = "zet";
915 const char *testId = "searchTime()"; // for error macros.
916 UnicodeString target = longishText;
917 UErrorCode status = U_ZERO_ERROR;
918
919
920 LocalUCollatorPointer collator(ucol_open("en", &status));
921 //ucol_setStrength(collator.getAlias(), collatorStrength);
922 //ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status);
923 UnicodeString uPattern = cPattern;
924 LocalUStringSearchPointer uss(usearch_openFromCollator(uPattern.getBuffer(), uPattern.length(),
925 target.getBuffer(), target.length(),
926 collator.getAlias(),
927 NULL, // the break iterator
928 &status));
929 TEST_ASSERT_SUCCESS(status);
930
931 // int32_t foundStart;
932 // int32_t foundEnd;
933 UBool found;
934
935 // Find the match position usgin strstr
936 const char *pm = strstr(longishText, cPattern);
937 TEST_ASSERT_M(pm!=NULL, "No pattern match with strstr");
938 int32_t refMatchPos = (int32_t)(pm - longishText);
939 int32_t icuMatchPos;
940 int32_t icuMatchEnd;
941 usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status);
942 TEST_ASSERT_SUCCESS(status);
943 TEST_ASSERT_M(refMatchPos == icuMatchPos, "strstr and icu give different match positions.");
944
945 int32_t i;
946 // int32_t j=0;
947
948 // Try loopcounts around 100000 to some millions, depending on the operation,
949 // to get runtimes of at least several seconds.
950 for (i=0; i<10000; i++) {
951 found = usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status);
952 (void)found; // Suppress set but not used warning.
953 //TEST_ASSERT_SUCCESS(status);
954 //TEST_ASSERT(found);
955
956 // usearch_setOffset(uss.getAlias(), 0, &status);
957 // icuMatchPos = usearch_next(uss.getAlias(), &status);
958
959 // The i+j stuff is to confuse the optimizer and get it to actually leave the
960 // call to strstr in place.
961 //pm = strstr(longishText+j, cPattern);
962 //j = (j + i)%5;
963 }
964
965 //printf("%ld, %d\n", pm-longishText, j);
966 }
967
968 //----------------------------------------------------------------------------------------
969 //
970 // Random Numbers. Similar to standard lib rand() and srand()
971 // Not using library to
972 // 1. Get same results on all platforms.
973 // 2. Get access to current seed, to more easily reproduce failures.
974 //
975 //---------------------------------------------------------------------------------------
// Current state of the generator. Kept at file scope so the seed in use can
// be read back and set again to reproduce a failure.
static uint32_t m_seed = 1;

// Advances the linear congruential generator and returns the next
// pseudo-random value, always in the range 0..32767 (bits 16..30 of the state).
static uint32_t m_rand()
{
    m_seed = m_seed * 1103515245 + 12345;

    const uint32_t highBits = m_seed >> 16;   // same as dividing by 65536

    return highBits & 0x7FFF;                 // same as taking modulo 32768
}
983
984 class Monkey
985 {
986 public:
987 virtual void append(UnicodeString &test, UnicodeString &alternate) = 0;
988
989 protected:
990 Monkey();
991 virtual ~Monkey();
992 };
993
Monkey()994 Monkey::Monkey()
995 {
996 // ook?
997 }
998
~Monkey()999 Monkey::~Monkey()
1000 {
1001 // ook?
1002 }
1003
1004 class SetMonkey : public Monkey
1005 {
1006 public:
1007 SetMonkey(const USet *theSet);
1008 ~SetMonkey();
1009
1010 virtual void append(UnicodeString &test, UnicodeString &alternate);
1011
1012 private:
1013 const USet *set;
1014 };
1015
SetMonkey(const USet * theSet)1016 SetMonkey::SetMonkey(const USet *theSet)
1017 : Monkey(), set(theSet)
1018 {
1019 // ook?
1020 }
1021
~SetMonkey()1022 SetMonkey::~SetMonkey()
1023 {
1024 //ook...
1025 }
1026
append(UnicodeString & test,UnicodeString & alternate)1027 void SetMonkey::append(UnicodeString &test, UnicodeString &alternate)
1028 {
1029 int32_t size = uset_size(set);
1030 int32_t index = m_rand() % size;
1031 UChar32 ch = uset_charAt(set, index);
1032 UnicodeString str(ch);
1033
1034 test.append(str);
1035 alternate.append(str); // flip case, or some junk?
1036 }
1037
1038 class StringSetMonkey : public Monkey
1039 {
1040 public:
1041 StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData);
1042 ~StringSetMonkey();
1043
1044 void append(UnicodeString &testCase, UnicodeString &alternate);
1045
1046 private:
1047 UnicodeString &generateAlternative(const UnicodeString &testCase, UnicodeString &alternate);
1048
1049 const USet *set;
1050 UCollator *coll;
1051 CollData *collData;
1052 };
1053
StringSetMonkey(const USet * theSet,UCollator * theCollator,CollData * theCollData)1054 StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData)
1055 : Monkey(), set(theSet), coll(theCollator), collData(theCollData)
1056 {
1057 // ook.
1058 }
1059
~StringSetMonkey()1060 StringSetMonkey::~StringSetMonkey()
1061 {
1062 // ook?
1063 }
1064
append(UnicodeString & testCase,UnicodeString & alternate)1065 void StringSetMonkey::append(UnicodeString &testCase, UnicodeString &alternate)
1066 {
1067 int32_t itemCount = uset_getItemCount(set), len = 0;
1068 int32_t index = m_rand() % itemCount;
1069 UChar32 rangeStart = 0, rangeEnd = 0;
1070 UChar buffer[16];
1071 UErrorCode err = U_ZERO_ERROR;
1072
1073 len = uset_getItem(set, index, &rangeStart, &rangeEnd, buffer, 16, &err);
1074
1075 if (len == 0) {
1076 int32_t offset = m_rand() % (rangeEnd - rangeStart + 1);
1077 UChar32 ch = rangeStart + offset;
1078 UnicodeString str(ch);
1079
1080 testCase.append(str);
1081 generateAlternative(str, alternate);
1082 } else if (len > 0) {
1083 // should check that len < 16...
1084 UnicodeString str(buffer, len);
1085
1086 testCase.append(str);
1087 generateAlternative(str, alternate);
1088 } else {
1089 // shouldn't happen...
1090 }
1091 }
1092
// Appends to `alternate` a string meant to yield the same collation elements
// as `testCase`, and returns `alternate`.
//
// The CE list of `testCase` is walked from the front. At each position the
// strings that CollData associates with the CE found there are sampled at
// random until one is found whose CE list matches at that position; that
// string is added to the candidate and the position advances by the length
// of its CE list. (This finds a roughly right answer; a dynamic programming
// approach would be needed for the shortest string covering the longest CE
// sequence.)
//
// `testCase` itself is appended instead of a candidate when it has no CEs,
// when no strings are known for a CE, when the random sampling gives up, or
// when the finished candidate does not reproduce the original CE list.
UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCase, UnicodeString &alternate)
{
    UErrorCode status = U_ZERO_ERROR;
    CEList sourceCEs(coll, testCase, status);
    UnicodeString candidate;

    if (sourceCEs.size() == 0) {
        return alternate.append(testCase);
    }

    for (int32_t pos = 0; pos < sourceCEs.size(); ) {
        int32_t leadCE = sourceCEs.get(pos);
        const StringList *choices = collData->getStringList(leadCE);

        if (choices == NULL) {
            return alternate.append(testCase);
        }

        const int32_t choiceCount = choices->size();
        const UnicodeString *picked = NULL;
        const CEList *pickedCEs = NULL;

        // Sample random strings until one generates matching CEs. The random
        // number is drawn before the give-up test on every round.
        for (int32_t attempts = 0; ; attempts += 1) {
            int32_t pick = m_rand() % choiceCount;

            if (attempts > choiceCount) {
                return alternate.append(testCase);
            }

            picked    = choices->get(pick);
            pickedCEs = collData->getCEList(picked);

            if (sourceCEs.matchesAt(pos, pickedCEs)) {
                break;
            }

            collData->freeCEList(pickedCEs);
        }

        candidate.append(*picked);
        pos += pickedCEs->size();
        collData->freeCEList(pickedCEs);
    }

    // Only accept the candidate if it really reproduces the original CEs.
    const CEList candidateCEs(coll, candidate, status);

    if (! sourceCEs.matchesAt(0, &candidateCEs)) {
        return alternate.append(testCase);
    }

    return alternate.append(candidate);
}
1152
generateTestCase(UCollator * coll,Monkey * monkeys[],int32_t monkeyCount,UnicodeString & testCase,UnicodeString & alternate)1153 static void generateTestCase(UCollator *coll, Monkey *monkeys[], int32_t monkeyCount, UnicodeString &testCase, UnicodeString &alternate)
1154 {
1155 int32_t pieces = (m_rand() % 4) + 1;
1156 UErrorCode status = U_ZERO_ERROR;
1157 UBool matches;
1158
1159 do {
1160 testCase.remove();
1161 alternate.remove();
1162 monkeys[0]->append(testCase, alternate);
1163
1164 for(int32_t piece = 0; piece < pieces; piece += 1) {
1165 int32_t monkey = m_rand() % monkeyCount;
1166
1167 monkeys[monkey]->append(testCase, alternate);
1168 }
1169
1170 const CEList ceTest(coll, testCase, status);
1171 const CEList ceAlt(coll, alternate, status);
1172
1173 matches = ceTest.matchesAt(0, &ceAlt);
1174 } while (! matches);
1175 }
1176
simpleSearch(UCollator * coll,const UnicodeString & target,int32_t offset,const UnicodeString & pattern,int32_t & matchStart,int32_t & matchEnd)1177 static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd)
1178 {
1179 UErrorCode status = U_ZERO_ERROR;
1180 OrderList targetOrders(coll, target, offset);
1181 OrderList patternOrders(coll, pattern);
1182 int32_t targetSize = targetOrders.size() - 1;
1183 int32_t patternSize = patternOrders.size() - 1;
1184 UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
1185 target.getBuffer(), target.length(), &status);
1186
1187 if (patternSize == 0) {
1188 // Searching for an empty pattern always fails
1189 matchStart = matchEnd = -1;
1190 ubrk_close(charBreakIterator);
1191 return FALSE;
1192 }
1193
1194 matchStart = matchEnd = -1;
1195
1196 for(int32_t i = 0; i < targetSize; i += 1) {
1197 if (targetOrders.matchesAt(i, patternOrders)) {
1198 int32_t start = targetOrders.getLowOffset(i);
1199 int32_t maxLimit = targetOrders.getLowOffset(i + patternSize);
1200 int32_t minLimit = targetOrders.getLowOffset(i + patternSize - 1);
1201
1202 // if the low and high offsets of the first CE in
1203 // the match are the same, it means that the match
1204 // starts in the middle of an expansion - all but
1205 // the first CE of the expansion will have the offset
1206 // of the following character.
1207 if (start == targetOrders.getHighOffset(i)) {
1208 continue;
1209 }
1210
1211 // Make sure match starts on a grapheme boundary
1212 if (! ubrk_isBoundary(charBreakIterator, start)) {
1213 continue;
1214 }
1215
1216 // If the low and high offsets of the CE after the match
1217 // are the same, it means that the match ends in the middle
1218 // of an expansion sequence.
1219 if (maxLimit == targetOrders.getHighOffset(i + patternSize) &&
1220 targetOrders.getOrder(i + patternSize) != UCOL_NULLORDER) {
1221 continue;
1222 }
1223
1224 int32_t mend = maxLimit;
1225
1226 // Find the first grapheme break after the character index
1227 // of the last CE in the match. If it's after character index
1228 // that's after the last CE in the match, use that index
1229 // as the end of the match.
1230 if (minLimit < maxLimit) {
1231 // When the last CE's low index is same with its high index, the CE is likely
1232 // a part of expansion. In this case, the index is located just after the
1233 // character corresponding to the CEs compared above. If the index is right
1234 // at the break boundary, move the position to the next boundary will result
1235 // incorrect match length when there are ignorable characters exist between
1236 // the position and the next character produces CE(s). See ticket#8482.
1237 if (minLimit == targetOrders.getHighOffset(i + patternSize - 1) && ubrk_isBoundary(charBreakIterator, minLimit)) {
1238 mend = minLimit;
1239 } else {
1240 int32_t nba = ubrk_following(charBreakIterator, minLimit);
1241
1242 if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) {
1243 mend = nba;
1244 }
1245 }
1246 }
1247
1248 if (mend > maxLimit) {
1249 continue;
1250 }
1251
1252 if (! ubrk_isBoundary(charBreakIterator, mend)) {
1253 continue;
1254 }
1255
1256 matchStart = start;
1257 matchEnd = mend;
1258
1259 ubrk_close(charBreakIterator);
1260 return TRUE;
1261 }
1262 }
1263
1264 ubrk_close(charBreakIterator);
1265 return FALSE;
1266 }
1267
1268 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)1269 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) {
1270 int32_t val = defaultVal;
1271
1272 name.append(" *= *(-?\\d+)");
1273
1274 UErrorCode status = U_ZERO_ERROR;
1275 RegexMatcher m(name, params, 0, status);
1276
1277 if (m.find()) {
1278 // The param exists. Convert the string to an int.
1279 char valString[100];
1280 int32_t paramLength = m.end(1, status) - m.start(1, status);
1281
1282 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
1283 paramLength = (int32_t)(sizeof(valString)-2);
1284 }
1285
1286 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
1287 val = uprv_strtol(valString, NULL, 10);
1288
1289 // Delete this parameter from the params string.
1290 m.reset();
1291 params = m.replaceFirst("", status);
1292 }
1293
1294 //U_ASSERT(U_SUCCESS(status));
1295 if (! U_SUCCESS(status)) {
1296 val = defaultVal;
1297 }
1298
1299 return val;
1300 }
1301 #endif
1302
1303 #if !UCONFIG_NO_COLLATION
monkeyTestCase(UCollator * coll,const UnicodeString & testCase,const UnicodeString & pattern,const UnicodeString & altPattern,const char * name,const char * strength,uint32_t seed)1304 int32_t SSearchTest::monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
1305 const char *name, const char *strength, uint32_t seed)
1306 {
1307 UErrorCode status = U_ZERO_ERROR;
1308 int32_t actualStart = -1, actualEnd = -1;
1309 //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length();
1310 int32_t expectedStart = -1, expectedEnd = -1;
1311 int32_t notFoundCount = 0;
1312 LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
1313 testCase.getBuffer(), testCase.length(),
1314 coll,
1315 NULL, // the break iterator
1316 &status));
1317
1318 // **** TODO: find *all* matches, not just first one ****
1319 simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd);
1320
1321 usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status);
1322
1323 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
1324 errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
1325 " strength=%s seed=%d",
1326 name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
1327 }
1328
1329 if (expectedStart == -1 && actualStart == -1) {
1330 notFoundCount += 1;
1331 }
1332
1333 // **** TODO: find *all* matches, not just first one ****
1334 simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd);
1335
1336 usearch_setPattern(uss.getAlias(), altPattern.getBuffer(), altPattern.length(), &status);
1337
1338 usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status);
1339
1340 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
1341 errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
1342 " strength=%s seed=%d",
1343 name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
1344 }
1345
1346 if (expectedStart == -1 && actualStart == -1) {
1347 notFoundCount += 1;
1348 }
1349
1350 return notFoundCount;
1351 }
1352 #endif
1353
monkeyTest(char * params)1354 void SSearchTest::monkeyTest(char *params)
1355 {
1356 // ook!
1357 UErrorCode status = U_ZERO_ERROR;
1358 //UCollator *coll = ucol_open(NULL, &status);
1359 UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status);
1360
1361 if (U_FAILURE(status)) {
1362 errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status));
1363 return;
1364 }
1365
1366 CollData *monkeyData = new CollData(coll, status);
1367
1368 USet *expansions = uset_openEmpty();
1369 USet *contractions = uset_openEmpty();
1370
1371 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
1372
1373 U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
1374 U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
1375 USet *letters = uset_openPattern(letter_pattern, 39, &status);
1376 SetMonkey letterMonkey(letters);
1377 StringSetMonkey contractionMonkey(contractions, coll, monkeyData);
1378 StringSetMonkey expansionMonkey(expansions, coll, monkeyData);
1379 UnicodeString testCase;
1380 UnicodeString alternate;
1381 UnicodeString pattern, altPattern;
1382 UnicodeString prefix, altPrefix;
1383 UnicodeString suffix, altSuffix;
1384
1385 Monkey *monkeys[] = {
1386 &letterMonkey,
1387 &contractionMonkey,
1388 &expansionMonkey,
1389 &contractionMonkey,
1390 &expansionMonkey,
1391 &contractionMonkey,
1392 &expansionMonkey,
1393 &contractionMonkey,
1394 &expansionMonkey};
1395 int32_t monkeyCount = UPRV_LENGTHOF(monkeys);
1396 // int32_t nonMatchCount = 0;
1397
1398 UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY};
1399 const char *strengthNames[] = {"primary", "secondary", "tertiary"};
1400 int32_t strengthCount = UPRV_LENGTHOF(strengths);
1401 int32_t loopCount = quick? 1000 : 10000;
1402 int32_t firstStrength = 0;
1403 int32_t lastStrength = strengthCount - 1; //*/ 0;
1404
1405 if (params != NULL) {
1406 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1407 UnicodeString p(params);
1408
1409 loopCount = getIntParam("loop", p, loopCount);
1410 m_seed = getIntParam("seed", p, m_seed);
1411
1412 RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status);
1413 if (m.find()) {
1414 UnicodeString breakType = m.group(1, status);
1415
1416 for (int32_t s = 0; s < strengthCount; s += 1) {
1417 if (breakType == strengthNames[s]) {
1418 firstStrength = lastStrength = s;
1419 break;
1420 }
1421 }
1422
1423 m.reset();
1424 p = m.replaceFirst("", status);
1425 }
1426
1427 if (RegexMatcher("\\S", p, 0, status).find()) {
1428 // Each option is stripped out of the option string as it is processed.
1429 // All options have been checked. The option string should have been completely emptied..
1430 char buf[100];
1431 p.extract(buf, sizeof(buf), NULL, status);
1432 buf[sizeof(buf)-1] = 0;
1433 errln("Unrecognized or extra parameter: %s\n", buf);
1434 return;
1435 }
1436 #else
1437 infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
1438 #endif
1439 }
1440
1441 for(int32_t s = firstStrength; s <= lastStrength; s += 1) {
1442 int32_t notFoundCount = 0;
1443
1444 logln("Setting strength to %s.", strengthNames[s]);
1445 ucol_setStrength(coll, strengths[s]);
1446
1447 // TODO: try alternate prefix and suffix too?
1448 // TODO: alternates are only equal at primary strength. Is this OK?
1449 for(int32_t t = 0; t < loopCount; t += 1) {
1450 uint32_t seed = m_seed;
1451 // int32_t nmc = 0;
1452
1453 generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern);
1454 generateTestCase(coll, monkeys, monkeyCount, prefix, altPrefix);
1455 generateTestCase(coll, monkeys, monkeyCount, suffix, altSuffix);
1456
1457 // pattern
1458 notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern, "pattern", strengthNames[s], seed);
1459
1460 testCase.remove();
1461 testCase.append(prefix);
1462 testCase.append(/*alt*/pattern);
1463
1464 // prefix + pattern
1465 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern", strengthNames[s], seed);
1466
1467 testCase.append(suffix);
1468
1469 // prefix + pattern + suffix
1470 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern + suffix", strengthNames[s], seed);
1471
1472 testCase.remove();
1473 testCase.append(pattern);
1474 testCase.append(suffix);
1475
1476 // pattern + suffix
1477 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed);
1478 }
1479
1480 logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount);
1481 }
1482
1483 uset_close(contractions);
1484 uset_close(expansions);
1485 uset_close(letters);
1486 delete monkeyData;
1487
1488 ucol_close(coll);
1489 }
1490
1491 #endif
1492
1493 #endif
1494