1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 2005-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 */
9
10 #include "unicode/utypes.h"
11
12 #if !UCONFIG_NO_COLLATION
13
14 #include "cmemory.h"
15 #include "cstring.h"
16 #include "usrchimp.h"
17
18 #include "unicode/coll.h"
19 #include "unicode/tblcoll.h"
20 #include "unicode/usearch.h"
21 #include "unicode/uset.h"
22 #include "unicode/ustring.h"
23
24 #include "unicode/coleitr.h"
25 #include "unicode/regex.h" // TODO: make conditional on regexp being built.
26
27 #include "colldata.h"
28 #include "ssearch.h"
29 #include "xmlparser.h"
30
31 #include <stdio.h> // for sprintf
32
33 char testId[100];
34
35 #define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
36 if (!(x)) { \
37 errln("Failure in file %s, line %d, test ID = \"%s\"", __FILE__, __LINE__, testId); \
38 } \
39 } UPRV_BLOCK_MACRO_END
40
41 #define TEST_ASSERT_M(x, m) UPRV_BLOCK_MACRO_BEGIN { \
42 if (!(x)) { \
43 dataerrln("Failure in file %s, line %d. \"%s\"", __FILE__, __LINE__, m); \
44 return; \
45 } \
46 } UPRV_BLOCK_MACRO_END
47
48 #define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \
49 if (U_FAILURE(errcode)) { \
50 dataerrln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \
51 __FILE__, __LINE__, testId, u_errorName(errcode)); \
52 } \
53 } UPRV_BLOCK_MACRO_END
54
55 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
56 #define DELETE_ARRAY(array) uprv_free((void *) (array))
57
58 //---------------------------------------------------------------------------
59 //
60 // Test class boilerplate
61 //
62 //---------------------------------------------------------------------------
SSearchTest()63 SSearchTest::SSearchTest()
64 {
65 }
66
~SSearchTest()67 SSearchTest::~SSearchTest()
68 {
69 }
70
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)71 void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char *params )
72 {
73 if (exec) logln("TestSuite SSearchTest: ");
74 switch (index) {
75 #if !UCONFIG_NO_BREAK_ITERATION
76 case 0: name = "searchTest";
77 if (exec) searchTest();
78 break;
79
80 case 1: name = "offsetTest";
81 if (exec) offsetTest();
82 break;
83
84 case 2: name = "monkeyTest";
85 if (exec) monkeyTest(params);
86 break;
87
88 case 3: name = "sharpSTest";
89 if (exec) sharpSTest();
90 break;
91
92 case 4: name = "goodSuffixTest";
93 if (exec) goodSuffixTest();
94 break;
95
96 case 5: name = "searchTime";
97 if (exec) searchTime();
98 break;
99 #endif
100 default: name = "";
101 break; //needed to end loop
102 }
103 }
104
105
106 #if !UCONFIG_NO_BREAK_ITERATION
107
108 #define PATH_BUFFER_SIZE 2048
getPath(char buffer[2048],const char * filename)109 const char *SSearchTest::getPath(char buffer[2048], const char *filename) {
110 UErrorCode status = U_ZERO_ERROR;
111 const char *testDataDirectory = IntlTest::getSourceTestData(status);
112
113 if (U_FAILURE(status) || strlen(testDataDirectory) + strlen(filename) + 1 >= PATH_BUFFER_SIZE) {
114 errln("ERROR: getPath() failed - %s", u_errorName(status));
115 return NULL;
116 }
117
118 strcpy(buffer, testDataDirectory);
119 strcat(buffer, filename);
120 return buffer;
121 }
122
123
searchTest()124 void SSearchTest::searchTest()
125 {
126 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
127 UErrorCode status = U_ZERO_ERROR;
128 char path[PATH_BUFFER_SIZE];
129 const char *testFilePath = getPath(path, "ssearch.xml");
130
131 if (testFilePath == NULL) {
132 return; /* Couldn't get path: error message already output. */
133 }
134
135 LocalPointer<UXMLParser> parser(UXMLParser::createParser(status));
136 TEST_ASSERT_SUCCESS(status);
137 LocalPointer<UXMLElement> root(parser->parseFile(testFilePath, status));
138 TEST_ASSERT_SUCCESS(status);
139 if (U_FAILURE(status)) {
140 return;
141 }
142
143 const UnicodeString *debugTestCase = root->getAttribute("debug");
144 if (debugTestCase != NULL) {
145 // setenv("USEARCH_DEBUG", "1", 1);
146 }
147
148
149 const UXMLElement *testCase;
150 int32_t tc = 0;
151
152 while((testCase = root->nextChildElement(tc)) != NULL) {
153
154 if (testCase->getTagName().compare("test-case") != 0) {
155 errln("ssearch, unrecognized XML Element in test file");
156 continue;
157 }
158 const UnicodeString *id = testCase->getAttribute("id");
159 *testId = 0;
160 if (id != NULL) {
161 id->extract(0, id->length(), testId, sizeof(testId), US_INV);
162 }
163
164 // If debugging test case has been specified and this is not it, skip to next.
165 if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) {
166 continue;
167 }
168 //
169 // Get the requested collation strength.
170 // Default is tertiary if the XML attribute is missing from the test case.
171 //
172 const UnicodeString *strength = testCase->getAttribute("strength");
173 UColAttributeValue collatorStrength = UCOL_PRIMARY;
174 if (strength==NULL) { collatorStrength = UCOL_TERTIARY;}
175 else if (*strength=="PRIMARY") { collatorStrength = UCOL_PRIMARY;}
176 else if (*strength=="SECONDARY") { collatorStrength = UCOL_SECONDARY;}
177 else if (*strength=="TERTIARY") { collatorStrength = UCOL_TERTIARY;}
178 else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;}
179 else if (*strength=="IDENTICAL") { collatorStrength = UCOL_IDENTICAL;}
180 else {
181 // Bogus value supplied for strength. Shouldn't happen, even from
182 // typos, if the XML source has been validated.
183 // This assert is a little deceiving in that strength can be
184 // any of the allowed values, not just TERTIARY, but it will
185 // do the job of getting the error output.
186 TEST_ASSERT(*strength=="TERTIARY");
187 }
188
189 //
190 // Get the collator normalization flag. Default is UCOL_OFF.
191 //
192 UColAttributeValue normalize = UCOL_OFF;
193 const UnicodeString *norm = testCase->getAttribute("norm");
194 TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF");
195 if (norm!=NULL && *norm=="ON") {
196 normalize = UCOL_ON;
197 }
198
199 //
200 // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE.
201 //
202 UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE;
203 const UnicodeString *alt = testCase->getAttribute("alternate_handling");
204 TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE");
205 if (alt != NULL && *alt == "SHIFTED") {
206 alternateHandling = UCOL_SHIFTED;
207 }
208
209 const UnicodeString defLocale("en");
210 char clocale[100];
211 const UnicodeString *locale = testCase->getAttribute("locale");
212 if (locale == NULL || locale->length()==0) {
213 locale = &defLocale;
214 }
215 locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL);
216
217
218 UnicodeString text;
219 UnicodeString target;
220 UnicodeString pattern;
221 int32_t expectedMatchStart = -1;
222 int32_t expectedMatchLimit = -1;
223 const UXMLElement *n;
224 int32_t nodeCount = 0;
225
226 n = testCase->getChildElement("pattern");
227 TEST_ASSERT(n != NULL);
228 if (n==NULL) {
229 continue;
230 }
231 text = n->getText(FALSE);
232 text = text.unescape();
233 pattern.append(text);
234 nodeCount++;
235
236 n = testCase->getChildElement("pre");
237 if (n!=NULL) {
238 text = n->getText(FALSE);
239 text = text.unescape();
240 target.append(text);
241 nodeCount++;
242 }
243
244 n = testCase->getChildElement("m");
245 if (n!=NULL) {
246 expectedMatchStart = target.length();
247 text = n->getText(FALSE);
248 text = text.unescape();
249 target.append(text);
250 expectedMatchLimit = target.length();
251 nodeCount++;
252 }
253
254 n = testCase->getChildElement("post");
255 if (n!=NULL) {
256 text = n->getText(FALSE);
257 text = text.unescape();
258 target.append(text);
259 nodeCount++;
260 }
261
262 // Check that there weren't extra things in the XML
263 TEST_ASSERT(nodeCount == testCase->countChildren());
264
265 // Open a collator and StringSearch based on the parameters
266 // obtained from the XML.
267 //
268 status = U_ZERO_ERROR;
269 LocalUCollatorPointer collator(ucol_open(clocale, &status));
270 ucol_setStrength(collator.getAlias(), collatorStrength);
271 ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status);
272 ucol_setAttribute(collator.getAlias(), UCOL_ALTERNATE_HANDLING, alternateHandling, &status);
273 LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
274 target.getBuffer(), target.length(),
275 collator.getAlias(),
276 NULL, // the break iterator
277 &status));
278
279 TEST_ASSERT_SUCCESS(status);
280 if (U_FAILURE(status)) {
281 continue;
282 }
283
284 int32_t foundStart = 0;
285 int32_t foundLimit = 0;
286 UBool foundMatch;
287
288 //
289 // Do the search, check the match result against the expected results.
290 //
291 foundMatch= usearch_search(uss.getAlias(), 0, &foundStart, &foundLimit, &status);
292 TEST_ASSERT_SUCCESS(status);
293 if ((foundMatch && expectedMatchStart<0) ||
294 (foundStart != expectedMatchStart) ||
295 (foundLimit != expectedMatchLimit)) {
296 TEST_ASSERT(FALSE); // ouput generic error position
297 infoln("Found, expected match start = %d, %d \n"
298 "Found, expected match limit = %d, %d",
299 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
300 }
301
302 // In case there are other matches...
303 // (should we only do this if the test case passed?)
304 while (foundMatch) {
305 expectedMatchStart = foundStart;
306 expectedMatchLimit = foundLimit;
307
308 foundMatch = usearch_search(uss.getAlias(), foundLimit, &foundStart, &foundLimit, &status);
309 }
310
311 uss.adoptInstead(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
312 target.getBuffer(), target.length(),
313 collator.getAlias(),
314 NULL,
315 &status));
316
317 //
318 // Do the backwards search, check the match result against the expected results.
319 //
320 foundMatch= usearch_searchBackwards(uss.getAlias(), target.length(), &foundStart, &foundLimit, &status);
321 TEST_ASSERT_SUCCESS(status);
322 if ((foundMatch && expectedMatchStart<0) ||
323 (foundStart != expectedMatchStart) ||
324 (foundLimit != expectedMatchLimit)) {
325 TEST_ASSERT(FALSE); // ouput generic error position
326 infoln("Found, expected backwards match start = %d, %d \n"
327 "Found, expected backwards match limit = %d, %d",
328 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
329 }
330 }
331 #endif
332 }
333
334 struct Order
335 {
336 int32_t order;
337 int32_t lowOffset;
338 int32_t highOffset;
339 };
340
341 class OrderList
342 {
343 public:
344 OrderList();
345 OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset = 0);
346 ~OrderList();
347
348 int32_t size(void) const;
349 void add(int32_t order, int32_t low, int32_t high);
350 const Order *get(int32_t index) const;
351 int32_t getLowOffset(int32_t index) const;
352 int32_t getHighOffset(int32_t index) const;
353 int32_t getOrder(int32_t index) const;
354 void reverse(void);
355 UBool compare(const OrderList &other) const;
356 UBool matchesAt(int32_t offset, const OrderList &other) const;
357
358 private:
359 Order *list;
360 int32_t listMax;
361 int32_t listSize;
362 };
363
OrderList()364 OrderList::OrderList()
365 : list(NULL), listMax(16), listSize(0)
366 {
367 list = new Order[listMax];
368 }
369
OrderList(UCollator * coll,const UnicodeString & string,int32_t stringOffset)370 OrderList::OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset)
371 : list(NULL), listMax(16), listSize(0)
372 {
373 UErrorCode status = U_ZERO_ERROR;
374 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
375 uint32_t strengthMask = 0;
376 int32_t order, low, high;
377
378 switch (ucol_getStrength(coll))
379 {
380 default:
381 strengthMask |= UCOL_TERTIARYORDERMASK;
382 U_FALLTHROUGH;
383 case UCOL_SECONDARY:
384 strengthMask |= UCOL_SECONDARYORDERMASK;
385 U_FALLTHROUGH;
386 case UCOL_PRIMARY:
387 strengthMask |= UCOL_PRIMARYORDERMASK;
388 }
389
390 list = new Order[listMax];
391
392 ucol_setOffset(elems, stringOffset, &status);
393
394 do {
395 low = ucol_getOffset(elems);
396 order = ucol_next(elems, &status);
397 high = ucol_getOffset(elems);
398
399 if (order != UCOL_NULLORDER) {
400 order &= strengthMask;
401 }
402
403 if (order != UCOL_IGNORABLE) {
404 add(order, low, high);
405 }
406 } while (order != UCOL_NULLORDER);
407
408 ucol_closeElements(elems);
409 }
410
~OrderList()411 OrderList::~OrderList()
412 {
413 delete[] list;
414 }
415
add(int32_t order,int32_t low,int32_t high)416 void OrderList::add(int32_t order, int32_t low, int32_t high)
417 {
418 if (listSize >= listMax) {
419 listMax *= 2;
420
421 Order *newList = new Order[listMax];
422
423 uprv_memcpy(newList, list, listSize * sizeof(Order));
424 delete[] list;
425 list = newList;
426 }
427
428 list[listSize].order = order;
429 list[listSize].lowOffset = low;
430 list[listSize].highOffset = high;
431
432 listSize += 1;
433 }
434
get(int32_t index) const435 const Order *OrderList::get(int32_t index) const
436 {
437 if (index >= listSize) {
438 return NULL;
439 }
440
441 return &list[index];
442 }
443
getLowOffset(int32_t index) const444 int32_t OrderList::getLowOffset(int32_t index) const
445 {
446 const Order *order = get(index);
447
448 if (order != NULL) {
449 return order->lowOffset;
450 }
451
452 return -1;
453 }
454
getHighOffset(int32_t index) const455 int32_t OrderList::getHighOffset(int32_t index) const
456 {
457 const Order *order = get(index);
458
459 if (order != NULL) {
460 return order->highOffset;
461 }
462
463 return -1;
464 }
465
getOrder(int32_t index) const466 int32_t OrderList::getOrder(int32_t index) const
467 {
468 const Order *order = get(index);
469
470 if (order != NULL) {
471 return order->order;
472 }
473
474 return UCOL_NULLORDER;
475 }
476
size() const477 int32_t OrderList::size() const
478 {
479 return listSize;
480 }
481
reverse()482 void OrderList::reverse()
483 {
484 for(int32_t f = 0, b = listSize - 1; f < b; f += 1, b -= 1) {
485 Order swap = list[b];
486
487 list[b] = list[f];
488 list[f] = swap;
489 }
490 }
491
compare(const OrderList & other) const492 UBool OrderList::compare(const OrderList &other) const
493 {
494 if (listSize != other.listSize) {
495 return FALSE;
496 }
497
498 for(int32_t i = 0; i < listSize; i += 1) {
499 if (list[i].order != other.list[i].order ||
500 list[i].lowOffset != other.list[i].lowOffset ||
501 list[i].highOffset != other.list[i].highOffset) {
502 return FALSE;
503 }
504 }
505
506 return TRUE;
507 }
508
matchesAt(int32_t offset,const OrderList & other) const509 UBool OrderList::matchesAt(int32_t offset, const OrderList &other) const
510 {
511 // NOTE: sizes include the NULLORDER, which we don't want to compare.
512 int32_t otherSize = other.size() - 1;
513
514 if (listSize - 1 - offset < otherSize) {
515 return FALSE;
516 }
517
518 for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) {
519 if (getOrder(i) != other.getOrder(j)) {
520 return FALSE;
521 }
522 }
523
524 return TRUE;
525 }
526
printOffsets(char * buffer,OrderList & list)527 static char *printOffsets(char *buffer, OrderList &list)
528 {
529 int32_t size = list.size();
530 char *s = buffer;
531
532 for(int32_t i = 0; i < size; i += 1) {
533 const Order *order = list.get(i);
534
535 if (i != 0) {
536 s += sprintf(s, ", ");
537 }
538
539 s += sprintf(s, "(%d, %d)", order->lowOffset, order->highOffset);
540 }
541
542 return buffer;
543 }
544
printOrders(char * buffer,OrderList & list)545 static char *printOrders(char *buffer, OrderList &list)
546 {
547 int32_t size = list.size();
548 char *s = buffer;
549
550 for(int32_t i = 0; i < size; i += 1) {
551 const Order *order = list.get(i);
552
553 if (i != 0) {
554 s += sprintf(s, ", ");
555 }
556
557 s += sprintf(s, "%8.8X", order->order);
558 }
559
560 return buffer;
561 }
562
offsetTest()563 void SSearchTest::offsetTest()
564 {
565 const char *test[] = {
566 // The sequence \u0FB3\u0F71\u0F71\u0F80 contains a discontiguous
567 // contraction (\u0FB3\u0F71\u0F80) logically followed by \u0F71.
568 "\\u1E33\\u0FB3\\u0F71\\u0F71\\u0F80\\uD835\\uDF6C\\u01B0",
569
570 "\\ua191\\u16ef\\u2036\\u017a",
571
572 #if 0
573 // This results in a complex interaction between contraction,
574 // expansion and normalization that confuses the backwards offset fixups.
575 "\\u0F7F\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
576 #endif
577
578 "\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
579 "\\u07E9\\u07EA\\u07F1\\u07F2\\u07F3",
580
581 "\\u02FE\\u02FF"
582 "\\u0300\\u0301\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u0309\\u030A\\u030B\\u030C\\u030D\\u030E\\u030F"
583 "\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\\u031A\\u031B\\u031C\\u031D\\u031E\\u031F"
584 "\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F"
585 "\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\\u033A\\u033B\\u033C\\u033D\\u033E\\u033F"
586 "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\\u034A\\u034B\\u034C\\u034D\\u034E", // currently not working, see #8081
587
588 "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318", // currently not working, see #8081
589 "a\\u02FF\\u0301\\u0316", // currently not working, see #8081
590 "a\\u02FF\\u0316\\u0301",
591 "a\\u0430\\u0301\\u0316",
592 "a\\u0430\\u0316\\u0301",
593 "abc\\u0E41\\u0301\\u0316",
594 "abc\\u0E41\\u0316\\u0301",
595 "\\u0E41\\u0301\\u0316",
596 "\\u0E41\\u0316\\u0301",
597 "a\\u0301\\u0316",
598 "a\\u0316\\u0301",
599 "\\uAC52\\uAC53",
600 "\\u34CA\\u34CB",
601 "\\u11ED\\u11EE",
602 "\\u30C3\\u30D0",
603 "p\\u00E9ch\\u00E9",
604 "a\\u0301\\u0325",
605 "a\\u0300\\u0325",
606 "a\\u0325\\u0300",
607 "A\\u0323\\u0300B",
608 "A\\u0300\\u0323B",
609 "A\\u0301\\u0323B",
610 "A\\u0302\\u0301\\u0323B",
611 "abc",
612 "ab\\u0300c",
613 "ab\\u0300\\u0323c",
614 " \\uD800\\uDC00\\uDC00",
615 "a\\uD800\\uDC00\\uDC00",
616 "A\\u0301\\u0301",
617 "A\\u0301\\u0323",
618 "A\\u0301\\u0323B",
619 "B\\u0301\\u0323C",
620 "A\\u0300\\u0323B",
621 "\\u0301A\\u0301\\u0301",
622 "abcd\\r\\u0301",
623 "p\\u00EAche",
624 "pe\\u0302che",
625 };
626
627 int32_t testCount = UPRV_LENGTHOF(test);
628 UErrorCode status = U_ZERO_ERROR;
629 RuleBasedCollator *col = (RuleBasedCollator *) Collator::createInstance(Locale::getEnglish(), status);
630 if (U_FAILURE(status)) {
631 errcheckln(status, "Failed to create collator in offsetTest! - %s", u_errorName(status));
632 return;
633 }
634 char buffer[4096]; // A bit of a hack... just happens to be long enough for all the test cases...
635 // We could allocate one that's the right size by (CE_count * 10) + 2
636 // 10 chars is enough room for 8 hex digits plus ", ". 2 extra chars for "[" and "]"
637
638 col->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
639
640 for(int32_t i = 0; i < testCount; i += 1) {
641 UnicodeString ts = CharsToUnicodeString(test[i]);
642 CollationElementIterator *iter = col->createCollationElementIterator(ts);
643 OrderList forwardList;
644 OrderList backwardList;
645 int32_t order, low, high;
646
647 do {
648 low = iter->getOffset();
649 order = iter->next(status);
650 high = iter->getOffset();
651
652 forwardList.add(order, low, high);
653 } while (order != CollationElementIterator::NULLORDER);
654
655 iter->reset();
656 iter->setOffset(ts.length(), status);
657
658 backwardList.add(CollationElementIterator::NULLORDER, iter->getOffset(), iter->getOffset());
659
660 do {
661 high = iter->getOffset();
662 order = iter->previous(status);
663 low = iter->getOffset();
664
665 if (order == CollationElementIterator::NULLORDER) {
666 break;
667 }
668
669 backwardList.add(order, low, high);
670 } while (TRUE);
671
672 backwardList.reverse();
673
674 if (forwardList.compare(backwardList)) {
675 logln("Works with \"%s\"", test[i]);
676 logln("Forward offsets: [%s]", printOffsets(buffer, forwardList));
677 // logln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
678
679 logln("Forward CEs: [%s]", printOrders(buffer, forwardList));
680 // logln("Backward CEs: [%s]", printOrders(buffer, backwardList));
681
682 logln();
683 } else {
684 errln("Fails with \"%s\"", test[i]);
685 infoln("Forward offsets: [%s]", printOffsets(buffer, forwardList));
686 infoln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
687
688 infoln("Forward CEs: [%s]", printOrders(buffer, forwardList));
689 infoln("Backward CEs: [%s]", printOrders(buffer, backwardList));
690
691 infoln();
692 }
693 delete iter;
694 }
695 delete col;
696 }
697
698 #if 0
699 static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer)
700 {
701 for(int32_t i = 0; i < string.length(); i += 1) {
702 UChar32 ch = string.char32At(i);
703
704 if (ch >= 0x0020 && ch <= 0x007F) {
705 if (ch == 0x005C) {
706 buffer.append("\\\\");
707 } else {
708 buffer.append(ch);
709 }
710 } else {
711 char cbuffer[12];
712
713 if (ch <= 0xFFFFL) {
714 sprintf(cbuffer, "\\u%4.4X", ch);
715 } else {
716 sprintf(cbuffer, "\\U%8.8X", ch);
717 }
718
719 buffer.append(cbuffer);
720 }
721
722 if (ch >= 0x10000L) {
723 i += 1;
724 }
725 }
726
727 return buffer;
728 }
729 #endif
730
sharpSTest()731 void SSearchTest::sharpSTest()
732 {
733 UErrorCode status = U_ZERO_ERROR;
734 UCollator *coll = NULL;
735 UnicodeString lp = "fuss";
736 UnicodeString sp = "fu\\u00DF";
737 UnicodeString targets[] = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball",
738 "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF",
739 "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"};
740 int32_t start = -1, end = -1;
741
742 coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status);
743 TEST_ASSERT_SUCCESS(status);
744
745 UnicodeString lpUnescaped = lp.unescape();
746 UnicodeString spUnescaped = sp.unescape();
747
748 LocalUStringSearchPointer ussLong(usearch_openFromCollator(lpUnescaped.getBuffer(), lpUnescaped.length(),
749 lpUnescaped.getBuffer(), lpUnescaped.length(), // actual test data will be set later
750 coll,
751 NULL, // the break iterator
752 &status));
753
754 LocalUStringSearchPointer ussShort(usearch_openFromCollator(spUnescaped.getBuffer(), spUnescaped.length(),
755 spUnescaped.getBuffer(), spUnescaped.length(), // actual test data will be set later
756 coll,
757 NULL, // the break iterator
758 &status));
759 TEST_ASSERT_SUCCESS(status);
760
761 for (uint32_t t = 0; t < UPRV_LENGTHOF(targets); t += 1) {
762 UBool bFound;
763 UnicodeString target = targets[t].unescape();
764
765 start = end = -1;
766 usearch_setText(ussLong.getAlias(), target.getBuffer(), target.length(), &status);
767 bFound = usearch_search(ussLong.getAlias(), 0, &start, &end, &status);
768 TEST_ASSERT_SUCCESS(status);
769 if (bFound) {
770 logln("Test %d: found long pattern at [%d, %d].", t, start, end);
771 } else {
772 dataerrln("Test %d: did not find long pattern.", t);
773 }
774
775 usearch_setText(ussShort.getAlias(), target.getBuffer(), target.length(), &status);
776 bFound = usearch_search(ussShort.getAlias(), 0, &start, &end, &status);
777 TEST_ASSERT_SUCCESS(status);
778 if (bFound) {
779 logln("Test %d: found long pattern at [%d, %d].", t, start, end);
780 } else {
781 dataerrln("Test %d: did not find long pattern.", t);
782 }
783 }
784
785 ucol_close(coll);
786 }
787
goodSuffixTest()788 void SSearchTest::goodSuffixTest()
789 {
790 UErrorCode status = U_ZERO_ERROR;
791 UCollator *coll = NULL;
792 UnicodeString pat = /*"gcagagag"*/ "fxeld";
793 UnicodeString target = /*"gcatcgcagagagtatacagtacg"*/ "cloveldfxeld";
794 int32_t start = -1, end = -1;
795 UBool bFound;
796
797 coll = ucol_open(NULL, &status);
798 TEST_ASSERT_SUCCESS(status);
799
800 LocalUStringSearchPointer ss(usearch_openFromCollator(pat.getBuffer(), pat.length(),
801 target.getBuffer(), target.length(),
802 coll,
803 NULL, // the break iterator
804 &status));
805 TEST_ASSERT_SUCCESS(status);
806
807 bFound = usearch_search(ss.getAlias(), 0, &start, &end, &status);
808 TEST_ASSERT_SUCCESS(status);
809 if (bFound) {
810 logln("Found pattern at [%d, %d].", start, end);
811 } else {
812 dataerrln("Did not find pattern.");
813 }
814
815 ucol_close(coll);
816 }
817
818 //
819 // searchTime() A quick and dirty performance test for string search.
820 // Probably doesn't really belong as part of intltest, but it
821 // does check that the search succeeds, and gets the right result,
822 // so it serves as a functionality test also.
823 //
824 // To run as a perf test, up the loop count, select by commenting
825 // and uncommenting in the code the operation to be measured,
826 // rebuild, and measure the running time of this test alone.
827 //
828 // time LD_LIBRARY_PATH=whatever ./intltest collate/SSearchTest/searchTime
829 //
searchTime()830 void SSearchTest::searchTime() {
831 static const char *longishText =
832 "Whylom, as olde stories tellen us,\n"
833 "Ther was a duk that highte Theseus:\n"
834 "Of Athenes he was lord and governour,\n"
835 "And in his tyme swich a conquerour,\n"
836 "That gretter was ther noon under the sonne.\n"
837 "Ful many a riche contree hadde he wonne;\n"
838 "What with his wisdom and his chivalrye,\n"
839 "He conquered al the regne of Femenye,\n"
840 "That whylom was y-cleped Scithia;\n"
841 "And weddede the quene Ipolita,\n"
842 "And broghte hir hoom with him in his contree\n"
843 "With muchel glorie and greet solempnitee,\n"
844 "And eek hir yonge suster Emelye.\n"
845 "And thus with victorie and with melodye\n"
846 "Lete I this noble duk to Athenes ryde,\n"
847 "And al his hoost, in armes, him bisyde.\n"
848 "And certes, if it nere to long to here,\n"
849 "I wolde han told yow fully the manere,\n"
850 "How wonnen was the regne of Femenye\n"
851 "By Theseus, and by his chivalrye;\n"
852 "And of the grete bataille for the nones\n"
853 "Bitwixen Athen's and Amazones;\n"
854 "And how asseged was Ipolita,\n"
855 "The faire hardy quene of Scithia;\n"
856 "And of the feste that was at hir weddinge,\n"
857 "And of the tempest at hir hoom-cominge;\n"
858 "But al that thing I moot as now forbere.\n"
859 "I have, God woot, a large feeld to ere,\n"
860 "And wayke been the oxen in my plough.\n"
861 "The remenant of the tale is long y-nough.\n"
862 "I wol nat letten eek noon of this route;\n"
863 "Lat every felawe telle his tale aboute,\n"
864 "And lat see now who shal the soper winne;\n"
865 "And ther I lefte, I wol ageyn biginne.\n"
866 "This duk, of whom I make mencioun,\n"
867 "When he was come almost unto the toun,\n"
868 "In al his wele and in his moste pryde,\n"
869 "He was war, as he caste his eye asyde,\n"
870 "Wher that ther kneled in the hye weye\n"
871 "A companye of ladies, tweye and tweye,\n"
872 "Ech after other, clad in clothes blake; \n"
873 "But swich a cry and swich a wo they make,\n"
874 "That in this world nis creature livinge,\n"
875 "That herde swich another weymentinge;\n"
876 "And of this cry they nolde never stenten,\n"
877 "Til they the reynes of his brydel henten.\n"
878 "'What folk ben ye, that at myn hoomcominge\n"
879 "Perturben so my feste with cryinge'?\n"
880 "Quod Theseus, 'have ye so greet envye\n"
881 "Of myn honour, that thus compleyne and crye? \n"
882 "Or who hath yow misboden, or offended?\n"
883 "And telleth me if it may been amended;\n"
884 "And why that ye ben clothed thus in blak'?\n"
885 "The eldest lady of hem alle spak,\n"
886 "When she hadde swowned with a deedly chere,\n"
887 "That it was routhe for to seen and here,\n"
888 "And seyde: 'Lord, to whom Fortune hath yiven\n"
889 "Victorie, and as a conquerour to liven,\n"
890 "Noght greveth us your glorie and your honour;\n"
891 "But we biseken mercy and socour.\n"
892 "Have mercy on our wo and our distresse.\n"
893 "Som drope of pitee, thurgh thy gentilesse,\n"
894 "Up-on us wrecched wommen lat thou falle.\n"
895 "For certes, lord, ther nis noon of us alle,\n"
896 "That she nath been a duchesse or a quene;\n"
897 "Now be we caitifs, as it is wel sene:\n"
898 "Thanked be Fortune, and hir false wheel,\n"
899 "That noon estat assureth to be weel.\n"
900 "And certes, lord, t'abyden your presence,\n"
901 "Here in the temple of the goddesse Clemence\n"
902 "We han ben waytinge al this fourtenight;\n"
903 "Now help us, lord, sith it is in thy might.\n"
904 "I wrecche, which that wepe and waille thus,\n"
905 "Was whylom wyf to king Capaneus,\n"
906 "That starf at Thebes, cursed be that day!\n"
907 "And alle we, that been in this array,\n"
908 "And maken al this lamentacioun,\n"
909 "We losten alle our housbondes at that toun,\n"
910 "Whyl that the sege ther-aboute lay.\n"
911 "And yet now th'olde Creon, weylaway!\n"
912 "The lord is now of Thebes the citee, \n"
913 "Fulfild of ire and of iniquitee,\n"
914 "He, for despyt, and for his tirannye,\n"
915 "To do the dede bodyes vileinye,\n"
916 "Of alle our lordes, whiche that ben slawe,\n"
917 "Hath alle the bodyes on an heep y-drawe,\n"
918 "And wol nat suffren hem, by noon assent,\n"
919 "Neither to been y-buried nor y-brent,\n"
920 "But maketh houndes ete hem in despyt. zet'\n";
921
922 const char *cPattern = "maketh houndes ete hem";
923 //const char *cPattern = "Whylom";
924 //const char *cPattern = "zet";
925 const char *testId = "searchTime()"; // for error macros.
926 UnicodeString target = longishText;
927 UErrorCode status = U_ZERO_ERROR;
928
929
930 LocalUCollatorPointer collator(ucol_open("en", &status));
931 //ucol_setStrength(collator.getAlias(), collatorStrength);
932 //ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status);
933 UnicodeString uPattern = cPattern;
934 LocalUStringSearchPointer uss(usearch_openFromCollator(uPattern.getBuffer(), uPattern.length(),
935 target.getBuffer(), target.length(),
936 collator.getAlias(),
937 NULL, // the break iterator
938 &status));
939 TEST_ASSERT_SUCCESS(status);
940
941 // int32_t foundStart;
942 // int32_t foundEnd;
943 UBool found;
944
945 // Find the match position usgin strstr
946 const char *pm = strstr(longishText, cPattern);
947 TEST_ASSERT_M(pm!=NULL, "No pattern match with strstr");
948 int32_t refMatchPos = (int32_t)(pm - longishText);
949 int32_t icuMatchPos;
950 int32_t icuMatchEnd;
951 usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status);
952 TEST_ASSERT_SUCCESS(status);
953 TEST_ASSERT_M(refMatchPos == icuMatchPos, "strstr and icu give different match positions.");
954
955 int32_t i;
956 // int32_t j=0;
957
958 // Try loopcounts around 100000 to some millions, depending on the operation,
959 // to get runtimes of at least several seconds.
960 for (i=0; i<10000; i++) {
961 found = usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status);
962 (void)found; // Suppress set but not used warning.
963 //TEST_ASSERT_SUCCESS(status);
964 //TEST_ASSERT(found);
965
966 // usearch_setOffset(uss.getAlias(), 0, &status);
967 // icuMatchPos = usearch_next(uss.getAlias(), &status);
968
969 // The i+j stuff is to confuse the optimizer and get it to actually leave the
970 // call to strstr in place.
971 //pm = strstr(longishText+j, cPattern);
972 //j = (j + i)%5;
973 }
974
975 //printf("%ld, %d\n", pm-longishText, j);
976 }
977
978 //----------------------------------------------------------------------------------------
979 //
980 // Random Numbers. Similar to standard lib rand() and srand()
981 // Not using library to
982 // 1. Get same results on all platforms.
983 // 2. Get access to current seed, to more easily reproduce failures.
984 //
985 //---------------------------------------------------------------------------------------
986 static uint32_t m_seed = 1;
987
m_rand()988 static uint32_t m_rand()
989 {
990 m_seed = m_seed * 1103515245 + 12345;
991 return (uint32_t)(m_seed/65536) % 32768;
992 }
993
994 class Monkey
995 {
996 public:
997 virtual void append(UnicodeString &test, UnicodeString &alternate) = 0;
998
999 protected:
1000 Monkey();
1001 virtual ~Monkey();
1002 };
1003
Monkey()1004 Monkey::Monkey()
1005 {
1006 // ook?
1007 }
1008
~Monkey()1009 Monkey::~Monkey()
1010 {
1011 // ook?
1012 }
1013
1014 class SetMonkey : public Monkey
1015 {
1016 public:
1017 SetMonkey(const USet *theSet);
1018 ~SetMonkey();
1019
1020 virtual void append(UnicodeString &test, UnicodeString &alternate);
1021
1022 private:
1023 const USet *set;
1024 };
1025
SetMonkey(const USet * theSet)1026 SetMonkey::SetMonkey(const USet *theSet)
1027 : Monkey(), set(theSet)
1028 {
1029 // ook?
1030 }
1031
~SetMonkey()1032 SetMonkey::~SetMonkey()
1033 {
1034 //ook...
1035 }
1036
append(UnicodeString & test,UnicodeString & alternate)1037 void SetMonkey::append(UnicodeString &test, UnicodeString &alternate)
1038 {
1039 int32_t size = uset_size(set);
1040 int32_t index = m_rand() % size;
1041 UChar32 ch = uset_charAt(set, index);
1042 UnicodeString str(ch);
1043
1044 test.append(str);
1045 alternate.append(str); // flip case, or some junk?
1046 }
1047
1048 class StringSetMonkey : public Monkey
1049 {
1050 public:
1051 StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData);
1052 ~StringSetMonkey();
1053
1054 void append(UnicodeString &testCase, UnicodeString &alternate);
1055
1056 private:
1057 UnicodeString &generateAlternative(const UnicodeString &testCase, UnicodeString &alternate);
1058
1059 const USet *set;
1060 UCollator *coll;
1061 CollData *collData;
1062 };
1063
StringSetMonkey(const USet * theSet,UCollator * theCollator,CollData * theCollData)1064 StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData)
1065 : Monkey(), set(theSet), coll(theCollator), collData(theCollData)
1066 {
1067 // ook.
1068 }
1069
~StringSetMonkey()1070 StringSetMonkey::~StringSetMonkey()
1071 {
1072 // ook?
1073 }
1074
append(UnicodeString & testCase,UnicodeString & alternate)1075 void StringSetMonkey::append(UnicodeString &testCase, UnicodeString &alternate)
1076 {
1077 int32_t itemCount = uset_getItemCount(set), len = 0;
1078 int32_t index = m_rand() % itemCount;
1079 UChar32 rangeStart = 0, rangeEnd = 0;
1080 UChar buffer[16];
1081 UErrorCode err = U_ZERO_ERROR;
1082
1083 len = uset_getItem(set, index, &rangeStart, &rangeEnd, buffer, 16, &err);
1084
1085 if (len == 0) {
1086 int32_t offset = m_rand() % (rangeEnd - rangeStart + 1);
1087 UChar32 ch = rangeStart + offset;
1088 UnicodeString str(ch);
1089
1090 testCase.append(str);
1091 generateAlternative(str, alternate);
1092 } else if (len > 0) {
1093 // should check that len < 16...
1094 UnicodeString str(buffer, len);
1095
1096 testCase.append(str);
1097 generateAlternative(str, alternate);
1098 } else {
1099 // shouldn't happen...
1100 }
1101 }
1102
generateAlternative(const UnicodeString & testCase,UnicodeString & alternate)1103 UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCase, UnicodeString &alternate)
1104 {
1105 // find out shortest string for the longest sequence of ces.
1106 // needs to be refined to use dynamic programming, but will be roughly right
1107 UErrorCode status = U_ZERO_ERROR;
1108 CEList ceList(coll, testCase, status);
1109 UnicodeString alt;
1110 int32_t offset = 0;
1111
1112 if (ceList.size() == 0) {
1113 return alternate.append(testCase);
1114 }
1115
1116 while (offset < ceList.size()) {
1117 int32_t ce = ceList.get(offset);
1118 const StringList *strings = collData->getStringList(ce);
1119
1120 if (strings == NULL) {
1121 return alternate.append(testCase);
1122 }
1123
1124 int32_t stringCount = strings->size();
1125 int32_t tries = 0;
1126
1127 // find random string that generates the same CEList
1128 const CEList *ceList2 = NULL;
1129 const UnicodeString *string = NULL;
1130 UBool matches = FALSE;
1131
1132 do {
1133 int32_t s = m_rand() % stringCount;
1134
1135 if (tries++ > stringCount) {
1136 alternate.append(testCase);
1137 return alternate;
1138 }
1139
1140 string = strings->get(s);
1141 ceList2 = collData->getCEList(string);
1142 matches = ceList.matchesAt(offset, ceList2);
1143
1144 if (! matches) {
1145 collData->freeCEList((CEList *) ceList2);
1146 }
1147 } while (! matches);
1148
1149 alt.append(*string);
1150 offset += ceList2->size();
1151 collData->freeCEList(ceList2);
1152 }
1153
1154 const CEList altCEs(coll, alt, status);
1155
1156 if (ceList.matchesAt(0, &altCEs)) {
1157 return alternate.append(alt);
1158 }
1159
1160 return alternate.append(testCase);
1161 }
1162
generateTestCase(UCollator * coll,Monkey * monkeys[],int32_t monkeyCount,UnicodeString & testCase,UnicodeString & alternate)1163 static void generateTestCase(UCollator *coll, Monkey *monkeys[], int32_t monkeyCount, UnicodeString &testCase, UnicodeString &alternate)
1164 {
1165 int32_t pieces = (m_rand() % 4) + 1;
1166 UErrorCode status = U_ZERO_ERROR;
1167 UBool matches;
1168
1169 do {
1170 testCase.remove();
1171 alternate.remove();
1172 monkeys[0]->append(testCase, alternate);
1173
1174 for(int32_t piece = 0; piece < pieces; piece += 1) {
1175 int32_t monkey = m_rand() % monkeyCount;
1176
1177 monkeys[monkey]->append(testCase, alternate);
1178 }
1179
1180 const CEList ceTest(coll, testCase, status);
1181 const CEList ceAlt(coll, alternate, status);
1182
1183 matches = ceTest.matchesAt(0, &ceAlt);
1184 } while (! matches);
1185 }
1186
simpleSearch(UCollator * coll,const UnicodeString & target,int32_t offset,const UnicodeString & pattern,int32_t & matchStart,int32_t & matchEnd)1187 static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd)
1188 {
1189 UErrorCode status = U_ZERO_ERROR;
1190 OrderList targetOrders(coll, target, offset);
1191 OrderList patternOrders(coll, pattern);
1192 int32_t targetSize = targetOrders.size() - 1;
1193 int32_t patternSize = patternOrders.size() - 1;
1194 UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
1195 target.getBuffer(), target.length(), &status);
1196
1197 if (patternSize == 0) {
1198 // Searching for an empty pattern always fails
1199 matchStart = matchEnd = -1;
1200 ubrk_close(charBreakIterator);
1201 return FALSE;
1202 }
1203
1204 matchStart = matchEnd = -1;
1205
1206 for(int32_t i = 0; i < targetSize; i += 1) {
1207 if (targetOrders.matchesAt(i, patternOrders)) {
1208 int32_t start = targetOrders.getLowOffset(i);
1209 int32_t maxLimit = targetOrders.getLowOffset(i + patternSize);
1210 int32_t minLimit = targetOrders.getLowOffset(i + patternSize - 1);
1211
1212 // if the low and high offsets of the first CE in
1213 // the match are the same, it means that the match
1214 // starts in the middle of an expansion - all but
1215 // the first CE of the expansion will have the offset
1216 // of the following character.
1217 if (start == targetOrders.getHighOffset(i)) {
1218 continue;
1219 }
1220
1221 // Make sure match starts on a grapheme boundary
1222 if (! ubrk_isBoundary(charBreakIterator, start)) {
1223 continue;
1224 }
1225
1226 // If the low and high offsets of the CE after the match
1227 // are the same, it means that the match ends in the middle
1228 // of an expansion sequence.
1229 if (maxLimit == targetOrders.getHighOffset(i + patternSize) &&
1230 targetOrders.getOrder(i + patternSize) != UCOL_NULLORDER) {
1231 continue;
1232 }
1233
1234 int32_t mend = maxLimit;
1235
1236 // Find the first grapheme break after the character index
1237 // of the last CE in the match. If it's after character index
1238 // that's after the last CE in the match, use that index
1239 // as the end of the match.
1240 if (minLimit < maxLimit) {
1241 // When the last CE's low index is same with its high index, the CE is likely
1242 // a part of expansion. In this case, the index is located just after the
1243 // character corresponding to the CEs compared above. If the index is right
1244 // at the break boundary, move the position to the next boundary will result
1245 // incorrect match length when there are ignorable characters exist between
1246 // the position and the next character produces CE(s). See ticket#8482.
1247 if (minLimit == targetOrders.getHighOffset(i + patternSize - 1) && ubrk_isBoundary(charBreakIterator, minLimit)) {
1248 mend = minLimit;
1249 } else {
1250 int32_t nba = ubrk_following(charBreakIterator, minLimit);
1251
1252 if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) {
1253 mend = nba;
1254 }
1255 }
1256 }
1257
1258 if (mend > maxLimit) {
1259 continue;
1260 }
1261
1262 if (! ubrk_isBoundary(charBreakIterator, mend)) {
1263 continue;
1264 }
1265
1266 matchStart = start;
1267 matchEnd = mend;
1268
1269 ubrk_close(charBreakIterator);
1270 return TRUE;
1271 }
1272 }
1273
1274 ubrk_close(charBreakIterator);
1275 return FALSE;
1276 }
1277
1278 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)1279 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) {
1280 int32_t val = defaultVal;
1281
1282 name.append(" *= *(-?\\d+)");
1283
1284 UErrorCode status = U_ZERO_ERROR;
1285 RegexMatcher m(name, params, 0, status);
1286
1287 if (m.find()) {
1288 // The param exists. Convert the string to an int.
1289 char valString[100];
1290 int32_t paramLength = m.end(1, status) - m.start(1, status);
1291
1292 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
1293 paramLength = (int32_t)(sizeof(valString)-2);
1294 }
1295
1296 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
1297 val = uprv_strtol(valString, NULL, 10);
1298
1299 // Delete this parameter from the params string.
1300 m.reset();
1301 params = m.replaceFirst("", status);
1302 }
1303
1304 //U_ASSERT(U_SUCCESS(status));
1305 if (! U_SUCCESS(status)) {
1306 val = defaultVal;
1307 }
1308
1309 return val;
1310 }
1311 #endif
1312
1313 #if !UCONFIG_NO_COLLATION
monkeyTestCase(UCollator * coll,const UnicodeString & testCase,const UnicodeString & pattern,const UnicodeString & altPattern,const char * name,const char * strength,uint32_t seed)1314 int32_t SSearchTest::monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
1315 const char *name, const char *strength, uint32_t seed)
1316 {
1317 UErrorCode status = U_ZERO_ERROR;
1318 int32_t actualStart = -1, actualEnd = -1;
1319 //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length();
1320 int32_t expectedStart = -1, expectedEnd = -1;
1321 int32_t notFoundCount = 0;
1322 LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
1323 testCase.getBuffer(), testCase.length(),
1324 coll,
1325 NULL, // the break iterator
1326 &status));
1327
1328 // **** TODO: find *all* matches, not just first one ****
1329 simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd);
1330
1331 usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status);
1332
1333 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
1334 errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
1335 " strength=%s seed=%d",
1336 name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
1337 }
1338
1339 if (expectedStart == -1 && actualStart == -1) {
1340 notFoundCount += 1;
1341 }
1342
1343 // **** TODO: find *all* matches, not just first one ****
1344 simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd);
1345
1346 usearch_setPattern(uss.getAlias(), altPattern.getBuffer(), altPattern.length(), &status);
1347
1348 usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status);
1349
1350 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
1351 errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
1352 " strength=%s seed=%d",
1353 name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
1354 }
1355
1356 if (expectedStart == -1 && actualStart == -1) {
1357 notFoundCount += 1;
1358 }
1359
1360 return notFoundCount;
1361 }
1362 #endif
1363
monkeyTest(char * params)1364 void SSearchTest::monkeyTest(char *params)
1365 {
1366 // ook!
1367 UErrorCode status = U_ZERO_ERROR;
1368 //UCollator *coll = ucol_open(NULL, &status);
1369 UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status);
1370
1371 if (U_FAILURE(status)) {
1372 errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status));
1373 return;
1374 }
1375
1376 CollData *monkeyData = new CollData(coll, status);
1377
1378 USet *expansions = uset_openEmpty();
1379 USet *contractions = uset_openEmpty();
1380
1381 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
1382
1383 U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
1384 U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
1385 USet *letters = uset_openPattern(letter_pattern, 39, &status);
1386 SetMonkey letterMonkey(letters);
1387 StringSetMonkey contractionMonkey(contractions, coll, monkeyData);
1388 StringSetMonkey expansionMonkey(expansions, coll, monkeyData);
1389 UnicodeString testCase;
1390 UnicodeString alternate;
1391 UnicodeString pattern, altPattern;
1392 UnicodeString prefix, altPrefix;
1393 UnicodeString suffix, altSuffix;
1394
1395 Monkey *monkeys[] = {
1396 &letterMonkey,
1397 &contractionMonkey,
1398 &expansionMonkey,
1399 &contractionMonkey,
1400 &expansionMonkey,
1401 &contractionMonkey,
1402 &expansionMonkey,
1403 &contractionMonkey,
1404 &expansionMonkey};
1405 int32_t monkeyCount = UPRV_LENGTHOF(monkeys);
1406 // int32_t nonMatchCount = 0;
1407
1408 UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY};
1409 const char *strengthNames[] = {"primary", "secondary", "tertiary"};
1410 int32_t strengthCount = UPRV_LENGTHOF(strengths);
1411 int32_t loopCount = quick? 1000 : 10000;
1412 int32_t firstStrength = 0;
1413 int32_t lastStrength = strengthCount - 1; //*/ 0;
1414
1415 if (params != NULL) {
1416 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1417 UnicodeString p(params);
1418
1419 loopCount = getIntParam("loop", p, loopCount);
1420 m_seed = getIntParam("seed", p, m_seed);
1421
1422 RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status);
1423 if (m.find()) {
1424 UnicodeString breakType = m.group(1, status);
1425
1426 for (int32_t s = 0; s < strengthCount; s += 1) {
1427 if (breakType == strengthNames[s]) {
1428 firstStrength = lastStrength = s;
1429 break;
1430 }
1431 }
1432
1433 m.reset();
1434 p = m.replaceFirst("", status);
1435 }
1436
1437 if (RegexMatcher("\\S", p, 0, status).find()) {
1438 // Each option is stripped out of the option string as it is processed.
1439 // All options have been checked. The option string should have been completely emptied..
1440 char buf[100];
1441 p.extract(buf, sizeof(buf), NULL, status);
1442 buf[sizeof(buf)-1] = 0;
1443 errln("Unrecognized or extra parameter: %s\n", buf);
1444 return;
1445 }
1446 #else
1447 infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
1448 #endif
1449 }
1450
1451 for(int32_t s = firstStrength; s <= lastStrength; s += 1) {
1452 int32_t notFoundCount = 0;
1453
1454 logln("Setting strength to %s.", strengthNames[s]);
1455 ucol_setStrength(coll, strengths[s]);
1456
1457 // TODO: try alternate prefix and suffix too?
1458 // TODO: alternates are only equal at primary strength. Is this OK?
1459 for(int32_t t = 0; t < loopCount; t += 1) {
1460 uint32_t seed = m_seed;
1461 // int32_t nmc = 0;
1462
1463 generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern);
1464 generateTestCase(coll, monkeys, monkeyCount, prefix, altPrefix);
1465 generateTestCase(coll, monkeys, monkeyCount, suffix, altSuffix);
1466
1467 // pattern
1468 notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern, "pattern", strengthNames[s], seed);
1469
1470 testCase.remove();
1471 testCase.append(prefix);
1472 testCase.append(/*alt*/pattern);
1473
1474 // prefix + pattern
1475 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern", strengthNames[s], seed);
1476
1477 testCase.append(suffix);
1478
1479 // prefix + pattern + suffix
1480 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern + suffix", strengthNames[s], seed);
1481
1482 testCase.remove();
1483 testCase.append(pattern);
1484 testCase.append(suffix);
1485
1486 // pattern + suffix
1487 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed);
1488 }
1489
1490 logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount);
1491 }
1492
1493 uset_close(contractions);
1494 uset_close(expansions);
1495 uset_close(letters);
1496 delete monkeyData;
1497
1498 ucol_close(coll);
1499 }
1500
1501 #endif
1502
1503 #endif
1504