1 /***********************************************************************
2 * © 2016 and later: Unicode, Inc. and others.
3 * License & terms of use: http://www.unicode.org/copyright.html#License
4 ***********************************************************************
5 ***********************************************************************
6 * COPYRIGHT:
7 * Copyright (C) 2001-2012 IBM, Inc. All Rights Reserved.
8 *
9 ***********************************************************************/
10 /********************************************************************************
11 *
12 * File CALLCOLL.C
13 *
14 * Modification History:
15 * Name Description
16 * Andy Heninger First Version
17 *
18 *********************************************************************************
19 */
20
21 //
22 // This program tests string collation and sort key generation performance.
23 // Three APIs can be teste: ICU C , Unix strcoll, strxfrm and Windows LCMapString
24 // A file of names is required as input, one per line. It must be in utf-8 or utf-16 format,
25 // and include a byte order mark. Either LE or BE format is OK.
26 //
27
28 const char gUsageString[] =
29 "usage: collperf options...\n"
30 "-help Display this message.\n"
31 "-file file_name utf-16 format file of names.\n"
32 "-locale name ICU locale to use. Default is en_US\n"
33 "-rules file_name Collation rules file (overrides locale)\n"
34 "-langid 0x1234 Windows Language ID number. Default to value for -locale option\n"
35 " see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
36 "-win Run test using Windows native services. (ICU is default)\n"
37 "-unix Run test using Unix strxfrm, strcoll services.\n"
38 "-uselen Use API with string lengths. Default is null-terminated strings\n"
39 "-usekeys Run tests using sortkeys rather than strcoll\n"
40 "-strcmp Run tests using u_strcmp rather than strcoll\n"
41 "-strcmpCPO Run tests using u_strcmpCodePointOrder rather than strcoll\n"
42 "-loop nnnn Loopcount for test. Adjust for reasonable total running time.\n"
43 "-iloop n Inner Loop Count. Default = 1. Number of calls to function\n"
44 " under test at each call point. For measuring test overhead.\n"
45 "-terse Terse numbers-only output. Intended for use by scripts.\n"
46 "-french French accent ordering\n"
47 "-frenchoff No French accent ordering (for use with French locales.)\n"
48 "-norm Normalizing mode on\n"
49 "-shifted Shifted mode\n"
50 "-lower Lower case first\n"
51 "-upper Upper case first\n"
52 "-case Enable separate case level\n"
53 "-level n Sort level, 1 to 5, for Primary, Secndary, Tertiary, Quaternary, Identical\n"
54 "-keyhist Produce a table sort key size vs. string length\n"
55 "-binsearch Binary Search timing test\n"
56 "-keygen Sort Key Generation timing test\n"
57 "-qsort Quicksort timing test\n"
58 "-iter Iteration Performance Test\n"
59 "-dump Display strings, sort keys and CEs.\n"
60 ;
61
62
63
64 #include <stdio.h>
65 #include <string.h>
66 #include <stdlib.h>
67 #include <math.h>
68 #include <locale.h>
69 #include <errno.h>
70
71 #include <unicode/utypes.h>
72 #include <unicode/ucol.h>
73 #include <unicode/ucoleitr.h>
74 #include <unicode/uloc.h>
75 #include <unicode/ustring.h>
76 #include <unicode/ures.h>
77 #include <unicode/uchar.h>
78 #include <unicode/ucnv.h>
79 #include <unicode/utf8.h>
80
81 #ifdef WIN32
82 #include <windows.h>
83 #else
84 //
85 // Stubs for Windows API functions when building on UNIXes.
86 //
87 typedef int DWORD;
CompareStringW(DWORD,DWORD,UChar *,int,UChar *,int)88 inline int CompareStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}
89 #include <sys/time.h>
timeGetTime()90 unsigned long timeGetTime() {
91 struct timeval t;
92 gettimeofday(&t, 0);
93 unsigned long val = t.tv_sec * 1000; // Let it overflow. Who cares.
94 val += t.tv_usec / 1000;
95 return val;
96 }
LCMapStringW(DWORD,DWORD,UChar *,int,UChar *,int)97 inline int LCMapStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}
98 const int LCMAP_SORTKEY = 0;
99 #define MAKELCID(a,b) 0
100 const int SORT_DEFAULT = 0;
101 #endif
102
103
104
105 //
106 // Command line option variables
107 // These global variables are set according to the options specified
108 // on the command line by the user.
109 char * opt_fName = 0;
110 const char * opt_locale = "en_US";
111 int opt_langid = 0; // Defaults to value corresponding to opt_locale.
112 char * opt_rules = 0;
113 UBool opt_help = FALSE;
114 int opt_loopCount = 1;
115 int opt_iLoopCount = 1;
116 UBool opt_terse = FALSE;
117 UBool opt_qsort = FALSE;
118 UBool opt_binsearch = FALSE;
119 UBool opt_icu = TRUE;
120 UBool opt_win = FALSE; // Run with Windows native functions.
121 UBool opt_unix = FALSE; // Run with UNIX strcoll, strxfrm functions.
122 UBool opt_uselen = FALSE;
123 UBool opt_usekeys = FALSE;
124 UBool opt_strcmp = FALSE;
125 UBool opt_strcmpCPO = FALSE;
126 UBool opt_norm = FALSE;
127 UBool opt_keygen = FALSE;
128 UBool opt_french = FALSE;
129 UBool opt_frenchoff = FALSE;
130 UBool opt_shifted = FALSE;
131 UBool opt_lower = FALSE;
132 UBool opt_upper = FALSE;
133 UBool opt_case = FALSE;
134 int opt_level = 0;
135 UBool opt_keyhist = FALSE;
136 UBool opt_itertest = FALSE;
137 UBool opt_dump = FALSE;
138
139
140
141 //
142 // Definitions for the command line options
143 //
144 struct OptSpec {
145 const char *name;
146 enum {FLAG, NUM, STRING} type;
147 void *pVar;
148 };
149
150 OptSpec opts[] = {
151 {"-file", OptSpec::STRING, &opt_fName},
152 {"-locale", OptSpec::STRING, &opt_locale},
153 {"-langid", OptSpec::NUM, &opt_langid},
154 {"-rules", OptSpec::STRING, &opt_rules},
155 {"-qsort", OptSpec::FLAG, &opt_qsort},
156 {"-binsearch", OptSpec::FLAG, &opt_binsearch},
157 {"-iter", OptSpec::FLAG, &opt_itertest},
158 {"-win", OptSpec::FLAG, &opt_win},
159 {"-unix", OptSpec::FLAG, &opt_unix},
160 {"-uselen", OptSpec::FLAG, &opt_uselen},
161 {"-usekeys", OptSpec::FLAG, &opt_usekeys},
162 {"-strcmp", OptSpec::FLAG, &opt_strcmp},
163 {"-strcmpCPO", OptSpec::FLAG, &opt_strcmpCPO},
164 {"-norm", OptSpec::FLAG, &opt_norm},
165 {"-french", OptSpec::FLAG, &opt_french},
166 {"-frenchoff", OptSpec::FLAG, &opt_frenchoff},
167 {"-shifted", OptSpec::FLAG, &opt_shifted},
168 {"-lower", OptSpec::FLAG, &opt_lower},
169 {"-upper", OptSpec::FLAG, &opt_upper},
170 {"-case", OptSpec::FLAG, &opt_case},
171 {"-level", OptSpec::NUM, &opt_level},
172 {"-keyhist", OptSpec::FLAG, &opt_keyhist},
173 {"-keygen", OptSpec::FLAG, &opt_keygen},
174 {"-loop", OptSpec::NUM, &opt_loopCount},
175 {"-iloop", OptSpec::NUM, &opt_iLoopCount},
176 {"-terse", OptSpec::FLAG, &opt_terse},
177 {"-dump", OptSpec::FLAG, &opt_dump},
178 {"-help", OptSpec::FLAG, &opt_help},
179 {"-?", OptSpec::FLAG, &opt_help},
180 {0, OptSpec::FLAG, 0}
181 };
182
183
184 //---------------------------------------------------------------------------
185 //
186 // Global variables pointing to and describing the test file
187 //
188 //---------------------------------------------------------------------------
189
190 //
191 // struct Line
192 //
193 // Each line from the source file (containing a name, presumably) gets
194 // one of these structs.
195 //
196 struct Line {
197 UChar *name;
198 int len;
199 char *winSortKey;
200 char *icuSortKey;
201 char *unixSortKey;
202 char *unixName;
203 };
204
205
206
207 Line *gFileLines; // Ptr to array of Line structs, one per line in the file.
208 int gNumFileLines;
209 UCollator *gCol;
210 DWORD gWinLCID;
211
212 Line **gSortedLines;
213 Line **gRandomLines;
214 int gCount;
215
216
217
218 //---------------------------------------------------------------------------
219 //
220 // ProcessOptions() Function to read the command line options.
221 //
222 //---------------------------------------------------------------------------
ProcessOptions(int argc,const char ** argv,OptSpec opts[])223 UBool ProcessOptions(int argc, const char **argv, OptSpec opts[])
224 {
225 int i;
226 int argNum;
227 const char *pArgName;
228 OptSpec *pOpt;
229
230 for (argNum=1; argNum<argc; argNum++) {
231 pArgName = argv[argNum];
232 for (pOpt = opts; pOpt->name != 0; pOpt++) {
233 if (strcmp(pOpt->name, pArgName) == 0) {
234 switch (pOpt->type) {
235 case OptSpec::FLAG:
236 *(UBool *)(pOpt->pVar) = TRUE;
237 break;
238 case OptSpec::STRING:
239 argNum ++;
240 if (argNum >= argc) {
241 fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
242 return FALSE;
243 }
244 *(const char **)(pOpt->pVar) = argv[argNum];
245 break;
246 case OptSpec::NUM:
247 argNum ++;
248 if (argNum >= argc) {
249 fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
250 return FALSE;
251 }
252 char *endp;
253 i = strtol(argv[argNum], &endp, 0);
254 if (endp == argv[argNum]) {
255 fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name);
256 return FALSE;
257 }
258 *(int *)(pOpt->pVar) = i;
259 }
260 break;
261 }
262 }
263 if (pOpt->name == 0)
264 {
265 fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName);
266 return FALSE;
267 }
268 }
269 return TRUE;
270 }
271
272 //---------------------------------------------------------------------------------------
273 //
274 // Comparison functions for use by qsort.
275 //
276 // Six flavors, ICU or Windows, SortKey or String Compare, Strings with length
277 // or null terminated.
278 //
279 //---------------------------------------------------------------------------------------
ICUstrcmpK(const void * a,const void * b)280 int ICUstrcmpK(const void *a, const void *b) {
281 gCount++;
282 int t = strcmp((*(Line **)a)->icuSortKey, (*(Line **)b)->icuSortKey);
283 return t;
284 }
285
286
ICUstrcmpL(const void * a,const void * b)287 int ICUstrcmpL(const void *a, const void *b) {
288 gCount++;
289 UCollationResult t;
290 t = ucol_strcoll(gCol, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len);
291 if (t == UCOL_LESS) return -1;
292 if (t == UCOL_GREATER) return +1;
293 return 0;
294 }
295
296
ICUstrcmp(const void * a,const void * b)297 int ICUstrcmp(const void *a, const void *b) {
298 gCount++;
299 UCollationResult t;
300 t = ucol_strcoll(gCol, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1);
301 if (t == UCOL_LESS) return -1;
302 if (t == UCOL_GREATER) return +1;
303 return 0;
304 }
305
306
Winstrcmp(const void * a,const void * b)307 int Winstrcmp(const void *a, const void *b) {
308 gCount++;
309 int t;
310 t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1);
311 return t-2;
312 }
313
314
UNIXstrcmp(const void * a,const void * b)315 int UNIXstrcmp(const void *a, const void *b) {
316 gCount++;
317 int t;
318 t = strcoll((*(Line **)a)->unixName, (*(Line **)b)->unixName);
319 return t;
320 }
321
322
WinstrcmpL(const void * a,const void * b)323 int WinstrcmpL(const void *a, const void *b) {
324 gCount++;
325 int t;
326 t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len);
327 return t-2;
328 }
329
330
WinstrcmpK(const void * a,const void * b)331 int WinstrcmpK(const void *a, const void *b) {
332 gCount++;
333 int t = strcmp((*(Line **)a)->winSortKey, (*(Line **)b)->winSortKey);
334 return t;
335 }
336
337
338 //---------------------------------------------------------------------------------------
339 //
340 // Function for sorting the names (lines) into a random order.
341 // Order is based on a hash of the ICU Sort key for the lines
342 // The randomized order is used as input for the sorting timing tests.
343 //
344 //---------------------------------------------------------------------------------------
ICURandomCmp(const void * a,const void * b)345 int ICURandomCmp(const void *a, const void *b) {
346 char *ask = (*(Line **)a)->icuSortKey;
347 char *bsk = (*(Line **)b)->icuSortKey;
348 int aVal = 0;
349 int bVal = 0;
350 int retVal;
351 while (*ask != 0) {
352 aVal += aVal*37 + *ask++;
353 }
354 while (*bsk != 0) {
355 bVal += bVal*37 + *bsk++;
356 }
357 retVal = -1;
358 if (aVal == bVal) {
359 retVal = 0;
360 }
361 else if (aVal > bVal) {
362 retVal = 1;
363 }
364 return retVal;
365 }
366
367 //---------------------------------------------------------------------------------------
368 //
369 // doKeyGen() Key Generation Timing Test
370 //
371 //---------------------------------------------------------------------------------------
doKeyGen()372 void doKeyGen()
373 {
374 int line;
375 int loops = 0;
376 int iLoop;
377 int t;
378 int len=-1;
379
380 // Adjust loop count to compensate for file size. Should be order n
381 double dLoopCount = double(opt_loopCount) * (1000. / double(gNumFileLines));
382 int adj_loopCount = int(dLoopCount);
383 if (adj_loopCount < 1) adj_loopCount = 1;
384
385
386 unsigned long startTime = timeGetTime();
387
388 if (opt_win) {
389 for (loops=0; loops<adj_loopCount; loops++) {
390 for (line=0; line < gNumFileLines; line++) {
391 if (opt_uselen) {
392 len = gFileLines[line].len;
393 }
394 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
395 t=LCMapStringW(gWinLCID, LCMAP_SORTKEY,
396 gFileLines[line].name, len,
397 (unsigned short *)gFileLines[line].winSortKey, 5000); // TODO something with length.
398 }
399 }
400 }
401 }
402 else if (opt_icu)
403 {
404 for (loops=0; loops<adj_loopCount; loops++) {
405 for (line=0; line < gNumFileLines; line++) {
406 if (opt_uselen) {
407 len = gFileLines[line].len;
408 }
409 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
410 t = ucol_getSortKey(gCol, gFileLines[line].name, len, (unsigned char *)gFileLines[line].icuSortKey, 5000);
411 }
412 }
413 }
414 }
415 else if (opt_unix)
416 {
417 for (loops=0; loops<adj_loopCount; loops++) {
418 for (line=0; line < gNumFileLines; line++) {
419 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
420 t = strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixName, 5000);
421 }
422 }
423 }
424 }
425
426 unsigned long elapsedTime = timeGetTime() - startTime;
427 int ns = (int)(float(1000000) * (float)elapsedTime / (float)(adj_loopCount*gNumFileLines));
428
429 if (opt_terse == FALSE) {
430 printf("Sort Key Generation: total # of keys = %d\n", loops*gNumFileLines);
431 printf("Sort Key Generation: time per key = %d ns\n", ns);
432 }
433 else {
434 printf("%d, ", ns);
435 }
436
437 int totalKeyLen = 0;
438 int totalChars = 0;
439 for (line=0; line<gNumFileLines; line++) {
440 totalChars += u_strlen(gFileLines[line].name);
441 if (opt_win) {
442 totalKeyLen += strlen(gFileLines[line].winSortKey);
443 }
444 else if (opt_icu) {
445 totalKeyLen += strlen(gFileLines[line].icuSortKey);
446 }
447 else if (opt_unix) {
448 totalKeyLen += strlen(gFileLines[line].unixSortKey);
449 }
450
451 }
452 if (opt_terse == FALSE) {
453 printf("Key Length / character = %f\n", (float)totalKeyLen / (float)totalChars);
454 } else {
455 printf("%f, ", (float)totalKeyLen / (float)totalChars);
456 }
457 }
458
459
460
461 //---------------------------------------------------------------------------------------
462 //
463 // doBinarySearch() Binary Search timing test. Each name from the list
464 // is looked up in the full sorted list of names.
465 //
466 //---------------------------------------------------------------------------------------
doBinarySearch()467 void doBinarySearch()
468 {
469
470 gCount = 0;
471 int line;
472 int loops = 0;
473 int iLoop = 0;
474 unsigned long elapsedTime = 0;
475
476 // Adjust loop count to compensate for file size. Should be order n (lookups) * log n (compares/lookup)
477 // Accurate timings do not depend on this being perfect. The correction is just to try to
478 // get total running times of about the right order, so the that user doesn't need to
479 // manually adjust the loop count for every different file size.
480 double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileLines) * double(gNumFileLines));
481 if (opt_usekeys) dLoopCount *= 5;
482 int adj_loopCount = int(dLoopCount);
483 if (adj_loopCount < 1) adj_loopCount = 1;
484
485
486 for (;;) { // not really a loop, just allows "break" to work, to simplify
487 // inadvertantly running more than one test through here.
488 if (opt_strcmp || opt_strcmpCPO)
489 {
490 unsigned long startTime = timeGetTime();
491 typedef int32_t (U_EXPORT2 *PF)(const UChar *, const UChar *);
492 PF pf = u_strcmp;
493 if (opt_strcmpCPO) {pf = u_strcmpCodePointOrder;}
494 //if (opt_strcmp && opt_win) {pf = (PF)wcscmp;} // Damn the difference between int32_t and int
495 // which forces the use of a cast here.
496
497 int r = 0;
498 for (loops=0; loops<adj_loopCount; loops++) {
499
500 for (line=0; line < gNumFileLines; line++) {
501 int hi = gNumFileLines-1;
502 int lo = 0;
503 int guess = -1;
504 for (;;) {
505 int newGuess = (hi + lo) / 2;
506 if (newGuess == guess)
507 break;
508 guess = newGuess;
509 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
510 r = (*pf)((gSortedLines[line])->name, (gSortedLines[guess])->name);
511 }
512 gCount++;
513 if (r== 0)
514 break;
515 if (r < 0)
516 hi = guess;
517 else
518 lo = guess;
519 }
520 }
521 }
522 elapsedTime = timeGetTime() - startTime;
523 break;
524 }
525
526
527 if (opt_icu)
528 {
529 unsigned long startTime = timeGetTime();
530 UCollationResult r = UCOL_EQUAL;
531 for (loops=0; loops<adj_loopCount; loops++) {
532
533 for (line=0; line < gNumFileLines; line++) {
534 int lineLen = -1;
535 int guessLen = -1;
536 if (opt_uselen) {
537 lineLen = (gSortedLines[line])->len;
538 }
539 int hi = gNumFileLines-1;
540 int lo = 0;
541 int guess = -1;
542 for (;;) {
543 int newGuess = (hi + lo) / 2;
544 if (newGuess == guess)
545 break;
546 guess = newGuess;
547 int ri = 0;
548 if (opt_usekeys) {
549 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
550 ri = strcmp((gSortedLines[line])->icuSortKey, (gSortedLines[guess])->icuSortKey);
551 }
552 gCount++;
553 r=UCOL_GREATER; if(ri<0) {r=UCOL_LESS;} else if (ri==0) {r=UCOL_EQUAL;}
554 }
555 else
556 {
557 if (opt_uselen) {
558 guessLen = (gSortedLines[guess])->len;
559 }
560 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
561 r = ucol_strcoll(gCol, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen);
562 }
563 gCount++;
564 }
565 if (r== UCOL_EQUAL)
566 break;
567 if (r == UCOL_LESS)
568 hi = guess;
569 else
570 lo = guess;
571 }
572 }
573 }
574 elapsedTime = timeGetTime() - startTime;
575 break;
576 }
577
578 if (opt_win)
579 {
580 unsigned long startTime = timeGetTime();
581 int r = 0;
582 for (loops=0; loops<adj_loopCount; loops++) {
583
584 for (line=0; line < gNumFileLines; line++) {
585 int lineLen = -1;
586 int guessLen = -1;
587 if (opt_uselen) {
588 lineLen = (gSortedLines[line])->len;
589 }
590 int hi = gNumFileLines-1;
591 int lo = 0;
592 int guess = -1;
593 for (;;) {
594 int newGuess = (hi + lo) / 2;
595 if (newGuess == guess)
596 break;
597 guess = newGuess;
598 if (opt_usekeys) {
599 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
600 r = strcmp((gSortedLines[line])->winSortKey, (gSortedLines[guess])->winSortKey);
601 }
602 gCount++;
603 r+=2;
604 }
605 else
606 {
607 if (opt_uselen) {
608 guessLen = (gSortedLines[guess])->len;
609 }
610 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
611 r = CompareStringW(gWinLCID, 0, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen);
612 }
613 if (r == 0) {
614 if (opt_terse == FALSE) {
615 fprintf(stderr, "Error returned from Windows CompareStringW.\n");
616 }
617 exit(-1);
618 }
619 gCount++;
620 }
621 if (r== 2) // strings ==
622 break;
623 if (r == 1) // line < guess
624 hi = guess;
625 else // line > guess
626 lo = guess;
627 }
628 }
629 }
630 elapsedTime = timeGetTime() - startTime;
631 break;
632 }
633
634 if (opt_unix)
635 {
636 unsigned long startTime = timeGetTime();
637 int r = 0;
638 for (loops=0; loops<adj_loopCount; loops++) {
639
640 for (line=0; line < gNumFileLines; line++) {
641 int hi = gNumFileLines-1;
642 int lo = 0;
643 int guess = -1;
644 for (;;) {
645 int newGuess = (hi + lo) / 2;
646 if (newGuess == guess)
647 break;
648 guess = newGuess;
649 if (opt_usekeys) {
650 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
651 r = strcmp((gSortedLines[line])->unixSortKey, (gSortedLines[guess])->unixSortKey);
652 }
653 gCount++;
654 }
655 else
656 {
657 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
658 r = strcoll((gSortedLines[line])->unixName, (gSortedLines[guess])->unixName);
659 }
660 errno = 0;
661 if (errno != 0) {
662 fprintf(stderr, "Error %d returned from strcoll.\n", errno);
663 exit(-1);
664 }
665 gCount++;
666 }
667 if (r == 0) // strings ==
668 break;
669 if (r < 0) // line < guess
670 hi = guess;
671 else // line > guess
672 lo = guess;
673 }
674 }
675 }
676 elapsedTime = timeGetTime() - startTime;
677 break;
678 }
679 break;
680 }
681
682 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
683 if (opt_terse == FALSE) {
684 printf("binary search: total # of string compares = %d\n", gCount);
685 printf("binary search: compares per loop = %d\n", gCount / loops);
686 printf("binary search: time per compare = %d ns\n", ns);
687 } else {
688 printf("%d, ", ns);
689 }
690
691 }
692
693
694
695
696 //---------------------------------------------------------------------------------------
697 //
698 // doQSort() The quick sort timing test. Uses the C library qsort function.
699 //
700 //---------------------------------------------------------------------------------------
doQSort()701 void doQSort() {
702 int i;
703 Line **sortBuf = new Line *[gNumFileLines];
704
705 // Adjust loop count to compensate for file size. QSort should be n log(n)
706 double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileLines) * double(gNumFileLines));
707 if (opt_usekeys) dLoopCount *= 5;
708 int adj_loopCount = int(dLoopCount);
709 if (adj_loopCount < 1) adj_loopCount = 1;
710
711
712 gCount = 0;
713 unsigned long startTime = timeGetTime();
714 if (opt_win && opt_usekeys) {
715 for (i=0; i<opt_loopCount; i++) {
716 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
717 qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpK);
718 }
719 }
720
721 else if (opt_win && opt_uselen) {
722 for (i=0; i<adj_loopCount; i++) {
723 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
724 qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpL);
725 }
726 }
727
728
729 else if (opt_win && !opt_uselen) {
730 for (i=0; i<adj_loopCount; i++) {
731 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
732 qsort(sortBuf, gNumFileLines, sizeof(Line *), Winstrcmp);
733 }
734 }
735
736 else if (opt_icu && opt_usekeys) {
737 for (i=0; i<adj_loopCount; i++) {
738 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
739 qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpK);
740 }
741 }
742
743 else if (opt_icu && opt_uselen) {
744 for (i=0; i<adj_loopCount; i++) {
745 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
746 qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpL);
747 }
748 }
749
750
751 else if (opt_icu && !opt_uselen) {
752 for (i=0; i<adj_loopCount; i++) {
753 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
754 qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmp);
755 }
756 }
757
758 else if (opt_unix && !opt_usekeys) {
759 for (i=0; i<adj_loopCount; i++) {
760 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
761 qsort(sortBuf, gNumFileLines, sizeof(Line *), UNIXstrcmp);
762 }
763 }
764
765 unsigned long elapsedTime = timeGetTime() - startTime;
766 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
767 if (opt_terse == FALSE) {
768 printf("qsort: total # of string compares = %d\n", gCount);
769 printf("qsort: time per compare = %d ns\n", ns);
770 } else {
771 printf("%d, ", ns);
772 }
773 }
774
775
776
777 //---------------------------------------------------------------------------------------
778 //
779 // doKeyHist() Output a table of data for
780 // average sort key size vs. string length.
781 //
782 //---------------------------------------------------------------------------------------
doKeyHist()783 void doKeyHist() {
784 int i;
785 int maxLen = 0;
786
787 // Find the maximum string length
788 for (i=0; i<gNumFileLines; i++) {
789 if (gFileLines[i].len > maxLen) maxLen = gFileLines[i].len;
790 }
791
792 // Allocate arrays to hold the histogram data
793 int *accumulatedLen = new int[maxLen+1];
794 int *numKeysOfSize = new int[maxLen+1];
795 for (i=0; i<=maxLen; i++) {
796 accumulatedLen[i] = 0;
797 numKeysOfSize[i] = 0;
798 }
799
800 // Fill the arrays...
801 for (i=0; i<gNumFileLines; i++) {
802 int len = gFileLines[i].len;
803 accumulatedLen[len] += strlen(gFileLines[i].icuSortKey);
804 numKeysOfSize[len] += 1;
805 }
806
807 // And write out averages
808 printf("String Length, Avg Key Length, Avg Key Len per char\n");
809 for (i=1; i<=maxLen; i++) {
810 if (numKeysOfSize[i] > 0) {
811 printf("%d, %f, %f\n", i, (float)accumulatedLen[i] / (float)numKeysOfSize[i],
812 (float)accumulatedLen[i] / (float)(numKeysOfSize[i] * i));
813 }
814 }
815 delete []accumulatedLen;
816 delete []numKeysOfSize ;
817 }
818
819 //---------------------------------------------------------------------------------------
820 //
821 // doForwardIterTest(UBool) Forward iteration test
822 // argument null-terminated string used
823 //
824 //---------------------------------------------------------------------------------------
doForwardIterTest(UBool haslen)825 void doForwardIterTest(UBool haslen) {
826 int count = 0;
827
828 UErrorCode error = U_ZERO_ERROR;
829 printf("\n\nPerforming forward iteration performance test with ");
830
831 if (haslen) {
832 printf("non-null terminated data -----------\n");
833 }
834 else {
835 printf("null terminated data -----------\n");
836 }
837 printf("performance test on strings from file -----------\n");
838
839 UChar dummytext[] = {0, 0};
840 UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error);
841 ucol_setText(iter, dummytext, 1, &error);
842
843 gCount = 0;
844 unsigned long startTime = timeGetTime();
845 while (count < opt_loopCount) {
846 int linecount = 0;
847 while (linecount < gNumFileLines) {
848 UChar *str = gFileLines[linecount].name;
849 int strlen = haslen?gFileLines[linecount].len:-1;
850 ucol_setText(iter, str, strlen, &error);
851 while (ucol_next(iter, &error) != UCOL_NULLORDER) {
852 gCount++;
853 }
854
855 linecount ++;
856 }
857 count ++;
858 }
859 unsigned long elapsedTime = timeGetTime() - startTime;
860 printf("elapsedTime %ld\n", elapsedTime);
861
862 // empty loop recalculation
863 count = 0;
864 startTime = timeGetTime();
865 while (count < opt_loopCount) {
866 int linecount = 0;
867 while (linecount < gNumFileLines) {
868 UChar *str = gFileLines[linecount].name;
869 int strlen = haslen?gFileLines[linecount].len:-1;
870 ucol_setText(iter, str, strlen, &error);
871 linecount ++;
872 }
873 count ++;
874 }
875 elapsedTime -= (timeGetTime() - startTime);
876 printf("elapsedTime %ld\n", elapsedTime);
877
878 ucol_closeElements(iter);
879
880 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
881 printf("Total number of strings compared %d in %d loops\n", gNumFileLines,
882 opt_loopCount);
883 printf("Average time per ucol_next() nano seconds %d\n", ns);
884
885 printf("performance test on skipped-5 concatenated strings from file -----------\n");
886
887 UChar *str;
888 int strlen = 0;
889 // appending all the strings
890 int linecount = 0;
891 while (linecount < gNumFileLines) {
892 strlen += haslen?gFileLines[linecount].len:
893 u_strlen(gFileLines[linecount].name);
894 linecount ++;
895 }
896 str = (UChar *)malloc(sizeof(UChar) * strlen);
897 int strindex = 0;
898 linecount = 0;
899 while (strindex < strlen) {
900 int len = 0;
901 len += haslen?gFileLines[linecount].len:
902 u_strlen(gFileLines[linecount].name);
903 memcpy(str + strindex, gFileLines[linecount].name,
904 sizeof(UChar) * len);
905 strindex += len;
906 linecount ++;
907 }
908
909 printf("Total size of strings %d\n", strlen);
910
911 gCount = 0;
912 count = 0;
913
914 if (!haslen) {
915 strlen = -1;
916 }
917 iter = ucol_openElements(gCol, str, strlen, &error);
918 if (!haslen) {
919 strlen = u_strlen(str);
920 }
921 strlen -= 5; // any left over characters are not iterated,
922 // this is to ensure the backwards and forwards iterators
923 // gets the same position
924 startTime = timeGetTime();
925 while (count < opt_loopCount) {
926 int count5 = 5;
927 strindex = 0;
928 ucol_setOffset(iter, strindex, &error);
929 while (TRUE) {
930 if (ucol_next(iter, &error) == UCOL_NULLORDER) {
931 break;
932 }
933 gCount++;
934 count5 --;
935 if (count5 == 0) {
936 strindex += 10;
937 if (strindex > strlen) {
938 break;
939 }
940 ucol_setOffset(iter, strindex, &error);
941 count5 = 5;
942 }
943 }
944 count ++;
945 }
946
947 elapsedTime = timeGetTime() - startTime;
948 printf("elapsedTime %ld\n", elapsedTime);
949
950 // empty loop recalculation
951 int tempgCount = 0;
952 count = 0;
953 startTime = timeGetTime();
954 while (count < opt_loopCount) {
955 int count5 = 5;
956 strindex = 0;
957 ucol_setOffset(iter, strindex, &error);
958 while (TRUE) {
959 tempgCount ++;
960 count5 --;
961 if (count5 == 0) {
962 strindex += 10;
963 if (strindex > strlen) {
964 break;
965 }
966 ucol_setOffset(iter, strindex, &error);
967 count5 = 5;
968 }
969 }
970 count ++;
971 }
972 elapsedTime -= (timeGetTime() - startTime);
973 printf("elapsedTime %ld\n", elapsedTime);
974
975 ucol_closeElements(iter);
976
977 printf("gCount %d\n", gCount);
978 ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
979 printf("Average time per ucol_next() nano seconds %d\n", ns);
980 }
981
982 //---------------------------------------------------------------------------------------
983 //
984 // doBackwardIterTest(UBool) Backwards iteration test
985 // argument null-terminated string used
986 //
987 //---------------------------------------------------------------------------------------
doBackwardIterTest(UBool haslen)988 void doBackwardIterTest(UBool haslen) {
989 int count = 0;
990 UErrorCode error = U_ZERO_ERROR;
991 printf("\n\nPerforming backward iteration performance test with ");
992
993 if (haslen) {
994 printf("non-null terminated data -----------\n");
995 }
996 else {
997 printf("null terminated data -----------\n");
998 }
999
1000 printf("performance test on strings from file -----------\n");
1001
1002 UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error);
1003 UChar dummytext[] = {0, 0};
1004 ucol_setText(iter, dummytext, 1, &error);
1005
1006 gCount = 0;
1007 unsigned long startTime = timeGetTime();
1008 while (count < opt_loopCount) {
1009 int linecount = 0;
1010 while (linecount < gNumFileLines) {
1011 UChar *str = gFileLines[linecount].name;
1012 int strlen = haslen?gFileLines[linecount].len:-1;
1013 ucol_setText(iter, str, strlen, &error);
1014 while (ucol_previous(iter, &error) != UCOL_NULLORDER) {
1015 gCount ++;
1016 }
1017
1018 linecount ++;
1019 }
1020 count ++;
1021 }
1022 unsigned long elapsedTime = timeGetTime() - startTime;
1023
1024 printf("elapsedTime %ld\n", elapsedTime);
1025
1026 // empty loop recalculation
1027 count = 0;
1028 startTime = timeGetTime();
1029 while (count < opt_loopCount) {
1030 int linecount = 0;
1031 while (linecount < gNumFileLines) {
1032 UChar *str = gFileLines[linecount].name;
1033 int strlen = haslen?gFileLines[linecount].len:-1;
1034 ucol_setText(iter, str, strlen, &error);
1035 linecount ++;
1036 }
1037 count ++;
1038 }
1039 elapsedTime -= (timeGetTime() - startTime);
1040
1041 printf("elapsedTime %ld\n", elapsedTime);
1042 ucol_closeElements(iter);
1043
1044 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
1045 printf("Total number of strings compared %d in %d loops\n", gNumFileLines,
1046 opt_loopCount);
1047 printf("Average time per ucol_previous() nano seconds %d\n", ns);
1048
1049 printf("performance test on skipped-5 concatenated strings from file -----------\n");
1050
1051 UChar *str;
1052 int strlen = 0;
1053 // appending all the strings
1054 int linecount = 0;
1055 while (linecount < gNumFileLines) {
1056 strlen += haslen?gFileLines[linecount].len:
1057 u_strlen(gFileLines[linecount].name);
1058 linecount ++;
1059 }
1060 str = (UChar *)malloc(sizeof(UChar) * strlen);
1061 int strindex = 0;
1062 linecount = 0;
1063 while (strindex < strlen) {
1064 int len = 0;
1065 len += haslen?gFileLines[linecount].len:
1066 u_strlen(gFileLines[linecount].name);
1067 memcpy(str + strindex, gFileLines[linecount].name,
1068 sizeof(UChar) * len);
1069 strindex += len;
1070 linecount ++;
1071 }
1072
1073 printf("Total size of strings %d\n", strlen);
1074
1075 gCount = 0;
1076 count = 0;
1077
1078 if (!haslen) {
1079 strlen = -1;
1080 }
1081
1082 iter = ucol_openElements(gCol, str, strlen, &error);
1083 if (!haslen) {
1084 strlen = u_strlen(str);
1085 }
1086
1087 startTime = timeGetTime();
1088 while (count < opt_loopCount) {
1089 int count5 = 5;
1090 strindex = 5;
1091 ucol_setOffset(iter, strindex, &error);
1092 while (TRUE) {
1093 if (ucol_previous(iter, &error) == UCOL_NULLORDER) {
1094 break;
1095 }
1096 gCount ++;
1097 count5 --;
1098 if (count5 == 0) {
1099 strindex += 10;
1100 if (strindex > strlen) {
1101 break;
1102 }
1103 ucol_setOffset(iter, strindex, &error);
1104 count5 = 5;
1105 }
1106 }
1107 count ++;
1108 }
1109
1110 elapsedTime = timeGetTime() - startTime;
1111 printf("elapsedTime %ld\n", elapsedTime);
1112
1113 // empty loop recalculation
1114 count = 0;
1115 int tempgCount = 0;
1116 startTime = timeGetTime();
1117 while (count < opt_loopCount) {
1118 int count5 = 5;
1119 strindex = 5;
1120 ucol_setOffset(iter, strindex, &error);
1121 while (TRUE) {
1122 tempgCount ++;
1123 count5 --;
1124 if (count5 == 0) {
1125 strindex += 10;
1126 if (strindex > strlen) {
1127 break;
1128 }
1129 ucol_setOffset(iter, strindex, &error);
1130 count5 = 5;
1131 }
1132 }
1133 count ++;
1134 }
1135 elapsedTime -= (timeGetTime() - startTime);
1136 printf("elapsedTime %ld\n", elapsedTime);
1137 ucol_closeElements(iter);
1138
1139 printf("gCount %d\n", gCount);
1140 ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
1141 printf("Average time per ucol_previous() nano seconds %d\n", ns);
1142 }
1143
1144 //---------------------------------------------------------------------------------------
1145 //
1146 // doIterTest() Iteration test
1147 //
1148 //---------------------------------------------------------------------------------------
doIterTest()1149 void doIterTest() {
1150 doForwardIterTest(opt_uselen);
1151 doBackwardIterTest(opt_uselen);
1152 }
1153
1154
1155 //----------------------------------------------------------------------------------------
1156 //
1157 // UnixConvert -- Convert the lines of the file to the encoding for UNIX
1158 // Since it appears that Unicode support is going in the general
1159 // direction of the use of UTF-8 locales, that is the approach
1160 // that is used here.
1161 //
1162 //----------------------------------------------------------------------------------------
UnixConvert()1163 void UnixConvert() {
1164 int line;
1165
1166 UConverter *cvrtr; // An ICU code page converter.
1167 UErrorCode status = U_ZERO_ERROR;
1168
1169
1170 cvrtr = ucnv_open("utf-8", &status); // we are just doing UTF-8 locales for now.
1171 if (U_FAILURE(status)) {
1172 fprintf(stderr, "ICU Converter open failed.: %s\n", u_errorName(status));
1173 exit(-1);
1174 }
1175
1176 for (line=0; line < gNumFileLines; line++) {
1177 int sizeNeeded = ucnv_fromUChars(cvrtr,
1178 0, // ptr to target buffer.
1179 0, // length of target buffer.
1180 gFileLines[line].name,
1181 -1, // source is null terminated
1182 &status);
1183 if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {
1184 //fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
1185 //exit(-1);
1186 }
1187 status = U_ZERO_ERROR;
1188 gFileLines[line].unixName = new char[sizeNeeded+1];
1189 sizeNeeded = ucnv_fromUChars(cvrtr,
1190 gFileLines[line].unixName, // ptr to target buffer.
1191 sizeNeeded+1, // length of target buffer.
1192 gFileLines[line].name,
1193 -1, // source is null terminated
1194 &status);
1195 if (U_FAILURE(status)) {
1196 fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
1197 exit(-1);
1198 }
1199 gFileLines[line].unixName[sizeNeeded] = 0;
1200 };
1201 ucnv_close(cvrtr);
1202 }
1203
1204
1205 //----------------------------------------------------------------------------------------
1206 //
1207 // class UCharFile Class to hide all the gorp to read a file in
1208 // and produce a stream of UChars.
1209 //
1210 //----------------------------------------------------------------------------------------
1211 class UCharFile {
1212 public:
1213 UCharFile(const char *fileName);
1214 ~UCharFile();
1215 UChar get();
eof()1216 UBool eof() {return fEof;};
error()1217 UBool error() {return fError;};
1218
1219 private:
UCharFile(const UCharFile &)1220 UCharFile (const UCharFile & /*other*/) {}; // No copy constructor.
operator =(const UCharFile &)1221 UCharFile & operator = (const UCharFile &/*other*/) {return *this;}; // No assignment op
1222
1223 FILE *fFile;
1224 const char *fName;
1225 UBool fEof;
1226 UBool fError;
1227 UChar fPending2ndSurrogate;
1228
1229 enum {UTF16LE, UTF16BE, UTF8} fEncoding;
1230 };
1231
UCharFile(const char * fileName)1232 UCharFile::UCharFile(const char * fileName) {
1233 fEof = FALSE;
1234 fError = FALSE;
1235 fName = fileName;
1236 fFile = fopen(fName, "rb");
1237 fPending2ndSurrogate = 0;
1238 if (fFile == NULL) {
1239 fprintf(stderr, "Can not open file \"%s\"\n", opt_fName);
1240 fError = TRUE;
1241 return;
1242 }
1243 //
1244 // Look for the byte order mark at the start of the file.
1245 //
1246 int BOMC1, BOMC2, BOMC3;
1247 BOMC1 = fgetc(fFile);
1248 BOMC2 = fgetc(fFile);
1249
1250 if (BOMC1 == 0xff && BOMC2 == 0xfe) {
1251 fEncoding = UTF16LE; }
1252 else if (BOMC1 == 0xfe && BOMC2 == 0xff) {
1253 fEncoding = UTF16BE; }
1254 else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) {
1255 fEncoding = UTF8; }
1256 else
1257 {
1258 fprintf(stderr, "collperf: file \"%s\" encoding must be UTF-8 or UTF-16, and "
1259 "must include a BOM.\n", fileName);
1260 fError = true;
1261 return;
1262 }
1263 }
1264
1265
~UCharFile()1266 UCharFile::~UCharFile() {
1267 fclose(fFile);
1268 }
1269
1270
1271
get()1272 UChar UCharFile::get() {
1273 UChar c;
1274 switch (fEncoding) {
1275 case UTF16LE:
1276 {
1277 int cL, cH;
1278 cL = fgetc(fFile);
1279 cH = fgetc(fFile);
1280 c = cL | (cH << 8);
1281 if (cH == EOF) {
1282 c = 0;
1283 fEof = TRUE;
1284 }
1285 break;
1286 }
1287 case UTF16BE:
1288 {
1289 int cL, cH;
1290 cH = fgetc(fFile);
1291 cL = fgetc(fFile);
1292 c = cL | (cH << 8);
1293 if (cL == EOF) {
1294 c = 0;
1295 fEof = TRUE;
1296 }
1297 break;
1298 }
1299 case UTF8:
1300 {
1301 if (fPending2ndSurrogate != 0) {
1302 c = fPending2ndSurrogate;
1303 fPending2ndSurrogate = 0;
1304 break;
1305 }
1306
1307 int ch = fgetc(fFile); // Note: c and ch are separate cause eof test doesn't work on UChar type.
1308 if (ch == EOF) {
1309 c = 0;
1310 fEof = TRUE;
1311 break;
1312 }
1313
1314 if (ch <= 0x7f) {
1315 // It's ascii. No further utf-8 conversion.
1316 c = ch;
1317 break;
1318 }
1319
1320 // Figure out the lenght of the char and read the rest of the bytes
1321 // into a temp array.
1322 int nBytes;
1323 if (ch >= 0xF0) {nBytes=4;}
1324 else if (ch >= 0xE0) {nBytes=3;}
1325 else if (ch >= 0xC0) {nBytes=2;}
1326 else {
1327 fprintf(stderr, "utf-8 encoded file contains corrupt data.\n");
1328 fError = TRUE;
1329 return 0;
1330 }
1331
1332 unsigned char bytes[10];
1333 bytes[0] = (unsigned char)ch;
1334 int i;
1335 for (i=1; i<nBytes; i++) {
1336 bytes[i] = fgetc(fFile);
1337 if (bytes[i] < 0x80 || bytes[i] >= 0xc0) {
1338 fprintf(stderr, "utf-8 encoded file contains corrupt data.\n");
1339 fError = TRUE;
1340 return 0;
1341 }
1342 }
1343
1344 // Convert the bytes from the temp array to a Unicode char.
1345 i = 0;
1346 uint32_t cp;
1347 U8_NEXT_UNSAFE(bytes, i, cp);
1348 c = (UChar)cp;
1349
1350 if (cp >= 0x10000) {
1351 // The code point needs to be broken up into a utf-16 surrogate pair.
1352 // Process first half this time through the main loop, and
1353 // remember the other half for the next time through.
1354 UChar utf16Buf[3];
1355 i = 0;
1356 UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp);
1357 fPending2ndSurrogate = utf16Buf[1];
1358 c = utf16Buf[0];
1359 }
1360 break;
1361 };
1362 default:
1363 c = 0xFFFD; /* Error, unspecified codepage*/
1364 fprintf(stderr, "UCharFile: Error: unknown fEncoding\n");
1365 exit(1);
1366 }
1367 return c;
1368 }
1369
1370 //----------------------------------------------------------------------------------------
1371 //
1372 // openRulesCollator - Command line specified a rules file. Read it in
1373 // and open a collator with it.
1374 //
1375 //----------------------------------------------------------------------------------------
openRulesCollator()1376 UCollator *openRulesCollator() {
1377 UCharFile f(opt_rules);
1378 if (f.error()) {
1379 return 0;
1380 }
1381
1382 int bufLen = 10000;
1383 UChar *buf = (UChar *)malloc(bufLen * sizeof(UChar));
1384 UChar *tmp;
1385 int i = 0;
1386
1387 for(;;) {
1388 buf[i] = f.get();
1389 if (f.eof()) {
1390 break;
1391 }
1392 if (f.error()) {
1393 return 0;
1394 }
1395 i++;
1396 if (i >= bufLen) {
1397 tmp = buf;
1398 bufLen += 10000;
1399 buf = (UChar *)realloc(buf, bufLen);
1400 if (buf == NULL) {
1401 free(tmp);
1402 return 0;
1403 }
1404 }
1405 }
1406 buf[i] = 0;
1407
1408 UErrorCode status = U_ZERO_ERROR;
1409 UCollator *coll = ucol_openRules(buf, u_strlen(buf), UCOL_OFF,
1410 UCOL_DEFAULT_STRENGTH, NULL, &status);
1411 if (U_FAILURE(status)) {
1412 fprintf(stderr, "ICU ucol_openRules() open failed.: %d\n", status);
1413 return 0;
1414 }
1415 free(buf);
1416 return coll;
1417 }
1418
1419
1420
1421
1422
1423 //----------------------------------------------------------------------------------------
1424 //
1425 // Main -- process command line, read in and pre-process the test file,
1426 // call other functions to do the actual tests.
1427 //
1428 //----------------------------------------------------------------------------------------
main(int argc,const char ** argv)1429 int main(int argc, const char** argv) {
1430 if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) {
1431 printf(gUsageString);
1432 exit (1);
1433 }
1434
1435 // Make sure that we've only got one API selected.
1436 if (opt_unix || opt_win) opt_icu = FALSE;
1437 if (opt_unix) opt_win = FALSE;
1438
1439 //
1440 // Set up an ICU collator
1441 //
1442 UErrorCode status = U_ZERO_ERROR;
1443
1444 if (opt_rules != 0) {
1445 gCol = openRulesCollator();
1446 if (gCol == 0) {return -1;}
1447 }
1448 else {
1449 gCol = ucol_open(opt_locale, &status);
1450 if (U_FAILURE(status)) {
1451 fprintf(stderr, "Collator creation failed.: %d\n", status);
1452 return -1;
1453 }
1454 }
1455 if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) {
1456 fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale);
1457 }
1458 if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) {
1459 fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale);
1460 }
1461
1462 if (opt_norm) {
1463 ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
1464 }
1465 if (opt_french && opt_frenchoff) {
1466 fprintf(stderr, "collperf: Error, specified both -french and -frenchoff options.");
1467 exit(-1);
1468 }
1469 if (opt_french) {
1470 ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_ON, &status);
1471 }
1472 if (opt_frenchoff) {
1473 ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_OFF, &status);
1474 }
1475 if (opt_lower) {
1476 ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_LOWER_FIRST, &status);
1477 }
1478 if (opt_upper) {
1479 ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_UPPER_FIRST, &status);
1480 }
1481 if (opt_case) {
1482 ucol_setAttribute(gCol, UCOL_CASE_LEVEL, UCOL_ON, &status);
1483 }
1484 if (opt_shifted) {
1485 ucol_setAttribute(gCol, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, &status);
1486 }
1487 if (opt_level != 0) {
1488 switch (opt_level) {
1489 case 1:
1490 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_PRIMARY, &status);
1491 break;
1492 case 2:
1493 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_SECONDARY, &status);
1494 break;
1495 case 3:
1496 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_TERTIARY, &status);
1497 break;
1498 case 4:
1499 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_QUATERNARY, &status);
1500 break;
1501 case 5:
1502 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_IDENTICAL, &status);
1503 break;
1504 default:
1505 fprintf(stderr, "-level param must be between 1 and 5\n");
1506 exit(-1);
1507 }
1508 }
1509
1510 if (U_FAILURE(status)) {
1511 fprintf(stderr, "Collator attribute setting failed.: %d\n", status);
1512 return -1;
1513 }
1514
1515
1516 //
1517 // Set up a Windows LCID
1518 //
1519 if (opt_langid != 0) {
1520 gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);
1521 }
1522 else {
1523 gWinLCID = uloc_getLCID(opt_locale);
1524 }
1525
1526
1527 //
1528 // Set the UNIX locale
1529 //
1530 if (opt_unix) {
1531 if (setlocale(LC_ALL, opt_locale) == 0) {
1532 fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale);
1533 exit(-1);
1534 }
1535 }
1536
1537 // Read in the input file.
1538 // File assumed to be utf-16.
1539 // Lines go onto heap buffers. Global index array to line starts is created.
1540 // Lines themselves are null terminated.
1541 //
1542
1543 UCharFile f(opt_fName);
1544 if (f.error()) {
1545 exit(-1);
1546 }
1547
1548 const int MAXLINES = 100000;
1549 gFileLines = new Line[MAXLINES];
1550 UChar buf[1024];
1551 int column = 0;
1552
1553 // Read the file, split into lines, and save in memory.
1554 // Loop runs once per utf-16 value from the input file,
1555 // (The number of bytes read from file per loop iteration depends on external encoding.)
1556 for (;;) {
1557
1558 UChar c = f.get();
1559 if (f.error()){
1560 exit(-1);
1561 }
1562
1563
1564 // We now have a good UTF-16 value in c.
1565
1566 // Watch for CR, LF, EOF; these finish off a line.
1567 if (c == 0xd) {
1568 continue;
1569 }
1570
1571 if (f.eof() || c == 0x0a || c==0x2028) { // Unipad inserts 2028 line separators!
1572 buf[column++] = 0;
1573 if (column > 1) {
1574 gFileLines[gNumFileLines].name = new UChar[column];
1575 gFileLines[gNumFileLines].len = column-1;
1576 memcpy(gFileLines[gNumFileLines].name, buf, column * sizeof(UChar));
1577 gNumFileLines++;
1578 column = 0;
1579 if (gNumFileLines >= MAXLINES) {
1580 fprintf(stderr, "File too big. Max number of lines is %d\n", MAXLINES);
1581 exit(-1);
1582 }
1583
1584 }
1585 if (c == 0xa || c == 0x2028)
1586 continue;
1587 else
1588 break; // EOF
1589 }
1590 buf[column++] = c;
1591 if (column >= 1023)
1592 {
1593 static UBool warnFlag = TRUE;
1594 if (warnFlag) {
1595 fprintf(stderr, "Warning - file line longer than 1023 chars truncated.\n");
1596 warnFlag = FALSE;
1597 }
1598 column--;
1599 }
1600 }
1601
1602 if (opt_terse == FALSE) {
1603 printf("file \"%s\", %d lines.\n", opt_fName, gNumFileLines);
1604 }
1605
1606
1607 // Convert the lines to the UNIX encoding.
1608 if (opt_unix) {
1609 UnixConvert();
1610 }
1611
1612 //
1613 // Pre-compute ICU sort keys for the lines of the file.
1614 //
1615 int line;
1616 int32_t t;
1617
1618 for (line=0; line<gNumFileLines; line++) {
1619 t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)buf, sizeof(buf));
1620 gFileLines[line].icuSortKey = new char[t];
1621
1622 if (t > (int32_t)sizeof(buf)) {
1623 t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)gFileLines[line].icuSortKey , t);
1624 }
1625 else
1626 {
1627 memcpy(gFileLines[line].icuSortKey, buf, t);
1628 }
1629 }
1630
1631
1632
1633 //
1634 // Pre-compute Windows sort keys for the lines of the file.
1635 //
1636 for (line=0; line<gNumFileLines; line++) {
1637 t=LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, buf, sizeof(buf));
1638 gFileLines[line].winSortKey = new char[t];
1639 if (t > (int32_t)sizeof(buf)) {
1640 t = LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, (unsigned short *)(gFileLines[line].winSortKey), t);
1641 }
1642 else
1643 {
1644 memcpy(gFileLines[line].winSortKey, buf, t);
1645 }
1646 }
1647
1648 //
1649 // Pre-compute UNIX sort keys for the lines of the file.
1650 //
1651 if (opt_unix) {
1652 for (line=0; line<gNumFileLines; line++) {
1653 t=strxfrm((char *)buf, gFileLines[line].unixName, sizeof(buf));
1654 gFileLines[line].unixSortKey = new char[t];
1655 if (t > (int32_t)sizeof(buf)) {
1656 t = strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixName, sizeof(buf));
1657 }
1658 else
1659 {
1660 memcpy(gFileLines[line].unixSortKey, buf, t);
1661 }
1662 }
1663 }
1664
1665
1666 //
1667 // Dump file lines, CEs, Sort Keys if requested.
1668 //
1669 if (opt_dump) {
1670 int i;
1671 for (line=0; line<gNumFileLines; line++) {
1672 for (i=0;;i++) {
1673 UChar c = gFileLines[line].name[i];
1674 if (c == 0)
1675 break;
1676 if (c < 0x20 || c > 0x7e) {
1677 printf("\\u%.4x", c);
1678 }
1679 else {
1680 printf("%c", c);
1681 }
1682 }
1683 printf("\n");
1684
1685 printf(" CEs: ");
1686 UCollationElements *CEiter = ucol_openElements(gCol, gFileLines[line].name, -1, &status);
1687 int32_t ce;
1688 i = 0;
1689 for (;;) {
1690 ce = ucol_next(CEiter, &status);
1691 if (ce == UCOL_NULLORDER) {
1692 break;
1693 }
1694 printf(" %.8x", ce);
1695 if (++i > 8) {
1696 printf("\n ");
1697 i = 0;
1698 }
1699 }
1700 printf("\n");
1701 ucol_closeElements(CEiter);
1702
1703
1704 printf(" ICU Sort Key: ");
1705 for (i=0; ; i++) {
1706 unsigned char c = gFileLines[line].icuSortKey[i];
1707 printf("%02x ", c);
1708 if (c == 0) {
1709 break;
1710 }
1711 if (i > 0 && i % 20 == 0) {
1712 printf("\n ");
1713 }
1714 }
1715 printf("\n");
1716 }
1717 }
1718
1719
1720 //
1721 // Pre-sort the lines.
1722 //
1723 int i;
1724 gSortedLines = new Line *[gNumFileLines];
1725 for (i=0; i<gNumFileLines; i++) {
1726 gSortedLines[i] = &gFileLines[i];
1727 }
1728
1729 if (opt_win) {
1730 qsort(gSortedLines, gNumFileLines, sizeof(Line *), Winstrcmp);
1731 }
1732 else if (opt_unix) {
1733 qsort(gSortedLines, gNumFileLines, sizeof(Line *), UNIXstrcmp);
1734 }
1735 else /* ICU */
1736 {
1737 qsort(gSortedLines, gNumFileLines, sizeof(Line *), ICUstrcmp);
1738 }
1739
1740
1741 //
1742 // Make up a randomized order, will be used for sorting tests.
1743 //
1744 gRandomLines = new Line *[gNumFileLines];
1745 for (i=0; i<gNumFileLines; i++) {
1746 gRandomLines[i] = &gFileLines[i];
1747 }
1748 qsort(gRandomLines, gNumFileLines, sizeof(Line *), ICURandomCmp);
1749
1750
1751
1752
1753 //
1754 // We've got the file read into memory. Go do something with it.
1755 //
1756
1757 if (opt_qsort) doQSort();
1758 if (opt_binsearch) doBinarySearch();
1759 if (opt_keygen) doKeyGen();
1760 if (opt_keyhist) doKeyHist();
1761 if (opt_itertest) doIterTest();
1762
1763 return 0;
1764
1765 }
1766