1 /***********************************************************************
2 * © 2016 and later: Unicode, Inc. and others.
3 * License & terms of use: http://www.unicode.org/copyright.html
4 ***********************************************************************
5 ***********************************************************************
6 * COPYRIGHT:
7 * Copyright (C) 2001-2012 IBM, Inc. All Rights Reserved.
8 *
9 ***********************************************************************/
10 /********************************************************************************
11 *
12 * File CALLCOLL.C
13 *
14 * Modification History:
15 * Name Description
16 * Andy Heninger First Version
17 *
18 *********************************************************************************
19 */
20
21 //
22 // This program tests string collation and sort key generation performance.
23 // Three APIs can be tested: ICU C, Unix strcoll/strxfrm, and Windows LCMapString.
24 // A file of names is required as input, one per line. It must be in utf-8 or utf-16 format,
25 // and include a byte order mark. Either LE or BE format is OK.
26 //
27
// Help text describing every command-line option of this tool, one option
// per line.  NOTE(review): the code that prints it is not visible in this
// part of the file; presumably it is shown for -help or on an option error.
28 const char gUsageString[] =
29 "usage: collperf options...\n"
30 "-help Display this message.\n"
31 "-file file_name utf-16 format file of names.\n"
32 "-locale name ICU locale to use. Default is en_US\n"
33 "-rules file_name Collation rules file (overrides locale)\n"
34 "-langid 0x1234 Windows Language ID number. Default to value for -locale option\n"
35 " see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
36 "-win Run test using Windows native services. (ICU is default)\n"
37 "-unix Run test using Unix strxfrm, strcoll services.\n"
38 "-uselen Use API with string lengths. Default is null-terminated strings\n"
39 "-usekeys Run tests using sortkeys rather than strcoll\n"
40 "-strcmp Run tests using u_strcmp rather than strcoll\n"
41 "-strcmpCPO Run tests using u_strcmpCodePointOrder rather than strcoll\n"
42 "-loop nnnn Loopcount for test. Adjust for reasonable total running time.\n"
43 "-iloop n Inner Loop Count. Default = 1. Number of calls to function\n"
44 " under test at each call point. For measuring test overhead.\n"
45 "-terse Terse numbers-only output. Intended for use by scripts.\n"
46 "-french French accent ordering\n"
47 "-frenchoff No French accent ordering (for use with French locales.)\n"
48 "-norm Normalizing mode on\n"
49 "-shifted Shifted mode\n"
50 "-lower Lower case first\n"
51 "-upper Upper case first\n"
52 "-case Enable separate case level\n"
53 "-level n Sort level, 1 to 5, for Primary, Secndary, Tertiary, Quaternary, Identical\n"
54 "-keyhist Produce a table sort key size vs. string length\n"
55 "-binsearch Binary Search timing test\n"
56 "-keygen Sort Key Generation timing test\n"
57 "-qsort Quicksort timing test\n"
58 "-iter Iteration Performance Test\n"
59 "-dump Display strings, sort keys and CEs.\n"
60 ;
61
62
63
64 #include <stdio.h>
65 #include <string.h>
66 #include <stdlib.h>
67 #include <math.h>
68 #include <locale.h>
69 #include <errno.h>
70
71 #include <unicode/utypes.h>
72 #include <unicode/ucol.h>
73 #include <unicode/ucoleitr.h>
74 #include <unicode/uloc.h>
75 #include <unicode/ustring.h>
76 #include <unicode/ures.h>
77 #include <unicode/uchar.h>
78 #include <unicode/ucnv.h>
79 #include <unicode/utf8.h>
80
81 #ifdef WIN32
82 #include <windows.h>
83 #else
84 //
85 // Stubs for Windows API functions when building on UNIXes.
86 //
87 typedef int DWORD;
CompareStringW(DWORD,DWORD,UChar *,int,UChar *,int)88 inline int CompareStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}
89 #include <sys/time.h>
// Stand-in for the Win32 timeGetTime() used when not building for WIN32.
//
// Returns the time reported by gettimeofday(), converted to milliseconds.
// Callers only ever subtract two return values to get an elapsed time, so
// the result is allowed to wrap around.  The conversion is carried out in
// unsigned long so that this wrap-around is well defined; multiplying the
// signed tv_sec directly could overflow, which is undefined behavior.
unsigned long timeGetTime() {
    struct timeval now;
    gettimeofday(&now, 0);
    unsigned long millis = (unsigned long)now.tv_sec * 1000UL;
    millis += (unsigned long)now.tv_usec / 1000UL;
    return millis;
}
LCMapStringW(DWORD,DWORD,UChar *,int,UChar *,int)97 inline int LCMapStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}
// Placeholder definitions of Windows names for the non-WIN32 build.  They all
// evaluate to 0; the stub functions in this branch ignore their arguments.
98 const int LCMAP_SORTKEY = 0;
99 #define MAKELCID(a,b) 0
100 const int SORT_DEFAULT = 0;
101 #endif
102
103
104
105 //
106 // Command line option variables
107 // These global variables are set according to the options specified
108 // on the command line by the user.
109 char * opt_fName = 0; // -file: name of the input file.
110 const char * opt_locale = "en_US"; // -locale
111 int opt_langid = 0; // Defaults to value corresponding to opt_locale.
112 char * opt_rules = 0; // -rules: name of a collation rules file.
113 UBool opt_help = false; // -help or -?
114 int opt_loopCount = 1; // -loop
115 int opt_iLoopCount = 1; // -iloop: calls to the function under test per call point.
116 UBool opt_terse = false; // -terse: numbers-only output.
117 UBool opt_qsort = false; // -qsort
118 UBool opt_binsearch = false; // -binsearch
119 UBool opt_icu = true; // No entry in the option table; starts out true (ICU is the default).
120 UBool opt_win = false; // Run with Windows native functions.
121 UBool opt_unix = false; // Run with UNIX strcoll, strxfrm functions.
122 UBool opt_uselen = false; // -uselen: pass explicit string lengths instead of -1.
123 UBool opt_usekeys = false; // -usekeys: compare sort keys instead of strings.
124 UBool opt_strcmp = false; // -strcmp
125 UBool opt_strcmpCPO = false; // -strcmpCPO
126 UBool opt_norm = false; // -norm
127 UBool opt_keygen = false; // -keygen
128 UBool opt_french = false; // -french
129 UBool opt_frenchoff = false; // -frenchoff
130 UBool opt_shifted = false; // -shifted
131 UBool opt_lower = false; // -lower
132 UBool opt_upper = false; // -upper
133 UBool opt_case = false; // -case
134 int opt_level = 0; // -level
135 UBool opt_keyhist = false; // -keyhist
136 UBool opt_itertest = false; // -iter
137 UBool opt_dump = false; // -dump
138
139
140
141 //
142 // Definitions for the command line options
143 //
// One entry of the command-line option table consumed by ProcessOptions().
144 struct OptSpec {
145 const char *name; // Option text as typed on the command line, e.g. "-file"; 0 ends the table.
146 enum {FLAG, NUM, STRING} type; // FLAG takes no value; NUM and STRING consume the next argument.
147 void *pVar; // Destination variable: UBool for FLAG, int for NUM, char pointer for STRING.
148 };
149
// Table of all recognized options.  Each row gives the option name, its kind,
// and the address of the global variable that receives the value.  The type of
// that variable must match the kind (UBool for FLAG, int for NUM, char pointer
// for STRING) because ProcessOptions() stores through a cast of pVar.
// The final row, with a null name, terminates the table.
150 OptSpec opts[] = {
151 {"-file", OptSpec::STRING, &opt_fName},
152 {"-locale", OptSpec::STRING, &opt_locale},
153 {"-langid", OptSpec::NUM, &opt_langid},
154 {"-rules", OptSpec::STRING, &opt_rules},
155 {"-qsort", OptSpec::FLAG, &opt_qsort},
156 {"-binsearch", OptSpec::FLAG, &opt_binsearch},
157 {"-iter", OptSpec::FLAG, &opt_itertest},
158 {"-win", OptSpec::FLAG, &opt_win},
159 {"-unix", OptSpec::FLAG, &opt_unix},
160 {"-uselen", OptSpec::FLAG, &opt_uselen},
161 {"-usekeys", OptSpec::FLAG, &opt_usekeys},
162 {"-strcmp", OptSpec::FLAG, &opt_strcmp},
163 {"-strcmpCPO", OptSpec::FLAG, &opt_strcmpCPO},
164 {"-norm", OptSpec::FLAG, &opt_norm},
165 {"-french", OptSpec::FLAG, &opt_french},
166 {"-frenchoff", OptSpec::FLAG, &opt_frenchoff},
167 {"-shifted", OptSpec::FLAG, &opt_shifted},
168 {"-lower", OptSpec::FLAG, &opt_lower},
169 {"-upper", OptSpec::FLAG, &opt_upper},
170 {"-case", OptSpec::FLAG, &opt_case},
171 {"-level", OptSpec::NUM, &opt_level},
172 {"-keyhist", OptSpec::FLAG, &opt_keyhist},
173 {"-keygen", OptSpec::FLAG, &opt_keygen},
174 {"-loop", OptSpec::NUM, &opt_loopCount},
175 {"-iloop", OptSpec::NUM, &opt_iLoopCount},
176 {"-terse", OptSpec::FLAG, &opt_terse},
177 {"-dump", OptSpec::FLAG, &opt_dump},
178 {"-help", OptSpec::FLAG, &opt_help},
179 {"-?", OptSpec::FLAG, &opt_help},
180 {0, OptSpec::FLAG, 0}
181 };
182
183
184 //---------------------------------------------------------------------------
185 //
186 // Global variables pointing to and describing the test file
187 //
188 //---------------------------------------------------------------------------
189
190 //
191 // struct Line
192 //
193 // Each line from the source file (containing a name, presumably) gets
194 // one of these structs.
195 //
196 struct Line {
197 UChar *name; // The line's text as UTF-16; also read as a null-terminated string.
198 int len; // Length of name, passed to the APIs when -uselen is given.
199 char *winSortKey; // Buffer that receives the LCMapStringW sort key.
200 char *icuSortKey; // Buffer that receives the ucol_getSortKey sort key.
201 char *unixSortKey; // Buffer that receives the strxfrm result.
202 char *unixName; // char-string form of the name, handed to strcoll / strxfrm.
203 };
204
205
206
207 Line *gFileLines; // Ptr to array of Line structs, one per line in the file.
208 int gNumFileLines; // Number of entries in gFileLines (and in the two arrays below).
209 UCollator *gCol; // Collator used by all ICU tests.
210 DWORD gWinLCID; // Locale ID passed to the Windows comparison / sort key functions.
211
212 Line **gSortedLines; // Lines in sorted order; searched by the binary search test.
213 Line **gRandomLines; // Lines in randomized order; input to the qsort test.
214 int gCount; // Running count of compares / iteration steps made by the current test.
215
216
217
218 //---------------------------------------------------------------------------
219 //
220 // ProcessOptions() Function to read the command line options.
221 //
222 //---------------------------------------------------------------------------
ProcessOptions(int argc,const char ** argv,OptSpec opts[])223 UBool ProcessOptions(int argc, const char **argv, OptSpec opts[])
224 {
225 int i;
226 int argNum;
227 const char *pArgName;
228 OptSpec *pOpt;
229
230 for (argNum=1; argNum<argc; argNum++) {
231 pArgName = argv[argNum];
232 for (pOpt = opts; pOpt->name != 0; pOpt++) {
233 if (strcmp(pOpt->name, pArgName) == 0) {
234 switch (pOpt->type) {
235 case OptSpec::FLAG:
236 *(UBool *)(pOpt->pVar) = true;
237 break;
238 case OptSpec::STRING:
239 argNum ++;
240 if (argNum >= argc) {
241 fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
242 return false;
243 }
244 *(const char **)(pOpt->pVar) = argv[argNum];
245 break;
246 case OptSpec::NUM:
247 argNum ++;
248 if (argNum >= argc) {
249 fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
250 return false;
251 }
252 char *endp;
253 i = strtol(argv[argNum], &endp, 0);
254 if (endp == argv[argNum]) {
255 fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name);
256 return false;
257 }
258 *(int *)(pOpt->pVar) = i;
259 }
260 break;
261 }
262 }
263 if (pOpt->name == 0)
264 {
265 fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName);
266 return false;
267 }
268 }
269 return true;
270 }
271
272 //---------------------------------------------------------------------------------------
273 //
274 // Comparison functions for use by qsort.
275 //
276 // Seven flavors: ICU or Windows, each with sort keys, with string lengths, or
277 // with null-terminated strings, plus UNIX strcoll on null-terminated strings.
278 //
279 //---------------------------------------------------------------------------------------
ICUstrcmpK(const void * a,const void * b)280 int ICUstrcmpK(const void *a, const void *b) {
281 gCount++;
282 int t = strcmp((*(Line **)a)->icuSortKey, (*(Line **)b)->icuSortKey);
283 return t;
284 }
285
286
ICUstrcmpL(const void * a,const void * b)287 int ICUstrcmpL(const void *a, const void *b) {
288 gCount++;
289 UCollationResult t;
290 t = ucol_strcoll(gCol, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len);
291 if (t == UCOL_LESS) return -1;
292 if (t == UCOL_GREATER) return +1;
293 return 0;
294 }
295
296
ICUstrcmp(const void * a,const void * b)297 int ICUstrcmp(const void *a, const void *b) {
298 gCount++;
299 UCollationResult t;
300 t = ucol_strcoll(gCol, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1);
301 if (t == UCOL_LESS) return -1;
302 if (t == UCOL_GREATER) return +1;
303 return 0;
304 }
305
306
Winstrcmp(const void * a,const void * b)307 int Winstrcmp(const void *a, const void *b) {
308 gCount++;
309 int t;
310 t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1);
311 return t-2;
312 }
313
314
UNIXstrcmp(const void * a,const void * b)315 int UNIXstrcmp(const void *a, const void *b) {
316 gCount++;
317 int t;
318 t = strcoll((*(Line **)a)->unixName, (*(Line **)b)->unixName);
319 return t;
320 }
321
322
WinstrcmpL(const void * a,const void * b)323 int WinstrcmpL(const void *a, const void *b) {
324 gCount++;
325 int t;
326 t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len);
327 return t-2;
328 }
329
330
WinstrcmpK(const void * a,const void * b)331 int WinstrcmpK(const void *a, const void *b) {
332 gCount++;
333 int t = strcmp((*(Line **)a)->winSortKey, (*(Line **)b)->winSortKey);
334 return t;
335 }
336
337
338 //---------------------------------------------------------------------------------------
339 //
340 // Function for sorting the names (lines) into a random order.
341 // Order is based on a hash of the ICU Sort key for the lines
342 // The randomized order is used as input for the sorting timing tests.
343 //
344 //---------------------------------------------------------------------------------------
ICURandomCmp(const void * a,const void * b)345 int ICURandomCmp(const void *a, const void *b) {
346 char *ask = (*(Line **)a)->icuSortKey;
347 char *bsk = (*(Line **)b)->icuSortKey;
348 int aVal = 0;
349 int bVal = 0;
350 int retVal;
351 while (*ask != 0) {
352 aVal += aVal*37 + *ask++;
353 }
354 while (*bsk != 0) {
355 bVal += bVal*37 + *bsk++;
356 }
357 retVal = -1;
358 if (aVal == bVal) {
359 retVal = 0;
360 }
361 else if (aVal > bVal) {
362 retVal = 1;
363 }
364 return retVal;
365 }
366
367 //---------------------------------------------------------------------------------------
368 //
369 // doKeyGen() Key Generation Timing Test
370 //
371 //---------------------------------------------------------------------------------------
doKeyGen()372 void doKeyGen()
373 {
374 int line;
375 int loops = 0;
376 int iLoop;
377 int len=-1;
378
379 // Adjust loop count to compensate for file size. Should be order n
380 double dLoopCount = double(opt_loopCount) * (1000. / double(gNumFileLines));
381 int adj_loopCount = int(dLoopCount);
382 if (adj_loopCount < 1) adj_loopCount = 1;
383
384
385 unsigned long startTime = timeGetTime();
386
387 if (opt_win) {
388 for (loops=0; loops<adj_loopCount; loops++) {
389 for (line=0; line < gNumFileLines; line++) {
390 if (opt_uselen) {
391 len = gFileLines[line].len;
392 }
393 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
394 LCMapStringW(gWinLCID, LCMAP_SORTKEY,
395 gFileLines[line].name, len,
396 (UChar *)gFileLines[line].winSortKey, 5000); // TODO something with length.
397 }
398 }
399 }
400 }
401 else if (opt_icu)
402 {
403 for (loops=0; loops<adj_loopCount; loops++) {
404 for (line=0; line < gNumFileLines; line++) {
405 if (opt_uselen) {
406 len = gFileLines[line].len;
407 }
408 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
409 ucol_getSortKey(gCol, gFileLines[line].name, len, (unsigned char *)gFileLines[line].icuSortKey, 5000);
410 }
411 }
412 }
413 }
414 else if (opt_unix)
415 {
416 for (loops=0; loops<adj_loopCount; loops++) {
417 for (line=0; line < gNumFileLines; line++) {
418 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
419 strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixName, 5000);
420 }
421 }
422 }
423 }
424
425 unsigned long elapsedTime = timeGetTime() - startTime;
426 int ns = (int)(float(1000000) * (float)elapsedTime / (float)(adj_loopCount*gNumFileLines));
427
428 if (opt_terse == false) {
429 printf("Sort Key Generation: total # of keys = %d\n", loops*gNumFileLines);
430 printf("Sort Key Generation: time per key = %d ns\n", ns);
431 }
432 else {
433 printf("%d, ", ns);
434 }
435
436 int totalKeyLen = 0;
437 int totalChars = 0;
438 for (line=0; line<gNumFileLines; line++) {
439 totalChars += u_strlen(gFileLines[line].name);
440 if (opt_win) {
441 totalKeyLen += strlen(gFileLines[line].winSortKey);
442 }
443 else if (opt_icu) {
444 totalKeyLen += strlen(gFileLines[line].icuSortKey);
445 }
446 else if (opt_unix) {
447 totalKeyLen += strlen(gFileLines[line].unixSortKey);
448 }
449
450 }
451 if (opt_terse == false) {
452 printf("Key Length / character = %f\n", (float)totalKeyLen / (float)totalChars);
453 } else {
454 printf("%f, ", (float)totalKeyLen / (float)totalChars);
455 }
456 }
457
458
459
460 //---------------------------------------------------------------------------------------
461 //
462 // doBinarySearch() Binary Search timing test. Each name from the list
463 // is looked up in the full sorted list of names.
464 //
465 //---------------------------------------------------------------------------------------
doBinarySearch()466 void doBinarySearch()
467 {
468
469 gCount = 0;
470 int line;
471 int loops = 0;
472 int iLoop = 0;
473 unsigned long elapsedTime = 0;
474
475 // Adjust loop count to compensate for file size. Should be order n (lookups) * log n (compares/lookup)
476 // Accurate timings do not depend on this being perfect. The correction is just to try to
477 // get total running times of about the right order, so the that user doesn't need to
478 // manually adjust the loop count for every different file size.
479 double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileLines) * double(gNumFileLines));
480 if (opt_usekeys) dLoopCount *= 5;
481 int adj_loopCount = int(dLoopCount);
482 if (adj_loopCount < 1) adj_loopCount = 1;
483
484
485 for (;;) { // not really a loop, just allows "break" to work, to simplify
486 // inadvertantly running more than one test through here.
487 if (opt_strcmp || opt_strcmpCPO)
488 {
489 unsigned long startTime = timeGetTime();
490 typedef int32_t (U_EXPORT2 *PF)(const UChar *, const UChar *);
491 PF pf = u_strcmp;
492 if (opt_strcmpCPO) {pf = u_strcmpCodePointOrder;}
493 //if (opt_strcmp && opt_win) {pf = (PF)wcscmp;} // Damn the difference between int32_t and int
494 // which forces the use of a cast here.
495
496 int r = 0;
497 for (loops=0; loops<adj_loopCount; loops++) {
498
499 for (line=0; line < gNumFileLines; line++) {
500 int hi = gNumFileLines-1;
501 int lo = 0;
502 int guess = -1;
503 for (;;) {
504 int newGuess = (hi + lo) / 2;
505 if (newGuess == guess)
506 break;
507 guess = newGuess;
508 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
509 r = (*pf)((gSortedLines[line])->name, (gSortedLines[guess])->name);
510 }
511 gCount++;
512 if (r== 0)
513 break;
514 if (r < 0)
515 hi = guess;
516 else
517 lo = guess;
518 }
519 }
520 }
521 elapsedTime = timeGetTime() - startTime;
522 break;
523 }
524
525
526 if (opt_icu)
527 {
528 unsigned long startTime = timeGetTime();
529 UCollationResult r = UCOL_EQUAL;
530 for (loops=0; loops<adj_loopCount; loops++) {
531
532 for (line=0; line < gNumFileLines; line++) {
533 int lineLen = -1;
534 int guessLen = -1;
535 if (opt_uselen) {
536 lineLen = (gSortedLines[line])->len;
537 }
538 int hi = gNumFileLines-1;
539 int lo = 0;
540 int guess = -1;
541 for (;;) {
542 int newGuess = (hi + lo) / 2;
543 if (newGuess == guess)
544 break;
545 guess = newGuess;
546 int ri = 0;
547 if (opt_usekeys) {
548 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
549 ri = strcmp((gSortedLines[line])->icuSortKey, (gSortedLines[guess])->icuSortKey);
550 }
551 gCount++;
552 r=UCOL_GREATER; if(ri<0) {r=UCOL_LESS;} else if (ri==0) {r=UCOL_EQUAL;}
553 }
554 else
555 {
556 if (opt_uselen) {
557 guessLen = (gSortedLines[guess])->len;
558 }
559 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
560 r = ucol_strcoll(gCol, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen);
561 }
562 gCount++;
563 }
564 if (r== UCOL_EQUAL)
565 break;
566 if (r == UCOL_LESS)
567 hi = guess;
568 else
569 lo = guess;
570 }
571 }
572 }
573 elapsedTime = timeGetTime() - startTime;
574 break;
575 }
576
577 if (opt_win)
578 {
579 unsigned long startTime = timeGetTime();
580 int r = 0;
581 for (loops=0; loops<adj_loopCount; loops++) {
582
583 for (line=0; line < gNumFileLines; line++) {
584 int lineLen = -1;
585 int guessLen = -1;
586 if (opt_uselen) {
587 lineLen = (gSortedLines[line])->len;
588 }
589 int hi = gNumFileLines-1;
590 int lo = 0;
591 int guess = -1;
592 for (;;) {
593 int newGuess = (hi + lo) / 2;
594 if (newGuess == guess)
595 break;
596 guess = newGuess;
597 if (opt_usekeys) {
598 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
599 r = strcmp((gSortedLines[line])->winSortKey, (gSortedLines[guess])->winSortKey);
600 }
601 gCount++;
602 r+=2;
603 }
604 else
605 {
606 if (opt_uselen) {
607 guessLen = (gSortedLines[guess])->len;
608 }
609 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
610 r = CompareStringW(gWinLCID, 0, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen);
611 }
612 if (r == 0) {
613 if (opt_terse == false) {
614 fprintf(stderr, "Error returned from Windows CompareStringW.\n");
615 }
616 exit(-1);
617 }
618 gCount++;
619 }
620 if (r== 2) // strings ==
621 break;
622 if (r == 1) // line < guess
623 hi = guess;
624 else // line > guess
625 lo = guess;
626 }
627 }
628 }
629 elapsedTime = timeGetTime() - startTime;
630 break;
631 }
632
633 if (opt_unix)
634 {
635 unsigned long startTime = timeGetTime();
636 int r = 0;
637 for (loops=0; loops<adj_loopCount; loops++) {
638
639 for (line=0; line < gNumFileLines; line++) {
640 int hi = gNumFileLines-1;
641 int lo = 0;
642 int guess = -1;
643 for (;;) {
644 int newGuess = (hi + lo) / 2;
645 if (newGuess == guess)
646 break;
647 guess = newGuess;
648 if (opt_usekeys) {
649 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
650 r = strcmp((gSortedLines[line])->unixSortKey, (gSortedLines[guess])->unixSortKey);
651 }
652 gCount++;
653 }
654 else
655 {
656 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
657 r = strcoll((gSortedLines[line])->unixName, (gSortedLines[guess])->unixName);
658 }
659 errno = 0;
660 if (errno != 0) {
661 fprintf(stderr, "Error %d returned from strcoll.\n", errno);
662 exit(-1);
663 }
664 gCount++;
665 }
666 if (r == 0) // strings ==
667 break;
668 if (r < 0) // line < guess
669 hi = guess;
670 else // line > guess
671 lo = guess;
672 }
673 }
674 }
675 elapsedTime = timeGetTime() - startTime;
676 break;
677 }
678 break;
679 }
680
681 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
682 if (opt_terse == false) {
683 printf("binary search: total # of string compares = %d\n", gCount);
684 printf("binary search: compares per loop = %d\n", gCount / loops);
685 printf("binary search: time per compare = %d ns\n", ns);
686 } else {
687 printf("%d, ", ns);
688 }
689
690 }
691
692
693
694
695 //---------------------------------------------------------------------------------------
696 //
697 // doQSort() The quick sort timing test. Uses the C library qsort function.
698 //
699 //---------------------------------------------------------------------------------------
doQSort()700 void doQSort() {
701 int i;
702 Line **sortBuf = new Line *[gNumFileLines];
703
704 // Adjust loop count to compensate for file size. QSort should be n log(n)
705 double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileLines) * double(gNumFileLines));
706 if (opt_usekeys) dLoopCount *= 5;
707 int adj_loopCount = int(dLoopCount);
708 if (adj_loopCount < 1) adj_loopCount = 1;
709
710
711 gCount = 0;
712 unsigned long startTime = timeGetTime();
713 if (opt_win && opt_usekeys) {
714 for (i=0; i<opt_loopCount; i++) {
715 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
716 qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpK);
717 }
718 }
719
720 else if (opt_win && opt_uselen) {
721 for (i=0; i<adj_loopCount; i++) {
722 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
723 qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpL);
724 }
725 }
726
727
728 else if (opt_win && !opt_uselen) {
729 for (i=0; i<adj_loopCount; i++) {
730 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
731 qsort(sortBuf, gNumFileLines, sizeof(Line *), Winstrcmp);
732 }
733 }
734
735 else if (opt_icu && opt_usekeys) {
736 for (i=0; i<adj_loopCount; i++) {
737 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
738 qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpK);
739 }
740 }
741
742 else if (opt_icu && opt_uselen) {
743 for (i=0; i<adj_loopCount; i++) {
744 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
745 qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpL);
746 }
747 }
748
749
750 else if (opt_icu && !opt_uselen) {
751 for (i=0; i<adj_loopCount; i++) {
752 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
753 qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmp);
754 }
755 }
756
757 else if (opt_unix && !opt_usekeys) {
758 for (i=0; i<adj_loopCount; i++) {
759 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
760 qsort(sortBuf, gNumFileLines, sizeof(Line *), UNIXstrcmp);
761 }
762 }
763
764 unsigned long elapsedTime = timeGetTime() - startTime;
765 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
766 if (opt_terse == false) {
767 printf("qsort: total # of string compares = %d\n", gCount);
768 printf("qsort: time per compare = %d ns\n", ns);
769 } else {
770 printf("%d, ", ns);
771 }
772 }
773
774
775
776 //---------------------------------------------------------------------------------------
777 //
778 // doKeyHist() Output a table of data for
779 // average sort key size vs. string length.
780 //
781 //---------------------------------------------------------------------------------------
doKeyHist()782 void doKeyHist() {
783 int i;
784 int maxLen = 0;
785
786 // Find the maximum string length
787 for (i=0; i<gNumFileLines; i++) {
788 if (gFileLines[i].len > maxLen) maxLen = gFileLines[i].len;
789 }
790
791 // Allocate arrays to hold the histogram data
792 int *accumulatedLen = new int[maxLen+1];
793 int *numKeysOfSize = new int[maxLen+1];
794 for (i=0; i<=maxLen; i++) {
795 accumulatedLen[i] = 0;
796 numKeysOfSize[i] = 0;
797 }
798
799 // Fill the arrays...
800 for (i=0; i<gNumFileLines; i++) {
801 int len = gFileLines[i].len;
802 accumulatedLen[len] += strlen(gFileLines[i].icuSortKey);
803 numKeysOfSize[len] += 1;
804 }
805
806 // And write out averages
807 printf("String Length, Avg Key Length, Avg Key Len per char\n");
808 for (i=1; i<=maxLen; i++) {
809 if (numKeysOfSize[i] > 0) {
810 printf("%d, %f, %f\n", i, (float)accumulatedLen[i] / (float)numKeysOfSize[i],
811 (float)accumulatedLen[i] / (float)(numKeysOfSize[i] * i));
812 }
813 }
814 delete []accumulatedLen;
815 delete []numKeysOfSize ;
816 }
817
818 //---------------------------------------------------------------------------------------
819 //
820 // doForwardIterTest(UBool) Forward iteration test
821 // argument null-terminated string used
822 //
823 //---------------------------------------------------------------------------------------
doForwardIterTest(UBool haslen)824 void doForwardIterTest(UBool haslen) {
825 int count = 0;
826
827 UErrorCode error = U_ZERO_ERROR;
828 printf("\n\nPerforming forward iteration performance test with ");
829
830 if (haslen) {
831 printf("non-null terminated data -----------\n");
832 }
833 else {
834 printf("null terminated data -----------\n");
835 }
836 printf("performance test on strings from file -----------\n");
837
838 UChar dummytext[] = {0, 0};
839 UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error);
840 ucol_setText(iter, dummytext, 1, &error);
841
842 gCount = 0;
843 unsigned long startTime = timeGetTime();
844 while (count < opt_loopCount) {
845 int linecount = 0;
846 while (linecount < gNumFileLines) {
847 UChar *str = gFileLines[linecount].name;
848 int strlen = haslen?gFileLines[linecount].len:-1;
849 ucol_setText(iter, str, strlen, &error);
850 while (ucol_next(iter, &error) != UCOL_NULLORDER) {
851 gCount++;
852 }
853
854 linecount ++;
855 }
856 count ++;
857 }
858 unsigned long elapsedTime = timeGetTime() - startTime;
859 printf("elapsedTime %ld\n", elapsedTime);
860
861 // empty loop recalculation
862 count = 0;
863 startTime = timeGetTime();
864 while (count < opt_loopCount) {
865 int linecount = 0;
866 while (linecount < gNumFileLines) {
867 UChar *str = gFileLines[linecount].name;
868 int strlen = haslen?gFileLines[linecount].len:-1;
869 ucol_setText(iter, str, strlen, &error);
870 linecount ++;
871 }
872 count ++;
873 }
874 elapsedTime -= (timeGetTime() - startTime);
875 printf("elapsedTime %ld\n", elapsedTime);
876
877 ucol_closeElements(iter);
878
879 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
880 printf("Total number of strings compared %d in %d loops\n", gNumFileLines,
881 opt_loopCount);
882 printf("Average time per ucol_next() nano seconds %d\n", ns);
883
884 printf("performance test on skipped-5 concatenated strings from file -----------\n");
885
886 UChar *str;
887 int strlen = 0;
888 // appending all the strings
889 int linecount = 0;
890 while (linecount < gNumFileLines) {
891 strlen += haslen?gFileLines[linecount].len:
892 u_strlen(gFileLines[linecount].name);
893 linecount ++;
894 }
895 str = (UChar *)malloc(sizeof(UChar) * strlen);
896 int strindex = 0;
897 linecount = 0;
898 while (strindex < strlen) {
899 int len = 0;
900 len += haslen?gFileLines[linecount].len:
901 u_strlen(gFileLines[linecount].name);
902 memcpy(str + strindex, gFileLines[linecount].name,
903 sizeof(UChar) * len);
904 strindex += len;
905 linecount ++;
906 }
907
908 printf("Total size of strings %d\n", strlen);
909
910 gCount = 0;
911 count = 0;
912
913 if (!haslen) {
914 strlen = -1;
915 }
916 iter = ucol_openElements(gCol, str, strlen, &error);
917 if (!haslen) {
918 strlen = u_strlen(str);
919 }
920 strlen -= 5; // any left over characters are not iterated,
921 // this is to ensure the backwards and forwards iterators
922 // gets the same position
923 startTime = timeGetTime();
924 while (count < opt_loopCount) {
925 int count5 = 5;
926 strindex = 0;
927 ucol_setOffset(iter, strindex, &error);
928 while (true) {
929 if (ucol_next(iter, &error) == UCOL_NULLORDER) {
930 break;
931 }
932 gCount++;
933 count5 --;
934 if (count5 == 0) {
935 strindex += 10;
936 if (strindex > strlen) {
937 break;
938 }
939 ucol_setOffset(iter, strindex, &error);
940 count5 = 5;
941 }
942 }
943 count ++;
944 }
945
946 elapsedTime = timeGetTime() - startTime;
947 printf("elapsedTime %ld\n", elapsedTime);
948
949 // empty loop recalculation
950 int tempgCount = 0;
951 count = 0;
952 startTime = timeGetTime();
953 while (count < opt_loopCount) {
954 int count5 = 5;
955 strindex = 0;
956 ucol_setOffset(iter, strindex, &error);
957 while (true) {
958 tempgCount ++;
959 count5 --;
960 if (count5 == 0) {
961 strindex += 10;
962 if (strindex > strlen) {
963 break;
964 }
965 ucol_setOffset(iter, strindex, &error);
966 count5 = 5;
967 }
968 }
969 count ++;
970 }
971 elapsedTime -= (timeGetTime() - startTime);
972 printf("elapsedTime %ld\n", elapsedTime);
973
974 ucol_closeElements(iter);
975
976 printf("gCount %d\n", gCount);
977 ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
978 printf("Average time per ucol_next() nano seconds %d\n", ns);
979 }
980
981 //---------------------------------------------------------------------------------------
982 //
983 // doBackwardIterTest(UBool) Backwards iteration test
984 // argument null-terminated string used
985 //
986 //---------------------------------------------------------------------------------------
doBackwardIterTest(UBool haslen)987 void doBackwardIterTest(UBool haslen) {
988 int count = 0;
989 UErrorCode error = U_ZERO_ERROR;
990 printf("\n\nPerforming backward iteration performance test with ");
991
992 if (haslen) {
993 printf("non-null terminated data -----------\n");
994 }
995 else {
996 printf("null terminated data -----------\n");
997 }
998
999 printf("performance test on strings from file -----------\n");
1000
1001 UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error);
1002 UChar dummytext[] = {0, 0};
1003 ucol_setText(iter, dummytext, 1, &error);
1004
1005 gCount = 0;
1006 unsigned long startTime = timeGetTime();
1007 while (count < opt_loopCount) {
1008 int linecount = 0;
1009 while (linecount < gNumFileLines) {
1010 UChar *str = gFileLines[linecount].name;
1011 int strlen = haslen?gFileLines[linecount].len:-1;
1012 ucol_setText(iter, str, strlen, &error);
1013 while (ucol_previous(iter, &error) != UCOL_NULLORDER) {
1014 gCount ++;
1015 }
1016
1017 linecount ++;
1018 }
1019 count ++;
1020 }
1021 unsigned long elapsedTime = timeGetTime() - startTime;
1022
1023 printf("elapsedTime %ld\n", elapsedTime);
1024
1025 // empty loop recalculation
1026 count = 0;
1027 startTime = timeGetTime();
1028 while (count < opt_loopCount) {
1029 int linecount = 0;
1030 while (linecount < gNumFileLines) {
1031 UChar *str = gFileLines[linecount].name;
1032 int strlen = haslen?gFileLines[linecount].len:-1;
1033 ucol_setText(iter, str, strlen, &error);
1034 linecount ++;
1035 }
1036 count ++;
1037 }
1038 elapsedTime -= (timeGetTime() - startTime);
1039
1040 printf("elapsedTime %ld\n", elapsedTime);
1041 ucol_closeElements(iter);
1042
1043 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
1044 printf("Total number of strings compared %d in %d loops\n", gNumFileLines,
1045 opt_loopCount);
1046 printf("Average time per ucol_previous() nano seconds %d\n", ns);
1047
1048 printf("performance test on skipped-5 concatenated strings from file -----------\n");
1049
1050 UChar *str;
1051 int strlen = 0;
1052 // appending all the strings
1053 int linecount = 0;
1054 while (linecount < gNumFileLines) {
1055 strlen += haslen?gFileLines[linecount].len:
1056 u_strlen(gFileLines[linecount].name);
1057 linecount ++;
1058 }
1059 str = (UChar *)malloc(sizeof(UChar) * strlen);
1060 int strindex = 0;
1061 linecount = 0;
1062 while (strindex < strlen) {
1063 int len = 0;
1064 len += haslen?gFileLines[linecount].len:
1065 u_strlen(gFileLines[linecount].name);
1066 memcpy(str + strindex, gFileLines[linecount].name,
1067 sizeof(UChar) * len);
1068 strindex += len;
1069 linecount ++;
1070 }
1071
1072 printf("Total size of strings %d\n", strlen);
1073
1074 gCount = 0;
1075 count = 0;
1076
1077 if (!haslen) {
1078 strlen = -1;
1079 }
1080
1081 iter = ucol_openElements(gCol, str, strlen, &error);
1082 if (!haslen) {
1083 strlen = u_strlen(str);
1084 }
1085
1086 startTime = timeGetTime();
1087 while (count < opt_loopCount) {
1088 int count5 = 5;
1089 strindex = 5;
1090 ucol_setOffset(iter, strindex, &error);
1091 while (true) {
1092 if (ucol_previous(iter, &error) == UCOL_NULLORDER) {
1093 break;
1094 }
1095 gCount ++;
1096 count5 --;
1097 if (count5 == 0) {
1098 strindex += 10;
1099 if (strindex > strlen) {
1100 break;
1101 }
1102 ucol_setOffset(iter, strindex, &error);
1103 count5 = 5;
1104 }
1105 }
1106 count ++;
1107 }
1108
1109 elapsedTime = timeGetTime() - startTime;
1110 printf("elapsedTime %ld\n", elapsedTime);
1111
1112 // empty loop recalculation
1113 count = 0;
1114 int tempgCount = 0;
1115 startTime = timeGetTime();
1116 while (count < opt_loopCount) {
1117 int count5 = 5;
1118 strindex = 5;
1119 ucol_setOffset(iter, strindex, &error);
1120 while (true) {
1121 tempgCount ++;
1122 count5 --;
1123 if (count5 == 0) {
1124 strindex += 10;
1125 if (strindex > strlen) {
1126 break;
1127 }
1128 ucol_setOffset(iter, strindex, &error);
1129 count5 = 5;
1130 }
1131 }
1132 count ++;
1133 }
1134 elapsedTime -= (timeGetTime() - startTime);
1135 printf("elapsedTime %ld\n", elapsedTime);
1136 ucol_closeElements(iter);
1137
1138 printf("gCount %d\n", gCount);
1139 ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
1140 printf("Average time per ucol_previous() nano seconds %d\n", ns);
1141 }
1142
1143 //---------------------------------------------------------------------------------------
1144 //
1145 // doIterTest() Iteration test
1146 //
1147 //---------------------------------------------------------------------------------------
void doIterTest() {
    // Runs the forward iteration test and then the backward iteration test,
    // handing each the value of the -uselen option.
    doForwardIterTest(opt_uselen);
    doBackwardIterTest(opt_uselen);
}
1152
1153
1154 //----------------------------------------------------------------------------------------
1155 //
1156 // UnixConvert -- Convert the lines of the file to the encoding for UNIX
1157 // Since it appears that Unicode support is going in the general
1158 // direction of the use of UTF-8 locales, that is the approach
1159 // that is used here.
1160 //
1161 //----------------------------------------------------------------------------------------
UnixConvert()1162 void UnixConvert() {
1163 int line;
1164
1165 UConverter *cvrtr; // An ICU code page converter.
1166 UErrorCode status = U_ZERO_ERROR;
1167
1168
1169 cvrtr = ucnv_open("utf-8", &status); // we are just doing UTF-8 locales for now.
1170 if (U_FAILURE(status)) {
1171 fprintf(stderr, "ICU Converter open failed.: %s\n", u_errorName(status));
1172 exit(-1);
1173 }
1174
1175 for (line=0; line < gNumFileLines; line++) {
1176 int sizeNeeded = ucnv_fromUChars(cvrtr,
1177 0, // ptr to target buffer.
1178 0, // length of target buffer.
1179 gFileLines[line].name,
1180 -1, // source is null terminated
1181 &status);
1182 if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {
1183 //fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
1184 //exit(-1);
1185 }
1186 status = U_ZERO_ERROR;
1187 gFileLines[line].unixName = new char[sizeNeeded+1];
1188 sizeNeeded = ucnv_fromUChars(cvrtr,
1189 gFileLines[line].unixName, // ptr to target buffer.
1190 sizeNeeded+1, // length of target buffer.
1191 gFileLines[line].name,
1192 -1, // source is null terminated
1193 &status);
1194 if (U_FAILURE(status)) {
1195 fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
1196 exit(-1);
1197 }
1198 gFileLines[line].unixName[sizeNeeded] = 0;
1199 };
1200 ucnv_close(cvrtr);
1201 }
1202
1203
1204 //----------------------------------------------------------------------------------------
1205 //
1206 // class UCharFile Class to hide all the gorp to read a file in
1207 // and produce a stream of UChars.
1208 //
1209 //----------------------------------------------------------------------------------------
1210 class UCharFile {
1211 public:
1212 UCharFile(const char *fileName);
1213 ~UCharFile();
1214 UChar get();
eof()1215 UBool eof() {return fEof;};
error()1216 UBool error() {return fError;};
1217
1218 private:
UCharFile(const UCharFile &)1219 UCharFile (const UCharFile & /*other*/) {}; // No copy constructor.
operator =(const UCharFile &)1220 UCharFile & operator = (const UCharFile &/*other*/) {return *this;}; // No assignment op
1221
1222 FILE *fFile;
1223 const char *fName;
1224 UBool fEof;
1225 UBool fError;
1226 UChar fPending2ndSurrogate;
1227
1228 enum {UTF16LE, UTF16BE, UTF8} fEncoding;
1229 };
1230
UCharFile(const char * fileName)1231 UCharFile::UCharFile(const char * fileName) {
1232 fEof = false;
1233 fError = false;
1234 fName = fileName;
1235 fFile = fopen(fName, "rb");
1236 fPending2ndSurrogate = 0;
1237 if (fFile == NULL) {
1238 fprintf(stderr, "Can not open file \"%s\"\n", opt_fName);
1239 fError = true;
1240 return;
1241 }
1242 //
1243 // Look for the byte order mark at the start of the file.
1244 //
1245 int BOMC1, BOMC2, BOMC3;
1246 BOMC1 = fgetc(fFile);
1247 BOMC2 = fgetc(fFile);
1248
1249 if (BOMC1 == 0xff && BOMC2 == 0xfe) {
1250 fEncoding = UTF16LE; }
1251 else if (BOMC1 == 0xfe && BOMC2 == 0xff) {
1252 fEncoding = UTF16BE; }
1253 else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) {
1254 fEncoding = UTF8; }
1255 else
1256 {
1257 fprintf(stderr, "collperf: file \"%s\" encoding must be UTF-8 or UTF-16, and "
1258 "must include a BOM.\n", fileName);
1259 fError = true;
1260 return;
1261 }
1262 }
1263
1264
~UCharFile()1265 UCharFile::~UCharFile() {
1266 fclose(fFile);
1267 }
1268
1269
1270
get()1271 UChar UCharFile::get() {
1272 UChar c;
1273 switch (fEncoding) {
1274 case UTF16LE:
1275 {
1276 int cL, cH;
1277 cL = fgetc(fFile);
1278 cH = fgetc(fFile);
1279 c = cL | (cH << 8);
1280 if (cH == EOF) {
1281 c = 0;
1282 fEof = true;
1283 }
1284 break;
1285 }
1286 case UTF16BE:
1287 {
1288 int cL, cH;
1289 cH = fgetc(fFile);
1290 cL = fgetc(fFile);
1291 c = cL | (cH << 8);
1292 if (cL == EOF) {
1293 c = 0;
1294 fEof = true;
1295 }
1296 break;
1297 }
1298 case UTF8:
1299 {
1300 if (fPending2ndSurrogate != 0) {
1301 c = fPending2ndSurrogate;
1302 fPending2ndSurrogate = 0;
1303 break;
1304 }
1305
1306 int ch = fgetc(fFile); // Note: c and ch are separate cause eof test doesn't work on UChar type.
1307 if (ch == EOF) {
1308 c = 0;
1309 fEof = true;
1310 break;
1311 }
1312
1313 if (ch <= 0x7f) {
1314 // It's ascii. No further utf-8 conversion.
1315 c = ch;
1316 break;
1317 }
1318
1319 // Figure out the lenght of the char and read the rest of the bytes
1320 // into a temp array.
1321 int nBytes;
1322 if (ch >= 0xF0) {nBytes=4;}
1323 else if (ch >= 0xE0) {nBytes=3;}
1324 else if (ch >= 0xC0) {nBytes=2;}
1325 else {
1326 fprintf(stderr, "utf-8 encoded file contains corrupt data.\n");
1327 fError = true;
1328 return 0;
1329 }
1330
1331 unsigned char bytes[10];
1332 bytes[0] = (unsigned char)ch;
1333 int i;
1334 for (i=1; i<nBytes; i++) {
1335 bytes[i] = fgetc(fFile);
1336 if (bytes[i] < 0x80 || bytes[i] >= 0xc0) {
1337 fprintf(stderr, "utf-8 encoded file contains corrupt data.\n");
1338 fError = true;
1339 return 0;
1340 }
1341 }
1342
1343 // Convert the bytes from the temp array to a Unicode char.
1344 i = 0;
1345 uint32_t cp;
1346 U8_NEXT_UNSAFE(bytes, i, cp);
1347 c = (UChar)cp;
1348
1349 if (cp >= 0x10000) {
1350 // The code point needs to be broken up into a utf-16 surrogate pair.
1351 // Process first half this time through the main loop, and
1352 // remember the other half for the next time through.
1353 UChar utf16Buf[3];
1354 i = 0;
1355 UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp);
1356 fPending2ndSurrogate = utf16Buf[1];
1357 c = utf16Buf[0];
1358 }
1359 break;
1360 };
1361 default:
1362 c = 0xFFFD; /* Error, unspecified codepage*/
1363 fprintf(stderr, "UCharFile: Error: unknown fEncoding\n");
1364 exit(1);
1365 }
1366 return c;
1367 }
1368
1369 //----------------------------------------------------------------------------------------
1370 //
1371 // openRulesCollator - Command line specified a rules file. Read it in
1372 // and open a collator with it.
1373 //
1374 //----------------------------------------------------------------------------------------
openRulesCollator()1375 UCollator *openRulesCollator() {
1376 UCharFile f(opt_rules);
1377 if (f.error()) {
1378 return 0;
1379 }
1380
1381 int bufLen = 10000;
1382 UChar *buf = (UChar *)malloc(bufLen * sizeof(UChar));
1383 UChar *tmp;
1384 int i = 0;
1385
1386 for(;;) {
1387 buf[i] = f.get();
1388 if (f.eof()) {
1389 break;
1390 }
1391 if (f.error()) {
1392 return 0;
1393 }
1394 i++;
1395 if (i >= bufLen) {
1396 tmp = buf;
1397 bufLen += 10000;
1398 buf = (UChar *)realloc(buf, bufLen);
1399 if (buf == NULL) {
1400 free(tmp);
1401 return 0;
1402 }
1403 }
1404 }
1405 buf[i] = 0;
1406
1407 UErrorCode status = U_ZERO_ERROR;
1408 UCollator *coll = ucol_openRules(buf, u_strlen(buf), UCOL_OFF,
1409 UCOL_DEFAULT_STRENGTH, NULL, &status);
1410 if (U_FAILURE(status)) {
1411 fprintf(stderr, "ICU ucol_openRules() open failed.: %d\n", status);
1412 return 0;
1413 }
1414 free(buf);
1415 return coll;
1416 }
1417
1418
1419
1420
1421
1422 //----------------------------------------------------------------------------------------
1423 //
1424 // Main -- process command line, read in and pre-process the test file,
1425 // call other functions to do the actual tests.
1426 //
1427 //----------------------------------------------------------------------------------------
main(int argc,const char ** argv)1428 int main(int argc, const char** argv) {
1429 if (ProcessOptions(argc, argv, opts) != true || opt_help || opt_fName == 0) {
1430 printf(gUsageString);
1431 exit (1);
1432 }
1433
1434 // Make sure that we've only got one API selected.
1435 if (opt_unix || opt_win) opt_icu = false;
1436 if (opt_unix) opt_win = false;
1437
1438 //
1439 // Set up an ICU collator
1440 //
1441 UErrorCode status = U_ZERO_ERROR;
1442
1443 if (opt_rules != 0) {
1444 gCol = openRulesCollator();
1445 if (gCol == 0) {return -1;}
1446 }
1447 else {
1448 gCol = ucol_open(opt_locale, &status);
1449 if (U_FAILURE(status)) {
1450 fprintf(stderr, "Collator creation failed.: %d\n", status);
1451 return -1;
1452 }
1453 }
1454 if (status==U_USING_DEFAULT_WARNING && opt_terse==false) {
1455 fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale);
1456 }
1457 if (status==U_USING_FALLBACK_WARNING && opt_terse==false) {
1458 fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale);
1459 }
1460
1461 if (opt_norm) {
1462 ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
1463 }
1464 if (opt_french && opt_frenchoff) {
1465 fprintf(stderr, "collperf: Error, specified both -french and -frenchoff options.");
1466 exit(-1);
1467 }
1468 if (opt_french) {
1469 ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_ON, &status);
1470 }
1471 if (opt_frenchoff) {
1472 ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_OFF, &status);
1473 }
1474 if (opt_lower) {
1475 ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_LOWER_FIRST, &status);
1476 }
1477 if (opt_upper) {
1478 ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_UPPER_FIRST, &status);
1479 }
1480 if (opt_case) {
1481 ucol_setAttribute(gCol, UCOL_CASE_LEVEL, UCOL_ON, &status);
1482 }
1483 if (opt_shifted) {
1484 ucol_setAttribute(gCol, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, &status);
1485 }
1486 if (opt_level != 0) {
1487 switch (opt_level) {
1488 case 1:
1489 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_PRIMARY, &status);
1490 break;
1491 case 2:
1492 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_SECONDARY, &status);
1493 break;
1494 case 3:
1495 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_TERTIARY, &status);
1496 break;
1497 case 4:
1498 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_QUATERNARY, &status);
1499 break;
1500 case 5:
1501 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_IDENTICAL, &status);
1502 break;
1503 default:
1504 fprintf(stderr, "-level param must be between 1 and 5\n");
1505 exit(-1);
1506 }
1507 }
1508
1509 if (U_FAILURE(status)) {
1510 fprintf(stderr, "Collator attribute setting failed.: %d\n", status);
1511 return -1;
1512 }
1513
1514
1515 //
1516 // Set up a Windows LCID
1517 //
1518 if (opt_langid != 0) {
1519 gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);
1520 }
1521 else {
1522 gWinLCID = uloc_getLCID(opt_locale);
1523 }
1524
1525
1526 //
1527 // Set the UNIX locale
1528 //
1529 if (opt_unix) {
1530 if (setlocale(LC_ALL, opt_locale) == 0) {
1531 fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale);
1532 exit(-1);
1533 }
1534 }
1535
1536 // Read in the input file.
1537 // File assumed to be utf-16.
1538 // Lines go onto heap buffers. Global index array to line starts is created.
1539 // Lines themselves are null terminated.
1540 //
1541
1542 UCharFile f(opt_fName);
1543 if (f.error()) {
1544 exit(-1);
1545 }
1546
1547 const int MAXLINES = 100000;
1548 gFileLines = new Line[MAXLINES];
1549 UChar buf[1024];
1550 int column = 0;
1551
1552 // Read the file, split into lines, and save in memory.
1553 // Loop runs once per utf-16 value from the input file,
1554 // (The number of bytes read from file per loop iteration depends on external encoding.)
1555 for (;;) {
1556
1557 UChar c = f.get();
1558 if (f.error()){
1559 exit(-1);
1560 }
1561
1562
1563 // We now have a good UTF-16 value in c.
1564
1565 // Watch for CR, LF, EOF; these finish off a line.
1566 if (c == 0xd) {
1567 continue;
1568 }
1569
1570 if (f.eof() || c == 0x0a || c==0x2028) { // Unipad inserts 2028 line separators!
1571 buf[column++] = 0;
1572 if (column > 1) {
1573 gFileLines[gNumFileLines].name = new UChar[column];
1574 gFileLines[gNumFileLines].len = column-1;
1575 memcpy(gFileLines[gNumFileLines].name, buf, column * sizeof(UChar));
1576 gNumFileLines++;
1577 column = 0;
1578 if (gNumFileLines >= MAXLINES) {
1579 fprintf(stderr, "File too big. Max number of lines is %d\n", MAXLINES);
1580 exit(-1);
1581 }
1582
1583 }
1584 if (c == 0xa || c == 0x2028)
1585 continue;
1586 else
1587 break; // EOF
1588 }
1589 buf[column++] = c;
1590 if (column >= 1023)
1591 {
1592 static UBool warnFlag = true;
1593 if (warnFlag) {
1594 fprintf(stderr, "Warning - file line longer than 1023 chars truncated.\n");
1595 warnFlag = false;
1596 }
1597 column--;
1598 }
1599 }
1600
1601 if (opt_terse == false) {
1602 printf("file \"%s\", %d lines.\n", opt_fName, gNumFileLines);
1603 }
1604
1605
1606 // Convert the lines to the UNIX encoding.
1607 if (opt_unix) {
1608 UnixConvert();
1609 }
1610
1611 //
1612 // Pre-compute ICU sort keys for the lines of the file.
1613 //
1614 int line;
1615 int32_t t;
1616
1617 for (line=0; line<gNumFileLines; line++) {
1618 t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)buf, sizeof(buf));
1619 gFileLines[line].icuSortKey = new char[t];
1620
1621 if (t > (int32_t)sizeof(buf)) {
1622 t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)gFileLines[line].icuSortKey , t);
1623 }
1624 else
1625 {
1626 memcpy(gFileLines[line].icuSortKey, buf, t);
1627 }
1628 }
1629
1630
1631
1632 //
1633 // Pre-compute Windows sort keys for the lines of the file.
1634 //
1635 for (line=0; line<gNumFileLines; line++) {
1636 t=LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, buf, sizeof(buf));
1637 gFileLines[line].winSortKey = new char[t];
1638 if (t > (int32_t)sizeof(buf)) {
1639 t = LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, (UChar *)(gFileLines[line].winSortKey), t);
1640 }
1641 else
1642 {
1643 memcpy(gFileLines[line].winSortKey, buf, t);
1644 }
1645 }
1646
1647 //
1648 // Pre-compute UNIX sort keys for the lines of the file.
1649 //
1650 if (opt_unix) {
1651 for (line=0; line<gNumFileLines; line++) {
1652 t=strxfrm((char *)buf, gFileLines[line].unixName, sizeof(buf));
1653 gFileLines[line].unixSortKey = new char[t];
1654 if (t > (int32_t)sizeof(buf)) {
1655 t = strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixName, sizeof(buf));
1656 }
1657 else
1658 {
1659 memcpy(gFileLines[line].unixSortKey, buf, t);
1660 }
1661 }
1662 }
1663
1664
1665 //
1666 // Dump file lines, CEs, Sort Keys if requested.
1667 //
1668 if (opt_dump) {
1669 int i;
1670 for (line=0; line<gNumFileLines; line++) {
1671 for (i=0;;i++) {
1672 UChar c = gFileLines[line].name[i];
1673 if (c == 0)
1674 break;
1675 if (c < 0x20 || c > 0x7e) {
1676 printf("\\u%.4x", c);
1677 }
1678 else {
1679 printf("%c", c);
1680 }
1681 }
1682 printf("\n");
1683
1684 printf(" CEs: ");
1685 UCollationElements *CEiter = ucol_openElements(gCol, gFileLines[line].name, -1, &status);
1686 int32_t ce;
1687 i = 0;
1688 for (;;) {
1689 ce = ucol_next(CEiter, &status);
1690 if (ce == UCOL_NULLORDER) {
1691 break;
1692 }
1693 printf(" %.8x", ce);
1694 if (++i > 8) {
1695 printf("\n ");
1696 i = 0;
1697 }
1698 }
1699 printf("\n");
1700 ucol_closeElements(CEiter);
1701
1702
1703 printf(" ICU Sort Key: ");
1704 for (i=0; ; i++) {
1705 unsigned char c = gFileLines[line].icuSortKey[i];
1706 printf("%02x ", c);
1707 if (c == 0) {
1708 break;
1709 }
1710 if (i > 0 && i % 20 == 0) {
1711 printf("\n ");
1712 }
1713 }
1714 printf("\n");
1715 }
1716 }
1717
1718
1719 //
1720 // Pre-sort the lines.
1721 //
1722 int i;
1723 gSortedLines = new Line *[gNumFileLines];
1724 for (i=0; i<gNumFileLines; i++) {
1725 gSortedLines[i] = &gFileLines[i];
1726 }
1727
1728 if (opt_win) {
1729 qsort(gSortedLines, gNumFileLines, sizeof(Line *), Winstrcmp);
1730 }
1731 else if (opt_unix) {
1732 qsort(gSortedLines, gNumFileLines, sizeof(Line *), UNIXstrcmp);
1733 }
1734 else /* ICU */
1735 {
1736 qsort(gSortedLines, gNumFileLines, sizeof(Line *), ICUstrcmp);
1737 }
1738
1739
1740 //
1741 // Make up a randomized order, will be used for sorting tests.
1742 //
1743 gRandomLines = new Line *[gNumFileLines];
1744 for (i=0; i<gNumFileLines; i++) {
1745 gRandomLines[i] = &gFileLines[i];
1746 }
1747 qsort(gRandomLines, gNumFileLines, sizeof(Line *), ICURandomCmp);
1748
1749
1750
1751
1752 //
1753 // We've got the file read into memory. Go do something with it.
1754 //
1755
1756 if (opt_qsort) doQSort();
1757 if (opt_binsearch) doBinarySearch();
1758 if (opt_keygen) doKeyGen();
1759 if (opt_keyhist) doKeyHist();
1760 if (opt_itertest) doIterTest();
1761
1762 return 0;
1763
1764 }
1765