• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /********************************************************************
2  * COPYRIGHT:
3  * Copyright (C) 2001-2009 IBM, Inc.   All Rights Reserved.
4  *
5  ********************************************************************/
6 /********************************************************************************
7 *
8 * File CALLCOLL.C
9 *
10 * Modification History:
11 *        Name                     Description
12 *     Andy Heninger             First Version
13 *
14 *********************************************************************************
15 */
16 
17 //
18 //  This program tests string collation and sort key generation performance.
19 //      Three APIs can be tested: ICU C, Unix strcoll/strxfrm, and Windows LCMapString
20 //      A file of names is required as input, one per line.  It must be in utf-8 or utf-16 format,
21 //      and include a byte order mark.  Either LE or BE format is OK.
22 //
23 
//
//  Help text listing the command line options.  The option names shown here
//      are the ones registered in the opts[] table.
//
const char gUsageString[] =
 "usage:  collperf options...\n"
    "-help                      Display this message.\n"
    "-file file_name            utf-16 format file of names.\n"
    "-locale name               ICU locale to use.  Default is en_US\n"
    "-rules file_name           Collation rules file (overrides locale)\n"
    "-langid 0x1234             Windows Language ID number.  Default to value for -locale option\n"
    "                              see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
    "-win                       Run test using Windows native services.  (ICU is default)\n"
    "-unix                      Run test using Unix strxfrm, strcoll services.\n"
    "-uselen                    Use API with string lengths.  Default is null-terminated strings\n"
    "-usekeys                   Run tests using sortkeys rather than strcoll\n"
    "-strcmp                    Run tests using u_strcmp rather than strcoll\n"
    "-strcmpCPO                 Run tests using u_strcmpCodePointOrder rather than strcoll\n"
    "-loop nnnn                 Loopcount for test.  Adjust for reasonable total running time.\n"
    "-iloop n                   Inner Loop Count.  Default = 1.  Number of calls to function\n"
    "                               under test at each call point.  For measuring test overhead.\n"
    "-terse                     Terse numbers-only output.  Intended for use by scripts.\n"
    "-french                    French accent ordering\n"
    "-frenchoff                 No French accent ordering (for use with French locales.)\n"
    "-norm                      Normalizing mode on\n"
    "-shifted                   Shifted mode\n"
    "-lower                     Lower case first\n"
    "-upper                     Upper case first\n"
    "-case                      Enable separate case level\n"
    "-level n                   Sort level, 1 to 5, for Primary, Secondary, Tertiary, Quaternary, Identical\n"
    "-keyhist                   Produce a table sort key size vs. string length\n"
    "-binsearch                 Binary Search timing test\n"
    "-keygen                    Sort Key Generation timing test\n"
    "-qsort                     Quicksort timing test\n"
    "-iter                      Iteration Performance Test\n"
    "-dump                      Display strings, sort keys and CEs.\n"
    ;
57 
58 
59 
60 #include <stdio.h>
61 #include <string.h>
62 #include <stdlib.h>
63 #include <math.h>
64 #include <locale.h>
65 #include <errno.h>
66 
67 #include <unicode/utypes.h>
68 #include <unicode/ucol.h>
69 #include <unicode/ucoleitr.h>
70 #include <unicode/uloc.h>
71 #include <unicode/ustring.h>
72 #include <unicode/ures.h>
73 #include <unicode/uchar.h>
74 #include <unicode/ucnv.h>
75 #include <unicode/utf8.h>
76 
77 #ifdef WIN32
78 #include <windows.h>
79 #else
80 //
81 //  Stubs for Windows API functions when building on UNIXes.
82 //
83 typedef int DWORD;
CompareStringW(DWORD,DWORD,UChar *,int,UChar *,int)84 inline int CompareStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}
85 #include <sys/time.h>
// timeGetTime()   Stand-in for the Windows function of the same name.
//                 Returns the time of day reported by gettimeofday(),
//                 expressed in milliseconds.
//
//     The value is allowed to wrap around; the callers in this file only
//     ever subtract one reading from another.
unsigned long timeGetTime() {
    struct timeval t;
    gettimeofday(&t, 0);
    // Convert to unsigned before scaling, so that running past the range of
    // the type wraps around instead of being a signed overflow.
    unsigned long val = (unsigned long)t.tv_sec * 1000UL;
    val += (unsigned long)t.tv_usec / 1000UL;
    return val;
}
LCMapStringW(DWORD,DWORD,UChar *,int,UChar *,int)93 inline int LCMapStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}
94 const int LCMAP_SORTKEY = 0;
95 #define MAKELCID(a,b) 0
96 const int SORT_DEFAULT = 0;
97 #endif
98 
99 
100 
101 //
102 //  Command line option variables
103 //     These global variables are set according to the options specified
104 //     on the command line by the user.
105 char * opt_fName      = 0;
106 const char * opt_locale     = "en_US";
107 int    opt_langid     = 0;         // Defaults to value corresponding to opt_locale.
108 char * opt_rules      = 0;
109 UBool  opt_help       = FALSE;
110 int    opt_loopCount  = 1;
111 int    opt_iLoopCount = 1;
112 UBool  opt_terse      = FALSE;
113 UBool  opt_qsort      = FALSE;
114 UBool  opt_binsearch  = FALSE;
115 UBool  opt_icu        = TRUE;
116 UBool  opt_win        = FALSE;      // Run with Windows native functions.
117 UBool  opt_unix       = FALSE;      // Run with UNIX strcoll, strxfrm functions.
118 UBool  opt_uselen     = FALSE;
119 UBool  opt_usekeys    = FALSE;
120 UBool  opt_strcmp     = FALSE;
121 UBool  opt_strcmpCPO  = FALSE;
122 UBool  opt_norm       = FALSE;
123 UBool  opt_keygen     = FALSE;
124 UBool  opt_french     = FALSE;
125 UBool  opt_frenchoff  = FALSE;
126 UBool  opt_shifted    = FALSE;
127 UBool  opt_lower      = FALSE;
128 UBool  opt_upper      = FALSE;
129 UBool  opt_case       = FALSE;
130 int    opt_level      = 0;
131 UBool  opt_keyhist    = FALSE;
132 UBool  opt_itertest   = FALSE;
133 UBool  opt_dump       = FALSE;
134 
135 
136 
137 //
138 //   Definitions for the command line options
139 //
// One row of the command line option table.
struct OptSpec {
    const char                 *name;    // Option text as typed on the command line.
    enum {FLAG, NUM, STRING}    type;    // FLAG takes no value; NUM and STRING use the next argument.
    void                       *pVar;    // Address of the variable that receives the value.
};
145 
146 OptSpec opts[] = {
147     {"-file",        OptSpec::STRING, &opt_fName},
148     {"-locale",      OptSpec::STRING, &opt_locale},
149     {"-langid",      OptSpec::NUM,    &opt_langid},
150     {"-rules",       OptSpec::STRING, &opt_rules},
151     {"-qsort",       OptSpec::FLAG,   &opt_qsort},
152     {"-binsearch",   OptSpec::FLAG,   &opt_binsearch},
153     {"-iter",        OptSpec::FLAG,   &opt_itertest},
154     {"-win",         OptSpec::FLAG,   &opt_win},
155     {"-unix",        OptSpec::FLAG,   &opt_unix},
156     {"-uselen",      OptSpec::FLAG,   &opt_uselen},
157     {"-usekeys",     OptSpec::FLAG,   &opt_usekeys},
158     {"-strcmp",      OptSpec::FLAG,   &opt_strcmp},
159     {"-strcmpCPO",   OptSpec::FLAG,   &opt_strcmpCPO},
160     {"-norm",        OptSpec::FLAG,   &opt_norm},
161     {"-french",      OptSpec::FLAG,   &opt_french},
162     {"-frenchoff",   OptSpec::FLAG,   &opt_frenchoff},
163     {"-shifted",     OptSpec::FLAG,   &opt_shifted},
164     {"-lower",       OptSpec::FLAG,   &opt_lower},
165     {"-upper",       OptSpec::FLAG,   &opt_upper},
166     {"-case",        OptSpec::FLAG,   &opt_case},
167     {"-level",       OptSpec::NUM,    &opt_level},
168     {"-keyhist",     OptSpec::FLAG,   &opt_keyhist},
169     {"-keygen",      OptSpec::FLAG,   &opt_keygen},
170     {"-loop",        OptSpec::NUM,    &opt_loopCount},
171     {"-iloop",       OptSpec::NUM,    &opt_iLoopCount},
172     {"-terse",       OptSpec::FLAG,   &opt_terse},
173     {"-dump",        OptSpec::FLAG,   &opt_dump},
174     {"-help",        OptSpec::FLAG,   &opt_help},
175     {"-?",           OptSpec::FLAG,   &opt_help},
176     {0, OptSpec::FLAG, 0}
177 };
178 
179 
180 //---------------------------------------------------------------------------
181 //
182 //  Global variables pointing to and describing the test file
183 //
184 //---------------------------------------------------------------------------
185 
186 //
187 //   struct Line
188 //
189 //      Each line from the source file (containing a name, presumably) gets
190 //      one of these structs.
191 //
192 struct  Line {
193     UChar     *name;
194     int        len;
195     char      *winSortKey;
196     char      *icuSortKey;
197     char      *unixSortKey;
198     char      *unixName;
199 };
200 
201 
202 
203 Line          *gFileLines;           // Ptr to array of Line structs, one per line in the file.
204 int            gNumFileLines;
205 UCollator     *gCol;
206 DWORD          gWinLCID;
207 
208 Line          **gSortedLines;
209 Line          **gRandomLines;
210 int            gCount;
211 
212 
213 
214 //---------------------------------------------------------------------------
215 //
216 //  ProcessOptions()    Function to read the command line options.
217 //
218 //---------------------------------------------------------------------------
ProcessOptions(int argc,const char ** argv,OptSpec opts[])219 UBool ProcessOptions(int argc, const char **argv, OptSpec opts[])
220 {
221     int         i;
222     int         argNum;
223     const char  *pArgName;
224     OptSpec    *pOpt;
225 
226     for (argNum=1; argNum<argc; argNum++) {
227         pArgName = argv[argNum];
228         for (pOpt = opts;  pOpt->name != 0; pOpt++) {
229             if (strcmp(pOpt->name, pArgName) == 0) {
230                 switch (pOpt->type) {
231                 case OptSpec::FLAG:
232                     *(UBool *)(pOpt->pVar) = TRUE;
233                     break;
234                 case OptSpec::STRING:
235                     argNum ++;
236                     if (argNum >= argc) {
237                         fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
238                         return FALSE;
239                     }
240                     *(const char **)(pOpt->pVar)  = argv[argNum];
241                     break;
242                 case OptSpec::NUM:
243                     argNum ++;
244                     if (argNum >= argc) {
245                         fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
246                         return FALSE;
247                     }
248                     char *endp;
249                     i = strtol(argv[argNum], &endp, 0);
250                     if (endp == argv[argNum]) {
251                         fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name);
252                         return FALSE;
253                     }
254                     *(int *)(pOpt->pVar) = i;
255                 }
256                 break;
257             }
258         }
259         if (pOpt->name == 0)
260         {
261             fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName);
262             return FALSE;
263         }
264     }
265 return TRUE;
266 }
267 
268 //---------------------------------------------------------------------------------------
269 //
270 //   Comparison functions for use by qsort.
271 //
272 //       Seven flavors:  ICU or Windows, each comparing SortKeys, Strings with length,
273 //           or null terminated Strings; plus Unix strcoll.
274 //
275 //---------------------------------------------------------------------------------------
ICUstrcmpK(const void * a,const void * b)276 int ICUstrcmpK(const void *a, const void *b) {
277     gCount++;
278     int t = strcmp((*(Line **)a)->icuSortKey, (*(Line **)b)->icuSortKey);
279     return t;
280 }
281 
282 
ICUstrcmpL(const void * a,const void * b)283 int ICUstrcmpL(const void *a, const void *b) {
284     gCount++;
285     UCollationResult t;
286     t = ucol_strcoll(gCol, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len);
287     if (t == UCOL_LESS) return -1;
288     if (t == UCOL_GREATER) return +1;
289     return 0;
290 }
291 
292 
ICUstrcmp(const void * a,const void * b)293 int ICUstrcmp(const void *a, const void *b) {
294     gCount++;
295     UCollationResult t;
296     t = ucol_strcoll(gCol, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1);
297     if (t == UCOL_LESS) return -1;
298     if (t == UCOL_GREATER) return +1;
299     return 0;
300 }
301 
302 
Winstrcmp(const void * a,const void * b)303 int Winstrcmp(const void *a, const void *b) {
304     gCount++;
305     int t;
306     t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1);
307     return t-2;
308 }
309 
310 
UNIXstrcmp(const void * a,const void * b)311 int UNIXstrcmp(const void *a, const void *b) {
312     gCount++;
313     int t;
314     t = strcoll((*(Line **)a)->unixName, (*(Line **)b)->unixName);
315     return t;
316 }
317 
318 
WinstrcmpL(const void * a,const void * b)319 int WinstrcmpL(const void *a, const void *b) {
320     gCount++;
321     int t;
322     t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len);
323     return t-2;
324 }
325 
326 
WinstrcmpK(const void * a,const void * b)327 int WinstrcmpK(const void *a, const void *b) {
328     gCount++;
329     int t = strcmp((*(Line **)a)->winSortKey, (*(Line **)b)->winSortKey);
330     return t;
331 }
332 
333 
334 //---------------------------------------------------------------------------------------
335 //
336 //   Function for sorting the names (lines) into a random order.
337 //      Order is based on a hash of the  ICU Sort key for the lines
338 //      The randomized order is used as input for the sorting timing tests.
339 //
340 //---------------------------------------------------------------------------------------
ICURandomCmp(const void * a,const void * b)341 int ICURandomCmp(const void *a, const void *b) {
342     char  *ask = (*(Line **)a)->icuSortKey;
343     char  *bsk = (*(Line **)b)->icuSortKey;
344     int   aVal = 0;
345     int   bVal = 0;
346     int   retVal;
347     while (*ask != 0) {
348         aVal += aVal*37 + *ask++;
349     }
350     while (*bsk != 0) {
351         bVal += bVal*37 + *bsk++;
352     }
353     retVal = -1;
354     if (aVal == bVal) {
355         retVal = 0;
356     }
357     else if (aVal > bVal) {
358         retVal = 1;
359     }
360     return retVal;
361 }
362 
363 //---------------------------------------------------------------------------------------
364 //
365 //   doKeyGen()     Key Generation Timing Test
366 //
367 //---------------------------------------------------------------------------------------
doKeyGen()368 void doKeyGen()
369 {
370     int  line;
371     int  loops = 0;
372     int  iLoop;
373     int  t;
374     int  len=-1;
375 
376     // Adjust loop count to compensate for file size.   Should be order n
377     double dLoopCount = double(opt_loopCount) * (1000. /  double(gNumFileLines));
378     int adj_loopCount = int(dLoopCount);
379     if (adj_loopCount < 1) adj_loopCount = 1;
380 
381 
382     unsigned long startTime = timeGetTime();
383 
384     if (opt_win) {
385         for (loops=0; loops<adj_loopCount; loops++) {
386             for (line=0; line < gNumFileLines; line++) {
387                 if (opt_uselen) {
388                     len = gFileLines[line].len;
389                 }
390                 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
391                     t=LCMapStringW(gWinLCID, LCMAP_SORTKEY,
392                         gFileLines[line].name, len,
393                         (unsigned short *)gFileLines[line].winSortKey, 5000);    // TODO  something with length.
394                 }
395             }
396         }
397     }
398     else if (opt_icu)
399     {
400         for (loops=0; loops<adj_loopCount; loops++) {
401             for (line=0; line < gNumFileLines; line++) {
402                 if (opt_uselen) {
403                     len = gFileLines[line].len;
404                 }
405                 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
406                     t = ucol_getSortKey(gCol, gFileLines[line].name, len, (unsigned char *)gFileLines[line].icuSortKey, 5000);
407                 }
408             }
409         }
410     }
411     else if (opt_unix)
412     {
413         for (loops=0; loops<adj_loopCount; loops++) {
414             for (line=0; line < gNumFileLines; line++) {
415                 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
416                 t = strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixName, 5000);
417                 }
418             }
419         }
420     }
421 
422     unsigned long elapsedTime = timeGetTime() - startTime;
423     int ns = (int)(float(1000000) * (float)elapsedTime / (float)(adj_loopCount*gNumFileLines));
424 
425     if (opt_terse == FALSE) {
426         printf("Sort Key Generation:  total # of keys = %d\n", loops*gNumFileLines);
427         printf("Sort Key Generation:  time per key = %d ns\n", ns);
428     }
429     else {
430         printf("%d,  ", ns);
431     }
432 
433     int   totalKeyLen = 0;
434     int   totalChars  = 0;
435     for (line=0; line<gNumFileLines; line++) {
436         totalChars += u_strlen(gFileLines[line].name);
437         if (opt_win) {
438             totalKeyLen += strlen(gFileLines[line].winSortKey);
439         }
440         else if (opt_icu) {
441             totalKeyLen += strlen(gFileLines[line].icuSortKey);
442         }
443         else if (opt_unix) {
444             totalKeyLen += strlen(gFileLines[line].unixSortKey);
445         }
446 
447     }
448     if (opt_terse == FALSE) {
449         printf("Key Length / character = %f\n", (float)totalKeyLen / (float)totalChars);
450     } else {
451         printf("%f, ", (float)totalKeyLen / (float)totalChars);
452     }
453 }
454 
455 
456 
457 //---------------------------------------------------------------------------------------
458 //
459 //    doBinarySearch()    Binary Search timing test.  Each name from the list
460 //                        is looked up in the full sorted list of names.
461 //
462 //---------------------------------------------------------------------------------------
doBinarySearch()463 void doBinarySearch()
464 {
465 
466     gCount = 0;
467     int  line;
468     int  loops = 0;
469     int  iLoop = 0;
470     unsigned long elapsedTime = 0;
471 
472     // Adjust loop count to compensate for file size.   Should be order n (lookups) * log n  (compares/lookup)
473     // Accurate timings do not depend on this being perfect.  The correction is just to try to
474     //   get total running times of about the right order, so the that user doesn't need to
475     //   manually adjust the loop count for every different file size.
476     double dLoopCount = double(opt_loopCount) * 3000. / (log10(gNumFileLines) * double(gNumFileLines));
477     if (opt_usekeys) dLoopCount *= 5;
478     int adj_loopCount = int(dLoopCount);
479     if (adj_loopCount < 1) adj_loopCount = 1;
480 
481 
482     for (;;) {  // not really a loop, just allows "break" to work, to simplify
483                 //   inadvertantly running more than one test through here.
484         if (opt_strcmp || opt_strcmpCPO)
485         {
486             unsigned long startTime = timeGetTime();
487             typedef int32_t (U_EXPORT2 *PF)(const UChar *, const UChar *);
488             PF pf = u_strcmp;
489             if (opt_strcmpCPO) {pf = u_strcmpCodePointOrder;}
490             //if (opt_strcmp && opt_win) {pf = (PF)wcscmp;}   // Damn the difference between int32_t and int
491                                                             //   which forces the use of a cast here.
492 
493             int r = 0;
494             for (loops=0; loops<adj_loopCount; loops++) {
495 
496                 for (line=0; line < gNumFileLines; line++) {
497                     int hi      = gNumFileLines-1;
498                     int lo      = 0;
499                     int  guess = -1;
500                     for (;;) {
501                         int newGuess = (hi + lo) / 2;
502                         if (newGuess == guess)
503                             break;
504                         guess = newGuess;
505                         for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
506                             r = (*pf)((gSortedLines[line])->name, (gSortedLines[guess])->name);
507                         }
508                         gCount++;
509                         if (r== 0)
510                             break;
511                         if (r < 0)
512                             hi = guess;
513                         else
514                             lo   = guess;
515                     }
516                 }
517             }
518             elapsedTime = timeGetTime() - startTime;
519             break;
520         }
521 
522 
523         if (opt_icu)
524         {
525             unsigned long startTime = timeGetTime();
526             UCollationResult  r = UCOL_EQUAL;
527             for (loops=0; loops<adj_loopCount; loops++) {
528 
529                 for (line=0; line < gNumFileLines; line++) {
530                     int lineLen  = -1;
531                     int guessLen = -1;
532                     if (opt_uselen) {
533                         lineLen = (gSortedLines[line])->len;
534                     }
535                     int hi      = gNumFileLines-1;
536                     int lo      = 0;
537                     int  guess = -1;
538                     for (;;) {
539                         int newGuess = (hi + lo) / 2;
540                         if (newGuess == guess)
541                             break;
542                         guess = newGuess;
543                         int ri = 0;
544                         if (opt_usekeys) {
545                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
546                                 ri = strcmp((gSortedLines[line])->icuSortKey, (gSortedLines[guess])->icuSortKey);
547                             }
548                             gCount++;
549                             r=UCOL_GREATER; if(ri<0) {r=UCOL_LESS;} else if (ri==0) {r=UCOL_EQUAL;}
550                         }
551                         else
552                         {
553                             if (opt_uselen) {
554                                 guessLen = (gSortedLines[guess])->len;
555                             }
556                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
557                                 r = ucol_strcoll(gCol, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen);
558                             }
559                             gCount++;
560                         }
561                         if (r== UCOL_EQUAL)
562                             break;
563                         if (r == UCOL_LESS)
564                             hi = guess;
565                         else
566                             lo   = guess;
567                     }
568                 }
569             }
570             elapsedTime = timeGetTime() - startTime;
571             break;
572         }
573 
574         if (opt_win)
575         {
576             unsigned long startTime = timeGetTime();
577             int r = 0;
578             for (loops=0; loops<adj_loopCount; loops++) {
579 
580                 for (line=0; line < gNumFileLines; line++) {
581                     int lineLen  = -1;
582                     int guessLen = -1;
583                     if (opt_uselen) {
584                         lineLen = (gSortedLines[line])->len;
585                     }
586                     int hi   = gNumFileLines-1;
587                     int lo   = 0;
588                     int  guess = -1;
589                     for (;;) {
590                         int newGuess = (hi + lo) / 2;
591                         if (newGuess == guess)
592                             break;
593                         guess = newGuess;
594                         if (opt_usekeys) {
595                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
596                                 r = strcmp((gSortedLines[line])->winSortKey, (gSortedLines[guess])->winSortKey);
597                             }
598                             gCount++;
599                             r+=2;
600                         }
601                         else
602                         {
603                             if (opt_uselen) {
604                                 guessLen = (gSortedLines[guess])->len;
605                             }
606                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
607                                 r = CompareStringW(gWinLCID, 0, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen);
608                             }
609                             if (r == 0) {
610                                 if (opt_terse == FALSE) {
611                                     fprintf(stderr, "Error returned from Windows CompareStringW.\n");
612                                 }
613                                 exit(-1);
614                             }
615                             gCount++;
616                         }
617                         if (r== 2)   //  strings ==
618                             break;
619                         if (r == 1)  //  line < guess
620                             hi = guess;
621                         else         //  line > guess
622                             lo   = guess;
623                     }
624                 }
625             }
626             elapsedTime = timeGetTime() - startTime;
627             break;
628         }
629 
630         if (opt_unix)
631         {
632             unsigned long startTime = timeGetTime();
633             int r = 0;
634             for (loops=0; loops<adj_loopCount; loops++) {
635 
636                 for (line=0; line < gNumFileLines; line++) {
637                     int hi   = gNumFileLines-1;
638                     int lo   = 0;
639                     int  guess = -1;
640                     for (;;) {
641                         int newGuess = (hi + lo) / 2;
642                         if (newGuess == guess)
643                             break;
644                         guess = newGuess;
645                         if (opt_usekeys) {
646                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
647                                  r = strcmp((gSortedLines[line])->unixSortKey, (gSortedLines[guess])->unixSortKey);
648                             }
649                             gCount++;
650                         }
651                         else
652                         {
653                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
654                                 r = strcoll((gSortedLines[line])->unixName, (gSortedLines[guess])->unixName);
655                             }
656                             errno = 0;
657                             if (errno != 0) {
658                                 fprintf(stderr, "Error %d returned from strcoll.\n", errno);
659                                 exit(-1);
660                             }
661                             gCount++;
662                         }
663                         if (r == 0)   //  strings ==
664                             break;
665                         if (r < 0)  //  line < guess
666                             hi = guess;
667                         else         //  line > guess
668                             lo   = guess;
669                     }
670                 }
671             }
672             elapsedTime = timeGetTime() - startTime;
673             break;
674         }
675         break;
676     }
677 
678     int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
679     if (opt_terse == FALSE) {
680         printf("binary search:  total # of string compares = %d\n", gCount);
681         printf("binary search:  compares per loop = %d\n", gCount / loops);
682         printf("binary search:  time per compare = %d ns\n", ns);
683     } else {
684         printf("%d, ", ns);
685     }
686 
687 }
688 
689 
690 
691 
692 //---------------------------------------------------------------------------------------
693 //
694 //   doQSort()    The quick sort timing test.  Uses the C library qsort function.
695 //
696 //---------------------------------------------------------------------------------------
doQSort()697 void doQSort() {
698     int i;
699     Line **sortBuf = new Line *[gNumFileLines];
700 
701     // Adjust loop count to compensate for file size.   QSort should be n log(n)
702     double dLoopCount = double(opt_loopCount) * 3000. / (log10(gNumFileLines) * double(gNumFileLines));
703     if (opt_usekeys) dLoopCount *= 5;
704     int adj_loopCount = int(dLoopCount);
705     if (adj_loopCount < 1) adj_loopCount = 1;
706 
707 
708     gCount = 0;
709     unsigned long startTime = timeGetTime();
710     if (opt_win && opt_usekeys) {
711         for (i=0; i<opt_loopCount; i++) {
712             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
713             qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpK);
714         }
715     }
716 
717     else if (opt_win && opt_uselen) {
718         for (i=0; i<adj_loopCount; i++) {
719             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
720             qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpL);
721         }
722     }
723 
724 
725     else if (opt_win && !opt_uselen) {
726         for (i=0; i<adj_loopCount; i++) {
727             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
728             qsort(sortBuf, gNumFileLines, sizeof(Line *), Winstrcmp);
729         }
730     }
731 
732     else if (opt_icu && opt_usekeys) {
733         for (i=0; i<adj_loopCount; i++) {
734             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
735             qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpK);
736         }
737     }
738 
739     else if (opt_icu && opt_uselen) {
740         for (i=0; i<adj_loopCount; i++) {
741             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
742             qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpL);
743         }
744     }
745 
746 
747     else if (opt_icu && !opt_uselen) {
748         for (i=0; i<adj_loopCount; i++) {
749             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
750             qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmp);
751         }
752     }
753 
754     else if (opt_unix && !opt_usekeys) {
755         for (i=0; i<adj_loopCount; i++) {
756             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
757             qsort(sortBuf, gNumFileLines, sizeof(Line *), UNIXstrcmp);
758         }
759     }
760 
761     unsigned long elapsedTime = timeGetTime() - startTime;
762     int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
763     if (opt_terse == FALSE) {
764         printf("qsort:  total # of string compares = %d\n", gCount);
765         printf("qsort:  time per compare = %d ns\n", ns);
766     } else {
767         printf("%d, ", ns);
768     }
769 }
770 
771 
772 
773 //---------------------------------------------------------------------------------------
774 //
775 //    doKeyHist()       Output a table of data for
776 //                        average sort key size vs. string length.
777 //
778 //---------------------------------------------------------------------------------------
doKeyHist()779 void doKeyHist() {
780     int     i;
781     int     maxLen = 0;
782 
783     // Find the maximum string length
784     for (i=0; i<gNumFileLines; i++) {
785         if (gFileLines[i].len > maxLen) maxLen = gFileLines[i].len;
786     }
787 
788     // Allocate arrays to hold the histogram data
789     int *accumulatedLen  = new int[maxLen+1];
790     int *numKeysOfSize   = new int[maxLen+1];
791     for (i=0; i<=maxLen; i++) {
792         accumulatedLen[i] = 0;
793         numKeysOfSize[i] = 0;
794     }
795 
796     // Fill the arrays...
797     for (i=0; i<gNumFileLines; i++) {
798         int len = gFileLines[i].len;
799         accumulatedLen[len] += strlen(gFileLines[i].icuSortKey);
800         numKeysOfSize[len] += 1;
801     }
802 
803     // And write out averages
804     printf("String Length,  Avg Key Length,  Avg Key Len per char\n");
805     for (i=1; i<=maxLen; i++) {
806         if (numKeysOfSize[i] > 0) {
807             printf("%d, %f, %f\n", i, (float)accumulatedLen[i] / (float)numKeysOfSize[i],
808                 (float)accumulatedLen[i] / (float)(numKeysOfSize[i] * i));
809         }
810     }
811 }
812 
813 //---------------------------------------------------------------------------------------
814 //
815 //    doForwardIterTest(UBool)       Forward iteration test
816 //                                   argument null-terminated string used
817 //
818 //---------------------------------------------------------------------------------------
doForwardIterTest(UBool haslen)819 void doForwardIterTest(UBool haslen) {
820     int count = 0;
821 
822     UErrorCode error = U_ZERO_ERROR;
823     printf("\n\nPerforming forward iteration performance test with ");
824 
825     if (haslen) {
826         printf("non-null terminated data -----------\n");
827     }
828     else {
829         printf("null terminated data -----------\n");
830     }
831     printf("performance test on strings from file -----------\n");
832 
833     UChar dummytext[] = {0, 0};
834     UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error);
835     ucol_setText(iter, dummytext, 1, &error);
836 
837     gCount = 0;
838     unsigned long startTime = timeGetTime();
839     while (count < opt_loopCount) {
840         int linecount = 0;
841         while (linecount < gNumFileLines) {
842             UChar *str = gFileLines[linecount].name;
843             int strlen = haslen?gFileLines[linecount].len:-1;
844             ucol_setText(iter, str, strlen, &error);
845             while (ucol_next(iter, &error) != UCOL_NULLORDER) {
846                 gCount++;
847             }
848 
849             linecount ++;
850         }
851         count ++;
852     }
853     unsigned long elapsedTime = timeGetTime() - startTime;
854     printf("elapsedTime %ld\n", elapsedTime);
855 
856     // empty loop recalculation
857     count = 0;
858     startTime = timeGetTime();
859     while (count < opt_loopCount) {
860         int linecount = 0;
861         while (linecount < gNumFileLines) {
862             UChar *str = gFileLines[linecount].name;
863             int strlen = haslen?gFileLines[linecount].len:-1;
864             ucol_setText(iter, str, strlen, &error);
865             linecount ++;
866         }
867         count ++;
868     }
869     elapsedTime -= (timeGetTime() - startTime);
870     printf("elapsedTime %ld\n", elapsedTime);
871 
872     ucol_closeElements(iter);
873 
874     int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
875     printf("Total number of strings compared %d in %d loops\n", gNumFileLines,
876                                                                 opt_loopCount);
877     printf("Average time per ucol_next() nano seconds %d\n", ns);
878 
879     printf("performance test on skipped-5 concatenated strings from file -----------\n");
880 
881     UChar *str;
882     int    strlen = 0;
883     // appending all the strings
884     int linecount = 0;
885     while (linecount < gNumFileLines) {
886         strlen += haslen?gFileLines[linecount].len:
887                                       u_strlen(gFileLines[linecount].name);
888         linecount ++;
889     }
890     str = (UChar *)malloc(sizeof(UChar) * strlen);
891     int strindex = 0;
892     linecount = 0;
893     while (strindex < strlen) {
894         int len = 0;
895         len += haslen?gFileLines[linecount].len:
896                                       u_strlen(gFileLines[linecount].name);
897         memcpy(str + strindex, gFileLines[linecount].name,
898                sizeof(UChar) * len);
899         strindex += len;
900         linecount ++;
901     }
902 
903     printf("Total size of strings %d\n", strlen);
904 
905     gCount = 0;
906     count  = 0;
907 
908     if (!haslen) {
909         strlen = -1;
910     }
911     iter = ucol_openElements(gCol, str, strlen, &error);
912     if (!haslen) {
913         strlen = u_strlen(str);
914     }
915     strlen -= 5; // any left over characters are not iterated,
916                  // this is to ensure the backwards and forwards iterators
917                  // gets the same position
918     startTime = timeGetTime();
919     while (count < opt_loopCount) {
920         int count5 = 5;
921         strindex = 0;
922         ucol_setOffset(iter, strindex, &error);
923         while (TRUE) {
924             if (ucol_next(iter, &error) == UCOL_NULLORDER) {
925                 break;
926             }
927             gCount++;
928             count5 --;
929             if (count5 == 0) {
930                 strindex += 10;
931                 if (strindex > strlen) {
932                     break;
933                 }
934                 ucol_setOffset(iter, strindex, &error);
935                 count5 = 5;
936             }
937         }
938         count ++;
939     }
940 
941     elapsedTime = timeGetTime() - startTime;
942     printf("elapsedTime %ld\n", elapsedTime);
943 
944     // empty loop recalculation
945     int tempgCount = 0;
946     count = 0;
947     startTime = timeGetTime();
948     while (count < opt_loopCount) {
949         int count5 = 5;
950         strindex = 0;
951         ucol_setOffset(iter, strindex, &error);
952         while (TRUE) {
953             tempgCount ++;
954             count5 --;
955             if (count5 == 0) {
956                 strindex += 10;
957                 if (strindex > strlen) {
958                     break;
959                 }
960                 ucol_setOffset(iter, strindex, &error);
961                 count5 = 5;
962             }
963         }
964         count ++;
965     }
966     elapsedTime -= (timeGetTime() - startTime);
967     printf("elapsedTime %ld\n", elapsedTime);
968 
969     ucol_closeElements(iter);
970 
971     printf("gCount %d\n", gCount);
972     ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
973     printf("Average time per ucol_next() nano seconds %d\n", ns);
974 }
975 
976 //---------------------------------------------------------------------------------------
977 //
978 //    doBackwardIterTest(UBool)      Backwards iteration test
979 //                                   argument null-terminated string used
980 //
981 //---------------------------------------------------------------------------------------
doBackwardIterTest(UBool haslen)982 void doBackwardIterTest(UBool haslen) {
983     int count = 0;
984     UErrorCode error = U_ZERO_ERROR;
985     printf("\n\nPerforming backward iteration performance test with ");
986 
987     if (haslen) {
988         printf("non-null terminated data -----------\n");
989     }
990     else {
991         printf("null terminated data -----------\n");
992     }
993 
994     printf("performance test on strings from file -----------\n");
995 
996     UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error);
997     UChar dummytext[] = {0, 0};
998     ucol_setText(iter, dummytext, 1, &error);
999 
1000     gCount = 0;
1001     unsigned long startTime = timeGetTime();
1002     while (count < opt_loopCount) {
1003         int linecount = 0;
1004         while (linecount < gNumFileLines) {
1005             UChar *str = gFileLines[linecount].name;
1006             int strlen = haslen?gFileLines[linecount].len:-1;
1007             ucol_setText(iter, str, strlen, &error);
1008             while (ucol_previous(iter, &error) != UCOL_NULLORDER) {
1009                 gCount ++;
1010             }
1011 
1012             linecount ++;
1013         }
1014         count ++;
1015     }
1016     unsigned long elapsedTime = timeGetTime() - startTime;
1017 
1018     printf("elapsedTime %ld\n", elapsedTime);
1019 
1020     // empty loop recalculation
1021     count = 0;
1022     startTime = timeGetTime();
1023     while (count < opt_loopCount) {
1024         int linecount = 0;
1025         while (linecount < gNumFileLines) {
1026             UChar *str = gFileLines[linecount].name;
1027             int strlen = haslen?gFileLines[linecount].len:-1;
1028             ucol_setText(iter, str, strlen, &error);
1029             linecount ++;
1030         }
1031         count ++;
1032     }
1033     elapsedTime -= (timeGetTime() - startTime);
1034 
1035     printf("elapsedTime %ld\n", elapsedTime);
1036     ucol_closeElements(iter);
1037 
1038     int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
1039     printf("Total number of strings compared %d in %d loops\n", gNumFileLines,
1040                                                                 opt_loopCount);
1041     printf("Average time per ucol_previous() nano seconds %d\n", ns);
1042 
1043     printf("performance test on skipped-5 concatenated strings from file -----------\n");
1044 
1045     UChar *str;
1046     int    strlen = 0;
1047     // appending all the strings
1048     int linecount = 0;
1049     while (linecount < gNumFileLines) {
1050         strlen += haslen?gFileLines[linecount].len:
1051                                       u_strlen(gFileLines[linecount].name);
1052         linecount ++;
1053     }
1054     str = (UChar *)malloc(sizeof(UChar) * strlen);
1055     int strindex = 0;
1056     linecount = 0;
1057     while (strindex < strlen) {
1058         int len = 0;
1059         len += haslen?gFileLines[linecount].len:
1060                                       u_strlen(gFileLines[linecount].name);
1061         memcpy(str + strindex, gFileLines[linecount].name,
1062                sizeof(UChar) * len);
1063         strindex += len;
1064         linecount ++;
1065     }
1066 
1067     printf("Total size of strings %d\n", strlen);
1068 
1069     gCount = 0;
1070     count  = 0;
1071 
1072     if (!haslen) {
1073         strlen = -1;
1074     }
1075 
1076     iter = ucol_openElements(gCol, str, strlen, &error);
1077     if (!haslen) {
1078         strlen = u_strlen(str);
1079     }
1080 
1081     startTime = timeGetTime();
1082     while (count < opt_loopCount) {
1083         int count5 = 5;
1084         strindex = 5;
1085         ucol_setOffset(iter, strindex, &error);
1086         while (TRUE) {
1087             if (ucol_previous(iter, &error) == UCOL_NULLORDER) {
1088                 break;
1089             }
1090              gCount ++;
1091              count5 --;
1092              if (count5 == 0) {
1093                  strindex += 10;
1094                  if (strindex > strlen) {
1095                     break;
1096                  }
1097                  ucol_setOffset(iter, strindex, &error);
1098                  count5 = 5;
1099              }
1100         }
1101         count ++;
1102     }
1103 
1104     elapsedTime = timeGetTime() - startTime;
1105     printf("elapsedTime %ld\n", elapsedTime);
1106 
1107     // empty loop recalculation
1108     count = 0;
1109     int tempgCount = 0;
1110     startTime = timeGetTime();
1111     while (count < opt_loopCount) {
1112         int count5 = 5;
1113         strindex = 5;
1114         ucol_setOffset(iter, strindex, &error);
1115         while (TRUE) {
1116              tempgCount ++;
1117              count5 --;
1118              if (count5 == 0) {
1119                  strindex += 10;
1120                  if (strindex > strlen) {
1121                     break;
1122                  }
1123                  ucol_setOffset(iter, strindex, &error);
1124                  count5 = 5;
1125              }
1126         }
1127         count ++;
1128     }
1129     elapsedTime -= (timeGetTime() - startTime);
1130     printf("elapsedTime %ld\n", elapsedTime);
1131     ucol_closeElements(iter);
1132 
1133     printf("gCount %d\n", gCount);
1134     ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
1135     printf("Average time per ucol_previous() nano seconds %d\n", ns);
1136 }
1137 
1138 //---------------------------------------------------------------------------------------
1139 //
1140 //    doIterTest()       Iteration test
1141 //
1142 //---------------------------------------------------------------------------------------
doIterTest()1143 void doIterTest() {
1144     doForwardIterTest(opt_uselen);
1145     doBackwardIterTest(opt_uselen);
1146 }
1147 
1148 
1149 //----------------------------------------------------------------------------------------
1150 //
1151 //   UnixConvert   -- Convert the lines of the file to the encoding for UNIX
1152 //                    Since it appears that Unicode support is going in the general
1153 //                    direction of the use of UTF-8 locales, that is the approach
1154 //                    that is used here.
1155 //
1156 //----------------------------------------------------------------------------------------
UnixConvert()1157 void  UnixConvert() {
1158     int    line;
1159 
1160     UConverter   *cvrtr;    // An ICU code page converter.
1161     UErrorCode    status = U_ZERO_ERROR;
1162 
1163 
1164     cvrtr = ucnv_open("utf-8", &status);    // we are just doing UTF-8 locales for now.
1165     if (U_FAILURE(status)) {
1166         fprintf(stderr, "ICU Converter open failed.: %s\n", u_errorName(status));
1167         exit(-1);
1168     }
1169 
1170     for (line=0; line < gNumFileLines; line++) {
1171         int sizeNeeded = ucnv_fromUChars(cvrtr,
1172                                          0,            // ptr to target buffer.
1173                                          0,            // length of target buffer.
1174                                          gFileLines[line].name,
1175                                          -1,           //  source is null terminated
1176                                          &status);
1177         if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {
1178             //fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
1179             //exit(-1);
1180         }
1181         status = U_ZERO_ERROR;
1182         gFileLines[line].unixName = new char[sizeNeeded+1];
1183         sizeNeeded = ucnv_fromUChars(cvrtr,
1184                                          gFileLines[line].unixName, // ptr to target buffer.
1185                                          sizeNeeded+1, // length of target buffer.
1186                                          gFileLines[line].name,
1187                                          -1,           //  source is null terminated
1188                                          &status);
1189         if (U_FAILURE(status)) {
1190             fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
1191             exit(-1);
1192         }
1193         gFileLines[line].unixName[sizeNeeded] = 0;
1194     };
1195     ucnv_close(cvrtr);
1196 }
1197 
1198 
1199 //----------------------------------------------------------------------------------------
1200 //
1201 //  class UCharFile   Class to hide all the gorp to read a file in
1202 //                    and produce a stream of UChars.
1203 //
1204 //----------------------------------------------------------------------------------------
1205 class UCharFile {
1206 public:
1207     UCharFile(const char *fileName);
1208     ~UCharFile();
1209     UChar   get();
eof()1210     UBool   eof() {return fEof;};
error()1211     UBool   error() {return fError;};
1212 
1213 private:
UCharFile(const UCharFile &)1214     UCharFile (const UCharFile & /*other*/) {};                         // No copy constructor.
operator =(const UCharFile &)1215     UCharFile & operator = (const UCharFile &/*other*/) {return *this;};   // No assignment op
1216 
1217     FILE         *fFile;
1218     const char   *fName;
1219     UBool        fEof;
1220     UBool        fError;
1221     UChar        fPending2ndSurrogate;
1222 
1223     enum {UTF16LE, UTF16BE, UTF8} fEncoding;
1224 };
1225 
UCharFile(const char * fileName)1226 UCharFile::UCharFile(const char * fileName) {
1227     fEof                 = FALSE;
1228     fError               = FALSE;
1229     fName                = fileName;
1230     fFile                = fopen(fName, "rb");
1231     fPending2ndSurrogate = 0;
1232     if (fFile == NULL) {
1233         fprintf(stderr, "Can not open file \"%s\"\n", opt_fName);
1234         fError = TRUE;
1235         return;
1236     }
1237     //
1238     //  Look for the byte order mark at the start of the file.
1239     //
1240     int BOMC1, BOMC2, BOMC3;
1241     BOMC1 = fgetc(fFile);
1242     BOMC2 = fgetc(fFile);
1243 
1244     if (BOMC1 == 0xff && BOMC2 == 0xfe) {
1245         fEncoding = UTF16LE; }
1246     else if (BOMC1 == 0xfe && BOMC2 == 0xff) {
1247         fEncoding = UTF16BE; }
1248     else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) {
1249         fEncoding = UTF8; }
1250     else
1251     {
1252         fprintf(stderr, "collperf:  file \"%s\" encoding must be UTF-8 or UTF-16, and "
1253             "must include a BOM.\n", fileName);
1254         fError = true;
1255         return;
1256     }
1257 }
1258 
1259 
~UCharFile()1260 UCharFile::~UCharFile() {
1261     fclose(fFile);
1262 }
1263 
1264 
1265 
get()1266 UChar UCharFile::get() {
1267     UChar   c;
1268     switch (fEncoding) {
1269     case UTF16LE:
1270         {
1271             int  cL, cH;
1272             cL = fgetc(fFile);
1273             cH = fgetc(fFile);
1274             c  = cL  | (cH << 8);
1275             if (cH == EOF) {
1276                 c   = 0;
1277                 fEof = TRUE;
1278             }
1279             break;
1280         }
1281     case UTF16BE:
1282         {
1283             int  cL, cH;
1284             cH = fgetc(fFile);
1285             cL = fgetc(fFile);
1286             c  = cL  | (cH << 8);
1287             if (cL == EOF) {
1288                 c   = 0;
1289                 fEof = TRUE;
1290             }
1291             break;
1292         }
1293     case UTF8:
1294         {
1295             if (fPending2ndSurrogate != 0) {
1296                 c = fPending2ndSurrogate;
1297                 fPending2ndSurrogate = 0;
1298                 break;
1299             }
1300 
1301             int ch = fgetc(fFile);   // Note:  c and ch are separate cause eof test doesn't work on UChar type.
1302             if (ch == EOF) {
1303                 c = 0;
1304                 fEof = TRUE;
1305                 break;
1306             }
1307 
1308             if (ch <= 0x7f) {
1309                 // It's ascii.  No further utf-8 conversion.
1310                 c = ch;
1311                 break;
1312             }
1313 
1314             // Figure out the lenght of the char and read the rest of the bytes
1315             //   into a temp array.
1316             int nBytes;
1317             if (ch >= 0xF0) {nBytes=4;}
1318             else if (ch >= 0xE0) {nBytes=3;}
1319             else if (ch >= 0xC0) {nBytes=2;}
1320             else {
1321                 fprintf(stderr, "utf-8 encoded file contains corrupt data.\n");
1322                 fError = TRUE;
1323                 return 0;
1324             }
1325 
1326             unsigned char  bytes[10];
1327             bytes[0] = (unsigned char)ch;
1328             int i;
1329             for (i=1; i<nBytes; i++) {
1330                 bytes[i] = fgetc(fFile);
1331                 if (bytes[i] < 0x80 || bytes[i] >= 0xc0) {
1332                     fprintf(stderr, "utf-8 encoded file contains corrupt data.\n");
1333                     fError = TRUE;
1334                     return 0;
1335                 }
1336             }
1337 
1338             // Convert the bytes from the temp array to a Unicode char.
1339             i = 0;
1340             uint32_t  cp;
1341             UTF8_NEXT_CHAR_UNSAFE(bytes, i, cp);
1342             c = (UChar)cp;
1343 
1344             if (cp >= 0x10000) {
1345                 // The code point needs to be broken up into a utf-16 surrogate pair.
1346                 //  Process first half this time through the main loop, and
1347                 //   remember the other half for the next time through.
1348                 UChar utf16Buf[3];
1349                 i = 0;
1350                 UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp);
1351                 fPending2ndSurrogate = utf16Buf[1];
1352                 c = utf16Buf[0];
1353             }
1354             break;
1355         };
1356     default:
1357         c = 0xFFFD; /* Error, unspecified codepage*/
1358         fprintf(stderr, "UCharFile: Error: unknown fEncoding\n");
1359         exit(1);
1360     }
1361     return c;
1362 }
1363 
1364 //----------------------------------------------------------------------------------------
1365 //
1366 //   openRulesCollator  - Command line specified a rules file.  Read it in
1367 //                        and open a collator with it.
1368 //
1369 //----------------------------------------------------------------------------------------
openRulesCollator()1370 UCollator *openRulesCollator() {
1371     UCharFile f(opt_rules);
1372     if (f.error()) {
1373         return 0;
1374     }
1375 
1376     int  bufLen = 10000;
1377     UChar *buf = (UChar *)malloc(bufLen * sizeof(UChar));
1378     int i = 0;
1379 
1380     for(;;) {
1381         buf[i] = f.get();
1382         if (f.eof()) {
1383             break;
1384         }
1385         if (f.error()) {
1386             return 0;
1387         }
1388         i++;
1389         if (i >= bufLen) {
1390             bufLen += 10000;
1391             buf = (UChar *)realloc(buf, bufLen);
1392         }
1393     }
1394     buf[i] = 0;
1395 
1396     UErrorCode    status = U_ZERO_ERROR;
1397     UCollator *coll = ucol_openRules(buf, u_strlen(buf), UCOL_OFF,
1398                                          UCOL_DEFAULT_STRENGTH, NULL, &status);
1399     if (U_FAILURE(status)) {
1400         fprintf(stderr, "ICU ucol_openRules() open failed.: %d\n", status);
1401         return 0;
1402     }
1403     free(buf);
1404     return coll;
1405 }
1406 
1407 
1408 
1409 
1410 
1411 //----------------------------------------------------------------------------------------
1412 //
1413 //    Main   --  process command line, read in and pre-process the test file,
1414 //                 call other functions to do the actual tests.
1415 //
1416 //----------------------------------------------------------------------------------------
main(int argc,const char ** argv)1417 int main(int argc, const char** argv) {
1418     if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) {
1419         printf(gUsageString);
1420         exit (1);
1421     }
1422 
1423     // Make sure that we've only got one API selected.
1424     if (opt_unix || opt_win) opt_icu = FALSE;
1425     if (opt_unix) opt_win = FALSE;
1426 
1427     //
1428     //  Set up an ICU collator
1429     //
1430     UErrorCode          status = U_ZERO_ERROR;
1431 
1432     if (opt_rules != 0) {
1433         gCol = openRulesCollator();
1434         if (gCol == 0) {return -1;}
1435     }
1436     else {
1437         gCol = ucol_open(opt_locale, &status);
1438         if (U_FAILURE(status)) {
1439             fprintf(stderr, "Collator creation failed.: %d\n", status);
1440             return -1;
1441         }
1442     }
1443     if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) {
1444         fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale);
1445     }
1446     if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) {
1447         fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale);
1448     }
1449 
1450     if (opt_norm) {
1451         ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
1452     }
1453     if (opt_french && opt_frenchoff) {
1454         fprintf(stderr, "collperf:  Error, specified both -french and -frenchoff options.");
1455         exit(-1);
1456     }
1457     if (opt_french) {
1458         ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_ON, &status);
1459     }
1460     if (opt_frenchoff) {
1461         ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_OFF, &status);
1462     }
1463     if (opt_lower) {
1464         ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_LOWER_FIRST, &status);
1465     }
1466     if (opt_upper) {
1467         ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_UPPER_FIRST, &status);
1468     }
1469     if (opt_case) {
1470         ucol_setAttribute(gCol, UCOL_CASE_LEVEL, UCOL_ON, &status);
1471     }
1472     if (opt_shifted) {
1473         ucol_setAttribute(gCol, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, &status);
1474     }
1475     if (opt_level != 0) {
1476         switch (opt_level) {
1477         case 1:
1478             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_PRIMARY, &status);
1479             break;
1480         case 2:
1481             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_SECONDARY, &status);
1482             break;
1483         case 3:
1484             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_TERTIARY, &status);
1485             break;
1486         case 4:
1487             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_QUATERNARY, &status);
1488             break;
1489         case 5:
1490             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_IDENTICAL, &status);
1491             break;
1492         default:
1493             fprintf(stderr, "-level param must be between 1 and 5\n");
1494             exit(-1);
1495         }
1496     }
1497 
1498     if (U_FAILURE(status)) {
1499         fprintf(stderr, "Collator attribute setting failed.: %d\n", status);
1500         return -1;
1501     }
1502 
1503 
1504     //
1505     //  Set up a Windows LCID
1506     //
1507     if (opt_langid != 0) {
1508         gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);
1509     }
1510     else {
1511         gWinLCID = uloc_getLCID(opt_locale);
1512     }
1513 
1514 
1515     //
1516     //  Set the UNIX locale
1517     //
1518     if (opt_unix) {
1519         if (setlocale(LC_ALL, opt_locale) == 0) {
1520             fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale);
1521             exit(-1);
1522         }
1523     }
1524 
1525     // Read in  the input file.
1526     //   File assumed to be utf-16.
1527     //   Lines go onto heap buffers.  Global index array to line starts is created.
1528     //   Lines themselves are null terminated.
1529     //
1530 
1531     UCharFile f(opt_fName);
1532     if (f.error()) {
1533         exit(-1);
1534     }
1535 
1536     const int MAXLINES = 100000;
1537     gFileLines = new Line[MAXLINES];
1538     UChar buf[1024];
1539     int   column = 0;
1540 
1541     //  Read the file, split into lines, and save in memory.
1542     //  Loop runs once per utf-16 value from the input file,
1543     //    (The number of bytes read from file per loop iteration depends on external encoding.)
1544     for (;;) {
1545 
1546         UChar c = f.get();
1547         if (f.error()){
1548             exit(-1);
1549         }
1550 
1551 
1552         // We now have a good UTF-16 value in c.
1553 
1554         // Watch for CR, LF, EOF; these finish off a line.
1555         if (c == 0xd) {
1556             continue;
1557         }
1558 
1559         if (f.eof() || c == 0x0a || c==0x2028) {  // Unipad inserts 2028 line separators!
1560             buf[column++] = 0;
1561             if (column > 1) {
1562                 gFileLines[gNumFileLines].name  = new UChar[column];
1563                 gFileLines[gNumFileLines].len   = column-1;
1564                 memcpy(gFileLines[gNumFileLines].name, buf, column * sizeof(UChar));
1565                 gNumFileLines++;
1566                 column = 0;
1567                 if (gNumFileLines >= MAXLINES) {
1568                     fprintf(stderr, "File too big.  Max number of lines is %d\n", MAXLINES);
1569                     exit(-1);
1570                 }
1571 
1572             }
1573             if (c == 0xa || c == 0x2028)
1574                 continue;
1575             else
1576                 break;  // EOF
1577         }
1578         buf[column++] = c;
1579         if (column >= 1023)
1580         {
1581             static UBool warnFlag = TRUE;
1582             if (warnFlag) {
1583                 fprintf(stderr, "Warning - file line longer than 1023 chars truncated.\n");
1584                 warnFlag = FALSE;
1585             }
1586             column--;
1587         }
1588     }
1589 
1590     if (opt_terse == FALSE) {
1591         printf("file \"%s\", %d lines.\n", opt_fName, gNumFileLines);
1592     }
1593 
1594 
1595     // Convert the lines to the UNIX encoding.
1596     if (opt_unix) {
1597         UnixConvert();
1598     }
1599 
1600     //
1601     //  Pre-compute ICU sort keys for the lines of the file.
1602     //
1603     int line;
1604     int32_t t;
1605 
1606     for (line=0; line<gNumFileLines; line++) {
1607          t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)buf, sizeof(buf));
1608          gFileLines[line].icuSortKey  = new char[t];
1609 
1610          if (t > (int32_t)sizeof(buf)) {
1611              t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)gFileLines[line].icuSortKey , t);
1612          }
1613          else
1614          {
1615              memcpy(gFileLines[line].icuSortKey, buf, t);
1616          }
1617     }
1618 
1619 
1620 
1621     //
1622     //  Pre-compute Windows sort keys for the lines of the file.
1623     //
1624     for (line=0; line<gNumFileLines; line++) {
1625          t=LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, buf, sizeof(buf));
1626          gFileLines[line].winSortKey  = new char[t];
1627          if (t > (int32_t)sizeof(buf)) {
1628              t = LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, (unsigned short *)(gFileLines[line].winSortKey), t);
1629          }
1630          else
1631          {
1632              memcpy(gFileLines[line].winSortKey, buf, t);
1633          }
1634     }
1635 
1636     //
1637     //  Pre-compute UNIX sort keys for the lines of the file.
1638     //
1639     if (opt_unix) {
1640         for (line=0; line<gNumFileLines; line++) {
1641             t=strxfrm((char *)buf,  gFileLines[line].unixName,  sizeof(buf));
1642             gFileLines[line].unixSortKey  = new char[t];
1643             if (t > (int32_t)sizeof(buf)) {
1644                 t = strxfrm(gFileLines[line].unixSortKey,  gFileLines[line].unixName,  sizeof(buf));
1645             }
1646             else
1647             {
1648                 memcpy(gFileLines[line].unixSortKey, buf, t);
1649             }
1650         }
1651     }
1652 
1653 
1654     //
1655     //  Dump file lines, CEs, Sort Keys if requested.
1656     //
1657     if (opt_dump) {
1658         int  i;
1659         for (line=0; line<gNumFileLines; line++) {
1660             for (i=0;;i++) {
1661                 UChar  c = gFileLines[line].name[i];
1662                 if (c == 0)
1663                     break;
1664                 if (c < 0x20 || c > 0x7e) {
1665                     printf("\\u%.4x", c);
1666                 }
1667                 else {
1668                     printf("%c", c);
1669                 }
1670             }
1671             printf("\n");
1672 
1673             printf("   CEs: ");
1674             UCollationElements *CEiter = ucol_openElements(gCol, gFileLines[line].name, -1, &status);
1675             int32_t ce;
1676             i = 0;
1677             for (;;) {
1678                 ce = ucol_next(CEiter, &status);
1679                 if (ce == UCOL_NULLORDER) {
1680                     break;
1681                 }
1682                 printf(" %.8x", ce);
1683                 if (++i > 8) {
1684                     printf("\n        ");
1685                     i = 0;
1686                 }
1687             }
1688             printf("\n");
1689             ucol_closeElements(CEiter);
1690 
1691 
1692             printf("   ICU Sort Key: ");
1693             for (i=0; ; i++) {
1694                 unsigned char c = gFileLines[line].icuSortKey[i];
1695                 printf("%02x ", c);
1696                 if (c == 0) {
1697                     break;
1698                 }
1699                 if (i > 0 && i % 20 == 0) {
1700                     printf("\n                 ");
1701                 }
1702            }
1703             printf("\n");
1704         }
1705     }
1706 
1707 
1708     //
1709     //  Pre-sort the lines.
1710     //
1711     int i;
1712     gSortedLines = new Line *[gNumFileLines];
1713     for (i=0; i<gNumFileLines; i++) {
1714         gSortedLines[i] = &gFileLines[i];
1715     }
1716 
1717     if (opt_win) {
1718         qsort(gSortedLines, gNumFileLines, sizeof(Line *), Winstrcmp);
1719     }
1720     else if (opt_unix) {
1721         qsort(gSortedLines, gNumFileLines, sizeof(Line *), UNIXstrcmp);
1722     }
1723     else   /* ICU */
1724     {
1725         qsort(gSortedLines, gNumFileLines, sizeof(Line *), ICUstrcmp);
1726     }
1727 
1728 
1729     //
1730     //  Make up a randomized order, will be used for sorting tests.
1731     //
1732     gRandomLines = new Line *[gNumFileLines];
1733     for (i=0; i<gNumFileLines; i++) {
1734         gRandomLines[i] = &gFileLines[i];
1735     }
1736     qsort(gRandomLines, gNumFileLines, sizeof(Line *), ICURandomCmp);
1737 
1738 
1739 
1740 
1741     //
1742     //  We've got the file read into memory.  Go do something with it.
1743     //
1744 
1745     if (opt_qsort)     doQSort();
1746     if (opt_binsearch) doBinarySearch();
1747     if (opt_keygen)    doKeyGen();
1748     if (opt_keyhist)   doKeyHist();
1749     if (opt_itertest)  doIterTest();
1750 
1751     return 0;
1752 
1753 }
1754