• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /********************************************************************
2  * COPYRIGHT:
3  * Copyright (C) 2001-2012 IBM, Inc.   All Rights Reserved.
4  *
5  ********************************************************************/
6 /********************************************************************************
7 *
8 * File ubrkperf.cpp
9 *
10 * Modification History:
11 *        Name                     Description
12 *     Vladimir Weinstein          First Version, based on collperf
13 *
14 *********************************************************************************
15 */
16 
17 //
18 //  This program tests break iterator performance
19 //      Currently we test only ICU APIs with the future possibility of testing *nix & win32 APIs
20 //      (if any)
21 //      A text file is required as input.  It must be in utf-8 or utf-16 format,
22 //      and include a byte order mark.  Either LE or BE format is OK.
23 //
24 
// Help text printed by main() when -help / -? is given or the command line is invalid.
// Every option listed here has an entry in the opts[] table; options the table does
// not parse are not advertised.
const char gUsageString[] =
 "usage:  ubrkperf options...\n"
    "-help                      Display this message.\n"
    "-?                         Same as -help.\n"
    "-file file_name            utf-16/utf-8 format file.\n"
    "-locale name               ICU locale to use.  Default is en_US\n"
    "-langid 0x1234             Windows Language ID number.  Default to value for -locale option\n"
    "                              see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
    "-win                       Run test using Windows native services. (currently not working) (ICU is default)\n"
    "-unix                      Run test using Unix word breaking services. (currently not working) \n"
    "-mac                       Run test using MacOSX word breaking services.\n"
    "-uselen                    Use API with string lengths.  Default is null-terminated strings\n"
    "-char                      Use character break iterator\n"
    "-word                      Use word break iterator\n"
    "-line                      Use line break iterator\n"
    "-sentence                  Use sentence break iterator\n"
    "-loop nnnn                 Loopcount for test.  Adjust for reasonable total running time.\n"
    "-passes n                  Number of times the whole test is repeated.  Default = 1.\n"
    "-time n                    Accepted for compatibility, currently ignored.\n"
    "-terse                     Terse numbers-only output.  Intended for use by scripts.\n"
    "-dump                      Display stuff.\n"
    "-capi                      Use C APIs instead of C++ APIs (currently not working)\n"
    "-next                      Do the next test\n"
    "-isBound                   Do the isBound test\n"
    ;
49 
50 
51 #include <stdio.h>
52 #include <string.h>
53 #include <stdlib.h>
54 #include <math.h>
55 #include <locale.h>
56 #include <errno.h>
57 #include <sys/stat.h>
58 
59 #include <unicode/utypes.h>
60 #include <unicode/ucol.h>
61 #include <unicode/ucoleitr.h>
62 #include <unicode/uloc.h>
63 #include <unicode/ustring.h>
64 #include <unicode/ures.h>
65 #include <unicode/uchar.h>
66 #include <unicode/ucnv.h>
67 #include <unicode/utf8.h>
68 
69 #include <unicode/brkiter.h>
70 
71 
72 #if U_PLATFORM_HAS_WIN32_API
73 #include <windows.h>
74 #else
75 //
76 //  Stubs for Windows API functions when building on UNIXes.
77 //
78 #include <sys/time.h>
//
//  timeGetTime()   Stand-in for the Win32 function of the same name.
//                  Returns the gettimeofday() wall-clock time in milliseconds.
//                  The value is allowed to wrap around.
//
unsigned long timeGetTime() {
    struct timeval t;
    gettimeofday(&t, 0);
    // Scale in unsigned arithmetic: wrap-around is then well defined, whereas
    // overflowing the signed tv_sec * 1000 product would be undefined behavior.
    unsigned long val = (unsigned long)t.tv_sec * 1000UL;
    val += (unsigned long)(t.tv_usec / 1000);
    return val;
}
86 #define MAKELCID(a,b) 0
87 #endif
88 
89 
90 //
91 //  Command line option variables
92 //     These global variables are set according to the options specified
93 //     on the command line by the user.
94 char * opt_fName      = 0;
95 char * opt_locale     = "en_US";
96 int    opt_langid     = 0;         // Defaults to value corresponding to opt_locale.
97 char * opt_rules      = 0;
98 UBool  opt_help       = FALSE;
99 int    opt_time       = 0;
100 int    opt_loopCount  = 0;
101 int    opt_passesCount= 1;
102 UBool  opt_terse      = FALSE;
103 UBool  opt_icu        = TRUE;
104 UBool  opt_win        = FALSE;      // Run with Windows native functions.
105 UBool  opt_unix       = FALSE;      // Run with UNIX strcoll, strxfrm functions.
106 UBool  opt_mac        = FALSE;      // Run with MacOSX word break services.
107 UBool  opt_uselen     = FALSE;
108 UBool  opt_dump       = FALSE;
109 UBool  opt_char       = FALSE;
110 UBool  opt_word       = FALSE;
111 UBool  opt_line       = FALSE;
112 UBool  opt_sentence   = FALSE;
113 UBool  opt_capi       = FALSE;
114 
115 UBool  opt_next       = FALSE;
116 UBool  opt_isBound    = FALSE;
117 
118 
119 
120 //
121 //   Definitions for the command line options
122 //
// One entry of the command line option table: the option's spelling, the kind
// of value it takes, and the variable that receives the value.
struct OptSpec {
    enum Type {
        FLAG,      // no argument; pVar points to a UBool that is set to TRUE
        NUM,       // integer argument; pVar points to an int
        STRING     // string argument; pVar points to a char pointer
    };

    const char *name;   // option text as typed on the command line, e.g. "-file"
    Type        type;
    void       *pVar;   // variable to fill in; interpreted according to type
};
128 
129 OptSpec opts[] = {
130     {"-file",        OptSpec::STRING, &opt_fName},
131     {"-locale",      OptSpec::STRING, &opt_locale},
132     {"-langid",      OptSpec::NUM,    &opt_langid},
133     {"-win",         OptSpec::FLAG,   &opt_win},
134     {"-unix",        OptSpec::FLAG,   &opt_unix},
135     {"-mac",         OptSpec::FLAG,   &opt_mac},
136     {"-uselen",      OptSpec::FLAG,   &opt_uselen},
137     {"-loop",        OptSpec::NUM,    &opt_loopCount},
138     {"-time",        OptSpec::NUM,    &opt_time},
139     {"-passes",      OptSpec::NUM,    &opt_passesCount},
140     {"-char",        OptSpec::FLAG,   &opt_char},
141     {"-word",        OptSpec::FLAG,   &opt_word},
142     {"-line",        OptSpec::FLAG,   &opt_line},
143     {"-sentence",    OptSpec::FLAG,   &opt_sentence},
144     {"-terse",       OptSpec::FLAG,   &opt_terse},
145     {"-dump",        OptSpec::FLAG,   &opt_dump},
146     {"-capi",        OptSpec::FLAG,   &opt_capi},
147     {"-next",        OptSpec::FLAG,   &opt_next},
148     {"-isBound",     OptSpec::FLAG,   &opt_isBound},
149     {"-help",        OptSpec::FLAG,   &opt_help},
150     {"-?",           OptSpec::FLAG,   &opt_help},
151     {0, OptSpec::FLAG, 0}
152 };
153 
154 
155 //---------------------------------------------------------------------------
156 //
157 //  Global variables pointing to and describing the test file
158 //
159 //---------------------------------------------------------------------------
160 
161 //DWORD          gWinLCID;
162 BreakIterator *brkit = NULL;
163 UChar *text = NULL;
164 int32_t textSize = 0;
165 
166 
167 
168 #if U_PLATFORM_IS_DARWIN_BASED
169 #include <ApplicationServices/ApplicationServices.h>
170 enum{
171   kUCTextBreakAllMask = (kUCTextBreakClusterMask | kUCTextBreakWordMask | kUCTextBreakLineMask)
172     };
173 UCTextBreakType breakTypes[4] = {kUCTextBreakCharMask, kUCTextBreakClusterMask, kUCTextBreakWordMask, kUCTextBreakLineMask};
174 TextBreakLocatorRef breakRef;
175 UCTextBreakType macBreakType;
176 
createMACBrkIt()177 void createMACBrkIt() {
178   OSStatus status = noErr;
179   LocaleRef lref;
180   status = LocaleRefFromLocaleString(opt_locale, &lref);
181   status = UCCreateTextBreakLocator(lref, 0, kUCTextBreakAllMask, (TextBreakLocatorRef*)&breakRef);
182   if(opt_char == TRUE) {
183     macBreakType = kUCTextBreakClusterMask;
184   } else if(opt_word == TRUE) {
185     macBreakType = kUCTextBreakWordMask;
186   } else if(opt_line == TRUE) {
187     macBreakType = kUCTextBreakLineMask;
188   } else if(opt_sentence == TRUE) {
189     // error
190     // brkit = BreakIterator::createSentenceInstance(opt_locale, status);
191   } else {
192     // default is character iterator
193     macBreakType = kUCTextBreakClusterMask;
194       }
195 }
196 #endif
197 
createICUBrkIt()198 void createICUBrkIt() {
199   //
200   //  Set up an ICU break iterator
201   //
202   UErrorCode          status = U_ZERO_ERROR;
203   if(opt_char == TRUE) {
204     brkit = BreakIterator::createCharacterInstance(opt_locale, status);
205   } else if(opt_word == TRUE) {
206     brkit = BreakIterator::createWordInstance(opt_locale, status);
207   } else if(opt_line == TRUE) {
208     brkit = BreakIterator::createLineInstance(opt_locale, status);
209   } else if(opt_sentence == TRUE) {
210     brkit = BreakIterator::createSentenceInstance(opt_locale, status);
211   } else {
212     // default is character iterator
213     brkit = BreakIterator::createCharacterInstance(opt_locale, status);
214   }
215   if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) {
216     fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale);
217   }
218   if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) {
219     fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale);
220   }
221 
222 }
223 
224 //---------------------------------------------------------------------------
225 //
226 //  ProcessOptions()    Function to read the command line options.
227 //
228 //---------------------------------------------------------------------------
ProcessOptions(int argc,const char ** argv,OptSpec opts[])229 UBool ProcessOptions(int argc, const char **argv, OptSpec opts[])
230 {
231     int         i;
232     int         argNum;
233     const char  *pArgName;
234     OptSpec    *pOpt;
235 
236     for (argNum=1; argNum<argc; argNum++) {
237         pArgName = argv[argNum];
238         for (pOpt = opts;  pOpt->name != 0; pOpt++) {
239             if (strcmp(pOpt->name, pArgName) == 0) {
240                 switch (pOpt->type) {
241                 case OptSpec::FLAG:
242                     *(UBool *)(pOpt->pVar) = TRUE;
243                     break;
244                 case OptSpec::STRING:
245                     argNum ++;
246                     if (argNum >= argc) {
247                         fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
248                         return FALSE;
249                     }
250                     *(const char **)(pOpt->pVar)  = argv[argNum];
251                     break;
252                 case OptSpec::NUM:
253                     argNum ++;
254                     if (argNum >= argc) {
255                         fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
256                         return FALSE;
257                     }
258                     char *endp;
259                     i = strtol(argv[argNum], &endp, 0);
260                     if (endp == argv[argNum]) {
261                         fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name);
262                         return FALSE;
263                     }
264                     *(int *)(pOpt->pVar) = i;
265                 }
266                 break;
267             }
268         }
269         if (pOpt->name == 0)
270         {
271             fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName);
272             return FALSE;
273         }
274     }
275 return TRUE;
276 }
277 
278 
doForwardTest()279 void doForwardTest() {
280   if (opt_terse == FALSE) {
281     printf("Doing the forward test\n");
282   }
283   int32_t noBreaks = 0;
284   int32_t i = 0;
285   unsigned long startTime = timeGetTime();
286   unsigned long elapsedTime = 0;
287   if(opt_icu) {
288     createICUBrkIt();
289     brkit->setText(UnicodeString(text, textSize));
290     brkit->first();
291     if (opt_terse == FALSE) {
292       printf("Warmup\n");
293     }
294     int j;
295     while((j = brkit->next()) != BreakIterator::DONE) {
296       noBreaks++;
297       //fprintf(stderr, "%d ", j);
298     }
299 
300     if (opt_terse == FALSE) {
301       printf("Measure\n");
302     }
303     startTime = timeGetTime();
304     for(i = 0; i < opt_loopCount; i++) {
305       brkit->first();
306       while(brkit->next() != BreakIterator::DONE) {
307       }
308     }
309 
310     elapsedTime = timeGetTime()-startTime;
311   } else if(opt_mac) {
312 #if U_PLATFORM_IS_DARWIN_BASED
313     createMACBrkIt();
314     UniChar* filePtr = text;
315     OSStatus status = noErr;
316     UniCharCount startOffset = 0, breakOffset = 0, numUniChars = textSize;
317     startOffset = 0;
318     //printf("\t---Search forward--\n");
319 
320     while (startOffset < numUniChars)
321     {
322 	status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars,
323                                startOffset, &breakOffset);
324       //require_action(status == noErr, EXIT, printf( "**UCFindTextBreak failed: startOffset %d, status %d\n", (int)startOffset, (int)status));
325       //require_action((breakOffset <= numUniChars),EXIT, printf("**UCFindTextBreak breakOffset too big: startOffset %d, breakOffset %d\n", (int)startOffset, (int)breakOffset));
326 
327       // Output break
328       //printf("\t%d\n", (int)breakOffset);
329 
330       // Increment counters
331 	noBreaks++;
332       startOffset = breakOffset;
333     }
334     startTime = timeGetTime();
335     for(i = 0; i < opt_loopCount; i++) {
336       startOffset = 0;
337 
338       while (startOffset < numUniChars)
339 	{
340 	  status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars,
341 				   startOffset, &breakOffset);
342 	  // Increment counters
343 	  startOffset = breakOffset;
344 	}
345     }
346     elapsedTime = timeGetTime()-startTime;
347     UCDisposeTextBreakLocator(&breakRef);
348 #endif
349 
350 
351   }
352 
353 
354   if (opt_terse == FALSE) {
355   int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount));
356       int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize));
357       int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks));
358       printf("forward break iteration average loop time %d\n", loopTime);
359       printf("number of code units %d average time per code unit %d\n", textSize, timePerCU);
360       printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak);
361   } else {
362       printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize);
363   }
364 
365 
366 }
367 
doIsBoundTest()368 void doIsBoundTest() {
369   int32_t noBreaks = 0, hit = 0;
370   int32_t i = 0, j = 0;
371   unsigned long startTime = timeGetTime();
372   unsigned long elapsedTime = 0;
373   createICUBrkIt();
374   brkit->setText(UnicodeString(text, textSize));
375   brkit->first();
376   for(j = 0; j < textSize; j++) {
377     if(brkit->isBoundary(j)) {
378       noBreaks++;
379       //fprintf(stderr, "%d ", j);
380     }
381   }
382   /*
383   while(brkit->next() != BreakIterator::DONE) {
384     noBreaks++;
385   }
386   */
387 
388   startTime = timeGetTime();
389   for(i = 0; i < opt_loopCount; i++) {
390     for(j = 0; j < textSize; j++) {
391       if(brkit->isBoundary(j)) {
392         hit++;
393       }
394     }
395   }
396 
397   elapsedTime = timeGetTime()-startTime;
398   int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount));
399   if (opt_terse == FALSE) {
400       int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize));
401       int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks));
402       printf("forward break iteration average loop time %d\n", loopTime);
403       printf("number of code units %d average time per code unit %d\n", textSize, timePerCU);
404       printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak);
405   } else {
406       printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize);
407   }
408 }
409 
//----------------------------------------------------------------------------------------
//
//   UnixConvert   -- Placeholder for converting the text to the encoding used by the
//                    UNIX APIs.  The intended approach is UTF-8 locales.
//
//                    Currently does nothing.  The disabled implementation that used
//                    to sit here was written against per-line globals (gFileLines,
//                    gNumFileLines) that this file does not define.
//
//----------------------------------------------------------------------------------------
void  UnixConvert() {
}
460 
461 
462 //----------------------------------------------------------------------------------------
463 //
464 //  class UCharFile   Class to hide all the gorp to read a file in
465 //                    and produce a stream of UChars.
466 //
467 //----------------------------------------------------------------------------------------
468 class UCharFile {
469 public:
470     UCharFile(const char *fileName);
471     ~UCharFile();
472     UChar   get();
eof()473     UBool   eof() {return fEof;};
error()474     UBool   error() {return fError;};
size()475     int32_t size() { return fFileSize; };
476 
477 private:
UCharFile(const UCharFile & other)478     UCharFile (const UCharFile &other) {};                         // No copy constructor.
operator =(const UCharFile & other)479     UCharFile & operator = (const UCharFile &other) {return *this;};   // No assignment op
480 
481     FILE         *fFile;
482     const char   *fName;
483     UBool        fEof;
484     UBool        fError;
485     UChar        fPending2ndSurrogate;
486     int32_t      fFileSize;
487 
488     enum {UTF16LE, UTF16BE, UTF8} fEncoding;
489 };
490 
UCharFile(const char * fileName)491 UCharFile::UCharFile(const char * fileName) {
492     fEof                 = FALSE;
493     fError               = FALSE;
494     fName                = fileName;
495     struct stat buf;
496     int32_t result = stat(fileName, &buf);
497     if(result != 0) {
498       fprintf(stderr, "Error getting info\n");
499       fFileSize = -1;
500     } else {
501       fFileSize = buf.st_size;
502     }
503     fFile                = fopen(fName, "rb");
504     fPending2ndSurrogate = 0;
505     if (fFile == NULL) {
506         fprintf(stderr, "Can not open file \"%s\"\n", opt_fName);
507         fError = TRUE;
508         return;
509     }
510     //
511     //  Look for the byte order mark at the start of the file.
512     //
513     int BOMC1, BOMC2, BOMC3;
514     BOMC1 = fgetc(fFile);
515     BOMC2 = fgetc(fFile);
516 
517     if (BOMC1 == 0xff && BOMC2 == 0xfe) {
518         fEncoding = UTF16LE; }
519     else if (BOMC1 == 0xfe && BOMC2 == 0xff) {
520         fEncoding = UTF16BE; }
521     else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) {
522         fEncoding = UTF8; }
523     else
524     {
525         fprintf(stderr, "collperf:  file \"%s\" encoding must be UTF-8 or UTF-16, and "
526             "must include a BOM.\n", fileName);
527         fError = true;
528         return;
529     }
530 }
531 
532 
~UCharFile()533 UCharFile::~UCharFile() {
534     fclose(fFile);
535 }
536 
537 
538 
get()539 UChar UCharFile::get() {
540     UChar   c;
541     switch (fEncoding) {
542     case UTF16LE:
543         {
544             int  cL, cH;
545             cL = fgetc(fFile);
546             cH = fgetc(fFile);
547             c  = cL  | (cH << 8);
548             if (cH == EOF) {
549                 c   = 0;
550                 fEof = TRUE;
551             }
552             break;
553         }
554     case UTF16BE:
555         {
556             int  cL, cH;
557             cH = fgetc(fFile);
558             cL = fgetc(fFile);
559             c  = cL  | (cH << 8);
560             if (cL == EOF) {
561                 c   = 0;
562                 fEof = TRUE;
563             }
564             break;
565         }
566     case UTF8:
567         {
568             if (fPending2ndSurrogate != 0) {
569                 c = fPending2ndSurrogate;
570                 fPending2ndSurrogate = 0;
571                 break;
572             }
573 
574             int ch = fgetc(fFile);   // Note:  c and ch are separate cause eof test doesn't work on UChar type.
575             if (ch == EOF) {
576                 c = 0;
577                 fEof = TRUE;
578                 break;
579             }
580 
581             if (ch <= 0x7f) {
582                 // It's ascii.  No further utf-8 conversion.
583                 c = ch;
584                 break;
585             }
586 
587             // Figure out the lenght of the char and read the rest of the bytes
588             //   into a temp array.
589             int nBytes;
590             if (ch >= 0xF0) {nBytes=4;}
591             else if (ch >= 0xE0) {nBytes=3;}
592             else if (ch >= 0xC0) {nBytes=2;}
593             else {
594                 fprintf(stderr, "not likely utf-8 encoded file %s contains corrupt data at offset %d.\n", fName, ftell(fFile));
595                 fError = TRUE;
596                 return 0;
597             }
598 
599             unsigned char  bytes[10];
600             bytes[0] = (unsigned char)ch;
601             int i;
602             for (i=1; i<nBytes; i++) {
603                 bytes[i] = fgetc(fFile);
604                 if (bytes[i] < 0x80 || bytes[i] >= 0xc0) {
605                     fprintf(stderr, "utf-8 encoded file %s contains corrupt data at offset %d. Expected %d bytes, byte %d is invalid. First byte is %02X\n", fName, ftell(fFile), nBytes, i, ch);
606                     fError = TRUE;
607                     return 0;
608                 }
609             }
610 
611             // Convert the bytes from the temp array to a Unicode char.
612             i = 0;
613             uint32_t  cp;
614             U8_NEXT_UNSAFE(bytes, i, cp);
615             c = (UChar)cp;
616 
617             if (cp >= 0x10000) {
618                 // The code point needs to be broken up into a utf-16 surrogate pair.
619                 //  Process first half this time through the main loop, and
620                 //   remember the other half for the next time through.
621                 UChar utf16Buf[3];
622                 i = 0;
623                 UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp);
624                 fPending2ndSurrogate = utf16Buf[1];
625                 c = utf16Buf[0];
626             }
627             break;
628         };
629     }
630     return c;
631 }
632 
633 
634 //----------------------------------------------------------------------------------------
635 //
636 //    Main   --  process command line, read in and pre-process the test file,
637 //                 call other functions to do the actual tests.
638 //
639 //----------------------------------------------------------------------------------------
main(int argc,const char ** argv)640 int main(int argc, const char** argv) {
641     if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) {
642         printf(gUsageString);
643         exit (1);
644     }
645     // Make sure that we've only got one API selected.
646     if (opt_mac || opt_unix || opt_win) opt_icu = FALSE;
647     if (opt_mac || opt_unix) opt_win = FALSE;
648     if (opt_mac) opt_unix = FALSE;
649 
650     UErrorCode          status = U_ZERO_ERROR;
651 
652 
653 
654     //
655     //  Set up a Windows LCID
656     //
657   /*
658     if (opt_langid != 0) {
659         gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);
660     }
661     else {
662         gWinLCID = uloc_getLCID(opt_locale);
663     }
664   */
665 
666     //
667     //  Set the UNIX locale
668     //
669     if (opt_unix) {
670         if (setlocale(LC_ALL, opt_locale) == 0) {
671             fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale);
672             exit(-1);
673         }
674     }
675 
676     // Read in  the input file.
677     //   File assumed to be utf-16.
678     //   Lines go onto heap buffers.  Global index array to line starts is created.
679     //   Lines themselves are null terminated.
680     //
681 
682     UCharFile f(opt_fName);
683     if (f.error()) {
684         exit(-1);
685     }
686     int32_t fileSize = f.size();
687     const int STARTSIZE = 70000;
688     int32_t bufSize = 0;
689     int32_t charCount = 0;
690     if(fileSize != -1) {
691       text = (UChar *)malloc(fileSize*sizeof(UChar));
692       bufSize = fileSize;
693     } else {
694       text = (UChar *)malloc(STARTSIZE*sizeof(UChar));
695       bufSize = STARTSIZE;
696     }
697     if(text == NULL) {
698       fprintf(stderr, "Allocating buffer failed\n");
699       exit(-1);
700     }
701 
702 
703     //  Read the file, split into lines, and save in memory.
704     //  Loop runs once per utf-16 value from the input file,
705     //    (The number of bytes read from file per loop iteration depends on external encoding.)
706     for (;;) {
707 
708         UChar c = f.get();
709         if(f.eof()) {
710           break;
711         }
712         if (f.error()){
713           exit(-1);
714         }
715         // We now have a good UTF-16 value in c.
716         text[charCount++] = c;
717         if(charCount == bufSize) {
718           text = (UChar *)realloc(text, 2*bufSize*sizeof(UChar));
719           if(text == NULL) {
720             fprintf(stderr, "Reallocating buffer failed\n");
721             exit(-1);
722           }
723           bufSize *= 2;
724         }
725     }
726 
727 
728     if (opt_terse == FALSE) {
729         printf("file \"%s\", %d charCount code units.\n", opt_fName, charCount);
730     }
731 
732     textSize = charCount;
733 
734 
735 
736 
737     //
738     //  Dump file contents if requested.
739     //
740     if (opt_dump) {
741       // dump file, etc... possibly
742     }
743 
744 
745     //
746     //  We've got the file read into memory.  Go do something with it.
747     //
748     int32_t i = 0;
749     for(i = 0; i < opt_passesCount; i++) {
750       if(opt_loopCount != 0) {
751         if(opt_next) {
752           doForwardTest();
753         } else if(opt_isBound) {
754           doIsBoundTest();
755         } else {
756           doForwardTest();
757         }
758       } else if(opt_time != 0) {
759 
760       }
761     }
762 
763   if(text != NULL) {
764     free(text);
765   }
766     if(brkit != NULL) {
767       delete brkit;
768     }
769 
770     return 0;
771 }
772