• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /***********************************************************************
2  * © 2016 and later: Unicode, Inc. and others.
3  * License & terms of use: http://www.unicode.org/copyright.html
4  *
5  ***********************************************************************
6  ***********************************************************************
7  * COPYRIGHT:
8  * Copyright (C) 2001-2012 IBM, Inc.   All Rights Reserved.
9  *
10  ***********************************************************************/
11 /********************************************************************************
12 *
13 * File ubrkperf.cpp
14 *
15 * Modification History:
16 *        Name                     Description
17 *     Vladimir Weinstein          First Version, based on collperf
18 *
19 *********************************************************************************
20 */
21 
22 //
23 //  This program tests break iterator performance
24 //      Currently we test only ICU APIs with the future possibility of testing *nix & win32 APIs
25 //      (if any)
26 //      A text file is required as input.  It must be in utf-8 or utf-16 format,
27 //      and include a byte order mark.  Either LE or BE format is OK.
28 //
29 
// Help text printed by main() when -help / -? is given, when the command line
// cannot be parsed, or when no -file option is supplied.
// Only options that the opts[] table actually accepts are listed here, so that
// following the help text never produces an "Unrecognized option" error.
const char gUsageString[] =
    "usage:  ubrkperf options...\n"
    "-help                      Display this message.\n"
    "-file file_name            utf-16/utf-8 format file.\n"
    "-locale name               ICU locale to use.  Default is en_US\n"
    "-langid 0x1234             Windows Language ID number.  Default to value for -locale option\n"
    "                              see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
    "-win                       Run test using Windows native services. (currently not working) (ICU is default)\n"
    "-unix                      Run test using Unix word breaking services. (currently not working) \n"
    "-mac                       Run test using MacOSX word breaking services.\n"
    "-uselen                    Use API with string lengths.  Default is null-terminated strings\n"
    "-char                      Use character break iterator\n"
    "-word                      Use word break iterator\n"
    "-line                      Use line break iterator\n"
    "-sentence                  Use sentence break iterator\n"
    "-loop nnnn                 Loopcount for test.  Adjust for reasonable total running time.\n"
    "-passes n                  Number of times the whole test is repeated.  Default = 1.\n"
    "-terse                     Terse numbers-only output.  Intended for use by scripts.\n"
    "-dump                      Display stuff.\n"
    "-capi                      Use C APIs instead of C++ APIs (currently not working)\n"
    "-next                      Do the next test\n"
    "-isBound                   Do the isBound test\n"
    ;
54 
55 
56 #include <stdio.h>
57 #include <string.h>
58 #include <stdlib.h>
59 #include <math.h>
60 #include <locale.h>
61 #include <errno.h>
62 #include <sys/stat.h>
63 
64 #include <unicode/utypes.h>
65 #include <unicode/ucol.h>
66 #include <unicode/ucoleitr.h>
67 #include <unicode/uloc.h>
68 #include <unicode/ustring.h>
69 #include <unicode/ures.h>
70 #include <unicode/uchar.h>
71 #include <unicode/ucnv.h>
72 #include <unicode/utf8.h>
73 
74 #include <unicode/brkiter.h>
75 
76 
77 #if U_PLATFORM_HAS_WIN32_API
78 #include <windows.h>
79 #else
80 //
81 //  Stubs for Windows API functions when building on UNIXes.
82 //
83 #include <sys/time.h>
// Stand-in for the Win32 timeGetTime() on non-Windows builds.
// Returns the current wall-clock time from gettimeofday() in milliseconds.
// The value is allowed to wrap around; callers only subtract two readings.
unsigned long timeGetTime() {
    struct timeval now;
    gettimeofday(&now, NULL);
    // Convert to unsigned before scaling: unsigned arithmetic wraps in a
    // well-defined way, whereas overflowing a signed time_t is undefined.
    unsigned long millis = (unsigned long)now.tv_sec * 1000UL;
    millis += (unsigned long)(now.tv_usec / 1000);
    return millis;
}
91 #define MAKELCID(a,b) 0
92 #endif
93 
94 
95 //
96 //  Command line option variables
97 //     These global variables are set according to the options specified
98 //     on the command line by the user.
99 char * opt_fName      = 0;
100 char * opt_locale     = "en_US";
101 int    opt_langid     = 0;         // Defaults to value corresponding to opt_locale.
102 char * opt_rules      = 0;
103 UBool  opt_help       = false;
104 int    opt_time       = 0;
105 int    opt_loopCount  = 0;
106 int    opt_passesCount= 1;
107 UBool  opt_terse      = false;
108 UBool  opt_icu        = true;
109 UBool  opt_win        = false;      // Run with Windows native functions.
110 UBool  opt_unix       = false;      // Run with UNIX strcoll, strxfrm functions.
111 UBool  opt_mac        = false;      // Run with MacOSX word break services.
112 UBool  opt_uselen     = false;
113 UBool  opt_dump       = false;
114 UBool  opt_char       = false;
115 UBool  opt_word       = false;
116 UBool  opt_line       = false;
117 UBool  opt_sentence   = false;
118 UBool  opt_capi       = false;
119 
120 UBool  opt_next       = false;
121 UBool  opt_isBound    = false;
122 
123 
124 
125 //
126 //   Definitions for the command line options
127 //
// One entry of the command line option table.
struct OptSpec {
    const char *name;                 // Option text as typed, e.g. "-file"; 0 ends the table.
    enum {FLAG, NUM, STRING} type;    // FLAG takes no value; NUM and STRING consume the next argument.
    void *pVar;                       // Variable to set; ProcessOptions() writes it as UBool, int or const char *.
};
133 
// Table of recognized command line options, scanned linearly by
// ProcessOptions().  Each row binds an option name to the global variable it
// sets; the row with a null name terminates the table.
OptSpec opts[] = {
    {"-file",        OptSpec::STRING, &opt_fName},
    {"-locale",      OptSpec::STRING, &opt_locale},
    {"-langid",      OptSpec::NUM,    &opt_langid},
    {"-win",         OptSpec::FLAG,   &opt_win},
    {"-unix",        OptSpec::FLAG,   &opt_unix},
    {"-mac",         OptSpec::FLAG,   &opt_mac},
    {"-uselen",      OptSpec::FLAG,   &opt_uselen},
    {"-loop",        OptSpec::NUM,    &opt_loopCount},
    {"-time",        OptSpec::NUM,    &opt_time},
    {"-passes",      OptSpec::NUM,    &opt_passesCount},
    {"-char",        OptSpec::FLAG,   &opt_char},
    {"-word",        OptSpec::FLAG,   &opt_word},
    {"-line",        OptSpec::FLAG,   &opt_line},
    {"-sentence",    OptSpec::FLAG,   &opt_sentence},
    {"-terse",       OptSpec::FLAG,   &opt_terse},
    {"-dump",        OptSpec::FLAG,   &opt_dump},
    {"-capi",        OptSpec::FLAG,   &opt_capi},
    {"-next",        OptSpec::FLAG,   &opt_next},
    {"-isBound",     OptSpec::FLAG,   &opt_isBound},
    {"-help",        OptSpec::FLAG,   &opt_help},
    {"-?",           OptSpec::FLAG,   &opt_help},   // alias for -help
    {0, OptSpec::FLAG, 0}                           // end-of-table sentinel
};
158 
159 
160 //---------------------------------------------------------------------------
161 //
162 //  Global variables pointing to and describing the test file
163 //
164 //---------------------------------------------------------------------------
165 
166 //DWORD          gWinLCID;
BreakIterator *brkit = NULL;   // Iterator under test; created by createICUBrkIt(), deleted at the end of main().
UChar *text = NULL;            // Input file contents as UTF-16 code units; heap buffer filled and freed by main().
int32_t textSize = 0;          // Number of valid code units in text.
170 
171 
172 
173 #if U_PLATFORM_IS_DARWIN_BASED
174 #include <ApplicationServices/ApplicationServices.h>
// Union of the cluster, word and line break masks; passed to
// UCCreateTextBreakLocator() in createMACBrkIt().
enum{
  kUCTextBreakAllMask = (kUCTextBreakClusterMask | kUCTextBreakWordMask | kUCTextBreakLineMask)
    };
// Table of the individual break masks; not referenced elsewhere in this file.
UCTextBreakType breakTypes[4] = {kUCTextBreakCharMask, kUCTextBreakClusterMask, kUCTextBreakWordMask, kUCTextBreakLineMask};
// Break locator created by createMACBrkIt() and disposed of by doForwardTest().
TextBreakLocatorRef breakRef;
// Break type selected from the -char/-word/-line options by createMACBrkIt().
UCTextBreakType macBreakType;
181 
createMACBrkIt()182 void createMACBrkIt() {
183   OSStatus status = noErr;
184   LocaleRef lref;
185   status = LocaleRefFromLocaleString(opt_locale, &lref);
186   status = UCCreateTextBreakLocator(lref, 0, kUCTextBreakAllMask, (TextBreakLocatorRef*)&breakRef);
187   if(opt_char == true) {
188     macBreakType = kUCTextBreakClusterMask;
189   } else if(opt_word == true) {
190     macBreakType = kUCTextBreakWordMask;
191   } else if(opt_line == true) {
192     macBreakType = kUCTextBreakLineMask;
193   } else if(opt_sentence == true) {
194     // error
195     // brkit = BreakIterator::createSentenceInstance(opt_locale, status);
196   } else {
197     // default is character iterator
198     macBreakType = kUCTextBreakClusterMask;
199       }
200 }
201 #endif
202 
createICUBrkIt()203 void createICUBrkIt() {
204   //
205   //  Set up an ICU break iterator
206   //
207   UErrorCode          status = U_ZERO_ERROR;
208   if(opt_char == true) {
209     brkit = BreakIterator::createCharacterInstance(opt_locale, status);
210   } else if(opt_word == true) {
211     brkit = BreakIterator::createWordInstance(opt_locale, status);
212   } else if(opt_line == true) {
213     brkit = BreakIterator::createLineInstance(opt_locale, status);
214   } else if(opt_sentence == true) {
215     brkit = BreakIterator::createSentenceInstance(opt_locale, status);
216   } else {
217     // default is character iterator
218     brkit = BreakIterator::createCharacterInstance(opt_locale, status);
219   }
220   if (status==U_USING_DEFAULT_WARNING && opt_terse==false) {
221     fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale);
222   }
223   if (status==U_USING_FALLBACK_WARNING && opt_terse==false) {
224     fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale);
225   }
226 
227 }
228 
229 //---------------------------------------------------------------------------
230 //
231 //  ProcessOptions()    Function to read the command line options.
232 //
233 //---------------------------------------------------------------------------
ProcessOptions(int argc,const char ** argv,OptSpec opts[])234 UBool ProcessOptions(int argc, const char **argv, OptSpec opts[])
235 {
236     int         i;
237     int         argNum;
238     const char  *pArgName;
239     OptSpec    *pOpt;
240 
241     for (argNum=1; argNum<argc; argNum++) {
242         pArgName = argv[argNum];
243         for (pOpt = opts;  pOpt->name != 0; pOpt++) {
244             if (strcmp(pOpt->name, pArgName) == 0) {
245                 switch (pOpt->type) {
246                 case OptSpec::FLAG:
247                     *(UBool *)(pOpt->pVar) = true;
248                     break;
249                 case OptSpec::STRING:
250                     argNum ++;
251                     if (argNum >= argc) {
252                         fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
253                         return false;
254                     }
255                     *(const char **)(pOpt->pVar)  = argv[argNum];
256                     break;
257                 case OptSpec::NUM:
258                     argNum ++;
259                     if (argNum >= argc) {
260                         fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
261                         return false;
262                     }
263                     char *endp;
264                     i = strtol(argv[argNum], &endp, 0);
265                     if (endp == argv[argNum]) {
266                         fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name);
267                         return false;
268                     }
269                     *(int *)(pOpt->pVar) = i;
270                 }
271                 break;
272             }
273         }
274         if (pOpt->name == 0)
275         {
276             fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName);
277             return false;
278         }
279     }
280 return true;
281 }
282 
283 
doForwardTest()284 void doForwardTest() {
285   if (opt_terse == false) {
286     printf("Doing the forward test\n");
287   }
288   int32_t noBreaks = 0;
289   int32_t i = 0;
290   unsigned long startTime = timeGetTime();
291   unsigned long elapsedTime = 0;
292   if(opt_icu) {
293     createICUBrkIt();
294     brkit->setText(UnicodeString(text, textSize));
295     brkit->first();
296     if (opt_terse == false) {
297       printf("Warmup\n");
298     }
299     int j;
300     while((j = brkit->next()) != BreakIterator::DONE) {
301       noBreaks++;
302       //fprintf(stderr, "%d ", j);
303     }
304 
305     if (opt_terse == false) {
306       printf("Measure\n");
307     }
308     startTime = timeGetTime();
309     for(i = 0; i < opt_loopCount; i++) {
310       brkit->first();
311       while(brkit->next() != BreakIterator::DONE) {
312       }
313     }
314 
315     elapsedTime = timeGetTime()-startTime;
316   } else if(opt_mac) {
317 #if U_PLATFORM_IS_DARWIN_BASED
318     createMACBrkIt();
319     UniChar* filePtr = text;
320     OSStatus status = noErr;
321     UniCharCount startOffset = 0, breakOffset = 0, numUniChars = textSize;
322     startOffset = 0;
323     //printf("\t---Search forward--\n");
324 
325     while (startOffset < numUniChars)
326     {
327 	status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars,
328                                startOffset, &breakOffset);
329       //require_action(status == noErr, EXIT, printf( "**UCFindTextBreak failed: startOffset %d, status %d\n", (int)startOffset, (int)status));
330       //require_action((breakOffset <= numUniChars),EXIT, printf("**UCFindTextBreak breakOffset too big: startOffset %d, breakOffset %d\n", (int)startOffset, (int)breakOffset));
331 
332       // Output break
333       //printf("\t%d\n", (int)breakOffset);
334 
335       // Increment counters
336 	noBreaks++;
337       startOffset = breakOffset;
338     }
339     startTime = timeGetTime();
340     for(i = 0; i < opt_loopCount; i++) {
341       startOffset = 0;
342 
343       while (startOffset < numUniChars)
344 	{
345 	  status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars,
346 				   startOffset, &breakOffset);
347 	  // Increment counters
348 	  startOffset = breakOffset;
349 	}
350     }
351     elapsedTime = timeGetTime()-startTime;
352     UCDisposeTextBreakLocator(&breakRef);
353 #endif
354 
355 
356   }
357 
358 
359   if (opt_terse == false) {
360   int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount));
361       int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize));
362       int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks));
363       printf("forward break iteration average loop time %d\n", loopTime);
364       printf("number of code units %d average time per code unit %d\n", textSize, timePerCU);
365       printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak);
366   } else {
367       printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize);
368   }
369 
370 
371 }
372 
doIsBoundTest()373 void doIsBoundTest() {
374   int32_t noBreaks = 0, hit = 0;
375   int32_t i = 0, j = 0;
376   unsigned long startTime = timeGetTime();
377   unsigned long elapsedTime = 0;
378   createICUBrkIt();
379   brkit->setText(UnicodeString(text, textSize));
380   brkit->first();
381   for(j = 0; j < textSize; j++) {
382     if(brkit->isBoundary(j)) {
383       noBreaks++;
384       //fprintf(stderr, "%d ", j);
385     }
386   }
387   /*
388   while(brkit->next() != BreakIterator::DONE) {
389     noBreaks++;
390   }
391   */
392 
393   startTime = timeGetTime();
394   for(i = 0; i < opt_loopCount; i++) {
395     for(j = 0; j < textSize; j++) {
396       if(brkit->isBoundary(j)) {
397         hit++;
398       }
399     }
400   }
401 
402   elapsedTime = timeGetTime()-startTime;
403   int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount));
404   if (opt_terse == false) {
405       int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize));
406       int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks));
407       printf("forward break iteration average loop time %d\n", loopTime);
408       printf("number of code units %d average time per code unit %d\n", textSize, timePerCU);
409       printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak);
410   } else {
411       printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize);
412   }
413 }
414 
415 //----------------------------------------------------------------------------------------
416 //
417 //   UnixConvert   -- Convert the lines of the file to the encoding for UNIX
418 //                    Since it appears that Unicode support is going in the general
419 //                    direction of the use of UTF-8 locales, that is the approach
420 //                    that is used here.
421 //
422 //----------------------------------------------------------------------------------------
void  UnixConvert() {
    // Intentionally empty.  The body that used to sit here under "#if 0"
    // was written for a line-oriented test program and referred to
    // per-line globals that do not exist in this file, so it could not have
    // been re-enabled as written.  The function is kept so that existing
    // references to it still link.
}
465 
466 
467 //----------------------------------------------------------------------------------------
468 //
469 //  class UCharFile   Class to hide all the gorp to read a file in
470 //                    and produce a stream of UChars.
471 //
472 //----------------------------------------------------------------------------------------
473 class UCharFile {
474 public:
475     UCharFile(const char *fileName);
476     ~UCharFile();
477     UChar   get();
eof()478     UBool   eof() {return fEof;};
error()479     UBool   error() {return fError;};
size()480     int32_t size() { return fFileSize; };
481 
482 private:
UCharFile(const UCharFile & other)483     UCharFile (const UCharFile &other) {};                         // No copy constructor.
operator =(const UCharFile & other)484     UCharFile & operator = (const UCharFile &other) {return *this;};   // No assignment op
485 
486     FILE         *fFile;
487     const char   *fName;
488     UBool        fEof;
489     UBool        fError;
490     UChar        fPending2ndSurrogate;
491     int32_t      fFileSize;
492 
493     enum {UTF16LE, UTF16BE, UTF8} fEncoding;
494 };
495 
UCharFile(const char * fileName)496 UCharFile::UCharFile(const char * fileName) {
497     fEof                 = false;
498     fError               = false;
499     fName                = fileName;
500     struct stat buf;
501     int32_t result = stat(fileName, &buf);
502     if(result != 0) {
503       fprintf(stderr, "Error getting info\n");
504       fFileSize = -1;
505     } else {
506       fFileSize = buf.st_size;
507     }
508     fFile                = fopen(fName, "rb");
509     fPending2ndSurrogate = 0;
510     if (fFile == NULL) {
511         fprintf(stderr, "Can not open file \"%s\"\n", opt_fName);
512         fError = true;
513         return;
514     }
515     //
516     //  Look for the byte order mark at the start of the file.
517     //
518     int BOMC1, BOMC2, BOMC3;
519     BOMC1 = fgetc(fFile);
520     BOMC2 = fgetc(fFile);
521 
522     if (BOMC1 == 0xff && BOMC2 == 0xfe) {
523         fEncoding = UTF16LE; }
524     else if (BOMC1 == 0xfe && BOMC2 == 0xff) {
525         fEncoding = UTF16BE; }
526     else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) {
527         fEncoding = UTF8; }
528     else
529     {
530         fprintf(stderr, "collperf:  file \"%s\" encoding must be UTF-8 or UTF-16, and "
531             "must include a BOM.\n", fileName);
532         fError = true;
533         return;
534     }
535 }
536 
537 
~UCharFile()538 UCharFile::~UCharFile() {
539     fclose(fFile);
540 }
541 
542 
543 
get()544 UChar UCharFile::get() {
545     UChar   c;
546     switch (fEncoding) {
547     case UTF16LE:
548         {
549             int  cL, cH;
550             cL = fgetc(fFile);
551             cH = fgetc(fFile);
552             c  = cL  | (cH << 8);
553             if (cH == EOF) {
554                 c   = 0;
555                 fEof = true;
556             }
557             break;
558         }
559     case UTF16BE:
560         {
561             int  cL, cH;
562             cH = fgetc(fFile);
563             cL = fgetc(fFile);
564             c  = cL  | (cH << 8);
565             if (cL == EOF) {
566                 c   = 0;
567                 fEof = true;
568             }
569             break;
570         }
571     case UTF8:
572         {
573             if (fPending2ndSurrogate != 0) {
574                 c = fPending2ndSurrogate;
575                 fPending2ndSurrogate = 0;
576                 break;
577             }
578 
579             int ch = fgetc(fFile);   // Note:  c and ch are separate cause eof test doesn't work on UChar type.
580             if (ch == EOF) {
581                 c = 0;
582                 fEof = true;
583                 break;
584             }
585 
586             if (ch <= 0x7f) {
587                 // It's ascii.  No further utf-8 conversion.
588                 c = ch;
589                 break;
590             }
591 
592             // Figure out the length of the char and read the rest of the bytes
593             //   into a temp array.
594             int nBytes;
595             if (ch >= 0xF0) {nBytes=4;}
596             else if (ch >= 0xE0) {nBytes=3;}
597             else if (ch >= 0xC0) {nBytes=2;}
598             else {
599                 fprintf(stderr, "not likely utf-8 encoded file %s contains corrupt data at offset %d.\n", fName, ftell(fFile));
600                 fError = true;
601                 return 0;
602             }
603 
604             unsigned char  bytes[10];
605             bytes[0] = (unsigned char)ch;
606             int i;
607             for (i=1; i<nBytes; i++) {
608                 bytes[i] = fgetc(fFile);
609                 if (bytes[i] < 0x80 || bytes[i] >= 0xc0) {
610                     fprintf(stderr, "utf-8 encoded file %s contains corrupt data at offset %d. Expected %d bytes, byte %d is invalid. First byte is %02X\n", fName, ftell(fFile), nBytes, i, ch);
611                     fError = true;
612                     return 0;
613                 }
614             }
615 
616             // Convert the bytes from the temp array to a Unicode char.
617             i = 0;
618             uint32_t  cp;
619             U8_NEXT_UNSAFE(bytes, i, cp);
620             c = (UChar)cp;
621 
622             if (cp >= 0x10000) {
623                 // The code point needs to be broken up into a utf-16 surrogate pair.
624                 //  Process first half this time through the main loop, and
625                 //   remember the other half for the next time through.
626                 UChar utf16Buf[3];
627                 i = 0;
628                 UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp);
629                 fPending2ndSurrogate = utf16Buf[1];
630                 c = utf16Buf[0];
631             }
632             break;
633         };
634     }
635     return c;
636 }
637 
638 
639 //----------------------------------------------------------------------------------------
640 //
641 //    Main   --  process command line, read in and pre-process the test file,
642 //                 call other functions to do the actual tests.
643 //
644 //----------------------------------------------------------------------------------------
main(int argc,const char ** argv)645 int main(int argc, const char** argv) {
646     if (ProcessOptions(argc, argv, opts) != true || opt_help || opt_fName == 0) {
647         printf(gUsageString);
648         exit (1);
649     }
650     // Make sure that we've only got one API selected.
651     if (opt_mac || opt_unix || opt_win) opt_icu = false;
652     if (opt_mac || opt_unix) opt_win = false;
653     if (opt_mac) opt_unix = false;
654 
655     UErrorCode          status = U_ZERO_ERROR;
656 
657 
658 
659     //
660     //  Set up a Windows LCID
661     //
662   /*
663     if (opt_langid != 0) {
664         gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);
665     }
666     else {
667         gWinLCID = uloc_getLCID(opt_locale);
668     }
669   */
670 
671     //
672     //  Set the UNIX locale
673     //
674     if (opt_unix) {
675         if (setlocale(LC_ALL, opt_locale) == 0) {
676             fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale);
677             exit(-1);
678         }
679     }
680 
681     // Read in  the input file.
682     //   File assumed to be utf-16.
683     //   Lines go onto heap buffers.  Global index array to line starts is created.
684     //   Lines themselves are null terminated.
685     //
686 
687     UCharFile f(opt_fName);
688     if (f.error()) {
689         exit(-1);
690     }
691     int32_t fileSize = f.size();
692     const int STARTSIZE = 70000;
693     int32_t bufSize = 0;
694     int32_t charCount = 0;
695     if(fileSize != -1) {
696       text = (UChar *)malloc(fileSize*sizeof(UChar));
697       bufSize = fileSize;
698     } else {
699       text = (UChar *)malloc(STARTSIZE*sizeof(UChar));
700       bufSize = STARTSIZE;
701     }
702     if(text == NULL) {
703       fprintf(stderr, "Allocating buffer failed\n");
704       exit(-1);
705     }
706 
707 
708     //  Read the file, split into lines, and save in memory.
709     //  Loop runs once per utf-16 value from the input file,
710     //    (The number of bytes read from file per loop iteration depends on external encoding.)
711     for (;;) {
712 
713         UChar c = f.get();
714         if(f.eof()) {
715           break;
716         }
717         if (f.error()){
718           exit(-1);
719         }
720         // We now have a good UTF-16 value in c.
721         text[charCount++] = c;
722         if(charCount == bufSize) {
723           text = (UChar *)realloc(text, 2*bufSize*sizeof(UChar));
724           if(text == NULL) {
725             fprintf(stderr, "Reallocating buffer failed\n");
726             exit(-1);
727           }
728           bufSize *= 2;
729         }
730     }
731 
732 
733     if (opt_terse == false) {
734         printf("file \"%s\", %d charCount code units.\n", opt_fName, charCount);
735     }
736 
737     textSize = charCount;
738 
739 
740 
741 
742     //
743     //  Dump file contents if requested.
744     //
745     if (opt_dump) {
746       // dump file, etc... possibly
747     }
748 
749 
750     //
751     //  We've got the file read into memory.  Go do something with it.
752     //
753     int32_t i = 0;
754     for(i = 0; i < opt_passesCount; i++) {
755       if(opt_loopCount != 0) {
756         if(opt_next) {
757           doForwardTest();
758         } else if(opt_isBound) {
759           doIsBoundTest();
760         } else {
761           doForwardTest();
762         }
763       } else if(opt_time != 0) {
764 
765       }
766     }
767 
768   if(text != NULL) {
769     free(text);
770   }
771     if(brkit != NULL) {
772       delete brkit;
773     }
774 
775     return 0;
776 }
777