1 /********************************************************************
2 * COPYRIGHT:
3 * Copyright (C) 2001-2005 IBM, Inc. All Rights Reserved.
4 *
5 ********************************************************************/
6 /********************************************************************************
7 *
8 * File ubrkperf.cpp
9 *
10 * Modification History:
11 * Name Description
12 * Vladimir Weinstein First Version, based on collperf
13 *
14 *********************************************************************************
15 */
16
17 //
18 // This program tests break iterator performance
19 // Currently we test only ICU APIs with the future possibility of testing *nix & win32 APIs
20 // (if any)
21 // A text file is required as input. It must be in utf-8 or utf-16 format,
22 // and include a byte order mark. Either LE or BE format is OK.
23 //
24
//
//  Usage text printed by main() when option parsing fails, when -help / -? is
//  given, or when no -file was supplied.
//
//  The list is kept in step with the opts[] table: -passes and -time are
//  accepted by the parser and are therefore listed here, while the former
//  -iloop entry was removed because no such option is accepted.
//  The text must not contain '%' characters.
//
const char gUsageString[] =
    "usage: ubrkperf options...\n"
    "-help Display this message.\n"
    "-file file_name utf-16/utf-8 format file.\n"
    "-locale name ICU locale to use. Default is en_US\n"
    "-langid 0x1234 Windows Language ID number. Default to value for -locale option\n"
    " see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
    "-win Run test using Windows native services. (currently not working) (ICU is default)\n"
    "-unix Run test using Unix word breaking services. (currently not working) \n"
    "-mac Run test using MacOSX word breaking services.\n"
    "-uselen Use API with string lengths. Default is null-terminated strings\n"
    "-char Use character break iterator\n"
    "-word Use word break iterator\n"
    "-line Use line break iterator\n"
    "-sentence Use sentence break iterator\n"
    "-loop nnnn Loopcount for test. Adjust for reasonable total running time.\n"
    "-passes n Number of times the selected test is repeated. Default = 1.\n"
    "-time n Accepted, but timed runs are not implemented yet.\n"
    "-terse Terse numbers-only output. Intended for use by scripts.\n"
    "-dump Display stuff.\n"
    "-capi Use C APIs instead of C++ APIs (currently not working)\n"
    "-next Do the next test\n"
    "-isBound Do the isBound test\n"
    ;
49
50
51 #include <stdio.h>
52 #include <string.h>
53 #include <stdlib.h>
54 #include <math.h>
55 #include <locale.h>
56 #include <errno.h>
57 #include <sys/stat.h>
58
59 #include <unicode/utypes.h>
60 #include <unicode/ucol.h>
61 #include <unicode/ucoleitr.h>
62 #include <unicode/uloc.h>
63 #include <unicode/ustring.h>
64 #include <unicode/ures.h>
65 #include <unicode/uchar.h>
66 #include <unicode/ucnv.h>
67 #include <unicode/utf8.h>
68
69 #include <unicode/brkiter.h>
70
71
72 #ifdef U_WINDOWS
73 #include <windows.h>
74 #else
75 //
76 // Stubs for Windows API functions when building on UNIXes.
77 //
78 #include <sys/time.h>
//
//  timeGetTime()   Replacement for the Windows function of the same name,
//                  used when not building for Windows.
//
//  Returns the current gettimeofday() time expressed in milliseconds.
//  The value is allowed to wrap around; callers only use differences
//  between two readings.
//
unsigned long timeGetTime() {
    struct timeval now;
    gettimeofday(&now, 0);

    // Convert to unsigned before scaling: unsigned arithmetic wraps in a
    // well-defined way, whereas the signed multiply could overflow where
    // long is 32 bits wide.
    unsigned long millis = (unsigned long)now.tv_sec * 1000UL;
    millis += (unsigned long)now.tv_usec / 1000UL;
    return millis;
}
86 #define MAKELCID(a,b) 0
87 #endif
88
89
90 //
91 // Command line option variables
92 // These global variables are set according to the options specified
93 // on the command line by the user.
94 char * opt_fName = 0;
95 char * opt_locale = "en_US";
96 int opt_langid = 0; // Defaults to value corresponding to opt_locale.
97 char * opt_rules = 0;
98 UBool opt_help = FALSE;
99 int opt_time = 0;
100 int opt_loopCount = 0;
101 int opt_passesCount= 1;
102 UBool opt_terse = FALSE;
103 UBool opt_icu = TRUE;
104 UBool opt_win = FALSE; // Run with Windows native functions.
105 UBool opt_unix = FALSE; // Run with UNIX strcoll, strxfrm functions.
106 UBool opt_mac = FALSE; // Run with MacOSX word break services.
107 UBool opt_uselen = FALSE;
108 UBool opt_dump = FALSE;
109 UBool opt_char = FALSE;
110 UBool opt_word = FALSE;
111 UBool opt_line = FALSE;
112 UBool opt_sentence = FALSE;
113 UBool opt_capi = FALSE;
114
115 UBool opt_next = FALSE;
116 UBool opt_isBound = FALSE;
117
118
119
120 //
121 // Definitions for the command line options
122 //
// One entry of the command line option table scanned by ProcessOptions().
struct OptSpec {
    const char *name;                 // Option text compared against argv[], e.g. "-file".
    enum {FLAG, NUM, STRING} type;    // FLAG sets a UBool; NUM and STRING consume the next argument.
    void *pVar;                       // Address of the variable that receives the value
                                      // (UBool, int or const char * according to type).
};
128
129 OptSpec opts[] = {
130 {"-file", OptSpec::STRING, &opt_fName},
131 {"-locale", OptSpec::STRING, &opt_locale},
132 {"-langid", OptSpec::NUM, &opt_langid},
133 {"-win", OptSpec::FLAG, &opt_win},
134 {"-unix", OptSpec::FLAG, &opt_unix},
135 {"-mac", OptSpec::FLAG, &opt_mac},
136 {"-uselen", OptSpec::FLAG, &opt_uselen},
137 {"-loop", OptSpec::NUM, &opt_loopCount},
138 {"-time", OptSpec::NUM, &opt_time},
139 {"-passes", OptSpec::NUM, &opt_passesCount},
140 {"-char", OptSpec::FLAG, &opt_char},
141 {"-word", OptSpec::FLAG, &opt_word},
142 {"-line", OptSpec::FLAG, &opt_line},
143 {"-sentence", OptSpec::FLAG, &opt_sentence},
144 {"-terse", OptSpec::FLAG, &opt_terse},
145 {"-dump", OptSpec::FLAG, &opt_dump},
146 {"-capi", OptSpec::FLAG, &opt_capi},
147 {"-next", OptSpec::FLAG, &opt_next},
148 {"-isBound", OptSpec::FLAG, &opt_isBound},
149 {"-help", OptSpec::FLAG, &opt_help},
150 {"-?", OptSpec::FLAG, &opt_help},
151 {0, OptSpec::FLAG, 0}
152 };
153
154
155 //---------------------------------------------------------------------------
156 //
157 // Global variables pointing to and describing the test file
158 //
159 //---------------------------------------------------------------------------
160
161 //DWORD gWinLCID;
BreakIterator *brkit = NULL;   // Iterator under test: set by createICUBrkIt(), deleted at the end of main().
UChar *text = NULL;            // UTF-16 text read from the input file; malloc'ed and freed in main().
int32_t textSize = 0;          // Number of UTF-16 code units stored in text.
165
166
167
168 #ifdef U_DARWIN
169 #include <ApplicationServices/ApplicationServices.h>
// Combination of the cluster, word and line masks; createMACBrkIt() passes it
// to UCCreateTextBreakLocator().
enum{
    kUCTextBreakAllMask = (kUCTextBreakClusterMask | kUCTextBreakWordMask | kUCTextBreakLineMask)
};
// Table of the individual break type masks (not referenced by the code visible in this file).
UCTextBreakType breakTypes[4] = {kUCTextBreakCharMask, kUCTextBreakClusterMask, kUCTextBreakWordMask, kUCTextBreakLineMask};
// Locator created by createMACBrkIt() and disposed of in doForwardTest().
TextBreakLocatorRef breakRef;
// Break type chosen by createMACBrkIt() from the -char / -word / -line options.
UCTextBreakType macBreakType;
176
createMACBrkIt()177 void createMACBrkIt() {
178 OSStatus status = noErr;
179 LocaleRef lref;
180 status = LocaleRefFromLocaleString(opt_locale, &lref);
181 status = UCCreateTextBreakLocator(lref, 0, kUCTextBreakAllMask, (TextBreakLocatorRef*)&breakRef);
182 if(opt_char == TRUE) {
183 macBreakType = kUCTextBreakClusterMask;
184 } else if(opt_word == TRUE) {
185 macBreakType = kUCTextBreakWordMask;
186 } else if(opt_line == TRUE) {
187 macBreakType = kUCTextBreakLineMask;
188 } else if(opt_sentence == TRUE) {
189 // error
190 // brkit = BreakIterator::createSentenceInstance(opt_locale, status);
191 } else {
192 // default is character iterator
193 macBreakType = kUCTextBreakClusterMask;
194 }
195 }
196 #endif
197
createICUBrkIt()198 void createICUBrkIt() {
199 //
200 // Set up an ICU break iterator
201 //
202 UErrorCode status = U_ZERO_ERROR;
203 if(opt_char == TRUE) {
204 brkit = BreakIterator::createCharacterInstance(opt_locale, status);
205 } else if(opt_word == TRUE) {
206 brkit = BreakIterator::createWordInstance(opt_locale, status);
207 } else if(opt_line == TRUE) {
208 brkit = BreakIterator::createLineInstance(opt_locale, status);
209 } else if(opt_sentence == TRUE) {
210 brkit = BreakIterator::createSentenceInstance(opt_locale, status);
211 } else {
212 // default is character iterator
213 brkit = BreakIterator::createCharacterInstance(opt_locale, status);
214 }
215 if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) {
216 fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale);
217 }
218 if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) {
219 fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale);
220 }
221
222 }
223
224 //---------------------------------------------------------------------------
225 //
226 // ProcessOptions() Function to read the command line options.
227 //
228 //---------------------------------------------------------------------------
ProcessOptions(int argc,const char ** argv,OptSpec opts[])229 UBool ProcessOptions(int argc, const char **argv, OptSpec opts[])
230 {
231 int i;
232 int argNum;
233 const char *pArgName;
234 OptSpec *pOpt;
235
236 for (argNum=1; argNum<argc; argNum++) {
237 pArgName = argv[argNum];
238 for (pOpt = opts; pOpt->name != 0; pOpt++) {
239 if (strcmp(pOpt->name, pArgName) == 0) {
240 switch (pOpt->type) {
241 case OptSpec::FLAG:
242 *(UBool *)(pOpt->pVar) = TRUE;
243 break;
244 case OptSpec::STRING:
245 argNum ++;
246 if (argNum >= argc) {
247 fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
248 return FALSE;
249 }
250 *(const char **)(pOpt->pVar) = argv[argNum];
251 break;
252 case OptSpec::NUM:
253 argNum ++;
254 if (argNum >= argc) {
255 fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
256 return FALSE;
257 }
258 char *endp;
259 i = strtol(argv[argNum], &endp, 0);
260 if (endp == argv[argNum]) {
261 fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name);
262 return FALSE;
263 }
264 *(int *)(pOpt->pVar) = i;
265 }
266 break;
267 }
268 }
269 if (pOpt->name == 0)
270 {
271 fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName);
272 return FALSE;
273 }
274 }
275 return TRUE;
276 }
277
278
doForwardTest()279 void doForwardTest() {
280 if (opt_terse == FALSE) {
281 printf("Doing the forward test\n");
282 }
283 int32_t noBreaks = 0;
284 int32_t i = 0;
285 unsigned long startTime = timeGetTime();
286 unsigned long elapsedTime = 0;
287 if(opt_icu) {
288 createICUBrkIt();
289 brkit->setText(UnicodeString(text, textSize));
290 brkit->first();
291 if (opt_terse == FALSE) {
292 printf("Warmup\n");
293 }
294 int j;
295 while((j = brkit->next()) != BreakIterator::DONE) {
296 noBreaks++;
297 //fprintf(stderr, "%d ", j);
298 }
299
300 if (opt_terse == FALSE) {
301 printf("Measure\n");
302 }
303 startTime = timeGetTime();
304 for(i = 0; i < opt_loopCount; i++) {
305 brkit->first();
306 while(brkit->next() != BreakIterator::DONE) {
307 }
308 }
309
310 elapsedTime = timeGetTime()-startTime;
311 } else if(opt_mac) {
312 #ifdef U_DARWIN
313 createMACBrkIt();
314 UniChar* filePtr = text;
315 OSStatus status = noErr;
316 UniCharCount startOffset = 0, breakOffset = 0, numUniChars = textSize;
317 startOffset = 0;
318 //printf("\t---Search forward--\n");
319
320 while (startOffset < numUniChars)
321 {
322 status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars,
323 startOffset, &breakOffset);
324 //require_action(status == noErr, EXIT, printf( "**UCFindTextBreak failed: startOffset %d, status %d\n", (int)startOffset, (int)status));
325 //require_action((breakOffset <= numUniChars),EXIT, printf("**UCFindTextBreak breakOffset too big: startOffset %d, breakOffset %d\n", (int)startOffset, (int)breakOffset));
326
327 // Output break
328 //printf("\t%d\n", (int)breakOffset);
329
330 // Increment counters
331 noBreaks++;
332 startOffset = breakOffset;
333 }
334 startTime = timeGetTime();
335 for(i = 0; i < opt_loopCount; i++) {
336 startOffset = 0;
337
338 while (startOffset < numUniChars)
339 {
340 status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars,
341 startOffset, &breakOffset);
342 // Increment counters
343 startOffset = breakOffset;
344 }
345 }
346 elapsedTime = timeGetTime()-startTime;
347 UCDisposeTextBreakLocator(&breakRef);
348 #endif
349
350
351 }
352
353
354 if (opt_terse == FALSE) {
355 int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount));
356 int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize));
357 int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks));
358 printf("forward break iteration average loop time %d\n", loopTime);
359 printf("number of code units %d average time per code unit %d\n", textSize, timePerCU);
360 printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak);
361 } else {
362 printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize);
363 }
364
365
366 }
367
doIsBoundTest()368 void doIsBoundTest() {
369 int32_t noBreaks = 0, hit = 0;
370 int32_t i = 0, j = 0;
371 unsigned long startTime = timeGetTime();
372 unsigned long elapsedTime = 0;
373 createICUBrkIt();
374 brkit->setText(UnicodeString(text, textSize));
375 brkit->first();
376 for(j = 0; j < textSize; j++) {
377 if(brkit->isBoundary(j)) {
378 noBreaks++;
379 //fprintf(stderr, "%d ", j);
380 }
381 }
382 /*
383 while(brkit->next() != BreakIterator::DONE) {
384 noBreaks++;
385 }
386 */
387
388 startTime = timeGetTime();
389 for(i = 0; i < opt_loopCount; i++) {
390 for(j = 0; j < textSize; j++) {
391 if(brkit->isBoundary(j)) {
392 hit++;
393 }
394 }
395 }
396
397 elapsedTime = timeGetTime()-startTime;
398 int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount));
399 if (opt_terse == FALSE) {
400 int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize));
401 int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks));
402 printf("forward break iteration average loop time %d\n", loopTime);
403 printf("number of code units %d average time per code unit %d\n", textSize, timePerCU);
404 printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak);
405 } else {
406 printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize);
407 }
408 }
409
410 //----------------------------------------------------------------------------------------
411 //
412 // UnixConvert -- Convert the lines of the file to the encoding for UNIX
413 // Since it appears that Unicode support is going in the general
414 // direction of the use of UTF-8 locales, that is the approach
415 // that is used here.
416 //
417 //----------------------------------------------------------------------------------------
void UnixConvert() {
    // The whole body is compiled out with "#if 0", so this function currently
    // does nothing. The disabled code refers to gFileLines and gNumFileLines,
    // which are not defined in the code visible in this file.
#if 0
    int    line;

    UConverter   *cvrtr;    // An ICU code page converter.
    UErrorCode    status = U_ZERO_ERROR;


    cvrtr = ucnv_open("utf-8", &status);    // we are just doing UTF-8 locales for now.
    if (U_FAILURE(status)) {
        // Print the error code itself, not the address of the variable.
        fprintf(stderr, "ICU Converter open failed.: %d\n", status);
        exit(-1);
    }
    // redo for unix
    for (line=0; line < gNumFileLines; line++) {
        // First call with no target buffer just measures the converted length.
        int sizeNeeded = ucnv_fromUChars(cvrtr,
                                         0,            // ptr to target buffer.
                                         0,            // length of target buffer.
                                         gFileLines[line].name,
                                         -1,           //  source is null terminated
                                         &status);
        if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {
            fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
            exit(-1);
        }
        status = U_ZERO_ERROR;
        gFileLines[line].unixName = new char[sizeNeeded+1];
        sizeNeeded = ucnv_fromUChars(cvrtr,
                                     gFileLines[line].unixName, // ptr to target buffer.
                                     sizeNeeded+1, // length of target buffer.
                                     gFileLines[line].name,
                                     -1,           //  source is null terminated
                                     &status);
        if (U_FAILURE(status)) {
            fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
            exit(-1);
        }
        gFileLines[line].unixName[sizeNeeded] = 0;
    }
    ucnv_close(cvrtr);
#endif
}
460
461
462 //----------------------------------------------------------------------------------------
463 //
464 // class UCharFile Class to hide all the gorp to read a file in
465 // and produce a stream of UChars.
466 //
467 //----------------------------------------------------------------------------------------
468 class UCharFile {
469 public:
470 UCharFile(const char *fileName);
471 ~UCharFile();
472 UChar get();
eof()473 UBool eof() {return fEof;};
error()474 UBool error() {return fError;};
size()475 int32_t size() { return fFileSize; };
476
477 private:
UCharFile(const UCharFile & other)478 UCharFile (const UCharFile &other) {}; // No copy constructor.
operator =(const UCharFile & other)479 UCharFile & operator = (const UCharFile &other) {return *this;}; // No assignment op
480
481 FILE *fFile;
482 const char *fName;
483 UBool fEof;
484 UBool fError;
485 UChar fPending2ndSurrogate;
486 int32_t fFileSize;
487
488 enum {UTF16LE, UTF16BE, UTF8} fEncoding;
489 };
490
UCharFile(const char * fileName)491 UCharFile::UCharFile(const char * fileName) {
492 fEof = FALSE;
493 fError = FALSE;
494 fName = fileName;
495 struct stat buf;
496 int32_t result = stat(fileName, &buf);
497 if(result != 0) {
498 fprintf(stderr, "Error getting info\n");
499 fFileSize = -1;
500 } else {
501 fFileSize = buf.st_size;
502 }
503 fFile = fopen(fName, "rb");
504 fPending2ndSurrogate = 0;
505 if (fFile == NULL) {
506 fprintf(stderr, "Can not open file \"%s\"\n", opt_fName);
507 fError = TRUE;
508 return;
509 }
510 //
511 // Look for the byte order mark at the start of the file.
512 //
513 int BOMC1, BOMC2, BOMC3;
514 BOMC1 = fgetc(fFile);
515 BOMC2 = fgetc(fFile);
516
517 if (BOMC1 == 0xff && BOMC2 == 0xfe) {
518 fEncoding = UTF16LE; }
519 else if (BOMC1 == 0xfe && BOMC2 == 0xff) {
520 fEncoding = UTF16BE; }
521 else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) {
522 fEncoding = UTF8; }
523 else
524 {
525 fprintf(stderr, "collperf: file \"%s\" encoding must be UTF-8 or UTF-16, and "
526 "must include a BOM.\n", fileName);
527 fError = true;
528 return;
529 }
530 }
531
532
~UCharFile()533 UCharFile::~UCharFile() {
534 fclose(fFile);
535 }
536
537
538
get()539 UChar UCharFile::get() {
540 UChar c;
541 switch (fEncoding) {
542 case UTF16LE:
543 {
544 int cL, cH;
545 cL = fgetc(fFile);
546 cH = fgetc(fFile);
547 c = cL | (cH << 8);
548 if (cH == EOF) {
549 c = 0;
550 fEof = TRUE;
551 }
552 break;
553 }
554 case UTF16BE:
555 {
556 int cL, cH;
557 cH = fgetc(fFile);
558 cL = fgetc(fFile);
559 c = cL | (cH << 8);
560 if (cL == EOF) {
561 c = 0;
562 fEof = TRUE;
563 }
564 break;
565 }
566 case UTF8:
567 {
568 if (fPending2ndSurrogate != 0) {
569 c = fPending2ndSurrogate;
570 fPending2ndSurrogate = 0;
571 break;
572 }
573
574 int ch = fgetc(fFile); // Note: c and ch are separate cause eof test doesn't work on UChar type.
575 if (ch == EOF) {
576 c = 0;
577 fEof = TRUE;
578 break;
579 }
580
581 if (ch <= 0x7f) {
582 // It's ascii. No further utf-8 conversion.
583 c = ch;
584 break;
585 }
586
587 // Figure out the lenght of the char and read the rest of the bytes
588 // into a temp array.
589 int nBytes;
590 if (ch >= 0xF0) {nBytes=4;}
591 else if (ch >= 0xE0) {nBytes=3;}
592 else if (ch >= 0xC0) {nBytes=2;}
593 else {
594 fprintf(stderr, "not likely utf-8 encoded file %s contains corrupt data at offset %d.\n", fName, ftell(fFile));
595 fError = TRUE;
596 return 0;
597 }
598
599 unsigned char bytes[10];
600 bytes[0] = (unsigned char)ch;
601 int i;
602 for (i=1; i<nBytes; i++) {
603 bytes[i] = fgetc(fFile);
604 if (bytes[i] < 0x80 || bytes[i] >= 0xc0) {
605 fprintf(stderr, "utf-8 encoded file %s contains corrupt data at offset %d. Expected %d bytes, byte %d is invalid. First byte is %02X\n", fName, ftell(fFile), nBytes, i, ch);
606 fError = TRUE;
607 return 0;
608 }
609 }
610
611 // Convert the bytes from the temp array to a Unicode char.
612 i = 0;
613 uint32_t cp;
614 UTF8_NEXT_CHAR_UNSAFE(bytes, i, cp);
615 c = (UChar)cp;
616
617 if (cp >= 0x10000) {
618 // The code point needs to be broken up into a utf-16 surrogate pair.
619 // Process first half this time through the main loop, and
620 // remember the other half for the next time through.
621 UChar utf16Buf[3];
622 i = 0;
623 UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp);
624 fPending2ndSurrogate = utf16Buf[1];
625 c = utf16Buf[0];
626 }
627 break;
628 };
629 }
630 return c;
631 }
632
633
634 //----------------------------------------------------------------------------------------
635 //
636 // Main -- process command line, read in and pre-process the test file,
637 // call other functions to do the actual tests.
638 //
639 //----------------------------------------------------------------------------------------
main(int argc,const char ** argv)640 int main(int argc, const char** argv) {
641 if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) {
642 printf(gUsageString);
643 exit (1);
644 }
645 // Make sure that we've only got one API selected.
646 if (opt_mac || opt_unix || opt_win) opt_icu = FALSE;
647 if (opt_mac || opt_unix) opt_win = FALSE;
648 if (opt_mac) opt_unix = FALSE;
649
650 UErrorCode status = U_ZERO_ERROR;
651
652
653
654 //
655 // Set up a Windows LCID
656 //
657 /*
658 if (opt_langid != 0) {
659 gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);
660 }
661 else {
662 gWinLCID = uloc_getLCID(opt_locale);
663 }
664 */
665
666 //
667 // Set the UNIX locale
668 //
669 if (opt_unix) {
670 if (setlocale(LC_ALL, opt_locale) == 0) {
671 fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale);
672 exit(-1);
673 }
674 }
675
676 // Read in the input file.
677 // File assumed to be utf-16.
678 // Lines go onto heap buffers. Global index array to line starts is created.
679 // Lines themselves are null terminated.
680 //
681
682 UCharFile f(opt_fName);
683 if (f.error()) {
684 exit(-1);
685 }
686 int32_t fileSize = f.size();
687 const int STARTSIZE = 70000;
688 int32_t bufSize = 0;
689 int32_t charCount = 0;
690 if(fileSize != -1) {
691 text = (UChar *)malloc(fileSize*sizeof(UChar));
692 bufSize = fileSize;
693 } else {
694 text = (UChar *)malloc(STARTSIZE*sizeof(UChar));
695 bufSize = STARTSIZE;
696 }
697 if(text == NULL) {
698 fprintf(stderr, "Allocating buffer failed\n");
699 exit(-1);
700 }
701
702
703 // Read the file, split into lines, and save in memory.
704 // Loop runs once per utf-16 value from the input file,
705 // (The number of bytes read from file per loop iteration depends on external encoding.)
706 for (;;) {
707
708 UChar c = f.get();
709 if(f.eof()) {
710 break;
711 }
712 if (f.error()){
713 exit(-1);
714 }
715 // We now have a good UTF-16 value in c.
716 text[charCount++] = c;
717 if(charCount == bufSize) {
718 text = (UChar *)realloc(text, 2*bufSize*sizeof(UChar));
719 if(text == NULL) {
720 fprintf(stderr, "Reallocating buffer failed\n");
721 exit(-1);
722 }
723 bufSize *= 2;
724 }
725 }
726
727
728 if (opt_terse == FALSE) {
729 printf("file \"%s\", %d charCount code units.\n", opt_fName, charCount);
730 }
731
732 textSize = charCount;
733
734
735
736
737 //
738 // Dump file contents if requested.
739 //
740 if (opt_dump) {
741 // dump file, etc... possibly
742 }
743
744
745 //
746 // We've got the file read into memory. Go do something with it.
747 //
748 int32_t i = 0;
749 for(i = 0; i < opt_passesCount; i++) {
750 if(opt_loopCount != 0) {
751 if(opt_next) {
752 doForwardTest();
753 } else if(opt_isBound) {
754 doIsBoundTest();
755 } else {
756 doForwardTest();
757 }
758 } else if(opt_time != 0) {
759
760 }
761 }
762
763 if(text != NULL) {
764 free(text);
765 }
766 if(brkit != NULL) {
767 delete brkit;
768 }
769
770 return 0;
771 }
772