1 /*
2 ******************************************************************************
3 * Copyright (C) 1998-2003, 2006, International Business Machines Corporation *
4 * and others. All Rights Reserved. *
5 ******************************************************************************
6 */
7
8 #include <errno.h>
9 #include <stdio.h>
10 #include <string.h>
11
12 #include "unicode/utypes.h"
13 #include "unicode/uchar.h"
14 #include "unicode/uchriter.h"
15 #include "unicode/brkiter.h"
16 #include "unicode/locid.h"
17 #include "unicode/unistr.h"
18 #include "unicode/uniset.h"
19 #include "unicode/ustring.h"
20
21 /*
22 * This program takes a Unicode text file containing Thai text with
23 * spaces inserted where the word breaks are. It computes a copy of
24 * the text without spaces and uses a word instance of a Thai BreakIterator
25 * to compute the word breaks. The program reports any differences in the
26 * breaks.
27 *
 * NOTE: by its very nature, Thai word breaking is not exact, so it is
 * expected that this program will always report some differences.
30 */
31
32 /*
33 * This class is a break iterator that counts words and spaces.
34 */
35 class SpaceBreakIterator
36 {
37 public:
38 // The constructor:
39 // text - pointer to an array of UChars to iterate over
40 // count - the number of UChars in text
41 SpaceBreakIterator(const UChar *text, int32_t count);
42
43 // the destructor
44 ~SpaceBreakIterator();
45
46 // return next break position
47 int32_t next();
48
49 // return current word count
50 int32_t getWordCount();
51
52 // return current space count
53 int32_t getSpaceCount();
54
55 private:
56 // No arg constructor: private so clients can't call it.
57 SpaceBreakIterator();
58
59 // The underlying BreakIterator
60 BreakIterator *fBreakIter;
61
62 // address of the UChar array
63 const UChar *fText;
64
65 // number of UChars in fText
66 int32_t fTextCount;
67
68 // current word count
69 int32_t fWordCount;
70
71 // current space count
72 int32_t fSpaceCount;
73
74 // UnicodeSet of SA characters
75 UnicodeSet fComplexContext;
76
77 // true when fBreakIter has returned DONE
78 UBool fDone;
79 };
80
81 /*
82 * This is the main class. It compares word breaks and reports the differences.
83 */
84 class ThaiWordbreakTest
85 {
86 public:
87 // The main constructor:
88 // spaces - pointer to a UChar array for the text with spaces
89 // spaceCount - the number of characters in the spaces array
90 // noSpaces - pointer to a UChar array for the text without spaces
91 // noSpaceCount - the number of characters in the noSpaces array
92 // verbose - report all breaks if true, otherwise just report differences
93 ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, const UChar *noSpaces, int32_t noSpaceCount, UBool verbose);
94 ~ThaiWordbreakTest();
95
96 // returns the number of breaks that are in the spaces array
97 // but aren't found in the noSpaces array
98 int32_t getBreaksNotFound();
99
100 // returns the number of breaks which are found in the noSpaces
101 // array but aren't in the spaces array
102 int32_t getInvalidBreaks();
103
104 // returns the number of words found in the spaces array
105 int32_t getWordCount();
106
107 // reads the input Unicode text file:
108 // fileName - the path name of the file
109 // charCount - set to the number of UChars read from the file
110 // returns - the address of the UChar array containing the characters
111 static const UChar *readFile(char *fileName, int32_t &charCount);
112
113 // removes spaces form the input UChar array:
114 // spaces - pointer to the input UChar array
115 // count - number of UChars in the spaces array
116 // nonSpaceCount - the number of UChars in the result array
117 // returns - the address of the UChar array with spaces removed
118 static const UChar *crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount);
119
120 private:
121 // The no arg constructor - private so clients can't call it
122 ThaiWordbreakTest();
123
124 // This does the actual comparison:
125 // spaces - the address of the UChar array for the text with spaces
126 // spaceCount - the number of UChars in the spaces array
127 // noSpaces - the address of the UChar array for the text without spaces
128 // noSpaceCount - the number of UChars in the noSpaces array
129 // returns - true if all breaks match, FALSE otherwise
130 UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount,
131 const UChar *noSpaces, int32_t noSpaceCount);
132
133 // helper method to report a break in the spaces
134 // array that's not found in the noSpaces array
135 void breakNotFound(int32_t br);
136
137 // helper method to report a break that's found in
138 // the noSpaces array that's not in the spaces array
139 void foundInvalidBreak(int32_t br);
140
141 // count of breaks in the spaces array that
142 // aren't found in the noSpaces array
143 int32_t fBreaksNotFound;
144
145 // count of breaks found in the noSpaces array
146 // that aren't in the spaces array
147 int32_t fInvalidBreaks;
148
149 // number of words found in the spaces array
150 int32_t fWordCount;
151
152 // report all breaks if true, otherwise just report differences
153 UBool fVerbose;
154 };
155
156 /*
157 * The main constructor: it calls compareWordBreaks and reports any differences
158 */
ThaiWordbreakTest(const UChar * spaces,int32_t spaceCount,const UChar * noSpaces,int32_t noSpaceCount,UBool verbose)159 ThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount,
160 const UChar *noSpaces, int32_t noSpaceCount, UBool verbose)
161 : fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose)
162 {
163 compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount);
164 }
165
166 /*
167 * The no arg constructor
168 */
ThaiWordbreakTest()169 ThaiWordbreakTest::ThaiWordbreakTest()
170 {
171 // nothing
172 }
173
174 /*
175 * The destructor
176 */
~ThaiWordbreakTest()177 ThaiWordbreakTest::~ThaiWordbreakTest()
178 {
179 // nothing?
180 }
181
182 /*
183 * returns the number of breaks in the spaces array
184 * that aren't found in the noSpaces array
185 */
getBreaksNotFound()186 inline int32_t ThaiWordbreakTest::getBreaksNotFound()
187 {
188 return fBreaksNotFound;
189 }
190
191 /*
192 * Returns the number of breaks found in the noSpaces
193 * array that aren't in the spaces array
194 */
getInvalidBreaks()195 inline int32_t ThaiWordbreakTest::getInvalidBreaks()
196 {
197 return fInvalidBreaks;
198 }
199
200 /*
201 * Returns the number of words found in the spaces array
202 */
getWordCount()203 inline int32_t ThaiWordbreakTest::getWordCount()
204 {
205 return fWordCount;
206 }
207
208 /*
209 * This method does the acutal break comparison and reports the results.
210 * It uses a SpaceBreakIterator to iterate over the text with spaces,
211 * and a word instance of a Thai BreakIterator to iterate over the text
212 * without spaces.
213 */
compareWordBreaks(const UChar * spaces,int32_t spaceCount,const UChar * noSpaces,int32_t noSpaceCount)214 UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount,
215 const UChar *noSpaces, int32_t noSpaceCount)
216 {
217 UBool result = TRUE;
218 Locale thai("th");
219 UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, noSpaceCount);
220 UErrorCode status = U_ZERO_ERROR;
221
222 BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status);
223 breakIter->adoptText(noSpaceIter);
224
225 SpaceBreakIterator spaceIter(spaces, spaceCount);
226
227 int32_t nextBreak = 0;
228 int32_t nextSpaceBreak = 0;
229 int32_t iterCount = 0;
230
231 while (TRUE) {
232 nextSpaceBreak = spaceIter.next();
233 nextBreak = breakIter->next();
234
235 if (nextSpaceBreak == BreakIterator::DONE || nextBreak == BreakIterator::DONE) {
236 if (nextBreak != BreakIterator::DONE) {
237 fprintf(stderr, "break iterator didn't end.\n");
238 } else if (nextSpaceBreak != BreakIterator::DONE) {
239 fprintf(stderr, "premature break iterator end.\n");
240 }
241
242 break;
243 }
244
245 while (nextSpaceBreak != nextBreak &&
246 nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) {
247 if (nextSpaceBreak < nextBreak) {
248 breakNotFound(nextSpaceBreak);
249 result = FALSE;
250 nextSpaceBreak = spaceIter.next();
251 } else if (nextSpaceBreak > nextBreak) {
252 foundInvalidBreak(nextBreak);
253 result = FALSE;
254 nextBreak = breakIter->next();
255 }
256 }
257
258 if (fVerbose) {
259 printf("%d %d\n", nextSpaceBreak, nextBreak);
260 }
261 }
262
263
264 fWordCount = spaceIter.getWordCount();
265
266 delete breakIter;
267
268 return result;
269 }
270
271 /*
272 * Report a break that's in the text with spaces but
273 * not found in the text without spaces.
274 */
breakNotFound(int32_t br)275 void ThaiWordbreakTest::breakNotFound(int32_t br)
276 {
277 if (fVerbose) {
278 printf("%d ****\n", br);
279 } else {
280 fprintf(stderr, "break not found: %d\n", br);
281 }
282
283 fBreaksNotFound += 1;
284 }
285
286 /*
287 * Report a break that's found in the text without spaces
288 * that isn't in the text with spaces.
289 */
foundInvalidBreak(int32_t br)290 void ThaiWordbreakTest::foundInvalidBreak(int32_t br)
291 {
292 if (fVerbose) {
293 printf("**** %d\n", br);
294 } else {
295 fprintf(stderr, "found invalid break: %d\n", br);
296 }
297
298 fInvalidBreaks += 1;
299 }
300
301 /*
302 * Read the text from a file. The text must start with a Unicode Byte
303 * Order Mark (BOM) so that we know what order to read the bytes in.
304 */
readFile(char * fileName,int32_t & charCount)305 const UChar *ThaiWordbreakTest::readFile(char *fileName, int32_t &charCount)
306 {
307 FILE *f;
308 int32_t fileSize;
309
310 UChar *buffer;
311 char *bufferChars;
312
313 f = fopen(fileName, "rb");
314
315 if( f == NULL ) {
316 fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno));
317 return 0;
318 }
319
320 fseek(f, 0, SEEK_END);
321 fileSize = ftell(f);
322
323 fseek(f, 0, SEEK_SET);
324 bufferChars = new char[fileSize];
325
326 if(bufferChars == 0) {
327 fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
328 fclose(f);
329 return 0;
330 }
331
332 fread(bufferChars, sizeof(char), fileSize, f);
333 if( ferror(f) ) {
334 fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno));
335 fclose(f);
336 delete[] bufferChars;
337 return 0;
338 }
339 fclose(f);
340
341 UnicodeString myText(bufferChars, fileSize, "UTF-8");
342
343 delete[] bufferChars;
344
345 charCount = myText.length();
346 buffer = new UChar[charCount];
347 if(buffer == 0) {
348 fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
349 return 0;
350 }
351
352 myText.extract(1, myText.length(), buffer);
353 charCount--; // skip the BOM
354 buffer[charCount] = 0; // NULL terminate for easier reading in the debugger
355
356 return buffer;
357 }
358
359 /*
360 * Remove spaces from the input UChar array.
361 *
362 * We check explicitly for a Unicode code value of 0x0020
363 * because Unicode::isSpaceChar returns true for CR, LF, etc.
364 *
365 */
crunchSpaces(const UChar * spaces,int32_t count,int32_t & nonSpaceCount)366 const UChar *ThaiWordbreakTest::crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount)
367 {
368 int32_t i, out, spaceCount;
369
370 spaceCount = 0;
371 for (i = 0; i < count; i += 1) {
372 if (spaces[i] == 0x0020 /*Unicode::isSpaceChar(spaces[i])*/) {
373 spaceCount += 1;
374 }
375 }
376
377 nonSpaceCount = count - spaceCount;
378 UChar *noSpaces = new UChar[nonSpaceCount];
379
380 if (noSpaces == 0) {
381 fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n");
382 return 0;
383 }
384
385 for (out = 0, i = 0; i < count; i += 1) {
386 if (spaces[i] != 0x0020 /*! Unicode::isSpaceChar(spaces[i])*/) {
387 noSpaces[out++] = spaces[i];
388 }
389 }
390
391 return noSpaces;
392 }
393
394 /*
395 * Generate a text file with spaces in it from a file without.
396 */
generateFile(const UChar * chars,int32_t length)397 int generateFile(const UChar *chars, int32_t length) {
398 Locale root("");
399 UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(chars, length);
400 UErrorCode status = U_ZERO_ERROR;
401
402 UnicodeSet complexContext(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);
403 BreakIterator *breakIter = BreakIterator::createWordInstance(root, status);
404 breakIter->adoptText(noSpaceIter);
405 char outbuf[1024];
406 int32_t strlength;
407 UChar bom = 0xFEFF;
408
409 printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &bom, 1, &status));
410 int32_t prevbreak = 0;
411 while (U_SUCCESS(status)) {
412 int32_t nextbreak = breakIter->next();
413 if (nextbreak == BreakIterator::DONE) {
414 break;
415 }
416 printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &chars[prevbreak],
417 nextbreak-prevbreak, &status));
418 if (nextbreak > 0 && complexContext.contains(chars[nextbreak-1])
419 && complexContext.contains(chars[nextbreak])) {
420 printf(" ");
421 }
422 prevbreak = nextbreak;
423 }
424
425 if (U_FAILURE(status)) {
426 fprintf(stderr, "generate failed: %s\n", u_errorName(status));
427 return status;
428 }
429 else {
430 return 0;
431 }
432 }
433
434 /*
435 * The main routine. Read the command line arguments, read the text file,
436 * remove the spaces, do the comparison and report the final results
437 */
main(int argc,char ** argv)438 int main(int argc, char **argv)
439 {
440 char *fileName = "space.txt";
441 int arg = 1;
442 UBool verbose = FALSE;
443 UBool generate = FALSE;
444
445 if (argc >= 2 && strcmp(argv[1], "-generate") == 0) {
446 generate = TRUE;
447 arg += 1;
448 }
449
450 if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) {
451 verbose = TRUE;
452 arg += 1;
453 }
454
455 if (arg == argc - 1) {
456 fileName = argv[arg++];
457 }
458
459 if (arg != argc) {
460 fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]);
461 return 1;
462 }
463
464 int32_t spaceCount, nonSpaceCount;
465 const UChar *spaces, *noSpaces;
466
467 spaces = ThaiWordbreakTest::readFile(fileName, spaceCount);
468
469 if (spaces == 0) {
470 return 1;
471 }
472
473 if (generate) {
474 return generateFile(spaces, spaceCount);
475 }
476
477 noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount);
478
479 if (noSpaces == 0) {
480 return 1;
481 }
482
483 ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose);
484
485 printf("word count: %d\n", test.getWordCount());
486 printf("breaks not found: %d\n", test.getBreaksNotFound());
487 printf("invalid breaks found: %d\n", test.getInvalidBreaks());
488
489 return 0;
490 }
491
492 /*
493 * The main constructor. Clear all the counts and construct a default
494 * word instance of a BreakIterator.
495 */
SpaceBreakIterator(const UChar * text,int32_t count)496 SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count)
497 : fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(FALSE)
498 {
499 UCharCharacterIterator *iter = new UCharCharacterIterator(text, count);
500 UErrorCode status = U_ZERO_ERROR;
501 fComplexContext.applyPattern(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);
502 Locale root("");
503
504 fBreakIter = BreakIterator::createWordInstance(root, status);
505 fBreakIter->adoptText(iter);
506 }
507
SpaceBreakIterator()508 SpaceBreakIterator::SpaceBreakIterator()
509 {
510 // nothing
511 }
512
513 /*
514 * The destructor. delete the underlying BreakIterator
515 */
~SpaceBreakIterator()516 SpaceBreakIterator::~SpaceBreakIterator()
517 {
518 delete fBreakIter;
519 }
520
521 /*
522 * Return the next break, counting words and spaces.
523 */
next()524 int32_t SpaceBreakIterator::next()
525 {
526 if (fDone) {
527 return BreakIterator::DONE;
528 }
529
530 int32_t nextBreak;
531 do {
532 nextBreak = fBreakIter->next();
533
534 if (nextBreak == BreakIterator::DONE) {
535 fDone = TRUE;
536 return BreakIterator::DONE;
537 }
538 }
539 while(nextBreak > 0 && fComplexContext.contains(fText[nextBreak-1])
540 && fComplexContext.contains(fText[nextBreak]));
541
542 int32_t result = nextBreak - fSpaceCount;
543
544 if (nextBreak < fTextCount) {
545 if (fText[nextBreak] == 0x0020 /*Unicode::isSpaceChar(fText[nextBreak])*/) {
546 fSpaceCount += fBreakIter->next() - nextBreak;
547 }
548 }
549
550 fWordCount += 1;
551
552 return result;
553 }
554
555 /*
556 * Returns the current space count
557 */
getSpaceCount()558 int32_t SpaceBreakIterator::getSpaceCount()
559 {
560 return fSpaceCount;
561 }
562
563 /*
564 * Returns the current word count
565 */
getWordCount()566 int32_t SpaceBreakIterator::getWordCount()
567 {
568 return fWordCount;
569 }
570
571
572