• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  ******************************************************************************
3  * Copyright (C) 1998-2003, 2006, International Business Machines Corporation *
4  * and others. All Rights Reserved.                                           *
5  ******************************************************************************
6  */
7 
8 #include <errno.h>
9 #include <stdio.h>
10 #include <string.h>
11 
12 #include "unicode/utypes.h"
13 #include "unicode/uchar.h"
14 #include "unicode/uchriter.h"
15 #include "unicode/brkiter.h"
16 #include "unicode/locid.h"
17 #include "unicode/unistr.h"
18 #include "unicode/uniset.h"
19 #include "unicode/ustring.h"
20 
21 /*
22  * This program takes a Unicode text file containing Thai text with
23  * spaces inserted where the word breaks are. It computes a copy of
24  * the text without spaces and uses a word instance of a Thai BreakIterator
25  * to compute the word breaks. The program reports any differences in the
26  * breaks.
27  *
28  * NOTE: by its very nature, Thai word breaking is not exact, so it is
29  * expected that this program will always report some differences.
30  */
31 
32 /*
33  * This class is a break iterator that counts words and spaces.
34  */
35 class SpaceBreakIterator
36 {
37 public:
38     // The constructor:
39     // text  - pointer to an array of UChars to iterate over
40     // count - the number of UChars in text
41     SpaceBreakIterator(const UChar *text, int32_t count);
42 
43     // the destructor
44     ~SpaceBreakIterator();
45 
46     // return next break position
47     int32_t next();
48 
49     // return current word count
50     int32_t getWordCount();
51 
52     // return current space count
53     int32_t getSpaceCount();
54 
55 private:
56     // No arg constructor: private so clients can't call it.
57     SpaceBreakIterator();
58 
59     // The underlying BreakIterator
60     BreakIterator *fBreakIter;
61 
62     // address of the UChar array
63     const UChar *fText;
64 
65     // number of UChars in fText
66     int32_t fTextCount;
67 
68     // current word count
69     int32_t fWordCount;
70 
71     // current space count
72     int32_t fSpaceCount;
73 
74     // UnicodeSet of SA characters
75     UnicodeSet fComplexContext;
76 
77     // true when fBreakIter has returned DONE
78     UBool fDone;
79 };
80 
81 /*
82  * This is the main class. It compares word breaks and reports the differences.
83  */
84 class ThaiWordbreakTest
85 {
86 public:
87     // The main constructor:
88     // spaces       - pointer to a UChar array for the text with spaces
89     // spaceCount   - the number of characters in the spaces array
90     // noSpaces     - pointer to a UChar array for the text without spaces
91     // noSpaceCount - the number of characters in the noSpaces array
92     // verbose      - report all breaks if true, otherwise just report differences
93     ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, const UChar *noSpaces, int32_t noSpaceCount, UBool verbose);
94     ~ThaiWordbreakTest();
95 
96     // returns the number of breaks that are in the spaces array
97     // but aren't found in the noSpaces array
98     int32_t getBreaksNotFound();
99 
100     // returns the number of breaks which are found in the noSpaces
101     // array but aren't in the spaces array
102     int32_t getInvalidBreaks();
103 
104     // returns the number of words found in the spaces array
105     int32_t getWordCount();
106 
107     // reads the input Unicode text file:
108     // fileName  - the path name of the file
109     // charCount - set to the number of UChars read from the file
110     // returns   - the address of the UChar array containing the characters
111     static const UChar *readFile(char *fileName, int32_t &charCount);
112 
113     // removes spaces form the input UChar array:
114     // spaces        - pointer to the input UChar array
115     // count         - number of UChars in the spaces array
116     // nonSpaceCount - the number of UChars in the result array
117     // returns       - the address of the UChar array with spaces removed
118     static const UChar *crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount);
119 
120 private:
121     // The no arg constructor - private so clients can't call it
122     ThaiWordbreakTest();
123 
124     // This does the actual comparison:
125     // spaces - the address of the UChar array for the text with spaces
126     // spaceCount - the number of UChars in the spaces array
127     // noSpaces   - the address of the UChar array for the text without spaces
128     // noSpaceCount - the number of UChars in the noSpaces array
129     // returns      - true if all breaks match, FALSE otherwise
130     UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount,
131                             const UChar *noSpaces, int32_t noSpaceCount);
132 
133     // helper method to report a break in the spaces
134     // array that's not found in the noSpaces array
135     void breakNotFound(int32_t br);
136 
137     // helper method to report a break that's found in
138     // the noSpaces array that's not in the spaces array
139     void foundInvalidBreak(int32_t br);
140 
141     // count of breaks in the spaces array that
142     // aren't found in the noSpaces array
143     int32_t fBreaksNotFound;
144 
145     // count of breaks found in the noSpaces array
146     // that aren't in the spaces array
147     int32_t fInvalidBreaks;
148 
149     // number of words found in the spaces array
150     int32_t fWordCount;
151 
152     // report all breaks if true, otherwise just report differences
153     UBool fVerbose;
154 };
155 
156 /*
157  * The main constructor: it calls compareWordBreaks and reports any differences
158  */
ThaiWordbreakTest(const UChar * spaces,int32_t spaceCount,const UChar * noSpaces,int32_t noSpaceCount,UBool verbose)159 ThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount,
160                                      const UChar *noSpaces, int32_t noSpaceCount, UBool verbose)
161 : fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose)
162 {
163     compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount);
164 }
165 
166 /*
167  * The no arg constructor
168  */
ThaiWordbreakTest()169 ThaiWordbreakTest::ThaiWordbreakTest()
170 {
171     // nothing
172 }
173 
174 /*
175  * The destructor
176  */
~ThaiWordbreakTest()177 ThaiWordbreakTest::~ThaiWordbreakTest()
178 {
179     // nothing?
180 }
181 
182 /*
183  * returns the number of breaks in the spaces array
184  * that aren't found in the noSpaces array
185  */
getBreaksNotFound()186 inline int32_t ThaiWordbreakTest::getBreaksNotFound()
187 {
188     return fBreaksNotFound;
189 }
190 
191 /*
192  * Returns the number of breaks found in the noSpaces
193  * array that aren't in the spaces array
194  */
getInvalidBreaks()195 inline int32_t ThaiWordbreakTest::getInvalidBreaks()
196 {
197     return fInvalidBreaks;
198 }
199 
200 /*
201  * Returns the number of words found in the spaces array
202  */
getWordCount()203 inline int32_t ThaiWordbreakTest::getWordCount()
204 {
205     return fWordCount;
206 }
207 
208 /*
209  * This method does the acutal break comparison and reports the results.
210  * It uses a SpaceBreakIterator to iterate over the text with spaces,
211  * and a word instance of a Thai BreakIterator to iterate over the text
212  * without spaces.
213  */
compareWordBreaks(const UChar * spaces,int32_t spaceCount,const UChar * noSpaces,int32_t noSpaceCount)214 UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount,
215                                            const UChar *noSpaces, int32_t noSpaceCount)
216 {
217     UBool result = TRUE;
218     Locale thai("th");
219     UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, noSpaceCount);
220     UErrorCode status = U_ZERO_ERROR;
221 
222     BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status);
223     breakIter->adoptText(noSpaceIter);
224 
225     SpaceBreakIterator spaceIter(spaces, spaceCount);
226 
227     int32_t nextBreak = 0;
228     int32_t nextSpaceBreak = 0;
229     int32_t iterCount = 0;
230 
231     while (TRUE) {
232         nextSpaceBreak = spaceIter.next();
233         nextBreak = breakIter->next();
234 
235         if (nextSpaceBreak == BreakIterator::DONE || nextBreak == BreakIterator::DONE) {
236             if (nextBreak != BreakIterator::DONE) {
237                 fprintf(stderr, "break iterator didn't end.\n");
238             } else if (nextSpaceBreak != BreakIterator::DONE) {
239                 fprintf(stderr, "premature break iterator end.\n");
240             }
241 
242             break;
243         }
244 
245         while (nextSpaceBreak != nextBreak &&
246                nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) {
247             if (nextSpaceBreak < nextBreak) {
248                 breakNotFound(nextSpaceBreak);
249                 result = FALSE;
250                 nextSpaceBreak = spaceIter.next();
251             } else if (nextSpaceBreak > nextBreak) {
252                 foundInvalidBreak(nextBreak);
253                 result = FALSE;
254                 nextBreak = breakIter->next();
255             }
256         }
257 
258         if (fVerbose) {
259             printf("%d   %d\n", nextSpaceBreak, nextBreak);
260         }
261     }
262 
263 
264     fWordCount = spaceIter.getWordCount();
265 
266     delete breakIter;
267 
268     return result;
269 }
270 
271 /*
272  * Report a break that's in the text with spaces but
273  * not found in the text without spaces.
274  */
breakNotFound(int32_t br)275 void ThaiWordbreakTest::breakNotFound(int32_t br)
276 {
277     if (fVerbose) {
278         printf("%d   ****\n", br);
279     } else {
280         fprintf(stderr, "break not found: %d\n", br);
281     }
282 
283     fBreaksNotFound += 1;
284 }
285 
286 /*
287  * Report a break that's found in the text without spaces
288  * that isn't in the text with spaces.
289  */
foundInvalidBreak(int32_t br)290 void ThaiWordbreakTest::foundInvalidBreak(int32_t br)
291 {
292     if (fVerbose) {
293         printf("****   %d\n", br);
294     } else {
295         fprintf(stderr, "found invalid break: %d\n", br);
296     }
297 
298     fInvalidBreaks += 1;
299 }
300 
301 /*
302  * Read the text from a file. The text must start with a Unicode Byte
303  * Order Mark (BOM) so that we know what order to read the bytes in.
304  */
readFile(char * fileName,int32_t & charCount)305 const UChar *ThaiWordbreakTest::readFile(char *fileName, int32_t &charCount)
306 {
307     FILE *f;
308     int32_t fileSize;
309 
310     UChar *buffer;
311     char *bufferChars;
312 
313     f = fopen(fileName, "rb");
314 
315     if( f == NULL ) {
316         fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno));
317         return 0;
318     }
319 
320     fseek(f, 0, SEEK_END);
321     fileSize = ftell(f);
322 
323     fseek(f, 0, SEEK_SET);
324     bufferChars = new char[fileSize];
325 
326     if(bufferChars == 0) {
327         fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
328         fclose(f);
329         return 0;
330     }
331 
332     fread(bufferChars, sizeof(char), fileSize, f);
333     if( ferror(f) ) {
334         fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno));
335         fclose(f);
336         delete[] bufferChars;
337         return 0;
338     }
339     fclose(f);
340 
341     UnicodeString myText(bufferChars, fileSize, "UTF-8");
342 
343     delete[] bufferChars;
344 
345     charCount = myText.length();
346     buffer = new UChar[charCount];
347     if(buffer == 0) {
348         fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
349         return 0;
350     }
351 
352     myText.extract(1, myText.length(), buffer);
353     charCount--;  // skip the BOM
354     buffer[charCount] = 0;    // NULL terminate for easier reading in the debugger
355 
356     return buffer;
357 }
358 
359 /*
360  * Remove spaces from the input UChar array.
361  *
362  * We check explicitly for a Unicode code value of 0x0020
363  * because Unicode::isSpaceChar returns true for CR, LF, etc.
364  *
365  */
crunchSpaces(const UChar * spaces,int32_t count,int32_t & nonSpaceCount)366 const UChar *ThaiWordbreakTest::crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount)
367 {
368     int32_t i, out, spaceCount;
369 
370     spaceCount = 0;
371     for (i = 0; i < count; i += 1) {
372         if (spaces[i] == 0x0020 /*Unicode::isSpaceChar(spaces[i])*/) {
373             spaceCount += 1;
374         }
375     }
376 
377     nonSpaceCount = count - spaceCount;
378     UChar *noSpaces = new UChar[nonSpaceCount];
379 
380     if (noSpaces == 0) {
381         fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n");
382         return 0;
383     }
384 
385     for (out = 0, i = 0; i < count; i += 1) {
386         if (spaces[i] != 0x0020 /*! Unicode::isSpaceChar(spaces[i])*/) {
387             noSpaces[out++] = spaces[i];
388         }
389     }
390 
391     return noSpaces;
392 }
393 
394 /*
395  * Generate a text file with spaces in it from a file without.
396  */
generateFile(const UChar * chars,int32_t length)397 int generateFile(const UChar *chars, int32_t length) {
398     Locale root("");
399     UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(chars, length);
400     UErrorCode status = U_ZERO_ERROR;
401 
402     UnicodeSet complexContext(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);
403     BreakIterator *breakIter = BreakIterator::createWordInstance(root, status);
404     breakIter->adoptText(noSpaceIter);
405     char outbuf[1024];
406     int32_t strlength;
407     UChar bom = 0xFEFF;
408 
409     printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &bom, 1, &status));
410     int32_t prevbreak = 0;
411     while (U_SUCCESS(status)) {
412         int32_t nextbreak = breakIter->next();
413         if (nextbreak == BreakIterator::DONE) {
414             break;
415         }
416         printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &chars[prevbreak],
417                                     nextbreak-prevbreak, &status));
418         if (nextbreak > 0 && complexContext.contains(chars[nextbreak-1])
419             && complexContext.contains(chars[nextbreak])) {
420             printf(" ");
421         }
422         prevbreak = nextbreak;
423     }
424 
425     if (U_FAILURE(status)) {
426         fprintf(stderr, "generate failed: %s\n", u_errorName(status));
427         return status;
428     }
429     else {
430         return 0;
431     }
432 }
433 
434 /*
435  * The main routine. Read the command line arguments, read the text file,
436  * remove the spaces, do the comparison and report the final results
437  */
main(int argc,char ** argv)438 int main(int argc, char **argv)
439 {
440     char *fileName = "space.txt";
441     int arg = 1;
442     UBool verbose = FALSE;
443     UBool generate = FALSE;
444 
445     if (argc >= 2 && strcmp(argv[1], "-generate") == 0) {
446         generate = TRUE;
447         arg += 1;
448     }
449 
450     if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) {
451         verbose = TRUE;
452         arg += 1;
453     }
454 
455     if (arg == argc - 1) {
456         fileName = argv[arg++];
457     }
458 
459     if (arg != argc) {
460         fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]);
461         return 1;
462     }
463 
464     int32_t spaceCount, nonSpaceCount;
465     const UChar *spaces, *noSpaces;
466 
467     spaces = ThaiWordbreakTest::readFile(fileName, spaceCount);
468 
469     if (spaces == 0) {
470         return 1;
471     }
472 
473     if (generate) {
474         return generateFile(spaces, spaceCount);
475     }
476 
477     noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount);
478 
479     if (noSpaces == 0) {
480         return 1;
481     }
482 
483     ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose);
484 
485     printf("word count: %d\n", test.getWordCount());
486     printf("breaks not found: %d\n", test.getBreaksNotFound());
487     printf("invalid breaks found: %d\n", test.getInvalidBreaks());
488 
489     return 0;
490 }
491 
492 /*
493  * The main constructor. Clear all the counts and construct a default
494  * word instance of a BreakIterator.
495  */
SpaceBreakIterator(const UChar * text,int32_t count)496 SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count)
497   : fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(FALSE)
498 {
499     UCharCharacterIterator *iter = new UCharCharacterIterator(text, count);
500     UErrorCode status = U_ZERO_ERROR;
501     fComplexContext.applyPattern(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);
502     Locale root("");
503 
504     fBreakIter = BreakIterator::createWordInstance(root, status);
505     fBreakIter->adoptText(iter);
506 }
507 
SpaceBreakIterator()508 SpaceBreakIterator::SpaceBreakIterator()
509 {
510     // nothing
511 }
512 
513 /*
514  * The destructor. delete the underlying BreakIterator
515  */
~SpaceBreakIterator()516 SpaceBreakIterator::~SpaceBreakIterator()
517 {
518     delete fBreakIter;
519 }
520 
521 /*
522  * Return the next break, counting words and spaces.
523  */
next()524 int32_t SpaceBreakIterator::next()
525 {
526     if (fDone) {
527         return BreakIterator::DONE;
528     }
529 
530     int32_t nextBreak;
531     do {
532         nextBreak = fBreakIter->next();
533 
534         if (nextBreak == BreakIterator::DONE) {
535             fDone = TRUE;
536             return BreakIterator::DONE;
537         }
538     }
539     while(nextBreak > 0 && fComplexContext.contains(fText[nextBreak-1])
540             && fComplexContext.contains(fText[nextBreak]));
541 
542    int32_t result = nextBreak - fSpaceCount;
543 
544     if (nextBreak < fTextCount) {
545         if (fText[nextBreak] == 0x0020 /*Unicode::isSpaceChar(fText[nextBreak])*/) {
546             fSpaceCount += fBreakIter->next() - nextBreak;
547         }
548     }
549 
550     fWordCount += 1;
551 
552     return result;
553 }
554 
555 /*
556  * Returns the current space count
557  */
getSpaceCount()558 int32_t SpaceBreakIterator::getSpaceCount()
559 {
560     return fSpaceCount;
561 }
562 
563 /*
564  * Returns the current word count
565  */
getWordCount()566 int32_t SpaceBreakIterator::getWordCount()
567 {
568     return fWordCount;
569 }
570 
571 
572