• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  ******************************************************************************
5  * Copyright (C) 1998-2003, 2006, International Business Machines Corporation *
6  * and others. All Rights Reserved.                                           *
7  ******************************************************************************
8  */
9 
10 #include <errno.h>
11 #include <stdio.h>
12 #include <string.h>
13 
14 #include "unicode/utypes.h"
15 #include "unicode/uchar.h"
16 #include "unicode/uchriter.h"
17 #include "unicode/brkiter.h"
18 #include "unicode/locid.h"
19 #include "unicode/unistr.h"
20 #include "unicode/uniset.h"
21 #include "unicode/ustring.h"
22 
23 using icu::BreakIterator;
24 using icu::Locale;
25 using icu::UCharCharacterIterator;
26 using icu::UnicodeSet;
27 using icu::UnicodeString;
28 
29 /*
30  * This program takes a Unicode text file containing Thai text with
31  * spaces inserted where the word breaks are. It computes a copy of
32  * the text without spaces and uses a word instance of a Thai BreakIterator
33  * to compute the word breaks. The program reports any differences in the
34  * breaks.
35  *
36  * NOTE: by its very nature, Thai word breaking is not exact, so it is
37  * expected that this program will always report some differences.
38  */
39 
40 /*
41  * This class is a break iterator that counts words and spaces.
42  */
43 class SpaceBreakIterator
44 {
45 public:
46     // The constructor:
47     // text  - pointer to an array of UChars to iterate over
48     // count - the number of UChars in text
49     SpaceBreakIterator(const char16_t *text, int32_t count);
50 
51     // the destructor
52     ~SpaceBreakIterator();
53 
54     // return next break position
55     int32_t next();
56 
57     // return current word count
58     int32_t getWordCount();
59 
60     // return current space count
61     int32_t getSpaceCount();
62 
63 private:
64     // No arg constructor: private so clients can't call it.
65     SpaceBreakIterator();
66 
67     // The underlying BreakIterator
68     BreakIterator *fBreakIter;
69 
70     // address of the char16_t array
71     const char16_t *fText;
72 
73     // number of UChars in fText
74     int32_t fTextCount;
75 
76     // current word count
77     int32_t fWordCount;
78 
79     // current space count
80     int32_t fSpaceCount;
81 
82     // UnicodeSet of SA characters
83     UnicodeSet fComplexContext;
84 
85     // true when fBreakIter has returned DONE
86     UBool fDone;
87 };
88 
89 /*
90  * This is the main class. It compares word breaks and reports the differences.
91  */
92 class ThaiWordbreakTest
93 {
94 public:
95     // The main constructor:
96     // spaces       - pointer to a char16_t array for the text with spaces
97     // spaceCount   - the number of characters in the spaces array
98     // noSpaces     - pointer to a char16_t array for the text without spaces
99     // noSpaceCount - the number of characters in the noSpaces array
100     // verbose      - report all breaks if true, otherwise just report differences
101     ThaiWordbreakTest(const char16_t *spaces, int32_t spaceCount, const char16_t *noSpaces, int32_t noSpaceCount, UBool verbose);
102     ~ThaiWordbreakTest();
103 
104     // returns the number of breaks that are in the spaces array
105     // but aren't found in the noSpaces array
106     int32_t getBreaksNotFound();
107 
108     // returns the number of breaks which are found in the noSpaces
109     // array but aren't in the spaces array
110     int32_t getInvalidBreaks();
111 
112     // returns the number of words found in the spaces array
113     int32_t getWordCount();
114 
115     // reads the input Unicode text file:
116     // fileName  - the path name of the file
117     // charCount - set to the number of UChars read from the file
118     // returns   - the address of the char16_t array containing the characters
119     static const char16_t *readFile(const char *fileName, int32_t &charCount);
120 
121     // removes spaces form the input char16_t array:
122     // spaces        - pointer to the input char16_t array
123     // count         - number of UChars in the spaces array
124     // nonSpaceCount - the number of UChars in the result array
125     // returns       - the address of the char16_t array with spaces removed
126     static const char16_t *crunchSpaces(const char16_t *spaces, int32_t count, int32_t &nonSpaceCount);
127 
128 private:
129     // The no arg constructor - private so clients can't call it
130     ThaiWordbreakTest();
131 
132     // This does the actual comparison:
133     // spaces - the address of the char16_t array for the text with spaces
134     // spaceCount - the number of UChars in the spaces array
135     // noSpaces   - the address of the char16_t array for the text without spaces
136     // noSpaceCount - the number of UChars in the noSpaces array
137     // returns      - true if all breaks match, false otherwise
138     UBool compareWordBreaks(const char16_t *spaces, int32_t spaceCount,
139                             const char16_t *noSpaces, int32_t noSpaceCount);
140 
141     // helper method to report a break in the spaces
142     // array that's not found in the noSpaces array
143     void breakNotFound(int32_t br);
144 
145     // helper method to report a break that's found in
146     // the noSpaces array that's not in the spaces array
147     void foundInvalidBreak(int32_t br);
148 
149     // count of breaks in the spaces array that
150     // aren't found in the noSpaces array
151     int32_t fBreaksNotFound;
152 
153     // count of breaks found in the noSpaces array
154     // that aren't in the spaces array
155     int32_t fInvalidBreaks;
156 
157     // number of words found in the spaces array
158     int32_t fWordCount;
159 
160     // report all breaks if true, otherwise just report differences
161     UBool fVerbose;
162 };
163 
164 /*
165  * The main constructor: it calls compareWordBreaks and reports any differences
166  */
ThaiWordbreakTest(const char16_t * spaces,int32_t spaceCount,const char16_t * noSpaces,int32_t noSpaceCount,UBool verbose)167 ThaiWordbreakTest::ThaiWordbreakTest(const char16_t *spaces, int32_t spaceCount,
168                                      const char16_t *noSpaces, int32_t noSpaceCount, UBool verbose)
169 : fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose)
170 {
171     compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount);
172 }
173 
174 /*
175  * The no arg constructor
176  */
ThaiWordbreakTest()177 ThaiWordbreakTest::ThaiWordbreakTest()
178 {
179     // nothing
180 }
181 
182 /*
183  * The destructor
184  */
~ThaiWordbreakTest()185 ThaiWordbreakTest::~ThaiWordbreakTest()
186 {
187     // nothing?
188 }
189 
190 /*
191  * returns the number of breaks in the spaces array
192  * that aren't found in the noSpaces array
193  */
getBreaksNotFound()194 inline int32_t ThaiWordbreakTest::getBreaksNotFound()
195 {
196     return fBreaksNotFound;
197 }
198 
199 /*
200  * Returns the number of breaks found in the noSpaces
201  * array that aren't in the spaces array
202  */
getInvalidBreaks()203 inline int32_t ThaiWordbreakTest::getInvalidBreaks()
204 {
205     return fInvalidBreaks;
206 }
207 
208 /*
209  * Returns the number of words found in the spaces array
210  */
getWordCount()211 inline int32_t ThaiWordbreakTest::getWordCount()
212 {
213     return fWordCount;
214 }
215 
216 /*
217  * This method does the actual break comparison and reports the results.
218  * It uses a SpaceBreakIterator to iterate over the text with spaces,
219  * and a word instance of a Thai BreakIterator to iterate over the text
220  * without spaces.
221  */
compareWordBreaks(const char16_t * spaces,int32_t spaceCount,const char16_t * noSpaces,int32_t noSpaceCount)222 UBool ThaiWordbreakTest::compareWordBreaks(const char16_t *spaces, int32_t spaceCount,
223                                            const char16_t *noSpaces, int32_t noSpaceCount)
224 {
225     UBool result = true;
226     Locale thai("th");
227     UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, noSpaceCount);
228     UErrorCode status = U_ZERO_ERROR;
229 
230     BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status);
231     breakIter->adoptText(noSpaceIter);
232 
233     SpaceBreakIterator spaceIter(spaces, spaceCount);
234 
235     int32_t nextBreak = 0;
236     int32_t nextSpaceBreak = 0;
237     int32_t iterCount = 0;
238 
239     while (true) {
240         nextSpaceBreak = spaceIter.next();
241         nextBreak = breakIter->next();
242 
243         if (nextSpaceBreak == BreakIterator::DONE || nextBreak == BreakIterator::DONE) {
244             if (nextBreak != BreakIterator::DONE) {
245                 fprintf(stderr, "break iterator didn't end.\n");
246             } else if (nextSpaceBreak != BreakIterator::DONE) {
247                 fprintf(stderr, "premature break iterator end.\n");
248             }
249 
250             break;
251         }
252 
253         while (nextSpaceBreak != nextBreak &&
254                nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) {
255             if (nextSpaceBreak < nextBreak) {
256                 breakNotFound(nextSpaceBreak);
257                 result = false;
258                 nextSpaceBreak = spaceIter.next();
259             } else if (nextSpaceBreak > nextBreak) {
260                 foundInvalidBreak(nextBreak);
261                 result = false;
262                 nextBreak = breakIter->next();
263             }
264         }
265 
266         if (fVerbose) {
267             printf("%d   %d\n", nextSpaceBreak, nextBreak);
268         }
269     }
270 
271 
272     fWordCount = spaceIter.getWordCount();
273 
274     delete breakIter;
275 
276     return result;
277 }
278 
279 /*
280  * Report a break that's in the text with spaces but
281  * not found in the text without spaces.
282  */
breakNotFound(int32_t br)283 void ThaiWordbreakTest::breakNotFound(int32_t br)
284 {
285     if (fVerbose) {
286         printf("%d   ****\n", br);
287     } else {
288         fprintf(stderr, "break not found: %d\n", br);
289     }
290 
291     fBreaksNotFound += 1;
292 }
293 
294 /*
295  * Report a break that's found in the text without spaces
296  * that isn't in the text with spaces.
297  */
foundInvalidBreak(int32_t br)298 void ThaiWordbreakTest::foundInvalidBreak(int32_t br)
299 {
300     if (fVerbose) {
301         printf("****   %d\n", br);
302     } else {
303         fprintf(stderr, "found invalid break: %d\n", br);
304     }
305 
306     fInvalidBreaks += 1;
307 }
308 
309 /*
310  * Read the text from a file. The text must start with a Unicode Byte
311  * Order Mark (BOM) so that we know what order to read the bytes in.
312  */
readFile(const char * fileName,int32_t & charCount)313 const char16_t *ThaiWordbreakTest::readFile(const char *fileName, int32_t &charCount)
314 {
315     FILE *f;
316     int32_t fileSize;
317 
318     char16_t *buffer;
319     char *bufferChars;
320 
321     f = fopen(fileName, "rb");
322 
323     if( f == nullptr ) {
324         fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno));
325         return nullptr;
326     }
327 
328     fseek(f, 0, SEEK_END);
329     fileSize = ftell(f);
330 
331     fseek(f, 0, SEEK_SET);
332     bufferChars = new char[fileSize];
333 
334     if (bufferChars == nullptr) {
335         fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
336         fclose(f);
337         return nullptr;
338     }
339 
340     fread(bufferChars, sizeof(char), fileSize, f);
341     if( ferror(f) ) {
342         fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno));
343         fclose(f);
344         delete[] bufferChars;
345         return nullptr;
346     }
347     fclose(f);
348 
349     UnicodeString myText(bufferChars, fileSize, "UTF-8");
350 
351     delete[] bufferChars;
352 
353     charCount = myText.length();
354     buffer = new char16_t[charCount];
355     if (buffer == nullptr) {
356         fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
357         return nullptr;
358     }
359 
360     myText.extract(1, myText.length(), buffer);
361     charCount--;  // skip the BOM
362     buffer[charCount] = 0;    // NUL terminate for easier reading in the debugger
363 
364     return buffer;
365 }
366 
367 /*
368  * Remove spaces from the input char16_t array.
369  *
370  * We check explicitly for a Unicode code value of 0x0020
371  * because Unicode::isSpaceChar returns true for CR, LF, etc.
372  *
373  */
crunchSpaces(const char16_t * spaces,int32_t count,int32_t & nonSpaceCount)374 const char16_t *ThaiWordbreakTest::crunchSpaces(const char16_t *spaces, int32_t count, int32_t &nonSpaceCount)
375 {
376     int32_t i, out, spaceCount;
377 
378     spaceCount = 0;
379     for (i = 0; i < count; i += 1) {
380         if (spaces[i] == 0x0020 /*Unicode::isSpaceChar(spaces[i])*/) {
381             spaceCount += 1;
382         }
383     }
384 
385     nonSpaceCount = count - spaceCount;
386     char16_t *noSpaces = new char16_t[nonSpaceCount];
387 
388     if (noSpaces == nullptr) {
389         fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n");
390         return nullptr;
391     }
392 
393     for (out = 0, i = 0; i < count; i += 1) {
394         if (spaces[i] != 0x0020 /*! Unicode::isSpaceChar(spaces[i])*/) {
395             noSpaces[out++] = spaces[i];
396         }
397     }
398 
399     return noSpaces;
400 }
401 
402 /*
403  * Generate a text file with spaces in it from a file without.
404  */
generateFile(const char16_t * chars,int32_t length)405 int generateFile(const char16_t *chars, int32_t length) {
406     Locale root("");
407     UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(chars, length);
408     UErrorCode status = U_ZERO_ERROR;
409 
410     UnicodeSet complexContext(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);
411     BreakIterator *breakIter = BreakIterator::createWordInstance(root, status);
412     breakIter->adoptText(noSpaceIter);
413     char outbuf[1024];
414     int32_t strlength;
415     char16_t bom = 0xFEFF;
416 
417     printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &bom, 1, &status));
418     int32_t prevbreak = 0;
419     while (U_SUCCESS(status)) {
420         int32_t nextbreak = breakIter->next();
421         if (nextbreak == BreakIterator::DONE) {
422             break;
423         }
424         printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &chars[prevbreak],
425                                     nextbreak-prevbreak, &status));
426         if (nextbreak > 0 && complexContext.contains(chars[nextbreak-1])
427             && complexContext.contains(chars[nextbreak])) {
428             printf(" ");
429         }
430         prevbreak = nextbreak;
431     }
432 
433     if (U_FAILURE(status)) {
434         fprintf(stderr, "generate failed: %s\n", u_errorName(status));
435         return status;
436     }
437     else {
438         return 0;
439     }
440 }
441 
442 /*
443  * The main routine. Read the command line arguments, read the text file,
444  * remove the spaces, do the comparison and report the final results
445  */
main(int argc,char ** argv)446 int main(int argc, char **argv)
447 {
448     const char *fileName = "space.txt";
449     int arg = 1;
450     UBool verbose = false;
451     UBool generate = false;
452 
453     if (argc >= 2 && strcmp(argv[1], "-generate") == 0) {
454         generate = true;
455         arg += 1;
456     }
457 
458     if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) {
459         verbose = true;
460         arg += 1;
461     }
462 
463     if (arg == argc - 1) {
464         fileName = argv[arg++];
465     }
466 
467     if (arg != argc) {
468         fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]);
469         return 1;
470     }
471 
472     int32_t spaceCount, nonSpaceCount;
473     const char16_t *spaces, *noSpaces;
474 
475     spaces = ThaiWordbreakTest::readFile(fileName, spaceCount);
476 
477     if (spaces == nullptr) {
478         return 1;
479     }
480 
481     if (generate) {
482         return generateFile(spaces, spaceCount);
483     }
484 
485     noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount);
486 
487     if (noSpaces == nullptr) {
488         return 1;
489     }
490 
491     ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose);
492 
493     printf("word count: %d\n", test.getWordCount());
494     printf("breaks not found: %d\n", test.getBreaksNotFound());
495     printf("invalid breaks found: %d\n", test.getInvalidBreaks());
496 
497     return 0;
498 }
499 
500 /*
501  * The main constructor. Clear all the counts and construct a default
502  * word instance of a BreakIterator.
503  */
SpaceBreakIterator(const char16_t * text,int32_t count)504 SpaceBreakIterator::SpaceBreakIterator(const char16_t *text, int32_t count)
505   : fBreakIter(nullptr), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(false)
506 {
507     UCharCharacterIterator *iter = new UCharCharacterIterator(text, count);
508     UErrorCode status = U_ZERO_ERROR;
509     fComplexContext.applyPattern(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);
510     Locale root("");
511 
512     fBreakIter = BreakIterator::createWordInstance(root, status);
513     fBreakIter->adoptText(iter);
514 }
515 
SpaceBreakIterator()516 SpaceBreakIterator::SpaceBreakIterator()
517 {
518     // nothing
519 }
520 
521 /*
522  * The destructor. delete the underlying BreakIterator
523  */
~SpaceBreakIterator()524 SpaceBreakIterator::~SpaceBreakIterator()
525 {
526     delete fBreakIter;
527 }
528 
529 /*
530  * Return the next break, counting words and spaces.
531  */
next()532 int32_t SpaceBreakIterator::next()
533 {
534     if (fDone) {
535         return BreakIterator::DONE;
536     }
537 
538     int32_t nextBreak;
539     do {
540         nextBreak = fBreakIter->next();
541 
542         if (nextBreak == BreakIterator::DONE) {
543             fDone = true;
544             return BreakIterator::DONE;
545         }
546     }
547     while(nextBreak > 0 && fComplexContext.contains(fText[nextBreak-1])
548             && fComplexContext.contains(fText[nextBreak]));
549 
550    int32_t result = nextBreak - fSpaceCount;
551 
552     if (nextBreak < fTextCount) {
553         if (fText[nextBreak] == 0x0020 /*Unicode::isSpaceChar(fText[nextBreak])*/) {
554             fSpaceCount += fBreakIter->next() - nextBreak;
555         }
556     }
557 
558     fWordCount += 1;
559 
560     return result;
561 }
562 
563 /*
564  * Returns the current space count
565  */
getSpaceCount()566 int32_t SpaceBreakIterator::getSpaceCount()
567 {
568     return fSpaceCount;
569 }
570 
571 /*
572  * Returns the current word count
573  */
getWordCount()574 int32_t SpaceBreakIterator::getWordCount()
575 {
576     return fWordCount;
577 }
578 
579 
580