1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 * Copyright (C) 1998-2003, 2006, International Business Machines Corporation *
6 * and others. All Rights Reserved. *
7 ******************************************************************************
8 */
9
#include <errno.h>
#include <stdio.h>
#include <string.h>

#include <new>

#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/uchriter.h"
#include "unicode/brkiter.h"
#include "unicode/locid.h"
#include "unicode/unistr.h"
#include "unicode/uniset.h"
#include "unicode/ustring.h"
22
23 using icu::BreakIterator;
24 using icu::Locale;
25 using icu::UCharCharacterIterator;
26 using icu::UnicodeSet;
27 using icu::UnicodeString;
28
29 /*
30 * This program takes a Unicode text file containing Thai text with
31 * spaces inserted where the word breaks are. It computes a copy of
32 * the text without spaces and uses a word instance of a Thai BreakIterator
33 * to compute the word breaks. The program reports any differences in the
34 * breaks.
35 *
 * NOTE: by its very nature, Thai word breaking is not exact, so it is
37 * expected that this program will always report some differences.
38 */
39
40 /*
41 * This class is a break iterator that counts words and spaces.
42 */
43 class SpaceBreakIterator
44 {
45 public:
46 // The constructor:
47 // text - pointer to an array of UChars to iterate over
48 // count - the number of UChars in text
49 SpaceBreakIterator(const char16_t *text, int32_t count);
50
51 // the destructor
52 ~SpaceBreakIterator();
53
54 // return next break position
55 int32_t next();
56
57 // return current word count
58 int32_t getWordCount();
59
60 // return current space count
61 int32_t getSpaceCount();
62
63 private:
64 // No arg constructor: private so clients can't call it.
65 SpaceBreakIterator();
66
67 // The underlying BreakIterator
68 BreakIterator *fBreakIter;
69
70 // address of the char16_t array
71 const char16_t *fText;
72
73 // number of UChars in fText
74 int32_t fTextCount;
75
76 // current word count
77 int32_t fWordCount;
78
79 // current space count
80 int32_t fSpaceCount;
81
82 // UnicodeSet of SA characters
83 UnicodeSet fComplexContext;
84
85 // true when fBreakIter has returned DONE
86 UBool fDone;
87 };
88
89 /*
90 * This is the main class. It compares word breaks and reports the differences.
91 */
92 class ThaiWordbreakTest
93 {
94 public:
95 // The main constructor:
96 // spaces - pointer to a char16_t array for the text with spaces
97 // spaceCount - the number of characters in the spaces array
98 // noSpaces - pointer to a char16_t array for the text without spaces
99 // noSpaceCount - the number of characters in the noSpaces array
100 // verbose - report all breaks if true, otherwise just report differences
101 ThaiWordbreakTest(const char16_t *spaces, int32_t spaceCount, const char16_t *noSpaces, int32_t noSpaceCount, UBool verbose);
102 ~ThaiWordbreakTest();
103
104 // returns the number of breaks that are in the spaces array
105 // but aren't found in the noSpaces array
106 int32_t getBreaksNotFound();
107
108 // returns the number of breaks which are found in the noSpaces
109 // array but aren't in the spaces array
110 int32_t getInvalidBreaks();
111
112 // returns the number of words found in the spaces array
113 int32_t getWordCount();
114
115 // reads the input Unicode text file:
116 // fileName - the path name of the file
117 // charCount - set to the number of UChars read from the file
118 // returns - the address of the char16_t array containing the characters
119 static const char16_t *readFile(const char *fileName, int32_t &charCount);
120
121 // removes spaces form the input char16_t array:
122 // spaces - pointer to the input char16_t array
123 // count - number of UChars in the spaces array
124 // nonSpaceCount - the number of UChars in the result array
125 // returns - the address of the char16_t array with spaces removed
126 static const char16_t *crunchSpaces(const char16_t *spaces, int32_t count, int32_t &nonSpaceCount);
127
128 private:
129 // The no arg constructor - private so clients can't call it
130 ThaiWordbreakTest();
131
132 // This does the actual comparison:
133 // spaces - the address of the char16_t array for the text with spaces
134 // spaceCount - the number of UChars in the spaces array
135 // noSpaces - the address of the char16_t array for the text without spaces
136 // noSpaceCount - the number of UChars in the noSpaces array
137 // returns - true if all breaks match, false otherwise
138 UBool compareWordBreaks(const char16_t *spaces, int32_t spaceCount,
139 const char16_t *noSpaces, int32_t noSpaceCount);
140
141 // helper method to report a break in the spaces
142 // array that's not found in the noSpaces array
143 void breakNotFound(int32_t br);
144
145 // helper method to report a break that's found in
146 // the noSpaces array that's not in the spaces array
147 void foundInvalidBreak(int32_t br);
148
149 // count of breaks in the spaces array that
150 // aren't found in the noSpaces array
151 int32_t fBreaksNotFound;
152
153 // count of breaks found in the noSpaces array
154 // that aren't in the spaces array
155 int32_t fInvalidBreaks;
156
157 // number of words found in the spaces array
158 int32_t fWordCount;
159
160 // report all breaks if true, otherwise just report differences
161 UBool fVerbose;
162 };
163
164 /*
165 * The main constructor: it calls compareWordBreaks and reports any differences
166 */
ThaiWordbreakTest(const char16_t * spaces,int32_t spaceCount,const char16_t * noSpaces,int32_t noSpaceCount,UBool verbose)167 ThaiWordbreakTest::ThaiWordbreakTest(const char16_t *spaces, int32_t spaceCount,
168 const char16_t *noSpaces, int32_t noSpaceCount, UBool verbose)
169 : fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose)
170 {
171 compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount);
172 }
173
174 /*
175 * The no arg constructor
176 */
ThaiWordbreakTest()177 ThaiWordbreakTest::ThaiWordbreakTest()
178 {
179 // nothing
180 }
181
182 /*
183 * The destructor
184 */
~ThaiWordbreakTest()185 ThaiWordbreakTest::~ThaiWordbreakTest()
186 {
187 // nothing?
188 }
189
190 /*
191 * returns the number of breaks in the spaces array
192 * that aren't found in the noSpaces array
193 */
getBreaksNotFound()194 inline int32_t ThaiWordbreakTest::getBreaksNotFound()
195 {
196 return fBreaksNotFound;
197 }
198
199 /*
200 * Returns the number of breaks found in the noSpaces
201 * array that aren't in the spaces array
202 */
getInvalidBreaks()203 inline int32_t ThaiWordbreakTest::getInvalidBreaks()
204 {
205 return fInvalidBreaks;
206 }
207
208 /*
209 * Returns the number of words found in the spaces array
210 */
getWordCount()211 inline int32_t ThaiWordbreakTest::getWordCount()
212 {
213 return fWordCount;
214 }
215
216 /*
217 * This method does the actual break comparison and reports the results.
218 * It uses a SpaceBreakIterator to iterate over the text with spaces,
219 * and a word instance of a Thai BreakIterator to iterate over the text
220 * without spaces.
221 */
compareWordBreaks(const char16_t * spaces,int32_t spaceCount,const char16_t * noSpaces,int32_t noSpaceCount)222 UBool ThaiWordbreakTest::compareWordBreaks(const char16_t *spaces, int32_t spaceCount,
223 const char16_t *noSpaces, int32_t noSpaceCount)
224 {
225 UBool result = true;
226 Locale thai("th");
227 UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, noSpaceCount);
228 UErrorCode status = U_ZERO_ERROR;
229
230 BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status);
231 breakIter->adoptText(noSpaceIter);
232
233 SpaceBreakIterator spaceIter(spaces, spaceCount);
234
235 int32_t nextBreak = 0;
236 int32_t nextSpaceBreak = 0;
237 int32_t iterCount = 0;
238
239 while (true) {
240 nextSpaceBreak = spaceIter.next();
241 nextBreak = breakIter->next();
242
243 if (nextSpaceBreak == BreakIterator::DONE || nextBreak == BreakIterator::DONE) {
244 if (nextBreak != BreakIterator::DONE) {
245 fprintf(stderr, "break iterator didn't end.\n");
246 } else if (nextSpaceBreak != BreakIterator::DONE) {
247 fprintf(stderr, "premature break iterator end.\n");
248 }
249
250 break;
251 }
252
253 while (nextSpaceBreak != nextBreak &&
254 nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) {
255 if (nextSpaceBreak < nextBreak) {
256 breakNotFound(nextSpaceBreak);
257 result = false;
258 nextSpaceBreak = spaceIter.next();
259 } else if (nextSpaceBreak > nextBreak) {
260 foundInvalidBreak(nextBreak);
261 result = false;
262 nextBreak = breakIter->next();
263 }
264 }
265
266 if (fVerbose) {
267 printf("%d %d\n", nextSpaceBreak, nextBreak);
268 }
269 }
270
271
272 fWordCount = spaceIter.getWordCount();
273
274 delete breakIter;
275
276 return result;
277 }
278
279 /*
280 * Report a break that's in the text with spaces but
281 * not found in the text without spaces.
282 */
breakNotFound(int32_t br)283 void ThaiWordbreakTest::breakNotFound(int32_t br)
284 {
285 if (fVerbose) {
286 printf("%d ****\n", br);
287 } else {
288 fprintf(stderr, "break not found: %d\n", br);
289 }
290
291 fBreaksNotFound += 1;
292 }
293
294 /*
295 * Report a break that's found in the text without spaces
296 * that isn't in the text with spaces.
297 */
foundInvalidBreak(int32_t br)298 void ThaiWordbreakTest::foundInvalidBreak(int32_t br)
299 {
300 if (fVerbose) {
301 printf("**** %d\n", br);
302 } else {
303 fprintf(stderr, "found invalid break: %d\n", br);
304 }
305
306 fInvalidBreaks += 1;
307 }
308
309 /*
310 * Read the text from a file. The text must start with a Unicode Byte
311 * Order Mark (BOM) so that we know what order to read the bytes in.
312 */
readFile(const char * fileName,int32_t & charCount)313 const char16_t *ThaiWordbreakTest::readFile(const char *fileName, int32_t &charCount)
314 {
315 FILE *f;
316 int32_t fileSize;
317
318 char16_t *buffer;
319 char *bufferChars;
320
321 f = fopen(fileName, "rb");
322
323 if( f == nullptr ) {
324 fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno));
325 return nullptr;
326 }
327
328 fseek(f, 0, SEEK_END);
329 fileSize = ftell(f);
330
331 fseek(f, 0, SEEK_SET);
332 bufferChars = new char[fileSize];
333
334 if (bufferChars == nullptr) {
335 fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
336 fclose(f);
337 return nullptr;
338 }
339
340 fread(bufferChars, sizeof(char), fileSize, f);
341 if( ferror(f) ) {
342 fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno));
343 fclose(f);
344 delete[] bufferChars;
345 return nullptr;
346 }
347 fclose(f);
348
349 UnicodeString myText(bufferChars, fileSize, "UTF-8");
350
351 delete[] bufferChars;
352
353 charCount = myText.length();
354 buffer = new char16_t[charCount];
355 if (buffer == nullptr) {
356 fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
357 return nullptr;
358 }
359
360 myText.extract(1, myText.length(), buffer);
361 charCount--; // skip the BOM
362 buffer[charCount] = 0; // NUL terminate for easier reading in the debugger
363
364 return buffer;
365 }
366
367 /*
368 * Remove spaces from the input char16_t array.
369 *
370 * We check explicitly for a Unicode code value of 0x0020
371 * because Unicode::isSpaceChar returns true for CR, LF, etc.
372 *
373 */
crunchSpaces(const char16_t * spaces,int32_t count,int32_t & nonSpaceCount)374 const char16_t *ThaiWordbreakTest::crunchSpaces(const char16_t *spaces, int32_t count, int32_t &nonSpaceCount)
375 {
376 int32_t i, out, spaceCount;
377
378 spaceCount = 0;
379 for (i = 0; i < count; i += 1) {
380 if (spaces[i] == 0x0020 /*Unicode::isSpaceChar(spaces[i])*/) {
381 spaceCount += 1;
382 }
383 }
384
385 nonSpaceCount = count - spaceCount;
386 char16_t *noSpaces = new char16_t[nonSpaceCount];
387
388 if (noSpaces == nullptr) {
389 fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n");
390 return nullptr;
391 }
392
393 for (out = 0, i = 0; i < count; i += 1) {
394 if (spaces[i] != 0x0020 /*! Unicode::isSpaceChar(spaces[i])*/) {
395 noSpaces[out++] = spaces[i];
396 }
397 }
398
399 return noSpaces;
400 }
401
402 /*
403 * Generate a text file with spaces in it from a file without.
404 */
generateFile(const char16_t * chars,int32_t length)405 int generateFile(const char16_t *chars, int32_t length) {
406 Locale root("");
407 UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(chars, length);
408 UErrorCode status = U_ZERO_ERROR;
409
410 UnicodeSet complexContext(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);
411 BreakIterator *breakIter = BreakIterator::createWordInstance(root, status);
412 breakIter->adoptText(noSpaceIter);
413 char outbuf[1024];
414 int32_t strlength;
415 char16_t bom = 0xFEFF;
416
417 printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &bom, 1, &status));
418 int32_t prevbreak = 0;
419 while (U_SUCCESS(status)) {
420 int32_t nextbreak = breakIter->next();
421 if (nextbreak == BreakIterator::DONE) {
422 break;
423 }
424 printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &chars[prevbreak],
425 nextbreak-prevbreak, &status));
426 if (nextbreak > 0 && complexContext.contains(chars[nextbreak-1])
427 && complexContext.contains(chars[nextbreak])) {
428 printf(" ");
429 }
430 prevbreak = nextbreak;
431 }
432
433 if (U_FAILURE(status)) {
434 fprintf(stderr, "generate failed: %s\n", u_errorName(status));
435 return status;
436 }
437 else {
438 return 0;
439 }
440 }
441
442 /*
443 * The main routine. Read the command line arguments, read the text file,
444 * remove the spaces, do the comparison and report the final results
445 */
main(int argc,char ** argv)446 int main(int argc, char **argv)
447 {
448 const char *fileName = "space.txt";
449 int arg = 1;
450 UBool verbose = false;
451 UBool generate = false;
452
453 if (argc >= 2 && strcmp(argv[1], "-generate") == 0) {
454 generate = true;
455 arg += 1;
456 }
457
458 if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) {
459 verbose = true;
460 arg += 1;
461 }
462
463 if (arg == argc - 1) {
464 fileName = argv[arg++];
465 }
466
467 if (arg != argc) {
468 fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]);
469 return 1;
470 }
471
472 int32_t spaceCount, nonSpaceCount;
473 const char16_t *spaces, *noSpaces;
474
475 spaces = ThaiWordbreakTest::readFile(fileName, spaceCount);
476
477 if (spaces == nullptr) {
478 return 1;
479 }
480
481 if (generate) {
482 return generateFile(spaces, spaceCount);
483 }
484
485 noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount);
486
487 if (noSpaces == nullptr) {
488 return 1;
489 }
490
491 ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose);
492
493 printf("word count: %d\n", test.getWordCount());
494 printf("breaks not found: %d\n", test.getBreaksNotFound());
495 printf("invalid breaks found: %d\n", test.getInvalidBreaks());
496
497 return 0;
498 }
499
500 /*
501 * The main constructor. Clear all the counts and construct a default
502 * word instance of a BreakIterator.
503 */
SpaceBreakIterator(const char16_t * text,int32_t count)504 SpaceBreakIterator::SpaceBreakIterator(const char16_t *text, int32_t count)
505 : fBreakIter(nullptr), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(false)
506 {
507 UCharCharacterIterator *iter = new UCharCharacterIterator(text, count);
508 UErrorCode status = U_ZERO_ERROR;
509 fComplexContext.applyPattern(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);
510 Locale root("");
511
512 fBreakIter = BreakIterator::createWordInstance(root, status);
513 fBreakIter->adoptText(iter);
514 }
515
SpaceBreakIterator()516 SpaceBreakIterator::SpaceBreakIterator()
517 {
518 // nothing
519 }
520
521 /*
522 * The destructor. delete the underlying BreakIterator
523 */
~SpaceBreakIterator()524 SpaceBreakIterator::~SpaceBreakIterator()
525 {
526 delete fBreakIter;
527 }
528
529 /*
530 * Return the next break, counting words and spaces.
531 */
next()532 int32_t SpaceBreakIterator::next()
533 {
534 if (fDone) {
535 return BreakIterator::DONE;
536 }
537
538 int32_t nextBreak;
539 do {
540 nextBreak = fBreakIter->next();
541
542 if (nextBreak == BreakIterator::DONE) {
543 fDone = true;
544 return BreakIterator::DONE;
545 }
546 }
547 while(nextBreak > 0 && fComplexContext.contains(fText[nextBreak-1])
548 && fComplexContext.contains(fText[nextBreak]));
549
550 int32_t result = nextBreak - fSpaceCount;
551
552 if (nextBreak < fTextCount) {
553 if (fText[nextBreak] == 0x0020 /*Unicode::isSpaceChar(fText[nextBreak])*/) {
554 fSpaceCount += fBreakIter->next() - nextBreak;
555 }
556 }
557
558 fWordCount += 1;
559
560 return result;
561 }
562
563 /*
564 * Returns the current space count
565 */
getSpaceCount()566 int32_t SpaceBreakIterator::getSpaceCount()
567 {
568 return fSpaceCount;
569 }
570
571 /*
572 * Returns the current word count
573 */
getWordCount()574 int32_t SpaceBreakIterator::getWordCount()
575 {
576 return fWordCount;
577 }
578
579
580