1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 * Copyright (C) 1998-2003, 2006, International Business Machines Corporation *
6 * and others. All Rights Reserved. *
7 ******************************************************************************
8 */
9
10 #include <errno.h>
11 #include <stdio.h>
12 #include <string.h>
13
14 #include "unicode/utypes.h"
15 #include "unicode/uchar.h"
16 #include "unicode/uchriter.h"
17 #include "unicode/brkiter.h"
18 #include "unicode/locid.h"
19 #include "unicode/unistr.h"
20 #include "unicode/uniset.h"
21 #include "unicode/ustring.h"
22
23 /*
24 * This program takes a Unicode text file containing Thai text with
25 * spaces inserted where the word breaks are. It computes a copy of
26 * the text without spaces and uses a word instance of a Thai BreakIterator
27 * to compute the word breaks. The program reports any differences in the
28 * breaks.
29 *
 * NOTE: by its very nature, Thai word breaking is not exact, so it is
31 * expected that this program will always report some differences.
32 */
33
34 /*
35 * This class is a break iterator that counts words and spaces.
36 */
37 class SpaceBreakIterator
38 {
39 public:
40 // The constructor:
41 // text - pointer to an array of UChars to iterate over
42 // count - the number of UChars in text
43 SpaceBreakIterator(const UChar *text, int32_t count);
44
45 // the destructor
46 ~SpaceBreakIterator();
47
48 // return next break position
49 int32_t next();
50
51 // return current word count
52 int32_t getWordCount();
53
54 // return current space count
55 int32_t getSpaceCount();
56
57 private:
58 // No arg constructor: private so clients can't call it.
59 SpaceBreakIterator();
60
61 // The underlying BreakIterator
62 BreakIterator *fBreakIter;
63
64 // address of the UChar array
65 const UChar *fText;
66
67 // number of UChars in fText
68 int32_t fTextCount;
69
70 // current word count
71 int32_t fWordCount;
72
73 // current space count
74 int32_t fSpaceCount;
75
76 // UnicodeSet of SA characters
77 UnicodeSet fComplexContext;
78
79 // true when fBreakIter has returned DONE
80 UBool fDone;
81 };
82
83 /*
84 * This is the main class. It compares word breaks and reports the differences.
85 */
86 class ThaiWordbreakTest
87 {
88 public:
89 // The main constructor:
90 // spaces - pointer to a UChar array for the text with spaces
91 // spaceCount - the number of characters in the spaces array
92 // noSpaces - pointer to a UChar array for the text without spaces
93 // noSpaceCount - the number of characters in the noSpaces array
94 // verbose - report all breaks if true, otherwise just report differences
95 ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, const UChar *noSpaces, int32_t noSpaceCount, UBool verbose);
96 ~ThaiWordbreakTest();
97
98 // returns the number of breaks that are in the spaces array
99 // but aren't found in the noSpaces array
100 int32_t getBreaksNotFound();
101
102 // returns the number of breaks which are found in the noSpaces
103 // array but aren't in the spaces array
104 int32_t getInvalidBreaks();
105
106 // returns the number of words found in the spaces array
107 int32_t getWordCount();
108
109 // reads the input Unicode text file:
110 // fileName - the path name of the file
111 // charCount - set to the number of UChars read from the file
112 // returns - the address of the UChar array containing the characters
113 static const UChar *readFile(char *fileName, int32_t &charCount);
114
115 // removes spaces form the input UChar array:
116 // spaces - pointer to the input UChar array
117 // count - number of UChars in the spaces array
118 // nonSpaceCount - the number of UChars in the result array
119 // returns - the address of the UChar array with spaces removed
120 static const UChar *crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount);
121
122 private:
123 // The no arg constructor - private so clients can't call it
124 ThaiWordbreakTest();
125
126 // This does the actual comparison:
127 // spaces - the address of the UChar array for the text with spaces
128 // spaceCount - the number of UChars in the spaces array
129 // noSpaces - the address of the UChar array for the text without spaces
130 // noSpaceCount - the number of UChars in the noSpaces array
131 // returns - true if all breaks match, false otherwise
132 UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount,
133 const UChar *noSpaces, int32_t noSpaceCount);
134
135 // helper method to report a break in the spaces
136 // array that's not found in the noSpaces array
137 void breakNotFound(int32_t br);
138
139 // helper method to report a break that's found in
140 // the noSpaces array that's not in the spaces array
141 void foundInvalidBreak(int32_t br);
142
143 // count of breaks in the spaces array that
144 // aren't found in the noSpaces array
145 int32_t fBreaksNotFound;
146
147 // count of breaks found in the noSpaces array
148 // that aren't in the spaces array
149 int32_t fInvalidBreaks;
150
151 // number of words found in the spaces array
152 int32_t fWordCount;
153
154 // report all breaks if true, otherwise just report differences
155 UBool fVerbose;
156 };
157
158 /*
159 * The main constructor: it calls compareWordBreaks and reports any differences
160 */
ThaiWordbreakTest(const UChar * spaces,int32_t spaceCount,const UChar * noSpaces,int32_t noSpaceCount,UBool verbose)161 ThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount,
162 const UChar *noSpaces, int32_t noSpaceCount, UBool verbose)
163 : fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose)
164 {
165 compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount);
166 }
167
168 /*
169 * The no arg constructor
170 */
ThaiWordbreakTest()171 ThaiWordbreakTest::ThaiWordbreakTest()
172 {
173 // nothing
174 }
175
176 /*
177 * The destructor
178 */
~ThaiWordbreakTest()179 ThaiWordbreakTest::~ThaiWordbreakTest()
180 {
181 // nothing?
182 }
183
184 /*
185 * returns the number of breaks in the spaces array
186 * that aren't found in the noSpaces array
187 */
getBreaksNotFound()188 inline int32_t ThaiWordbreakTest::getBreaksNotFound()
189 {
190 return fBreaksNotFound;
191 }
192
193 /*
194 * Returns the number of breaks found in the noSpaces
195 * array that aren't in the spaces array
196 */
getInvalidBreaks()197 inline int32_t ThaiWordbreakTest::getInvalidBreaks()
198 {
199 return fInvalidBreaks;
200 }
201
202 /*
203 * Returns the number of words found in the spaces array
204 */
getWordCount()205 inline int32_t ThaiWordbreakTest::getWordCount()
206 {
207 return fWordCount;
208 }
209
210 /*
211 * This method does the actual break comparison and reports the results.
212 * It uses a SpaceBreakIterator to iterate over the text with spaces,
213 * and a word instance of a Thai BreakIterator to iterate over the text
214 * without spaces.
215 */
compareWordBreaks(const UChar * spaces,int32_t spaceCount,const UChar * noSpaces,int32_t noSpaceCount)216 UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount,
217 const UChar *noSpaces, int32_t noSpaceCount)
218 {
219 UBool result = true;
220 Locale thai("th");
221 UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, noSpaceCount);
222 UErrorCode status = U_ZERO_ERROR;
223
224 BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status);
225 breakIter->adoptText(noSpaceIter);
226
227 SpaceBreakIterator spaceIter(spaces, spaceCount);
228
229 int32_t nextBreak = 0;
230 int32_t nextSpaceBreak = 0;
231 int32_t iterCount = 0;
232
233 while (true) {
234 nextSpaceBreak = spaceIter.next();
235 nextBreak = breakIter->next();
236
237 if (nextSpaceBreak == BreakIterator::DONE || nextBreak == BreakIterator::DONE) {
238 if (nextBreak != BreakIterator::DONE) {
239 fprintf(stderr, "break iterator didn't end.\n");
240 } else if (nextSpaceBreak != BreakIterator::DONE) {
241 fprintf(stderr, "premature break iterator end.\n");
242 }
243
244 break;
245 }
246
247 while (nextSpaceBreak != nextBreak &&
248 nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) {
249 if (nextSpaceBreak < nextBreak) {
250 breakNotFound(nextSpaceBreak);
251 result = false;
252 nextSpaceBreak = spaceIter.next();
253 } else if (nextSpaceBreak > nextBreak) {
254 foundInvalidBreak(nextBreak);
255 result = false;
256 nextBreak = breakIter->next();
257 }
258 }
259
260 if (fVerbose) {
261 printf("%d %d\n", nextSpaceBreak, nextBreak);
262 }
263 }
264
265
266 fWordCount = spaceIter.getWordCount();
267
268 delete breakIter;
269
270 return result;
271 }
272
273 /*
274 * Report a break that's in the text with spaces but
275 * not found in the text without spaces.
276 */
breakNotFound(int32_t br)277 void ThaiWordbreakTest::breakNotFound(int32_t br)
278 {
279 if (fVerbose) {
280 printf("%d ****\n", br);
281 } else {
282 fprintf(stderr, "break not found: %d\n", br);
283 }
284
285 fBreaksNotFound += 1;
286 }
287
288 /*
289 * Report a break that's found in the text without spaces
290 * that isn't in the text with spaces.
291 */
foundInvalidBreak(int32_t br)292 void ThaiWordbreakTest::foundInvalidBreak(int32_t br)
293 {
294 if (fVerbose) {
295 printf("**** %d\n", br);
296 } else {
297 fprintf(stderr, "found invalid break: %d\n", br);
298 }
299
300 fInvalidBreaks += 1;
301 }
302
303 /*
304 * Read the text from a file. The text must start with a Unicode Byte
305 * Order Mark (BOM) so that we know what order to read the bytes in.
306 */
readFile(char * fileName,int32_t & charCount)307 const UChar *ThaiWordbreakTest::readFile(char *fileName, int32_t &charCount)
308 {
309 FILE *f;
310 int32_t fileSize;
311
312 UChar *buffer;
313 char *bufferChars;
314
315 f = fopen(fileName, "rb");
316
317 if( f == NULL ) {
318 fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno));
319 return 0;
320 }
321
322 fseek(f, 0, SEEK_END);
323 fileSize = ftell(f);
324
325 fseek(f, 0, SEEK_SET);
326 bufferChars = new char[fileSize];
327
328 if(bufferChars == 0) {
329 fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
330 fclose(f);
331 return 0;
332 }
333
334 fread(bufferChars, sizeof(char), fileSize, f);
335 if( ferror(f) ) {
336 fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno));
337 fclose(f);
338 delete[] bufferChars;
339 return 0;
340 }
341 fclose(f);
342
343 UnicodeString myText(bufferChars, fileSize, "UTF-8");
344
345 delete[] bufferChars;
346
347 charCount = myText.length();
348 buffer = new UChar[charCount];
349 if(buffer == 0) {
350 fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
351 return 0;
352 }
353
354 myText.extract(1, myText.length(), buffer);
355 charCount--; // skip the BOM
356 buffer[charCount] = 0; // NULL terminate for easier reading in the debugger
357
358 return buffer;
359 }
360
361 /*
362 * Remove spaces from the input UChar array.
363 *
364 * We check explicitly for a Unicode code value of 0x0020
365 * because Unicode::isSpaceChar returns true for CR, LF, etc.
366 *
367 */
crunchSpaces(const UChar * spaces,int32_t count,int32_t & nonSpaceCount)368 const UChar *ThaiWordbreakTest::crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount)
369 {
370 int32_t i, out, spaceCount;
371
372 spaceCount = 0;
373 for (i = 0; i < count; i += 1) {
374 if (spaces[i] == 0x0020 /*Unicode::isSpaceChar(spaces[i])*/) {
375 spaceCount += 1;
376 }
377 }
378
379 nonSpaceCount = count - spaceCount;
380 UChar *noSpaces = new UChar[nonSpaceCount];
381
382 if (noSpaces == 0) {
383 fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n");
384 return 0;
385 }
386
387 for (out = 0, i = 0; i < count; i += 1) {
388 if (spaces[i] != 0x0020 /*! Unicode::isSpaceChar(spaces[i])*/) {
389 noSpaces[out++] = spaces[i];
390 }
391 }
392
393 return noSpaces;
394 }
395
396 /*
397 * Generate a text file with spaces in it from a file without.
398 */
generateFile(const UChar * chars,int32_t length)399 int generateFile(const UChar *chars, int32_t length) {
400 Locale root("");
401 UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(chars, length);
402 UErrorCode status = U_ZERO_ERROR;
403
404 UnicodeSet complexContext(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);
405 BreakIterator *breakIter = BreakIterator::createWordInstance(root, status);
406 breakIter->adoptText(noSpaceIter);
407 char outbuf[1024];
408 int32_t strlength;
409 UChar bom = 0xFEFF;
410
411 printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &bom, 1, &status));
412 int32_t prevbreak = 0;
413 while (U_SUCCESS(status)) {
414 int32_t nextbreak = breakIter->next();
415 if (nextbreak == BreakIterator::DONE) {
416 break;
417 }
418 printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &chars[prevbreak],
419 nextbreak-prevbreak, &status));
420 if (nextbreak > 0 && complexContext.contains(chars[nextbreak-1])
421 && complexContext.contains(chars[nextbreak])) {
422 printf(" ");
423 }
424 prevbreak = nextbreak;
425 }
426
427 if (U_FAILURE(status)) {
428 fprintf(stderr, "generate failed: %s\n", u_errorName(status));
429 return status;
430 }
431 else {
432 return 0;
433 }
434 }
435
436 /*
437 * The main routine. Read the command line arguments, read the text file,
438 * remove the spaces, do the comparison and report the final results
439 */
main(int argc,char ** argv)440 int main(int argc, char **argv)
441 {
442 char *fileName = "space.txt";
443 int arg = 1;
444 UBool verbose = false;
445 UBool generate = false;
446
447 if (argc >= 2 && strcmp(argv[1], "-generate") == 0) {
448 generate = true;
449 arg += 1;
450 }
451
452 if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) {
453 verbose = true;
454 arg += 1;
455 }
456
457 if (arg == argc - 1) {
458 fileName = argv[arg++];
459 }
460
461 if (arg != argc) {
462 fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]);
463 return 1;
464 }
465
466 int32_t spaceCount, nonSpaceCount;
467 const UChar *spaces, *noSpaces;
468
469 spaces = ThaiWordbreakTest::readFile(fileName, spaceCount);
470
471 if (spaces == 0) {
472 return 1;
473 }
474
475 if (generate) {
476 return generateFile(spaces, spaceCount);
477 }
478
479 noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount);
480
481 if (noSpaces == 0) {
482 return 1;
483 }
484
485 ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose);
486
487 printf("word count: %d\n", test.getWordCount());
488 printf("breaks not found: %d\n", test.getBreaksNotFound());
489 printf("invalid breaks found: %d\n", test.getInvalidBreaks());
490
491 return 0;
492 }
493
494 /*
495 * The main constructor. Clear all the counts and construct a default
496 * word instance of a BreakIterator.
497 */
SpaceBreakIterator(const UChar * text,int32_t count)498 SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count)
499 : fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(false)
500 {
501 UCharCharacterIterator *iter = new UCharCharacterIterator(text, count);
502 UErrorCode status = U_ZERO_ERROR;
503 fComplexContext.applyPattern(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);
504 Locale root("");
505
506 fBreakIter = BreakIterator::createWordInstance(root, status);
507 fBreakIter->adoptText(iter);
508 }
509
SpaceBreakIterator()510 SpaceBreakIterator::SpaceBreakIterator()
511 {
512 // nothing
513 }
514
515 /*
516 * The destructor. delete the underlying BreakIterator
517 */
~SpaceBreakIterator()518 SpaceBreakIterator::~SpaceBreakIterator()
519 {
520 delete fBreakIter;
521 }
522
523 /*
524 * Return the next break, counting words and spaces.
525 */
next()526 int32_t SpaceBreakIterator::next()
527 {
528 if (fDone) {
529 return BreakIterator::DONE;
530 }
531
532 int32_t nextBreak;
533 do {
534 nextBreak = fBreakIter->next();
535
536 if (nextBreak == BreakIterator::DONE) {
537 fDone = true;
538 return BreakIterator::DONE;
539 }
540 }
541 while(nextBreak > 0 && fComplexContext.contains(fText[nextBreak-1])
542 && fComplexContext.contains(fText[nextBreak]));
543
544 int32_t result = nextBreak - fSpaceCount;
545
546 if (nextBreak < fTextCount) {
547 if (fText[nextBreak] == 0x0020 /*Unicode::isSpaceChar(fText[nextBreak])*/) {
548 fSpaceCount += fBreakIter->next() - nextBreak;
549 }
550 }
551
552 fWordCount += 1;
553
554 return result;
555 }
556
557 /*
558 * Returns the current space count
559 */
getSpaceCount()560 int32_t SpaceBreakIterator::getSpaceCount()
561 {
562 return fSpaceCount;
563 }
564
565 /*
566 * Returns the current word count
567 */
getWordCount()568 int32_t SpaceBreakIterator::getWordCount()
569 {
570 return fWordCount;
571 }
572
573
574