1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 * Created on May 5, 2004
5 *
6 * Copyright (C) 2004-2016 International Business Machines Corporation and others.
7 * All Rights Reserved.
8 *
9 */
10 package com.ibm.icu.dev.test.rbbi;
11
12 import java.io.IOException;
13 import java.io.InputStream;
14 import java.io.InputStreamReader;
15 import java.util.Arrays;
16
17 import org.junit.Test;
18 import org.junit.runner.RunWith;
19 import org.junit.runners.JUnit4;
20
21 import com.ibm.icu.dev.test.TestFmwk;
22 import com.ibm.icu.dev.test.TestUtil;
23 import com.ibm.icu.impl.Utility;
24 import com.ibm.icu.lang.UCharacter;
25 import com.ibm.icu.text.BreakIterator;
26 import com.ibm.icu.text.RuleBasedBreakIterator;
27 import com.ibm.icu.util.ULocale;
28
29
30 /**
31 * Rule based break iterator data driven test.
32 * Perform the tests from the file rbbitst.txt.
33 * The test data file is common to both ICU4C and ICU4J.
34 * See the data file for a description of the tests.
35 *
36 */
37 @RunWith(JUnit4.class)
38 public class RBBITestExtended extends TestFmwk {
RBBITestExtended()39 public RBBITestExtended() {
40 }
41
42
43
44 static class TestParams {
45 BreakIterator bi;
46 StringBuilder dataToBreak = new StringBuilder();
47 int[] expectedBreaks = new int[4000];
48 int[] srcLine = new int[4000];
49 int[] srcCol = new int[4000];
50 ULocale currentLocale = new ULocale("en_US");
51 }
52
53
54 @Test
TestExtended()55 public void TestExtended() {
56 // The expectations in this test heavily depends on the Thai dictionary.
57 // Therefore, we skip this test under the LSTM configuration.
58 org.junit.Assume.assumeTrue(!TestUtil.skipDictionaryTest());
59 TestParams tp = new TestParams();
60
61
62 //
63 // Open and read the test data file.
64 //
65 StringBuilder testFileBuf = new StringBuilder();
66 InputStream is = null;
67 try {
68 is = RBBITestExtended.class.getResourceAsStream("rbbitst.txt");
69 if (is == null) {
70 errln("Could not open test data file rbbitst.txt");
71 return;
72 }
73 InputStreamReader isr = new InputStreamReader(is, "UTF-8");
74 try {
75 int c;
76 int count = 0;
77 for (;;) {
78 c = isr.read();
79 if (c < 0) {
80 break;
81 }
82 count++;
83 if (c == 0xFEFF && count == 1) {
84 // BOM in the test data file. Discard it.
85 continue;
86 }
87
88 testFileBuf.appendCodePoint(c);
89 }
90 } finally {
91 isr.close();
92 }
93 } catch (IOException e) {
94 errln(e.toString());
95 try {
96 is.close();
97 } catch (IOException ignored) {
98 }
99 return;
100 }
101
102 String testString = testFileBuf.toString();
103
104
105 final int PARSE_COMMENT = 1;
106 final int PARSE_TAG = 2;
107 final int PARSE_DATA = 3;
108 final int PARSE_NUM = 4;
109 final int PARSE_RULES = 5;
110
111 int parseState = PARSE_TAG;
112
113 int savedState = PARSE_TAG;
114
115 int lineNum = 1;
116 int colStart = 0;
117 int column = 0;
118 int charIdx = 0;
119 int i;
120
121 int tagValue = 0; // The numeric value of a <nnn> tag.
122
123 StringBuilder rules = new StringBuilder(); // Holds rules from a <rules> ... </rules> block
124 int rulesFirstLine = 0; // Line number of the start of current <rules> block
125
126 int len = testString.length();
127
128 for (charIdx = 0; charIdx < len; ) {
129 int c = testString.codePointAt(charIdx);
130 charIdx++;
131 if (c == '\r' && charIdx<len && testString.charAt(charIdx) == '\n') {
132 // treat CRLF as a unit
133 c = '\n';
134 charIdx++;
135 }
136 if (c == '\n' || c == '\r') {
137 lineNum++;
138 colStart = charIdx;
139 }
140 column = charIdx - colStart + 1;
141
142 switch (parseState) {
143 case PARSE_COMMENT:
144 if (c == 0x0a || c == 0x0d) {
145 parseState = savedState;
146 }
147 break;
148
149 case PARSE_TAG:
150 {
151 if (c == '#') {
152 parseState = PARSE_COMMENT;
153 savedState = PARSE_TAG;
154 break;
155 }
156 if (UCharacter.isWhitespace(c)) {
157 break;
158 }
159 if (testString.startsWith("<word>", charIdx-1)) {
160 tp.bi = BreakIterator.getWordInstance(tp.currentLocale);
161 charIdx += 5;
162 break;
163 }
164 if (testString.startsWith("<char>", charIdx-1)) {
165 tp.bi = BreakIterator.getCharacterInstance(tp.currentLocale);
166 charIdx += 5;
167 break;
168 }
169 if (testString.startsWith("<line>", charIdx-1)) {
170 tp.bi = BreakIterator.getLineInstance(tp.currentLocale);
171 charIdx += 5;
172 break;
173 }
174 if (testString.startsWith("<sent>", charIdx-1)) {
175 tp.bi = BreakIterator.getSentenceInstance(tp.currentLocale);
176 charIdx += 5;
177 break;
178 }
179 if (testString.startsWith("<title>", charIdx-1)) {
180 tp.bi = BreakIterator.getTitleInstance(tp.currentLocale);
181 charIdx += 6;
182 break;
183 }
184 if (testString.startsWith("<rules>", charIdx-1) ||
185 testString.startsWith("<badrules>", charIdx-1)) {
186 charIdx = testString.indexOf('>', charIdx) + 1;
187 parseState = PARSE_RULES;
188 rules.setLength(0);
189 rulesFirstLine = lineNum;
190 break;
191 }
192
193 if (testString.startsWith("<locale ", charIdx-1)) {
194 int closeIndex = testString.indexOf(">", charIdx);
195 if (closeIndex < 0) {
196 errln("line" + lineNum + ": missing close on <locale tag.");
197 break;
198 }
199 String localeName = testString.substring(charIdx+6, closeIndex);
200 localeName = localeName.trim();
201 tp.currentLocale = new ULocale(localeName);
202 charIdx = closeIndex+1;
203 break;
204 }
205 if (testString.startsWith("<data>", charIdx-1)) {
206 parseState = PARSE_DATA;
207 charIdx += 5;
208 tp.dataToBreak.setLength(0);
209 Arrays.fill(tp.expectedBreaks, 0);
210 Arrays.fill(tp.srcCol, 0);
211 Arrays.fill(tp.srcLine, 0);
212 break;
213 }
214
215 errln("line" + lineNum + ": Tag expected in test file.");
216 return;
217 //parseState = PARSE_COMMENT;
218 //savedState = PARSE_DATA;
219 }
220
221 case PARSE_RULES:
222 if (testString.startsWith("</rules>", charIdx-1)) {
223 charIdx += 7;
224 parseState = PARSE_TAG;
225 try {
226 tp.bi = new RuleBasedBreakIterator(rules.toString());
227 } catch (IllegalArgumentException e) {
228 errln(String.format("rbbitst.txt:%d Error creating break iterator from rules. %s", lineNum, e));
229 }
230 } else if (testString.startsWith("</badrules>", charIdx-1)) {
231 charIdx += 10;
232 parseState = PARSE_TAG;
233 boolean goodRules = true;
234 try {
235 new RuleBasedBreakIterator(rules.toString());
236 } catch (IllegalArgumentException e) {
237 goodRules = false;
238 }
239 if (goodRules) {
240 errln(String.format(
241 "rbbitst.txt:%d Expected, but did not get, a failure creating break iterator from rules.",
242 lineNum));
243 }
244 } else {
245 rules.appendCodePoint(c);
246 }
247 break;
248
249 case PARSE_DATA:
250 if (c == '•') {
251 int breakIdx = tp.dataToBreak.length();
252 if (tp.expectedBreaks[breakIdx] != 0) {
253 errln(String.format(
254 "rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
255 lineNum, column));
256 }
257 tp.expectedBreaks[breakIdx] = -1;
258 tp.srcLine[breakIdx] = lineNum;
259 tp.srcCol[breakIdx] = column;
260 break;
261 }
262
263 if (testString.startsWith("</data>", charIdx-1)) {
264 // Add final entry to mappings from break location to source file position.
265 // Need one extra because last break position returned is after the
266 // last char in the data, not at the last char.
267 int idx = tp.dataToBreak.length();
268 tp.srcLine[idx] = lineNum;
269 tp.srcCol[idx] = column;
270
271 parseState = PARSE_TAG;
272 charIdx += 6;
273
274 // RUN THE TEST!
275 executeTest(tp);
276 break;
277 }
278
279 if (testString.startsWith("\\N{", charIdx-1)) {
280 int nameEndIdx = testString.indexOf('}', charIdx);
281 if (nameEndIdx == -1) {
282 errln("Error in named character in test file at line " + lineNum +
283 ", col " + column);
284 }
285 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
286 // Get the code point from the name and insert it into the test data.
287 String charName = testString.substring(charIdx+2, nameEndIdx);
288 c = UCharacter.getCharFromName(charName);
289 if (c == -1) {
290 errln("Error in named character in test file at line " + lineNum +
291 ", col " + column);
292 } else {
293 // Named code point was recognized. Insert it
294 // into the test data.
295 tp.dataToBreak.appendCodePoint(c);
296 for (i = tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
297 tp.srcLine[i] = lineNum;
298 tp.srcCol[i] = column;
299 }
300
301 }
302 if (nameEndIdx > charIdx) {
303 charIdx = nameEndIdx+1;
304 }
305 break;
306 }
307
308 if (testString.startsWith("<>", charIdx-1)) {
309 charIdx++;
310 int breakIdx = tp.dataToBreak.length();
311 tp.expectedBreaks[breakIdx] = -1;
312 tp.srcLine[breakIdx] = lineNum;
313 tp.srcCol[breakIdx] = column;
314 break;
315 }
316
317 if (c == '<') {
318 tagValue = 0;
319 parseState = PARSE_NUM;
320 break;
321 }
322
323 if (c == '#' && column==3) { // TODO: why is column off so far?
324 parseState = PARSE_COMMENT;
325 savedState = PARSE_DATA;
326 break;
327 }
328
329 if (c == '\\') {
330 // Check for \ at end of line, a line continuation.
331 // Advance over (discard) the newline
332 int cp = testString.codePointAt(charIdx);
333 if (cp == '\r' && charIdx<len && testString.codePointAt(charIdx+1) == '\n') {
334 // We have a CR LF
335 // Need an extra increment of the input ptr to move over both of them
336 charIdx++;
337 }
338 if (cp == '\n' || cp == '\r') {
339 lineNum++;
340 column = 0;
341 charIdx++;
342 colStart = charIdx;
343 break;
344 }
345
346 // Let unescape handle the back slash.
347 int cpAndLength = Utility.unescapeAndLengthAt(testString, charIdx);
348 if (cpAndLength >= 0) {
349 // Escape sequence was recognized. Insert the char
350 // into the test data.
351 charIdx += Utility.lengthFromCodePointAndLength(cpAndLength);
352 tp.dataToBreak.appendCodePoint(Utility.cpFromCodePointAndLength(cpAndLength));
353 for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
354 tp.srcLine[i] = lineNum;
355 tp.srcCol[i] = column;
356 }
357
358 break;
359 }
360
361
362 // Not a recognized backslash escape sequence.
363 // Take the next char as a literal.
364 // TODO: Should this be an error?
365 c = testString.codePointAt(charIdx);
366 charIdx = testString.offsetByCodePoints(charIdx, 1);
367 }
368
369 // Normal, non-escaped data char.
370 tp.dataToBreak.appendCodePoint(c);
371
372 // Save the mapping from offset in the data to line/column numbers in
373 // the original input file. Will be used for better error messages only.
374 // If there's an expected break before this char, the slot in the mapping
375 // vector will already be set for this char; don't overwrite it.
376 for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
377 tp.srcLine[i] = lineNum;
378 tp.srcCol[i] = column;
379 }
380 break;
381
382
383 case PARSE_NUM:
384 // We are parsing an expected numeric tag value, like <1234>,
385 // within a chunk of data.
386 if (UCharacter.isWhitespace(c)) {
387 break;
388 }
389
390 if (c == '>') {
391 // Finished the number. Add the info to the expected break data,
392 // and switch parse state back to doing plain data.
393 parseState = PARSE_DATA;
394 if (tagValue == 0) {
395 tagValue = -1;
396 }
397 int breakIdx = tp.dataToBreak.length();
398 if (tp.expectedBreaks[breakIdx] != 0) {
399 errln(String.format(
400 "rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
401 lineNum, column));
402 }
403 tp.expectedBreaks[breakIdx] = tagValue;
404 tp.srcLine[breakIdx] = lineNum;
405 tp.srcCol[breakIdx] = column;
406 break;
407 }
408
409 if (UCharacter.isDigit(c)) {
410 tagValue = tagValue*10 + UCharacter.digit(c);
411 break;
412 }
413
414 errln(String.format("Syntax Error in rbbitst.txt at line %d, col %d", lineNum, column));
415 return;
416 }
417 }
418
419 // Reached end of test file. Raise an error if parseState indicates that we are
420 // within a block that should have been terminated.
421 if (parseState == PARSE_RULES) {
422 errln(String.format("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
423 lineNum, rulesFirstLine));
424 }
425 if (parseState == PARSE_DATA) {
426 errln(String.format("rbbitst.txt:%d <data> block not closed.", lineNum));
427 }
428 }
429
executeTest(TestParams t)430 void executeTest(TestParams t) {
431 // TODO: also rerun tests with a break iterator re-created from bi.getRules()
432 // and from bi.clone(). If in exhaustive mode only.
433 int bp;
434 int prevBP;
435 int i;
436
437 if (t.bi == null) {
438 return;
439 }
440
441 t.bi.setText(t.dataToBreak.toString());
442 //
443 // Run the iterator forward
444 //
445 prevBP = -1;
446 for (bp = t.bi.first(); bp != BreakIterator.DONE; bp = t.bi.next()) {
447 if (prevBP == bp) {
448 // Fail for lack of forward progress.
449 errln("Forward Iteration, no forward progress. Break Pos=" + bp +
450 " File line,col=" + t.srcLine[bp] + ", " + t.srcCol[bp]);
451 break;
452 }
453
454 // Check that there were we didn't miss an expected break between the last one
455 // and this one.
456 for (i=prevBP+1; i<bp; i++) {
457 if (t.expectedBreaks[i] != 0) {
458 errln("Forward Iteration, break expected, but not found. Pos=" + i +
459 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
460 }
461 }
462
463 // Check that the break we did find was expected
464 if (t.expectedBreaks[bp] == 0) {
465 errln("Forward Iteration, break found, but not expected. Pos=" + bp +
466 " File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
467 } else {
468 // The break was expected.
469 // Check that the {nnn} tag value is correct.
470 int expectedTagVal = t.expectedBreaks[bp];
471 if (expectedTagVal == -1) {
472 expectedTagVal = 0;
473 }
474 int line = t.srcLine[bp];
475 int rs = t.bi.getRuleStatus();
476 if (rs != expectedTagVal) {
477 errln("Incorrect status for forward break. Pos = " + bp +
478 ". File line,col = " + line + ", " + t.srcCol[bp] + "\n" +
479 " Actual, Expected status = " + rs + ", " + expectedTagVal);
480 }
481 int[] fillInArray = new int[4];
482 int numStatusVals = t.bi.getRuleStatusVec(fillInArray);
483 assertTrue("", numStatusVals >= 1);
484 assertEquals("", expectedTagVal, fillInArray[0]);
485 }
486
487
488 prevBP = bp;
489 }
490
491 // Verify that there were no missed expected breaks after the last one found
492 for (i=prevBP+1; i<t.dataToBreak.length()+1; i++) {
493 if (t.expectedBreaks[i] != 0) {
494 errln("Forward Iteration, break expected, but not found. Pos=" + i +
495 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
496 }
497 }
498
499
500 //
501 // Run the iterator backwards, verify that the same breaks are found.
502 //
503 prevBP = t.dataToBreak.length()+2; // start with a phony value for the last break pos seen.
504 for (bp = t.bi.last(); bp != BreakIterator.DONE; bp = t.bi.previous()) {
505 if (prevBP == bp) {
506 // Fail for lack of progress.
507 errln("Reverse Iteration, no progress. Break Pos=" + bp +
508 "File line,col=" + t.srcLine[bp] + " " + t.srcCol[bp]);
509 break;
510 }
511
512 // Check that we didn't miss an expected break between the last one
513 // and this one. (UVector returns zeros for index out of bounds.)
514 for (i=prevBP-1; i>bp; i--) {
515 if (t.expectedBreaks[i] != 0) {
516 errln("Reverse Itertion, break expected, but not found. Pos=" + i +
517 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
518 }
519 }
520
521 // Check that the break we did find was expected
522 if (t.expectedBreaks[bp] == 0) {
523 errln("Reverse Itertion, break found, but not expected. Pos=" + bp +
524 " File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
525 } else {
526 // The break was expected.
527 // Check that the {nnn} tag value is correct.
528 int expectedTagVal = t.expectedBreaks[bp];
529 if (expectedTagVal == -1) {
530 expectedTagVal = 0;
531 }
532 int line = t.srcLine[bp];
533 int rs = t.bi.getRuleStatus();
534 if (rs != expectedTagVal) {
535 errln("Incorrect status for reverse break. Pos = " + bp +
536 " File line,col= " + line + ", " + t.srcCol[bp] + "\n" +
537 " Actual, Expected status = " + rs + ", " + expectedTagVal);
538 }
539 }
540
541 prevBP = bp;
542 }
543
544 // Verify that there were no missed breaks prior to the last one found
545 for (i=prevBP-1; i>=0; i--) {
546 if (t.expectedBreaks[i] != 0) {
547 errln("Reverse Itertion, break expected, but not found. Pos=" + i +
548 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
549 }
550 }
551 // Check isBoundary()
552 for (i=0; i<=t.dataToBreak.length(); i++) {
553 boolean boundaryExpected = (t.expectedBreaks[i] != 0);
554 boolean boundaryFound = t.bi.isBoundary(i);
555 if (boundaryExpected != boundaryFound) {
556 errln("isBoundary(" + i + ") incorrect.\n" +
557 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
558 " Expected, Actual= " + boundaryExpected + ", " + boundaryFound);
559 }
560 }
561
562 // Check following()
563 for (i=0; i<=t.dataToBreak.length(); i++) {
564 int actualBreak = t.bi.following(i);
565 int expectedBreak = BreakIterator.DONE;
566 for (int j=i+1; j < t.expectedBreaks.length; j++) {
567 if (t.expectedBreaks[j] != 0) {
568 expectedBreak = j;
569 break;
570 }
571 }
572 if (expectedBreak != actualBreak) {
573 errln("following(" + i + ") incorrect.\n" +
574 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
575 " Expected, Actual= " + expectedBreak + ", " + actualBreak);
576 }
577 }
578
579 // Check preceding()
580 for (i=t.dataToBreak.length(); i>=0; i--) {
581 int actualBreak = t.bi.preceding(i);
582 int expectedBreak = BreakIterator.DONE;
583
584 for (int j=i-1; j >= 0; j--) {
585 if (t.expectedBreaks[j] != 0) {
586 expectedBreak = j;
587 break;
588 }
589 }
590 if (expectedBreak != actualBreak) {
591 errln("preceding(" + i + ") incorrect.\n" +
592 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
593 " Expected, Actual= " + expectedBreak + ", " + actualBreak);
594 }
595 }
596
597 }
598
599
600
601
602 }
603