1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 * Created on May 5, 2004
5 *
6 * Copyright (C) 2004-2016 International Business Machines Corporation and others.
7 * All Rights Reserved.
8 *
9 */
10 package com.ibm.icu.dev.test.rbbi;
11
12 import java.io.IOException;
13 import java.io.InputStream;
14 import java.io.InputStreamReader;
15 import java.util.Arrays;
16
17 import org.junit.Test;
18 import org.junit.runner.RunWith;
19 import org.junit.runners.JUnit4;
20
21 import com.ibm.icu.dev.test.TestFmwk;
22 import com.ibm.icu.impl.Utility;
23 import com.ibm.icu.lang.UCharacter;
24 import com.ibm.icu.text.BreakIterator;
25 import com.ibm.icu.text.RuleBasedBreakIterator;
26 import com.ibm.icu.util.ULocale;
27
28
29 /**
30 * Rule based break iterator data driven test.
31 * Perform the tests from the file rbbitst.txt.
32 * The test data file is common to both ICU4C and ICU4J.
33 * See the data file for a description of the tests.
34 *
35 */
36 @RunWith(JUnit4.class)
37 public class RBBITestExtended extends TestFmwk {
RBBITestExtended()38 public RBBITestExtended() {
39 }
40
41
42
43 static class TestParams {
44 BreakIterator bi;
45 StringBuilder dataToBreak = new StringBuilder();
46 int[] expectedBreaks = new int[4000];
47 int[] srcLine = new int[4000];
48 int[] srcCol = new int[4000];
49 ULocale currentLocale = new ULocale("en_US");
50 }
51
52
53 @Test
TestExtended()54 public void TestExtended() {
55 TestParams tp = new TestParams();
56
57
58 //
59 // Open and read the test data file.
60 //
61 StringBuilder testFileBuf = new StringBuilder();
62 InputStream is = null;
63 try {
64 is = RBBITestExtended.class.getResourceAsStream("rbbitst.txt");
65 if (is == null) {
66 errln("Could not open test data file rbbitst.txt");
67 return;
68 }
69 InputStreamReader isr = new InputStreamReader(is, "UTF-8");
70 try {
71 int c;
72 int count = 0;
73 for (;;) {
74 c = isr.read();
75 if (c < 0) {
76 break;
77 }
78 count++;
79 if (c == 0xFEFF && count == 1) {
80 // BOM in the test data file. Discard it.
81 continue;
82 }
83
84 testFileBuf.appendCodePoint(c);
85 }
86 } finally {
87 isr.close();
88 }
89 } catch (IOException e) {
90 errln(e.toString());
91 try {
92 is.close();
93 } catch (IOException ignored) {
94 }
95 return;
96 }
97
98 String testString = testFileBuf.toString();
99
100
101 final int PARSE_COMMENT = 1;
102 final int PARSE_TAG = 2;
103 final int PARSE_DATA = 3;
104 final int PARSE_NUM = 4;
105 final int PARSE_RULES = 5;
106
107 int parseState = PARSE_TAG;
108
109 int savedState = PARSE_TAG;
110
111 int lineNum = 1;
112 int colStart = 0;
113 int column = 0;
114 int charIdx = 0;
115 int i;
116
117 int tagValue = 0; // The numeric value of a <nnn> tag.
118
119 StringBuilder rules = new StringBuilder(); // Holds rules from a <rules> ... </rules> block
120 int rulesFirstLine = 0; // Line number of the start of current <rules> block
121
122 int len = testString.length();
123
124 for (charIdx = 0; charIdx < len; ) {
125 int c = testString.codePointAt(charIdx);
126 charIdx++;
127 if (c == '\r' && charIdx<len && testString.charAt(charIdx) == '\n') {
128 // treat CRLF as a unit
129 c = '\n';
130 charIdx++;
131 }
132 if (c == '\n' || c == '\r') {
133 lineNum++;
134 colStart = charIdx;
135 }
136 column = charIdx - colStart + 1;
137
138 switch (parseState) {
139 case PARSE_COMMENT:
140 if (c == 0x0a || c == 0x0d) {
141 parseState = savedState;
142 }
143 break;
144
145 case PARSE_TAG:
146 {
147 if (c == '#') {
148 parseState = PARSE_COMMENT;
149 savedState = PARSE_TAG;
150 break;
151 }
152 if (UCharacter.isWhitespace(c)) {
153 break;
154 }
155 if (testString.startsWith("<word>", charIdx-1)) {
156 tp.bi = BreakIterator.getWordInstance(tp.currentLocale);
157 charIdx += 5;
158 break;
159 }
160 if (testString.startsWith("<char>", charIdx-1)) {
161 tp.bi = BreakIterator.getCharacterInstance(tp.currentLocale);
162 charIdx += 5;
163 break;
164 }
165 if (testString.startsWith("<line>", charIdx-1)) {
166 tp.bi = BreakIterator.getLineInstance(tp.currentLocale);
167 charIdx += 5;
168 break;
169 }
170 if (testString.startsWith("<sent>", charIdx-1)) {
171 tp.bi = BreakIterator.getSentenceInstance(tp.currentLocale);
172 charIdx += 5;
173 break;
174 }
175 if (testString.startsWith("<title>", charIdx-1)) {
176 tp.bi = BreakIterator.getTitleInstance(tp.currentLocale);
177 charIdx += 6;
178 break;
179 }
180 if (testString.startsWith("<rules>", charIdx-1) ||
181 testString.startsWith("<badrules>", charIdx-1)) {
182 charIdx = testString.indexOf('>', charIdx) + 1;
183 parseState = PARSE_RULES;
184 rules.setLength(0);
185 rulesFirstLine = lineNum;
186 break;
187 }
188
189 if (testString.startsWith("<locale ", charIdx-1)) {
190 int closeIndex = testString.indexOf(">", charIdx);
191 if (closeIndex < 0) {
192 errln("line" + lineNum + ": missing close on <locale tag.");
193 break;
194 }
195 String localeName = testString.substring(charIdx+6, closeIndex);
196 localeName = localeName.trim();
197 tp.currentLocale = new ULocale(localeName);
198 charIdx = closeIndex+1;
199 break;
200 }
201 if (testString.startsWith("<data>", charIdx-1)) {
202 parseState = PARSE_DATA;
203 charIdx += 5;
204 tp.dataToBreak.setLength(0);
205 Arrays.fill(tp.expectedBreaks, 0);
206 Arrays.fill(tp.srcCol, 0);
207 Arrays.fill(tp.srcLine, 0);
208 break;
209 }
210
211 errln("line" + lineNum + ": Tag expected in test file.");
212 return;
213 //parseState = PARSE_COMMENT;
214 //savedState = PARSE_DATA;
215 }
216
217 case PARSE_RULES:
218 if (testString.startsWith("</rules>", charIdx-1)) {
219 charIdx += 7;
220 parseState = PARSE_TAG;
221 try {
222 tp.bi = new RuleBasedBreakIterator(rules.toString());
223 } catch (IllegalArgumentException e) {
224 errln(String.format("rbbitst.txt:%d Error creating break iterator from rules. %s", lineNum, e));
225 }
226 } else if (testString.startsWith("</badrules>", charIdx-1)) {
227 charIdx += 10;
228 parseState = PARSE_TAG;
229 boolean goodRules = true;
230 try {
231 new RuleBasedBreakIterator(rules.toString());
232 } catch (IllegalArgumentException e) {
233 goodRules = false;
234 }
235 if (goodRules) {
236 errln(String.format(
237 "rbbitst.txt:%d Expected, but did not get, a failure creating break iterator from rules.",
238 lineNum));
239 }
240 } else {
241 rules.appendCodePoint(c);
242 }
243 break;
244
245 case PARSE_DATA:
246 if (c == '•') {
247 int breakIdx = tp.dataToBreak.length();
248 if (tp.expectedBreaks[breakIdx] != 0) {
249 errln(String.format(
250 "rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
251 lineNum, column));
252 }
253 tp.expectedBreaks[breakIdx] = -1;
254 tp.srcLine[breakIdx] = lineNum;
255 tp.srcCol[breakIdx] = column;
256 break;
257 }
258
259 if (testString.startsWith("</data>", charIdx-1)) {
260 // Add final entry to mappings from break location to source file position.
261 // Need one extra because last break position returned is after the
262 // last char in the data, not at the last char.
263 int idx = tp.dataToBreak.length();
264 tp.srcLine[idx] = lineNum;
265 tp.srcCol[idx] = column;
266
267 parseState = PARSE_TAG;
268 charIdx += 6;
269
270 // RUN THE TEST!
271 executeTest(tp);
272 break;
273 }
274
275 if (testString.startsWith("\\N{", charIdx-1)) {
276 int nameEndIdx = testString.indexOf('}', charIdx);
277 if (nameEndIdx == -1) {
278 errln("Error in named character in test file at line " + lineNum +
279 ", col " + column);
280 }
281 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
282 // Get the code point from the name and insert it into the test data.
283 String charName = testString.substring(charIdx+2, nameEndIdx);
284 c = UCharacter.getCharFromName(charName);
285 if (c == -1) {
286 errln("Error in named character in test file at line " + lineNum +
287 ", col " + column);
288 } else {
289 // Named code point was recognized. Insert it
290 // into the test data.
291 tp.dataToBreak.appendCodePoint(c);
292 for (i = tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
293 tp.srcLine[i] = lineNum;
294 tp.srcCol[i] = column;
295 }
296
297 }
298 if (nameEndIdx > charIdx) {
299 charIdx = nameEndIdx+1;
300 }
301 break;
302 }
303
304 if (testString.startsWith("<>", charIdx-1)) {
305 charIdx++;
306 int breakIdx = tp.dataToBreak.length();
307 tp.expectedBreaks[breakIdx] = -1;
308 tp.srcLine[breakIdx] = lineNum;
309 tp.srcCol[breakIdx] = column;
310 break;
311 }
312
313 if (c == '<') {
314 tagValue = 0;
315 parseState = PARSE_NUM;
316 break;
317 }
318
319 if (c == '#' && column==3) { // TODO: why is column off so far?
320 parseState = PARSE_COMMENT;
321 savedState = PARSE_DATA;
322 break;
323 }
324
325 if (c == '\\') {
326 // Check for \ at end of line, a line continuation.
327 // Advance over (discard) the newline
328 int cp = testString.codePointAt(charIdx);
329 if (cp == '\r' && charIdx<len && testString.codePointAt(charIdx+1) == '\n') {
330 // We have a CR LF
331 // Need an extra increment of the input ptr to move over both of them
332 charIdx++;
333 }
334 if (cp == '\n' || cp == '\r') {
335 lineNum++;
336 column = 0;
337 charIdx++;
338 colStart = charIdx;
339 break;
340 }
341
342 // Let unescape handle the back slash.
343 int charIdxAr[] = new int[1];
344 charIdxAr[0] = charIdx;
345 cp = Utility.unescapeAt(testString, charIdxAr);
346 if (cp != -1) {
347 // Escape sequence was recognized. Insert the char
348 // into the test data.
349 charIdx = charIdxAr[0];
350 tp.dataToBreak.appendCodePoint(cp);
351 for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
352 tp.srcLine[i] = lineNum;
353 tp.srcCol[i] = column;
354 }
355
356 break;
357 }
358
359
360 // Not a recognized backslash escape sequence.
361 // Take the next char as a literal.
362 // TODO: Should this be an error?
363 c = testString.codePointAt(charIdx);
364 charIdx = testString.offsetByCodePoints(charIdx, 1);
365 }
366
367 // Normal, non-escaped data char.
368 tp.dataToBreak.appendCodePoint(c);
369
370 // Save the mapping from offset in the data to line/column numbers in
371 // the original input file. Will be used for better error messages only.
372 // If there's an expected break before this char, the slot in the mapping
373 // vector will already be set for this char; don't overwrite it.
374 for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
375 tp.srcLine[i] = lineNum;
376 tp.srcCol[i] = column;
377 }
378 break;
379
380
381 case PARSE_NUM:
382 // We are parsing an expected numeric tag value, like <1234>,
383 // within a chunk of data.
384 if (UCharacter.isWhitespace(c)) {
385 break;
386 }
387
388 if (c == '>') {
389 // Finished the number. Add the info to the expected break data,
390 // and switch parse state back to doing plain data.
391 parseState = PARSE_DATA;
392 if (tagValue == 0) {
393 tagValue = -1;
394 }
395 int breakIdx = tp.dataToBreak.length();
396 if (tp.expectedBreaks[breakIdx] != 0) {
397 errln(String.format(
398 "rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
399 lineNum, column));
400 }
401 tp.expectedBreaks[breakIdx] = tagValue;
402 tp.srcLine[breakIdx] = lineNum;
403 tp.srcCol[breakIdx] = column;
404 break;
405 }
406
407 if (UCharacter.isDigit(c)) {
408 tagValue = tagValue*10 + UCharacter.digit(c);
409 break;
410 }
411
412 errln(String.format("Syntax Error in rbbitst.txt at line %d, col %d", lineNum, column));
413 return;
414 }
415 }
416
417 // Reached end of test file. Raise an error if parseState indicates that we are
418 // within a block that should have been terminated.
419 if (parseState == PARSE_RULES) {
420 errln(String.format("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
421 lineNum, rulesFirstLine));
422 }
423 if (parseState == PARSE_DATA) {
424 errln(String.format("rbbitst.txt:%d <data> block not closed.", lineNum));
425 }
426 }
427
executeTest(TestParams t)428 void executeTest(TestParams t) {
429 // TODO: also rerun tests with a break iterator re-created from bi.getRules()
430 // and from bi.clone(). If in exhaustive mode only.
431 int bp;
432 int prevBP;
433 int i;
434
435 if (t.bi == null) {
436 return;
437 }
438
439 t.bi.setText(t.dataToBreak.toString());
440 //
441 // Run the iterator forward
442 //
443 prevBP = -1;
444 for (bp = t.bi.first(); bp != BreakIterator.DONE; bp = t.bi.next()) {
445 if (prevBP == bp) {
446 // Fail for lack of forward progress.
447 errln("Forward Iteration, no forward progress. Break Pos=" + bp +
448 " File line,col=" + t.srcLine[bp] + ", " + t.srcCol[bp]);
449 break;
450 }
451
452 // Check that there were we didn't miss an expected break between the last one
453 // and this one.
454 for (i=prevBP+1; i<bp; i++) {
455 if (t.expectedBreaks[i] != 0) {
456 errln("Forward Iteration, break expected, but not found. Pos=" + i +
457 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
458 }
459 }
460
461 // Check that the break we did find was expected
462 if (t.expectedBreaks[bp] == 0) {
463 errln("Forward Iteration, break found, but not expected. Pos=" + bp +
464 " File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
465 } else {
466 // The break was expected.
467 // Check that the {nnn} tag value is correct.
468 int expectedTagVal = t.expectedBreaks[bp];
469 if (expectedTagVal == -1) {
470 expectedTagVal = 0;
471 }
472 int line = t.srcLine[bp];
473 int rs = t.bi.getRuleStatus();
474 if (rs != expectedTagVal) {
475 errln("Incorrect status for forward break. Pos = " + bp +
476 ". File line,col = " + line + ", " + t.srcCol[bp] + "\n" +
477 " Actual, Expected status = " + rs + ", " + expectedTagVal);
478 }
479 int[] fillInArray = new int[4];
480 int numStatusVals = t.bi.getRuleStatusVec(fillInArray);
481 assertTrue("", numStatusVals >= 1);
482 assertEquals("", expectedTagVal, fillInArray[0]);
483 }
484
485
486 prevBP = bp;
487 }
488
489 // Verify that there were no missed expected breaks after the last one found
490 for (i=prevBP+1; i<t.dataToBreak.length()+1; i++) {
491 if (t.expectedBreaks[i] != 0) {
492 errln("Forward Iteration, break expected, but not found. Pos=" + i +
493 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
494 }
495 }
496
497
498 //
499 // Run the iterator backwards, verify that the same breaks are found.
500 //
501 prevBP = t.dataToBreak.length()+2; // start with a phony value for the last break pos seen.
502 for (bp = t.bi.last(); bp != BreakIterator.DONE; bp = t.bi.previous()) {
503 if (prevBP == bp) {
504 // Fail for lack of progress.
505 errln("Reverse Iteration, no progress. Break Pos=" + bp +
506 "File line,col=" + t.srcLine[bp] + " " + t.srcCol[bp]);
507 break;
508 }
509
510 // Check that we didn't miss an expected break between the last one
511 // and this one. (UVector returns zeros for index out of bounds.)
512 for (i=prevBP-1; i>bp; i--) {
513 if (t.expectedBreaks[i] != 0) {
514 errln("Reverse Itertion, break expected, but not found. Pos=" + i +
515 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
516 }
517 }
518
519 // Check that the break we did find was expected
520 if (t.expectedBreaks[bp] == 0) {
521 errln("Reverse Itertion, break found, but not expected. Pos=" + bp +
522 " File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
523 } else {
524 // The break was expected.
525 // Check that the {nnn} tag value is correct.
526 int expectedTagVal = t.expectedBreaks[bp];
527 if (expectedTagVal == -1) {
528 expectedTagVal = 0;
529 }
530 int line = t.srcLine[bp];
531 int rs = t.bi.getRuleStatus();
532 if (rs != expectedTagVal) {
533 errln("Incorrect status for reverse break. Pos = " + bp +
534 " File line,col= " + line + ", " + t.srcCol[bp] + "\n" +
535 " Actual, Expected status = " + rs + ", " + expectedTagVal);
536 }
537 }
538
539 prevBP = bp;
540 }
541
542 // Verify that there were no missed breaks prior to the last one found
543 for (i=prevBP-1; i>=0; i--) {
544 if (t.expectedBreaks[i] != 0) {
545 errln("Reverse Itertion, break expected, but not found. Pos=" + i +
546 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
547 }
548 }
549 // Check isBoundary()
550 for (i=0; i<=t.dataToBreak.length(); i++) {
551 boolean boundaryExpected = (t.expectedBreaks[i] != 0);
552 boolean boundaryFound = t.bi.isBoundary(i);
553 if (boundaryExpected != boundaryFound) {
554 errln("isBoundary(" + i + ") incorrect.\n" +
555 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
556 " Expected, Actual= " + boundaryExpected + ", " + boundaryFound);
557 }
558 }
559
560 // Check following()
561 for (i=0; i<=t.dataToBreak.length(); i++) {
562 int actualBreak = t.bi.following(i);
563 int expectedBreak = BreakIterator.DONE;
564 for (int j=i+1; j < t.expectedBreaks.length; j++) {
565 if (t.expectedBreaks[j] != 0) {
566 expectedBreak = j;
567 break;
568 }
569 }
570 if (expectedBreak != actualBreak) {
571 errln("following(" + i + ") incorrect.\n" +
572 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
573 " Expected, Actual= " + expectedBreak + ", " + actualBreak);
574 }
575 }
576
577 // Check preceding()
578 for (i=t.dataToBreak.length(); i>=0; i--) {
579 int actualBreak = t.bi.preceding(i);
580 int expectedBreak = BreakIterator.DONE;
581
582 for (int j=i-1; j >= 0; j--) {
583 if (t.expectedBreaks[j] != 0) {
584 expectedBreak = j;
585 break;
586 }
587 }
588 if (expectedBreak != actualBreak) {
589 errln("preceding(" + i + ") incorrect.\n" +
590 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
591 " Expected, Actual= " + expectedBreak + ", " + actualBreak);
592 }
593 }
594
595 }
596
597
598
599
600 }
601