1 /*
2 * Created on May 5, 2004
3 *
4 * Copyright (C) 2004-2015 International Business Machines Corporation and others.
5 * All Rights Reserved.
6 *
7 */
8 package com.ibm.icu.dev.test.rbbi;
9
10 import java.io.IOException;
11 import java.io.InputStream;
12 import java.io.InputStreamReader;
13 import java.util.Arrays;
14
15 import com.ibm.icu.dev.test.TestFmwk;
16 import com.ibm.icu.impl.Utility;
17 import com.ibm.icu.lang.UCharacter;
18 import com.ibm.icu.text.BreakIterator;
19 import com.ibm.icu.text.UTF16;
20 import com.ibm.icu.util.ULocale;
21
22
23 /**
24 * Rule based break iterator data driven test.
25 * Perform the tests from the file rbbitst.txt.
26 * The test data file is common to both ICU4C and ICU4J.
27 * See the data file for a description of the tests.
28 *
29 */
30 public class RBBITestExtended extends TestFmwk {
31
main(String[] args)32 public static void main(String[] args)throws Exception {
33 new RBBITestExtended().run(args);
34 }
35
36
/**
 * Creates the test object. There is nothing to initialize here; the state
 * for a test run is created inside TestExtended().
 */
public RBBITestExtended() {
    super();
}
39
40
41
42 static class TestParams {
43 BreakIterator bi;
44 StringBuffer dataToBreak = new StringBuffer();
45 int[] expectedBreaks = new int[1000];
46 int[] srcLine = new int[1000];
47 int[] srcCol = new int[1000];
48 ULocale currentLocale = new ULocale("en_US");
49 }
50
51
TestExtended()52 public void TestExtended() {
53 TestParams tp = new TestParams();
54
55
56 //
57 // Open and read the test data file.
58 //
59 StringBuffer testFileBuf = new StringBuffer();
60 InputStream is = null;
61 try {
62 is = RBBITestExtended.class.getResourceAsStream("rbbitst.txt");
63 if (is == null) {
64 errln("Could not open test data file rbbitst.txt");
65 return;
66 }
67 InputStreamReader isr = new InputStreamReader(is, "UTF-8");
68 try {
69 int c;
70 int count = 0;
71 for (;;) {
72 c = isr.read();
73 if (c < 0) {
74 break;
75 }
76 count++;
77 if (c == 0xFEFF && count == 1) {
78 // BOM in the test data file. Discard it.
79 continue;
80 }
81
82 UTF16.append(testFileBuf, c);
83 }
84 } finally {
85 isr.close();
86 }
87 } catch (IOException e) {
88 errln(e.toString());
89 try {
90 is.close();
91 } catch (IOException ignored) {
92 }
93 return;
94 }
95
96 String testString = testFileBuf.toString();
97
98
99 final int PARSE_COMMENT = 1;
100 final int PARSE_TAG = 2;
101 final int PARSE_DATA = 3;
102 final int PARSE_NUM = 4;
103
104 int parseState = PARSE_TAG;
105
106 int savedState = PARSE_TAG;
107
108 final char CH_LF = 0x0a;
109 final char CH_CR = 0x0d;
110 final char CH_HASH = 0x23;
111 /*static const UChar CH_PERIOD = 0x2e;*/
112 final char CH_LT = 0x3c;
113 final char CH_GT = 0x3e;
114 final char CH_BACKSLASH = 0x5c;
115 final char CH_BULLET = 0x2022;
116
117 int lineNum = 1;
118 int colStart = 0;
119 int column = 0;
120 int charIdx = 0;
121 int i;
122
123 int tagValue = 0; // The numeric value of a <nnn> tag.
124 int len = testString.length();
125
126 for (charIdx = 0; charIdx < len; ) {
127 int c = UTF16.charAt(testString, charIdx);
128 charIdx++;
129 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
130 // treat CRLF as a unit
131 c = CH_LF;
132 charIdx++;
133 }
134 if (c == CH_LF || c == CH_CR) {
135 lineNum++;
136 colStart = charIdx;
137 }
138 column = charIdx - colStart + 1;
139
140 switch (parseState) {
141 case PARSE_COMMENT:
142 if (c == 0x0a || c == 0x0d) {
143 parseState = savedState;
144 }
145 break;
146
147 case PARSE_TAG:
148 {
149 if (c == CH_HASH) {
150 parseState = PARSE_COMMENT;
151 savedState = PARSE_TAG;
152 break;
153 }
154 if (UCharacter.isWhitespace(c)) {
155 break;
156 }
157 if (testString.startsWith("<word>", charIdx-1)) {
158 tp.bi = BreakIterator.getWordInstance(tp.currentLocale);
159 charIdx += 5;
160 break;
161 }
162 if (testString.startsWith("<char>", charIdx-1)) {
163 tp.bi = BreakIterator.getCharacterInstance(tp.currentLocale);
164 charIdx += 5;
165 break;
166 }
167 if (testString.startsWith("<line>", charIdx-1)) {
168 tp.bi = BreakIterator.getLineInstance(tp.currentLocale);
169 charIdx += 5;
170 break;
171 }
172 if (testString.startsWith("<sent>", charIdx-1)) {
173 tp.bi = BreakIterator.getSentenceInstance(tp.currentLocale);
174 charIdx += 5;
175 break;
176 }
177 if (testString.startsWith("<title>", charIdx-1)) {
178 tp.bi = BreakIterator.getTitleInstance(tp.currentLocale);
179 charIdx += 6;
180 break;
181 }
182 if (testString.startsWith("<locale ", charIdx-1)) {
183 int closeIndex = testString.indexOf(">", charIdx);
184 if (closeIndex < 0) {
185 errln("line" + lineNum + ": missing close on <locale tag.");
186 break;
187 }
188 String localeName = testString.substring(charIdx+6, closeIndex);
189 localeName = localeName.trim();
190 tp.currentLocale = new ULocale(localeName);
191 charIdx = closeIndex+1;
192 break;
193 }
194 if (testString.startsWith("<data>", charIdx-1)) {
195 parseState = PARSE_DATA;
196 charIdx += 5;
197 tp.dataToBreak.setLength(0);
198 Arrays.fill(tp.expectedBreaks, 0);
199 Arrays.fill(tp.srcCol, 0);
200 Arrays.fill(tp.srcLine, 0);
201 break;
202 }
203
204 errln("line" + lineNum + ": Tag expected in test file.");
205 return;
206 //parseState = PARSE_COMMENT;
207 //savedState = PARSE_DATA;
208 }
209
210 case PARSE_DATA:
211 if (c == CH_BULLET) {
212 int breakIdx = tp.dataToBreak.length();
213 tp.expectedBreaks[breakIdx] = -1;
214 tp.srcLine[breakIdx] = lineNum;
215 tp.srcCol[breakIdx] = column;
216 break;
217 }
218
219 if (testString.startsWith("</data>", charIdx-1)) {
220 // Add final entry to mappings from break location to source file position.
221 // Need one extra because last break position returned is after the
222 // last char in the data, not at the last char.
223 int idx = tp.dataToBreak.length();
224 tp.srcLine[idx] = lineNum;
225 tp.srcCol[idx] = column;
226
227 parseState = PARSE_TAG;
228 charIdx += 6;
229
230 // RUN THE TEST!
231 executeTest(tp);
232 break;
233 }
234
235 if (testString.startsWith("\\N{", charIdx-1)) {
236 int nameEndIdx = testString.indexOf('}', charIdx);
237 if (nameEndIdx == -1) {
238 errln("Error in named character in test file at line " + lineNum +
239 ", col " + column);
240 }
241 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
242 // Get the code point from the name and insert it into the test data.
243 String charName = testString.substring(charIdx+2, nameEndIdx);
244 c = UCharacter.getCharFromName(charName);
245 if (c == -1) {
246 errln("Error in named character in test file at line " + lineNum +
247 ", col " + column);
248 } else {
249 // Named code point was recognized. Insert it
250 // into the test data.
251 UTF16.append(tp.dataToBreak, c);
252 for (i = tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
253 tp.srcLine[i] = lineNum;
254 tp.srcCol[i] = column;
255 }
256
257 }
258 if (nameEndIdx > charIdx) {
259 charIdx = nameEndIdx+1;
260 }
261 break;
262 }
263
264 if (testString.startsWith("<>", charIdx-1)) {
265 charIdx++;
266 int breakIdx = tp.dataToBreak.length();
267 tp.expectedBreaks[breakIdx] = -1;
268 tp.srcLine[breakIdx] = lineNum;
269 tp.srcCol[breakIdx] = column;
270 break;
271 }
272
273 if (c == CH_LT) {
274 tagValue = 0;
275 parseState = PARSE_NUM;
276 break;
277 }
278
279 if (c == CH_HASH && column==3) { // TODO: why is column off so far?
280 parseState = PARSE_COMMENT;
281 savedState = PARSE_DATA;
282 break;
283 }
284
285 if (c == CH_BACKSLASH) {
286 // Check for \ at end of line, a line continuation.
287 // Advance over (discard) the newline
288 int cp = UTF16.charAt(testString, charIdx);
289 if (cp == CH_CR && charIdx<len && UTF16.charAt(testString, charIdx+1) == CH_LF) {
290 // We have a CR LF
291 // Need an extra increment of the input ptr to move over both of them
292 charIdx++;
293 }
294 if (cp == CH_LF || cp == CH_CR) {
295 lineNum++;
296 column = 0;
297 charIdx++;
298 colStart = charIdx;
299 break;
300 }
301
302 // Let unescape handle the back slash.
303 int charIdxAr[] = new int[1];
304 charIdxAr[0] = charIdx;
305 cp = Utility.unescapeAt(testString, charIdxAr);
306 if (cp != -1) {
307 // Escape sequence was recognized. Insert the char
308 // into the test data.
309 charIdx = charIdxAr[0];
310 UTF16.append(tp.dataToBreak, cp);
311 for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
312 tp.srcLine[i] = lineNum;
313 tp.srcCol[i] = column;
314 }
315
316 break;
317 }
318
319
320 // Not a recognized backslash escape sequence.
321 // Take the next char as a literal.
322 // TODO: Should this be an error?
323 c = UTF16.charAt(testString,charIdx);
324 charIdx = UTF16.moveCodePointOffset(testString, charIdx, 1);
325 }
326
327 // Normal, non-escaped data char.
328 UTF16.append(tp.dataToBreak, c);
329
330 // Save the mapping from offset in the data to line/column numbers in
331 // the original input file. Will be used for better error messages only.
332 // If there's an expected break before this char, the slot in the mapping
333 // vector will already be set for this char; don't overwrite it.
334 for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
335 tp.srcLine[i] = lineNum;
336 tp.srcCol[i] = column;
337 }
338 break;
339
340
341 case PARSE_NUM:
342 // We are parsing an expected numeric tag value, like <1234>,
343 // within a chunk of data.
344 if (UCharacter.isWhitespace(c)) {
345 break;
346 }
347
348 if (c == CH_GT) {
349 // Finished the number. Add the info to the expected break data,
350 // and switch parse state back to doing plain data.
351 parseState = PARSE_DATA;
352 if (tagValue == 0) {
353 tagValue = -1;
354 }
355 int breakIdx = tp.dataToBreak.length();
356 tp.expectedBreaks[breakIdx] = tagValue;
357 tp.srcLine[breakIdx] = lineNum;
358 tp.srcCol[breakIdx] = column;
359 break;
360 }
361
362 if (UCharacter.isDigit(c)) {
363 tagValue = tagValue*10 + UCharacter.digit(c);
364 break;
365 }
366
367 errln("Syntax Error in test file at line "+ lineNum +", col %d" + column);
368 return;
369
370 // parseState = PARSE_COMMENT; // TODO: unreachable. Don't stop on errors.
371 // break;
372 }
373
374
375
376 }
377 }
378
executeTest(TestParams t)379 void executeTest(TestParams t) {
380 int bp;
381 int prevBP;
382 int i;
383
384 if (t.bi == null) {
385 return;
386 }
387
388 t.bi.setText(t.dataToBreak.toString());
389 //
390 // Run the iterator forward
391 //
392 prevBP = -1;
393 for (bp = t.bi.first(); bp != BreakIterator.DONE; bp = t.bi.next()) {
394 if (prevBP == bp) {
395 // Fail for lack of forward progress.
396 errln("Forward Iteration, no forward progress. Break Pos=" + bp +
397 " File line,col=" + t.srcLine[bp] + ", " + t.srcCol[bp]);
398 break;
399 }
400
401 // Check that there were we didn't miss an expected break between the last one
402 // and this one.
403 for (i=prevBP+1; i<bp; i++) {
404 if (t.expectedBreaks[i] != 0) {
405 errln("Forward Iteration, break expected, but not found. Pos=" + i +
406 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
407 }
408 }
409
410 // Check that the break we did find was expected
411 if (t.expectedBreaks[bp] == 0) {
412 errln("Forward Iteration, break found, but not expected. Pos=" + bp +
413 " File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
414 } else {
415 // The break was expected.
416 // Check that the {nnn} tag value is correct.
417 int expectedTagVal = t.expectedBreaks[bp];
418 if (expectedTagVal == -1) {
419 expectedTagVal = 0;
420 }
421 int line = t.srcLine[bp];
422 int rs = t.bi.getRuleStatus();
423 if (rs != expectedTagVal) {
424 errln("Incorrect status for forward break. Pos = " + bp +
425 ". File line,col = " + line + ", " + t.srcCol[bp] + "\n" +
426 " Actual, Expected status = " + rs + ", " + expectedTagVal);
427 }
428 int[] fillInArray = new int[4];
429 int numStatusVals = t.bi.getRuleStatusVec(fillInArray);
430 assertTrue("", numStatusVals >= 1);
431 assertEquals("", expectedTagVal, fillInArray[0]);
432 }
433
434
435 prevBP = bp;
436 }
437
438 // Verify that there were no missed expected breaks after the last one found
439 for (i=prevBP+1; i<t.dataToBreak.length()+1; i++) {
440 if (t.expectedBreaks[i] != 0) {
441 errln("Forward Iteration, break expected, but not found. Pos=" + i +
442 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
443 }
444 }
445
446
447 //
448 // Run the iterator backwards, verify that the same breaks are found.
449 //
450 prevBP = t.dataToBreak.length()+2; // start with a phony value for the last break pos seen.
451 for (bp = t.bi.last(); bp != BreakIterator.DONE; bp = t.bi.previous()) {
452 if (prevBP == bp) {
453 // Fail for lack of progress.
454 errln("Reverse Iteration, no progress. Break Pos=" + bp +
455 "File line,col=" + t.srcLine[bp] + " " + t.srcCol[bp]);
456 break;
457 }
458
459 // Check that we didn't miss an expected break between the last one
460 // and this one. (UVector returns zeros for index out of bounds.)
461 for (i=prevBP-1; i>bp; i--) {
462 if (t.expectedBreaks[i] != 0) {
463 errln("Reverse Itertion, break expected, but not found. Pos=" + i +
464 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
465 }
466 }
467
468 // Check that the break we did find was expected
469 if (t.expectedBreaks[bp] == 0) {
470 errln("Reverse Itertion, break found, but not expected. Pos=" + bp +
471 " File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
472 } else {
473 // The break was expected.
474 // Check that the {nnn} tag value is correct.
475 int expectedTagVal = t.expectedBreaks[bp];
476 if (expectedTagVal == -1) {
477 expectedTagVal = 0;
478 }
479 int line = t.srcLine[bp];
480 int rs = t.bi.getRuleStatus();
481 if (rs != expectedTagVal) {
482 errln("Incorrect status for reverse break. Pos= " + bp +
483 "File line,col= " + line + ", " + t.srcCol[bp] + "\n" +
484 " Actual, Expected status = " + rs + ", " + expectedTagVal);
485 }
486 }
487
488 prevBP = bp;
489 }
490
491 // Verify that there were no missed breaks prior to the last one found
492 for (i=prevBP-1; i>=0; i--) {
493 if (t.expectedBreaks[i] != 0) {
494 errln("Reverse Itertion, break expected, but not found. Pos=" + i +
495 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
496 }
497 }
498 // Check isBoundary()
499 for (i=0; i<=t.dataToBreak.length(); i++) {
500 boolean boundaryExpected = (t.expectedBreaks[i] != 0);
501 boolean boundaryFound = t.bi.isBoundary(i);
502 if (boundaryExpected != boundaryFound) {
503 errln("isBoundary(" + i + ") incorrect.\n" +
504 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
505 " Expected, Actual= " + boundaryExpected + ", " + boundaryFound);
506 }
507 }
508
509 // Check following()
510 for (i=0; i<=t.dataToBreak.length(); i++) {
511 int actualBreak = t.bi.following(i);
512 int expectedBreak = BreakIterator.DONE;
513 for (int j=i+1; j < t.expectedBreaks.length; j++) {
514 if (t.expectedBreaks[j] != 0) {
515 expectedBreak = j;
516 break;
517 }
518 }
519 if (expectedBreak != actualBreak) {
520 errln("following(" + i + ") incorrect.\n" +
521 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
522 " Expected, Actual= " + expectedBreak + ", " + actualBreak);
523 }
524 }
525
526 // Check preceding()
527 for (i=t.dataToBreak.length(); i>=0; i--) {
528 int actualBreak = t.bi.preceding(i);
529 int expectedBreak = BreakIterator.DONE;
530
531 for (int j=i-1; j >= 0; j--) {
532 if (t.expectedBreaks[j] != 0) {
533 expectedBreak = j;
534 break;
535 }
536 }
537 if (expectedBreak != actualBreak) {
538 errln("preceding(" + i + ") incorrect.\n" +
539 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
540 " Expected, Actual= " + expectedBreak + ", " + actualBreak);
541 }
542 }
543
544 }
545
546
547
548
549 }
550