• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  * Created on May 5, 2004
5  *
6  * Copyright (C) 2004-2016 International Business Machines Corporation and others.
7  * All Rights Reserved.
8  *
9  */
10 package com.ibm.icu.dev.test.rbbi;
11 
12 import java.io.IOException;
13 import java.io.InputStream;
14 import java.io.InputStreamReader;
15 import java.util.Arrays;
16 
17 import org.junit.Test;
18 import org.junit.runner.RunWith;
19 import org.junit.runners.JUnit4;
20 
21 import com.ibm.icu.dev.test.TestFmwk;
22 import com.ibm.icu.impl.Utility;
23 import com.ibm.icu.lang.UCharacter;
24 import com.ibm.icu.text.BreakIterator;
25 import com.ibm.icu.text.RuleBasedBreakIterator;
26 import com.ibm.icu.util.ULocale;
27 
28 
29 /**
30  * Rule based break iterator data driven test.
31  *      Perform the tests from the file rbbitst.txt.
32  *      The test data file is common to both ICU4C and ICU4J.
33  *      See the data file for a description of the tests.
34  *
35  */
36 @RunWith(JUnit4.class)
37 public class RBBITestExtended extends TestFmwk {
/**
 * Creates the test instance. The class declares no instance fields, so the
 * constructor only delegates to the superclass.
 */
public RBBITestExtended() {
    super();
}
40 
41 
42 
43 static class TestParams {
44     BreakIterator   bi;
45     StringBuilder   dataToBreak    = new StringBuilder();
46     int[]           expectedBreaks = new int[4000];
47     int[]           srcLine        = new int[4000];
48     int[]           srcCol         = new int[4000];
49     ULocale         currentLocale  = new ULocale("en_US");
50 }
51 
52 
53 @Test
TestExtended()54 public void TestExtended() {
55     TestParams     tp = new TestParams();
56 
57 
58     //
59     //  Open and read the test data file.
60     //
61     StringBuilder testFileBuf = new StringBuilder();
62     InputStream is = null;
63     try {
64         is = RBBITestExtended.class.getResourceAsStream("rbbitst.txt");
65         if (is == null) {
66             errln("Could not open test data file rbbitst.txt");
67             return;
68         }
69         InputStreamReader isr = new InputStreamReader(is, "UTF-8");
70         try {
71             int c;
72             int count = 0;
73             for (;;) {
74                 c = isr.read();
75                 if (c < 0) {
76                     break;
77                 }
78                 count++;
79                 if (c == 0xFEFF && count == 1) {
80                     // BOM in the test data file. Discard it.
81                     continue;
82                 }
83 
84                 testFileBuf.appendCodePoint(c);
85             }
86         } finally {
87             isr.close();
88         }
89     } catch (IOException e) {
90         errln(e.toString());
91         try {
92             is.close();
93         } catch (IOException ignored) {
94         }
95         return;
96     }
97 
98     String testString = testFileBuf.toString();
99 
100 
101     final int  PARSE_COMMENT = 1;
102     final int  PARSE_TAG     = 2;
103     final int  PARSE_DATA    = 3;
104     final int  PARSE_NUM     = 4;
105     final int  PARSE_RULES   = 5;
106 
107     int parseState = PARSE_TAG;
108 
109     int savedState = PARSE_TAG;
110 
111     int    lineNum  = 1;
112     int    colStart = 0;
113     int    column   = 0;
114     int    charIdx  = 0;
115     int    i;
116 
117     int    tagValue = 0;       // The numeric value of a <nnn> tag.
118 
119     StringBuilder   rules = new StringBuilder();     // Holds rules from a <rules> ... </rules> block
120     int             rulesFirstLine = 0;              // Line number of the start of current <rules> block
121 
122     int    len = testString.length();
123 
124     for (charIdx = 0; charIdx < len; ) {
125         int c = testString.codePointAt(charIdx);
126         charIdx++;
127         if (c == '\r' && charIdx<len && testString.charAt(charIdx) == '\n') {
128             // treat CRLF as a unit
129             c = '\n';
130             charIdx++;
131         }
132         if (c == '\n' || c == '\r') {
133             lineNum++;
134             colStart = charIdx;
135         }
136         column = charIdx - colStart + 1;
137 
138         switch (parseState) {
139         case PARSE_COMMENT:
140             if (c == 0x0a || c == 0x0d) {
141                 parseState = savedState;
142             }
143             break;
144 
145         case PARSE_TAG:
146             {
147             if (c == '#') {
148                 parseState = PARSE_COMMENT;
149                 savedState = PARSE_TAG;
150                 break;
151             }
152             if (UCharacter.isWhitespace(c)) {
153                 break;
154             }
155            if (testString.startsWith("<word>", charIdx-1)) {
156                 tp.bi = BreakIterator.getWordInstance(tp.currentLocale);
157                 charIdx += 5;
158                 break;
159             }
160             if (testString.startsWith("<char>", charIdx-1)) {
161                 tp.bi = BreakIterator.getCharacterInstance(tp.currentLocale);
162                 charIdx += 5;
163                 break;
164             }
165             if (testString.startsWith("<line>", charIdx-1)) {
166                 tp.bi = BreakIterator.getLineInstance(tp.currentLocale);
167                 charIdx += 5;
168                 break;
169             }
170             if (testString.startsWith("<sent>", charIdx-1)) {
171                 tp.bi = BreakIterator.getSentenceInstance(tp.currentLocale);
172                 charIdx += 5;
173                 break;
174             }
175             if (testString.startsWith("<title>", charIdx-1)) {
176                 tp.bi = BreakIterator.getTitleInstance(tp.currentLocale);
177                 charIdx += 6;
178                 break;
179             }
180             if (testString.startsWith("<rules>", charIdx-1) ||
181                     testString.startsWith("<badrules>", charIdx-1)) {
182                 charIdx = testString.indexOf('>', charIdx) + 1;
183                 parseState = PARSE_RULES;
184                 rules.setLength(0);
185                 rulesFirstLine = lineNum;
186                 break;
187             }
188 
189             if (testString.startsWith("<locale ", charIdx-1)) {
190                 int closeIndex = testString.indexOf(">", charIdx);
191                 if (closeIndex < 0) {
192                     errln("line" + lineNum + ": missing close on <locale  tag.");
193                     break;
194                 }
195                 String localeName = testString.substring(charIdx+6, closeIndex);
196                 localeName = localeName.trim();
197                 tp.currentLocale = new ULocale(localeName);
198                 charIdx = closeIndex+1;
199                 break;
200             }
201             if (testString.startsWith("<data>", charIdx-1)) {
202                 parseState = PARSE_DATA;
203                 charIdx += 5;
204                 tp.dataToBreak.setLength(0);
205                 Arrays.fill(tp.expectedBreaks, 0);
206                 Arrays.fill(tp.srcCol, 0);
207                 Arrays.fill(tp.srcLine, 0);
208                 break;
209             }
210 
211             errln("line" + lineNum + ": Tag expected in test file.");
212             return;
213             //parseState = PARSE_COMMENT;
214             //savedState = PARSE_DATA;
215             }
216 
217         case PARSE_RULES:
218             if (testString.startsWith("</rules>", charIdx-1)) {
219                 charIdx += 7;
220                 parseState = PARSE_TAG;
221                 try {
222                     tp.bi = new RuleBasedBreakIterator(rules.toString());
223                 } catch (IllegalArgumentException e) {
224                     errln(String.format("rbbitst.txt:%d  Error creating break iterator from rules.  %s", lineNum, e));
225                 }
226             } else if (testString.startsWith("</badrules>", charIdx-1)) {
227                 charIdx += 10;
228                 parseState = PARSE_TAG;
229                 boolean goodRules = true;
230                 try {
231                     new RuleBasedBreakIterator(rules.toString());
232                 } catch (IllegalArgumentException e) {
233                     goodRules = false;
234                 }
235                 if (goodRules) {
236                     errln(String.format(
237                             "rbbitst.txt:%d  Expected, but did not get, a failure creating break iterator from rules.",
238                             lineNum));
239                 }
240             } else {
241                 rules.appendCodePoint(c);
242             }
243             break;
244 
245         case PARSE_DATA:
246             if (c == '•') {
247                 int  breakIdx = tp.dataToBreak.length();
248                 if (tp.expectedBreaks[breakIdx] != 0) {
249                     errln(String.format(
250                             "rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
251                             lineNum, column));
252                 }
253                 tp.expectedBreaks[breakIdx] = -1;
254                 tp.srcLine[breakIdx]        = lineNum;
255                 tp.srcCol[breakIdx]         = column;
256                 break;
257             }
258 
259             if (testString.startsWith("</data>", charIdx-1))  {
260                 // Add final entry to mappings from break location to source file position.
261                 //  Need one extra because last break position returned is after the
262                 //    last char in the data, not at the last char.
263                 int idx = tp.dataToBreak.length();
264                 tp.srcLine[idx] = lineNum;
265                 tp.srcCol[idx]  = column;
266 
267                 parseState = PARSE_TAG;
268                 charIdx += 6;
269 
270                 // RUN THE TEST!
271                 executeTest(tp);
272                 break;
273             }
274 
275            if (testString.startsWith("\\N{", charIdx-1)) {
276                int nameEndIdx = testString.indexOf('}', charIdx);
277                if (nameEndIdx == -1) {
278                    errln("Error in named character in test file at line " + lineNum +
279                            ", col " + column);
280                }
281                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
282                 // Get the code point from the name and insert it into the test data.
283                 String charName = testString.substring(charIdx+2, nameEndIdx);
284                 c = UCharacter.getCharFromName(charName);
285                 if (c == -1) {
286                     errln("Error in named character in test file at line " + lineNum +
287                             ", col " + column);
288                 } else {
289                     // Named code point was recognized.  Insert it
290                     //   into the test data.
291                     tp.dataToBreak.appendCodePoint(c);
292                     for (i = tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
293                         tp.srcLine[i] = lineNum;
294                         tp.srcCol[i]  = column;
295                     }
296 
297                  }
298                 if (nameEndIdx > charIdx) {
299                     charIdx = nameEndIdx+1;
300                 }
301                 break;
302             }
303 
304             if (testString.startsWith("<>", charIdx-1)) {
305                 charIdx++;
306                 int  breakIdx = tp.dataToBreak.length();
307                 tp.expectedBreaks[breakIdx] = -1;
308                 tp.srcLine[breakIdx]        = lineNum;
309                 tp.srcCol[breakIdx]         = column;
310                 break;
311             }
312 
313             if (c == '<') {
314                 tagValue   = 0;
315                 parseState = PARSE_NUM;
316                 break;
317             }
318 
319             if (c == '#' && column==3) {   // TODO:  why is column off so far?
320                 parseState = PARSE_COMMENT;
321                 savedState = PARSE_DATA;
322                 break;
323             }
324 
325             if (c == '\\') {
326                 // Check for \ at end of line, a line continuation.
327                 //     Advance over (discard) the newline
328                 int cp = testString.codePointAt(charIdx);
329                 if (cp == '\r' && charIdx<len && testString.codePointAt(charIdx+1) == '\n') {
330                     // We have a CR LF
331                     //  Need an extra increment of the input ptr to move over both of them
332                     charIdx++;
333                 }
334                 if (cp == '\n' || cp == '\r') {
335                     lineNum++;
336                     column   = 0;
337                     charIdx++;
338                     colStart = charIdx;
339                     break;
340                 }
341 
342                 // Let unescape handle the back slash.
343                 int  charIdxAr[] = new int[1];
344                 charIdxAr[0] = charIdx;
345                 cp = Utility.unescapeAt(testString, charIdxAr);
346                 if (cp != -1) {
347                     // Escape sequence was recognized.  Insert the char
348                     //   into the test data.
349                     charIdx = charIdxAr[0];
350                     tp.dataToBreak.appendCodePoint(cp);
351                     for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
352                         tp.srcLine[i] = lineNum;
353                         tp.srcCol[i]  = column;
354                     }
355 
356                     break;
357                 }
358 
359 
360                 // Not a recognized backslash escape sequence.
361                 // Take the next char as a literal.
362                 //  TODO:  Should this be an error?
363                 c = testString.codePointAt(charIdx);
364                 charIdx = testString.offsetByCodePoints(charIdx, 1);
365              }
366 
367             // Normal, non-escaped data char.
368             tp.dataToBreak.appendCodePoint(c);
369 
370             // Save the mapping from offset in the data to line/column numbers in
371             //   the original input file.  Will be used for better error messages only.
372             //   If there's an expected break before this char, the slot in the mapping
373             //     vector will already be set for this char; don't overwrite it.
374             for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
375                 tp.srcLine[i] = lineNum;
376                 tp.srcCol[i]  = column;
377             }
378             break;
379 
380 
381         case PARSE_NUM:
382             // We are parsing an expected numeric tag value, like <1234>,
383             //   within a chunk of data.
384             if (UCharacter.isWhitespace(c)) {
385                 break;
386             }
387 
388             if (c == '>') {
389                 // Finished the number.  Add the info to the expected break data,
390                 //   and switch parse state back to doing plain data.
391                 parseState = PARSE_DATA;
392                 if (tagValue == 0) {
393                     tagValue = -1;
394                 }
395                 int  breakIdx = tp.dataToBreak.length();
396                 if (tp.expectedBreaks[breakIdx] != 0) {
397                     errln(String.format(
398                             "rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
399                             lineNum, column));
400                 }
401                 tp.expectedBreaks[breakIdx] = tagValue;
402                 tp.srcLine[breakIdx]        = lineNum;
403                 tp.srcCol[breakIdx]         = column;
404                 break;
405             }
406 
407             if (UCharacter.isDigit(c)) {
408                 tagValue = tagValue*10 + UCharacter.digit(c);
409                 break;
410             }
411 
412             errln(String.format("Syntax Error in rbbitst.txt at line %d, col %d", lineNum, column));
413             return;
414         }
415     }
416 
417     // Reached end of test file. Raise an error if parseState indicates that we are
418     //   within a block that should have been terminated.
419     if (parseState == PARSE_RULES) {
420         errln(String.format("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
421             lineNum, rulesFirstLine));
422     }
423     if (parseState == PARSE_DATA) {
424         errln(String.format("rbbitst.txt:%d <data> block not closed.", lineNum));
425     }
426 }
427 
executeTest(TestParams t)428 void executeTest(TestParams t) {
429     // TODO: also rerun tests with a break iterator re-created from bi.getRules()
430     //       and from bi.clone(). If in exhaustive mode only.
431     int    bp;
432     int    prevBP;
433     int    i;
434 
435     if (t.bi == null) {
436         return;
437     }
438 
439     t.bi.setText(t.dataToBreak.toString());
440     //
441     //  Run the iterator forward
442     //
443     prevBP = -1;
444     for (bp = t.bi.first(); bp != BreakIterator.DONE; bp = t.bi.next()) {
445         if (prevBP ==  bp) {
446             // Fail for lack of forward progress.
447             errln("Forward Iteration, no forward progress.  Break Pos=" + bp +
448                     "  File line,col=" + t.srcLine[bp] + ", " + t.srcCol[bp]);
449             break;
450         }
451 
452         // Check that there were we didn't miss an expected break between the last one
453         //  and this one.
454         for (i=prevBP+1; i<bp; i++) {
455             if (t.expectedBreaks[i] != 0) {
456                 errln("Forward Iteration, break expected, but not found.  Pos=" + i +
457                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
458             }
459         }
460 
461         // Check that the break we did find was expected
462         if (t.expectedBreaks[bp] == 0) {
463             errln("Forward Iteration, break found, but not expected.  Pos=" + bp +
464                     "  File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
465         } else {
466             // The break was expected.
467             //   Check that the {nnn} tag value is correct.
468             int expectedTagVal = t.expectedBreaks[bp];
469             if (expectedTagVal == -1) {
470                 expectedTagVal = 0;
471             }
472             int line = t.srcLine[bp];
473             int rs = t.bi.getRuleStatus();
474             if (rs != expectedTagVal) {
475                 errln("Incorrect status for forward break.  Pos = " + bp +
476                         ".  File line,col = " + line + ", " + t.srcCol[bp] + "\n" +
477                       "          Actual, Expected status = " + rs + ", " + expectedTagVal);
478             }
479             int[] fillInArray = new int[4];
480             int numStatusVals = t.bi.getRuleStatusVec(fillInArray);
481             assertTrue("", numStatusVals >= 1);
482             assertEquals("", expectedTagVal, fillInArray[0]);
483         }
484 
485 
486         prevBP = bp;
487     }
488 
489     // Verify that there were no missed expected breaks after the last one found
490     for (i=prevBP+1; i<t.dataToBreak.length()+1; i++) {
491         if (t.expectedBreaks[i] != 0) {
492             errln("Forward Iteration, break expected, but not found.  Pos=" + i +
493                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
494        }
495     }
496 
497 
498     //
499     //  Run the iterator backwards, verify that the same breaks are found.
500     //
501     prevBP = t.dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
502     for (bp = t.bi.last(); bp != BreakIterator.DONE; bp = t.bi.previous()) {
503         if (prevBP ==  bp) {
504             // Fail for lack of progress.
505             errln("Reverse Iteration, no progress.  Break Pos=" + bp +
506                     "File line,col=" + t.srcLine[bp] + " " +  t.srcCol[bp]);
507             break;
508         }
509 
510         // Check that we didn't miss an expected break between the last one
511         //  and this one.  (UVector returns zeros for index out of bounds.)
512         for (i=prevBP-1; i>bp; i--) {
513             if (t.expectedBreaks[i] != 0) {
514                 errln("Reverse Itertion, break expected, but not found.  Pos=" + i +
515                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
516             }
517         }
518 
519         // Check that the break we did find was expected
520         if (t.expectedBreaks[bp] == 0) {
521             errln("Reverse Itertion, break found, but not expected.  Pos=" + bp +
522                     "  File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
523         } else {
524             // The break was expected.
525             //   Check that the {nnn} tag value is correct.
526             int expectedTagVal = t.expectedBreaks[bp];
527             if (expectedTagVal == -1) {
528                 expectedTagVal = 0;
529             }
530             int line = t.srcLine[bp];
531             int rs = t.bi.getRuleStatus();
532             if (rs != expectedTagVal) {
533                 errln("Incorrect status for reverse break.  Pos = " + bp +
534                       "  File line,col= " + line + ", " + t.srcCol[bp] + "\n" +
535                       "          Actual, Expected status = " + rs + ", " + expectedTagVal);
536             }
537         }
538 
539         prevBP = bp;
540     }
541 
542     // Verify that there were no missed breaks prior to the last one found
543     for (i=prevBP-1; i>=0; i--) {
544         if (t.expectedBreaks[i] != 0) {
545             errln("Reverse Itertion, break expected, but not found.  Pos=" + i +
546                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
547          }
548     }
549     // Check isBoundary()
550     for (i=0; i<=t.dataToBreak.length(); i++) {
551         boolean boundaryExpected = (t.expectedBreaks[i] != 0);
552         boolean boundaryFound    = t.bi.isBoundary(i);
553         if (boundaryExpected != boundaryFound) {
554             errln("isBoundary(" + i + ") incorrect.\n" +
555                   "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
556                   "    Expected, Actual= " + boundaryExpected + ", " + boundaryFound);
557         }
558     }
559 
560     // Check following()
561     for (i=0; i<=t.dataToBreak.length(); i++) {
562         int actualBreak = t.bi.following(i);
563         int expectedBreak = BreakIterator.DONE;
564         for (int j=i+1; j < t.expectedBreaks.length; j++) {
565             if (t.expectedBreaks[j] != 0) {
566                 expectedBreak = j;
567                 break;
568             }
569         }
570         if (expectedBreak != actualBreak) {
571             errln("following(" + i + ") incorrect.\n" +
572                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
573                     "    Expected, Actual= " + expectedBreak + ", " + actualBreak);
574         }
575     }
576 
577     // Check preceding()
578     for (i=t.dataToBreak.length(); i>=0; i--) {
579         int actualBreak = t.bi.preceding(i);
580         int expectedBreak = BreakIterator.DONE;
581 
582         for (int j=i-1; j >= 0; j--) {
583             if (t.expectedBreaks[j] != 0) {
584                 expectedBreak = j;
585                 break;
586             }
587         }
588         if (expectedBreak != actualBreak) {
589             errln("preceding(" + i + ") incorrect.\n" +
590                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
591                     "    Expected, Actual= " + expectedBreak + ", " + actualBreak);
592         }
593     }
594 
595 }
596 
597 
598 
599 
600 }
601