• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  * Created on May 5, 2004
5  *
6  * Copyright (C) 2004-2016 International Business Machines Corporation and others.
7  * All Rights Reserved.
8  *
9  */
10 package com.ibm.icu.dev.test.rbbi;
11 
12 import java.io.IOException;
13 import java.io.InputStream;
14 import java.io.InputStreamReader;
15 import java.util.Arrays;
16 
17 import org.junit.Test;
18 import org.junit.runner.RunWith;
19 import org.junit.runners.JUnit4;
20 
21 import com.ibm.icu.dev.test.TestFmwk;
22 import com.ibm.icu.dev.test.TestUtil;
23 import com.ibm.icu.impl.Utility;
24 import com.ibm.icu.lang.UCharacter;
25 import com.ibm.icu.text.BreakIterator;
26 import com.ibm.icu.text.RuleBasedBreakIterator;
27 import com.ibm.icu.util.ULocale;
28 
29 
30 /**
31  * Rule based break iterator data driven test.
32  *      Perform the tests from the file rbbitst.txt.
33  *      The test data file is common to both ICU4C and ICU4J.
34  *      See the data file for a description of the tests.
35  *
36  */
37 @RunWith(JUnit4.class)
38 public class RBBITestExtended extends TestFmwk {
/**
 * Creates the test object. No per-instance state is set up here.
 */
public RBBITestExtended() {
    super();
}
41 
42 
43 
44 static class TestParams {
45     BreakIterator   bi;
46     StringBuilder   dataToBreak    = new StringBuilder();
47     int[]           expectedBreaks = new int[4000];
48     int[]           srcLine        = new int[4000];
49     int[]           srcCol         = new int[4000];
50     ULocale         currentLocale  = new ULocale("en_US");
51 }
52 
53 
54 @Test
TestExtended()55 public void TestExtended() {
56     // The expectations in this test heavily depends on the Thai dictionary.
57     // Therefore, we skip this test under the LSTM configuration.
58     org.junit.Assume.assumeTrue(!TestUtil.skipDictionaryTest());
59     TestParams     tp = new TestParams();
60 
61 
62     //
63     //  Open and read the test data file.
64     //
65     StringBuilder testFileBuf = new StringBuilder();
66     InputStream is = null;
67     try {
68         is = RBBITestExtended.class.getResourceAsStream("rbbitst.txt");
69         if (is == null) {
70             errln("Could not open test data file rbbitst.txt");
71             return;
72         }
73         InputStreamReader isr = new InputStreamReader(is, "UTF-8");
74         try {
75             int c;
76             int count = 0;
77             for (;;) {
78                 c = isr.read();
79                 if (c < 0) {
80                     break;
81                 }
82                 count++;
83                 if (c == 0xFEFF && count == 1) {
84                     // BOM in the test data file. Discard it.
85                     continue;
86                 }
87 
88                 testFileBuf.appendCodePoint(c);
89             }
90         } finally {
91             isr.close();
92         }
93     } catch (IOException e) {
94         errln(e.toString());
95         try {
96             is.close();
97         } catch (IOException ignored) {
98         }
99         return;
100     }
101 
102     String testString = testFileBuf.toString();
103 
104 
105     final int  PARSE_COMMENT = 1;
106     final int  PARSE_TAG     = 2;
107     final int  PARSE_DATA    = 3;
108     final int  PARSE_NUM     = 4;
109     final int  PARSE_RULES   = 5;
110 
111     int parseState = PARSE_TAG;
112 
113     int savedState = PARSE_TAG;
114 
115     int    lineNum  = 1;
116     int    colStart = 0;
117     int    column   = 0;
118     int    charIdx  = 0;
119     int    i;
120 
121     int    tagValue = 0;       // The numeric value of a <nnn> tag.
122 
123     StringBuilder   rules = new StringBuilder();     // Holds rules from a <rules> ... </rules> block
124     int             rulesFirstLine = 0;              // Line number of the start of current <rules> block
125 
126     int    len = testString.length();
127 
128     for (charIdx = 0; charIdx < len; ) {
129         int c = testString.codePointAt(charIdx);
130         charIdx++;
131         if (c == '\r' && charIdx<len && testString.charAt(charIdx) == '\n') {
132             // treat CRLF as a unit
133             c = '\n';
134             charIdx++;
135         }
136         if (c == '\n' || c == '\r') {
137             lineNum++;
138             colStart = charIdx;
139         }
140         column = charIdx - colStart + 1;
141 
142         switch (parseState) {
143         case PARSE_COMMENT:
144             if (c == 0x0a || c == 0x0d) {
145                 parseState = savedState;
146             }
147             break;
148 
149         case PARSE_TAG:
150             {
151             if (c == '#') {
152                 parseState = PARSE_COMMENT;
153                 savedState = PARSE_TAG;
154                 break;
155             }
156             if (UCharacter.isWhitespace(c)) {
157                 break;
158             }
159            if (testString.startsWith("<word>", charIdx-1)) {
160                 tp.bi = BreakIterator.getWordInstance(tp.currentLocale);
161                 charIdx += 5;
162                 break;
163             }
164             if (testString.startsWith("<char>", charIdx-1)) {
165                 tp.bi = BreakIterator.getCharacterInstance(tp.currentLocale);
166                 charIdx += 5;
167                 break;
168             }
169             if (testString.startsWith("<line>", charIdx-1)) {
170                 tp.bi = BreakIterator.getLineInstance(tp.currentLocale);
171                 charIdx += 5;
172                 break;
173             }
174             if (testString.startsWith("<sent>", charIdx-1)) {
175                 tp.bi = BreakIterator.getSentenceInstance(tp.currentLocale);
176                 charIdx += 5;
177                 break;
178             }
179             if (testString.startsWith("<title>", charIdx-1)) {
180                 tp.bi = BreakIterator.getTitleInstance(tp.currentLocale);
181                 charIdx += 6;
182                 break;
183             }
184             if (testString.startsWith("<rules>", charIdx-1) ||
185                     testString.startsWith("<badrules>", charIdx-1)) {
186                 charIdx = testString.indexOf('>', charIdx) + 1;
187                 parseState = PARSE_RULES;
188                 rules.setLength(0);
189                 rulesFirstLine = lineNum;
190                 break;
191             }
192 
193             if (testString.startsWith("<locale ", charIdx-1)) {
194                 int closeIndex = testString.indexOf(">", charIdx);
195                 if (closeIndex < 0) {
196                     errln("line" + lineNum + ": missing close on <locale  tag.");
197                     break;
198                 }
199                 String localeName = testString.substring(charIdx+6, closeIndex);
200                 localeName = localeName.trim();
201                 tp.currentLocale = new ULocale(localeName);
202                 charIdx = closeIndex+1;
203                 break;
204             }
205             if (testString.startsWith("<data>", charIdx-1)) {
206                 parseState = PARSE_DATA;
207                 charIdx += 5;
208                 tp.dataToBreak.setLength(0);
209                 Arrays.fill(tp.expectedBreaks, 0);
210                 Arrays.fill(tp.srcCol, 0);
211                 Arrays.fill(tp.srcLine, 0);
212                 break;
213             }
214 
215             errln("line" + lineNum + ": Tag expected in test file.");
216             return;
217             //parseState = PARSE_COMMENT;
218             //savedState = PARSE_DATA;
219             }
220 
221         case PARSE_RULES:
222             if (testString.startsWith("</rules>", charIdx-1)) {
223                 charIdx += 7;
224                 parseState = PARSE_TAG;
225                 try {
226                     tp.bi = new RuleBasedBreakIterator(rules.toString());
227                 } catch (IllegalArgumentException e) {
228                     errln(String.format("rbbitst.txt:%d  Error creating break iterator from rules.  %s", lineNum, e));
229                 }
230             } else if (testString.startsWith("</badrules>", charIdx-1)) {
231                 charIdx += 10;
232                 parseState = PARSE_TAG;
233                 boolean goodRules = true;
234                 try {
235                     new RuleBasedBreakIterator(rules.toString());
236                 } catch (IllegalArgumentException e) {
237                     goodRules = false;
238                 }
239                 if (goodRules) {
240                     errln(String.format(
241                             "rbbitst.txt:%d  Expected, but did not get, a failure creating break iterator from rules.",
242                             lineNum));
243                 }
244             } else {
245                 rules.appendCodePoint(c);
246             }
247             break;
248 
249         case PARSE_DATA:
250             if (c == '•') {
251                 int  breakIdx = tp.dataToBreak.length();
252                 if (tp.expectedBreaks[breakIdx] != 0) {
253                     errln(String.format(
254                             "rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
255                             lineNum, column));
256                 }
257                 tp.expectedBreaks[breakIdx] = -1;
258                 tp.srcLine[breakIdx]        = lineNum;
259                 tp.srcCol[breakIdx]         = column;
260                 break;
261             }
262 
263             if (testString.startsWith("</data>", charIdx-1))  {
264                 // Add final entry to mappings from break location to source file position.
265                 //  Need one extra because last break position returned is after the
266                 //    last char in the data, not at the last char.
267                 int idx = tp.dataToBreak.length();
268                 tp.srcLine[idx] = lineNum;
269                 tp.srcCol[idx]  = column;
270 
271                 parseState = PARSE_TAG;
272                 charIdx += 6;
273 
274                 // RUN THE TEST!
275                 executeTest(tp);
276                 break;
277             }
278 
279            if (testString.startsWith("\\N{", charIdx-1)) {
280                int nameEndIdx = testString.indexOf('}', charIdx);
281                if (nameEndIdx == -1) {
282                    errln("Error in named character in test file at line " + lineNum +
283                            ", col " + column);
284                }
285                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
286                 // Get the code point from the name and insert it into the test data.
287                 String charName = testString.substring(charIdx+2, nameEndIdx);
288                 c = UCharacter.getCharFromName(charName);
289                 if (c == -1) {
290                     errln("Error in named character in test file at line " + lineNum +
291                             ", col " + column);
292                 } else {
293                     // Named code point was recognized.  Insert it
294                     //   into the test data.
295                     tp.dataToBreak.appendCodePoint(c);
296                     for (i = tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
297                         tp.srcLine[i] = lineNum;
298                         tp.srcCol[i]  = column;
299                     }
300 
301                  }
302                 if (nameEndIdx > charIdx) {
303                     charIdx = nameEndIdx+1;
304                 }
305                 break;
306             }
307 
308             if (testString.startsWith("<>", charIdx-1)) {
309                 charIdx++;
310                 int  breakIdx = tp.dataToBreak.length();
311                 tp.expectedBreaks[breakIdx] = -1;
312                 tp.srcLine[breakIdx]        = lineNum;
313                 tp.srcCol[breakIdx]         = column;
314                 break;
315             }
316 
317             if (c == '<') {
318                 tagValue   = 0;
319                 parseState = PARSE_NUM;
320                 break;
321             }
322 
323             if (c == '#' && column==3) {   // TODO:  why is column off so far?
324                 parseState = PARSE_COMMENT;
325                 savedState = PARSE_DATA;
326                 break;
327             }
328 
329             if (c == '\\') {
330                 // Check for \ at end of line, a line continuation.
331                 //     Advance over (discard) the newline
332                 int cp = testString.codePointAt(charIdx);
333                 if (cp == '\r' && charIdx<len && testString.codePointAt(charIdx+1) == '\n') {
334                     // We have a CR LF
335                     //  Need an extra increment of the input ptr to move over both of them
336                     charIdx++;
337                 }
338                 if (cp == '\n' || cp == '\r') {
339                     lineNum++;
340                     column   = 0;
341                     charIdx++;
342                     colStart = charIdx;
343                     break;
344                 }
345 
346                 // Let unescape handle the back slash.
347                 int cpAndLength = Utility.unescapeAndLengthAt(testString, charIdx);
348                 if (cpAndLength >= 0) {
349                     // Escape sequence was recognized.  Insert the char
350                     //   into the test data.
351                     charIdx += Utility.lengthFromCodePointAndLength(cpAndLength);
352                     tp.dataToBreak.appendCodePoint(Utility.cpFromCodePointAndLength(cpAndLength));
353                     for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
354                         tp.srcLine[i] = lineNum;
355                         tp.srcCol[i]  = column;
356                     }
357 
358                     break;
359                 }
360 
361 
362                 // Not a recognized backslash escape sequence.
363                 // Take the next char as a literal.
364                 //  TODO:  Should this be an error?
365                 c = testString.codePointAt(charIdx);
366                 charIdx = testString.offsetByCodePoints(charIdx, 1);
367              }
368 
369             // Normal, non-escaped data char.
370             tp.dataToBreak.appendCodePoint(c);
371 
372             // Save the mapping from offset in the data to line/column numbers in
373             //   the original input file.  Will be used for better error messages only.
374             //   If there's an expected break before this char, the slot in the mapping
375             //     vector will already be set for this char; don't overwrite it.
376             for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
377                 tp.srcLine[i] = lineNum;
378                 tp.srcCol[i]  = column;
379             }
380             break;
381 
382 
383         case PARSE_NUM:
384             // We are parsing an expected numeric tag value, like <1234>,
385             //   within a chunk of data.
386             if (UCharacter.isWhitespace(c)) {
387                 break;
388             }
389 
390             if (c == '>') {
391                 // Finished the number.  Add the info to the expected break data,
392                 //   and switch parse state back to doing plain data.
393                 parseState = PARSE_DATA;
394                 if (tagValue == 0) {
395                     tagValue = -1;
396                 }
397                 int  breakIdx = tp.dataToBreak.length();
398                 if (tp.expectedBreaks[breakIdx] != 0) {
399                     errln(String.format(
400                             "rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
401                             lineNum, column));
402                 }
403                 tp.expectedBreaks[breakIdx] = tagValue;
404                 tp.srcLine[breakIdx]        = lineNum;
405                 tp.srcCol[breakIdx]         = column;
406                 break;
407             }
408 
409             if (UCharacter.isDigit(c)) {
410                 tagValue = tagValue*10 + UCharacter.digit(c);
411                 break;
412             }
413 
414             errln(String.format("Syntax Error in rbbitst.txt at line %d, col %d", lineNum, column));
415             return;
416         }
417     }
418 
419     // Reached end of test file. Raise an error if parseState indicates that we are
420     //   within a block that should have been terminated.
421     if (parseState == PARSE_RULES) {
422         errln(String.format("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
423             lineNum, rulesFirstLine));
424     }
425     if (parseState == PARSE_DATA) {
426         errln(String.format("rbbitst.txt:%d <data> block not closed.", lineNum));
427     }
428 }
429 
executeTest(TestParams t)430 void executeTest(TestParams t) {
431     // TODO: also rerun tests with a break iterator re-created from bi.getRules()
432     //       and from bi.clone(). If in exhaustive mode only.
433     int    bp;
434     int    prevBP;
435     int    i;
436 
437     if (t.bi == null) {
438         return;
439     }
440 
441     t.bi.setText(t.dataToBreak.toString());
442     //
443     //  Run the iterator forward
444     //
445     prevBP = -1;
446     for (bp = t.bi.first(); bp != BreakIterator.DONE; bp = t.bi.next()) {
447         if (prevBP ==  bp) {
448             // Fail for lack of forward progress.
449             errln("Forward Iteration, no forward progress.  Break Pos=" + bp +
450                     "  File line,col=" + t.srcLine[bp] + ", " + t.srcCol[bp]);
451             break;
452         }
453 
454         // Check that there were we didn't miss an expected break between the last one
455         //  and this one.
456         for (i=prevBP+1; i<bp; i++) {
457             if (t.expectedBreaks[i] != 0) {
458                 errln("Forward Iteration, break expected, but not found.  Pos=" + i +
459                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
460             }
461         }
462 
463         // Check that the break we did find was expected
464         if (t.expectedBreaks[bp] == 0) {
465             errln("Forward Iteration, break found, but not expected.  Pos=" + bp +
466                     "  File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
467         } else {
468             // The break was expected.
469             //   Check that the {nnn} tag value is correct.
470             int expectedTagVal = t.expectedBreaks[bp];
471             if (expectedTagVal == -1) {
472                 expectedTagVal = 0;
473             }
474             int line = t.srcLine[bp];
475             int rs = t.bi.getRuleStatus();
476             if (rs != expectedTagVal) {
477                 errln("Incorrect status for forward break.  Pos = " + bp +
478                         ".  File line,col = " + line + ", " + t.srcCol[bp] + "\n" +
479                       "          Actual, Expected status = " + rs + ", " + expectedTagVal);
480             }
481             int[] fillInArray = new int[4];
482             int numStatusVals = t.bi.getRuleStatusVec(fillInArray);
483             assertTrue("", numStatusVals >= 1);
484             assertEquals("", expectedTagVal, fillInArray[0]);
485         }
486 
487 
488         prevBP = bp;
489     }
490 
491     // Verify that there were no missed expected breaks after the last one found
492     for (i=prevBP+1; i<t.dataToBreak.length()+1; i++) {
493         if (t.expectedBreaks[i] != 0) {
494             errln("Forward Iteration, break expected, but not found.  Pos=" + i +
495                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
496        }
497     }
498 
499 
500     //
501     //  Run the iterator backwards, verify that the same breaks are found.
502     //
503     prevBP = t.dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
504     for (bp = t.bi.last(); bp != BreakIterator.DONE; bp = t.bi.previous()) {
505         if (prevBP ==  bp) {
506             // Fail for lack of progress.
507             errln("Reverse Iteration, no progress.  Break Pos=" + bp +
508                     "File line,col=" + t.srcLine[bp] + " " +  t.srcCol[bp]);
509             break;
510         }
511 
512         // Check that we didn't miss an expected break between the last one
513         //  and this one.  (UVector returns zeros for index out of bounds.)
514         for (i=prevBP-1; i>bp; i--) {
515             if (t.expectedBreaks[i] != 0) {
516                 errln("Reverse Itertion, break expected, but not found.  Pos=" + i +
517                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
518             }
519         }
520 
521         // Check that the break we did find was expected
522         if (t.expectedBreaks[bp] == 0) {
523             errln("Reverse Itertion, break found, but not expected.  Pos=" + bp +
524                     "  File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
525         } else {
526             // The break was expected.
527             //   Check that the {nnn} tag value is correct.
528             int expectedTagVal = t.expectedBreaks[bp];
529             if (expectedTagVal == -1) {
530                 expectedTagVal = 0;
531             }
532             int line = t.srcLine[bp];
533             int rs = t.bi.getRuleStatus();
534             if (rs != expectedTagVal) {
535                 errln("Incorrect status for reverse break.  Pos = " + bp +
536                       "  File line,col= " + line + ", " + t.srcCol[bp] + "\n" +
537                       "          Actual, Expected status = " + rs + ", " + expectedTagVal);
538             }
539         }
540 
541         prevBP = bp;
542     }
543 
544     // Verify that there were no missed breaks prior to the last one found
545     for (i=prevBP-1; i>=0; i--) {
546         if (t.expectedBreaks[i] != 0) {
547             errln("Reverse Itertion, break expected, but not found.  Pos=" + i +
548                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
549          }
550     }
551     // Check isBoundary()
552     for (i=0; i<=t.dataToBreak.length(); i++) {
553         boolean boundaryExpected = (t.expectedBreaks[i] != 0);
554         boolean boundaryFound    = t.bi.isBoundary(i);
555         if (boundaryExpected != boundaryFound) {
556             errln("isBoundary(" + i + ") incorrect.\n" +
557                   "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
558                   "    Expected, Actual= " + boundaryExpected + ", " + boundaryFound);
559         }
560     }
561 
562     // Check following()
563     for (i=0; i<=t.dataToBreak.length(); i++) {
564         int actualBreak = t.bi.following(i);
565         int expectedBreak = BreakIterator.DONE;
566         for (int j=i+1; j < t.expectedBreaks.length; j++) {
567             if (t.expectedBreaks[j] != 0) {
568                 expectedBreak = j;
569                 break;
570             }
571         }
572         if (expectedBreak != actualBreak) {
573             errln("following(" + i + ") incorrect.\n" +
574                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
575                     "    Expected, Actual= " + expectedBreak + ", " + actualBreak);
576         }
577     }
578 
579     // Check preceding()
580     for (i=t.dataToBreak.length(); i>=0; i--) {
581         int actualBreak = t.bi.preceding(i);
582         int expectedBreak = BreakIterator.DONE;
583 
584         for (int j=i-1; j >= 0; j--) {
585             if (t.expectedBreaks[j] != 0) {
586                 expectedBreak = j;
587                 break;
588             }
589         }
590         if (expectedBreak != actualBreak) {
591             errln("preceding(" + i + ") incorrect.\n" +
592                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
593                     "    Expected, Actual= " + expectedBreak + ", " + actualBreak);
594         }
595     }
596 
597 }
598 
599 
600 
601 
602 }
603