• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Created on May 5, 2004
 *
 * Copyright (C) 2004-2015 International Business Machines Corporation and others.
 * All Rights Reserved.
 *
 */
8 package com.ibm.icu.dev.test.rbbi;
9 
10 import java.io.IOException;
11 import java.io.InputStream;
12 import java.io.InputStreamReader;
13 import java.util.Arrays;
14 
15 import com.ibm.icu.dev.test.TestFmwk;
16 import com.ibm.icu.impl.Utility;
17 import com.ibm.icu.lang.UCharacter;
18 import com.ibm.icu.text.BreakIterator;
19 import com.ibm.icu.text.UTF16;
20 import com.ibm.icu.util.ULocale;
21 
22 
/**
 * Rule-based break iterator data-driven test.
 *      Performs the tests from the file rbbitst.txt.
 *      The test data file is common to both ICU4C and ICU4J.
 *      See the data file for a description of the tests.
 */
30 public class RBBITestExtended extends TestFmwk {
31 
main(String[] args)32     public static void main(String[] args)throws Exception {
33         new RBBITestExtended().run(args);
34     }
35 
36 
RBBITestExtended()37 public RBBITestExtended() {
38     }
39 
40 
41 
42 static class TestParams {
43     BreakIterator   bi;
44     StringBuffer    dataToBreak    = new StringBuffer();
45     int[]           expectedBreaks = new int[1000];
46     int[]           srcLine        = new int[1000];
47     int[]           srcCol         = new int[1000];
48     ULocale         currentLocale  = new ULocale("en_US");
49 }
50 
51 
TestExtended()52 public void TestExtended() {
53     TestParams     tp = new TestParams();
54 
55 
56     //
57     //  Open and read the test data file.
58     //
59     StringBuffer testFileBuf = new StringBuffer();
60     InputStream is = null;
61     try {
62         is = RBBITestExtended.class.getResourceAsStream("rbbitst.txt");
63         if (is == null) {
64             errln("Could not open test data file rbbitst.txt");
65             return;
66         }
67         InputStreamReader isr = new InputStreamReader(is, "UTF-8");
68         try {
69             int c;
70             int count = 0;
71             for (;;) {
72                 c = isr.read();
73                 if (c < 0) {
74                     break;
75                 }
76                 count++;
77                 if (c == 0xFEFF && count == 1) {
78                     // BOM in the test data file. Discard it.
79                     continue;
80                 }
81 
82                 UTF16.append(testFileBuf, c);
83             }
84         } finally {
85             isr.close();
86         }
87     } catch (IOException e) {
88         errln(e.toString());
89         try {
90             is.close();
91         } catch (IOException ignored) {
92         }
93         return;
94     }
95 
96     String testString = testFileBuf.toString();
97 
98 
99     final int  PARSE_COMMENT = 1;
100     final int  PARSE_TAG     = 2;
101     final int  PARSE_DATA    = 3;
102     final int  PARSE_NUM     = 4;
103 
104     int parseState = PARSE_TAG;
105 
106     int savedState = PARSE_TAG;
107 
108     final char CH_LF        = 0x0a;
109     final char CH_CR        = 0x0d;
110     final char CH_HASH      = 0x23;
111     /*static const UChar CH_PERIOD    = 0x2e;*/
112     final char CH_LT        = 0x3c;
113     final char CH_GT        = 0x3e;
114     final char CH_BACKSLASH = 0x5c;
115     final char CH_BULLET    = 0x2022;
116 
117     int    lineNum  = 1;
118     int    colStart = 0;
119     int    column   = 0;
120     int    charIdx  = 0;
121     int    i;
122 
123     int    tagValue = 0;       // The numeric value of a <nnn> tag.
124     int    len = testString.length();
125 
126     for (charIdx = 0; charIdx < len; ) {
127         int  c = UTF16.charAt(testString, charIdx);
128         charIdx++;
129         if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
130             // treat CRLF as a unit
131             c = CH_LF;
132             charIdx++;
133         }
134         if (c == CH_LF || c == CH_CR) {
135             lineNum++;
136             colStart = charIdx;
137         }
138         column = charIdx - colStart + 1;
139 
140         switch (parseState) {
141         case PARSE_COMMENT:
142             if (c == 0x0a || c == 0x0d) {
143                 parseState = savedState;
144             }
145             break;
146 
147         case PARSE_TAG:
148             {
149             if (c == CH_HASH) {
150                 parseState = PARSE_COMMENT;
151                 savedState = PARSE_TAG;
152                 break;
153             }
154             if (UCharacter.isWhitespace(c)) {
155                 break;
156             }
157            if (testString.startsWith("<word>", charIdx-1)) {
158                 tp.bi = BreakIterator.getWordInstance(tp.currentLocale);
159                 charIdx += 5;
160                 break;
161             }
162             if (testString.startsWith("<char>", charIdx-1)) {
163                 tp.bi = BreakIterator.getCharacterInstance(tp.currentLocale);
164                 charIdx += 5;
165                 break;
166             }
167             if (testString.startsWith("<line>", charIdx-1)) {
168                 tp.bi = BreakIterator.getLineInstance(tp.currentLocale);
169                 charIdx += 5;
170                 break;
171             }
172             if (testString.startsWith("<sent>", charIdx-1)) {
173                 tp.bi = BreakIterator.getSentenceInstance(tp.currentLocale);
174                 charIdx += 5;
175                 break;
176             }
177             if (testString.startsWith("<title>", charIdx-1)) {
178                 tp.bi = BreakIterator.getTitleInstance(tp.currentLocale);
179                 charIdx += 6;
180                 break;
181             }
182             if (testString.startsWith("<locale ", charIdx-1)) {
183                 int closeIndex = testString.indexOf(">", charIdx);
184                 if (closeIndex < 0) {
185                     errln("line" + lineNum + ": missing close on <locale  tag.");
186                     break;
187                 }
188                 String localeName = testString.substring(charIdx+6, closeIndex);
189                 localeName = localeName.trim();
190                 tp.currentLocale = new ULocale(localeName);
191                 charIdx = closeIndex+1;
192                 break;
193             }
194             if (testString.startsWith("<data>", charIdx-1)) {
195                 parseState = PARSE_DATA;
196                 charIdx += 5;
197                 tp.dataToBreak.setLength(0);
198                 Arrays.fill(tp.expectedBreaks, 0);
199                 Arrays.fill(tp.srcCol, 0);
200                 Arrays.fill(tp.srcLine, 0);
201                 break;
202             }
203 
204             errln("line" + lineNum + ": Tag expected in test file.");
205             return;
206             //parseState = PARSE_COMMENT;
207             //savedState = PARSE_DATA;
208             }
209 
210         case PARSE_DATA:
211             if (c == CH_BULLET) {
212                 int  breakIdx = tp.dataToBreak.length();
213                 tp.expectedBreaks[breakIdx] = -1;
214                 tp.srcLine[breakIdx]        = lineNum;
215                 tp.srcCol[breakIdx]         = column;
216                 break;
217             }
218 
219             if (testString.startsWith("</data>", charIdx-1))  {
220                 // Add final entry to mappings from break location to source file position.
221                 //  Need one extra because last break position returned is after the
222                 //    last char in the data, not at the last char.
223                 int idx = tp.dataToBreak.length();
224                 tp.srcLine[idx] = lineNum;
225                 tp.srcCol[idx]  = column;
226 
227                 parseState = PARSE_TAG;
228                 charIdx += 6;
229 
230                 // RUN THE TEST!
231                 executeTest(tp);
232                 break;
233             }
234 
235            if (testString.startsWith("\\N{", charIdx-1)) {
236                int nameEndIdx = testString.indexOf('}', charIdx);
237                if (nameEndIdx == -1) {
238                    errln("Error in named character in test file at line " + lineNum +
239                            ", col " + column);
240                }
241                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
242                 // Get the code point from the name and insert it into the test data.
243                 String charName = testString.substring(charIdx+2, nameEndIdx);
244                 c = UCharacter.getCharFromName(charName);
245                 if (c == -1) {
246                     errln("Error in named character in test file at line " + lineNum +
247                             ", col " + column);
248                 } else {
249                     // Named code point was recognized.  Insert it
250                     //   into the test data.
251                     UTF16.append(tp.dataToBreak, c);
252                     for (i = tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
253                         tp.srcLine[i] = lineNum;
254                         tp.srcCol[i]  = column;
255                     }
256 
257                  }
258                 if (nameEndIdx > charIdx) {
259                     charIdx = nameEndIdx+1;
260                 }
261                 break;
262             }
263 
264             if (testString.startsWith("<>", charIdx-1)) {
265                 charIdx++;
266                 int  breakIdx = tp.dataToBreak.length();
267                 tp.expectedBreaks[breakIdx] = -1;
268                 tp.srcLine[breakIdx]        = lineNum;
269                 tp.srcCol[breakIdx]         = column;
270                 break;
271             }
272 
273             if (c == CH_LT) {
274                 tagValue   = 0;
275                 parseState = PARSE_NUM;
276                 break;
277             }
278 
279             if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
280                 parseState = PARSE_COMMENT;
281                 savedState = PARSE_DATA;
282                 break;
283             }
284 
285             if (c == CH_BACKSLASH) {
286                 // Check for \ at end of line, a line continuation.
287                 //     Advance over (discard) the newline
288                 int cp = UTF16.charAt(testString, charIdx);
289                 if (cp == CH_CR && charIdx<len && UTF16.charAt(testString, charIdx+1) == CH_LF) {
290                     // We have a CR LF
291                     //  Need an extra increment of the input ptr to move over both of them
292                     charIdx++;
293                 }
294                 if (cp == CH_LF || cp == CH_CR) {
295                     lineNum++;
296                     column   = 0;
297                     charIdx++;
298                     colStart = charIdx;
299                     break;
300                 }
301 
302                 // Let unescape handle the back slash.
303                 int  charIdxAr[] = new int[1];
304                 charIdxAr[0] = charIdx;
305                 cp = Utility.unescapeAt(testString, charIdxAr);
306                 if (cp != -1) {
307                     // Escape sequence was recognized.  Insert the char
308                     //   into the test data.
309                     charIdx = charIdxAr[0];
310                     UTF16.append(tp.dataToBreak, cp);
311                     for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
312                         tp.srcLine[i] = lineNum;
313                         tp.srcCol[i]  = column;
314                     }
315 
316                     break;
317                 }
318 
319 
320                 // Not a recognized backslash escape sequence.
321                 // Take the next char as a literal.
322                 //  TODO:  Should this be an error?
323                 c = UTF16.charAt(testString,charIdx);
324                 charIdx = UTF16.moveCodePointOffset(testString, charIdx, 1);
325              }
326 
327             // Normal, non-escaped data char.
328             UTF16.append(tp.dataToBreak, c);
329 
330             // Save the mapping from offset in the data to line/column numbers in
331             //   the original input file.  Will be used for better error messages only.
332             //   If there's an expected break before this char, the slot in the mapping
333             //     vector will already be set for this char; don't overwrite it.
334             for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
335                 tp.srcLine[i] = lineNum;
336                 tp.srcCol[i]  = column;
337             }
338             break;
339 
340 
341         case PARSE_NUM:
342             // We are parsing an expected numeric tag value, like <1234>,
343             //   within a chunk of data.
344             if (UCharacter.isWhitespace(c)) {
345                 break;
346             }
347 
348             if (c == CH_GT) {
349                 // Finished the number.  Add the info to the expected break data,
350                 //   and switch parse state back to doing plain data.
351                 parseState = PARSE_DATA;
352                 if (tagValue == 0) {
353                     tagValue = -1;
354                 }
355                 int  breakIdx = tp.dataToBreak.length();
356                 tp.expectedBreaks[breakIdx] = tagValue;
357                 tp.srcLine[breakIdx]        = lineNum;
358                 tp.srcCol[breakIdx]         = column;
359                 break;
360             }
361 
362             if (UCharacter.isDigit(c)) {
363                 tagValue = tagValue*10 + UCharacter.digit(c);
364                 break;
365             }
366 
367             errln("Syntax Error in test file at line "+ lineNum +", col %d" + column);
368             return;
369 
370             // parseState = PARSE_COMMENT;   // TODO: unreachable.  Don't stop on errors.
371             // break;
372         }
373 
374 
375 
376     }
377 }
378 
executeTest(TestParams t)379 void executeTest(TestParams t) {
380     int    bp;
381     int    prevBP;
382     int    i;
383 
384     if (t.bi == null) {
385         return;
386     }
387 
388     t.bi.setText(t.dataToBreak.toString());
389     //
390     //  Run the iterator forward
391     //
392     prevBP = -1;
393     for (bp = t.bi.first(); bp != BreakIterator.DONE; bp = t.bi.next()) {
394         if (prevBP ==  bp) {
395             // Fail for lack of forward progress.
396             errln("Forward Iteration, no forward progress.  Break Pos=" + bp +
397                     "  File line,col=" + t.srcLine[bp] + ", " + t.srcCol[bp]);
398             break;
399         }
400 
401         // Check that there were we didn't miss an expected break between the last one
402         //  and this one.
403         for (i=prevBP+1; i<bp; i++) {
404             if (t.expectedBreaks[i] != 0) {
405                 errln("Forward Iteration, break expected, but not found.  Pos=" + i +
406                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
407             }
408         }
409 
410         // Check that the break we did find was expected
411         if (t.expectedBreaks[bp] == 0) {
412             errln("Forward Iteration, break found, but not expected.  Pos=" + bp +
413                     "  File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
414         } else {
415             // The break was expected.
416             //   Check that the {nnn} tag value is correct.
417             int expectedTagVal = t.expectedBreaks[bp];
418             if (expectedTagVal == -1) {
419                 expectedTagVal = 0;
420             }
421             int line = t.srcLine[bp];
422             int rs = t.bi.getRuleStatus();
423             if (rs != expectedTagVal) {
424                 errln("Incorrect status for forward break.  Pos = " + bp +
425                         ".  File line,col = " + line + ", " + t.srcCol[bp] + "\n" +
426                       "          Actual, Expected status = " + rs + ", " + expectedTagVal);
427             }
428             int[] fillInArray = new int[4];
429             int numStatusVals = t.bi.getRuleStatusVec(fillInArray);
430             assertTrue("", numStatusVals >= 1);
431             assertEquals("", expectedTagVal, fillInArray[0]);
432         }
433 
434 
435         prevBP = bp;
436     }
437 
438     // Verify that there were no missed expected breaks after the last one found
439     for (i=prevBP+1; i<t.dataToBreak.length()+1; i++) {
440         if (t.expectedBreaks[i] != 0) {
441             errln("Forward Iteration, break expected, but not found.  Pos=" + i +
442                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
443        }
444     }
445 
446 
447     //
448     //  Run the iterator backwards, verify that the same breaks are found.
449     //
450     prevBP = t.dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
451     for (bp = t.bi.last(); bp != BreakIterator.DONE; bp = t.bi.previous()) {
452         if (prevBP ==  bp) {
453             // Fail for lack of progress.
454             errln("Reverse Iteration, no progress.  Break Pos=" + bp +
455                     "File line,col=" + t.srcLine[bp] + " " +  t.srcCol[bp]);
456             break;
457         }
458 
459         // Check that we didn't miss an expected break between the last one
460         //  and this one.  (UVector returns zeros for index out of bounds.)
461         for (i=prevBP-1; i>bp; i--) {
462             if (t.expectedBreaks[i] != 0) {
463                 errln("Reverse Itertion, break expected, but not found.  Pos=" + i +
464                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
465             }
466         }
467 
468         // Check that the break we did find was expected
469         if (t.expectedBreaks[bp] == 0) {
470             errln("Reverse Itertion, break found, but not expected.  Pos=" + bp +
471                     "  File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
472         } else {
473             // The break was expected.
474             //   Check that the {nnn} tag value is correct.
475             int expectedTagVal = t.expectedBreaks[bp];
476             if (expectedTagVal == -1) {
477                 expectedTagVal = 0;
478             }
479             int line = t.srcLine[bp];
480             int rs = t.bi.getRuleStatus();
481             if (rs != expectedTagVal) {
482                 errln("Incorrect status for reverse break.  Pos=  " + bp +
483                         "File line,col= " + line + ", " + t.srcCol[bp] + "\n" +
484                       "          Actual, Expected status = " + rs + ", " + expectedTagVal);
485             }
486         }
487 
488         prevBP = bp;
489     }
490 
491     // Verify that there were no missed breaks prior to the last one found
492     for (i=prevBP-1; i>=0; i--) {
493         if (t.expectedBreaks[i] != 0) {
494             errln("Reverse Itertion, break expected, but not found.  Pos=" + i +
495                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
496          }
497     }
498     // Check isBoundary()
499     for (i=0; i<=t.dataToBreak.length(); i++) {
500         boolean boundaryExpected = (t.expectedBreaks[i] != 0);
501         boolean boundaryFound    = t.bi.isBoundary(i);
502         if (boundaryExpected != boundaryFound) {
503             errln("isBoundary(" + i + ") incorrect.\n" +
504                   "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
505                   "    Expected, Actual= " + boundaryExpected + ", " + boundaryFound);
506         }
507     }
508 
509     // Check following()
510     for (i=0; i<=t.dataToBreak.length(); i++) {
511         int actualBreak = t.bi.following(i);
512         int expectedBreak = BreakIterator.DONE;
513         for (int j=i+1; j < t.expectedBreaks.length; j++) {
514             if (t.expectedBreaks[j] != 0) {
515                 expectedBreak = j;
516                 break;
517             }
518         }
519         if (expectedBreak != actualBreak) {
520             errln("following(" + i + ") incorrect.\n" +
521                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
522                     "    Expected, Actual= " + expectedBreak + ", " + actualBreak);
523         }
524     }
525 
526     // Check preceding()
527     for (i=t.dataToBreak.length(); i>=0; i--) {
528         int actualBreak = t.bi.preceding(i);
529         int expectedBreak = BreakIterator.DONE;
530 
531         for (int j=i-1; j >= 0; j--) {
532             if (t.expectedBreaks[j] != 0) {
533                 expectedBreak = j;
534                 break;
535             }
536         }
537         if (expectedBreak != actualBreak) {
538             errln("preceding(" + i + ") incorrect.\n" +
539                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
540                     "    Expected, Actual= " + expectedBreak + ", " + actualBreak);
541         }
542     }
543 
544 }
545 
546 
547 
548 
549 }
550