• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 package com.ibm.icu.dev.test.rbbi;
5 
6 import java.io.IOException;
7 import java.io.InputStream;
8 import java.io.InputStreamReader;
9 import java.util.ArrayList;
10 import java.util.Arrays;
11 import java.util.HashMap;
12 import java.util.List;
13 import java.util.Map;
14 import java.util.regex.Matcher;
15 import java.util.regex.Pattern;
16 import java.util.regex.PatternSyntaxException;
17 
18 import org.junit.Test;
19 import org.junit.runner.RunWith;
20 import org.junit.runners.JUnit4;
21 
22 import com.ibm.icu.dev.test.TestFmwk;
23 import com.ibm.icu.impl.UCharacterName;
24 import com.ibm.icu.impl.UCharacterNameChoice;
25 import com.ibm.icu.text.BreakIterator;
26 import com.ibm.icu.text.RuleBasedBreakIterator;
27 import com.ibm.icu.text.UnicodeSet;
28 import com.ibm.icu.util.ULocale;
29 
30 /**
31  * RBBI Monkey Test. Ported from ICU4C test/intltest/rbbimonkeytest.cpp.
32  * This is the newer, data driven monkey test. It is completely separate from the
33  * older class RBBITestMonkey.
34  */
35 
36 @RunWith(JUnit4.class)
37 public class RBBIMonkeyTest extends TestFmwk {
38 
39 
40     //  class CharClass    Represents a single character class from the source break rules.
41     //                     Inherits from UObject because instances are adopted by UHashtable, which ultimately
42     //                     deletes them using hash's object deleter function.
43 
44     static class CharClass  {
45         String         fName;
46         String         fOriginalDef;    // set definition as it appeared in user supplied rules.
47         String         fExpandedDef;    // set definition with any embedded named sets replaced by their defs, recursively.
48         UnicodeSet     fSet;
CharClass(String name, String originalDef, String expandedDef, UnicodeSet set)49         CharClass(String name, String originalDef, String expandedDef, UnicodeSet set) {
50             fName = name;
51             fOriginalDef = originalDef;
52             fExpandedDef = expandedDef;
53             fSet = set;
54         };
55     }
56 
57 
58     // class BreakRule    Struct-like class represents a single rule from a set of break rules.
59     //                    Each rule has the set definitions expanded, and
60     //                    is compiled to a regular expression.
61 
62     static class BreakRule {
63         String    fName;                      // Name of the rule.
64         String    fRule;                      // Rule expression, excluding the name, as written in user source.
65         String    fExpandedRule;              // Rule expression after expanding the set definitions.
66         Matcher   fRuleMatcher;               // Regular expression that matches the rule.
67         boolean   fInitialMatchOnly = false;  // True if rule begins with '^', meaning no chaining.
68     };
69 
70 
71     // class BreakRules    represents a complete set of break rules, possibly tailored,
72     //                     compiled from testdata break rules.
73 
74     static class BreakRules {
BreakRules(RBBIMonkeyImpl monkeyImpl)75         BreakRules(RBBIMonkeyImpl monkeyImpl) {
76             fMonkeyImpl = monkeyImpl;
77             fBreakRules = new ArrayList<>();
78             fType = BreakIterator.KIND_TITLE;
79             fCharClasses = new HashMap<>();
80             fCharClassList = new ArrayList<>();
81             fDictionarySet = new UnicodeSet();
82 
83             // Match an alpha-numeric identifier in a rule. Will be a set name.
84             // Use negative look-behind to exclude non-identifiers, mostly property names or values.
85             fSetRefsMatcher = Pattern.compile(
86                     "(?<!\\{[ \\t]{0,4})" +
87                     "(?<!=[ \\t]{0,4})" +
88                     "(?<!\\[:[ \\t]{0,4})" +
89                     "(?<!\\\\)" +
90                     "(?<![A-Za-z0-9_])" +
91                     "([A-Za-z_][A-Za-z0-9_]*)").     // The char class name
92                     matcher("");
93 
94             // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules.
95             fCommentsMatcher = Pattern.compile("" +
96                     "(^|(?<=;))"   +                // Start either at start of line, or just after a ';' (look-behind for ';')
97                     "[ \\t]*+"     +                //   Match white space.
98                     "(#.*)?+"      +                //   Optional # plus whatever follows
99                     "$").                           //   new-line at end of line.
100                     matcher("");
101 
102             // Match (initial parse) of a character class definition line.
103             fClassDefMatcher = Pattern.compile("" +
104                     "[ \\t]*"           +                    // leading white space
105                     "([A-Za-z_][A-Za-z0-9_]*)" +             // The char class name
106                     "[ \\t]*=[ \\t]*"   +                    //   =
107                     "(.*?)"  +                               // The char class UnicodeSet expression
108                     "[ \\t]*;$").                            // ; <end of line>
109                     matcher("");
110 
111             // Match (initial parse) of a break rule line.
112             fRuleDefMatcher = Pattern.compile("" +
113                     "[ \\t]*"           +                     // leading white space
114                     "([A-Za-z_][A-Za-z0-9_.]*)" +             // The rule name
115                     "[ \\t]*:[ \\t]*"   +                     //   :
116                     "(.*?)"   +                               // The rule definition
117                     "[ \\t]*;$").                             // ; <end of line>
118                     matcher("");
119 
120             // Match a property expression, either [:xxx:] or \p{...}
121             fPropertyMatcher = Pattern.compile("" +
122                     "\\[:.*?:]|\\\\(?:p|P)\\{.*?\\}").
123                     matcher("");
124 
125 
126         }
127 
128         /**
129          * Create the expanded definition for this char class,
130          * replacing any set references with the corresponding definition.
131          */
addCharClass(String name, String definition)132         CharClass  addCharClass(String name, String definition) {
133             StringBuffer expandedDef = new StringBuffer();
134             fSetRefsMatcher.reset(definition);
135             while (fSetRefsMatcher.find()) {
136                 String sname = fSetRefsMatcher.group(/*"ClassName"*/ 1);
137                 CharClass snameClass = fCharClasses.get(sname);
138                 String expansionForName = snameClass != null ? snameClass.fExpandedDef : sname;
139 
140                 fSetRefsMatcher.appendReplacement(expandedDef, "");
141                 expandedDef.append(expansionForName);
142             }
143             fSetRefsMatcher.appendTail(expandedDef);
144             String expandedDefString = expandedDef.toString();
145 
146             if (fMonkeyImpl.fDumpExpansions) {
147                 System.out.printf("addCharClass(\"%s\"\n", name);
148                 System.out.printf("             %s\n", definition);
149                 System.out.printf("expandedDef: %s\n", expandedDefString);
150             }
151 
152             // Verify that the expanded set definition is valid.
153 
154             UnicodeSet s;
155             try {
156                 s = new UnicodeSet(expandedDefString, UnicodeSet.IGNORE_SPACE);
157             } catch (java.lang.IllegalArgumentException e) {
158                 System.err.printf("%s: error %s creating UnicodeSet %s", fMonkeyImpl.fRuleFileName, e.toString(), name);
159                 throw e;
160             }
161 
162             // Get an expanded equivalent pattern from the UnicodeSet.
163             // This removes set difference operators, which would fail if passed through to Java regex.
164 
165             StringBuffer expandedPattern = new StringBuffer();
166             s._generatePattern(expandedPattern, true);
167             expandedDefString = expandedPattern.toString();
168             if (fMonkeyImpl.fDumpExpansions) {
169                 System.out.printf("expandedDef2: %s\n", expandedDefString);
170             }
171 
172             CharClass cclass = new CharClass(name, definition, expandedDefString, s);
173             CharClass previousClass = fCharClasses.put(name, cclass);
174 
175             if (previousClass != null) {
176                 // TODO: decide whether or not to allow redefinitions.
177                 //       Can be convenient in some cases.
178                 // String msg = String.format("%s: Redefinition of character class %s\n",
179                 //         fMonkeyImpl.fRuleFileName, cclass.fName);
180                 // System.err.println(msg);
181                 // throw new IllegalArgumentException(msg);
182             }
183             return cclass;
184 
185         };
186 
187 
addRule(String name, String definition)188         void addRule(String  name, String  definition) {
189             BreakRule  thisRule = new BreakRule();
190             StringBuffer expandedDefsRule = new StringBuffer();
191             thisRule.fName = name;
192             thisRule.fRule = definition;
193 
194             // Expand the char class definitions within the rule.
195             fSetRefsMatcher.reset(definition);
196             while (fSetRefsMatcher.find()) {
197                 String sname = fSetRefsMatcher.group(/*"ClassName"*/ 1);
198                 CharClass nameClass = fCharClasses.get(sname);
199                 if (nameClass == null) {
200                     System.err.printf("char class \"%s\" unrecognized in rule \"%s\"\n", sname, definition);
201                 }
202                 String expansionForName = nameClass != null ? nameClass.fExpandedDef : sname;
203                 fSetRefsMatcher.appendReplacement(expandedDefsRule, "");
204                 expandedDefsRule.append(expansionForName);
205             }
206             fSetRefsMatcher.appendTail(expandedDefsRule);
207 
208             // Replace any property expressions, \p{...} or [:...:] with an equivalent expansion,
209             // obtained from ICU UnicodeSet. Need to do this substitution because Java regex
210             // does not recognize all properties, and because Java's definitions are likely
211             // older than ICU's.
212 
213             StringBuffer expandedRule = new StringBuffer();
214             fPropertyMatcher.reset(expandedDefsRule);
215             while (fPropertyMatcher.find()) {
216                 String prop = fPropertyMatcher.group();
217                 UnicodeSet propSet = new UnicodeSet("[" + prop + "]");
218                 StringBuffer propExpansion = new StringBuffer();
219                 propSet._generatePattern(propExpansion, true);
220                 fPropertyMatcher.appendReplacement(expandedRule, propExpansion.toString());
221             }
222             fPropertyMatcher.appendTail(expandedRule);
223 
224             // If rule begins with a '^' rule chaining is disallowed.
225             // Strip off the '^' from the rule expression, and set the flag.
226             if (expandedRule.charAt(0) == '^') {
227                 thisRule.fInitialMatchOnly = true;
228                 expandedRule.deleteCharAt(0);
229                 expandedRule = new StringBuffer(expandedRule.toString().trim());
230             }
231 
232             //   Replace any [^negated sets] with equivalent flattened sets generated by
233             //   ICU UnicodeSet. [^ ...] in Java Regex character classes does not apply
234             //   to any nested classes. Variable substitution in rules produces
235             //   nested sets that [^negation] needs to apply to.
236 
237             StringBuffer ruleWithFlattenedSets = new StringBuffer();
238             int idx = 0;
239             while (idx<expandedRule.length()) {
240                 int setOpenPos = expandedRule.indexOf("[^", idx);
241                 if (setOpenPos < 0) {
242                     break;
243                 }
244                 if (setOpenPos > idx) {
245                     // Move anything from the source rule preceding the [^ into the processed rule, unchanged.
246                     ruleWithFlattenedSets.append(expandedRule.substring(idx,  setOpenPos));
247                 }
248                 int nestingLevel = 1;
249                 boolean haveNesting = false;
250                 int setClosePos;
251                 for (setClosePos = setOpenPos + 2; nestingLevel > 0 && setClosePos<expandedRule.length(); ++setClosePos) {
252                     char c = expandedRule.charAt(setClosePos);
253                     if (c == '\\') {
254                         ++setClosePos;
255                     } else if (c == '[') {
256                         ++nestingLevel;
257                         haveNesting = true;
258                     } else if (c == ']') {
259                         --nestingLevel;
260                     }
261                 }
262                 if (haveNesting && nestingLevel == 0) {
263                     // Found one, a negated set that includes interior nested sets.
264                     // Create an ICU UnicodeSet from the source pattern, and obtain an
265                     // equivalent flattened pattern from that.
266                     UnicodeSet uset = new UnicodeSet(expandedRule.substring(setOpenPos, setClosePos), true);
267                     uset._generatePattern(ruleWithFlattenedSets, true);
268                 } else {
269                     // The [^ set definition did not include any nested sets.
270                     // Copy the original definition without change.
271                     // Java regular expressions will handle it without needing to recast it.
272                     if (nestingLevel > 0) {
273                         // Error case of an unclosed character class expression.
274                         // Java regex will also eventually flag the error.
275                         System.err.printf("No closing ] found in rule %s\n", name);
276                     }
277                     ruleWithFlattenedSets.append(expandedRule.substring(setOpenPos, setClosePos));
278                 }
279                 idx = setClosePos;
280             }
281 
282             if (idx < expandedRule.length()) {
283                 ruleWithFlattenedSets.append(expandedRule.substring(idx, expandedRule.length()));
284             }
285 
286             thisRule.fExpandedRule = ruleWithFlattenedSets.toString();
287 
288             // Replace the divide sign (\u00f7) with a regular expression named capture.
289             // When running the rules, a match that includes this group means we found a break position.
290 
291             // thisRule.fExpandedRule = thisRule.fExpandedRule.replace("÷", "(?<BreakPosition>)");
292             thisRule.fExpandedRule = thisRule.fExpandedRule.replace("÷", "()");
293             if (thisRule.fExpandedRule.indexOf("÷") != -1) {
294                 String msg = String.format("%s Rule %s contains multiple ÷ signs", fMonkeyImpl.fRuleFileName, name);
295                 System.err.println(msg);
296                 throw new IllegalArgumentException(msg);
297             }
298 
299             // UAX break rule set definitions can be empty, just [].
300             // Regular expression set expressions don't accept this. Substitute with [a&&[^a]], which
301             // also matches nothing.
302 
303             thisRule.fExpandedRule = thisRule.fExpandedRule.replace("[]", "[a&&[^a]]");
304 
305             // Change Unicode escape syntax for compatibility with Java regular expressions
306             //    \udddd     => \x{dddd}
307             //    \U00hhhhhh => \x{hhhhhh}
308 
309              thisRule.fExpandedRule = thisRule.fExpandedRule.replaceAll("\\\\u([0-9A-Fa-f]{4})", "\\\\x{$1}");
310              thisRule.fExpandedRule = thisRule.fExpandedRule.replaceAll("\\\\U00([0-9A-Fa-f]{6})", "\\\\x{$1}");
311 
312             // Escape any literal '#' in the rule expression. Without escaping, these introduce a comment.
313             // UnicodeSet._generatePattern() inserts un-escaped "#"s
314 
315             thisRule.fExpandedRule = thisRule.fExpandedRule.replace("#", "\\#");
316             if (fMonkeyImpl.fDumpExpansions) {
317                 System.out.printf("fExpandedRule: %s\n", thisRule.fExpandedRule);
318             }
319 
320             // Compile a regular expression for this rule.
321 
322             try {
323                 thisRule.fRuleMatcher = Pattern.compile(thisRule.fExpandedRule, Pattern.COMMENTS | Pattern.DOTALL).matcher("");
324             } catch (PatternSyntaxException e) {
325                 System.err.printf("%s: Error creating regular expression for rule %s. Expansion is \n\"%s\"",
326                         fMonkeyImpl.fRuleFileName, name, thisRule.fExpandedRule);
327                 throw e;
328             }
329 
330             // Put this new rule into the vector of all Rules.
331 
332             fBreakRules.add(thisRule);
333         };
334 
335         @SuppressWarnings("unused")
hexToCodePoint(String hex)336         private static String hexToCodePoint(String hex) {
337             int cp = Integer.parseInt(hex, 16);
338             return new StringBuilder().appendCodePoint(cp).toString();
339         }
340 
341 
setKeywordParameter(String keyword, String value)342         boolean setKeywordParameter(String keyword, String value) {
343             if (keyword.equals("locale")) {
344                 fLocale = new ULocale(value);
345                 return true;
346             }
347             if (keyword.equals("type")) {
348                 if (value.equals("grapheme")) {
349                     fType = BreakIterator.KIND_CHARACTER;
350                 } else if (value.equals("word")) {
351                     fType = BreakIterator.KIND_WORD;
352                 } else if (value.equals("line")) {
353                     fType = BreakIterator.KIND_LINE;
354                 } else if (value.equals("sentence")) {
355                     fType = BreakIterator.KIND_SENTENCE;
356                 } else {
357                     String msg = String.format("%s: Unrecognized break type %s", fMonkeyImpl.fRuleFileName, value);
358                     System.err.println(msg);
359                     throw new IllegalArgumentException(msg);
360                 }
361                 return true;
362             }
363             return false;
364         }
365 
366 
createICUBreakIterator()367         RuleBasedBreakIterator createICUBreakIterator() {
368             BreakIterator bi;
369             switch(fType) {
370                 case BreakIterator.KIND_CHARACTER:
371                     bi = (BreakIterator.getCharacterInstance(fLocale));
372                     break;
373                 case BreakIterator.KIND_WORD:
374                     bi = (BreakIterator.getWordInstance(fLocale));
375                     break;
376                 case BreakIterator.KIND_LINE:
377                     bi = (BreakIterator.getLineInstance(fLocale));
378                     break;
379                 case BreakIterator.KIND_SENTENCE:
380                     bi = (BreakIterator.getSentenceInstance(fLocale));
381                     break;
382                 default:
383                     String msg = String.format("%s: Bad break iterator type of %d", fMonkeyImpl.fRuleFileName, fType);
384                     System.err.println(msg);
385                     throw new IllegalArgumentException(msg);
386             }
387             return (RuleBasedBreakIterator)bi;
388 
389         };
390 
391 
392 
compileRules(String rules)393         void compileRules(String rules) {
394             int lineNumber = 0;
395             for (String line: rules.split("\\r?\\n")) {
396                 ++lineNumber;
397                 // Strip comment lines.
398                 fCommentsMatcher.reset(line);
399                 line = fCommentsMatcher.replaceFirst("");
400                 if (line.isEmpty()) {
401                     continue;
402                 }
403 
404                 // Recognize character class definition and keyword lines
405                 fClassDefMatcher.reset(line);
406                 if (fClassDefMatcher.matches()) {
407                     String className = fClassDefMatcher.group(/*"ClassName"*/ 1);
408                     String classDef  = fClassDefMatcher.group(/*"ClassDef"*/ 2);
409                     if (fMonkeyImpl.fDumpExpansions) {
410                         System.out.printf("scanned class: %s = %s\n", className, classDef);
411                     }
412                     if (setKeywordParameter(className, classDef)) {
413                         // The scanned item was "type = ..." or "locale = ...", etc.
414                         //   which are not actual character classes.
415                         continue;
416                     }
417                     addCharClass(className, classDef);
418                     continue;
419                 }
420 
421                 // Recognize rule lines.
422                 fRuleDefMatcher.reset(line);
423                 if (fRuleDefMatcher.matches()) {
424                     String ruleName = fRuleDefMatcher.group(/*"RuleName"*/ 1);
425                     String ruleDef  = fRuleDefMatcher.group(/*"RuleDef"*/ 2);
426                     if (fMonkeyImpl.fDumpExpansions) {
427                         System.out.printf("scanned rule: %s : %s\n", ruleName, ruleDef);
428                     }
429                     addRule(ruleName, ruleDef);
430                     continue;
431                 }
432 
433                 String msg = String.format("Unrecognized line in rule file %s:%d \"%s\"",
434                         fMonkeyImpl.fRuleFileName, lineNumber, line);
435                 System.err.println(msg);
436                 throw new IllegalArgumentException(msg);
437             }
438 
439             // Build the vector of char classes, omitting the dictionary class if there is one.
440             // This will be used when constructing the random text to be tested.
441 
442             // Also compute the "other" set, consisting of any characters not included in
443             // one or more of the user defined sets.
444 
445             UnicodeSet otherSet = new UnicodeSet(0, 0x10ffff);
446 
447             for (Map.Entry<String, CharClass> el: fCharClasses.entrySet()) {
448                 String ccName = el.getKey();
449                 CharClass cclass = el.getValue();
450 
451                 // System.out.printf("    Adding %s\n", ccName);
452                 if (!ccName.equals(cclass.fName)) {
453                     throw new IllegalArgumentException(
454                             String.format("%s: internal error, set names (%s, %s) inconsistent.\n",
455                                     fMonkeyImpl.fRuleFileName, ccName, cclass.fName));
456                 }
457                 otherSet.removeAll(cclass.fSet);
458                 if (ccName.equals("dictionary")) {
459                     fDictionarySet = cclass.fSet;
460                 } else {
461                     fCharClassList.add(cclass);
462                 }
463             }
464 
465             if (!otherSet.isEmpty()) {
466                 // System.out.printf("have an other set.\n");
467                 CharClass cclass = addCharClass("__Others", otherSet.toPattern(true));
468                 fCharClassList.add(cclass);
469             }
470 
471         };
472 
getClassForChar(int c)473         CharClass getClassForChar(int c) {
474             for (CharClass cc: fCharClassList) {
475                 if (cc.fSet.contains(c)) {
476                     return cc;
477                 }
478             }
479             return null;
480         };
481 
482 
483         RBBIMonkeyImpl          fMonkeyImpl;        // Pointer back to the owning MonkeyImpl instance.
484         List<BreakRule>         fBreakRules;        // Contents are of type (BreakRule *).
485 
486         Map<String, CharClass>  fCharClasses;       // Key is the set name.
487         //                                          // Value is the corresponding CharClass
488         List<CharClass>         fCharClassList;     // Char Classes, same contents as fCharClasses values,
489 
490         UnicodeSet              fDictionarySet;     // Dictionary set, empty if none is defined.
491         ULocale                 fLocale;
492         int                     fType;              // BreakItererator.KIND_WORD, etc.
493 
494 
495         Matcher fSetRefsMatcher;
496         Matcher fCommentsMatcher;
497         Matcher fClassDefMatcher;
498         Matcher fRuleDefMatcher;
499         Matcher fPropertyMatcher;
500     };
501 
502 
503 
504 
505     // class MonkeyTestData    represents a randomly synthesized test data string together
506     //                         with the expected break positions obtained by applying
507     //                         the test break rules.
508 
509     static class MonkeyTestData{
510 
set(BreakRules rules, ICU_Rand rand)511         void set(BreakRules rules, ICU_Rand rand) {
512             int dataLength = 1000;   // length of test data to generate, in code points.
513 
514             // Fill the test string with random characters.
515             // First randomly pick a char class, then randomly pick a character from that class.
516             // Exclude any characters from the dictionary set.
517 
518             // System.out.println("Populating Test Data");
519             fRandomSeed = rand.getSeed();         // Save initial seed for use in error messages,
520                                                   // allowing recreation of failing data.
521             fBkRules = rules;
522             StringBuilder newString = new StringBuilder();
523             for (int n=0; n<dataLength;) {
524                 int charClassIndex = rand.next() % rules.fCharClassList.size();
525                 CharClass cclass = rules.fCharClassList.get(charClassIndex);
526                 if (cclass.fSet.size() == 0) {
527                     // Some rules or tailorings do end up with empty char classes.
528                     continue;
529                 }
530                 int charIndex = rand.next() % cclass.fSet.size();
531                 int c = cclass.fSet.charAt(charIndex);
532                 if (/*Character.isBmpCodePoint(c)*/ c<=0x0ffff && Character.isLowSurrogate((char)c) &&
533                         newString.length() > 0 && Character.isHighSurrogate(newString.charAt(newString.length()-1))) {
534                     // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control.
535                     // Don't let random unpaired surrogates combine in the test data because they might
536                     // produce an unwanted dictionary character.
537                     continue;
538                 }
539 
540                 if (!rules.fDictionarySet.contains(c)) {
541                     newString.appendCodePoint(c);
542                     ++n;
543                 }
544             }
545             fString = newString.toString();
546 
547             // Init the expectedBreaks, actualBreaks and ruleForPosition.
548             // Expected and Actual breaks are one longer than the input string; a true value
549             // will indicate a boundary preceding that position.
550 
551             fActualBreaks    = new boolean[fString.length()+1];
552             fExpectedBreaks  = new boolean[fString.length()+1];
553             fRuleForPosition = new int[fString.length()+1];
554             f2ndRuleForPos   = new int[fString.length()+1];
555 
556             int expectedBreakCount = 0;
557 
558             // Apply reference rules to find the expected breaks.
559 
560             fExpectedBreaks[0] = true;       // Force an expected break before the start of the text.
561                                              // ICU always reports a break there.
562                                              // The reference rules do not have a means to do so.
563             int strIdx = 0;
564             boolean initialMatch = true;     // True at start of text, and immediately after each boundary,
565             //                               // for control over rule chaining.
566 
567             while (strIdx < fString.length()) {
568                 BreakRule matchingRule = null;
569                 boolean hasBreak = false;
570                 int ruleNum = 0;
571                 int matchStart = 0;
572                 int matchEnd = 0;
573                 for (ruleNum=0; ruleNum<rules.fBreakRules.size(); ruleNum++) {
574                     BreakRule rule = rules.fBreakRules.get(ruleNum);
575                     if (rule.fInitialMatchOnly && !initialMatch) {
576                         // Skip checking this '^' rule. (No rule chaining)
577                         continue;
578                     }
579                     rule.fRuleMatcher.reset(fString.substring(strIdx));
580                     if (rule.fRuleMatcher.lookingAt()) {
581                         // A candidate rule match, check further to see if we take it or continue to check other rules.
582                         // Matches of zero or one code point count only if they also specify a break.
583                         matchStart = strIdx;
584                         matchEnd = strIdx + rule.fRuleMatcher.end();
585                         hasBreak = BreakGroupStart(rule.fRuleMatcher) >= 0;
586                         if (hasBreak ||
587                                 (matchStart < fString.length() && fString.offsetByCodePoints(matchStart, 1) < matchEnd)) {
588                             matchingRule = rule;
589                             break;
590                         }
591                     }
592                 }
593                 if (matchingRule == null) {
594                     // No reference rule matched. This is an error in the rules that should never happen.
595                     String msg = String.format("%s: No reference rules matched at position %d. ",
596                             rules.fMonkeyImpl.fRuleFileName, strIdx);
597                     System.err.println(msg);
598                     dump(strIdx);
599                     throw new IllegalArgumentException(msg);
600                 }
601                 if (matchingRule.fRuleMatcher.group().length() == 0) {
602                     // Zero length rule match. This is also an error in the rule expressions.
603                     String msg = String.format("%s:%s: Zero length rule match at %d.",
604                             rules.fMonkeyImpl.fRuleFileName, matchingRule.fName, strIdx);
605                     System.err.println(msg);
606                     dump(strIdx);
607                     throw new IllegalArgumentException(msg);
608                 }
609 
610                 // Record which rule matched over the length of the match.
611                 for (int i = matchStart; i < matchEnd; i++) {
612                     if (fRuleForPosition[i] == 0) {
613                         fRuleForPosition[i] = ruleNum;
614                     } else {
615                         f2ndRuleForPos[i] = ruleNum;
616                     }
617                 }
618 
619                 // Break positions appear in rules as a matching named capture of zero length at the break position,
620                 //   the adjusted pattern contains (?<BreakPosition>)
621                 if (hasBreak) {
622                     int breakPos = strIdx + BreakGroupStart(matchingRule.fRuleMatcher);
623                     fExpectedBreaks[breakPos] = true;
624                     expectedBreakCount++;
625                     // System.out.printf("recording break at %d\n", breakPos);
626                     // For the next iteration, pick up applying rules immediately after the break,
627                     // which may differ from end of the match. The matching rule may have included
628                     // context following the boundary that needs to be looked at again.
629                     strIdx = breakPos;
630                     initialMatch = true;
631                 } else {
632                     // Original rule didn't specify a break.
633                     // Continue applying rules starting on the last code point of this match.
634                     int updatedStrIdx = fString.offsetByCodePoints(matchEnd, -1);
635                     if (updatedStrIdx == matchStart) {
636                         // Match was only one code point, no progress if we continue.
637                         // Shouldn't get here, case is filtered out at top of loop.
638                         throw new IllegalArgumentException(String.format("%s: Rule %s internal error.",
639                                 rules.fMonkeyImpl.fRuleFileName, matchingRule.fName));
640                     }
641                     strIdx = updatedStrIdx;
642                     initialMatch = false;
643                 }
644             }
645             if (expectedBreakCount >= fString.length()) {
646                 throw new IllegalArgumentException(String.format("expectedBreakCount (%d) should be less than the test string length (%d).",
647                         expectedBreakCount, fString.length()));
648             }
649         };
650 
651         // Helper function to find the starting index of a match of the "BreakPosition" named capture group.
652         // @param m: a Java regex Matcher that has completed a matching operation.
653         // @return m.start("BreakPosition),
654         //         or -1 if there is no such group, or the group did not participate in the match.
655         //
656         // TODO: this becomes m.start("BreakPosition") with Java 8.
657         //       In the mean time, assume that the only zero-length capturing group in
658         //       a reference rule expression is the "BreakPosition" that corresponds to a "÷".
659 
BreakGroupStart(Matcher m)660         static int BreakGroupStart(Matcher m) {
661             for (int groupNum=1; groupNum <= m.groupCount(); ++groupNum) {
662                 String group = m.group(groupNum);
663                 if (group == null) {
664                     continue;
665                 }
666                 if (group.equals("")) {
667                     // assert(m.end(groupNum) == m.end("BreakPosition"));
668                     return m.start(groupNum);
669                 }
670             }
671             return -1;
672         }
673 
dump(int around)674         void dump(int around) {
675             System.out.print("\n"
676                     +        "         char                        break  Rule                     Character\n"
677                     +        "   pos   code   class                 R I   name                     name\n"
678                     +        "---------------------------------------------------------------------------------------------\n");
679 
680             int start;
681             int end;
682 
683             if (around == -1) {
684                 start = 0;
685                 end = fString.length();
686             } else {
687                 // Display context around a failure.
688                 try {
689                     start = fString.offsetByCodePoints(around, -30);
690                 } catch (Exception e) {
691                     start = 0;
692                 }
693                 try {
694                     end = fString.offsetByCodePoints(around, +30);
695                 } catch (Exception e) {
696                     end = fString.length();
697                 }
698             }
699 
700             for (int charIdx = start; charIdx < end; charIdx=fString.offsetByCodePoints(charIdx, 1)) {
701                 int c = fString.codePointAt(charIdx);
702                 CharClass cc = fBkRules.getClassForChar(c);
703 
704                 BreakRule rule = fBkRules.fBreakRules.get(fRuleForPosition[charIdx]);
705                 String secondRuleName = "";
706                 if (f2ndRuleForPos[charIdx] > 0) {
707                     secondRuleName = fBkRules.fBreakRules.get(f2ndRuleForPos[charIdx]).fName;
708                 }
709                 String cName = UCharacterName.INSTANCE.getName(c, UCharacterNameChoice.EXTENDED_CHAR_NAME);
710 
711                 System.out.printf("  %4d %6x   %-20s  %c %c   %-10s %-10s    %s\n",
712                         charIdx, c, cc.fName,
713                         fExpectedBreaks[charIdx] ? '*' : '.',
714                         fActualBreaks[charIdx] ? '*' : '.',
715                         rule.fName, secondRuleName, cName
716                         );
717                 }
718 
719         };
720 
clearActualBreaks()721         void clearActualBreaks() {
722             Arrays.fill(fActualBreaks, false);
723         }
724 
725 
726         int               fRandomSeed;        // The initial seed value from the random number generator.
727         BreakRules        fBkRules;           // The break rules used to generate this data.
728         String            fString;            // The text.
729         boolean           fExpectedBreaks[];  // Breaks as found by the reference rules.
730                                               //     Parallel to fString. true if break preceding.
731         boolean           fActualBreaks[];    // Breaks as found by ICU break iterator.
732         int               fRuleForPosition[]; // Index into BreakRules.fBreakRules of rule that applied at each position.
733                                               // Also parallel to fString.
734         int               f2ndRuleForPos[];   // As above. A 2nd rule applies when the preceding rule
735                                               //   didn't cause a break, and a subsequent rule match starts
736                                               //   on the last code point of the preceding match.
737 
738     }
739 
740 
741     // class RBBIMonkeyImpl     holds (some indirectly) everything associated with running a monkey
742     //                          test for one set of break rules.
743     //
744 
745     static class RBBIMonkeyImpl extends Thread {
746 
setup(String ruleFile)747         void setup(String ruleFile) {
748             fRuleFileName = ruleFile;
749             openBreakRules(ruleFile);
750             fRuleSet = new BreakRules(this);
751             fRuleSet.compileRules(fRuleCharBuffer);
752             fBI = fRuleSet.createICUBreakIterator();
753             fTestData = new MonkeyTestData();
754         };
755 
openBreakRules(String fileName)756         void openBreakRules(String fileName) {
757             StringBuilder testFileBuf = new StringBuilder();
758             InputStream is = null;
759             String filePath = "break_rules/" + fileName;
760             try {
761                 is = RBBIMonkeyImpl.class.getResourceAsStream(filePath);
762                 if (is == null) {
763                     errln("Could not open test data file " + fileName);
764                     return;
765                 }
766                 InputStreamReader isr = new InputStreamReader(is, "UTF-8");
767                 try {
768                     int c;
769                     int count = 0;
770                     for (;;) {
771                         c = isr.read();
772                         if (c < 0) {
773                             break;
774                         }
775                         count++;
776                         if (c == 0xFEFF && count == 1) {
777                             // BOM in the test data file. Discard it.
778                             continue;
779                         }
780                        testFileBuf.appendCodePoint(c);
781                     }
782                 } finally {
783                     isr.close();
784                 }
785                 } catch (IOException e) {
786                 try {
787                     is.close();
788                 } catch (IOException ignored) {
789                 }
790                 errln(e.toString());
791             }
792             fRuleCharBuffer =  testFileBuf.toString();  /* the file as a String */
793         }
794 
795         class MonkeyException extends RuntimeException  {
796             private static final long serialVersionUID = 1L;
797             public int fPosition;    // Position of the failure in the test data.
MonkeyException(String description, int pos)798             MonkeyException(String description, int pos) {
799                 super(description);
800                 fPosition = pos;
801             }
802         }
803 
804         @Override
run()805         public void run() {
806             int errorCount = 0;
807             if (fBI == null) {
808                 fErrorMsgs.append("Unable to run test because fBI is null.\n");
809                 return;
810             }
811             for (long loopCount = 0; fLoopCount < 0 || loopCount < fLoopCount; loopCount++) {
812                 try {
813                     fTestData.set(fRuleSet, fRandomGenerator);
814                     // fTestData.dump(-1);
815                     testForwards();
816                     testPrevious();
817                     testFollowing();
818                     testPreceding();
819                     testIsBoundary();
820                 } catch (MonkeyException e) {
821                     String formattedMsg = String.format(
822                             "%s at index %d. VM Arguments to reproduce: -Drules=%s -Dseed=%d -Dloop=1 -Dverbose=1 \"\n",
823                             e.getMessage(), e.fPosition, fRuleFileName, fTestData.fRandomSeed);
824                     System.err.print(formattedMsg);
825                     if (fVerbose) {
826                         fTestData.dump(e.fPosition);
827                     }
828                     fErrorMsgs.append(formattedMsg);
829                     if (++errorCount > 10) {
830                         return;
831                     }
832                 }
833                 if (fLoopCount < 0 && loopCount % 100 == 0) {
834                     System.err.print(".");
835                 }
836             }
837         }
838 
839         enum CheckDirection {
840             FORWARD,
841             REVERSE
842         };
843 
testForwards()844         void testForwards() {
845             fTestData.clearActualBreaks();
846             fBI.setText(fTestData.fString);
847             int previousBreak = -2;
848             for (int bk=fBI.first(); bk != BreakIterator.DONE; bk=fBI.next()) {
849                 if (bk <= previousBreak) {
850                     throw new MonkeyException("Break Iterator Stall", bk);
851                 }
852                 if (bk < 0 || bk > fTestData.fString.length()) {
853                     throw new MonkeyException("Boundary out of bounds", bk);
854                 }
855                 fTestData.fActualBreaks[bk] = true;
856             }
857             checkResults("testForwards", CheckDirection.FORWARD);
858         };
859 
860 
testFollowing()861        void testFollowing() {
862            fTestData.clearActualBreaks();
863            fBI.setText(fTestData.fString);
864            int nextBreak = -1;
865            for (int i=-1 ; i<fTestData.fString.length(); ++i) {
866                int bk = fBI.following(i);
867                if (bk == BreakIterator.DONE && i == fTestData.fString.length()) {
868                    continue;
869                }
870                if (bk == nextBreak && bk > i) {
871                    // i is in the gap between two breaks.
872                    continue;
873                }
874                if (i == nextBreak && bk > nextBreak) {
875                    fTestData.fActualBreaks[bk] = true;
876                    nextBreak = bk;
877                    continue;
878                }
879                throw new MonkeyException("following(i)", i);
880            }
881            checkResults("testFollowing", CheckDirection.FORWARD);
882         };
883 
884 
testPrevious()885         void testPrevious() {
886             fTestData.clearActualBreaks();
887             fBI.setText(fTestData.fString);
888             int previousBreak = Integer.MAX_VALUE;
889             for (int bk=fBI.last(); bk != BreakIterator.DONE; bk=fBI.previous()) {
890                  if (bk >= previousBreak) {
891                      throw new MonkeyException("Break Iterator Stall", bk);
892                 }
893                 if (bk < 0 || bk > fTestData.fString.length()) {
894                     throw new MonkeyException("Boundary out of bounds", bk);
895                 }
896                 fTestData.fActualBreaks[bk] = true;
897             }
898             checkResults("testPrevius", CheckDirection.REVERSE);
899         };
900 
901 
902         /**
903          * Given an index into a string, if it refers to the trail surrogate of a surrogate pair,
904          * adjust it to point to the lead surrogate, which is the start of the code point.
905          * @param s the String.
906          * @param i the initial index
907          * @return the adjusted index
908          */
getChar32Start(String s, int i)909         private int getChar32Start(String s, int i) {
910             if (i > 0 && i < s.length() &&
911                     Character.isLowSurrogate(s.charAt(i)) && Character.isHighSurrogate(s.charAt(i-1))) {
912                 --i;
913             }
914             return i;
915         }
916 
917 
testPreceding()918         void testPreceding() {
919             fTestData.clearActualBreaks();
920             fBI.setText(fTestData.fString);
921             int nextBreak = fTestData.fString.length()+1;
922             for (int i=fTestData.fString.length()+1 ; i>=0; --i) {
923                 int bk = fBI.preceding(i);
924                 // System.err.printf("testPreceding() i:%d  bk:%d  nextBreak:%d\n", i, bk, nextBreak);
925                 if (bk == BreakIterator.DONE && i == 0) {
926                     continue;
927                 }
928                 if (bk == nextBreak && bk < i) {
929                     // i is in the gap between two breaks.
930                     continue;
931                 }
932                 if (i<fTestData.fString.length() && getChar32Start(fTestData.fString, i) < i) {
933                     // i indexes to a trailing surrogate.
934                     // Break Iterators treat an index to either half as referring to the supplemental code point,
935                     // with preceding going to some preceding code point.
936                     if (fBI.preceding(i) != fBI.preceding(getChar32Start(fTestData.fString, i))) {
937                         throw new MonkeyException("preceding of trailing surrogate error", i);
938                     }
939                     continue;
940                 }
941                 if (i == nextBreak && bk < nextBreak) {
942                     fTestData.fActualBreaks[bk] = true;
943                     nextBreak = bk;
944                     continue;
945                 }
946                 throw new MonkeyException("preceding(i)", i);
947             }
948             checkResults("testPreceding", CheckDirection.REVERSE);
949 
950         };
951 
952 
testIsBoundary()953         void testIsBoundary() {
954             fTestData.clearActualBreaks();
955             fBI.setText(fTestData.fString);
956             for (int i=fTestData.fString.length(); i>=0; --i) {
957                 if (fBI.isBoundary(i)) {
958                     fTestData.fActualBreaks[i] = true;
959                 }
960             }
961             checkResults("testForwards", CheckDirection.FORWARD);
962         };
963 
964 
checkResults(String msg, CheckDirection direction)965         void checkResults(String msg, CheckDirection direction) {
966             if (direction == CheckDirection.FORWARD) {
967                 for (int i=0; i<=fTestData.fString.length(); ++i) {
968                     if (fTestData.fExpectedBreaks[i] != fTestData.fActualBreaks[i]) {
969                         throw new MonkeyException(msg, i);
970                     }
971                 }
972             } else {
973                 for (int i=fTestData.fString.length(); i>=0; i--) {
974                     if (fTestData.fExpectedBreaks[i] != fTestData.fActualBreaks[i]) {
975                         throw new MonkeyException(msg, i);
976                     }
977                 }
978             }
979 
980         };
981 
982         String                 fRuleCharBuffer;         // source file contents of the reference rules.
983         BreakRules             fRuleSet;
984         RuleBasedBreakIterator fBI;
985         MonkeyTestData         fTestData;
986         ICU_Rand               fRandomGenerator;
987         String                 fRuleFileName;
988         boolean                fVerbose;                 // True to do long dump of failing data.
989         int                    fLoopCount;
990         int                    fErrorCount;
991 
992         boolean                fDumpExpansions;          // Debug flag to output expanded form of rules and sets.
993         StringBuilder          fErrorMsgs = new StringBuilder();
994 
995     }
996 
997     //  Test parameters, specified via Java properties.
998     //
999     //  rules=file_name   Name of file containing the reference rules.
1000     //  seed=nnnnn        Random number starting seed.
1001     //                    Setting the seed allows errors to be reproduced.
1002     //  loop=nnn          Looping count.  Controls running time.
1003     //                    -1:  run forever.
1004     //                     0 or greater:  run length.
1005     //  expansions        debug option, show expansions of rules and sets.
1006     //  verbose           Display details of the failure.
1007     //
1008     // Parameters are passed to the JVM on the command line, or
1009     // via the Eclipse Run Configuration settings, arguments tab, VM parameters.
1010     // For example,
1011     //      -ea -Drules=line.txt -Dloop=-1
1012     //
1013     @Test
TestMonkey()1014     public void TestMonkey() {
1015         String tests[] = {"grapheme.txt", "word.txt", "line.txt", "line_cj.txt", "sentence.txt", "line_normal.txt",
1016                 "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt"
1017         };
1018 
1019         String testNameFromParams = getProperty("rules");
1020 
1021         if (testNameFromParams != null) {
1022             tests = new String[] {testNameFromParams};
1023         }
1024 
1025         int loopCount = getIntProperty("loop", isQuick() ? 100 : 5000);
1026         boolean dumpExpansions =  getBooleanProperty("expansions", false);
1027         boolean verbose = getBooleanProperty("verbose", false);
1028         int seed = getIntProperty("seed", 1);
1029 
1030         List<RBBIMonkeyImpl> startedTests = new ArrayList<>();
1031 
1032         // Monkey testing is multi-threaded.
1033         // Each set of break rules to be tested is run in a separate thread.
1034         // Each thread/set of rules gets a separate RBBIMonkeyImpl object.
1035 
1036         for (String testName: tests) {
1037             logln(String.format("beginning testing of %s", testName));
1038 
1039             RBBIMonkeyImpl test = new RBBIMonkeyImpl();
1040 
1041             test.fDumpExpansions = dumpExpansions;
1042             test.fVerbose = verbose;
1043             test.fRandomGenerator = new ICU_Rand(seed);
1044             test.fLoopCount = loopCount;
1045             test.setup(testName);
1046 
1047             test.start();
1048             startedTests.add(test);
1049         }
1050 
1051         StringBuilder errors = new StringBuilder();
1052         for (RBBIMonkeyImpl test: startedTests) {
1053             try {
1054                 test.join();
1055                 errors.append(test.fErrorMsgs);
1056             } catch (InterruptedException e) {
1057                 errors.append(e + "\n");
1058             }
1059         }
1060         String errorMsgs = errors.toString();
1061         assertEquals(errorMsgs, "", errorMsgs);
1062 
1063     }
1064 
1065 
1066 }
1067