1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 package com.ibm.icu.dev.test.rbbi; 5 6 import java.io.IOException; 7 import java.io.InputStream; 8 import java.io.InputStreamReader; 9 import java.util.ArrayList; 10 import java.util.Arrays; 11 import java.util.HashMap; 12 import java.util.List; 13 import java.util.Map; 14 import java.util.regex.Matcher; 15 import java.util.regex.Pattern; 16 import java.util.regex.PatternSyntaxException; 17 18 import org.junit.Test; 19 import org.junit.runner.RunWith; 20 import org.junit.runners.JUnit4; 21 22 import com.ibm.icu.dev.test.TestFmwk; 23 import com.ibm.icu.impl.UCharacterName; 24 import com.ibm.icu.impl.UCharacterNameChoice; 25 import com.ibm.icu.text.BreakIterator; 26 import com.ibm.icu.text.RuleBasedBreakIterator; 27 import com.ibm.icu.text.UnicodeSet; 28 import com.ibm.icu.util.ULocale; 29 30 /** 31 * RBBI Monkey Test. Ported from ICU4C test/intltest/rbbimonkeytest.cpp. 32 * This is the newer, data driven monkey test. It is completely separate from the 33 * older class RBBITestMonkey. 34 */ 35 36 @RunWith(JUnit4.class) 37 public class RBBIMonkeyTest extends TestFmwk { 38 39 40 // class CharClass Represents a single character class from the source break rules. 41 // Inherits from UObject because instances are adopted by UHashtable, which ultimately 42 // deletes them using hash's object deleter function. 43 44 static class CharClass { 45 String fName; 46 String fOriginalDef; // set definition as it appeared in user supplied rules. 47 String fExpandedDef; // set definition with any embedded named sets replaced by their defs, recursively. 48 UnicodeSet fSet; CharClass(String name, String originalDef, String expandedDef, UnicodeSet set)49 CharClass(String name, String originalDef, String expandedDef, UnicodeSet set) { 50 fName = name; 51 fOriginalDef = originalDef; 52 fExpandedDef = expandedDef; 53 fSet = set; 54 }; 55 } 56 57 58 // class BreakRule Struct-like class represents a single rule from a set of break rules. 59 // Each rule has the set definitions expanded, and 60 // is compiled to a regular expression. 61 62 static class BreakRule { 63 String fName; // Name of the rule. 64 String fRule; // Rule expression, excluding the name, as written in user source. 65 String fExpandedRule; // Rule expression after expanding the set definitions. 66 Matcher fRuleMatcher; // Regular expression that matches the rule. 67 boolean fInitialMatchOnly = false; // True if rule begins with '^', meaning no chaining. 68 }; 69 70 71 // class BreakRules represents a complete set of break rules, possibly tailored, 72 // compiled from testdata break rules. 73 74 static class BreakRules { BreakRules(RBBIMonkeyImpl monkeyImpl)75 BreakRules(RBBIMonkeyImpl monkeyImpl) { 76 fMonkeyImpl = monkeyImpl; 77 fBreakRules = new ArrayList<>(); 78 fType = BreakIterator.KIND_TITLE; 79 fCharClasses = new HashMap<>(); 80 fCharClassList = new ArrayList<>(); 81 fDictionarySet = new UnicodeSet(); 82 83 // Match an alpha-numeric identifier in a rule. Will be a set name. 84 // Use negative look-behind to exclude non-identifiers, mostly property names or values. 85 fSetRefsMatcher = Pattern.compile( 86 "(?<!\\{[ \\t]{0,4})" + 87 "(?<!=[ \\t]{0,4})" + 88 "(?<!\\[:[ \\t]{0,4})" + 89 "(?<!\\\\)" + 90 "(?<![A-Za-z0-9_])" + 91 "([A-Za-z_][A-Za-z0-9_]*)"). // The char class name 92 matcher(""); 93 94 // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules. 95 fCommentsMatcher = Pattern.compile("" + 96 "(^|(?<=;))" + // Start either at start of line, or just after a ';' (look-behind for ';') 97 "[ \\t]*+" + // Match white space. 98 "(#.*)?+" + // Optional # plus whatever follows 99 "$"). // new-line at end of line. 100 matcher(""); 101 102 // Match (initial parse) of a character class definition line. 103 fClassDefMatcher = Pattern.compile("" + 104 "[ \\t]*" + // leading white space 105 "([A-Za-z_][A-Za-z0-9_]*)" + // The char class name 106 "[ \\t]*=[ \\t]*" + // = 107 "(.*?)" + // The char class UnicodeSet expression 108 "[ \\t]*;$"). // ; <end of line> 109 matcher(""); 110 111 // Match (initial parse) of a break rule line. 112 fRuleDefMatcher = Pattern.compile("" + 113 "[ \\t]*" + // leading white space 114 "([A-Za-z_][A-Za-z0-9_.]*)" + // The rule name 115 "[ \\t]*:[ \\t]*" + // : 116 "(.*?)" + // The rule definition 117 "[ \\t]*;$"). // ; <end of line> 118 matcher(""); 119 120 // Match a property expression, either [:xxx:] or \p{...} 121 fPropertyMatcher = Pattern.compile("" + 122 "\\[:.*?:]|\\\\(?:p|P)\\{.*?\\}"). 123 matcher(""); 124 125 126 } 127 128 /** 129 * Create the expanded definition for this char class, 130 * replacing any set references with the corresponding definition. 131 */ addCharClass(String name, String definition)132 CharClass addCharClass(String name, String definition) { 133 StringBuffer expandedDef = new StringBuffer(); 134 fSetRefsMatcher.reset(definition); 135 while (fSetRefsMatcher.find()) { 136 String sname = fSetRefsMatcher.group(/*"ClassName"*/ 1); 137 CharClass snameClass = fCharClasses.get(sname); 138 String expansionForName = snameClass != null ? snameClass.fExpandedDef : sname; 139 140 fSetRefsMatcher.appendReplacement(expandedDef, ""); 141 expandedDef.append(expansionForName); 142 } 143 fSetRefsMatcher.appendTail(expandedDef); 144 String expandedDefString = expandedDef.toString(); 145 146 if (fMonkeyImpl.fDumpExpansions) { 147 System.out.printf("addCharClass(\"%s\"\n", name); 148 System.out.printf(" %s\n", definition); 149 System.out.printf("expandedDef: %s\n", expandedDefString); 150 } 151 152 // Verify that the expanded set definition is valid. 153 154 UnicodeSet s; 155 try { 156 s = new UnicodeSet(expandedDefString, UnicodeSet.IGNORE_SPACE); 157 } catch (java.lang.IllegalArgumentException e) { 158 System.err.printf("%s: error %s creating UnicodeSet %s", fMonkeyImpl.fRuleFileName, e.toString(), name); 159 throw e; 160 } 161 162 // Get an expanded equivalent pattern from the UnicodeSet. 163 // This removes set difference operators, which would fail if passed through to Java regex. 164 165 StringBuffer expandedPattern = new StringBuffer(); 166 s._generatePattern(expandedPattern, true); 167 expandedDefString = expandedPattern.toString(); 168 if (fMonkeyImpl.fDumpExpansions) { 169 System.out.printf("expandedDef2: %s\n", expandedDefString); 170 } 171 172 CharClass cclass = new CharClass(name, definition, expandedDefString, s); 173 CharClass previousClass = fCharClasses.put(name, cclass); 174 175 if (previousClass != null) { 176 // TODO: decide whether or not to allow redefinitions. 177 // Can be convenient in some cases. 178 // String msg = String.format("%s: Redefinition of character class %s\n", 179 // fMonkeyImpl.fRuleFileName, cclass.fName); 180 // System.err.println(msg); 181 // throw new IllegalArgumentException(msg); 182 } 183 return cclass; 184 185 }; 186 187 addRule(String name, String definition)188 void addRule(String name, String definition) { 189 BreakRule thisRule = new BreakRule(); 190 StringBuffer expandedDefsRule = new StringBuffer(); 191 thisRule.fName = name; 192 thisRule.fRule = definition; 193 194 // Expand the char class definitions within the rule. 195 fSetRefsMatcher.reset(definition); 196 while (fSetRefsMatcher.find()) { 197 String sname = fSetRefsMatcher.group(/*"ClassName"*/ 1); 198 CharClass nameClass = fCharClasses.get(sname); 199 if (nameClass == null) { 200 System.err.printf("char class \"%s\" unrecognized in rule \"%s\"\n", sname, definition); 201 } 202 String expansionForName = nameClass != null ? nameClass.fExpandedDef : sname; 203 fSetRefsMatcher.appendReplacement(expandedDefsRule, ""); 204 expandedDefsRule.append(expansionForName); 205 } 206 fSetRefsMatcher.appendTail(expandedDefsRule); 207 208 // Replace any property expressions, \p{...} or [:...:] with an equivalent expansion, 209 // obtained from ICU UnicodeSet. Need to do this substitution because Java regex 210 // does not recognize all properties, and because Java's definitions are likely 211 // older than ICU's. 212 213 StringBuffer expandedRule = new StringBuffer(); 214 fPropertyMatcher.reset(expandedDefsRule); 215 while (fPropertyMatcher.find()) { 216 String prop = fPropertyMatcher.group(); 217 UnicodeSet propSet = new UnicodeSet("[" + prop + "]"); 218 StringBuffer propExpansion = new StringBuffer(); 219 propSet._generatePattern(propExpansion, true); 220 fPropertyMatcher.appendReplacement(expandedRule, propExpansion.toString()); 221 } 222 fPropertyMatcher.appendTail(expandedRule); 223 224 // If rule begins with a '^' rule chaining is disallowed. 225 // Strip off the '^' from the rule expression, and set the flag. 226 if (expandedRule.charAt(0) == '^') { 227 thisRule.fInitialMatchOnly = true; 228 expandedRule.deleteCharAt(0); 229 expandedRule = new StringBuffer(expandedRule.toString().trim()); 230 } 231 232 // Replace any [^negated sets] with equivalent flattened sets generated by 233 // ICU UnicodeSet. [^ ...] in Java Regex character classes does not apply 234 // to any nested classes. Variable substitution in rules produces 235 // nested sets that [^negation] needs to apply to. 236 237 StringBuffer ruleWithFlattenedSets = new StringBuffer(); 238 int idx = 0; 239 while (idx<expandedRule.length()) { 240 int setOpenPos = expandedRule.indexOf("[^", idx); 241 if (setOpenPos < 0) { 242 break; 243 } 244 if (setOpenPos > idx) { 245 // Move anything from the source rule preceding the [^ into the processed rule, unchanged. 246 ruleWithFlattenedSets.append(expandedRule.substring(idx, setOpenPos)); 247 } 248 int nestingLevel = 1; 249 boolean haveNesting = false; 250 int setClosePos; 251 for (setClosePos = setOpenPos + 2; nestingLevel > 0 && setClosePos<expandedRule.length(); ++setClosePos) { 252 char c = expandedRule.charAt(setClosePos); 253 if (c == '\\') { 254 ++setClosePos; 255 } else if (c == '[') { 256 ++nestingLevel; 257 haveNesting = true; 258 } else if (c == ']') { 259 --nestingLevel; 260 } 261 } 262 if (haveNesting && nestingLevel == 0) { 263 // Found one, a negated set that includes interior nested sets. 264 // Create an ICU UnicodeSet from the source pattern, and obtain an 265 // equivalent flattened pattern from that. 266 UnicodeSet uset = new UnicodeSet(expandedRule.substring(setOpenPos, setClosePos), true); 267 uset._generatePattern(ruleWithFlattenedSets, true); 268 } else { 269 // The [^ set definition did not include any nested sets. 270 // Copy the original definition without change. 271 // Java regular expressions will handle it without needing to recast it. 272 if (nestingLevel > 0) { 273 // Error case of an unclosed character class expression. 274 // Java regex will also eventually flag the error. 275 System.err.printf("No closing ] found in rule %s\n", name); 276 } 277 ruleWithFlattenedSets.append(expandedRule.substring(setOpenPos, setClosePos)); 278 } 279 idx = setClosePos; 280 } 281 282 if (idx < expandedRule.length()) { 283 ruleWithFlattenedSets.append(expandedRule.substring(idx, expandedRule.length())); 284 } 285 286 thisRule.fExpandedRule = ruleWithFlattenedSets.toString(); 287 288 // Replace the divide sign (\u00f7) with a regular expression named capture. 289 // When running the rules, a match that includes this group means we found a break position. 290 291 // thisRule.fExpandedRule = thisRule.fExpandedRule.replace("÷", "(?<BreakPosition>)"); 292 thisRule.fExpandedRule = thisRule.fExpandedRule.replace("÷", "()"); 293 if (thisRule.fExpandedRule.indexOf("÷") != -1) { 294 String msg = String.format("%s Rule %s contains multiple ÷ signs", fMonkeyImpl.fRuleFileName, name); 295 System.err.println(msg); 296 throw new IllegalArgumentException(msg); 297 } 298 299 // UAX break rule set definitions can be empty, just []. 300 // Regular expression set expressions don't accept this. Substitute with [a&&[^a]], which 301 // also matches nothing. 302 303 thisRule.fExpandedRule = thisRule.fExpandedRule.replace("[]", "[a&&[^a]]"); 304 305 // Change Unicode escape syntax for compatibility with Java regular expressions 306 // \udddd => \x{dddd} 307 // \U00hhhhhh => \x{hhhhhh} 308 309 thisRule.fExpandedRule = thisRule.fExpandedRule.replaceAll("\\\\u([0-9A-Fa-f]{4})", "\\\\x{$1}"); 310 thisRule.fExpandedRule = thisRule.fExpandedRule.replaceAll("\\\\U00([0-9A-Fa-f]{6})", "\\\\x{$1}"); 311 312 // Escape any literal '#' in the rule expression. Without escaping, these introduce a comment. 313 // UnicodeSet._generatePattern() inserts un-escaped "#"s 314 315 thisRule.fExpandedRule = thisRule.fExpandedRule.replace("#", "\\#"); 316 if (fMonkeyImpl.fDumpExpansions) { 317 System.out.printf("fExpandedRule: %s\n", thisRule.fExpandedRule); 318 } 319 320 // Compile a regular expression for this rule. 321 322 try { 323 thisRule.fRuleMatcher = Pattern.compile(thisRule.fExpandedRule, Pattern.COMMENTS | Pattern.DOTALL).matcher(""); 324 } catch (PatternSyntaxException e) { 325 System.err.printf("%s: Error creating regular expression for rule %s. Expansion is \n\"%s\"", 326 fMonkeyImpl.fRuleFileName, name, thisRule.fExpandedRule); 327 throw e; 328 } 329 330 // Put this new rule into the vector of all Rules. 331 332 fBreakRules.add(thisRule); 333 }; 334 335 @SuppressWarnings("unused") hexToCodePoint(String hex)336 private static String hexToCodePoint(String hex) { 337 int cp = Integer.parseInt(hex, 16); 338 return new StringBuilder().appendCodePoint(cp).toString(); 339 } 340 341 setKeywordParameter(String keyword, String value)342 boolean setKeywordParameter(String keyword, String value) { 343 if (keyword.equals("locale")) { 344 fLocale = new ULocale(value); 345 return true; 346 } 347 if (keyword.equals("type")) { 348 if (value.equals("grapheme")) { 349 fType = BreakIterator.KIND_CHARACTER; 350 } else if (value.equals("word")) { 351 fType = BreakIterator.KIND_WORD; 352 } else if (value.equals("line")) { 353 fType = BreakIterator.KIND_LINE; 354 } else if (value.equals("sentence")) { 355 fType = BreakIterator.KIND_SENTENCE; 356 } else { 357 String msg = String.format("%s: Unrecognized break type %s", fMonkeyImpl.fRuleFileName, value); 358 System.err.println(msg); 359 throw new IllegalArgumentException(msg); 360 } 361 return true; 362 } 363 return false; 364 } 365 366 createICUBreakIterator()367 RuleBasedBreakIterator createICUBreakIterator() { 368 BreakIterator bi; 369 switch(fType) { 370 case BreakIterator.KIND_CHARACTER: 371 bi = (BreakIterator.getCharacterInstance(fLocale)); 372 break; 373 case BreakIterator.KIND_WORD: 374 bi = (BreakIterator.getWordInstance(fLocale)); 375 break; 376 case BreakIterator.KIND_LINE: 377 bi = (BreakIterator.getLineInstance(fLocale)); 378 break; 379 case BreakIterator.KIND_SENTENCE: 380 bi = (BreakIterator.getSentenceInstance(fLocale)); 381 break; 382 default: 383 String msg = String.format("%s: Bad break iterator type of %d", fMonkeyImpl.fRuleFileName, fType); 384 System.err.println(msg); 385 throw new IllegalArgumentException(msg); 386 } 387 return (RuleBasedBreakIterator)bi; 388 389 }; 390 391 392 compileRules(String rules)393 void compileRules(String rules) { 394 int lineNumber = 0; 395 for (String line: rules.split("\\r?\\n")) { 396 ++lineNumber; 397 // Strip comment lines. 398 fCommentsMatcher.reset(line); 399 line = fCommentsMatcher.replaceFirst(""); 400 if (line.isEmpty()) { 401 continue; 402 } 403 404 // Recognize character class definition and keyword lines 405 fClassDefMatcher.reset(line); 406 if (fClassDefMatcher.matches()) { 407 String className = fClassDefMatcher.group(/*"ClassName"*/ 1); 408 String classDef = fClassDefMatcher.group(/*"ClassDef"*/ 2); 409 if (fMonkeyImpl.fDumpExpansions) { 410 System.out.printf("scanned class: %s = %s\n", className, classDef); 411 } 412 if (setKeywordParameter(className, classDef)) { 413 // The scanned item was "type = ..." or "locale = ...", etc. 414 // which are not actual character classes. 415 continue; 416 } 417 addCharClass(className, classDef); 418 continue; 419 } 420 421 // Recognize rule lines. 422 fRuleDefMatcher.reset(line); 423 if (fRuleDefMatcher.matches()) { 424 String ruleName = fRuleDefMatcher.group(/*"RuleName"*/ 1); 425 String ruleDef = fRuleDefMatcher.group(/*"RuleDef"*/ 2); 426 if (fMonkeyImpl.fDumpExpansions) { 427 System.out.printf("scanned rule: %s : %s\n", ruleName, ruleDef); 428 } 429 addRule(ruleName, ruleDef); 430 continue; 431 } 432 433 String msg = String.format("Unrecognized line in rule file %s:%d \"%s\"", 434 fMonkeyImpl.fRuleFileName, lineNumber, line); 435 System.err.println(msg); 436 throw new IllegalArgumentException(msg); 437 } 438 439 // Build the vector of char classes, omitting the dictionary class if there is one. 440 // This will be used when constructing the random text to be tested. 441 442 // Also compute the "other" set, consisting of any characters not included in 443 // one or more of the user defined sets. 444 445 UnicodeSet otherSet = new UnicodeSet(0, 0x10ffff); 446 447 for (Map.Entry<String, CharClass> el: fCharClasses.entrySet()) { 448 String ccName = el.getKey(); 449 CharClass cclass = el.getValue(); 450 451 // System.out.printf(" Adding %s\n", ccName); 452 if (!ccName.equals(cclass.fName)) { 453 throw new IllegalArgumentException( 454 String.format("%s: internal error, set names (%s, %s) inconsistent.\n", 455 fMonkeyImpl.fRuleFileName, ccName, cclass.fName)); 456 } 457 otherSet.removeAll(cclass.fSet); 458 if (ccName.equals("dictionary")) { 459 fDictionarySet = cclass.fSet; 460 } else { 461 fCharClassList.add(cclass); 462 } 463 } 464 465 if (!otherSet.isEmpty()) { 466 // System.out.printf("have an other set.\n"); 467 CharClass cclass = addCharClass("__Others", otherSet.toPattern(true)); 468 fCharClassList.add(cclass); 469 } 470 471 }; 472 getClassForChar(int c)473 CharClass getClassForChar(int c) { 474 for (CharClass cc: fCharClassList) { 475 if (cc.fSet.contains(c)) { 476 return cc; 477 } 478 } 479 return null; 480 }; 481 482 483 RBBIMonkeyImpl fMonkeyImpl; // Pointer back to the owning MonkeyImpl instance. 484 List<BreakRule> fBreakRules; // Contents are of type (BreakRule *). 485 486 Map<String, CharClass> fCharClasses; // Key is the set name. 487 // // Value is the corresponding CharClass 488 List<CharClass> fCharClassList; // Char Classes, same contents as fCharClasses values, 489 490 UnicodeSet fDictionarySet; // Dictionary set, empty if none is defined. 491 ULocale fLocale; 492 int fType; // BreakItererator.KIND_WORD, etc. 493 494 495 Matcher fSetRefsMatcher; 496 Matcher fCommentsMatcher; 497 Matcher fClassDefMatcher; 498 Matcher fRuleDefMatcher; 499 Matcher fPropertyMatcher; 500 }; 501 502 503 504 505 // class MonkeyTestData represents a randomly synthesized test data string together 506 // with the expected break positions obtained by applying 507 // the test break rules. 508 509 static class MonkeyTestData{ 510 set(BreakRules rules, ICU_Rand rand)511 void set(BreakRules rules, ICU_Rand rand) { 512 int dataLength = 1000; // length of test data to generate, in code points. 513 514 // Fill the test string with random characters. 515 // First randomly pick a char class, then randomly pick a character from that class. 516 // Exclude any characters from the dictionary set. 517 518 // System.out.println("Populating Test Data"); 519 fRandomSeed = rand.getSeed(); // Save initial seed for use in error messages, 520 // allowing recreation of failing data. 521 fBkRules = rules; 522 StringBuilder newString = new StringBuilder(); 523 for (int n=0; n<dataLength;) { 524 int charClassIndex = rand.next() % rules.fCharClassList.size(); 525 CharClass cclass = rules.fCharClassList.get(charClassIndex); 526 if (cclass.fSet.size() == 0) { 527 // Some rules or tailorings do end up with empty char classes. 528 continue; 529 } 530 int charIndex = rand.next() % cclass.fSet.size(); 531 int c = cclass.fSet.charAt(charIndex); 532 if (/*Character.isBmpCodePoint(c)*/ c<=0x0ffff && Character.isLowSurrogate((char)c) && 533 newString.length() > 0 && Character.isHighSurrogate(newString.charAt(newString.length()-1))) { 534 // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control. 535 // Don't let random unpaired surrogates combine in the test data because they might 536 // produce an unwanted dictionary character. 537 continue; 538 } 539 540 if (!rules.fDictionarySet.contains(c)) { 541 newString.appendCodePoint(c); 542 ++n; 543 } 544 } 545 fString = newString.toString(); 546 547 // Init the expectedBreaks, actualBreaks and ruleForPosition. 548 // Expected and Actual breaks are one longer than the input string; a true value 549 // will indicate a boundary preceding that position. 550 551 fActualBreaks = new boolean[fString.length()+1]; 552 fExpectedBreaks = new boolean[fString.length()+1]; 553 fRuleForPosition = new int[fString.length()+1]; 554 f2ndRuleForPos = new int[fString.length()+1]; 555 556 // Apply reference rules to find the expected breaks. 557 558 fExpectedBreaks[0] = true; // Force an expected break before the start of the text. 559 // ICU always reports a break there. 560 // The reference rules do not have a means to do so. 561 int strIdx = 0; 562 boolean initialMatch = true; // True at start of text, and immediately after each boundary, 563 // // for control over rule chaining. 564 565 while (strIdx < fString.length()) { 566 BreakRule matchingRule = null; 567 boolean hasBreak = false; 568 int ruleNum = 0; 569 int matchStart = 0; 570 int matchEnd = 0; 571 for (ruleNum=0; ruleNum<rules.fBreakRules.size(); ruleNum++) { 572 BreakRule rule = rules.fBreakRules.get(ruleNum); 573 if (rule.fInitialMatchOnly && !initialMatch) { 574 // Skip checking this '^' rule. (No rule chaining) 575 continue; 576 } 577 rule.fRuleMatcher.reset(fString.substring(strIdx)); 578 if (rule.fRuleMatcher.lookingAt()) { 579 // A candidate rule match, check further to see if we take it or continue to check other rules. 580 // Matches of zero or one code point count only if they also specify a break. 581 matchStart = strIdx; 582 matchEnd = strIdx + rule.fRuleMatcher.end(); 583 hasBreak = BreakGroupStart(rule.fRuleMatcher) >= 0; 584 if (hasBreak || 585 (matchStart < fString.length() && fString.offsetByCodePoints(matchStart, 1) < matchEnd)) { 586 matchingRule = rule; 587 break; 588 } 589 } 590 } 591 if (matchingRule == null) { 592 // No reference rule matched. This is an error in the rules that should never happen. 593 String msg = String.format("%s: No reference rules matched at position %d. ", 594 rules.fMonkeyImpl.fRuleFileName, strIdx); 595 System.err.println(msg); 596 dump(strIdx); 597 throw new IllegalArgumentException(msg); 598 } 599 if (matchingRule.fRuleMatcher.group().length() == 0) { 600 // Zero length rule match. This is also an error in the rule expressions. 601 String msg = String.format("%s:%s: Zero length rule match at %d.", 602 rules.fMonkeyImpl.fRuleFileName, matchingRule.fName, strIdx); 603 System.err.println(msg); 604 dump(strIdx); 605 throw new IllegalArgumentException(msg); 606 } 607 608 // Record which rule matched over the length of the match. 609 for (int i = matchStart; i < matchEnd; i++) { 610 if (fRuleForPosition[i] == 0) { 611 fRuleForPosition[i] = ruleNum; 612 } else { 613 f2ndRuleForPos[i] = ruleNum; 614 } 615 } 616 617 // Break positions appear in rules as a matching named capture of zero length at the break position, 618 // the adjusted pattern contains (?<BreakPosition>) 619 if (hasBreak) { 620 int breakPos = strIdx + BreakGroupStart(matchingRule.fRuleMatcher); 621 fExpectedBreaks[breakPos] = true; 622 // System.out.printf("recording break at %d\n", breakPos); 623 // For the next iteration, pick up applying rules immediately after the break, 624 // which may differ from end of the match. The matching rule may have included 625 // context following the boundary that needs to be looked at again. 626 strIdx = breakPos; 627 initialMatch = true; 628 } else { 629 // Original rule didn't specify a break. 630 // Continue applying rules starting on the last code point of this match. 631 int updatedStrIdx = fString.offsetByCodePoints(matchEnd, -1); 632 if (updatedStrIdx == matchStart) { 633 // Match was only one code point, no progress if we continue. 634 // Shouldn't get here, case is filtered out at top of loop. 635 throw new IllegalArgumentException(String.format("%s: Rule %s internal error.", 636 rules.fMonkeyImpl.fRuleFileName, matchingRule.fName)); 637 } 638 strIdx = updatedStrIdx; 639 initialMatch = false; 640 } 641 } 642 }; 643 644 // Helper function to find the starting index of a match of the "BreakPosition" named capture group. 645 // @param m: a Java regex Matcher that has completed a matching operation. 646 // @return m.start("BreakPosition), 647 // or -1 if there is no such group, or the group did not participate in the match. 648 // 649 // TODO: this becomes m.start("BreakPosition") with Java 8. 650 // In the mean time, assume that the only zero-length capturing group in 651 // a reference rule expression is the "BreakPosition" that corresponds to a "÷". 652 BreakGroupStart(Matcher m)653 static int BreakGroupStart(Matcher m) { 654 for (int groupNum=1; groupNum <= m.groupCount(); ++groupNum) { 655 String group = m.group(groupNum); 656 if (group == null) { 657 continue; 658 } 659 if (group.equals("")) { 660 // assert(m.end(groupNum) == m.end("BreakPosition")); 661 return m.start(groupNum); 662 } 663 } 664 return -1; 665 } 666 dump(int around)667 void dump(int around) { 668 System.out.print("\n" 669 + " char break Rule Character\n" 670 + " pos code class R I name name\n" 671 + "---------------------------------------------------------------------------------------------\n"); 672 673 int start; 674 int end; 675 676 if (around == -1) { 677 start = 0; 678 end = fString.length(); 679 } else { 680 // Display context around a failure. 681 try { 682 start = fString.offsetByCodePoints(around, -30); 683 } catch (Exception e) { 684 start = 0; 685 } 686 try { 687 end = fString.offsetByCodePoints(around, +30); 688 } catch (Exception e) { 689 end = fString.length(); 690 } 691 } 692 693 for (int charIdx = start; charIdx < end; charIdx=fString.offsetByCodePoints(charIdx, 1)) { 694 int c = fString.codePointAt(charIdx); 695 CharClass cc = fBkRules.getClassForChar(c); 696 697 BreakRule rule = fBkRules.fBreakRules.get(fRuleForPosition[charIdx]); 698 String secondRuleName = ""; 699 if (f2ndRuleForPos[charIdx] > 0) { 700 secondRuleName = fBkRules.fBreakRules.get(f2ndRuleForPos[charIdx]).fName; 701 } 702 String cName = UCharacterName.INSTANCE.getName(c, UCharacterNameChoice.EXTENDED_CHAR_NAME); 703 704 System.out.printf(" %4d %6x %-20s %c %c %-10s %-10s %s\n", 705 charIdx, c, cc.fName, 706 fExpectedBreaks[charIdx] ? '*' : '.', 707 fActualBreaks[charIdx] ? '*' : '.', 708 rule.fName, secondRuleName, cName 709 ); 710 } 711 712 }; 713 clearActualBreaks()714 void clearActualBreaks() { 715 Arrays.fill(fActualBreaks, false); 716 } 717 718 719 int fRandomSeed; // The initial seed value from the random number generator. 720 BreakRules fBkRules; // The break rules used to generate this data. 721 String fString; // The text. 722 boolean fExpectedBreaks[]; // Breaks as found by the reference rules. 723 // Parallel to fString. true if break preceding. 724 boolean fActualBreaks[]; // Breaks as found by ICU break iterator. 725 int fRuleForPosition[]; // Index into BreakRules.fBreakRules of rule that applied at each position. 726 // Also parallel to fString. 727 int f2ndRuleForPos[]; // As above. A 2nd rule applies when the preceding rule 728 // didn't cause a break, and a subsequent rule match starts 729 // on the last code point of the preceding match. 730 731 } 732 733 734 // class RBBIMonkeyImpl holds (some indirectly) everything associated with running a monkey 735 // test for one set of break rules. 736 // 737 738 static class RBBIMonkeyImpl extends Thread { 739 setup(String ruleFile)740 void setup(String ruleFile) { 741 fRuleFileName = ruleFile; 742 openBreakRules(ruleFile); 743 fRuleSet = new BreakRules(this); 744 fRuleSet.compileRules(fRuleCharBuffer); 745 fBI = fRuleSet.createICUBreakIterator(); 746 fTestData = new MonkeyTestData(); 747 }; 748 openBreakRules(String fileName)749 void openBreakRules(String fileName) { 750 StringBuilder testFileBuf = new StringBuilder(); 751 InputStream is = null; 752 String filePath = "break_rules/" + fileName; 753 try { 754 is = RBBIMonkeyImpl.class.getResourceAsStream(filePath); 755 if (is == null) { 756 errln("Could not open test data file " + fileName); 757 return; 758 } 759 InputStreamReader isr = new InputStreamReader(is, "UTF-8"); 760 try { 761 int c; 762 int count = 0; 763 for (;;) { 764 c = isr.read(); 765 if (c < 0) { 766 break; 767 } 768 count++; 769 if (c == 0xFEFF && count == 1) { 770 // BOM in the test data file. Discard it. 771 continue; 772 } 773 testFileBuf.appendCodePoint(c); 774 } 775 } finally { 776 isr.close(); 777 } 778 } catch (IOException e) { 779 try { 780 is.close(); 781 } catch (IOException ignored) { 782 } 783 errln(e.toString()); 784 } 785 fRuleCharBuffer = testFileBuf.toString(); /* the file as a String */ 786 } 787 788 class MonkeyException extends RuntimeException { 789 private static final long serialVersionUID = 1L; 790 public int fPosition; // Position of the failure in the test data. MonkeyException(String description, int pos)791 MonkeyException(String description, int pos) { 792 super(description); 793 fPosition = pos; 794 } 795 } 796 797 @Override run()798 public void run() { 799 int errorCount = 0; 800 if (fBI == null) { 801 fErrorMsgs.append("Unable to run test because fBI is null.\n"); 802 return; 803 } 804 for (long loopCount = 0; fLoopCount < 0 || loopCount < fLoopCount; loopCount++) { 805 try { 806 fTestData.set(fRuleSet, fRandomGenerator); 807 // fTestData.dump(-1); 808 testForwards(); 809 testPrevious(); 810 testFollowing(); 811 testPreceding(); 812 testIsBoundary(); 813 } catch (MonkeyException e) { 814 String formattedMsg = String.format( 815 "%s at index %d. VM Arguments to reproduce: -Drules=%s -Dseed=%d -Dloop=1 -Dverbose=1 \"\n", 816 e.getMessage(), e.fPosition, fRuleFileName, fTestData.fRandomSeed); 817 System.err.print(formattedMsg); 818 if (fVerbose) { 819 fTestData.dump(e.fPosition); 820 } 821 fErrorMsgs.append(formattedMsg); 822 if (++errorCount > 10) { 823 return; 824 } 825 } 826 if (fLoopCount < 0 && loopCount % 100 == 0) { 827 System.err.print("."); 828 } 829 } 830 } 831 832 enum CheckDirection { 833 FORWARD, 834 REVERSE 835 }; 836 testForwards()837 void testForwards() { 838 fTestData.clearActualBreaks(); 839 fBI.setText(fTestData.fString); 840 int previousBreak = -2; 841 for (int bk=fBI.first(); bk != BreakIterator.DONE; bk=fBI.next()) { 842 if (bk <= previousBreak) { 843 throw new MonkeyException("Break Iterator Stall", bk); 844 } 845 if (bk < 0 || bk > fTestData.fString.length()) { 846 throw new MonkeyException("Boundary out of bounds", bk); 847 } 848 fTestData.fActualBreaks[bk] = true; 849 } 850 checkResults("testForwards", CheckDirection.FORWARD); 851 }; 852 853 testFollowing()854 void testFollowing() { 855 fTestData.clearActualBreaks(); 856 fBI.setText(fTestData.fString); 857 int nextBreak = -1; 858 for (int i=-1 ; i<fTestData.fString.length(); ++i) { 859 int bk = fBI.following(i); 860 if (bk == BreakIterator.DONE && i == fTestData.fString.length()) { 861 continue; 862 } 863 if (bk == nextBreak && bk > i) { 864 // i is in the gap between two breaks. 865 continue; 866 } 867 if (i == nextBreak && bk > nextBreak) { 868 fTestData.fActualBreaks[bk] = true; 869 nextBreak = bk; 870 continue; 871 } 872 throw new MonkeyException("following(i)", i); 873 } 874 checkResults("testFollowing", CheckDirection.FORWARD); 875 }; 876 877 testPrevious()878 void testPrevious() { 879 fTestData.clearActualBreaks(); 880 fBI.setText(fTestData.fString); 881 int previousBreak = Integer.MAX_VALUE; 882 for (int bk=fBI.last(); bk != BreakIterator.DONE; bk=fBI.previous()) { 883 if (bk >= previousBreak) { 884 throw new MonkeyException("Break Iterator Stall", bk); 885 } 886 if (bk < 0 || bk > fTestData.fString.length()) { 887 throw new MonkeyException("Boundary out of bounds", bk); 888 } 889 fTestData.fActualBreaks[bk] = true; 890 } 891 checkResults("testPrevius", CheckDirection.REVERSE); 892 }; 893 894 895 /** 896 * Given an index into a string, if it refers to the trail surrogate of a surrogate pair, 897 * adjust it to point to the lead surrogate, which is the start of the code point. 898 * @param s the String. 899 * @param i the initial index 900 * @return the adjusted index 901 */ getChar32Start(String s, int i)902 private int getChar32Start(String s, int i) { 903 if (i > 0 && i < s.length() && 904 Character.isLowSurrogate(s.charAt(i)) && Character.isHighSurrogate(s.charAt(i-1))) { 905 --i; 906 } 907 return i; 908 } 909 910 testPreceding()911 void testPreceding() { 912 fTestData.clearActualBreaks(); 913 fBI.setText(fTestData.fString); 914 int nextBreak = fTestData.fString.length()+1; 915 for (int i=fTestData.fString.length()+1 ; i>=0; --i) { 916 int bk = fBI.preceding(i); 917 // System.err.printf("testPreceding() i:%d bk:%d nextBreak:%d\n", i, bk, nextBreak); 918 if (bk == BreakIterator.DONE && i == 0) { 919 continue; 920 } 921 if (bk == nextBreak && bk < i) { 922 // i is in the gap between two breaks. 923 continue; 924 } 925 if (i<fTestData.fString.length() && getChar32Start(fTestData.fString, i) < i) { 926 // i indexes to a trailing surrogate. 927 // Break Iterators treat an index to either half as referring to the supplemental code point, 928 // with preceding going to some preceding code point. 929 if (fBI.preceding(i) != fBI.preceding(getChar32Start(fTestData.fString, i))) { 930 throw new MonkeyException("preceding of trailing surrogate error", i); 931 } 932 continue; 933 } 934 if (i == nextBreak && bk < nextBreak) { 935 fTestData.fActualBreaks[bk] = true; 936 nextBreak = bk; 937 continue; 938 } 939 throw new MonkeyException("preceding(i)", i); 940 } 941 checkResults("testPreceding", CheckDirection.REVERSE); 942 943 }; 944 945 testIsBoundary()946 void testIsBoundary() { 947 fTestData.clearActualBreaks(); 948 fBI.setText(fTestData.fString); 949 for (int i=fTestData.fString.length(); i>=0; --i) { 950 if (fBI.isBoundary(i)) { 951 fTestData.fActualBreaks[i] = true; 952 } 953 } 954 checkResults("testForwards", CheckDirection.FORWARD); 955 }; 956 957 checkResults(String msg, CheckDirection direction)958 void checkResults(String msg, CheckDirection direction) { 959 if (direction == CheckDirection.FORWARD) { 960 for (int i=0; i<=fTestData.fString.length(); ++i) { 961 if (fTestData.fExpectedBreaks[i] != fTestData.fActualBreaks[i]) { 962 throw new MonkeyException(msg, i); 963 } 964 } 965 } else { 966 for (int i=fTestData.fString.length(); i>=0; i--) { 967 if (fTestData.fExpectedBreaks[i] != fTestData.fActualBreaks[i]) { 968 throw new MonkeyException(msg, i); 969 } 970 } 971 } 972 973 }; 974 975 String fRuleCharBuffer; // source file contents of the reference rules. 976 BreakRules fRuleSet; 977 RuleBasedBreakIterator fBI; 978 MonkeyTestData fTestData; 979 ICU_Rand fRandomGenerator; 980 String fRuleFileName; 981 boolean fVerbose; // True to do long dump of failing data. 982 int fLoopCount; 983 int fErrorCount; 984 985 boolean fDumpExpansions; // Debug flag to output expanded form of rules and sets. 986 StringBuilder fErrorMsgs = new StringBuilder(); 987 988 } 989 990 // Test parameters, specified via Java properties. 991 // 992 // rules=file_name Name of file containing the reference rules. 993 // seed=nnnnn Random number starting seed. 994 // Setting the seed allows errors to be reproduced. 995 // loop=nnn Looping count. Controls running time. 996 // -1: run forever. 997 // 0 or greater: run length. 998 // expansions debug option, show expansions of rules and sets. 999 // verbose Display details of the failure. 1000 // 1001 // Parameters are passed to the JVM on the command line, or 1002 // via the Eclipse Run Configuration settings, arguments tab, VM parameters. 1003 // For example, 1004 // -ea -Drules=line.txt -Dloop=-1 1005 // 1006 @Test TestMonkey()1007 public void TestMonkey() { 1008 String tests[] = {"grapheme.txt", "word.txt", "line.txt", "line_cj.txt", "sentence.txt", "line_normal.txt", 1009 "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt" 1010 }; 1011 1012 String testNameFromParams = getProperty("rules"); 1013 1014 if (testNameFromParams != null) { 1015 tests = new String[] {testNameFromParams}; 1016 } 1017 1018 int loopCount = getIntProperty("loop", isQuick() ? 100 : 5000); 1019 boolean dumpExpansions = getBooleanProperty("expansions", false); 1020 boolean verbose = getBooleanProperty("verbose", false); 1021 int seed = getIntProperty("seed", 1); 1022 1023 List<RBBIMonkeyImpl> startedTests = new ArrayList<>(); 1024 1025 // Monkey testing is multi-threaded. 1026 // Each set of break rules to be tested is run in a separate thread. 1027 // Each thread/set of rules gets a separate RBBIMonkeyImpl object. 1028 1029 for (String testName: tests) { 1030 logln(String.format("beginning testing of %s", testName)); 1031 1032 RBBIMonkeyImpl test = new RBBIMonkeyImpl(); 1033 1034 test.fDumpExpansions = dumpExpansions; 1035 test.fVerbose = verbose; 1036 test.fRandomGenerator = new ICU_Rand(seed); 1037 test.fLoopCount = loopCount; 1038 test.setup(testName); 1039 1040 test.start(); 1041 startedTests.add(test); 1042 } 1043 1044 StringBuilder errors = new StringBuilder(); 1045 for (RBBIMonkeyImpl test: startedTests) { 1046 try { 1047 test.join(); 1048 errors.append(test.fErrorMsgs); 1049 } catch (InterruptedException e) { 1050 errors.append(e + "\n"); 1051 } 1052 } 1053 String errorMsgs = errors.toString(); 1054 assertEquals(errorMsgs, "", errorMsgs); 1055 1056 } 1057 1058 1059 } 1060