1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 package com.ibm.icu.dev.test.rbbi; 5 6 import java.io.IOException; 7 import java.io.InputStream; 8 import java.io.InputStreamReader; 9 import java.util.ArrayList; 10 import java.util.Arrays; 11 import java.util.HashMap; 12 import java.util.List; 13 import java.util.Map; 14 import java.util.regex.Matcher; 15 import java.util.regex.Pattern; 16 import java.util.regex.PatternSyntaxException; 17 18 import org.junit.Test; 19 import org.junit.runner.RunWith; 20 import org.junit.runners.JUnit4; 21 22 import com.ibm.icu.dev.test.TestFmwk; 23 import com.ibm.icu.impl.UCharacterName; 24 import com.ibm.icu.impl.UCharacterNameChoice; 25 import com.ibm.icu.text.BreakIterator; 26 import com.ibm.icu.text.RuleBasedBreakIterator; 27 import com.ibm.icu.text.UnicodeSet; 28 import com.ibm.icu.util.ULocale; 29 30 /** 31 * RBBI Monkey Test. Ported from ICU4C test/intltest/rbbimonkeytest.cpp. 32 * This is the newer, data driven monkey test. It is completely separate from the 33 * older class RBBITestMonkey. 34 */ 35 36 @RunWith(JUnit4.class) 37 public class RBBIMonkeyTest extends TestFmwk { 38 39 40 // class CharClass Represents a single character class from the source break rules. 41 // Inherits from UObject because instances are adopted by UHashtable, which ultimately 42 // deletes them using hash's object deleter function. 43 44 static class CharClass { 45 String fName; 46 String fOriginalDef; // set definition as it appeared in user supplied rules. 47 String fExpandedDef; // set definition with any embedded named sets replaced by their defs, recursively. 48 UnicodeSet fSet; CharClass(String name, String originalDef, String expandedDef, UnicodeSet set)49 CharClass(String name, String originalDef, String expandedDef, UnicodeSet set) { 50 fName = name; 51 fOriginalDef = originalDef; 52 fExpandedDef = expandedDef; 53 fSet = set; 54 }; 55 } 56 57 58 // class BreakRule Struct-like class represents a single rule from a set of break rules. 59 // Each rule has the set definitions expanded, and 60 // is compiled to a regular expression. 61 62 static class BreakRule { 63 String fName; // Name of the rule. 64 String fRule; // Rule expression, excluding the name, as written in user source. 65 String fExpandedRule; // Rule expression after expanding the set definitions. 66 Matcher fRuleMatcher; // Regular expression that matches the rule. 67 boolean fInitialMatchOnly = false; // True if rule begins with '^', meaning no chaining. 68 }; 69 70 71 // class BreakRules represents a complete set of break rules, possibly tailored, 72 // compiled from testdata break rules. 73 74 static class BreakRules { BreakRules(RBBIMonkeyImpl monkeyImpl)75 BreakRules(RBBIMonkeyImpl monkeyImpl) { 76 fMonkeyImpl = monkeyImpl; 77 fBreakRules = new ArrayList<>(); 78 fType = BreakIterator.KIND_TITLE; 79 fCharClasses = new HashMap<>(); 80 fCharClassList = new ArrayList<>(); 81 fDictionarySet = new UnicodeSet(); 82 83 // Match an alpha-numeric identifier in a rule. Will be a set name. 84 // Use negative look-behind to exclude non-identifiers, mostly property names or values. 85 fSetRefsMatcher = Pattern.compile( 86 "(?<!\\{[ \\t]{0,4})" + 87 "(?<!=[ \\t]{0,4})" + 88 "(?<!\\[:[ \\t]{0,4})" + 89 "(?<!\\\\)" + 90 "(?<![A-Za-z0-9_])" + 91 "([A-Za-z_][A-Za-z0-9_]*)"). // The char class name 92 matcher(""); 93 94 // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules. 95 fCommentsMatcher = Pattern.compile("" + 96 "(^|(?<=;))" + // Start either at start of line, or just after a ';' (look-behind for ';') 97 "[ \\t]*+" + // Match white space. 98 "(#.*)?+" + // Optional # plus whatever follows 99 "$"). // new-line at end of line. 100 matcher(""); 101 102 // Match (initial parse) of a character class definition line. 103 fClassDefMatcher = Pattern.compile("" + 104 "[ \\t]*" + // leading white space 105 "([A-Za-z_][A-Za-z0-9_]*)" + // The char class name 106 "[ \\t]*=[ \\t]*" + // = 107 "(.*?)" + // The char class UnicodeSet expression 108 "[ \\t]*;$"). // ; <end of line> 109 matcher(""); 110 111 // Match (initial parse) of a break rule line. 112 fRuleDefMatcher = Pattern.compile("" + 113 "[ \\t]*" + // leading white space 114 "([A-Za-z_][A-Za-z0-9_.]*)" + // The rule name 115 "[ \\t]*:[ \\t]*" + // : 116 "(.*?)" + // The rule definition 117 "[ \\t]*;$"). // ; <end of line> 118 matcher(""); 119 120 // Match a property expression, either [:xxx:] or \p{...} 121 fPropertyMatcher = Pattern.compile("" + 122 "\\[:.*?:]|\\\\(?:p|P)\\{.*?\\}"). 123 matcher(""); 124 125 126 } 127 128 /** 129 * Create the expanded definition for this char class, 130 * replacing any set references with the corresponding definition. 131 */ addCharClass(String name, String definition)132 CharClass addCharClass(String name, String definition) { 133 StringBuffer expandedDef = new StringBuffer(); 134 fSetRefsMatcher.reset(definition); 135 while (fSetRefsMatcher.find()) { 136 String sname = fSetRefsMatcher.group(/*"ClassName"*/ 1); 137 CharClass snameClass = fCharClasses.get(sname); 138 String expansionForName = snameClass != null ? snameClass.fExpandedDef : sname; 139 140 fSetRefsMatcher.appendReplacement(expandedDef, ""); 141 expandedDef.append(expansionForName); 142 } 143 fSetRefsMatcher.appendTail(expandedDef); 144 String expandedDefString = expandedDef.toString(); 145 146 if (fMonkeyImpl.fDumpExpansions) { 147 System.out.printf("addCharClass(\"%s\"\n", name); 148 System.out.printf(" %s\n", definition); 149 System.out.printf("expandedDef: %s\n", expandedDefString); 150 } 151 152 // Verify that the expanded set definition is valid. 153 154 UnicodeSet s; 155 try { 156 s = new UnicodeSet(expandedDefString, UnicodeSet.IGNORE_SPACE); 157 } catch (java.lang.IllegalArgumentException e) { 158 System.err.printf("%s: error %s creating UnicodeSet %s", fMonkeyImpl.fRuleFileName, e.toString(), name); 159 throw e; 160 } 161 162 // Get an expanded equivalent pattern from the UnicodeSet. 163 // This removes set difference operators, which would fail if passed through to Java regex. 164 165 StringBuffer expandedPattern = new StringBuffer(); 166 s._generatePattern(expandedPattern, true); 167 expandedDefString = expandedPattern.toString(); 168 if (fMonkeyImpl.fDumpExpansions) { 169 System.out.printf("expandedDef2: %s\n", expandedDefString); 170 } 171 172 CharClass cclass = new CharClass(name, definition, expandedDefString, s); 173 CharClass previousClass = fCharClasses.put(name, cclass); 174 175 if (previousClass != null) { 176 // TODO: decide whether or not to allow redefinitions. 177 // Can be convenient in some cases. 178 // String msg = String.format("%s: Redefinition of character class %s\n", 179 // fMonkeyImpl.fRuleFileName, cclass.fName); 180 // System.err.println(msg); 181 // throw new IllegalArgumentException(msg); 182 } 183 return cclass; 184 185 }; 186 187 addRule(String name, String definition)188 void addRule(String name, String definition) { 189 BreakRule thisRule = new BreakRule(); 190 StringBuffer expandedDefsRule = new StringBuffer(); 191 thisRule.fName = name; 192 thisRule.fRule = definition; 193 194 // Expand the char class definitions within the rule. 195 fSetRefsMatcher.reset(definition); 196 while (fSetRefsMatcher.find()) { 197 String sname = fSetRefsMatcher.group(/*"ClassName"*/ 1); 198 CharClass nameClass = fCharClasses.get(sname); 199 if (nameClass == null) { 200 System.err.printf("char class \"%s\" unrecognized in rule \"%s\"\n", sname, definition); 201 } 202 String expansionForName = nameClass != null ? nameClass.fExpandedDef : sname; 203 fSetRefsMatcher.appendReplacement(expandedDefsRule, ""); 204 expandedDefsRule.append(expansionForName); 205 } 206 fSetRefsMatcher.appendTail(expandedDefsRule); 207 208 // Replace any property expressions, \p{...} or [:...:] with an equivalent expansion, 209 // obtained from ICU UnicodeSet. Need to do this substitution because Java regex 210 // does not recognize all properties, and because Java's definitions are likely 211 // older than ICU's. 212 213 StringBuffer expandedRule = new StringBuffer(); 214 fPropertyMatcher.reset(expandedDefsRule); 215 while (fPropertyMatcher.find()) { 216 String prop = fPropertyMatcher.group(); 217 UnicodeSet propSet = new UnicodeSet("[" + prop + "]"); 218 StringBuffer propExpansion = new StringBuffer(); 219 propSet._generatePattern(propExpansion, true); 220 fPropertyMatcher.appendReplacement(expandedRule, propExpansion.toString()); 221 } 222 fPropertyMatcher.appendTail(expandedRule); 223 224 // If rule begins with a '^' rule chaining is disallowed. 225 // Strip off the '^' from the rule expression, and set the flag. 226 if (expandedRule.charAt(0) == '^') { 227 thisRule.fInitialMatchOnly = true; 228 expandedRule.deleteCharAt(0); 229 expandedRule = new StringBuffer(expandedRule.toString().trim()); 230 } 231 232 // Replace any [^negated sets] with equivalent flattened sets generated by 233 // ICU UnicodeSet. [^ ...] in Java Regex character classes does not apply 234 // to any nested classes. Variable substitution in rules produces 235 // nested sets that [^negation] needs to apply to. 236 237 StringBuffer ruleWithFlattenedSets = new StringBuffer(); 238 int idx = 0; 239 while (idx<expandedRule.length()) { 240 int setOpenPos = expandedRule.indexOf("[^", idx); 241 if (setOpenPos < 0) { 242 break; 243 } 244 if (setOpenPos > idx) { 245 // Move anything from the source rule preceding the [^ into the processed rule, unchanged. 246 ruleWithFlattenedSets.append(expandedRule.substring(idx, setOpenPos)); 247 } 248 int nestingLevel = 1; 249 boolean haveNesting = false; 250 int setClosePos; 251 for (setClosePos = setOpenPos + 2; nestingLevel > 0 && setClosePos<expandedRule.length(); ++setClosePos) { 252 char c = expandedRule.charAt(setClosePos); 253 if (c == '\\') { 254 ++setClosePos; 255 } else if (c == '[') { 256 ++nestingLevel; 257 haveNesting = true; 258 } else if (c == ']') { 259 --nestingLevel; 260 } 261 } 262 if (haveNesting && nestingLevel == 0) { 263 // Found one, a negated set that includes interior nested sets. 264 // Create an ICU UnicodeSet from the source pattern, and obtain an 265 // equivalent flattened pattern from that. 266 UnicodeSet uset = new UnicodeSet(expandedRule.substring(setOpenPos, setClosePos), true); 267 uset._generatePattern(ruleWithFlattenedSets, true); 268 } else { 269 // The [^ set definition did not include any nested sets. 270 // Copy the original definition without change. 271 // Java regular expressions will handle it without needing to recast it. 272 if (nestingLevel > 0) { 273 // Error case of an unclosed character class expression. 274 // Java regex will also eventually flag the error. 275 System.err.printf("No closing ] found in rule %s\n", name); 276 } 277 ruleWithFlattenedSets.append(expandedRule.substring(setOpenPos, setClosePos)); 278 } 279 idx = setClosePos; 280 } 281 282 if (idx < expandedRule.length()) { 283 ruleWithFlattenedSets.append(expandedRule.substring(idx, expandedRule.length())); 284 } 285 286 thisRule.fExpandedRule = ruleWithFlattenedSets.toString(); 287 288 // Replace the divide sign (\u00f7) with a regular expression named capture. 289 // When running the rules, a match that includes this group means we found a break position. 290 291 // thisRule.fExpandedRule = thisRule.fExpandedRule.replace("÷", "(?<BreakPosition>)"); 292 thisRule.fExpandedRule = thisRule.fExpandedRule.replace("÷", "()"); 293 if (thisRule.fExpandedRule.indexOf("÷") != -1) { 294 String msg = String.format("%s Rule %s contains multiple ÷ signs", fMonkeyImpl.fRuleFileName, name); 295 System.err.println(msg); 296 throw new IllegalArgumentException(msg); 297 } 298 299 // UAX break rule set definitions can be empty, just []. 300 // Regular expression set expressions don't accept this. Substitute with [a&&[^a]], which 301 // also matches nothing. 302 303 thisRule.fExpandedRule = thisRule.fExpandedRule.replace("[]", "[a&&[^a]]"); 304 305 // Change Unicode escape syntax for compatibility with Java regular expressions 306 // \udddd => \x{dddd} 307 // \U00hhhhhh => \x{hhhhhh} 308 309 thisRule.fExpandedRule = thisRule.fExpandedRule.replaceAll("\\\\u([0-9A-Fa-f]{4})", "\\\\x{$1}"); 310 thisRule.fExpandedRule = thisRule.fExpandedRule.replaceAll("\\\\U00([0-9A-Fa-f]{6})", "\\\\x{$1}"); 311 312 // Escape any literal '#' in the rule expression. Without escaping, these introduce a comment. 313 // UnicodeSet._generatePattern() inserts un-escaped "#"s 314 315 thisRule.fExpandedRule = thisRule.fExpandedRule.replace("#", "\\#"); 316 if (fMonkeyImpl.fDumpExpansions) { 317 System.out.printf("fExpandedRule: %s\n", thisRule.fExpandedRule); 318 } 319 320 // Compile a regular expression for this rule. 321 322 try { 323 thisRule.fRuleMatcher = Pattern.compile(thisRule.fExpandedRule, Pattern.COMMENTS | Pattern.DOTALL).matcher(""); 324 } catch (PatternSyntaxException e) { 325 System.err.printf("%s: Error creating regular expression for rule %s. Expansion is \n\"%s\"", 326 fMonkeyImpl.fRuleFileName, name, thisRule.fExpandedRule); 327 throw e; 328 } 329 330 // Put this new rule into the vector of all Rules. 331 332 fBreakRules.add(thisRule); 333 }; 334 335 @SuppressWarnings("unused") hexToCodePoint(String hex)336 private static String hexToCodePoint(String hex) { 337 int cp = Integer.parseInt(hex, 16); 338 return new StringBuilder().appendCodePoint(cp).toString(); 339 } 340 341 setKeywordParameter(String keyword, String value)342 boolean setKeywordParameter(String keyword, String value) { 343 if (keyword.equals("locale")) { 344 fLocale = new ULocale(value); 345 return true; 346 } 347 if (keyword.equals("type")) { 348 if (value.equals("grapheme")) { 349 fType = BreakIterator.KIND_CHARACTER; 350 } else if (value.equals("word")) { 351 fType = BreakIterator.KIND_WORD; 352 } else if (value.equals("line")) { 353 fType = BreakIterator.KIND_LINE; 354 } else if (value.equals("sentence")) { 355 fType = BreakIterator.KIND_SENTENCE; 356 } else { 357 String msg = String.format("%s: Unrecognized break type %s", fMonkeyImpl.fRuleFileName, value); 358 System.err.println(msg); 359 throw new IllegalArgumentException(msg); 360 } 361 return true; 362 } 363 return false; 364 } 365 366 createICUBreakIterator()367 RuleBasedBreakIterator createICUBreakIterator() { 368 BreakIterator bi; 369 switch(fType) { 370 case BreakIterator.KIND_CHARACTER: 371 bi = (BreakIterator.getCharacterInstance(fLocale)); 372 break; 373 case BreakIterator.KIND_WORD: 374 bi = (BreakIterator.getWordInstance(fLocale)); 375 break; 376 case BreakIterator.KIND_LINE: 377 bi = (BreakIterator.getLineInstance(fLocale)); 378 break; 379 case BreakIterator.KIND_SENTENCE: 380 bi = (BreakIterator.getSentenceInstance(fLocale)); 381 break; 382 default: 383 String msg = String.format("%s: Bad break iterator type of %d", fMonkeyImpl.fRuleFileName, fType); 384 System.err.println(msg); 385 throw new IllegalArgumentException(msg); 386 } 387 return (RuleBasedBreakIterator)bi; 388 389 }; 390 391 392 compileRules(String rules)393 void compileRules(String rules) { 394 int lineNumber = 0; 395 for (String line: rules.split("\\r?\\n")) { 396 ++lineNumber; 397 // Strip comment lines. 398 fCommentsMatcher.reset(line); 399 line = fCommentsMatcher.replaceFirst(""); 400 if (line.isEmpty()) { 401 continue; 402 } 403 404 // Recognize character class definition and keyword lines 405 fClassDefMatcher.reset(line); 406 if (fClassDefMatcher.matches()) { 407 String className = fClassDefMatcher.group(/*"ClassName"*/ 1); 408 String classDef = fClassDefMatcher.group(/*"ClassDef"*/ 2); 409 if (fMonkeyImpl.fDumpExpansions) { 410 System.out.printf("scanned class: %s = %s\n", className, classDef); 411 } 412 if (setKeywordParameter(className, classDef)) { 413 // The scanned item was "type = ..." or "locale = ...", etc. 414 // which are not actual character classes. 415 continue; 416 } 417 addCharClass(className, classDef); 418 continue; 419 } 420 421 // Recognize rule lines. 422 fRuleDefMatcher.reset(line); 423 if (fRuleDefMatcher.matches()) { 424 String ruleName = fRuleDefMatcher.group(/*"RuleName"*/ 1); 425 String ruleDef = fRuleDefMatcher.group(/*"RuleDef"*/ 2); 426 if (fMonkeyImpl.fDumpExpansions) { 427 System.out.printf("scanned rule: %s : %s\n", ruleName, ruleDef); 428 } 429 addRule(ruleName, ruleDef); 430 continue; 431 } 432 433 String msg = String.format("Unrecognized line in rule file %s:%d \"%s\"", 434 fMonkeyImpl.fRuleFileName, lineNumber, line); 435 System.err.println(msg); 436 throw new IllegalArgumentException(msg); 437 } 438 439 // Build the vector of char classes, omitting the dictionary class if there is one. 440 // This will be used when constructing the random text to be tested. 441 442 // Also compute the "other" set, consisting of any characters not included in 443 // one or more of the user defined sets. 444 445 UnicodeSet otherSet = new UnicodeSet(0, 0x10ffff); 446 447 for (Map.Entry<String, CharClass> el: fCharClasses.entrySet()) { 448 String ccName = el.getKey(); 449 CharClass cclass = el.getValue(); 450 451 // System.out.printf(" Adding %s\n", ccName); 452 if (!ccName.equals(cclass.fName)) { 453 throw new IllegalArgumentException( 454 String.format("%s: internal error, set names (%s, %s) inconsistent.\n", 455 fMonkeyImpl.fRuleFileName, ccName, cclass.fName)); 456 } 457 otherSet.removeAll(cclass.fSet); 458 if (ccName.equals("dictionary")) { 459 fDictionarySet = cclass.fSet; 460 } else { 461 fCharClassList.add(cclass); 462 } 463 } 464 465 if (!otherSet.isEmpty()) { 466 // System.out.printf("have an other set.\n"); 467 CharClass cclass = addCharClass("__Others", otherSet.toPattern(true)); 468 fCharClassList.add(cclass); 469 } 470 471 }; 472 getClassForChar(int c)473 CharClass getClassForChar(int c) { 474 for (CharClass cc: fCharClassList) { 475 if (cc.fSet.contains(c)) { 476 return cc; 477 } 478 } 479 return null; 480 }; 481 482 483 RBBIMonkeyImpl fMonkeyImpl; // Pointer back to the owning MonkeyImpl instance. 484 List<BreakRule> fBreakRules; // Contents are of type (BreakRule *). 485 486 Map<String, CharClass> fCharClasses; // Key is the set name. 487 // // Value is the corresponding CharClass 488 List<CharClass> fCharClassList; // Char Classes, same contents as fCharClasses values, 489 490 UnicodeSet fDictionarySet; // Dictionary set, empty if none is defined. 491 ULocale fLocale; 492 int fType; // BreakItererator.KIND_WORD, etc. 493 494 495 Matcher fSetRefsMatcher; 496 Matcher fCommentsMatcher; 497 Matcher fClassDefMatcher; 498 Matcher fRuleDefMatcher; 499 Matcher fPropertyMatcher; 500 }; 501 502 503 504 505 // class MonkeyTestData represents a randomly synthesized test data string together 506 // with the expected break positions obtained by applying 507 // the test break rules. 508 509 static class MonkeyTestData{ 510 set(BreakRules rules, ICU_Rand rand)511 void set(BreakRules rules, ICU_Rand rand) { 512 int dataLength = 1000; // length of test data to generate, in code points. 513 514 // Fill the test string with random characters. 515 // First randomly pick a char class, then randomly pick a character from that class. 516 // Exclude any characters from the dictionary set. 517 518 // System.out.println("Populating Test Data"); 519 fRandomSeed = rand.getSeed(); // Save initial seed for use in error messages, 520 // allowing recreation of failing data. 521 fBkRules = rules; 522 StringBuilder newString = new StringBuilder(); 523 for (int n=0; n<dataLength;) { 524 int charClassIndex = rand.next() % rules.fCharClassList.size(); 525 CharClass cclass = rules.fCharClassList.get(charClassIndex); 526 if (cclass.fSet.size() == 0) { 527 // Some rules or tailorings do end up with empty char classes. 528 continue; 529 } 530 int charIndex = rand.next() % cclass.fSet.size(); 531 int c = cclass.fSet.charAt(charIndex); 532 if (/*Character.isBmpCodePoint(c)*/ c<=0x0ffff && Character.isLowSurrogate((char)c) && 533 newString.length() > 0 && Character.isHighSurrogate(newString.charAt(newString.length()-1))) { 534 // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control. 535 // Don't let random unpaired surrogates combine in the test data because they might 536 // produce an unwanted dictionary character. 537 continue; 538 } 539 540 if (!rules.fDictionarySet.contains(c)) { 541 newString.appendCodePoint(c); 542 ++n; 543 } 544 } 545 fString = newString.toString(); 546 547 // Init the expectedBreaks, actualBreaks and ruleForPosition. 548 // Expected and Actual breaks are one longer than the input string; a true value 549 // will indicate a boundary preceding that position. 550 551 fActualBreaks = new boolean[fString.length()+1]; 552 fExpectedBreaks = new boolean[fString.length()+1]; 553 fRuleForPosition = new int[fString.length()+1]; 554 f2ndRuleForPos = new int[fString.length()+1]; 555 556 int expectedBreakCount = 0; 557 558 // Apply reference rules to find the expected breaks. 559 560 fExpectedBreaks[0] = true; // Force an expected break before the start of the text. 561 // ICU always reports a break there. 562 // The reference rules do not have a means to do so. 563 int strIdx = 0; 564 boolean initialMatch = true; // True at start of text, and immediately after each boundary, 565 // // for control over rule chaining. 566 567 while (strIdx < fString.length()) { 568 BreakRule matchingRule = null; 569 boolean hasBreak = false; 570 int ruleNum = 0; 571 int matchStart = 0; 572 int matchEnd = 0; 573 for (ruleNum=0; ruleNum<rules.fBreakRules.size(); ruleNum++) { 574 BreakRule rule = rules.fBreakRules.get(ruleNum); 575 if (rule.fInitialMatchOnly && !initialMatch) { 576 // Skip checking this '^' rule. (No rule chaining) 577 continue; 578 } 579 rule.fRuleMatcher.reset(fString.substring(strIdx)); 580 if (rule.fRuleMatcher.lookingAt()) { 581 // A candidate rule match, check further to see if we take it or continue to check other rules. 582 // Matches of zero or one code point count only if they also specify a break. 583 matchStart = strIdx; 584 matchEnd = strIdx + rule.fRuleMatcher.end(); 585 hasBreak = BreakGroupStart(rule.fRuleMatcher) >= 0; 586 if (hasBreak || 587 (matchStart < fString.length() && fString.offsetByCodePoints(matchStart, 1) < matchEnd)) { 588 matchingRule = rule; 589 break; 590 } 591 } 592 } 593 if (matchingRule == null) { 594 // No reference rule matched. This is an error in the rules that should never happen. 595 String msg = String.format("%s: No reference rules matched at position %d. ", 596 rules.fMonkeyImpl.fRuleFileName, strIdx); 597 System.err.println(msg); 598 dump(strIdx); 599 throw new IllegalArgumentException(msg); 600 } 601 if (matchingRule.fRuleMatcher.group().length() == 0) { 602 // Zero length rule match. This is also an error in the rule expressions. 603 String msg = String.format("%s:%s: Zero length rule match at %d.", 604 rules.fMonkeyImpl.fRuleFileName, matchingRule.fName, strIdx); 605 System.err.println(msg); 606 dump(strIdx); 607 throw new IllegalArgumentException(msg); 608 } 609 610 // Record which rule matched over the length of the match. 611 for (int i = matchStart; i < matchEnd; i++) { 612 if (fRuleForPosition[i] == 0) { 613 fRuleForPosition[i] = ruleNum; 614 } else { 615 f2ndRuleForPos[i] = ruleNum; 616 } 617 } 618 619 // Break positions appear in rules as a matching named capture of zero length at the break position, 620 // the adjusted pattern contains (?<BreakPosition>) 621 if (hasBreak) { 622 int breakPos = strIdx + BreakGroupStart(matchingRule.fRuleMatcher); 623 fExpectedBreaks[breakPos] = true; 624 expectedBreakCount++; 625 // System.out.printf("recording break at %d\n", breakPos); 626 // For the next iteration, pick up applying rules immediately after the break, 627 // which may differ from end of the match. The matching rule may have included 628 // context following the boundary that needs to be looked at again. 629 strIdx = breakPos; 630 initialMatch = true; 631 } else { 632 // Original rule didn't specify a break. 633 // Continue applying rules starting on the last code point of this match. 634 int updatedStrIdx = fString.offsetByCodePoints(matchEnd, -1); 635 if (updatedStrIdx == matchStart) { 636 // Match was only one code point, no progress if we continue. 637 // Shouldn't get here, case is filtered out at top of loop. 638 throw new IllegalArgumentException(String.format("%s: Rule %s internal error.", 639 rules.fMonkeyImpl.fRuleFileName, matchingRule.fName)); 640 } 641 strIdx = updatedStrIdx; 642 initialMatch = false; 643 } 644 } 645 if (expectedBreakCount >= fString.length()) { 646 throw new IllegalArgumentException(String.format("expectedBreakCount (%d) should be less than the test string length (%d).", 647 expectedBreakCount, fString.length())); 648 } 649 }; 650 651 // Helper function to find the starting index of a match of the "BreakPosition" named capture group. 652 // @param m: a Java regex Matcher that has completed a matching operation. 653 // @return m.start("BreakPosition), 654 // or -1 if there is no such group, or the group did not participate in the match. 655 // 656 // TODO: this becomes m.start("BreakPosition") with Java 8. 657 // In the mean time, assume that the only zero-length capturing group in 658 // a reference rule expression is the "BreakPosition" that corresponds to a "÷". 659 BreakGroupStart(Matcher m)660 static int BreakGroupStart(Matcher m) { 661 for (int groupNum=1; groupNum <= m.groupCount(); ++groupNum) { 662 String group = m.group(groupNum); 663 if (group == null) { 664 continue; 665 } 666 if (group.equals("")) { 667 // assert(m.end(groupNum) == m.end("BreakPosition")); 668 return m.start(groupNum); 669 } 670 } 671 return -1; 672 } 673 dump(int around)674 void dump(int around) { 675 System.out.print("\n" 676 + " char break Rule Character\n" 677 + " pos code class R I name name\n" 678 + "---------------------------------------------------------------------------------------------\n"); 679 680 int start; 681 int end; 682 683 if (around == -1) { 684 start = 0; 685 end = fString.length(); 686 } else { 687 // Display context around a failure. 688 try { 689 start = fString.offsetByCodePoints(around, -30); 690 } catch (Exception e) { 691 start = 0; 692 } 693 try { 694 end = fString.offsetByCodePoints(around, +30); 695 } catch (Exception e) { 696 end = fString.length(); 697 } 698 } 699 700 for (int charIdx = start; charIdx < end; charIdx=fString.offsetByCodePoints(charIdx, 1)) { 701 int c = fString.codePointAt(charIdx); 702 CharClass cc = fBkRules.getClassForChar(c); 703 704 BreakRule rule = fBkRules.fBreakRules.get(fRuleForPosition[charIdx]); 705 String secondRuleName = ""; 706 if (f2ndRuleForPos[charIdx] > 0) { 707 secondRuleName = fBkRules.fBreakRules.get(f2ndRuleForPos[charIdx]).fName; 708 } 709 String cName = UCharacterName.INSTANCE.getName(c, UCharacterNameChoice.EXTENDED_CHAR_NAME); 710 711 System.out.printf(" %4d %6x %-20s %c %c %-10s %-10s %s\n", 712 charIdx, c, cc.fName, 713 fExpectedBreaks[charIdx] ? '*' : '.', 714 fActualBreaks[charIdx] ? '*' : '.', 715 rule.fName, secondRuleName, cName 716 ); 717 } 718 719 }; 720 clearActualBreaks()721 void clearActualBreaks() { 722 Arrays.fill(fActualBreaks, false); 723 } 724 725 726 int fRandomSeed; // The initial seed value from the random number generator. 727 BreakRules fBkRules; // The break rules used to generate this data. 728 String fString; // The text. 729 boolean fExpectedBreaks[]; // Breaks as found by the reference rules. 730 // Parallel to fString. true if break preceding. 731 boolean fActualBreaks[]; // Breaks as found by ICU break iterator. 732 int fRuleForPosition[]; // Index into BreakRules.fBreakRules of rule that applied at each position. 733 // Also parallel to fString. 734 int f2ndRuleForPos[]; // As above. A 2nd rule applies when the preceding rule 735 // didn't cause a break, and a subsequent rule match starts 736 // on the last code point of the preceding match. 737 738 } 739 740 741 // class RBBIMonkeyImpl holds (some indirectly) everything associated with running a monkey 742 // test for one set of break rules. 743 // 744 745 static class RBBIMonkeyImpl extends Thread { 746 setup(String ruleFile)747 void setup(String ruleFile) { 748 fRuleFileName = ruleFile; 749 openBreakRules(ruleFile); 750 fRuleSet = new BreakRules(this); 751 fRuleSet.compileRules(fRuleCharBuffer); 752 fBI = fRuleSet.createICUBreakIterator(); 753 fTestData = new MonkeyTestData(); 754 }; 755 openBreakRules(String fileName)756 void openBreakRules(String fileName) { 757 StringBuilder testFileBuf = new StringBuilder(); 758 InputStream is = null; 759 String filePath = "break_rules/" + fileName; 760 try { 761 is = RBBIMonkeyImpl.class.getResourceAsStream(filePath); 762 if (is == null) { 763 errln("Could not open test data file " + fileName); 764 return; 765 } 766 InputStreamReader isr = new InputStreamReader(is, "UTF-8"); 767 try { 768 int c; 769 int count = 0; 770 for (;;) { 771 c = isr.read(); 772 if (c < 0) { 773 break; 774 } 775 count++; 776 if (c == 0xFEFF && count == 1) { 777 // BOM in the test data file. Discard it. 778 continue; 779 } 780 testFileBuf.appendCodePoint(c); 781 } 782 } finally { 783 isr.close(); 784 } 785 } catch (IOException e) { 786 try { 787 is.close(); 788 } catch (IOException ignored) { 789 } 790 errln(e.toString()); 791 } 792 fRuleCharBuffer = testFileBuf.toString(); /* the file as a String */ 793 } 794 795 class MonkeyException extends RuntimeException { 796 private static final long serialVersionUID = 1L; 797 public int fPosition; // Position of the failure in the test data. MonkeyException(String description, int pos)798 MonkeyException(String description, int pos) { 799 super(description); 800 fPosition = pos; 801 } 802 } 803 804 @Override run()805 public void run() { 806 int errorCount = 0; 807 if (fBI == null) { 808 fErrorMsgs.append("Unable to run test because fBI is null.\n"); 809 return; 810 } 811 for (long loopCount = 0; fLoopCount < 0 || loopCount < fLoopCount; loopCount++) { 812 try { 813 fTestData.set(fRuleSet, fRandomGenerator); 814 // fTestData.dump(-1); 815 testForwards(); 816 testPrevious(); 817 testFollowing(); 818 testPreceding(); 819 testIsBoundary(); 820 } catch (MonkeyException e) { 821 String formattedMsg = String.format( 822 "%s at index %d. VM Arguments to reproduce: -Drules=%s -Dseed=%d -Dloop=1 -Dverbose=1 \"\n", 823 e.getMessage(), e.fPosition, fRuleFileName, fTestData.fRandomSeed); 824 System.err.print(formattedMsg); 825 if (fVerbose) { 826 fTestData.dump(e.fPosition); 827 } 828 fErrorMsgs.append(formattedMsg); 829 if (++errorCount > 10) { 830 return; 831 } 832 } 833 if (fLoopCount < 0 && loopCount % 100 == 0) { 834 System.err.print("."); 835 } 836 } 837 } 838 839 enum CheckDirection { 840 FORWARD, 841 REVERSE 842 }; 843 testForwards()844 void testForwards() { 845 fTestData.clearActualBreaks(); 846 fBI.setText(fTestData.fString); 847 int previousBreak = -2; 848 for (int bk=fBI.first(); bk != BreakIterator.DONE; bk=fBI.next()) { 849 if (bk <= previousBreak) { 850 throw new MonkeyException("Break Iterator Stall", bk); 851 } 852 if (bk < 0 || bk > fTestData.fString.length()) { 853 throw new MonkeyException("Boundary out of bounds", bk); 854 } 855 fTestData.fActualBreaks[bk] = true; 856 } 857 checkResults("testForwards", CheckDirection.FORWARD); 858 }; 859 860 testFollowing()861 void testFollowing() { 862 fTestData.clearActualBreaks(); 863 fBI.setText(fTestData.fString); 864 int nextBreak = -1; 865 for (int i=-1 ; i<fTestData.fString.length(); ++i) { 866 int bk = fBI.following(i); 867 if (bk == BreakIterator.DONE && i == fTestData.fString.length()) { 868 continue; 869 } 870 if (bk == nextBreak && bk > i) { 871 // i is in the gap between two breaks. 872 continue; 873 } 874 if (i == nextBreak && bk > nextBreak) { 875 fTestData.fActualBreaks[bk] = true; 876 nextBreak = bk; 877 continue; 878 } 879 throw new MonkeyException("following(i)", i); 880 } 881 checkResults("testFollowing", CheckDirection.FORWARD); 882 }; 883 884 testPrevious()885 void testPrevious() { 886 fTestData.clearActualBreaks(); 887 fBI.setText(fTestData.fString); 888 int previousBreak = Integer.MAX_VALUE; 889 for (int bk=fBI.last(); bk != BreakIterator.DONE; bk=fBI.previous()) { 890 if (bk >= previousBreak) { 891 throw new MonkeyException("Break Iterator Stall", bk); 892 } 893 if (bk < 0 || bk > fTestData.fString.length()) { 894 throw new MonkeyException("Boundary out of bounds", bk); 895 } 896 fTestData.fActualBreaks[bk] = true; 897 } 898 checkResults("testPrevius", CheckDirection.REVERSE); 899 }; 900 901 902 /** 903 * Given an index into a string, if it refers to the trail surrogate of a surrogate pair, 904 * adjust it to point to the lead surrogate, which is the start of the code point. 905 * @param s the String. 906 * @param i the initial index 907 * @return the adjusted index 908 */ getChar32Start(String s, int i)909 private int getChar32Start(String s, int i) { 910 if (i > 0 && i < s.length() && 911 Character.isLowSurrogate(s.charAt(i)) && Character.isHighSurrogate(s.charAt(i-1))) { 912 --i; 913 } 914 return i; 915 } 916 917 testPreceding()918 void testPreceding() { 919 fTestData.clearActualBreaks(); 920 fBI.setText(fTestData.fString); 921 int nextBreak = fTestData.fString.length()+1; 922 for (int i=fTestData.fString.length()+1 ; i>=0; --i) { 923 int bk = fBI.preceding(i); 924 // System.err.printf("testPreceding() i:%d bk:%d nextBreak:%d\n", i, bk, nextBreak); 925 if (bk == BreakIterator.DONE && i == 0) { 926 continue; 927 } 928 if (bk == nextBreak && bk < i) { 929 // i is in the gap between two breaks. 930 continue; 931 } 932 if (i<fTestData.fString.length() && getChar32Start(fTestData.fString, i) < i) { 933 // i indexes to a trailing surrogate. 934 // Break Iterators treat an index to either half as referring to the supplemental code point, 935 // with preceding going to some preceding code point. 936 if (fBI.preceding(i) != fBI.preceding(getChar32Start(fTestData.fString, i))) { 937 throw new MonkeyException("preceding of trailing surrogate error", i); 938 } 939 continue; 940 } 941 if (i == nextBreak && bk < nextBreak) { 942 fTestData.fActualBreaks[bk] = true; 943 nextBreak = bk; 944 continue; 945 } 946 throw new MonkeyException("preceding(i)", i); 947 } 948 checkResults("testPreceding", CheckDirection.REVERSE); 949 950 }; 951 952 testIsBoundary()953 void testIsBoundary() { 954 fTestData.clearActualBreaks(); 955 fBI.setText(fTestData.fString); 956 for (int i=fTestData.fString.length(); i>=0; --i) { 957 if (fBI.isBoundary(i)) { 958 fTestData.fActualBreaks[i] = true; 959 } 960 } 961 checkResults("testForwards", CheckDirection.FORWARD); 962 }; 963 964 checkResults(String msg, CheckDirection direction)965 void checkResults(String msg, CheckDirection direction) { 966 if (direction == CheckDirection.FORWARD) { 967 for (int i=0; i<=fTestData.fString.length(); ++i) { 968 if (fTestData.fExpectedBreaks[i] != fTestData.fActualBreaks[i]) { 969 throw new MonkeyException(msg, i); 970 } 971 } 972 } else { 973 for (int i=fTestData.fString.length(); i>=0; i--) { 974 if (fTestData.fExpectedBreaks[i] != fTestData.fActualBreaks[i]) { 975 throw new MonkeyException(msg, i); 976 } 977 } 978 } 979 980 }; 981 982 String fRuleCharBuffer; // source file contents of the reference rules. 983 BreakRules fRuleSet; 984 RuleBasedBreakIterator fBI; 985 MonkeyTestData fTestData; 986 ICU_Rand fRandomGenerator; 987 String fRuleFileName; 988 boolean fVerbose; // True to do long dump of failing data. 989 int fLoopCount; 990 int fErrorCount; 991 992 boolean fDumpExpansions; // Debug flag to output expanded form of rules and sets. 993 StringBuilder fErrorMsgs = new StringBuilder(); 994 995 } 996 997 // Test parameters, specified via Java properties. 998 // 999 // rules=file_name Name of file containing the reference rules. 1000 // seed=nnnnn Random number starting seed. 1001 // Setting the seed allows errors to be reproduced. 1002 // loop=nnn Looping count. Controls running time. 1003 // -1: run forever. 1004 // 0 or greater: run length. 1005 // expansions debug option, show expansions of rules and sets. 1006 // verbose Display details of the failure. 1007 // 1008 // Parameters are passed to the JVM on the command line, or 1009 // via the Eclipse Run Configuration settings, arguments tab, VM parameters. 1010 // For example, 1011 // -ea -Drules=line.txt -Dloop=-1 1012 // 1013 @Test TestMonkey()1014 public void TestMonkey() { 1015 String tests[] = {"grapheme.txt", "word.txt", "line.txt", "line_cj.txt", "sentence.txt", "line_normal.txt", 1016 "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt" 1017 }; 1018 1019 String testNameFromParams = getProperty("rules"); 1020 1021 if (testNameFromParams != null) { 1022 tests = new String[] {testNameFromParams}; 1023 } 1024 1025 int loopCount = getIntProperty("loop", isQuick() ? 100 : 5000); 1026 boolean dumpExpansions = getBooleanProperty("expansions", false); 1027 boolean verbose = getBooleanProperty("verbose", false); 1028 int seed = getIntProperty("seed", 1); 1029 1030 List<RBBIMonkeyImpl> startedTests = new ArrayList<>(); 1031 1032 // Monkey testing is multi-threaded. 1033 // Each set of break rules to be tested is run in a separate thread. 1034 // Each thread/set of rules gets a separate RBBIMonkeyImpl object. 1035 1036 for (String testName: tests) { 1037 logln(String.format("beginning testing of %s", testName)); 1038 1039 RBBIMonkeyImpl test = new RBBIMonkeyImpl(); 1040 1041 test.fDumpExpansions = dumpExpansions; 1042 test.fVerbose = verbose; 1043 test.fRandomGenerator = new ICU_Rand(seed); 1044 test.fLoopCount = loopCount; 1045 test.setup(testName); 1046 1047 test.start(); 1048 startedTests.add(test); 1049 } 1050 1051 StringBuilder errors = new StringBuilder(); 1052 for (RBBIMonkeyImpl test: startedTests) { 1053 try { 1054 test.join(); 1055 errors.append(test.fErrorMsgs); 1056 } catch (InterruptedException e) { 1057 errors.append(e + "\n"); 1058 } 1059 } 1060 String errorMsgs = errors.toString(); 1061 assertEquals(errorMsgs, "", errorMsgs); 1062 1063 } 1064 1065 1066 } 1067