1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2003-2016 International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.dev.test.rbbi; 10 11 12 // Monkey testing of RuleBasedBreakIterator. 13 // The old, original monkey test. TODO: remove 14 // The new monkey test is class RBBIMonkeyTest. 15 16 import java.util.ArrayList; 17 import java.util.Arrays; 18 import java.util.List; 19 import java.util.Locale; 20 21 import org.junit.Test; 22 import org.junit.runner.RunWith; 23 import org.junit.runners.JUnit4; 24 25 import com.ibm.icu.dev.test.TestFmwk; 26 import com.ibm.icu.lang.UCharacter; 27 import com.ibm.icu.lang.UProperty; 28 import com.ibm.icu.text.BreakIterator; 29 import com.ibm.icu.text.RuleBasedBreakIterator; 30 import com.ibm.icu.text.UTF16; 31 import com.ibm.icu.text.UnicodeSet; 32 33 34 /** 35 * Monkey tests for RBBI. These tests have independent implementations of 36 * the Unicode TR boundary rules, and compare results between these and ICU's 37 * implementation, using random data. 38 * 39 * Tests cover Grapheme Cluster (char), Word and Line breaks 40 * 41 * Ported from ICU4C, original code in file source/test/intltest/rbbitst.cpp 42 * 43 */ 44 @RunWith(JUnit4.class) 45 public class RBBITestMonkey extends TestFmwk { 46 // 47 // class RBBIMonkeyKind 48 // 49 // Monkey Test for Break Iteration 50 // Abstract interface class. Concrete derived classes independently 51 // implement the break rules for different iterator types. 52 // 53 // The Monkey Test itself uses doesn't know which type of break iterator it is 54 // testing, but works purely in terms of the interface defined here. 55 // 56 abstract static class RBBIMonkeyKind { RBBIMonkeyKind()57 RBBIMonkeyKind() { 58 fSets = new ArrayList(); 59 fClassNames = new ArrayList(); 60 fAppliedRules = new ArrayList(); 61 } 62 63 // Return a List of UnicodeSets, representing the character classes used 64 // for this type of iterator. charClasses()65 abstract List charClasses(); 66 67 // Set the test text on which subsequent calls to next() will operate setText(StringBuffer text)68 abstract void setText(StringBuffer text); 69 70 // Find the next break position, starting from the specified position. 71 // Return -1 after reaching end of string. next(int i)72 abstract int next(int i); 73 74 // Name of each character class, parallel with charClasses. Used for debugging output 75 // of characters. characterClassNames()76 List<String> characterClassNames() { 77 return fClassNames; 78 } 79 setAppliedRule(int position, String value)80 void setAppliedRule(int position, String value) { 81 fAppliedRules.set(position, value); 82 } 83 getAppliedRule(int position)84 String getAppliedRule(int position) { 85 return fAppliedRules.get(position); 86 } 87 classNameFromCodepoint(int c)88 String classNameFromCodepoint(int c) { 89 // Simply iterate through fSets to find character's class 90 for (int aClassNum = 0; aClassNum < charClasses().size(); aClassNum++) { 91 UnicodeSet classSet = (UnicodeSet)charClasses().get(aClassNum); 92 if (classSet.contains(c)) { 93 return fClassNames.get(aClassNum); 94 } 95 } 96 return "bad class name"; 97 } 98 maxClassNameSize()99 int maxClassNameSize() { 100 int maxSize = 0; 101 for (int aClassNum = 0; aClassNum < charClasses().size(); aClassNum++) { 102 if (fClassNames.get(aClassNum).length() > maxSize) { 103 maxSize = fClassNames.get(aClassNum).length(); 104 } 105 } 106 return maxSize; 107 } 108 109 // Clear `appliedRules` and fill it with empty strings in the size of test text. prepareAppliedRules(int size)110 void prepareAppliedRules(int size) { 111 // Remove all the information in the `appliedRules`. 112 fAppliedRules.clear(); 113 fAppliedRules.ensureCapacity(size + 1); 114 while (fAppliedRules.size() < size + 1) { 115 fAppliedRules.add(""); 116 } 117 } 118 119 // A Character Property, one of the constants defined in class UProperty. 120 // The value of this property will be displayed for the characters 121 // near any test failure. 122 int fCharProperty; 123 124 List fSets; 125 ArrayList<String> fClassNames; 126 ArrayList<String> fAppliedRules; 127 } 128 129 /** 130 * Monkey test subclass for testing Character (Grapheme Cluster) boundaries. 131 * Note: As of Unicode 6.1, fPrependSet is empty, so don't add it to fSets 132 */ 133 static class RBBICharMonkey extends RBBIMonkeyKind { 134 UnicodeSet fCRLFSet; 135 UnicodeSet fControlSet; 136 UnicodeSet fExtendSet; 137 UnicodeSet fRegionalIndicatorSet; 138 UnicodeSet fPrependSet; 139 UnicodeSet fSpacingSet; 140 UnicodeSet fLSet; 141 UnicodeSet fVSet; 142 UnicodeSet fTSet; 143 UnicodeSet fLVSet; 144 UnicodeSet fLVTSet; 145 UnicodeSet fHangulSet; 146 UnicodeSet fZWJSet; 147 UnicodeSet fExtendedPictSet; 148 UnicodeSet fViramaSet; 149 UnicodeSet fLinkingConsonantSet; 150 UnicodeSet fExtCccZwjSet; 151 UnicodeSet fAnySet; 152 153 154 StringBuffer fText; 155 RBBICharMonkey()156 RBBICharMonkey() { 157 fText = null; 158 fCharProperty = UProperty.GRAPHEME_CLUSTER_BREAK; 159 fCRLFSet = new UnicodeSet("[\\r\\n]"); 160 fControlSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Control}]"); 161 fExtendSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]"); 162 fZWJSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = ZWJ}]"); 163 fRegionalIndicatorSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"); 164 fPrependSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Prepend}]"); 165 fSpacingSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = SpacingMark}]"); 166 fLSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = L}]"); 167 fVSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = V}]"); 168 fTSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = T}]"); 169 fLVSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LV}]"); 170 fLVTSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LVT}]"); 171 fHangulSet = new UnicodeSet(); 172 fHangulSet.addAll(fLSet); 173 fHangulSet.addAll(fVSet); 174 fHangulSet.addAll(fTSet); 175 fHangulSet.addAll(fLVSet); 176 fHangulSet.addAll(fLVTSet); 177 178 fExtendedPictSet = new UnicodeSet("[:Extended_Pictographic:]"); 179 fViramaSet = new UnicodeSet("[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&" 180 + "\\p{Indic_Syllabic_Category=Virama}]"); 181 fLinkingConsonantSet = new UnicodeSet("[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&" 182 + "\\p{Indic_Syllabic_Category=Consonant}]"); 183 fExtCccZwjSet = new UnicodeSet("[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]"); 184 fAnySet = new UnicodeSet("[\\u0000-\\U0010ffff]"); 185 186 187 fSets.add(fCRLFSet); fClassNames.add("CRLF"); 188 fSets.add(fControlSet); fClassNames.add("Control"); 189 fSets.add(fExtendSet); fClassNames.add("Extended"); 190 fSets.add(fRegionalIndicatorSet); fClassNames.add("RegionalIndicator"); 191 if (!fPrependSet.isEmpty()) { 192 fSets.add(fPrependSet); fClassNames.add("Prepend"); 193 } 194 fSets.add(fSpacingSet); fClassNames.add("Spacing"); 195 fSets.add(fHangulSet); fClassNames.add("Hangul"); 196 fSets.add(fAnySet); fClassNames.add("Any"); 197 fSets.add(fZWJSet); fClassNames.add("ZWJ"); 198 fSets.add(fExtendedPictSet); fClassNames.add("ExtendedPict"); 199 fSets.add(fViramaSet); fClassNames.add("Virama"); 200 fSets.add(fLinkingConsonantSet); fClassNames.add("LinkingConsonant"); 201 fSets.add(fExtCccZwjSet); fClassNames.add("ExtCccZwj"); 202 } 203 204 205 @Override setText(StringBuffer s)206 void setText(StringBuffer s) { 207 fText = s; 208 prepareAppliedRules(s.length()); 209 } 210 211 @Override charClasses()212 List charClasses() { 213 return fSets; 214 } 215 216 @Override next(int prevPos)217 int next(int prevPos) { 218 int /*p0,*/ p1, p2, p3; // Indices of the significant code points around the 219 // break position being tested. The candidate break 220 // location is before p2. 221 222 int breakPos = -1; 223 224 int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 225 int cBase; // for (X Extend*) patterns, the X character. 226 227 // Previous break at end of string. return DONE. 228 if (prevPos >= fText.length()) { 229 return -1; 230 } 231 /* p0 = */ p1 = p2 = p3 = prevPos; 232 c3 = UTF16.charAt(fText, prevPos); 233 c0 = c1 = c2 = cBase = 0; 234 235 // Loop runs once per "significant" character position in the input text. 236 for (;;) { 237 // Move all of the positions forward in the input string. 238 /* p0 = p1;*/ c0 = c1; 239 p1 = p2; c1 = c2; 240 p2 = p3; c2 = c3; 241 242 // Advance p3 by one codepoint 243 p3 = moveIndex32(fText, p3, 1); 244 c3 = (p3>=fText.length())? -1: UTF16.charAt(fText, p3); 245 246 if (p1 == p2) { 247 // Still warming up the loop. (won't work with zero length strings, but we don't care) 248 continue; 249 } 250 if (p2 == fText.length()) { 251 setAppliedRule(p2, "End of String"); 252 break; 253 } 254 255 // No Extend or Format characters may appear between the CR and LF, 256 // which requires the additional check for p2 immediately following p1. 257 // 258 if (c1==0x0D && c2==0x0A && p1==(p2-1)) { 259 setAppliedRule(p2, "GB 3 CR x LF"); 260 continue; 261 } 262 263 if (fControlSet.contains(c1) || 264 c1 == 0x0D || 265 c1 == 0x0A) { 266 setAppliedRule(p2, "GB 4 ( Control | CR | LF ) <break>"); 267 break; 268 } 269 270 if (fControlSet.contains(c2) || 271 c2 == 0x0D || 272 c2 == 0x0A) { 273 setAppliedRule(p2, "GB 5 <break> ( Control | CR | LF )"); 274 break; 275 } 276 277 278 if (fLSet.contains(c1) && 279 (fLSet.contains(c2) || 280 fVSet.contains(c2) || 281 fLVSet.contains(c2) || 282 fLVTSet.contains(c2))) { 283 setAppliedRule(p2, "GB 6 L x ( L | V | LV | LVT )"); 284 continue; 285 } 286 287 if ((fLVSet.contains(c1) || fVSet.contains(c1)) && 288 (fVSet.contains(c2) || fTSet.contains(c2))) { 289 setAppliedRule(p2, "GB 7 ( LV | V ) x ( V | T )"); 290 continue; 291 } 292 293 if ((fLVTSet.contains(c1) || fTSet.contains(c1)) && 294 fTSet.contains(c2)) { 295 setAppliedRule(p2, "GB 8 ( LVT | T) x T"); 296 continue; 297 } 298 299 if (fExtendSet.contains(c2) || fZWJSet.contains(c2)) { 300 if (!fExtendSet.contains(c1)) { 301 cBase = c1; 302 } 303 setAppliedRule(p2, "GB 9 x (Extend | ZWJ)"); 304 continue; 305 } 306 307 if (fSpacingSet.contains(c2)) { 308 setAppliedRule(p2, "GB 9a x SpacingMark"); 309 continue; 310 } 311 312 if (fPrependSet.contains(c1)) { 313 setAppliedRule(p2, "GB 9b Prepend x"); 314 continue; 315 } 316 317 // Note: Viramas are also included in the ExtCccZwj class. 318 if (fLinkingConsonantSet.contains(c2)) { 319 int pi = p1; 320 boolean sawVirama = false; 321 while (pi > 0 && fExtCccZwjSet.contains(fText.codePointAt(pi))) { 322 if (fViramaSet.contains(fText.codePointAt(pi))) { 323 sawVirama = true; 324 } 325 pi = fText.offsetByCodePoints(pi, -1); 326 } 327 if (sawVirama && fLinkingConsonantSet.contains(fText.codePointAt(pi))) { 328 setAppliedRule(p2, "GB 9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant"); 329 continue; 330 } 331 } 332 333 if (fExtendedPictSet.contains(cBase) && fZWJSet.contains(c1) && fExtendedPictSet.contains(c2) ) { 334 setAppliedRule(p2, "GB 11 Extended_Pictographic Extend * ZWJ x Extended_Pictographic"); 335 continue; 336 } 337 338 // Note: The first if condition is a little tricky. We only need to force 339 // a break if there are three or more contiguous RIs. If there are 340 // only two, a break following will occur via other rules, and will include 341 // any trailing extend characters, which is needed behavior. 342 if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1) 343 && fRegionalIndicatorSet.contains(c2)) { 344 setAppliedRule(p2, "GB 12-13 Regional_Indicator x Regional_Indicator"); 345 break; 346 } 347 if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) { 348 setAppliedRule(p2, "GB 12-13 Regional_Indicator x Regional_Indicator"); 349 continue; 350 } 351 352 setAppliedRule(p2, "GB 999 Any <break> Any"); 353 break; 354 } 355 356 breakPos = p2; 357 return breakPos; 358 } 359 360 } 361 362 /** 363 * 364 * Word Monkey Test Class 365 * 366 * 367 * 368 */ 369 static class RBBIWordMonkey extends RBBIMonkeyKind { 370 StringBuffer fText; 371 372 UnicodeSet fCRSet; 373 UnicodeSet fLFSet; 374 UnicodeSet fNewlineSet; 375 UnicodeSet fRegionalIndicatorSet; 376 UnicodeSet fKatakanaSet; 377 UnicodeSet fHebrew_LetterSet; 378 UnicodeSet fALetterSet; 379 UnicodeSet fSingle_QuoteSet; 380 UnicodeSet fDouble_QuoteSet; 381 UnicodeSet fMidNumLetSet; 382 UnicodeSet fMidLetterSet; 383 UnicodeSet fMidNumSet; 384 UnicodeSet fNumericSet; 385 UnicodeSet fFormatSet; 386 UnicodeSet fExtendSet; 387 UnicodeSet fExtendNumLetSet; 388 UnicodeSet fWSegSpaceSet; 389 UnicodeSet fOtherSet; 390 UnicodeSet fDictionarySet; 391 UnicodeSet fZWJSet; 392 UnicodeSet fExtendedPictSet; 393 RBBIWordMonkey()394 RBBIWordMonkey() { 395 fCharProperty = UProperty.WORD_BREAK; 396 397 fCRSet = new UnicodeSet("[\\p{Word_Break = CR}]"); 398 fLFSet = new UnicodeSet("[\\p{Word_Break = LF}]"); 399 fNewlineSet = new UnicodeSet("[\\p{Word_Break = Newline}]"); 400 fRegionalIndicatorSet = new UnicodeSet("[\\p{Word_Break = Regional_Indicator}]"); 401 fKatakanaSet = new UnicodeSet("[\\p{Word_Break = Katakana}]"); 402 fHebrew_LetterSet = new UnicodeSet("[\\p{Word_Break = Hebrew_Letter}]"); 403 fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter} @]"); 404 fSingle_QuoteSet = new UnicodeSet("[\\p{Word_Break = Single_Quote}]"); 405 fDouble_QuoteSet = new UnicodeSet("[\\p{Word_Break = Double_Quote}]"); 406 fMidNumLetSet = new UnicodeSet("[\\p{Word_Break = MidNumLet}]"); 407 fMidLetterSet = new UnicodeSet("[\\p{Word_Break = MidLetter} - [\\: \\uFE55 \\uFF1A]]"); 408 fMidNumSet = new UnicodeSet("[\\p{Word_Break = MidNum}]"); 409 fNumericSet = new UnicodeSet("[\\p{Word_Break = Numeric}]"); 410 fFormatSet = new UnicodeSet("[\\p{Word_Break = Format}]"); 411 fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]"); 412 // There are some sc=Hani characters with WB=Extend. 413 // The break rules need to pick one or the other because 414 // Extend overlapping with something else is messy. 415 // For Unicode 13, we chose to keep U+16FF0 & U+16FF1 416 // in $Han (for $dictionary) and out of $Extend. 417 fExtendSet = new UnicodeSet("[\\p{Word_Break = Extend}-[:Hani:]]"); 418 fWSegSpaceSet = new UnicodeSet("[\\p{Word_Break = WSegSpace}]"); 419 fZWJSet = new UnicodeSet("[\\p{Word_Break = ZWJ}]"); 420 fExtendedPictSet = new UnicodeSet("[:Extended_Pictographic:]"); 421 422 fDictionarySet = new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]"); 423 fDictionarySet.addAll(fKatakanaSet); 424 fDictionarySet.addAll(new UnicodeSet("[\\p{LineBreak = Complex_Context}]")); 425 426 fALetterSet.removeAll(fDictionarySet); 427 428 fOtherSet = new UnicodeSet(); 429 fOtherSet.complement(); 430 fOtherSet.removeAll(fCRSet); 431 fOtherSet.removeAll(fLFSet); 432 fOtherSet.removeAll(fNewlineSet); 433 fOtherSet.removeAll(fALetterSet); 434 fOtherSet.removeAll(fSingle_QuoteSet); 435 fOtherSet.removeAll(fDouble_QuoteSet); 436 fOtherSet.removeAll(fKatakanaSet); 437 fOtherSet.removeAll(fHebrew_LetterSet); 438 fOtherSet.removeAll(fMidLetterSet); 439 fOtherSet.removeAll(fMidNumSet); 440 fOtherSet.removeAll(fNumericSet); 441 fOtherSet.removeAll(fFormatSet); 442 fOtherSet.removeAll(fExtendSet); 443 fOtherSet.removeAll(fExtendNumLetSet); 444 fOtherSet.removeAll(fWSegSpaceSet); 445 fOtherSet.removeAll(fRegionalIndicatorSet); 446 fOtherSet.removeAll(fZWJSet); 447 fOtherSet.removeAll(fExtendedPictSet); 448 449 // Inhibit dictionary characters from being tested at all. 450 // remove surrogates so as to not generate higher CJK characters 451 fOtherSet.removeAll(new UnicodeSet("[[\\p{LineBreak = Complex_Context}][:Line_Break=Surrogate:]]")); 452 fOtherSet.removeAll(fDictionarySet); 453 454 fSets.add(fCRSet); fClassNames.add("CR"); 455 fSets.add(fLFSet); fClassNames.add("LF"); 456 fSets.add(fNewlineSet); fClassNames.add("Newline"); 457 fSets.add(fRegionalIndicatorSet); fClassNames.add("RegionalIndicator"); 458 fSets.add(fHebrew_LetterSet); fClassNames.add("Hebrew"); 459 fSets.add(fALetterSet); fClassNames.add("ALetter"); 460 //fSets.add(fKatakanaSet); // Omit Katakana from fSets, which omits Katakana characters 461 // from the test data. They are all in the dictionary set, 462 // which this (old, to be retired) monkey test cannot handle. 463 fSets.add(fSingle_QuoteSet); fClassNames.add("Single Quote"); 464 fSets.add(fDouble_QuoteSet); fClassNames.add("Double Quote"); 465 fSets.add(fMidLetterSet); fClassNames.add("MidLetter"); 466 fSets.add(fMidNumLetSet); fClassNames.add("MidNumLet"); 467 fSets.add(fMidNumSet); fClassNames.add("MidNum"); 468 fSets.add(fNumericSet); fClassNames.add("Numeric"); 469 fSets.add(fFormatSet); fClassNames.add("Format"); 470 fSets.add(fExtendSet); fClassNames.add("Extend"); 471 fSets.add(fExtendNumLetSet); fClassNames.add("ExtendNumLet"); 472 fSets.add(fWSegSpaceSet); fClassNames.add("WSegSpace"); 473 fSets.add(fZWJSet); fClassNames.add("ZWJ"); 474 fSets.add(fExtendedPictSet); fClassNames.add("ExtendedPict"); 475 fSets.add(fOtherSet); fClassNames.add("Other"); 476 } 477 478 479 @Override charClasses()480 List charClasses() { 481 return fSets; 482 } 483 484 @Override setText(StringBuffer s)485 void setText(StringBuffer s) { 486 fText = s; 487 prepareAppliedRules(s.length()); 488 } 489 490 @Override next(int prevPos)491 int next(int prevPos) { 492 int /*p0,*/ p1, p2, p3; // Indices of the significant code points around the 493 // break position being tested. The candidate break 494 // location is before p2. 495 int breakPos = -1; 496 497 int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 498 499 // Previous break at end of string. return DONE. 500 if (prevPos >= fText.length()) { 501 return -1; 502 } 503 /*p0 =*/ p1 = p2 = p3 = prevPos; 504 c3 = UTF16.charAt(fText, prevPos); 505 c0 = c1 = c2 = 0; 506 507 508 509 // Loop runs once per "significant" character position in the input text. 510 for (;;) { 511 // Move all of the positions forward in the input string. 512 /*p0 = p1;*/ c0 = c1; 513 p1 = p2; c1 = c2; 514 p2 = p3; c2 = c3; 515 516 // Advance p3 by X(Extend | Format)* Rule 4 517 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change) 518 do { 519 p3 = moveIndex32(fText, p3, 1); 520 c3 = -1; 521 if (p3>=fText.length()) { 522 break; 523 } 524 c3 = UTF16.charAt(fText, p3); 525 if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) { 526 break; 527 } 528 } 529 while (setContains(fFormatSet, c3) || setContains(fExtendSet, c3) || setContains(fZWJSet, c3)); 530 531 if (p1 == p2) { 532 // Still warming up the loop. (won't work with zero length strings, but we don't care) 533 continue; 534 } 535 if (p2 == fText.length()) { 536 // Reached end of string. Always a break position. 537 break; 538 } 539 540 // No Extend or Format characters may appear between the CR and LF, 541 // which requires the additional check for p2 immediately following p1. 542 // 543 if (c1==0x0D && c2==0x0A) { 544 setAppliedRule(p2, "WB 3 CR x LF"); 545 continue; 546 } 547 548 // 549 if (fCRSet.contains(c1) || fLFSet.contains(c1) || fNewlineSet.contains(c1)) { 550 setAppliedRule(p2, "WB 3a Break before and after newlines (including CR and LF)"); 551 break; 552 } 553 if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) { 554 setAppliedRule(p2, "WB 3a Break before and after newlines (including CR and LF)"); 555 break; 556 } 557 558 // Not ignoring extend chars, so peek into input text to 559 // get the potential ZWJ, the character immediately preceding c2. 560 if (fZWJSet.contains(fText.codePointBefore(p2)) && fExtendedPictSet.contains(c2)) { 561 setAppliedRule(p2, "WB 3c ZWJ x Extended_Pictographic"); 562 continue; 563 } 564 565 if (fWSegSpaceSet.contains(fText.codePointBefore(p2)) && fWSegSpaceSet.contains(c2)) { 566 setAppliedRule(p2, "WB 3d Keep horizontal whitespace together"); 567 continue; 568 } 569 570 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) && 571 (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) { 572 setAppliedRule(p2, "WB 4 (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)"); 573 continue; 574 } 575 576 if ( (fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) && 577 (fMidLetterSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2)) && 578 (setContains(fALetterSet, c3) || setContains(fHebrew_LetterSet, c3))) { 579 setAppliedRule(p2, "WB 6 (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)"); 580 continue; 581 } 582 583 if ((fALetterSet.contains(c0) || fHebrew_LetterSet.contains(c0)) && 584 (fMidLetterSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1)) && 585 (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) { 586 setAppliedRule(p2, "WB 7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)"); 587 continue; 588 } 589 590 if (fHebrew_LetterSet.contains(c1) && fSingle_QuoteSet.contains(c2)) { 591 setAppliedRule(p2, "WB 7a Hebrew_Letter x Single_Quote"); 592 continue; 593 } 594 595 if (fHebrew_LetterSet.contains(c1) && fDouble_QuoteSet.contains(c2) && setContains(fHebrew_LetterSet,c3)) { 596 setAppliedRule(p2, "WB 7b Hebrew_Letter x Single_Quote"); 597 continue; 598 } 599 600 if (fHebrew_LetterSet.contains(c0) && fDouble_QuoteSet.contains(c1) && fHebrew_LetterSet.contains(c2)) { 601 setAppliedRule(p2, "WB 7c Hebrew_Letter Double_Quote x Hebrew_Letter"); 602 continue; 603 } 604 605 if (fNumericSet.contains(c1) && 606 fNumericSet.contains(c2)) { 607 setAppliedRule(p2, "WB 8 Numeric x Numeric"); 608 continue; 609 } 610 611 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) && 612 fNumericSet.contains(c2)) { 613 setAppliedRule(p2, "WB 9 (ALetter | Hebrew_Letter) x Numeric"); 614 continue; 615 } 616 617 if (fNumericSet.contains(c1) && 618 (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) { 619 setAppliedRule(p2, "WB 10 Numeric x (ALetter | Hebrew_Letter)"); 620 continue; 621 } 622 623 if (fNumericSet.contains(c0) && 624 (fMidNumSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1)) && 625 fNumericSet.contains(c2)) { 626 setAppliedRule(p2, "WB 11 Numeric (MidNum | MidNumLet | Single_Quote) x Numeric"); 627 continue; 628 } 629 630 if (fNumericSet.contains(c1) && 631 (fMidNumSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2)) && 632 setContains(fNumericSet, c3)) { 633 setAppliedRule(p2, "WB 12 Numeric x (MidNum | MidNumLet | SingleQuote) Numeric"); 634 continue; 635 } 636 637 // Note: matches UAX 29 rules, but doesn't come into play for ICU because 638 // all Katakana are handled by the dictionary breaker. 639 if (fKatakanaSet.contains(c1) && 640 fKatakanaSet.contains(c2)) { 641 setAppliedRule(p2, "WB 13 Katakana x Katakana"); 642 continue; 643 } 644 645 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1) ||fNumericSet.contains(c1) || 646 fKatakanaSet.contains(c1) || fExtendNumLetSet.contains(c1)) && 647 fExtendNumLetSet.contains(c2)) { 648 setAppliedRule(p2, "WB 13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet"); 649 continue; 650 } 651 652 if (fExtendNumLetSet.contains(c1) && 653 (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2) || 654 fNumericSet.contains(c2) || fKatakanaSet.contains(c2))) { 655 setAppliedRule(p2, "WB 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)"); 656 continue; 657 } 658 659 660 if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1)) { 661 setAppliedRule(p2, "WB 15-17 Group pairs of Regional Indicators."); 662 break; 663 } 664 if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) { 665 setAppliedRule(p2, "WB 15-17 Group pairs of Regional Indicators."); 666 continue; 667 } 668 669 setAppliedRule(p2, "WB 999"); 670 break; 671 } 672 673 breakPos = p2; 674 return breakPos; 675 } 676 } 677 678 679 static class RBBILineMonkey extends RBBIMonkeyKind { 680 // UnicodeSets for each of the Line Breaking character classes. 681 // Order matches that of Unicode UAX 14, Table 1, which makes it a little easier 682 // to verify that they are all accounted for. 683 684 // XUnicodeSet is like UnicodeSet, except that the method contains(int codePoint) does not 685 // throw exceptions on out-of-range codePoints. This matches ICU4C behavior. 686 // The LineMonkey test (ported from ICU4C) relies on this behavior, it uses a value of -1 687 // to represent a non-codepoint that is not included in any of the property sets. 688 // This happens for rule 30a. 689 class XUnicodeSet extends UnicodeSet { XUnicodeSet(String pattern)690 XUnicodeSet(String pattern) { super(pattern); } XUnicodeSet()691 XUnicodeSet() { super(); } 692 @Override contains(int codePoint)693 public boolean contains(int codePoint) { 694 return codePoint < UnicodeSet.MIN_VALUE || codePoint > UnicodeSet.MAX_VALUE ? 695 false : super.contains(codePoint); 696 } 697 } 698 699 // Declare these variables as XUnicodeSet, not merely as UnicodeSet, 700 // so that when we copy a new declaration from C++ (where only UnicodeSet exists), 701 // the missing 'X' prefix is visible; 702 // and when the prefix is there and we copy a new initializer we get a compiler error. 703 // (Otherwise we rely on the caller catching the IAE from using codePoint=-1 704 // and failing with a message that tells us what to do.) 705 XUnicodeSet fBK; 706 XUnicodeSet fCR; 707 XUnicodeSet fLF; 708 XUnicodeSet fCM; 709 XUnicodeSet fNL; 710 XUnicodeSet fSG; 711 XUnicodeSet fWJ; 712 XUnicodeSet fZW; 713 XUnicodeSet fGL; 714 XUnicodeSet fSP; 715 XUnicodeSet fB2; 716 XUnicodeSet fBA; 717 XUnicodeSet fBB; 718 XUnicodeSet fHH; 719 XUnicodeSet fHY; 720 XUnicodeSet fCB; 721 XUnicodeSet fCL; 722 XUnicodeSet fCP; 723 XUnicodeSet fEX; 724 XUnicodeSet fIN; 725 XUnicodeSet fNS; 726 XUnicodeSet fOP; 727 XUnicodeSet fQU; 728 XUnicodeSet fIS; 729 XUnicodeSet fNU; 730 XUnicodeSet fPO; 731 XUnicodeSet fPR; 732 XUnicodeSet fSY; 733 XUnicodeSet fAI; 734 XUnicodeSet fAL; 735 XUnicodeSet fCJ; 736 XUnicodeSet fH2; 737 XUnicodeSet fH3; 738 XUnicodeSet fHL; 739 XUnicodeSet fID; 740 XUnicodeSet fJL; 741 XUnicodeSet fJV; 742 XUnicodeSet fJT; 743 XUnicodeSet fRI; 744 XUnicodeSet fXX; 745 XUnicodeSet fEB; 746 XUnicodeSet fEM; 747 XUnicodeSet fZWJ; 748 XUnicodeSet fOP30; 749 XUnicodeSet fCP30; 750 XUnicodeSet fExtPictUnassigned; 751 752 StringBuffer fText; 753 int fOrigPositions; 754 RBBILineMonkey()755 RBBILineMonkey() 756 { 757 fCharProperty = UProperty.LINE_BREAK; 758 759 fBK = new XUnicodeSet("[\\p{Line_Break=BK}]"); 760 fCR = new XUnicodeSet("[\\p{Line_break=CR}]"); 761 fLF = new XUnicodeSet("[\\p{Line_break=LF}]"); 762 fCM = new XUnicodeSet("[\\p{Line_break=CM}]"); 763 fNL = new XUnicodeSet("[\\p{Line_break=NL}]"); 764 fSG = new XUnicodeSet("[\\ud800-\\udfff]"); 765 fWJ = new XUnicodeSet("[\\p{Line_break=WJ}]"); 766 fZW = new XUnicodeSet("[\\p{Line_break=ZW}]"); 767 fGL = new XUnicodeSet("[\\p{Line_break=GL}]"); 768 fSP = new XUnicodeSet("[\\p{Line_break=SP}]"); 769 fB2 = new XUnicodeSet("[\\p{Line_break=B2}]"); 770 fBA = new XUnicodeSet("[\\p{Line_break=BA}]"); 771 fBB = new XUnicodeSet("[\\p{Line_break=BB}]"); 772 fHH = new XUnicodeSet(); 773 fHY = new XUnicodeSet("[\\p{Line_break=HY}]"); 774 fCB = new XUnicodeSet("[\\p{Line_break=CB}]"); 775 fCL = new XUnicodeSet("[\\p{Line_break=CL}]"); 776 fCP = new XUnicodeSet("[\\p{Line_break=CP}]"); 777 fEX = new XUnicodeSet("[\\p{Line_break=EX}]"); 778 fIN = new XUnicodeSet("[\\p{Line_break=IN}]"); 779 fNS = new XUnicodeSet("[\\p{Line_break=NS}]"); 780 fOP = new XUnicodeSet("[\\p{Line_break=OP}]"); 781 fQU = new XUnicodeSet("[\\p{Line_break=QU}]"); 782 fIS = new XUnicodeSet("[\\p{Line_break=IS}]"); 783 fNU = new XUnicodeSet("[\\p{Line_break=NU}]"); 784 fPO = new XUnicodeSet("[\\p{Line_break=PO}]"); 785 fPR = new XUnicodeSet("[\\p{Line_break=PR}]"); 786 fSY = new XUnicodeSet("[\\p{Line_break=SY}]"); 787 fAI = new XUnicodeSet("[\\p{Line_break=AI}]"); 788 fAL = new XUnicodeSet("[\\p{Line_break=AL}]"); 789 fCJ = new XUnicodeSet("[\\p{Line_break=CJ}]"); 790 fH2 = new XUnicodeSet("[\\p{Line_break=H2}]"); 791 fH3 = new XUnicodeSet("[\\p{Line_break=H3}]"); 792 fHL = new XUnicodeSet("[\\p{Line_break=HL}]"); 793 fID = new XUnicodeSet("[\\p{Line_break=ID}]"); 794 fJL = new XUnicodeSet("[\\p{Line_break=JL}]"); 795 fJV = new XUnicodeSet("[\\p{Line_break=JV}]"); 796 fJT = new XUnicodeSet("[\\p{Line_break=JT}]"); 797 fRI = new XUnicodeSet("[\\p{Line_break=RI}]"); 798 fXX = new XUnicodeSet("[\\p{Line_break=XX}]"); 799 fEB = new XUnicodeSet("[\\p{Line_break=EB}]"); 800 fEM = new XUnicodeSet("[\\p{Line_break=EM}]"); 801 fZWJ = new XUnicodeSet("[\\p{Line_break=ZWJ}]"); 802 fOP30 = new XUnicodeSet("[\\p{Line_break=OP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"); 803 fCP30 = new XUnicodeSet("[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"); 804 fExtPictUnassigned = new XUnicodeSet("[\\p{Extended_Pictographic}&\\p{Cn}]"); 805 806 // Remove dictionary characters. 807 // The monkey test reference implementation of line break does not replicate the dictionary behavior, 808 // so dictionary characters are omitted from the monkey test data. 809 @SuppressWarnings("unused") 810 UnicodeSet dictionarySet = new UnicodeSet( 811 "[[:LineBreak = Complex_Context:] & [[:Script = Thai:][:Script = Lao:][:Script = Khmer:] [:script = Myanmar:]]]"); 812 813 fAL.addAll(fXX); // Default behavior for XX is identical to AL 814 fAL.addAll(fAI); // Default behavior for AI is identical to AL 815 fAL.addAll(fSG); // Default behavior for SG (unpaired surrogates) is AL 816 817 fNS.addAll(fCJ); // Default behavior for CJ is identical to NS. 818 fCM.addAll(fZWJ); // ZWJ behaves as a CM. 819 820 fHH.add('\u2010'); // Hyphen, '‐' 821 822 fSets.add(fBK); fClassNames.add("BK"); 823 fSets.add(fCR); fClassNames.add("CR"); 824 fSets.add(fLF); fClassNames.add("LF"); 825 fSets.add(fCM); fClassNames.add("CM"); 826 fSets.add(fNL); fClassNames.add("NL"); 827 fSets.add(fWJ); fClassNames.add("WJ"); 828 fSets.add(fZW); fClassNames.add("ZW"); 829 fSets.add(fGL); fClassNames.add("GL"); 830 fSets.add(fSP); fClassNames.add("SP"); 831 fSets.add(fB2); fClassNames.add("B2"); 832 fSets.add(fBA); fClassNames.add("BA"); 833 fSets.add(fBB); fClassNames.add("BB"); 834 fSets.add(fHY); fClassNames.add("HY"); 835 fSets.add(fCB); fClassNames.add("CB"); 836 fSets.add(fCL); fClassNames.add("CL"); 837 fSets.add(fCP); fClassNames.add("CP"); 838 fSets.add(fEX); fClassNames.add("EX"); 839 fSets.add(fIN); fClassNames.add("IN"); 840 fSets.add(fJL); fClassNames.add("JL"); 841 fSets.add(fJT); fClassNames.add("JT"); 842 fSets.add(fJV); fClassNames.add("JV"); 843 fSets.add(fNS); fClassNames.add("NV"); 844 fSets.add(fOP); fClassNames.add("OP"); 845 fSets.add(fQU); fClassNames.add("QU"); 846 fSets.add(fIS); fClassNames.add("IS"); 847 fSets.add(fNU); fClassNames.add("NU"); 848 fSets.add(fPO); fClassNames.add("PO"); 849 fSets.add(fPR); fClassNames.add("PR"); 850 fSets.add(fSY); fClassNames.add("SY"); 851 fSets.add(fAI); fClassNames.add("AI"); 852 fSets.add(fAL); fClassNames.add("AL"); 853 fSets.add(fH2); fClassNames.add("H2"); 854 fSets.add(fH3); fClassNames.add("H3"); 855 fSets.add(fHL); fClassNames.add("HL"); 856 fSets.add(fID); fClassNames.add("ID"); 857 fSets.add(fRI); fClassNames.add("RI"); 858 fSets.add(fSG); fClassNames.add("SG"); 859 fSets.add(fEB); fClassNames.add("EB"); 860 fSets.add(fEM); fClassNames.add("EM"); 861 fSets.add(fZWJ); fClassNames.add("ZWJ"); 862 // TODO: fOP30 & fCP30 overlap with plain fOP. Probably OK, but fOP/CP chars will be over-represented. 863 fSets.add(fOP30); fClassNames.add("OP30"); 864 fSets.add(fCP30); fClassNames.add("CP30"); 865 fSets.add(fExtPictUnassigned); fClassNames.add("fExtPictUnassigned"); 866 } 867 868 @Override setText(StringBuffer s)869 void setText(StringBuffer s) { 870 fText = s; 871 prepareAppliedRules(s.length()); 872 } 873 874 875 876 877 @Override next(int startPos)878 int next(int startPos) { 879 int pos; // Index of the char following a potential break position 880 int thisChar; // Character at above position "pos" 881 882 int prevPos; // Index of the char preceding a potential break position 883 int prevChar; // Character at above position. Note that prevChar 884 // // and thisChar may not be adjacent because combining 885 // // characters between them will be ignored. 886 887 int prevPosX2; 888 int prevCharX2; // Character before prevChar, more context for LB 21a 889 890 int nextPos; // Index of the next character following pos. 891 // // Usually skips over combining marks. 892 int tPos; // temp value. 893 int matchVals[] = null; // Number Expression Match Results 894 895 896 if (startPos >= fText.length()) { 897 return -1; 898 } 899 900 901 // Initial values for loop. Loop will run the first time without finding breaks, 902 // while the invalid values shift out and the "this" and 903 // "prev" positions are filled in with good values. 904 pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration. 905 thisChar = prevChar = prevCharX2 = 0; 906 nextPos = startPos; 907 908 909 // Loop runs once per position in the test text, until a break position 910 // is found. In each iteration, we are testing for a possible break 911 // just preceding the character at index "pos". The character preceding 912 // this char is at position "prevPos"; because of combining sequences, 913 // "prevPos" can be arbitrarily far before "pos". 914 for (;;) { 915 // Advance to the next position to be tested. 916 prevPosX2 = prevPos; 917 prevCharX2 = prevChar; 918 prevPos = pos; 919 prevChar = thisChar; 920 pos = nextPos; 921 nextPos = moveIndex32(fText, pos, 1); 922 923 if (pos >= fText.length()) { 924 setAppliedRule(pos, "LB 2 Break at end of text"); 925 break; 926 } 927 928 // We do this rule out-of-order because the adjustment does 929 // not effect the way that rules LB 3 through LB 6 match, 930 // and doing it here rather than after LB 6 is substantially 931 // simpler when combining sequences do occur. 932 933 934 // LB 9 Keep combining sequences together. 935 // advance over any CM class chars at "pos", 936 // result is "nextPos" for the following loop iteration. 937 thisChar = UTF16.charAt(fText, pos); 938 if (!(fSP.contains(thisChar) || fBK.contains(thisChar) || thisChar==0x0d || 939 thisChar==0x0a || fNL.contains(thisChar) || fZW.contains(thisChar) )) { 940 for (;;) { 941 if (nextPos == fText.length()) { 942 break; 943 } 944 int nextChar = UTF16.charAt(fText, nextPos); 945 if (!fCM.contains(nextChar)) { 946 break; 947 } 948 nextPos = moveIndex32(fText, nextPos, 1); 949 } 950 } 951 952 // LB 9 Treat X CM* as if it were X 953 // No explicit action required. 954 955 // LB 10 Treat any remaining combining mark as AL 956 if (fCM.contains(thisChar)) { 957 thisChar = 'A'; 958 } 959 960 961 // If the loop is still warming up - if we haven't shifted the initial 962 // -1 positions out of prevPos yet - loop back to advance the 963 // position in the input without any further looking for breaks. 964 if (prevPos == -1) { 965 setAppliedRule(pos, "LB 9 adjust for combining sequences."); 966 continue; 967 } 968 969 if (fBK.contains(prevChar)) { 970 setAppliedRule(pos, "LB 4 Always break after hard line breaks"); 971 break; 972 } 973 974 if (fCR.contains(prevChar) && fLF.contains(thisChar)) { 975 setAppliedRule(pos, "LB 5 Break after CR, LF, NL, but not inside CR LF"); 976 continue; 977 } 978 if (fCR.contains(prevChar) || 979 fLF.contains(prevChar) || 980 fNL.contains(prevChar)) { 981 setAppliedRule(pos, "LB 5 Break after CR, LF, NL, but not inside CR LF"); 982 break; 983 } 984 985 if (fBK.contains(thisChar) || fCR.contains(thisChar) || 986 fLF.contains(thisChar) || fNL.contains(thisChar) ) { 987 setAppliedRule(pos, "LB 6 Don't break before hard line breaks"); 988 continue; 989 } 990 991 992 if (fSP.contains(thisChar)) { 993 setAppliedRule(pos, "LB 7 Don't break before spaces or zero-width space"); 994 continue; 995 } 996 997 if (fZW.contains(thisChar)) { 998 setAppliedRule(pos, "LB 7 Don't break before spaces or zero-width space"); 999 continue; 1000 } 1001 1002 // ZW SP* ÷ 1003 // Scan backwards from prevChar for SP* ZW 1004 tPos = prevPos; 1005 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { 1006 tPos = moveIndex32(fText, tPos, -1); 1007 } 1008 if (fZW.contains(UTF16.charAt(fText, tPos))) { 1009 setAppliedRule(pos, "LB 8 Break after zero width space"); 1010 break; 1011 } 1012 1013 // Move this test up, before LB8a, because numbers can match a longer sequence that would 1014 // also match 8a. e.g. NU ZWJ IS PO (ZWJ acts like CM) 1015 matchVals = LBNumberCheck(fText, prevPos, matchVals); 1016 if (matchVals[0] != -1) { 1017 // Matched a number. But could have been just a single digit, which would 1018 // not represent a "no break here" between prevChar and thisChar 1019 int numEndIdx = matchVals[1]; // idx of first char following num 1020 if (numEndIdx > pos) { 1021 // Number match includes at least the two chars being checked 1022 if (numEndIdx > nextPos) { 1023 // Number match includes additional chars. Update pos and nextPos 1024 // so that next loop iteration will continue at the end of the number, 1025 // checking for breaks between last char in number & whatever follows. 1026 nextPos = numEndIdx; 1027 pos = numEndIdx; 1028 do { 1029 pos = moveIndex32(fText, pos, -1); 1030 thisChar = UTF16.charAt(fText, pos); 1031 } 1032 while (fCM.contains(thisChar)); 1033 } 1034 setAppliedRule(pos, "LB 25 Numbers"); 1035 continue; 1036 } 1037 } 1038 1039 // The monkey test's way of ignoring combining characters doesn't work 1040 // for this rule. ZWJ is also a CM. Need to get the actual character 1041 // preceding "thisChar", not ignoring combining marks, possibly ZWJ. 1042 { 1043 int prevC = fText.codePointBefore(pos); 1044 if (fZWJ.contains(prevC)) { 1045 setAppliedRule(pos, "LB 8a ZWJ x"); 1046 continue; 1047 } 1048 } 1049 1050 // appliedRule: "LB 9, 10"; // Already done, at top of loop."; 1051 1052 1053 // x WJ 1054 // WJ x 1055 if (fWJ.contains(thisChar) || fWJ.contains(prevChar)) { 1056 setAppliedRule(pos, "LB 11 Do not break before or after WORD JOINER and related characters."); 1057 continue; 1058 } 1059 1060 1061 if (fGL.contains(prevChar)) { 1062 setAppliedRule(pos, "LB 12 GL x"); 1063 continue; 1064 } 1065 1066 if (!(fSP.contains(prevChar) || 1067 fBA.contains(prevChar) || 1068 fHY.contains(prevChar) ) && fGL.contains(thisChar)) { 1069 setAppliedRule(pos, "LB 12a [^SP BA HY] x GL"); 1070 continue; 1071 } 1072 1073 if (fCL.contains(thisChar) || 1074 fCP.contains(thisChar) || 1075 fEX.contains(thisChar) || 1076 fSY.contains(thisChar)) { 1077 setAppliedRule(pos, "LB 13 Don't break before closings"); 1078 continue; 1079 } 1080 1081 // Scan backwards, checking for this sequence. 1082 // The OP char could include combining marks, so we actually check for 1083 // OP CM* SP* x 1084 tPos = prevPos; 1085 if (fSP.contains(prevChar)) { 1086 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { 1087 tPos=moveIndex32(fText, tPos, -1); 1088 } 1089 } 1090 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { 1091 tPos=moveIndex32(fText, tPos, -1); 1092 } 1093 if (fOP.contains(UTF16.charAt(fText, tPos))) { 1094 setAppliedRule(pos, "LB 14 Don't break after OP SP*"); 1095 continue; 1096 } 1097 1098 if (nextPos < fText.length()) { 1099 int nextChar = fText.codePointAt(nextPos); 1100 if (fSP.contains(prevChar) && fIS.contains(thisChar) && fNU.contains(nextChar)) { 1101 setAppliedRule(pos, "LB 14a Break before an IS that begins a number and follows a space"); 1102 break; 1103 } 1104 } 1105 1106 if (fIS.contains(thisChar)) { 1107 setAppliedRule(pos, "LB 14b Do not break before numeric separators, even after spaces"); 1108 continue; 1109 } 1110 1111 if (fOP.contains(thisChar)) { 1112 // Scan backwards from prevChar to see if it is preceded by QU CM* SP* 1113 tPos = prevPos; 1114 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { 1115 tPos = moveIndex32(fText, tPos, -1); 1116 } 1117 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { 1118 tPos = moveIndex32(fText, tPos, -1); 1119 } 1120 if (fQU.contains(UTF16.charAt(fText, tPos))) { 1121 setAppliedRule(pos, "LB 15 QU SP* x OP"); 1122 continue; 1123 } 1124 } 1125 1126 if (fNS.contains(thisChar)) { 1127 tPos = prevPos; 1128 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { 1129 tPos = moveIndex32(fText, tPos, -1); 1130 } 1131 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { 1132 tPos = moveIndex32(fText, tPos, -1); 1133 } 1134 if (fCL.contains(UTF16.charAt(fText, tPos)) || fCP.contains(UTF16.charAt(fText, tPos))) { 1135 setAppliedRule(pos, "LB 16 (CL | CP) SP* x NS"); 1136 continue; 1137 } 1138 } 1139 1140 1141 if (fB2.contains(thisChar)) { 1142 tPos = prevPos; 1143 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { 1144 tPos = moveIndex32(fText, tPos, -1); 1145 } 1146 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { 1147 tPos = moveIndex32(fText, tPos, -1); 1148 } 1149 if (fB2.contains(UTF16.charAt(fText, tPos))) { 1150 setAppliedRule(pos, "LB 17 B2 SP* x B2"); 1151 continue; 1152 } 1153 } 1154 1155 if (fSP.contains(prevChar)) { 1156 setAppliedRule(pos, "LB 18 break after space"); 1157 break; 1158 } 1159 1160 // x QU 1161 // QU x 1162 if (fQU.contains(thisChar) || fQU.contains(prevChar)) { 1163 setAppliedRule(pos, "LB 19"); 1164 continue; 1165 } 1166 1167 if (fCB.contains(thisChar) || fCB.contains(prevChar)) { 1168 setAppliedRule(pos, "LB 20 Break around a CB"); 1169 break; 1170 } 1171 1172 // Don't break between Hyphens and letters if a break precedes the hyphen. 1173 // Formerly this was a Finnish tailoring. 1174 // Moved to root in ICU 63. This is an ICU customization, not in UAX-14. 1175 // ^($HY | $HH) $AL; 1176 if (fAL.contains(thisChar) && (fHY.contains(prevChar) || fHH.contains(prevChar)) && 1177 prevPosX2 == -1) { 1178 setAppliedRule(pos, "LB 20.09"); 1179 continue; 1180 } 1181 1182 if (fBA.contains(thisChar) || 1183 fHY.contains(thisChar) || 1184 fNS.contains(thisChar) || 1185 fBB.contains(prevChar) ) { 1186 setAppliedRule(pos, "LB 21"); 1187 continue; 1188 } 1189 1190 if (fHL.contains(prevCharX2) && (fHY.contains(prevChar) || fBA.contains(prevChar))) { 1191 setAppliedRule(pos, "LB 21a HL (HY | BA) x"); 1192 continue; 1193 } 1194 1195 if (fSY.contains(prevChar) && fHL.contains(thisChar)) { 1196 setAppliedRule(pos, "LB 21b SY x HL"); 1197 continue; 1198 } 1199 1200 if (fIN.contains(thisChar)) { 1201 setAppliedRule(pos, "LB 22"); 1202 continue; 1203 } 1204 1205 // (AL | HL) x NU 1206 // NU x (AL | HL) 1207 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && fNU.contains(thisChar)) { 1208 setAppliedRule(pos, "LB 23"); 1209 continue; 1210 } 1211 if (fNU.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) { 1212 setAppliedRule(pos, "LB 23"); 1213 continue; 1214 } 1215 1216 // Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes. 1217 // PR x (ID | EB | EM) 1218 // (ID | EB | EM) x PO 1219 if (fPR.contains(prevChar) && 1220 (fID.contains(thisChar) || fEB.contains(thisChar) || fEM.contains(thisChar))) { 1221 setAppliedRule(pos, "LB 23a"); 1222 continue; 1223 } 1224 if ((fID.contains(prevChar) || fEB.contains(prevChar) || fEM.contains(prevChar)) && 1225 fPO.contains(thisChar)) { 1226 setAppliedRule(pos, "LB 23a"); 1227 continue; 1228 } 1229 1230 // Do not break between prefix and letters or ideographs. 1231 // (PR | PO) x (AL | HL) 1232 // (AL | HL) x (PR | PO) 1233 if ((fPR.contains(prevChar) || fPO.contains(prevChar)) && 1234 (fAL.contains(thisChar) || fHL.contains(thisChar))) { 1235 setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs"); 1236 continue; 1237 } 1238 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && 1239 (fPR.contains(thisChar) || fPO.contains(thisChar))) { 1240 setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs"); 1241 continue; 1242 } 1243 1244 // appliedRule: "LB 25 numbers match"; // moved up, before LB 8a, 1245 1246 if (fJL.contains(prevChar) && (fJL.contains(thisChar) || 1247 fJV.contains(thisChar) || 1248 fH2.contains(thisChar) || 1249 fH3.contains(thisChar))) { 1250 setAppliedRule(pos, "LB 26 Do not break a Korean syllable."); 1251 continue; 1252 } 1253 1254 if ((fJV.contains(prevChar) || fH2.contains(prevChar)) && 1255 (fJV.contains(thisChar) || fJT.contains(thisChar))) { 1256 setAppliedRule(pos, "LB 26 Do not break a Korean syllable."); 1257 continue; 1258 } 1259 1260 if ((fJT.contains(prevChar) || fH3.contains(prevChar)) && 1261 fJT.contains(thisChar)) { 1262 setAppliedRule(pos, "LB 26 Do not break a Korean syllable."); 1263 continue; 1264 } 1265 1266 if ((fJL.contains(prevChar) || fJV.contains(prevChar) || 1267 fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) && 1268 fPO.contains(thisChar)) { 1269 setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID."); 1270 continue; 1271 } 1272 if (fPR.contains(prevChar) && (fJL.contains(thisChar) || fJV.contains(thisChar) || 1273 fJT.contains(thisChar) || fH2.contains(thisChar) || fH3.contains(thisChar))) { 1274 setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID."); 1275 continue; 1276 } 1277 1278 1279 1280 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && (fAL.contains(thisChar) || fHL.contains(thisChar))) { 1281 setAppliedRule(pos, "LB 28 Do not break between alphabetics"); 1282 continue; 1283 } 1284 1285 if (fIS.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) { 1286 setAppliedRule(pos, "LB 29 Do not break between numeric punctuation and alphabetics"); 1287 continue; 1288 } 1289 1290 // (AL | NU) x OP 1291 // CP x (AL | NU) 1292 if ((fAL.contains(prevChar) || fHL.contains(prevChar) || fNU.contains(prevChar)) && 1293 fOP30.contains(thisChar)) { 1294 setAppliedRule(pos, "LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation."); 1295 continue; 1296 } 1297 if (fCP30.contains(prevChar) && 1298 (fAL.contains(thisChar) || fHL.contains(thisChar) || fNU.contains(thisChar))) { 1299 setAppliedRule(pos, "LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation."); 1300 continue; 1301 } 1302 1303 // RI RI ÷ RI 1304 // RI x RI 1305 if (fRI.contains(prevCharX2) && fRI.contains(prevChar) && fRI.contains(thisChar)) { 1306 setAppliedRule(pos, "LB 30a Break between pairs of Regional Indicators."); 1307 break; 1308 } 1309 if (fRI.contains(prevChar) && fRI.contains(thisChar)) { 1310 // Two Regional Indicators have been paired. 1311 // Over-write the trailing one (thisChar) to prevent it from forming another pair with a 1312 // following RI. This is a hack. 1313 thisChar = -1; 1314 setAppliedRule(pos, "LB 30a Break between pairs of Regional Indicators."); 1315 continue; 1316 } 1317 1318 // LB30b Do not break between an emoji base (or potential emoji) and an emoji modifier. 1319 if (fEB.contains(prevChar) && fEM.contains(thisChar)) { 1320 setAppliedRule(pos, "LB 30b Emoji Base x Emoji Modifier"); 1321 continue; 1322 } 1323 1324 if (fExtPictUnassigned.contains(prevChar) && fEM.contains(thisChar)) { 1325 setAppliedRule(pos, "LB30b [\\p{Extended_Pictographic}&\\p{Cn}] × EM"); 1326 continue; 1327 } 1328 1329 // LB 31 Break everywhere else 1330 setAppliedRule(pos, "LB 31 Break everywhere else"); 1331 break; 1332 } 1333 1334 return pos; 1335 } 1336 1337 1338 1339 // Match the following regular expression in the input text. 1340 // ((PR | PO) CM*)? ((OP | HY) CM*)? (IS CM*)? NU CM* ((NU | IS | SY) CM*) * ((CL | CP) CM*)? (PR | PO) CM*)? 1341 // 0 0 1 4 4 4 5 5 7 7 7 7 9 9 9 11 11 (match states) 1342 // retVals array [0] index of the start of the match, or -1 if no match 1343 // [1] index of first char following the match. 1344 // Can not use Java regex because need supplementary character support, 1345 // and because Unicode char properties version must be the same as in 1346 // the version of ICU being tested. LBNumberCheck(StringBuffer s, int startIdx, int[] retVals)1347 private int[] LBNumberCheck(StringBuffer s, int startIdx, int[] retVals) { 1348 if (retVals == null) { 1349 retVals = new int[2]; 1350 } 1351 retVals[0] = -1; // Indicates no match. 1352 int matchState = 0; 1353 int idx = startIdx; 1354 1355 matchLoop: for (idx = startIdx; idx<s.length(); idx = moveIndex32(s, idx, 1)){ 1356 int c = UTF16.charAt(s, idx); 1357 int cLBType = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK); 1358 switch (matchState) { 1359 case 0: 1360 if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC || 1361 cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) { 1362 matchState = 1; 1363 break; 1364 } 1365 if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) { 1366 matchState = 4; 1367 break; 1368 } 1369 if (cLBType == UCharacter.LineBreak.HYPHEN) { 1370 matchState = 4; 1371 break; 1372 } 1373 if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) { 1374 matchState = 5; 1375 break; 1376 } 1377 if (cLBType == UCharacter.LineBreak.NUMERIC) { 1378 matchState = 7; 1379 break; 1380 } 1381 break matchLoop; /* No Match */ 1382 1383 case 1: 1384 if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) { 1385 matchState = 1; 1386 break; 1387 } 1388 if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) { 1389 matchState = 4; 1390 break; 1391 } 1392 if (cLBType == UCharacter.LineBreak.HYPHEN) { 1393 matchState = 4; 1394 break; 1395 } 1396 if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) { 1397 matchState = 5; 1398 break; 1399 } 1400 if (cLBType == UCharacter.LineBreak.NUMERIC) { 1401 matchState = 7; 1402 break; 1403 } 1404 break matchLoop; /* No Match */ 1405 1406 case 4: 1407 if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) { 1408 matchState = 4; 1409 break; 1410 } 1411 if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) { 1412 matchState = 5; 1413 break; 1414 } 1415 if (cLBType == UCharacter.LineBreak.NUMERIC) { 1416 matchState = 7; 1417 break; 1418 } 1419 break matchLoop; /* No Match */ 1420 1421 case 5: 1422 if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) { 1423 matchState = 5; 1424 break; 1425 } 1426 if (cLBType == UCharacter.LineBreak.NUMERIC) { 1427 matchState = 7; 1428 break; 1429 } 1430 break matchLoop; /* No Match */ 1431 1432 1433 case 7: 1434 if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) { 1435 matchState = 7; 1436 break; 1437 } 1438 if (cLBType == UCharacter.LineBreak.NUMERIC) { 1439 matchState = 7; 1440 break; 1441 } 1442 if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) { 1443 matchState = 7; 1444 break; 1445 } 1446 if (cLBType == UCharacter.LineBreak.BREAK_SYMBOLS) { 1447 matchState = 7; 1448 break; 1449 } 1450 if (cLBType == UCharacter.LineBreak.CLOSE_PUNCTUATION) { 1451 matchState = 9; 1452 break; 1453 } 1454 if (cLBType == UCharacter.LineBreak.CLOSE_PARENTHESIS) { 1455 matchState = 9; 1456 break; 1457 } 1458 if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) { 1459 matchState = 11; 1460 break; 1461 } 1462 if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) { 1463 matchState = 11; 1464 break; 1465 } 1466 1467 break matchLoop; // Match Complete. 1468 case 9: 1469 if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) { 1470 matchState = 9; 1471 break; 1472 } 1473 if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) { 1474 matchState = 11; 1475 break; 1476 } 1477 if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) { 1478 matchState = 11; 1479 break; 1480 } 1481 break matchLoop; // Match Complete. 1482 case 11: 1483 if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) { 1484 matchState = 11; 1485 break; 1486 } 1487 break matchLoop; // Match Complete. 1488 } 1489 } 1490 if (matchState >= 7) { 1491 retVals[0] = startIdx; 1492 retVals[1] = idx; 1493 } 1494 return retVals; 1495 } 1496 1497 1498 @Override charClasses()1499 List charClasses() { 1500 return fSets; 1501 } 1502 } 1503 1504 1505 /** 1506 * 1507 * Sentence Monkey Test Class 1508 * 1509 * 1510 * 1511 */ 1512 static class RBBISentenceMonkey extends RBBIMonkeyKind { 1513 StringBuffer fText; 1514 1515 UnicodeSet fSepSet; 1516 UnicodeSet fFormatSet; 1517 UnicodeSet fSpSet; 1518 UnicodeSet fLowerSet; 1519 UnicodeSet fUpperSet; 1520 UnicodeSet fOLetterSet; 1521 UnicodeSet fNumericSet; 1522 UnicodeSet fATermSet; 1523 UnicodeSet fSContinueSet; 1524 UnicodeSet fSTermSet; 1525 UnicodeSet fCloseSet; 1526 UnicodeSet fOtherSet; 1527 UnicodeSet fExtendSet; 1528 RBBISentenceMonkey()1529 RBBISentenceMonkey() { 1530 fCharProperty = UProperty.SENTENCE_BREAK; 1531 1532 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator 1533 // set and made into character classes of their own. For the monkey impl, 1534 // they remain in SEP, since Sep always appears with CR and LF in the rules. 1535 fSepSet = new UnicodeSet("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"); 1536 fFormatSet = new UnicodeSet("[\\p{Sentence_Break = Format}]"); 1537 fSpSet = new UnicodeSet("[\\p{Sentence_Break = Sp}]"); 1538 fLowerSet = new UnicodeSet("[\\p{Sentence_Break = Lower}]"); 1539 fUpperSet = new UnicodeSet("[\\p{Sentence_Break = Upper}]"); 1540 fOLetterSet = new UnicodeSet("[\\p{Sentence_Break = OLetter}]"); 1541 fNumericSet = new UnicodeSet("[\\p{Sentence_Break = Numeric}]"); 1542 fATermSet = new UnicodeSet("[\\p{Sentence_Break = ATerm}]"); 1543 fSContinueSet = new UnicodeSet("[\\p{Sentence_Break = SContinue}]"); 1544 fSTermSet = new UnicodeSet("[\\p{Sentence_Break = STerm}]"); 1545 fCloseSet = new UnicodeSet("[\\p{Sentence_Break = Close}]"); 1546 fExtendSet = new UnicodeSet("[\\p{Sentence_Break = Extend}]"); 1547 fOtherSet = new UnicodeSet(); 1548 1549 1550 fOtherSet.complement(); 1551 fOtherSet.removeAll(fSepSet); 1552 fOtherSet.removeAll(fFormatSet); 1553 fOtherSet.removeAll(fSpSet); 1554 fOtherSet.removeAll(fLowerSet); 1555 fOtherSet.removeAll(fUpperSet); 1556 fOtherSet.removeAll(fOLetterSet); 1557 fOtherSet.removeAll(fNumericSet); 1558 fOtherSet.removeAll(fATermSet); 1559 fOtherSet.removeAll(fSContinueSet); 1560 fOtherSet.removeAll(fSTermSet); 1561 fOtherSet.removeAll(fCloseSet); 1562 fOtherSet.removeAll(fExtendSet); 1563 1564 fSets.add(fSepSet); fClassNames.add("Sep"); 1565 fSets.add(fFormatSet); fClassNames.add("Format"); 1566 1567 fSets.add(fSpSet); fClassNames.add("Sp"); 1568 fSets.add(fLowerSet); fClassNames.add("Lower"); 1569 fSets.add(fUpperSet); fClassNames.add("Upper"); 1570 fSets.add(fOLetterSet); fClassNames.add("OLetter"); 1571 fSets.add(fNumericSet); fClassNames.add("Numeric"); 1572 fSets.add(fATermSet); fClassNames.add("ATerm"); 1573 fSets.add(fSContinueSet); fClassNames.add("SContinue"); 1574 fSets.add(fSTermSet); fClassNames.add("STerm"); 1575 fSets.add(fCloseSet); fClassNames.add("Close"); 1576 fSets.add(fOtherSet); fClassNames.add("Other"); 1577 fSets.add(fExtendSet); fClassNames.add("Extend"); 1578 } 1579 1580 1581 @Override charClasses()1582 List charClasses() { 1583 return fSets; 1584 } 1585 1586 @Override setText(StringBuffer s)1587 void setText(StringBuffer s) { 1588 fText = s; 1589 prepareAppliedRules(s.length()); 1590 } 1591 1592 1593 // moveBack() Find the "significant" code point preceding the index i. 1594 // Skips over ($Extend | $Format)* 1595 // moveBack(int i)1596 private int moveBack(int i) { 1597 1598 if (i <= 0) { 1599 return -1; 1600 } 1601 1602 int c; 1603 int j = i; 1604 do { 1605 j = moveIndex32(fText, j, -1); 1606 c = UTF16.charAt(fText, j); 1607 } 1608 while (j>0 &&(fFormatSet.contains(c) || fExtendSet.contains(c))); 1609 return j; 1610 } 1611 1612 moveForward(int i)1613 int moveForward(int i) { 1614 if (i>=fText.length()) { 1615 return fText.length(); 1616 } 1617 int c; 1618 int j = i; 1619 do { 1620 j = moveIndex32(fText, j, 1); 1621 c = cAt(j); 1622 } 1623 while (c>=0 && (fFormatSet.contains(c) || fExtendSet.contains(c))); 1624 return j; 1625 1626 } 1627 cAt(int pos)1628 int cAt(int pos) { 1629 if (pos<0 || pos>=fText.length()) { 1630 return -1; 1631 } 1632 return UTF16.charAt(fText, pos); 1633 } 1634 1635 @Override next(int prevPos)1636 int next(int prevPos) { 1637 int /*p0,*/ p1, p2, p3; // Indices of the significant code points around the 1638 // break position being tested. The candidate break 1639 // location is before p2. 1640 int breakPos = -1; 1641 1642 int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 1643 int c; 1644 1645 // Prev break at end of string. return DONE. 1646 if (prevPos >= fText.length()) { 1647 return -1; 1648 } 1649 /*p0 =*/ p1 = p2 = p3 = prevPos; 1650 c3 = UTF16.charAt(fText, prevPos); 1651 c0 = c1 = c2 = 0; 1652 1653 // Loop runs once per "significant" character position in the input text. 1654 for (;;) { 1655 // Move all of the positions forward in the input string. 1656 /*p0 = p1;*/ c0 = c1; 1657 p1 = p2; c1 = c2; 1658 p2 = p3; c2 = c3; 1659 1660 // Advance p3 by X(Extend | Format)* Rule 4 1661 p3 = moveForward(p3); 1662 c3 = cAt(p3); 1663 1664 if (c1==0x0d && c2==0x0a && p2==(p1+1)) { 1665 setAppliedRule(p2, "SB3 CR x LF"); 1666 continue; 1667 } 1668 1669 if (fSepSet.contains(c1)) { 1670 p2 = p1+1; // Separators don't combine with Extend or Format 1671 setAppliedRule(p2, "SB4 Sep <break>"); 1672 break; 1673 } 1674 1675 if (p2 >= fText.length()) { 1676 // Reached end of string. Always a break position. 1677 setAppliedRule(p2, "SB4 Sep <break>"); 1678 break; 1679 } 1680 1681 if (p2 == prevPos) { 1682 // Still warming up the loop. (won't work with zero length strings, but we don't care) 1683 setAppliedRule(p2, "SB4 Sep <break>"); 1684 continue; 1685 } 1686 1687 if (fATermSet.contains(c1) && fNumericSet.contains(c2)) { 1688 setAppliedRule(p2, "SB6 ATerm x Numeric"); 1689 continue; 1690 } 1691 1692 if ((fUpperSet.contains(c0) || fLowerSet.contains(c0)) && 1693 fATermSet.contains(c1) && fUpperSet.contains(c2)) { 1694 setAppliedRule(p2, "SB7 (Upper | Lower) ATerm x Uppper"); 1695 continue; 1696 } 1697 1698 // Note: Sterm | ATerm are added to the negated part of the expression by a 1699 // note to the Unicode 5.0 documents. 1700 int p8 = p1; 1701 while (p8>0 && fSpSet.contains(cAt(p8))) { 1702 p8 = moveBack(p8); 1703 } 1704 while (p8>0 && fCloseSet.contains(cAt(p8))) { 1705 p8 = moveBack(p8); 1706 } 1707 if (fATermSet.contains(cAt(p8))) { 1708 p8=p2; 1709 for (;;) { 1710 c = cAt(p8); 1711 if (c==-1 || fOLetterSet.contains(c) || fUpperSet.contains(c) || 1712 fLowerSet.contains(c) || fSepSet.contains(c) || 1713 fATermSet.contains(c) || fSTermSet.contains(c)) 1714 { 1715 setAppliedRule(p2, "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep))* Lower"); 1716 break; 1717 } 1718 p8 = moveForward(p8); 1719 } 1720 if (p8<fText.length() && fLowerSet.contains(cAt(p8))) { 1721 setAppliedRule(p2, "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep))* Lower"); 1722 continue; 1723 } 1724 } 1725 1726 if (fSContinueSet.contains(c2) || fSTermSet.contains(c2) || fATermSet.contains(c2)) { 1727 p8 = p1; 1728 while (setContains(fSpSet, cAt(p8))) { 1729 p8 = moveBack(p8); 1730 } 1731 while (setContains(fCloseSet, cAt(p8))) { 1732 p8 = moveBack(p8); 1733 } 1734 c = cAt(p8); 1735 if (setContains(fSTermSet, c) || setContains(fATermSet, c)) { 1736 setAppliedRule(p2, "SB8a (STerm | ATerm) Close* Sp* x (SContinue | Sterm | ATerm)"); 1737 continue; 1738 } 1739 } 1740 1741 1742 int p9 = p1; 1743 while (p9>0 && fCloseSet.contains(cAt(p9))) { 1744 p9 = moveBack(p9); 1745 } 1746 c = cAt(p9); 1747 if ((fSTermSet.contains(c) || fATermSet.contains(c))) { 1748 if (fCloseSet.contains(c2) || fSpSet.contains(c2) || fSepSet.contains(c2)) { 1749 setAppliedRule(p2, "SB9 (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)"); 1750 continue; 1751 } 1752 } 1753 1754 int p10 = p1; 1755 while (p10>0 && fSpSet.contains(cAt(p10))) { 1756 p10 = moveBack(p10); 1757 } 1758 while (p10>0 && fCloseSet.contains(cAt(p10))) { 1759 p10 = moveBack(p10); 1760 } 1761 if (fSTermSet.contains(cAt(p10)) || fATermSet.contains(cAt(p10))) { 1762 if (fSpSet.contains(c2) || fSepSet.contains(c2)) { 1763 setAppliedRule(p2, "SB10 (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)"); 1764 continue; 1765 } 1766 } 1767 1768 int p11 = p1; 1769 if (p11>0 && fSepSet.contains(cAt(p11))) { 1770 p11 = moveBack(p11); 1771 } 1772 while (p11>0 && fSpSet.contains(cAt(p11))) { 1773 p11 = moveBack(p11); 1774 } 1775 while (p11>0 && fCloseSet.contains(cAt(p11))) { 1776 p11 = moveBack(p11); 1777 } 1778 if (fSTermSet.contains(cAt(p11)) || fATermSet.contains(cAt(p11))) { 1779 setAppliedRule(p2, "SB11 (STerm | ATerm) Close* Sp* <break>"); 1780 break; 1781 } 1782 1783 setAppliedRule(p2, "SB12 Any x Any"); 1784 continue; 1785 } 1786 breakPos = p2; 1787 return breakPos; 1788 } 1789 } 1790 1791 1792 /** 1793 * Move an index into a string by n code points. 1794 * Similar to UTF16.moveCodePointOffset, but without the exceptions, which were 1795 * complicating usage. 1796 * @param s a Text string 1797 * @param pos The starting code unit index into the text string 1798 * @param amt The amount to adjust the string by. 1799 * @return The adjusted code unit index, pinned to the string's length, or 1800 * unchanged if input index was outside of the string. 1801 */ moveIndex32(StringBuffer s, int pos, int amt)1802 static int moveIndex32(StringBuffer s, int pos, int amt) { 1803 int i; 1804 char c; 1805 if (amt>0) { 1806 for (i=0; i<amt; i++) { 1807 if (pos >= s.length()) { 1808 return s.length(); 1809 } 1810 c = s.charAt(pos); 1811 pos++; 1812 if (UTF16.isLeadSurrogate(c) && pos < s.length()) { 1813 c = s.charAt(pos); 1814 if (UTF16.isTrailSurrogate(c)) { 1815 pos++; 1816 } 1817 } 1818 } 1819 } else { 1820 for (i=0; i>amt; i--) { 1821 if (pos <= 0) { 1822 return 0; 1823 } 1824 pos--; 1825 c = s.charAt(pos); 1826 if (UTF16.isTrailSurrogate(c) && pos >= 0) { 1827 c = s.charAt(pos); 1828 if (UTF16.isLeadSurrogate(c)) { 1829 pos--; 1830 } 1831 } 1832 } 1833 } 1834 return pos; 1835 } 1836 1837 /** 1838 * No-exceptions form of UnicodeSet.contains(c). 1839 * Simplifies loops that terminate with an end-of-input character value. 1840 * @param s A unicode set 1841 * @param c A code point value 1842 * @return true if the set contains c. 1843 */ setContains(UnicodeSet s, int c)1844 static boolean setContains(UnicodeSet s, int c) { 1845 if (c<0 || c>UTF16.CODEPOINT_MAX_VALUE ) { 1846 return false; 1847 } 1848 return s.contains(c); 1849 } 1850 1851 1852 /** 1853 * return the index of the next code point in the input text. 1854 * @param i the preceding index 1855 */ nextCP(StringBuffer s, int i)1856 static int nextCP(StringBuffer s, int i) { 1857 if (i == -1) { 1858 // End of Input indication. Continue to return end value. 1859 return -1; 1860 } 1861 int retVal = i + 1; 1862 if (retVal > s.length()) { 1863 return -1; 1864 } 1865 int c = UTF16.charAt(s, i); 1866 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && UTF16.isLeadSurrogate(s.charAt(i))) { 1867 retVal++; 1868 } 1869 return retVal; 1870 } 1871 1872 1873 /** 1874 * random number generator. Not using Java's built-in Randoms for two reasons: 1875 * 1. Using this code allows obtaining the same sequences as those from the ICU4C monkey test. 1876 * 2. We need to get and restore the seed from values occurring in the middle 1877 * of a long sequence, to more easily reproduce failing cases. 1878 */ 1879 private static int m_seed = 1; m_rand()1880 private static int m_rand() 1881 { 1882 m_seed = m_seed * 1103515245 + 12345; 1883 return (m_seed >>> 16) % 32768; 1884 } 1885 1886 // Helper function for formatting error output. 1887 // Append a string into a fixed-size field in a StringBuffer. 1888 // Blank-pad the string if it is shorter than the field. 1889 // Truncate the source string if it is too long. 1890 // appendToBuf(StringBuffer dest, String src, int fieldLen)1891 private static void appendToBuf(StringBuffer dest, String src, int fieldLen) { 1892 int appendLen = src.length(); 1893 if (appendLen >= fieldLen) { 1894 dest.append(src.substring(0, fieldLen)); 1895 } else { 1896 dest.append(src); 1897 while (appendLen < fieldLen) { 1898 dest.append(' '); 1899 appendLen++; 1900 } 1901 } 1902 } 1903 1904 // Helper function for formatting error output. 1905 // Display a code point in "\\uxxxx" or "\Uxxxxxxxx" format 1906 @SuppressWarnings("unused") appendCharToBuf(StringBuffer dest, int c, int fieldLen)1907 private static void appendCharToBuf(StringBuffer dest, int c, int fieldLen) { 1908 String hexChars = "0123456789abcdef"; 1909 if (c < 0x10000) { 1910 dest.append("\\u"); 1911 for (int bn=12; bn>=0; bn-=4) { 1912 dest.append(hexChars.charAt(((c)>>bn)&0xf)); 1913 } 1914 appendToBuf(dest, " ", fieldLen-6); 1915 } else { 1916 dest.append("\\U"); 1917 for (int bn=28; bn>=0; bn-=4) { 1918 dest.append(hexChars.charAt(((c)>>bn)&0xf)); 1919 } 1920 appendToBuf(dest, " ", fieldLen-10); 1921 1922 } 1923 } 1924 1925 /** 1926 * Run a RBBI monkey test. Common routine, for all break iterator types. 1927 * Parameters: 1928 * bi - the break iterator to use 1929 * mk - MonkeyKind, abstraction for obtaining expected results 1930 * name - Name of test (char, word, etc.) for use in error messages 1931 * seed - Seed for starting random number generator (parameter from user) 1932 * numIterations 1933 */ RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int numIterations)1934 void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int numIterations) { 1935 int TESTSTRINGLEN = 500; 1936 StringBuffer testText = new StringBuffer(); 1937 int numCharClasses; 1938 List chClasses; 1939 @SuppressWarnings("unused") 1940 int expectedCount = 0; 1941 boolean[] expectedBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1942 boolean[] forwardBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1943 boolean[] reverseBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1944 boolean[] isBoundaryBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1945 boolean[] followingBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1946 boolean[] precedingBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1947 int i; 1948 int loopCount = 0; 1949 boolean printTestData = false; 1950 boolean printBreaksFromBI = false; 1951 1952 m_seed = seed; 1953 1954 numCharClasses = mk.charClasses().size(); 1955 chClasses = mk.charClasses(); 1956 1957 // Verify that the character classes all have at least one member. 1958 for (i=0; i<numCharClasses; i++) { 1959 UnicodeSet s = (UnicodeSet)chClasses.get(i); 1960 if (s == null || s.size() == 0) { 1961 errln("Character Class " + i + " is null or of zero size."); 1962 return; 1963 } 1964 } 1965 1966 //-------------------------------------------------------------------------------------------- 1967 // 1968 // Debugging settings. Comment out everything in the following block for normal operation 1969 // 1970 //-------------------------------------------------------------------------------------------- 1971 // numIterations = -1; 1972 // numIterations = 10000; // Same as exhaustive. 1973 // RuleBasedBreakIterator_New.fTrace = true; 1974 // m_seed = 859056465; 1975 // TESTSTRINGLEN = 50; 1976 // printTestData = true; 1977 // printBreaksFromBI = true; 1978 // ((RuleBasedBreakIterator_New)bi).dump(); 1979 1980 //-------------------------------------------------------------------------------------------- 1981 // 1982 // End of Debugging settings. 1983 // 1984 //-------------------------------------------------------------------------------------------- 1985 1986 // For minimizing width of class name output. 1987 int classNameSize = mk.maxClassNameSize(); 1988 1989 int dotsOnLine = 0; 1990 while (loopCount < numIterations || numIterations == -1) { 1991 if (numIterations == -1 && loopCount % 10 == 0) { 1992 // If test is running in an infinite loop, display a periodic tic so 1993 // we can tell that it is making progress. 1994 System.out.print("."); 1995 if (dotsOnLine++ >= 80){ 1996 System.out.println(); 1997 dotsOnLine = 0; 1998 } 1999 } 2000 // Save current random number seed, so that we can recreate the random numbers 2001 // for this loop iteration in event of an error. 2002 seed = m_seed; 2003 2004 testText.setLength(0); 2005 // Populate a test string with data. 2006 if (printTestData) { 2007 System.out.println("Test Data string ..."); 2008 } 2009 for (i=0; i<TESTSTRINGLEN; i++) { 2010 int aClassNum = m_rand() % numCharClasses; 2011 UnicodeSet classSet = (UnicodeSet)chClasses.get(aClassNum); 2012 int charIdx = m_rand() % classSet.size(); 2013 int c = classSet.charAt(charIdx); 2014 if (c < 0) { // TODO: deal with sets containing strings. 2015 errln("c < 0"); 2016 } 2017 // Do not assemble a supplementary character from randomly generated separate surrogates. 2018 // (It could be a dictionary character) 2019 if (c < 0x10000 && Character.isLowSurrogate((char)c) && testText.length() > 0 && 2020 Character.isHighSurrogate(testText.charAt(testText.length()-1))) { 2021 continue; 2022 } 2023 testText.appendCodePoint(c); 2024 if (printTestData) { 2025 System.out.print(Integer.toHexString(c) + " "); 2026 } 2027 } 2028 if (printTestData) { 2029 System.out.println(); 2030 } 2031 2032 Arrays.fill(expectedBreaks, false); 2033 Arrays.fill(forwardBreaks, false); 2034 Arrays.fill(reverseBreaks, false); 2035 Arrays.fill(isBoundaryBreaks, false); 2036 Arrays.fill(followingBreaks, false); 2037 Arrays.fill(precedingBreaks, false); 2038 2039 // Calculate the expected results for this test string and reset applied rules. 2040 mk.setText(testText); 2041 expectedCount = 0; 2042 expectedBreaks[0] = true; 2043 int breakPos = 0; 2044 int lastBreakPos = -1; 2045 for (;;) { 2046 lastBreakPos = breakPos; 2047 breakPos = mk.next(breakPos); 2048 if (breakPos == -1) { 2049 break; 2050 } 2051 if (breakPos > testText.length()) { 2052 errln("breakPos > testText.length()"); 2053 } 2054 if (lastBreakPos >= breakPos) { 2055 errln("Next() not increasing."); 2056 // break; 2057 } 2058 expectedBreaks[breakPos] = true; 2059 } 2060 2061 // Find the break positions using forward iteration 2062 if (printBreaksFromBI) { 2063 System.out.println("Breaks from BI..."); 2064 } 2065 bi.setText(testText.toString()); 2066 for (i=bi.first(); i != BreakIterator.DONE; i=bi.next()) { 2067 if (i < 0 || i > testText.length()) { 2068 errln(name + " break monkey test: Out of range value returned by breakIterator::next()"); 2069 break; 2070 } 2071 if (printBreaksFromBI) { 2072 System.out.print(Integer.toHexString(i) + " "); 2073 } 2074 forwardBreaks[i] = true; 2075 } 2076 if (printBreaksFromBI) { 2077 System.out.println(); 2078 } 2079 2080 // Find the break positions using reverse iteration 2081 for (i=bi.last(); i != BreakIterator.DONE; i=bi.previous()) { 2082 if (i < 0 || i > testText.length()) { 2083 errln(name + " break monkey test: Out of range value returned by breakIterator.next()" + name); 2084 break; 2085 } 2086 reverseBreaks[i] = true; 2087 } 2088 2089 // Find the break positions using isBoundary() tests. 2090 for (i=0; i<=testText.length(); i++) { 2091 isBoundaryBreaks[i] = bi.isBoundary(i); 2092 } 2093 2094 // Find the break positions using the following() function. 2095 lastBreakPos = 0; 2096 followingBreaks[0] = true; 2097 for (i=0; i<testText.length(); i++) { 2098 breakPos = bi.following(i); 2099 if (breakPos <= i || 2100 breakPos < lastBreakPos || 2101 breakPos > testText.length() || 2102 breakPos > lastBreakPos && lastBreakPos > i ) { 2103 errln(name + " break monkey test: " + 2104 "Out of range value returned by BreakIterator::following().\n" + 2105 "index=" + i + "following returned=" + breakPos + 2106 "lastBreak=" + lastBreakPos); 2107 precedingBreaks[i] = !expectedBreaks[i]; // Forces an error. 2108 } else { 2109 followingBreaks[breakPos] = true; 2110 lastBreakPos = breakPos; 2111 } 2112 } 2113 2114 // Find the break positions using the preceding() function. 2115 lastBreakPos = testText.length(); 2116 precedingBreaks[testText.length()] = true; 2117 for (i=testText.length(); i>0; i--) { 2118 breakPos = bi.preceding(i); 2119 if (breakPos >= i || 2120 breakPos > lastBreakPos || 2121 breakPos < 0 || 2122 breakPos < lastBreakPos && lastBreakPos < i ) { 2123 errln(name + " break monkey test: " + 2124 "Out of range value returned by BreakIterator::preceding().\n" + 2125 "index=" + i + "preceding returned=" + breakPos + 2126 "lastBreak=" + lastBreakPos); 2127 precedingBreaks[i] = !expectedBreaks[i]; // Forces an error. 2128 } else { 2129 precedingBreaks[breakPos] = true; 2130 lastBreakPos = breakPos; 2131 } 2132 } 2133 2134 2135 2136 // Compare the expected and actual results. 2137 for (i=0; i<=testText.length(); i++) { 2138 String errorType = null; 2139 boolean[] currentBreakData = null; 2140 if (forwardBreaks[i] != expectedBreaks[i]) { 2141 errorType = "next()"; 2142 currentBreakData = forwardBreaks; 2143 } else if (reverseBreaks[i] != forwardBreaks[i]) { 2144 errorType = "previous()"; 2145 currentBreakData = reverseBreaks; 2146 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) { 2147 errorType = "isBoundary()"; 2148 currentBreakData = isBoundaryBreaks; 2149 } else if (followingBreaks[i] != expectedBreaks[i]) { 2150 errorType = "following()"; 2151 currentBreakData = followingBreaks; 2152 } else if (precedingBreaks[i] != expectedBreaks[i]) { 2153 errorType = "preceding()"; 2154 currentBreakData = precedingBreaks; 2155 } 2156 2157 if (errorType != null) { 2158 // Format a range of the test text that includes the failure as 2159 // a data item that can be included in the rbbi test data file. 2160 2161 // Start of the range is the last point where expected and actual results 2162 // both agreed that there was a break position. 2163 int startContext = i; 2164 int count = 0; 2165 for (;;) { 2166 if (startContext==0) { break; } 2167 startContext --; 2168 if (expectedBreaks[startContext]) { 2169 if (count == 2) break; 2170 count ++; 2171 } 2172 } 2173 2174 // End of range is two expected breaks past the start position. 2175 int endContext = i + 1; 2176 int ci; 2177 for (ci=0; ci<2; ci++) { // Number of items to include in error text. 2178 for (;;) { 2179 if (endContext >= testText.length()) {break;} 2180 if (expectedBreaks[endContext-1]) { 2181 if (count == 0) break; 2182 count --; 2183 } 2184 endContext ++; 2185 } 2186 } 2187 2188 // Formatting of each line includes: 2189 // character code 2190 // reference break: '|' -> a break, '.' -> no break 2191 // actual break: '|' -> a break, '.' -> no break 2192 // (name of character clase) 2193 // Unicode name of character 2194 // '--→' indicates location of the difference. 2195 2196 StringBuilder buffer = new StringBuilder(); 2197 buffer.append("\n") 2198 .append((expectedBreaks[i] ? "Break expected but not found." : "Break found but not expected.")) 2199 .append( 2200 String.format(" at index %d. Parameters to reproduce: @\"type=%s seed=%d loop=1\"\n", 2201 i, name, seed)); 2202 2203 int c; // Char from test data 2204 for (ci = startContext; ci <= endContext && ci != -1; ci = nextCP(testText, ci)) { 2205 2206 c = testText.codePointAt(ci); 2207 buffer.append((ci == i) ? " --→" : " ") 2208 .append(String.format(" %3d : ", ci)) 2209 .append(!expectedBreaks[ci] ? " . " : " | ") // Reference break 2210 .append(!currentBreakData[ci] ? " . " : " | "); // Actual break 2211 2212 // BMP or SMP character in hex 2213 if (c >= 0x10000) { 2214 buffer.append("\\U").append(String.format("%08x", c)); 2215 } else { 2216 buffer.append(" \\u").append(String.format("%04x", c)); 2217 } 2218 2219 buffer.append( 2220 String.format(String.format(" %%-%ds", classNameSize), 2221 mk.classNameFromCodepoint(c))) 2222 .append(String.format(" %-40s", mk.getAppliedRule(ci))) 2223 .append(String.format(" %-40s\n", UCharacter.getExtendedName(c))); 2224 2225 if (ci >= endContext) { break; } 2226 } 2227 errln(buffer.toString()); 2228 2229 break; 2230 } 2231 } 2232 2233 loopCount++; 2234 } 2235 } 2236 2237 // Test parameters are passed on the command line, or 2238 // via the Eclipse Run Configuration settings, arguments tab, VM parameters. 2239 // For example, 2240 // -ea -Dseed=554654 -Dloop=1 2241 2242 @Test TestCharMonkey()2243 public void TestCharMonkey() { 2244 int loopCount = getIntProperty("loop", isQuick() ? 500 : 10000); 2245 int seed = getIntProperty("seed", 1); 2246 2247 RBBICharMonkey m = new RBBICharMonkey(); 2248 BreakIterator bi = BreakIterator.getCharacterInstance(Locale.US); 2249 RunMonkey(bi, m, "char", seed, loopCount); 2250 } 2251 2252 @Test TestWordMonkey()2253 public void TestWordMonkey() { 2254 int loopCount = getIntProperty("loop", isQuick() ? 500 : 10000); 2255 int seed = getIntProperty("seed", 1); 2256 2257 logln("Word Break Monkey Test"); 2258 RBBIWordMonkey m = new RBBIWordMonkey(); 2259 BreakIterator bi = BreakIterator.getWordInstance(Locale.US); 2260 RunMonkey(bi, m, "word", seed, loopCount); 2261 } 2262 2263 @Test TestLineMonkey()2264 public void TestLineMonkey() { 2265 int loopCount = getIntProperty("loop", isQuick() ? 500 : 10000); 2266 int seed = getIntProperty("seed", 1); 2267 2268 logln("Line Break Monkey Test"); 2269 RBBILineMonkey m = new RBBILineMonkey(); 2270 BreakIterator bi = BreakIterator.getLineInstance(Locale.US); 2271 try { 2272 RunMonkey(bi, m, "line", seed, loopCount); 2273 } catch(IllegalArgumentException e) { 2274 if (e.getMessage().equals("Invalid code point U+-000001")) { 2275 // Looks like you used class UnicodeSet instead of class XUnicodeSet 2276 // (note the leading 'X'). 2277 // See the comment before the definition of class XUnicodeSet. 2278 errln("Probable program error: use XUnicodeSet in RBBILineMonkey code"); 2279 } else { 2280 throw e; 2281 } 2282 } 2283 } 2284 2285 @Test TestSentMonkey()2286 public void TestSentMonkey() { 2287 int loopCount = getIntProperty("loop", isQuick() ? 500 : 3000); 2288 int seed = getIntProperty("seed", 1); 2289 2290 logln("Sentence Break Monkey Test"); 2291 RBBISentenceMonkey m = new RBBISentenceMonkey(); 2292 BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US); 2293 RunMonkey(bi, m, "sent", seed, loopCount); 2294 } 2295 // 2296 // Round-trip monkey tests. 2297 // Verify that break iterators created from the rule source from the default 2298 // break iterators still pass the monkey test for the iterator type. 2299 // 2300 // This is a major test for the Rule Compiler. The default break iterators are built 2301 // from pre-compiled binary rule data that was created using ICU4C; these 2302 // round-trip rule recompile tests verify that the Java rule compiler can 2303 // rebuild break iterators from the original source rules. 2304 // 2305 @Test TestRTCharMonkey()2306 public void TestRTCharMonkey() { 2307 int loopCount = getIntProperty("loop", isQuick() ? 200 : 2000); 2308 int seed = getIntProperty("seed", 1); 2309 2310 RBBICharMonkey m = new RBBICharMonkey(); 2311 BreakIterator bi = BreakIterator.getCharacterInstance(Locale.US); 2312 String rules = bi.toString(); 2313 BreakIterator rtbi = new RuleBasedBreakIterator(rules); 2314 RunMonkey(rtbi, m, "char", seed, loopCount); 2315 } 2316 2317 @Test TestRTWordMonkey()2318 public void TestRTWordMonkey() { 2319 int loopCount = getIntProperty("loop", isQuick() ? 200 : 2000); 2320 int seed = getIntProperty("seed", 1); 2321 2322 logln("Word Break Monkey Test"); 2323 RBBIWordMonkey m = new RBBIWordMonkey(); 2324 BreakIterator bi = BreakIterator.getWordInstance(Locale.US); 2325 String rules = bi.toString(); 2326 BreakIterator rtbi = new RuleBasedBreakIterator(rules); 2327 RunMonkey(rtbi, m, "word", seed, loopCount); 2328 } 2329 2330 @Test TestRTLineMonkey()2331 public void TestRTLineMonkey() { 2332 int loopCount = getIntProperty("loop", isQuick() ? 200 : 2000); 2333 int seed = getIntProperty("seed", 1); 2334 2335 logln("Line Break Monkey Test"); 2336 RBBILineMonkey m = new RBBILineMonkey(); 2337 BreakIterator bi = BreakIterator.getLineInstance(Locale.US); 2338 String rules = bi.toString(); 2339 BreakIterator rtbi = new RuleBasedBreakIterator(rules); 2340 try { 2341 RunMonkey(rtbi, m, "line", seed, loopCount); 2342 } catch(IllegalArgumentException e) { 2343 if (e.getMessage().equals("Invalid code point U+-000001")) { 2344 // Looks like you used class UnicodeSet instead of class XUnicodeSet 2345 // (note the leading 'X'). 2346 // See the comment before the definition of class XUnicodeSet. 2347 errln("Probable program error: use XUnicodeSet in RBBILineMonkey code"); 2348 } else { 2349 throw e; 2350 } 2351 } 2352 } 2353 2354 @Test TestRTSentMonkey()2355 public void TestRTSentMonkey() { 2356 int loopCount = getIntProperty("loop", isQuick() ? 200 : 1000); 2357 int seed = getIntProperty("seed", 1); 2358 2359 logln("Sentence Break Monkey Test"); 2360 RBBISentenceMonkey m = new RBBISentenceMonkey(); 2361 BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US); 2362 String rules = bi.toString(); 2363 BreakIterator rtbi = new RuleBasedBreakIterator(rules); 2364 RunMonkey(rtbi, m, "sent", seed, loopCount); 2365 } 2366 } 2367 2368