• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  *******************************************************************************
5  * Copyright (C) 2003-2016 International Business Machines Corporation and
6  * others. All Rights Reserved.
7  *******************************************************************************
8  */
9 package com.ibm.icu.dev.test.rbbi;
10 
11 
12 // Monkey testing of RuleBasedBreakIterator.
13 //    The old, original monkey test. TODO: remove
14 //    The new monkey test is class RBBIMonkeyTest.
15 
16 import java.util.ArrayList;
17 import java.util.Arrays;
18 import java.util.List;
19 import java.util.Locale;
20 
21 import org.junit.Test;
22 import org.junit.runner.RunWith;
23 import org.junit.runners.JUnit4;
24 
25 import com.ibm.icu.dev.test.TestFmwk;
26 import com.ibm.icu.lang.UCharacter;
27 import com.ibm.icu.lang.UProperty;
28 import com.ibm.icu.text.BreakIterator;
29 import com.ibm.icu.text.RuleBasedBreakIterator;
30 import com.ibm.icu.text.UTF16;
31 import com.ibm.icu.text.UnicodeSet;
32 
33 
34 /**
35  * Monkey tests for RBBI.  These tests have independent implementations of
36  * the Unicode TR boundary rules, and compare results between these and ICU's
37  * implementation, using random data.
38  *
39  * Tests cover Grapheme Cluster (char), Word and Line breaks
40  *
41  * Ported from ICU4C, original code in file source/test/intltest/rbbitst.cpp
42  *
43  */
44 @RunWith(JUnit4.class)
45 public class RBBITestMonkey extends TestFmwk {
46     //
47     //     class RBBIMonkeyKind
48     //
49     //        Monkey Test for Break Iteration
50     //        Abstract interface class.   Concrete derived classes independently
51     //        implement the break rules for different iterator types.
52     //
53     //        The Monkey Test itself uses doesn't know which type of break iterator it is
54     //        testing, but works purely in terms of the interface defined here.
55     //
56     abstract static class RBBIMonkeyKind {
RBBIMonkeyKind()57         RBBIMonkeyKind() {
58             fSets = new  ArrayList();
59             fClassNames = new ArrayList();
60             fAppliedRules = new ArrayList();
61         }
62 
63         // Return a List of UnicodeSets, representing the character classes used
64         //   for this type of iterator.
charClasses()65         abstract  List  charClasses();
66 
67         // Set the test text on which subsequent calls to next() will operate
setText(StringBuffer text)68         abstract  void   setText(StringBuffer text);
69 
70         // Find the next break position, starting from the specified position.
71         // Return -1 after reaching end of string.
next(int i)72         abstract   int   next(int i);
73 
74         // Name of each character class, parallel with charClasses. Used for debugging output
75         // of characters.
characterClassNames()76         List<String> characterClassNames() {
77             return fClassNames;
78         }
79 
setAppliedRule(int position, String value)80         void setAppliedRule(int position, String value) {
81             fAppliedRules.set(position, value);
82         }
83 
getAppliedRule(int position)84         String getAppliedRule(int position) {
85             return fAppliedRules.get(position);
86         }
87 
classNameFromCodepoint(int c)88         String classNameFromCodepoint(int c) {
89             // Simply iterate through fSets to find character's class
90             for (int aClassNum = 0; aClassNum < charClasses().size(); aClassNum++) {
91                 UnicodeSet classSet = (UnicodeSet)charClasses().get(aClassNum);
92                 if (classSet.contains(c)) {
93                     return fClassNames.get(aClassNum);
94                 }
95             }
96             return "bad class name";
97         }
98 
maxClassNameSize()99         int maxClassNameSize() {
100             int maxSize = 0;
101             for (int aClassNum = 0; aClassNum < charClasses().size(); aClassNum++) {
102                 if (fClassNames.get(aClassNum).length() > maxSize) {
103                     maxSize = fClassNames.get(aClassNum).length();
104                 }
105             }
106             return maxSize;
107         }
108 
109         // Clear `appliedRules` and fill it with empty strings in the size of test text.
prepareAppliedRules(int size)110         void prepareAppliedRules(int size) {
111             // Remove all the information in the `appliedRules`.
112             fAppliedRules.clear();
113             fAppliedRules.ensureCapacity(size + 1);
114             while (fAppliedRules.size() < size + 1) {
115                 fAppliedRules.add("");
116             }
117         }
118 
119         // A Character Property, one of the constants defined in class UProperty.
120         //   The value of this property will be displayed for the characters
121         //    near any test failure.
122         int   fCharProperty;
123 
124         List fSets;
125         ArrayList<String> fClassNames;
126         ArrayList<String> fAppliedRules;
127     }
128 
129     /**
130      * Monkey test subclass for testing Character (Grapheme Cluster) boundaries.
131      * Note: As of Unicode 6.1, fPrependSet is empty, so don't add it to fSets
132      */
133     static class RBBICharMonkey extends RBBIMonkeyKind {
134         UnicodeSet                fCRLFSet;
135         UnicodeSet                fControlSet;
136         UnicodeSet                fExtendSet;
137         UnicodeSet                fRegionalIndicatorSet;
138         UnicodeSet                fPrependSet;
139         UnicodeSet                fSpacingSet;
140         UnicodeSet                fLSet;
141         UnicodeSet                fVSet;
142         UnicodeSet                fTSet;
143         UnicodeSet                fLVSet;
144         UnicodeSet                fLVTSet;
145         UnicodeSet                fHangulSet;
146         UnicodeSet                fZWJSet;
147         UnicodeSet                fExtendedPictSet;
148         UnicodeSet                fViramaSet;
149         UnicodeSet                fLinkingConsonantSet;
150         UnicodeSet                fExtCccZwjSet;
151         UnicodeSet                fAnySet;
152 
153 
154         StringBuffer              fText;
155 
RBBICharMonkey()156         RBBICharMonkey() {
157             fText       = null;
158             fCharProperty = UProperty.GRAPHEME_CLUSTER_BREAK;
159             fCRLFSet    = new UnicodeSet("[\\r\\n]");
160             fControlSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Control}]");
161             fExtendSet  = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]");
162             fZWJSet     = new UnicodeSet("[\\p{Grapheme_Cluster_Break = ZWJ}]");
163             fRegionalIndicatorSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]");
164             fPrependSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Prepend}]");
165             fSpacingSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = SpacingMark}]");
166             fLSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = L}]");
167             fVSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = V}]");
168             fTSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = T}]");
169             fLVSet      = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LV}]");
170             fLVTSet     = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LVT}]");
171             fHangulSet  = new UnicodeSet();
172             fHangulSet.addAll(fLSet);
173             fHangulSet.addAll(fVSet);
174             fHangulSet.addAll(fTSet);
175             fHangulSet.addAll(fLVSet);
176             fHangulSet.addAll(fLVTSet);
177 
178             fExtendedPictSet  = new UnicodeSet("[:Extended_Pictographic:]");
179             fViramaSet        = new UnicodeSet("[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
180                                                + "\\p{Indic_Syllabic_Category=Virama}]");
181             fLinkingConsonantSet = new UnicodeSet("[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
182                                                   + "\\p{Indic_Syllabic_Category=Consonant}]");
183             fExtCccZwjSet     = new UnicodeSet("[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]");
184             fAnySet           = new UnicodeSet("[\\u0000-\\U0010ffff]");
185 
186 
187             fSets.add(fCRLFSet);               fClassNames.add("CRLF");
188             fSets.add(fControlSet);            fClassNames.add("Control");
189             fSets.add(fExtendSet);             fClassNames.add("Extended");
190             fSets.add(fRegionalIndicatorSet);  fClassNames.add("RegionalIndicator");
191             if (!fPrependSet.isEmpty()) {
192                 fSets.add(fPrependSet);        fClassNames.add("Prepend");
193             }
194             fSets.add(fSpacingSet);            fClassNames.add("Spacing");
195             fSets.add(fHangulSet);             fClassNames.add("Hangul");
196             fSets.add(fAnySet);                fClassNames.add("Any");
197             fSets.add(fZWJSet);                fClassNames.add("ZWJ");
198             fSets.add(fExtendedPictSet);       fClassNames.add("ExtendedPict");
199             fSets.add(fViramaSet);             fClassNames.add("Virama");
200             fSets.add(fLinkingConsonantSet);   fClassNames.add("LinkingConsonant");
201             fSets.add(fExtCccZwjSet);          fClassNames.add("ExtCccZwj");
202         }
203 
204 
205         @Override
setText(StringBuffer s)206         void setText(StringBuffer s) {
207             fText = s;
208             prepareAppliedRules(s.length());
209         }
210 
211         @Override
charClasses()212         List charClasses() {
213             return fSets;
214         }
215 
216         @Override
next(int prevPos)217         int next(int prevPos) {
218             int    /*p0,*/ p1, p2, p3;    // Indices of the significant code points around the
219             //   break position being tested.  The candidate break
220             //   location is before p2.
221 
222             int     breakPos = -1;
223 
224             int   c0, c1, c2, c3;     // The code points at p0, p1, p2 & p3.
225             int   cBase;              // for (X Extend*) patterns, the X character.
226 
227             // Previous break at end of string.  return DONE.
228             if (prevPos >= fText.length()) {
229                 return -1;
230             }
231             /* p0 = */ p1 = p2 = p3 = prevPos;
232             c3 =  UTF16.charAt(fText, prevPos);
233             c0 = c1 = c2 = cBase = 0;
234 
235             // Loop runs once per "significant" character position in the input text.
236             for (;;) {
237                 // Move all of the positions forward in the input string.
238                 /* p0 = p1;*/  c0 = c1;
239                 p1 = p2;  c1 = c2;
240                 p2 = p3;  c2 = c3;
241 
242                 // Advance p3 by one codepoint
243                 p3 = moveIndex32(fText, p3, 1);
244                 c3 = (p3>=fText.length())? -1: UTF16.charAt(fText, p3);
245 
246                 if (p1 == p2) {
247                     // Still warming up the loop.  (won't work with zero length strings, but we don't care)
248                     continue;
249                 }
250                 if (p2 == fText.length()) {
251                     setAppliedRule(p2, "End of String");
252                     break;
253                 }
254 
255                 //     No Extend or Format characters may appear between the CR and LF,
256                 //     which requires the additional check for p2 immediately following p1.
257                 //
258                 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
259                     setAppliedRule(p2, "GB 3   CR x LF");
260                     continue;
261                 }
262 
263                 if (fControlSet.contains(c1) ||
264                         c1 == 0x0D ||
265                         c1 == 0x0A)  {
266                     setAppliedRule(p2, "GB 4   ( Control | CR | LF ) <break>");
267                     break;
268                 }
269 
270                 if (fControlSet.contains(c2) ||
271                         c2 == 0x0D ||
272                         c2 == 0x0A)  {
273                     setAppliedRule(p2, "GB 5   <break>  ( Control | CR | LF )");
274                     break;
275                 }
276 
277 
278                 if (fLSet.contains(c1) &&
279                         (fLSet.contains(c2)  ||
280                                 fVSet.contains(c2)  ||
281                                 fLVSet.contains(c2) ||
282                                 fLVTSet.contains(c2))) {
283                     setAppliedRule(p2, "GB 6   L x ( L | V | LV | LVT )");
284                     continue;
285                 }
286 
287                 if ((fLVSet.contains(c1) || fVSet.contains(c1)) &&
288                         (fVSet.contains(c2) || fTSet.contains(c2)))  {
289                     setAppliedRule(p2, "GB 7   ( LV | V )  x  ( V | T )");
290                     continue;
291                 }
292 
293                 if ((fLVTSet.contains(c1) || fTSet.contains(c1)) &&
294                         fTSet.contains(c2))  {
295                     setAppliedRule(p2, "GB 8   ( LVT | T)  x T");
296                     continue;
297                 }
298 
299                 if (fExtendSet.contains(c2) || fZWJSet.contains(c2))  {
300                     if (!fExtendSet.contains(c1)) {
301                         cBase = c1;
302                     }
303                     setAppliedRule(p2, "GB 9   x (Extend | ZWJ)");
304                     continue;
305                 }
306 
307                 if (fSpacingSet.contains(c2)) {
308                     setAppliedRule(p2, "GB 9a  x  SpacingMark");
309                     continue;
310                 }
311 
312                 if (fPrependSet.contains(c1)) {
313                     setAppliedRule(p2, "GB 9b  Prepend x");
314                     continue;
315                 }
316 
317                 //   Note: Viramas are also included in the ExtCccZwj class.
318                 if (fLinkingConsonantSet.contains(c2)) {
319                     int pi = p1;
320                     boolean sawVirama = false;
321                     while (pi > 0 && fExtCccZwjSet.contains(fText.codePointAt(pi))) {
322                         if (fViramaSet.contains(fText.codePointAt(pi))) {
323                             sawVirama = true;
324                         }
325                         pi = fText.offsetByCodePoints(pi, -1);
326                     }
327                     if (sawVirama && fLinkingConsonantSet.contains(fText.codePointAt(pi))) {
328                         setAppliedRule(p2, "GB 9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant");
329                         continue;
330                     }
331                 }
332 
333                 if (fExtendedPictSet.contains(cBase) && fZWJSet.contains(c1) && fExtendedPictSet.contains(c2) ) {
334                     setAppliedRule(p2, "GB 11  Extended_Pictographic Extend * ZWJ x Extended_Pictographic");
335                     continue;
336                 }
337 
338                 //                  Note: The first if condition is a little tricky. We only need to force
339                 //                      a break if there are three or more contiguous RIs. If there are
340                 //                      only two, a break following will occur via other rules, and will include
341                 //                      any trailing extend characters, which is needed behavior.
342                 if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1)
343                         && fRegionalIndicatorSet.contains(c2)) {
344                     setAppliedRule(p2, "GB 12-13 Regional_Indicator x Regional_Indicator");
345                     break;
346                 }
347                 if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
348                     setAppliedRule(p2, "GB 12-13 Regional_Indicator x Regional_Indicator");
349                     continue;
350                 }
351 
352                 setAppliedRule(p2, "GB 999 Any <break> Any");
353                 break;
354             }
355 
356             breakPos = p2;
357             return breakPos;
358         }
359 
360     }
361 
362     /**
363      *
364      * Word Monkey Test Class
365      *
366      *
367      *
368      */
369     static class RBBIWordMonkey extends RBBIMonkeyKind {
370         StringBuffer              fText;
371 
372         UnicodeSet                fCRSet;
373         UnicodeSet                fLFSet;
374         UnicodeSet                fNewlineSet;
375         UnicodeSet                fRegionalIndicatorSet;
376         UnicodeSet                fKatakanaSet;
377         UnicodeSet                fHebrew_LetterSet;
378         UnicodeSet                fALetterSet;
379         UnicodeSet                fSingle_QuoteSet;
380         UnicodeSet                fDouble_QuoteSet;
381         UnicodeSet                fMidNumLetSet;
382         UnicodeSet                fMidLetterSet;
383         UnicodeSet                fMidNumSet;
384         UnicodeSet                fNumericSet;
385         UnicodeSet                fFormatSet;
386         UnicodeSet                fExtendSet;
387         UnicodeSet                fExtendNumLetSet;
388         UnicodeSet                fWSegSpaceSet;
389         UnicodeSet                fOtherSet;
390         UnicodeSet                fDictionarySet;
391         UnicodeSet                fZWJSet;
392         UnicodeSet                fExtendedPictSet;
393 
RBBIWordMonkey()394         RBBIWordMonkey() {
395             fCharProperty    = UProperty.WORD_BREAK;
396 
397             fCRSet           = new UnicodeSet("[\\p{Word_Break = CR}]");
398             fLFSet           = new UnicodeSet("[\\p{Word_Break = LF}]");
399             fNewlineSet      = new UnicodeSet("[\\p{Word_Break = Newline}]");
400             fRegionalIndicatorSet = new UnicodeSet("[\\p{Word_Break = Regional_Indicator}]");
401             fKatakanaSet     = new UnicodeSet("[\\p{Word_Break = Katakana}]");
402             fHebrew_LetterSet = new UnicodeSet("[\\p{Word_Break = Hebrew_Letter}]");
403             fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}]");
404             fSingle_QuoteSet = new UnicodeSet("[\\p{Word_Break = Single_Quote}]");
405             fDouble_QuoteSet = new UnicodeSet("[\\p{Word_Break = Double_Quote}]");
406             fMidNumLetSet    = new UnicodeSet("[\\p{Word_Break = MidNumLet}]");
407             fMidLetterSet    = new UnicodeSet("[\\p{Word_Break = MidLetter}]");
408             fMidNumSet       = new UnicodeSet("[\\p{Word_Break = MidNum}]");
409             fNumericSet      = new UnicodeSet("[\\p{Word_Break = Numeric}]");
410             fFormatSet       = new UnicodeSet("[\\p{Word_Break = Format}]");
411             fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]");
412             // There are some sc=Hani characters with WB=Extend.
413             // The break rules need to pick one or the other because
414             // Extend overlapping with something else is messy.
415             // For Unicode 13, we chose to keep U+16FF0 & U+16FF1
416             // in $Han (for $dictionary) and out of $Extend.
417             fExtendSet       = new UnicodeSet("[\\p{Word_Break = Extend}-[:Hani:]]");
418             fWSegSpaceSet    = new UnicodeSet("[\\p{Word_Break = WSegSpace}]");
419             fZWJSet          = new UnicodeSet("[\\p{Word_Break = ZWJ}]");
420             fExtendedPictSet = new UnicodeSet("[:Extended_Pictographic:]");
421 
422             fDictionarySet = new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]");
423             fDictionarySet.addAll(fKatakanaSet);
424             fDictionarySet.addAll(new UnicodeSet("[\\p{LineBreak = Complex_Context}]"));
425 
426             fALetterSet.removeAll(fDictionarySet);
427 
428             fOtherSet        = new UnicodeSet();
429             fOtherSet.complement();
430             fOtherSet.removeAll(fCRSet);
431             fOtherSet.removeAll(fLFSet);
432             fOtherSet.removeAll(fNewlineSet);
433             fOtherSet.removeAll(fALetterSet);
434             fOtherSet.removeAll(fSingle_QuoteSet);
435             fOtherSet.removeAll(fDouble_QuoteSet);
436             fOtherSet.removeAll(fKatakanaSet);
437             fOtherSet.removeAll(fHebrew_LetterSet);
438             fOtherSet.removeAll(fMidLetterSet);
439             fOtherSet.removeAll(fMidNumSet);
440             fOtherSet.removeAll(fNumericSet);
441             fOtherSet.removeAll(fFormatSet);
442             fOtherSet.removeAll(fExtendSet);
443             fOtherSet.removeAll(fExtendNumLetSet);
444             fOtherSet.removeAll(fWSegSpaceSet);
445             fOtherSet.removeAll(fRegionalIndicatorSet);
446             fOtherSet.removeAll(fZWJSet);
447             fOtherSet.removeAll(fExtendedPictSet);
448 
449             // Inhibit dictionary characters from being tested at all.
450             // remove surrogates so as to not generate higher CJK characters
451             fOtherSet.removeAll(new UnicodeSet("[[\\p{LineBreak = Complex_Context}][:Line_Break=Surrogate:]]"));
452             fOtherSet.removeAll(fDictionarySet);
453 
454             fSets.add(fCRSet);                    fClassNames.add("CR");
455             fSets.add(fLFSet);                    fClassNames.add("LF");
456             fSets.add(fNewlineSet);               fClassNames.add("Newline");
457             fSets.add(fRegionalIndicatorSet);     fClassNames.add("RegionalIndicator");
458             fSets.add(fHebrew_LetterSet);         fClassNames.add("Hebrew");
459             fSets.add(fALetterSet);               fClassNames.add("ALetter");
460             //fSets.add(fKatakanaSet);  // Omit Katakana from fSets, which omits Katakana characters
461             // from the test data. They are all in the dictionary set,
462             // which this (old, to be retired) monkey test cannot handle.
463             fSets.add(fSingle_QuoteSet);          fClassNames.add("Single Quote");
464             fSets.add(fDouble_QuoteSet);          fClassNames.add("Double Quote");
465             fSets.add(fMidLetterSet);             fClassNames.add("MidLetter");
466             fSets.add(fMidNumLetSet);             fClassNames.add("MidNumLet");
467             fSets.add(fMidNumSet);                fClassNames.add("MidNum");
468             fSets.add(fNumericSet);               fClassNames.add("Numeric");
469             fSets.add(fFormatSet);                fClassNames.add("Format");
470             fSets.add(fExtendSet);                fClassNames.add("Extend");
471             fSets.add(fExtendNumLetSet);          fClassNames.add("ExtendNumLet");
472             fSets.add(fWSegSpaceSet);             fClassNames.add("WSegSpace");
473             fSets.add(fZWJSet);                   fClassNames.add("ZWJ");
474             fSets.add(fExtendedPictSet);          fClassNames.add("ExtendedPict");
475             fSets.add(fOtherSet);                 fClassNames.add("Other");
476         }
477 
478 
479         @Override
charClasses()480         List  charClasses() {
481             return fSets;
482         }
483 
484         @Override
setText(StringBuffer s)485         void   setText(StringBuffer s) {
486             fText = s;
487             prepareAppliedRules(s.length());
488         }
489 
490         @Override
next(int prevPos)491         int   next(int prevPos) {
492             int    /*p0,*/ p1, p2, p3;      // Indices of the significant code points around the
493             //   break position being tested.  The candidate break
494             //   location is before p2.
495             int     breakPos = -1;
496 
497             int c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
498 
499             // Previous break at end of string.  return DONE.
500             if (prevPos >= fText.length()) {
501                 return -1;
502             }
503             /*p0 =*/ p1 = p2 = p3 = prevPos;
504             c3 = UTF16.charAt(fText, prevPos);
505             c0 = c1 = c2 = 0;
506 
507 
508 
509             // Loop runs once per "significant" character position in the input text.
510             for (;;) {
511                 // Move all of the positions forward in the input string.
512                 /*p0 = p1;*/  c0 = c1;
513                 p1 = p2;  c1 = c2;
514                 p2 = p3;  c2 = c3;
515 
516                 // Advance p3 by    X(Extend | Format)*   Rule 4
517                 //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
518                 do {
519                     p3 = moveIndex32(fText, p3, 1);
520                     c3 = -1;
521                     if (p3>=fText.length()) {
522                         break;
523                     }
524                     c3 = UTF16.charAt(fText, p3);
525                     if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {
526                         break;
527                     }
528                 }
529                 while (setContains(fFormatSet, c3) || setContains(fExtendSet, c3) || setContains(fZWJSet, c3));
530 
531                 if (p1 == p2) {
532                     // Still warming up the loop.  (won't work with zero length strings, but we don't care)
533                     continue;
534                 }
535                 if (p2 == fText.length()) {
536                     // Reached end of string.  Always a break position.
537                     break;
538                 }
539 
540                 //     No Extend or Format characters may appear between the CR and LF,
541                 //     which requires the additional check for p2 immediately following p1.
542                 //
543                 if (c1==0x0D && c2==0x0A) {
544                     setAppliedRule(p2, "WB 3   CR x LF");
545                     continue;
546                 }
547 
548                 //
549                 if (fCRSet.contains(c1) || fLFSet.contains(c1) || fNewlineSet.contains(c1)) {
550                     setAppliedRule(p2, "WB 3a  Break before and after newlines (including CR and LF)");
551                     break;
552                 }
553                 if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {
554                     setAppliedRule(p2, "WB 3a  Break before and after newlines (including CR and LF)");
555                     break;
556                 }
557 
558                 //              Not ignoring extend chars, so peek into input text to
559                 //              get the potential ZWJ, the character immediately preceding c2.
560                 if (fZWJSet.contains(fText.codePointBefore(p2)) && fExtendedPictSet.contains(c2)) {
561                     setAppliedRule(p2, "WB 3c  ZWJ x Extended_Pictographic");
562                     continue;
563                 }
564 
565                 if (fWSegSpaceSet.contains(fText.codePointBefore(p2)) && fWSegSpaceSet.contains(c2)) {
566                     setAppliedRule(p2, "WB 3d  Keep horizontal whitespace together");
567                     continue;
568                 }
569 
570                 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
571                         (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2)))  {
572                     setAppliedRule(p2, "WB 4   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)");
573                     continue;
574                 }
575 
576                 if ( (fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1))   &&
577                         (fMidLetterSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2)) &&
578                         (setContains(fALetterSet, c3) || setContains(fHebrew_LetterSet, c3))) {
579                     setAppliedRule(p2, "WB 6   (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)");
580                     continue;
581                 }
582 
583                 if ((fALetterSet.contains(c0) || fHebrew_LetterSet.contains(c0)) &&
584                         (fMidLetterSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1)) &&
585                         (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) {
586                     setAppliedRule(p2, "WB 7   (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)");
587                     continue;
588                 }
589 
590                 if (fHebrew_LetterSet.contains(c1) && fSingle_QuoteSet.contains(c2)) {
591                     setAppliedRule(p2, "WB 7a  Hebrew_Letter x Single_Quote");
592                     continue;
593                 }
594 
595                 if (fHebrew_LetterSet.contains(c1) && fDouble_QuoteSet.contains(c2) && setContains(fHebrew_LetterSet,c3)) {
596                     setAppliedRule(p2, "WB 7b  Hebrew_Letter x Single_Quote");
597                     continue;
598                 }
599 
600                 if (fHebrew_LetterSet.contains(c0) && fDouble_QuoteSet.contains(c1) && fHebrew_LetterSet.contains(c2)) {
601                     setAppliedRule(p2, "WB 7c  Hebrew_Letter Double_Quote x Hebrew_Letter");
602                     continue;
603                 }
604 
605                 if (fNumericSet.contains(c1) &&
606                         fNumericSet.contains(c2))  {
607                     setAppliedRule(p2, "WB 8   Numeric x Numeric");
608                     continue;
609                 }
610 
611                 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
612                         fNumericSet.contains(c2))  {
613                     setAppliedRule(p2, "WB 9   (ALetter | Hebrew_Letter) x Numeric");
614                     continue;
615                 }
616 
617                 if (fNumericSet.contains(c1) &&
618                         (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2)))  {
619                     setAppliedRule(p2, "WB 10  Numeric x (ALetter | Hebrew_Letter)");
620                     continue;
621                 }
622 
623                 if (fNumericSet.contains(c0) &&
624                         (fMidNumSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1))  &&
625                         fNumericSet.contains(c2)) {
626                     setAppliedRule(p2, "WB 11  Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric");
627                     continue;
628                 }
629 
630                 if (fNumericSet.contains(c1) &&
631                         (fMidNumSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2))  &&
632                         setContains(fNumericSet, c3)) {
633                     setAppliedRule(p2, "WB 12  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric");
634                     continue;
635                 }
636 
637                 //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
638                 //                  all Katakana are handled by the dictionary breaker.
639                 if (fKatakanaSet.contains(c1) &&
640                         fKatakanaSet.contains(c2))  {
641                     setAppliedRule(p2, "WB 13  Katakana x Katakana");
642                     continue;
643                 }
644 
645                 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1) ||fNumericSet.contains(c1) ||
646                         fKatakanaSet.contains(c1) || fExtendNumLetSet.contains(c1)) &&
647                         fExtendNumLetSet.contains(c2)) {
648                     setAppliedRule(p2, "WB 13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet");
649                     continue;
650                 }
651 
652                 if (fExtendNumLetSet.contains(c1) &&
653                         (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2) ||
654                                 fNumericSet.contains(c2) || fKatakanaSet.contains(c2)))  {
655                     setAppliedRule(p2, "WB 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)");
656                     continue;
657                 }
658 
659 
660                 if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1)) {
661                     setAppliedRule(p2, "WB 15-17 Group pairs of Regional Indicators.");
662                     break;
663                 }
664                 if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
665                     setAppliedRule(p2, "WB 15-17 Group pairs of Regional Indicators.");
666                     continue;
667                 }
668 
669                 setAppliedRule(p2, "WB 999");
670                 break;
671             }
672 
673             breakPos = p2;
674             return breakPos;
675         }
676     }
677 
678 
679     static class RBBILineMonkey extends RBBIMonkeyKind {
680         // UnicodeSets for each of the Line Breaking character classes.
681         // Order matches that of Unicode UAX 14, Table 1, which makes it a little easier
682         // to verify that they are all accounted for.
683 
684         UnicodeSet  fBK;
685         UnicodeSet  fCR;
686         UnicodeSet  fLF;
687         UnicodeSet  fCM;
688         UnicodeSet  fNL;
689         UnicodeSet  fSG;
690         UnicodeSet  fWJ;
691         UnicodeSet  fZW;
692         UnicodeSet  fGL;
693         UnicodeSet  fSP;
694         UnicodeSet  fB2;
695         UnicodeSet  fBA;
696         UnicodeSet  fBB;
697         UnicodeSet  fHH;
698         UnicodeSet  fHY;
699         UnicodeSet  fCB;
700         UnicodeSet  fCL;
701         UnicodeSet  fCP;
702         UnicodeSet  fEX;
703         UnicodeSet  fIN;
704         UnicodeSet  fNS;
705         UnicodeSet  fOP;
706         UnicodeSet  fQU;
707         UnicodeSet  fIS;
708         UnicodeSet  fNU;
709         UnicodeSet  fPO;
710         UnicodeSet  fPR;
711         UnicodeSet  fSY;
712         UnicodeSet  fAI;
713         UnicodeSet  fAL;
714         UnicodeSet  fCJ;
715         UnicodeSet  fH2;
716         UnicodeSet  fH3;
717         UnicodeSet  fHL;
718         UnicodeSet  fID;
719         UnicodeSet  fJL;
720         UnicodeSet  fJV;
721         UnicodeSet  fJT;
722         UnicodeSet  fRI;
723         UnicodeSet  fXX;
724         UnicodeSet  fEB;
725         UnicodeSet  fEM;
726         UnicodeSet  fZWJ;
727         UnicodeSet  fOP30;
728         UnicodeSet  fCP30;
729 
730         StringBuffer  fText;
731         int           fOrigPositions;
732 
733         // XUnicodeSet is like UnicodeSet, except that the method contains(int codePoint) does not
734         // throw exceptions on out-of-range codePoints. This matches ICU4C behavior.
735         // The LineMonkey test (ported from ICU4C) relies on this behavior, it uses a value of -1
736         // to represent a non-codepoint that is not included in any of the property sets.
737         // This happens for rule 30a.
738 
739         class XUnicodeSet extends UnicodeSet {
XUnicodeSet(String pattern)740             XUnicodeSet(String pattern) { super(pattern); }
XUnicodeSet()741             XUnicodeSet() { super(); }
742             @Override
contains(int codePoint)743             public boolean contains(int codePoint) {
744                 return codePoint < UnicodeSet.MIN_VALUE || codePoint > UnicodeSet.MAX_VALUE ?
745                         false : super.contains(codePoint);
746             }
747         }
748 
RBBILineMonkey()749         RBBILineMonkey()
750         {
751             fCharProperty  = UProperty.LINE_BREAK;
752 
753             fBK    = new XUnicodeSet("[\\p{Line_Break=BK}]");
754             fCR    = new XUnicodeSet("[\\p{Line_break=CR}]");
755             fLF    = new XUnicodeSet("[\\p{Line_break=LF}]");
756             fCM    = new XUnicodeSet("[\\p{Line_break=CM}]");
757             fNL    = new XUnicodeSet("[\\p{Line_break=NL}]");
758             fSG    = new XUnicodeSet("[\\ud800-\\udfff]");
759             fWJ    = new XUnicodeSet("[\\p{Line_break=WJ}]");
760             fZW    = new XUnicodeSet("[\\p{Line_break=ZW}]");
761             fGL    = new XUnicodeSet("[\\p{Line_break=GL}]");
762             fSP    = new XUnicodeSet("[\\p{Line_break=SP}]");
763             fB2    = new XUnicodeSet("[\\p{Line_break=B2}]");
764             fBA    = new XUnicodeSet("[\\p{Line_break=BA}]");
765             fBB    = new XUnicodeSet("[\\p{Line_break=BB}]");
766             fHH    = new XUnicodeSet();
767             fHY    = new XUnicodeSet("[\\p{Line_break=HY}]");
768             fCB    = new XUnicodeSet("[\\p{Line_break=CB}]");
769             fCL    = new XUnicodeSet("[\\p{Line_break=CL}]");
770             fCP    = new XUnicodeSet("[\\p{Line_break=CP}]");
771             fEX    = new XUnicodeSet("[\\p{Line_break=EX}]");
772             fIN    = new XUnicodeSet("[\\p{Line_break=IN}]");
773             fNS    = new XUnicodeSet("[\\p{Line_break=NS}]");
774             fOP    = new XUnicodeSet("[\\p{Line_break=OP}]");
775             fQU    = new XUnicodeSet("[\\p{Line_break=QU}]");
776             fIS    = new XUnicodeSet("[\\p{Line_break=IS}]");
777             fNU    = new XUnicodeSet("[\\p{Line_break=NU}]");
778             fPO    = new XUnicodeSet("[\\p{Line_break=PO}]");
779             fPR    = new XUnicodeSet("[\\p{Line_break=PR}]");
780             fSY    = new XUnicodeSet("[\\p{Line_break=SY}]");
781             fAI    = new XUnicodeSet("[\\p{Line_break=AI}]");
782             fAL    = new XUnicodeSet("[\\p{Line_break=AL}]");
783             fCJ    = new XUnicodeSet("[\\p{Line_break=CJ}]");
784             fH2    = new XUnicodeSet("[\\p{Line_break=H2}]");
785             fH3    = new XUnicodeSet("[\\p{Line_break=H3}]");
786             fHL    = new XUnicodeSet("[\\p{Line_break=HL}]");
787             fID    = new XUnicodeSet("[\\p{Line_break=ID}]");
788             fJL    = new XUnicodeSet("[\\p{Line_break=JL}]");
789             fJV    = new XUnicodeSet("[\\p{Line_break=JV}]");
790             fJT    = new XUnicodeSet("[\\p{Line_break=JT}]");
791             fRI    = new XUnicodeSet("[\\p{Line_break=RI}]");
792             fXX    = new XUnicodeSet("[\\p{Line_break=XX}]");
793             fEB    = new XUnicodeSet("[\\p{Line_break=EB}]");
794             fEM    = new XUnicodeSet("[\\p{Line_break=EM}]");
795             fZWJ   = new XUnicodeSet("[\\p{Line_break=ZWJ}]");
796             fOP30  = new XUnicodeSet("[\\p{Line_break=OP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]");
797             fCP30  = new XUnicodeSet("[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]");
798 
799             // Remove dictionary characters.
800             // The monkey test reference implementation of line break does not replicate the dictionary behavior,
801             // so dictionary characters are omitted from the monkey test data.
802             @SuppressWarnings("unused")
803             UnicodeSet dictionarySet = new UnicodeSet(
804                     "[[:LineBreak = Complex_Context:] & [[:Script = Thai:][:Script = Lao:][:Script = Khmer:] [:script = Myanmar:]]]");
805 
806             fAL.addAll(fXX);     // Default behavior for XX is identical to AL
807             fAL.addAll(fAI);     // Default behavior for AI is identical to AL
808             fAL.addAll(fSG);     // Default behavior for SG (unpaired surrogates) is AL
809 
810             fNS.addAll(fCJ);     // Default behavior for CJ is identical to NS.
811             fCM.addAll(fZWJ);    // ZWJ behaves as a CM.
812 
813             fHH.add('\u2010');   // Hyphen, '‐'
814 
815             fSets.add(fBK);     fClassNames.add("BK");
816             fSets.add(fCR);     fClassNames.add("CR");
817             fSets.add(fLF);     fClassNames.add("LF");
818             fSets.add(fCM);     fClassNames.add("CM");
819             fSets.add(fNL);     fClassNames.add("NL");
820             fSets.add(fWJ);     fClassNames.add("WJ");
821             fSets.add(fZW);     fClassNames.add("ZW");
822             fSets.add(fGL);     fClassNames.add("GL");
823             fSets.add(fSP);     fClassNames.add("SP");
824             fSets.add(fB2);     fClassNames.add("B2");
825             fSets.add(fBA);     fClassNames.add("BA");
826             fSets.add(fBB);     fClassNames.add("BB");
827             fSets.add(fHY);     fClassNames.add("HY");
828             fSets.add(fCB);     fClassNames.add("CB");
829             fSets.add(fCL);     fClassNames.add("CL");
830             fSets.add(fCP);     fClassNames.add("CP");
831             fSets.add(fEX);     fClassNames.add("EX");
832             fSets.add(fIN);     fClassNames.add("IN");
833             fSets.add(fJL);     fClassNames.add("JL");
834             fSets.add(fJT);     fClassNames.add("JT");
835             fSets.add(fJV);     fClassNames.add("JV");
836             fSets.add(fNS);     fClassNames.add("NV");
837             fSets.add(fOP);     fClassNames.add("OP");
838             fSets.add(fQU);     fClassNames.add("QU");
839             fSets.add(fIS);     fClassNames.add("IS");
840             fSets.add(fNU);     fClassNames.add("NU");
841             fSets.add(fPO);     fClassNames.add("PO");
842             fSets.add(fPR);     fClassNames.add("PR");
843             fSets.add(fSY);     fClassNames.add("SY");
844             fSets.add(fAI);     fClassNames.add("AI");
845             fSets.add(fAL);     fClassNames.add("AL");
846             fSets.add(fH2);     fClassNames.add("H2");
847             fSets.add(fH3);     fClassNames.add("H3");
848             fSets.add(fHL);     fClassNames.add("HL");
849             fSets.add(fID);     fClassNames.add("ID");
850             fSets.add(fWJ);     fClassNames.add("WJ");
851             fSets.add(fRI);     fClassNames.add("RI");
852             fSets.add(fSG);     fClassNames.add("SG");
853             fSets.add(fEB);     fClassNames.add("EB");
854             fSets.add(fEM);     fClassNames.add("EM");
855             fSets.add(fZWJ);    fClassNames.add("ZWJ");
856             // TODO: fOP30 & fCP30 overlap with plain fOP. Probably OK, but fOP/CP chars will be over-represented.
857             fSets.add(fOP30);   fClassNames.add("OP30");
858             fSets.add(fCP30);   fClassNames.add("CP30");
859         }
860 
861         @Override
setText(StringBuffer s)862         void setText(StringBuffer s) {
863             fText       = s;
864             prepareAppliedRules(s.length());
865         }
866 
867 
868 
869 
870         @Override
next(int startPos)871         int next(int startPos) {
872             int    pos;       //  Index of the char following a potential break position
873             int    thisChar;  //  Character at above position "pos"
874 
875             int    prevPos;   //  Index of the char preceding a potential break position
876             int    prevChar;  //  Character at above position.  Note that prevChar
877             //                //  and thisChar may not be adjacent because combining
878             //                //  characters between them will be ignored.
879 
880             int    prevPosX2;
881             int    prevCharX2; //  Character before prevChar, more context for LB 21a
882 
883             int    nextPos;   //  Index of the next character following pos.
884             //                //  Usually skips over combining marks.
885             int    tPos;      //  temp value.
886             int    matchVals[]  = null;       // Number  Expression Match Results
887 
888 
889             if (startPos >= fText.length()) {
890                 return -1;
891             }
892 
893 
894             // Initial values for loop.  Loop will run the first time without finding breaks,
895             //                           while the invalid values shift out and the "this" and
896             //                           "prev" positions are filled in with good values.
897             pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
898             thisChar = prevChar  = prevCharX2 =  0;
899             nextPos  = startPos;
900 
901 
902             // Loop runs once per position in the test text, until a break position
903             //  is found.  In each iteration, we are testing for a possible break
904             //  just preceding the character at index "pos".  The character preceding
905             //  this char is at position "prevPos"; because of combining sequences,
906             //  "prevPos" can be arbitrarily far before "pos".
907             for (;;) {
908                 // Advance to the next position to be tested.
909                 prevPosX2  = prevPos;
910                 prevCharX2 = prevChar;
911                 prevPos   = pos;
912                 prevChar  = thisChar;
913                 pos       = nextPos;
914                 nextPos   = moveIndex32(fText, pos, 1);
915 
916                 if (pos >= fText.length()) {
917                     setAppliedRule(pos, "LB 2   Break at end of text");
918                     break;
919                 }
920 
921                 //             We do this rule out-of-order because the adjustment does
922                 //             not affect the way that rules LB 3 through LB 6 match,
923                 //             and doing it here rather than after LB 6 is substantially
924                 //             simpler when combining sequences do occur.
925 
926 
927                 // LB 9         Keep combining sequences together.
928                 //              advance over any CM class chars at "pos",
929                 //              result is "nextPos" for the following loop iteration.
930                 thisChar  = UTF16.charAt(fText, pos);
931                 if (!(fSP.contains(thisChar) || fBK.contains(thisChar) || thisChar==0x0d ||
932                         thisChar==0x0a || fNL.contains(thisChar) || fZW.contains(thisChar) )) {
933                     for (;;) {
934                         if (nextPos == fText.length()) {
935                             break;
936                         }
937                         int nextChar = UTF16.charAt(fText, nextPos);
938                         if (!fCM.contains(nextChar)) {
939                             break;
940                         }
941                         nextPos = moveIndex32(fText, nextPos, 1);
942                     }
943                 }
944 
945                 // LB 9 Treat X CM* as if it were X
946                 //        No explicit action required.
947 
948                 // LB 10     Treat any remaining combining mark as AL
949                 if (fCM.contains(thisChar)) {
950                     thisChar = 'A';
951                 }
952 
953 
954                 // If the loop is still warming up - if we haven't shifted the initial
955                 //   -1 positions out of prevPos yet - loop back to advance the
956                 //    position in the input without any further looking for breaks.
957                 if (prevPos == -1) {
958                     setAppliedRule(pos, "LB 9   adjust for combining sequences.");
959                     continue;
960                 }
961 
962                 if (fBK.contains(prevChar)) {
963                     setAppliedRule(pos, "LB 4   Always break after hard line breaks");
964                     break;
965                 }
966 
967                 if (fCR.contains(prevChar) && fLF.contains(thisChar)) {
968                     setAppliedRule(pos, "LB 5   Break after CR, LF, NL, but not inside CR LF");
969                     continue;
970                 }
971                 if  (fCR.contains(prevChar) ||
972                         fLF.contains(prevChar) ||
973                         fNL.contains(prevChar))  {
974                     setAppliedRule(pos, "LB 5   Break after CR, LF, NL, but not inside CR LF");
975                     break;
976                 }
977 
978                 if (fBK.contains(thisChar) || fCR.contains(thisChar) ||
979                         fLF.contains(thisChar) || fNL.contains(thisChar) ) {
980                     setAppliedRule(pos, "LB 6   Don't break before hard line breaks");
981                     continue;
982                 }
983 
984 
985                 if (fSP.contains(thisChar)) {
986                     setAppliedRule(pos, "LB 7   Don't break before spaces or zero-width space");
987                     continue;
988                 }
989 
990                 if (fZW.contains(thisChar)) {
991                     setAppliedRule(pos, "LB 7   Don't break before spaces or zero-width space");
992                     continue;
993                 }
994 
995                 //       ZW SP* ÷
996                 //       Scan backwards from prevChar for SP* ZW
997                 tPos = prevPos;
998                 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
999                     tPos = moveIndex32(fText, tPos, -1);
1000                 }
1001                 if (fZW.contains(UTF16.charAt(fText, tPos))) {
1002                     setAppliedRule(pos, "LB 8   Break after zero width space");
1003                     break;
1004                 }
1005 
1006                 //          Move this test up, before LB8a, because numbers can match a longer sequence that would
1007                 //          also match 8a.  e.g. NU ZWJ IS PO     (ZWJ acts like CM)
1008                 matchVals = LBNumberCheck(fText, prevPos, matchVals);
1009                 if (matchVals[0] != -1) {
1010                     // Matched a number.  But could have been just a single digit, which would
1011                     //    not represent a "no break here" between prevChar and thisChar
1012                     int numEndIdx = matchVals[1];  // idx of first char following num
1013                     if (numEndIdx > pos) {
1014                         // Number match includes at least the two chars being checked
1015                         if (numEndIdx > nextPos) {
1016                             // Number match includes additional chars.  Update pos and nextPos
1017                             //   so that next loop iteration will continue at the end of the number,
1018                             //   checking for breaks between last char in number & whatever follows.
1019                             nextPos = numEndIdx;
1020                             pos     = numEndIdx;
1021                             do {
1022                                 pos = moveIndex32(fText, pos, -1);
1023                                 thisChar = UTF16.charAt(fText, pos);
1024                             }
1025                             while (fCM.contains(thisChar));
1026                         }
1027                         setAppliedRule(pos, "LB 25  Numbers");
1028                         continue;
1029                     }
1030                 }
1031 
1032                 //       The monkey test's way of ignoring combining characters doesn't work
1033                 //       for this rule. ZWJ is also a CM. Need to get the actual character
1034                 //       preceding "thisChar", not ignoring combining marks, possibly ZWJ.
1035                 {
1036                     int prevC = fText.codePointBefore(pos);
1037                     if (fZWJ.contains(prevC)) {
1038                         setAppliedRule(pos, "LB 8a  ZWJ x");
1039                         continue;
1040                     }
1041                 }
1042 
1043                 // appliedRule: "LB 9, 10"; //  Already done, at top of loop.";
1044 
1045 
1046                 //    x  WJ
1047                 //    WJ  x
1048                 if (fWJ.contains(thisChar) || fWJ.contains(prevChar)) {
1049                     setAppliedRule(pos, "LB 11  Do not break before or after WORD JOINER and related characters.");
1050                     continue;
1051                 }
1052 
1053 
1054                 if (fGL.contains(prevChar)) {
1055                     setAppliedRule(pos, "LB 12  GL  x");
1056                     continue;
1057                 }
1058 
1059                 if (!(fSP.contains(prevChar) ||
1060                         fBA.contains(prevChar) ||
1061                         fHY.contains(prevChar)     ) && fGL.contains(thisChar)) {
1062                     setAppliedRule(pos, "LB 12a [^SP BA HY] x GL");
1063                     continue;
1064                 }
1065 
1066                 if (fCL.contains(thisChar) ||
1067                         fCP.contains(thisChar) ||
1068                         fEX.contains(thisChar) ||
1069                         fSY.contains(thisChar)) {
1070                     setAppliedRule(pos, "LB 13  Don't break before closings");
1071                     continue;
1072                 }
1073 
1074                 //       Scan backwards, checking for this sequence.
1075                 //       The OP char could include combining marks, so we actually check for
1076                 //           OP CM* SP* x
1077                 tPos = prevPos;
1078                 if (fSP.contains(prevChar)) {
1079                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
1080                         tPos=moveIndex32(fText, tPos, -1);
1081                     }
1082                 }
1083                 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
1084                     tPos=moveIndex32(fText, tPos, -1);
1085                 }
1086                 if (fOP.contains(UTF16.charAt(fText, tPos))) {
1087                     setAppliedRule(pos, "LB 14  Don't break after OP SP*");
1088                     continue;
1089                 }
1090 
1091                 if (nextPos < fText.length()) {
1092                     int nextChar = fText.codePointAt(nextPos);
1093                     if (fSP.contains(prevChar) && fIS.contains(thisChar) && fNU.contains(nextChar)) {
1094                         setAppliedRule(pos, "LB 14a Break before an IS that begins a number and follows a space");
1095                         break;
1096                     }
1097                 }
1098 
1099                 if (fIS.contains(thisChar)) {
1100                     setAppliedRule(pos, "LB 14b Do not break before numeric separators, even after spaces");
1101                     continue;
1102                 }
1103 
1104                 if (fOP.contains(thisChar)) {
1105                     // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
1106                     tPos = prevPos;
1107                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
1108                         tPos = moveIndex32(fText, tPos, -1);
1109                     }
1110                     while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
1111                         tPos = moveIndex32(fText, tPos, -1);
1112                     }
1113                     if (fQU.contains(UTF16.charAt(fText, tPos))) {
1114                         setAppliedRule(pos, "LB 15  QU SP* x OP");
1115                         continue;
1116                     }
1117                 }
1118 
1119                 if (fNS.contains(thisChar)) {
1120                     tPos = prevPos;
1121                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
1122                         tPos = moveIndex32(fText, tPos, -1);
1123                     }
1124                     while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
1125                         tPos = moveIndex32(fText, tPos, -1);
1126                     }
1127                     if (fCL.contains(UTF16.charAt(fText, tPos)) || fCP.contains(UTF16.charAt(fText, tPos))) {
1128                         setAppliedRule(pos, "LB 16  (CL | CP) SP* x NS");
1129                         continue;
1130                     }
1131                 }
1132 
1133 
1134                 if (fB2.contains(thisChar)) {
1135                     tPos = prevPos;
1136                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
1137                         tPos = moveIndex32(fText, tPos, -1);
1138                     }
1139                     while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
1140                         tPos = moveIndex32(fText, tPos, -1);
1141                     }
1142                     if (fB2.contains(UTF16.charAt(fText, tPos))) {
1143                         setAppliedRule(pos, "LB 17  B2 SP* x B2");
1144                         continue;
1145                     }
1146                 }
1147 
1148                 if (fSP.contains(prevChar)) {
1149                     setAppliedRule(pos, "LB 18  break after space");
1150                     break;
1151                 }
1152 
1153                 //    x   QU
1154                 //    QU  x
1155                 if (fQU.contains(thisChar) || fQU.contains(prevChar)) {
1156                         setAppliedRule(pos, "LB 19");
1157                     continue;
1158                 }
1159 
1160                 if (fCB.contains(thisChar) || fCB.contains(prevChar)) {
1161                     setAppliedRule(pos, "LB 20  Break around a CB");
1162                     break;
1163                 }
1164 
1165                 //           Don't break between Hyphens and letters if a break precedes the hyphen.
1166                 //           Formerly this was a Finnish tailoring.
1167                 //           Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
1168                 //    ^($HY | $HH) $AL;
1169                 if (fAL.contains(thisChar) && (fHY.contains(prevChar) || fHH.contains(prevChar)) &&
1170                         prevPosX2 == -1) {
1171                     setAppliedRule(pos, "LB 20.09");
1172                     continue;
1173                 }
1174 
1175                 if (fBA.contains(thisChar) ||
1176                         fHY.contains(thisChar) ||
1177                         fNS.contains(thisChar) ||
1178                         fBB.contains(prevChar) )   {
1179                     setAppliedRule(pos, "LB 21");
1180                     continue;
1181                 }
1182 
1183                 if (fHL.contains(prevCharX2) && (fHY.contains(prevChar) || fBA.contains(prevChar))) {
1184                     setAppliedRule(pos, "LB 21a HL (HY | BA) x");
1185                     continue;
1186                 }
1187 
1188                 if (fSY.contains(prevChar) && fHL.contains(thisChar)) {
1189                     setAppliedRule(pos, "LB 21b SY x HL");
1190                     continue;
1191                 }
1192 
1193                 if (fIN.contains(thisChar)) {
1194                     setAppliedRule(pos, "LB 22");
1195                     continue;
1196                 }
1197 
1198                 //          (AL | HL) x NU
1199                 //          NU x (AL | HL)
1200                 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && fNU.contains(thisChar)) {
1201                     setAppliedRule(pos, "LB 23");
1202                     continue;
1203                 }
1204                 if (fNU.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
1205                     setAppliedRule(pos, "LB 23");
1206                     continue;
1207                 }
1208 
1209                 // Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
1210                 //      PR x (ID | EB | EM)
1211                 //     (ID | EB | EM) x PO
1212                 if (fPR.contains(prevChar) &&
1213                         (fID.contains(thisChar) || fEB.contains(thisChar) || fEM.contains(thisChar)))  {
1214                     setAppliedRule(pos, "LB 23a");
1215                     continue;
1216                 }
1217                 if ((fID.contains(prevChar) || fEB.contains(prevChar) || fEM.contains(prevChar)) &&
1218                         fPO.contains(thisChar)) {
1219                     setAppliedRule(pos, "LB 23a");
1220                     continue;
1221                 }
1222 
1223                 // Do not break between prefix and letters or ideographs.
1224                 //         (PR | PO) x (AL | HL)
1225                 //         (AL | HL) x (PR | PO)
1226                 if ((fPR.contains(prevChar) || fPO.contains(prevChar)) &&
1227                         (fAL.contains(thisChar) || fHL.contains(thisChar))) {
1228                     setAppliedRule(pos, "LB 24  no break between prefix and letters or ideographs");
1229                     continue;
1230                 }
1231                 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) &&
1232                         (fPR.contains(thisChar) || fPO.contains(thisChar))) {
1233                     setAppliedRule(pos, "LB 24  no break between prefix and letters or ideographs");
1234                     continue;
1235                 }
1236 
1237                 // appliedRule: "LB 25 numbers match"; // moved up, before LB 8a,
1238 
1239                 if (fJL.contains(prevChar) && (fJL.contains(thisChar) ||
1240                         fJV.contains(thisChar) ||
1241                         fH2.contains(thisChar) ||
1242                         fH3.contains(thisChar))) {
1243                     setAppliedRule(pos, "LB 26  Do not break a Korean syllable.");
1244                     continue;
1245                 }
1246 
1247                 if ((fJV.contains(prevChar) || fH2.contains(prevChar))  &&
1248                         (fJV.contains(thisChar) || fJT.contains(thisChar))) {
1249                     setAppliedRule(pos, "LB 26  Do not break a Korean syllable.");
1250                     continue;
1251                 }
1252 
1253                 if ((fJT.contains(prevChar) || fH3.contains(prevChar)) &&
1254                         fJT.contains(thisChar)) {
1255                     setAppliedRule(pos, "LB 26  Do not break a Korean syllable.");
1256                     continue;
1257                 }
1258 
1259                 if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||
1260                         fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&
1261                         fIN.contains(thisChar)) {
1262                     setAppliedRule(pos, "LB 27  Treat a Korean Syllable Block the same as ID.");
1263                     continue;
1264                 }
1265                 if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||
1266                         fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&
1267                         fPO.contains(thisChar)) {
1268                     setAppliedRule(pos, "LB 27  Treat a Korean Syllable Block the same as ID.");
1269                     continue;
1270                 }
1271                 if (fPR.contains(prevChar) && (fJL.contains(thisChar) || fJV.contains(thisChar) ||
1272                         fJT.contains(thisChar) || fH2.contains(thisChar) || fH3.contains(thisChar))) {
1273                     setAppliedRule(pos, "LB 27  Treat a Korean Syllable Block the same as ID.");
1274                     continue;
1275                 }
1276 
1277 
1278 
1279                 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
1280                     setAppliedRule(pos, "LB 28  Do not break between alphabetics");
1281                     continue;
1282                 }
1283 
1284                 if (fIS.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
1285                     setAppliedRule(pos, "LB 29  Do not break between numeric punctuation and alphabetics");
1286                     continue;
1287                 }
1288 
1289                 //          (AL | NU) x OP
1290                 //          CP x (AL | NU)
1291                 if ((fAL.contains(prevChar) || fHL.contains(prevChar) || fNU.contains(prevChar)) &&
1292                         fOP30.contains(thisChar)) {
1293                     setAppliedRule(pos, "LB 30  Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.");
1294                     continue;
1295                 }
1296                 if (fCP30.contains(prevChar) &&
1297                         (fAL.contains(thisChar) || fHL.contains(thisChar) || fNU.contains(thisChar))) {
1298                     setAppliedRule(pos, "LB 30  Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.");
1299                     continue;
1300                 }
1301 
1302                 //             RI RI  ÷  RI
1303                 //                RI  x  RI
1304                 if (fRI.contains(prevCharX2) && fRI.contains(prevChar) && fRI.contains(thisChar)) {
1305                     setAppliedRule(pos, "LB 30a Break between pairs of Regional Indicators.");
1306                     break;
1307                 }
1308                 if (fRI.contains(prevChar) && fRI.contains(thisChar)) {
1309                     // Two Regional Indicators have been paired.
1310                     // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
1311                     // following RI. This is a hack.
1312                     thisChar = -1;
1313                     setAppliedRule(pos, "LB 30a Break between pairs of Regional Indicators.");
1314                     continue;
1315                 }
1316 
1317                 if (fEB.contains(prevChar) && fEM.contains(thisChar)) {
1318                     setAppliedRule(pos, "LB 30b Emoji Base x Emoji Modifier");
1319                     continue;
1320                 }
1321                 // LB 31    Break everywhere else
1322                 setAppliedRule(pos, "LB 31 Break everywhere else");
1323                 break;
1324             }
1325 
1326             return pos;
1327         }
1328 
1329 
1330 
1331         // Match the following regular expression in the input text.
1332         //    ((PR | PO) CM*)? ((OP | HY) CM*)? (IS CM*)? NU CM* ((NU | IS | SY) CM*) * ((CL | CP) CM*)?  (PR | PO) CM*)?
1333         //      0    0   1       4    4    4      5  5              7    7    7    7      9    9    9     11   11    (match states)
1334         //  retVals array  [0]  index of the start of the match, or -1 if no match
1335         //                 [1]  index of first char following the match.
1336         //  Can not use Java regex because need supplementary character support,
1337         //     and because Unicode char properties version must be the same as in
1338         //     the version of ICU being tested.
LBNumberCheck(StringBuffer s, int startIdx, int[] retVals)1339         private int[] LBNumberCheck(StringBuffer s, int startIdx, int[] retVals) {
1340             if (retVals == null) {
1341                 retVals = new int[2];
1342             }
1343             retVals[0]     = -1;  // Indicates no match.
1344             int matchState = 0;
1345             int idx        = startIdx;
1346 
1347             matchLoop: for (idx = startIdx; idx<s.length(); idx = moveIndex32(s, idx, 1)){
1348                 int c = UTF16.charAt(s, idx);
1349                 int cLBType = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK);
1350                 switch (matchState) {
1351                 case 0:
1352                     if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC ||
1353                     cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
1354                         matchState = 1;
1355                         break;
1356                     }
1357                     if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
1358                         matchState = 4;
1359                         break;
1360                     }
1361                     if (cLBType == UCharacter.LineBreak.HYPHEN) {
1362                         matchState = 4;
1363                         break;
1364                     }
1365                     if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) {
1366                         matchState = 5;
1367                         break;
1368                     }
1369                     if (cLBType == UCharacter.LineBreak.NUMERIC) {
1370                         matchState = 7;
1371                         break;
1372                     }
1373                     break matchLoop;   /* No Match  */
1374 
1375                 case 1:
1376                     if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
1377                         matchState = 1;
1378                         break;
1379                     }
1380                     if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
1381                         matchState = 4;
1382                         break;
1383                     }
1384                     if (cLBType == UCharacter.LineBreak.HYPHEN) {
1385                         matchState = 4;
1386                         break;
1387                     }
1388                     if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) {
1389                         matchState = 5;
1390                         break;
1391                     }
1392                     if (cLBType == UCharacter.LineBreak.NUMERIC) {
1393                         matchState = 7;
1394                         break;
1395                     }
1396                     break matchLoop;   /* No Match  */
1397 
1398                 case 4:
1399                     if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
1400                         matchState = 4;
1401                         break;
1402                     }
1403                     if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) {
1404                         matchState = 5;
1405                         break;
1406                     }
1407                     if (cLBType == UCharacter.LineBreak.NUMERIC) {
1408                         matchState = 7;
1409                         break;
1410                     }
1411                     break matchLoop;   /* No Match  */
1412 
1413                 case 5:
1414                     if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
1415                         matchState = 5;
1416                         break;
1417                     }
1418                     if (cLBType == UCharacter.LineBreak.NUMERIC) {
1419                         matchState = 7;
1420                         break;
1421                     }
1422                     break matchLoop;   /* No Match  */
1423 
1424 
1425                 case 7:
1426                     if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
1427                         matchState = 7;
1428                         break;
1429                     }
1430                     if (cLBType == UCharacter.LineBreak.NUMERIC) {
1431                         matchState = 7;
1432                         break;
1433                     }
1434                     if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) {
1435                         matchState = 7;
1436                         break;
1437                     }
1438                     if (cLBType == UCharacter.LineBreak.BREAK_SYMBOLS) {
1439                         matchState = 7;
1440                         break;
1441                     }
1442                     if (cLBType == UCharacter.LineBreak.CLOSE_PUNCTUATION) {
1443                         matchState = 9;
1444                         break;
1445                     }
1446                     if (cLBType == UCharacter.LineBreak.CLOSE_PARENTHESIS) {
1447                         matchState = 9;
1448                         break;
1449                     }
1450                     if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
1451                         matchState = 11;
1452                         break;
1453                     }
1454                     if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
1455                         matchState = 11;
1456                         break;
1457                     }
1458 
1459                     break matchLoop;    // Match Complete.
1460                 case 9:
1461                     if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
1462                         matchState = 9;
1463                         break;
1464                     }
1465                     if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
1466                         matchState = 11;
1467                         break;
1468                     }
1469                     if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
1470                         matchState = 11;
1471                         break;
1472                     }
1473                     break matchLoop;    // Match Complete.
1474                 case 11:
1475                     if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
1476                         matchState = 11;
1477                         break;
1478                     }
1479                     break matchLoop;    // Match Complete.
1480                 }
1481             }
1482             if (matchState >= 7) {
1483                 retVals[0] = startIdx;
1484                 retVals[1] = idx;
1485             }
1486             return retVals;
1487         }
1488 
1489 
1490         @Override
charClasses()1491         List  charClasses() {
1492             return fSets;
1493         }
1494     }
1495 
1496 
1497     /**
1498      *
1499      * Sentence Monkey Test Class
1500      *
1501      *
1502      *
1503      */
1504     static class RBBISentenceMonkey extends RBBIMonkeyKind {
1505         StringBuffer         fText;
1506 
1507         UnicodeSet           fSepSet;
1508         UnicodeSet           fFormatSet;
1509         UnicodeSet           fSpSet;
1510         UnicodeSet           fLowerSet;
1511         UnicodeSet           fUpperSet;
1512         UnicodeSet           fOLetterSet;
1513         UnicodeSet           fNumericSet;
1514         UnicodeSet           fATermSet;
1515         UnicodeSet           fSContinueSet;
1516         UnicodeSet           fSTermSet;
1517         UnicodeSet           fCloseSet;
1518         UnicodeSet           fOtherSet;
1519         UnicodeSet           fExtendSet;
1520 
RBBISentenceMonkey()1521         RBBISentenceMonkey() {
1522             fCharProperty  = UProperty.SENTENCE_BREAK;
1523 
1524             //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
1525             //                       set and made into character classes of their own.  For the monkey impl,
1526             //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
1527             fSepSet          = new UnicodeSet("[\\p{Sentence_Break = Sep} \\u000a \\u000d]");
1528             fFormatSet       = new UnicodeSet("[\\p{Sentence_Break = Format}]");
1529             fSpSet           = new UnicodeSet("[\\p{Sentence_Break = Sp}]");
1530             fLowerSet        = new UnicodeSet("[\\p{Sentence_Break = Lower}]");
1531             fUpperSet        = new UnicodeSet("[\\p{Sentence_Break = Upper}]");
1532             fOLetterSet      = new UnicodeSet("[\\p{Sentence_Break = OLetter}]");
1533             fNumericSet      = new UnicodeSet("[\\p{Sentence_Break = Numeric}]");
1534             fATermSet        = new UnicodeSet("[\\p{Sentence_Break = ATerm}]");
1535             fSContinueSet    = new UnicodeSet("[\\p{Sentence_Break = SContinue}]");
1536             fSTermSet        = new UnicodeSet("[\\p{Sentence_Break = STerm}]");
1537             fCloseSet        = new UnicodeSet("[\\p{Sentence_Break = Close}]");
1538             fExtendSet       = new UnicodeSet("[\\p{Sentence_Break = Extend}]");
1539             fOtherSet        = new UnicodeSet();
1540 
1541 
1542             fOtherSet.complement();
1543             fOtherSet.removeAll(fSepSet);
1544             fOtherSet.removeAll(fFormatSet);
1545             fOtherSet.removeAll(fSpSet);
1546             fOtherSet.removeAll(fLowerSet);
1547             fOtherSet.removeAll(fUpperSet);
1548             fOtherSet.removeAll(fOLetterSet);
1549             fOtherSet.removeAll(fNumericSet);
1550             fOtherSet.removeAll(fATermSet);
1551             fOtherSet.removeAll(fSContinueSet);
1552             fOtherSet.removeAll(fSTermSet);
1553             fOtherSet.removeAll(fCloseSet);
1554             fOtherSet.removeAll(fExtendSet);
1555 
1556             fSets.add(fSepSet);         fClassNames.add("Sep");
1557             fSets.add(fFormatSet);      fClassNames.add("Format");
1558 
1559             fSets.add(fSpSet);          fClassNames.add("Sp");
1560             fSets.add(fLowerSet);       fClassNames.add("Lower");
1561             fSets.add(fUpperSet);       fClassNames.add("Upper");
1562             fSets.add(fOLetterSet);     fClassNames.add("OLetter");
1563             fSets.add(fNumericSet);     fClassNames.add("Numeric");
1564             fSets.add(fATermSet);       fClassNames.add("ATerm");
1565             fSets.add(fSContinueSet);   fClassNames.add("SContinue");
1566             fSets.add(fSTermSet);       fClassNames.add("STerm");
1567             fSets.add(fCloseSet);       fClassNames.add("Close");
1568             fSets.add(fOtherSet);       fClassNames.add("Other");
1569             fSets.add(fExtendSet);      fClassNames.add("Extend");
1570         }
1571 
1572 
1573         @Override
charClasses()1574         List  charClasses() {
1575             return fSets;
1576         }
1577 
1578         @Override
setText(StringBuffer s)1579         void   setText(StringBuffer s) {
1580             fText = s;
1581             prepareAppliedRules(s.length());
1582         }
1583 
1584 
1585         //      moveBack()   Find the "significant" code point preceding the index i.
1586         //      Skips over ($Extend | $Format)*
1587         //
moveBack(int i)1588         private int moveBack(int i) {
1589 
1590             if (i <= 0) {
1591                 return -1;
1592             }
1593 
1594             int      c;
1595             int      j = i;
1596             do {
1597                 j = moveIndex32(fText, j, -1);
1598                 c = UTF16.charAt(fText, j);
1599             }
1600             while (j>0 &&(fFormatSet.contains(c) || fExtendSet.contains(c)));
1601             return j;
1602         }
1603 
1604 
moveForward(int i)1605         int moveForward(int i) {
1606             if (i>=fText.length()) {
1607                 return fText.length();
1608             }
1609             int   c;
1610             int   j = i;
1611             do {
1612                 j = moveIndex32(fText, j, 1);
1613                 c = cAt(j);
1614             }
1615             while (c>=0 && (fFormatSet.contains(c) || fExtendSet.contains(c)));
1616             return j;
1617 
1618         }
1619 
cAt(int pos)1620         int cAt(int pos) {
1621             if (pos<0 || pos>=fText.length()) {
1622                 return -1;
1623             }
1624             return UTF16.charAt(fText, pos);
1625         }
1626 
1627         @Override
next(int prevPos)1628         int   next(int prevPos) {
1629             int    /*p0,*/ p1, p2, p3;      // Indices of the significant code points around the
1630             //   break position being tested.  The candidate break
1631             //   location is before p2.
1632             int     breakPos = -1;
1633 
1634             int c0, c1, c2, c3;         // The code points at p0, p1, p2 & p3.
1635             int c;
1636 
1637             // Prev break at end of string.  return DONE.
1638             if (prevPos >= fText.length()) {
1639                 return -1;
1640             }
1641             /*p0 =*/ p1 = p2 = p3 = prevPos;
1642             c3 = UTF16.charAt(fText, prevPos);
1643             c0 = c1 = c2 = 0;
1644 
1645             // Loop runs once per "significant" character position in the input text.
1646             for (;;) {
1647                 // Move all of the positions forward in the input string.
1648                 /*p0 = p1;*/  c0 = c1;
1649                 p1 = p2;  c1 = c2;
1650                 p2 = p3;  c2 = c3;
1651 
1652                 // Advance p3 by  X(Extend | Format)*   Rule 4
1653                 p3 = moveForward(p3);
1654                 c3 = cAt(p3);
1655 
1656                 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
1657                     setAppliedRule(p2, "SB3   CR x LF");
1658                     continue;
1659                 }
1660 
1661                 if (fSepSet.contains(c1)) {
1662                     p2 = p1+1;   // Separators don't combine with Extend or Format
1663                     setAppliedRule(p2, "SB4   Sep  <break>");
1664                     break;
1665                 }
1666 
1667                 if (p2 >= fText.length()) {
1668                     // Reached end of string.  Always a break position.
1669                     setAppliedRule(p2, "SB4   Sep  <break>");
1670                     break;
1671                 }
1672 
1673                 if (p2 == prevPos) {
1674                     // Still warming up the loop.  (won't work with zero length strings, but we don't care)
1675                     setAppliedRule(p2, "SB4   Sep  <break>");
1676                     continue;
1677                 }
1678 
1679                 if (fATermSet.contains(c1) &&  fNumericSet.contains(c2))  {
1680                     setAppliedRule(p2, "SB6   ATerm x Numeric");
1681                     continue;
1682                 }
1683 
1684                 if ((fUpperSet.contains(c0) || fLowerSet.contains(c0)) &&
1685                         fATermSet.contains(c1) && fUpperSet.contains(c2)) {
1686                     setAppliedRule(p2, "SB7   (Upper | Lower) ATerm  x  Uppper");
1687                     continue;
1688                 }
1689 
1690                 //           Note:  Sterm | ATerm are added to the negated part of the expression by a
1691                 //                  note to the Unicode 5.0 documents.
1692                 int p8 = p1;
1693                 while (p8>0 && fSpSet.contains(cAt(p8))) {
1694                     p8 = moveBack(p8);
1695                 }
1696                 while (p8>0 && fCloseSet.contains(cAt(p8))) {
1697                     p8 = moveBack(p8);
1698                 }
1699                 if (fATermSet.contains(cAt(p8))) {
1700                     p8=p2;
1701                     for (;;) {
1702                         c = cAt(p8);
1703                         if (c==-1 || fOLetterSet.contains(c) || fUpperSet.contains(c) ||
1704                                 fLowerSet.contains(c) || fSepSet.contains(c) ||
1705                                 fATermSet.contains(c) || fSTermSet.contains(c))
1706                         {
1707                             setAppliedRule(p2, "SB8   ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep))* Lower");
1708                             break;
1709                         }
1710                         p8 = moveForward(p8);
1711                     }
1712                     if (p8<fText.length() && fLowerSet.contains(cAt(p8))) {
1713                         setAppliedRule(p2, "SB8   ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep))* Lower");
1714                         continue;
1715                     }
1716                 }
1717 
1718                 if (fSContinueSet.contains(c2) || fSTermSet.contains(c2) || fATermSet.contains(c2)) {
1719                     p8 = p1;
1720                     while (setContains(fSpSet, cAt(p8))) {
1721                         p8 = moveBack(p8);
1722                     }
1723                     while (setContains(fCloseSet, cAt(p8))) {
1724                         p8 = moveBack(p8);
1725                     }
1726                     c = cAt(p8);
1727                     if (setContains(fSTermSet, c) || setContains(fATermSet, c)) {
1728                         setAppliedRule(p2, "SB8a  (STerm | ATerm) Close* Sp* x (SContinue | Sterm | ATerm)");
1729                         continue;
1730                     }
1731                 }
1732 
1733 
1734                 int p9 = p1;
1735                 while (p9>0 && fCloseSet.contains(cAt(p9))) {
1736                     p9 = moveBack(p9);
1737                 }
1738                 c = cAt(p9);
1739                 if ((fSTermSet.contains(c) || fATermSet.contains(c))) {
1740                     if (fCloseSet.contains(c2) || fSpSet.contains(c2) || fSepSet.contains(c2)) {
1741                         setAppliedRule(p2, "SB9   (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)");
1742                         continue;
1743                     }
1744                 }
1745 
1746                 int p10 = p1;
1747                 while (p10>0 && fSpSet.contains(cAt(p10))) {
1748                     p10 = moveBack(p10);
1749                 }
1750                 while (p10>0 && fCloseSet.contains(cAt(p10))) {
1751                     p10 = moveBack(p10);
1752                 }
1753                 if (fSTermSet.contains(cAt(p10)) || fATermSet.contains(cAt(p10))) {
1754                     if (fSpSet.contains(c2) || fSepSet.contains(c2)) {
1755                         setAppliedRule(p2, "SB10  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)");
1756                         continue;
1757                     }
1758                 }
1759 
1760                 int p11 = p1;
1761                 if (p11>0 && fSepSet.contains(cAt(p11))) {
1762                     p11 = moveBack(p11);
1763                 }
1764                 while (p11>0 && fSpSet.contains(cAt(p11))) {
1765                     p11 = moveBack(p11);
1766                 }
1767                 while (p11>0 && fCloseSet.contains(cAt(p11))) {
1768                     p11 = moveBack(p11);
1769                 }
1770                 if (fSTermSet.contains(cAt(p11)) || fATermSet.contains(cAt(p11))) {
1771                     setAppliedRule(p2, "SB11  (STerm | ATerm) Close* Sp*   <break>");
1772                     break;
1773                 }
1774 
1775                 setAppliedRule(p2, "SB12  Any x Any");
1776                 continue;
1777             }
1778             breakPos = p2;
1779             return breakPos;
1780         }
1781     }
1782 
1783 
1784     /**
1785      * Move an index into a string by n code points.
1786      *   Similar to UTF16.moveCodePointOffset, but without the exceptions, which were
1787      *   complicating usage.
1788      * @param s   a Text string
1789      * @param pos The starting code unit index into the text string
1790      * @param amt The amount to adjust the string by.
1791      * @return    The adjusted code unit index, pinned to the string's length, or
1792      *            unchanged if input index was outside of the string.
1793      */
moveIndex32(StringBuffer s, int pos, int amt)1794     static int moveIndex32(StringBuffer s, int pos, int amt) {
1795         int i;
1796         char  c;
1797         if (amt>0) {
1798             for (i=0; i<amt; i++) {
1799                 if (pos >= s.length()) {
1800                     return s.length();
1801                 }
1802                 c = s.charAt(pos);
1803                 pos++;
1804                 if (UTF16.isLeadSurrogate(c) && pos < s.length()) {
1805                     c = s.charAt(pos);
1806                     if (UTF16.isTrailSurrogate(c)) {
1807                         pos++;
1808                     }
1809                 }
1810             }
1811         } else {
1812             for (i=0; i>amt; i--) {
1813                 if (pos <= 0) {
1814                     return 0;
1815                 }
1816                 pos--;
1817                 c = s.charAt(pos);
1818                 if (UTF16.isTrailSurrogate(c) && pos >= 0) {
1819                     c = s.charAt(pos);
1820                     if (UTF16.isLeadSurrogate(c)) {
1821                         pos--;
1822                     }
1823                 }
1824             }
1825         }
1826         return pos;
1827     }
1828 
1829     /**
1830      * No-exceptions form of UnicodeSet.contains(c).
1831      *    Simplifies loops that terminate with an end-of-input character value.
1832      * @param s  A unicode set
1833      * @param c  A code point value
1834      * @return   true if the set contains c.
1835      */
setContains(UnicodeSet s, int c)1836     static boolean setContains(UnicodeSet s, int c) {
1837         if (c<0 || c>UTF16.CODEPOINT_MAX_VALUE ) {
1838             return false;
1839         }
1840         return s.contains(c);
1841     }
1842 
1843 
1844     /**
1845      * return the index of the next code point in the input text.
1846      * @param i the preceding index
1847      */
nextCP(StringBuffer s, int i)1848     static int  nextCP(StringBuffer s, int i) {
1849         if (i == -1) {
1850             // End of Input indication.  Continue to return end value.
1851             return -1;
1852         }
1853         int  retVal = i + 1;
1854         if (retVal > s.length()) {
1855             return -1;
1856         }
1857         int  c = UTF16.charAt(s, i);
1858         if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && UTF16.isLeadSurrogate(s.charAt(i))) {
1859             retVal++;
1860         }
1861         return retVal;
1862     }
1863 
1864 
1865     /**
1866      * random number generator.  Not using Java's built-in Randoms for two reasons:
1867      *    1.  Using this code allows obtaining the same sequences as those from the ICU4C monkey test.
1868      *    2.  We need to get and restore the seed from values occurring in the middle
1869      *        of a long sequence, to more easily reproduce failing cases.
1870      */
1871     private static int m_seed = 1;
m_rand()1872     private static int  m_rand()
1873     {
1874         m_seed = m_seed * 1103515245 + 12345;
1875         return (m_seed >>> 16) % 32768;
1876     }
1877 
1878     // Helper function for formatting error output.
1879     //   Append a string into a fixed-size field in a StringBuffer.
1880     //   Blank-pad the string if it is shorter than the field.
1881     //   Truncate the source string if it is too long.
1882     //
appendToBuf(StringBuffer dest, String src, int fieldLen)1883     private static void appendToBuf(StringBuffer dest, String src, int fieldLen) {
1884         int appendLen = src.length();
1885         if (appendLen >= fieldLen) {
1886             dest.append(src.substring(0, fieldLen));
1887         } else {
1888             dest.append(src);
1889             while (appendLen < fieldLen) {
1890                 dest.append(' ');
1891                 appendLen++;
1892             }
1893         }
1894     }
1895 
1896     // Helper function for formatting error output.
1897     // Display a code point in "\\uxxxx" or "\Uxxxxxxxx" format
1898     @SuppressWarnings("unused")
appendCharToBuf(StringBuffer dest, int c, int fieldLen)1899     private static void appendCharToBuf(StringBuffer dest, int c, int fieldLen) {
1900         String hexChars = "0123456789abcdef";
1901         if (c < 0x10000) {
1902             dest.append("\\u");
1903             for (int bn=12; bn>=0; bn-=4) {
1904                 dest.append(hexChars.charAt(((c)>>bn)&0xf));
1905             }
1906             appendToBuf(dest, " ", fieldLen-6);
1907         } else {
1908             dest.append("\\U");
1909             for (int bn=28; bn>=0; bn-=4) {
1910                 dest.append(hexChars.charAt(((c)>>bn)&0xf));
1911             }
1912             appendToBuf(dest, " ", fieldLen-10);
1913 
1914         }
1915     }
1916 
1917     /**
1918      *  Run a RBBI monkey test.  Common routine, for all break iterator types.
1919      *    Parameters:
1920      *       bi      - the break iterator to use
1921      *       mk      - MonkeyKind, abstraction for obtaining expected results
1922      *       name    - Name of test (char, word, etc.) for use in error messages
1923      *       seed    - Seed for starting random number generator (parameter from user)
1924      *       numIterations
1925      */
RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int numIterations)1926     void RunMonkey(BreakIterator  bi, RBBIMonkeyKind mk, String name, int  seed, int numIterations) {
1927         int              TESTSTRINGLEN = 500;
1928         StringBuffer     testText         = new StringBuffer();
1929         int              numCharClasses;
1930         List             chClasses;
1931         @SuppressWarnings("unused")
1932         int              expectedCount    = 0;
1933         boolean[]        expectedBreaks   = new boolean[TESTSTRINGLEN*2 + 1];
1934         boolean[]        forwardBreaks    = new boolean[TESTSTRINGLEN*2 + 1];
1935         boolean[]        reverseBreaks    = new boolean[TESTSTRINGLEN*2 + 1];
1936         boolean[]        isBoundaryBreaks = new boolean[TESTSTRINGLEN*2 + 1];
1937         boolean[]        followingBreaks  = new boolean[TESTSTRINGLEN*2 + 1];
1938         boolean[]        precedingBreaks  = new boolean[TESTSTRINGLEN*2 + 1];
1939         int              i;
1940         int              loopCount        = 0;
1941         boolean          printTestData    = false;
1942         boolean          printBreaksFromBI = false;
1943 
1944         m_seed = seed;
1945 
1946         numCharClasses = mk.charClasses().size();
1947         chClasses      = mk.charClasses();
1948 
1949         // Verify that the character classes all have at least one member.
1950         for (i=0; i<numCharClasses; i++) {
1951             UnicodeSet s = (UnicodeSet)chClasses.get(i);
1952             if (s == null || s.size() == 0) {
1953                 errln("Character Class " + i + " is null or of zero size.");
1954                 return;
1955             }
1956         }
1957 
1958         //--------------------------------------------------------------------------------------------
1959         //
1960         //  Debugging settings.  Comment out everything in the following block for normal operation
1961         //
1962         //--------------------------------------------------------------------------------------------
1963         // numIterations = -1;
1964         // numIterations = 10000;   // Same as exhaustive.
1965         // RuleBasedBreakIterator_New.fTrace = true;
1966         // m_seed = 859056465;
1967         // TESTSTRINGLEN = 50;
1968         // printTestData = true;
1969         // printBreaksFromBI = true;
1970         // ((RuleBasedBreakIterator_New)bi).dump();
1971 
1972         //--------------------------------------------------------------------------------------------
1973         //
1974         //  End of Debugging settings.
1975         //
1976         //--------------------------------------------------------------------------------------------
1977 
1978         // For minimizing width of class name output.
1979         int classNameSize = mk.maxClassNameSize();
1980 
1981         int  dotsOnLine = 0;
1982         while (loopCount < numIterations || numIterations == -1) {
1983             if (numIterations == -1 && loopCount % 10 == 0) {
1984                 // If test is running in an infinite loop, display a periodic tic so
1985                 //   we can tell that it is making progress.
1986                 System.out.print(".");
1987                 if (dotsOnLine++ >= 80){
1988                     System.out.println();
1989                     dotsOnLine = 0;
1990                 }
1991             }
1992             // Save current random number seed, so that we can recreate the random numbers
1993             //   for this loop iteration in event of an error.
1994             seed = m_seed;
1995 
1996             testText.setLength(0);
1997             // Populate a test string with data.
1998             if (printTestData) {
1999                 System.out.println("Test Data string ...");
2000             }
2001             for (i=0; i<TESTSTRINGLEN; i++) {
2002                 int        aClassNum = m_rand() % numCharClasses;
2003                 UnicodeSet classSet  = (UnicodeSet)chClasses.get(aClassNum);
2004                 int        charIdx   = m_rand() % classSet.size();
2005                 int        c         = classSet.charAt(charIdx);
2006                 if (c < 0) {   // TODO:  deal with sets containing strings.
2007                     errln("c < 0");
2008                 }
2009                 // Do not assemble a supplementary character from randomly generated separate surrogates.
2010                 //   (It could be a dictionary character)
2011                 if (c < 0x10000 && Character.isLowSurrogate((char)c) && testText.length() > 0 &&
2012                         Character.isHighSurrogate(testText.charAt(testText.length()-1))) {
2013                     continue;
2014                 }
2015                 testText.appendCodePoint(c);
2016                 if (printTestData) {
2017                     System.out.print(Integer.toHexString(c) + " ");
2018                 }
2019             }
2020             if (printTestData) {
2021                 System.out.println();
2022             }
2023 
2024             Arrays.fill(expectedBreaks, false);
2025             Arrays.fill(forwardBreaks, false);
2026             Arrays.fill(reverseBreaks, false);
2027             Arrays.fill(isBoundaryBreaks, false);
2028             Arrays.fill(followingBreaks, false);
2029             Arrays.fill(precedingBreaks, false);
2030 
2031             // Calculate the expected results for this test string and reset applied rules.
2032             mk.setText(testText);
2033             expectedCount = 0;
2034             expectedBreaks[0] = true;
2035             int breakPos = 0;
2036             int lastBreakPos = -1;
2037             for (;;) {
2038                 lastBreakPos = breakPos;
2039                 breakPos = mk.next(breakPos);
2040                 if (breakPos == -1) {
2041                     break;
2042                 }
2043                 if (breakPos > testText.length()) {
2044                     errln("breakPos > testText.length()");
2045                 }
2046                 if (lastBreakPos >= breakPos) {
2047                     errln("Next() not increasing.");
2048                     // break;
2049                 }
2050                 expectedBreaks[breakPos] = true;
2051             }
2052 
2053             // Find the break positions using forward iteration
2054             if (printBreaksFromBI) {
2055                 System.out.println("Breaks from BI...");
2056             }
2057             bi.setText(testText.toString());
2058             for (i=bi.first(); i != BreakIterator.DONE; i=bi.next()) {
2059                 if (i < 0 || i > testText.length()) {
2060                     errln(name + " break monkey test: Out of range value returned by breakIterator::next()");
2061                     break;
2062                 }
2063                 if (printBreaksFromBI) {
2064                     System.out.print(Integer.toHexString(i) + " ");
2065                 }
2066                 forwardBreaks[i] = true;
2067             }
2068             if (printBreaksFromBI) {
2069                 System.out.println();
2070             }
2071 
2072             // Find the break positions using reverse iteration
2073             for (i=bi.last(); i != BreakIterator.DONE; i=bi.previous()) {
2074                 if (i < 0 || i > testText.length()) {
2075                     errln(name + " break monkey test: Out of range value returned by breakIterator.next()" + name);
2076                     break;
2077                 }
2078                 reverseBreaks[i] = true;
2079             }
2080 
2081             // Find the break positions using isBoundary() tests.
2082             for (i=0; i<=testText.length(); i++) {
2083                 isBoundaryBreaks[i] = bi.isBoundary(i);
2084             }
2085 
2086             // Find the break positions using the following() function.
2087             lastBreakPos = 0;
2088             followingBreaks[0] = true;
2089             for (i=0; i<testText.length(); i++) {
2090                 breakPos = bi.following(i);
2091                 if (breakPos <= i ||
2092                         breakPos < lastBreakPos ||
2093                         breakPos > testText.length() ||
2094                         breakPos > lastBreakPos && lastBreakPos > i ) {
2095                     errln(name + " break monkey test: " +
2096                             "Out of range value returned by BreakIterator::following().\n" +
2097                             "index=" + i + "following returned=" + breakPos +
2098                             "lastBreak=" + lastBreakPos);
2099                     precedingBreaks[i] = !expectedBreaks[i];   // Forces an error.
2100                 } else {
2101                     followingBreaks[breakPos] = true;
2102                     lastBreakPos = breakPos;
2103                 }
2104             }
2105 
2106             // Find the break positions using the preceding() function.
2107             lastBreakPos = testText.length();
2108             precedingBreaks[testText.length()] = true;
2109             for (i=testText.length(); i>0; i--) {
2110                 breakPos = bi.preceding(i);
2111                 if (breakPos >= i ||
2112                         breakPos > lastBreakPos ||
2113                         breakPos < 0 ||
2114                         breakPos < lastBreakPos && lastBreakPos < i ) {
2115                     errln(name + " break monkey test: " +
2116                             "Out of range value returned by BreakIterator::preceding().\n" +
2117                             "index=" + i + "preceding returned=" + breakPos +
2118                             "lastBreak=" + lastBreakPos);
2119                     precedingBreaks[i] = !expectedBreaks[i];   // Forces an error.
2120                 } else {
2121                     precedingBreaks[breakPos] = true;
2122                     lastBreakPos = breakPos;
2123                 }
2124             }
2125 
2126 
2127 
2128             // Compare the expected and actual results.
2129             for (i=0; i<=testText.length(); i++) {
2130                 String errorType = null;
2131                 boolean[] currentBreakData = null;
2132                 if  (forwardBreaks[i] != expectedBreaks[i]) {
2133                     errorType = "next()";
2134                     currentBreakData = forwardBreaks;
2135                 } else if (reverseBreaks[i] != forwardBreaks[i]) {
2136                     errorType = "previous()";
2137                     currentBreakData = reverseBreaks;
2138                 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
2139                     errorType = "isBoundary()";
2140                     currentBreakData = isBoundaryBreaks;
2141                 } else if (followingBreaks[i] != expectedBreaks[i]) {
2142                     errorType = "following()";
2143                     currentBreakData = followingBreaks;
2144                 } else if (precedingBreaks[i] != expectedBreaks[i]) {
2145                     errorType = "preceding()";
2146                     currentBreakData = precedingBreaks;
2147                 }
2148 
2149                 if (errorType != null) {
2150                     // Format a range of the test text that includes the failure as
2151                     //  a data item that can be included in the rbbi test data file.
2152 
2153                     // Start of the range is the last point where expected and actual results
2154                     //   both agreed that there was a break position.
2155                     int startContext = i;
2156                     int count = 0;
2157                     for (;;) {
2158                         if (startContext==0) { break; }
2159                         startContext --;
2160                         if (expectedBreaks[startContext]) {
2161                             if (count == 2) break;
2162                             count ++;
2163                         }
2164                     }
2165 
2166                     // End of range is two expected breaks past the start position.
2167                     int endContext = i + 1;
2168                     int ci;
2169                     for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
2170                         for (;;) {
2171                             if (endContext >= testText.length()) {break;}
2172                             if (expectedBreaks[endContext-1]) {
2173                                 if (count == 0) break;
2174                                 count --;
2175                             }
2176                             endContext ++;
2177                         }
2178                     }
2179 
2180                     // Formatting of each line includes:
2181                     //   character code
2182                     //   reference break: '|' -> a break, '.' -> no break
2183                     //   actual break:    '|' -> a break, '.' -> no break
                    //   (name of character class)
2185                     //   Unicode name of character
2186                     //   '--→' indicates location of the difference.
2187 
2188                     StringBuilder buffer = new StringBuilder();
2189                     buffer.append("\n")
2190                         .append((expectedBreaks[i] ? "Break expected but not found." : "Break found but not expected."))
2191                         .append(
2192                             String.format(" at index %d. Parameters to reproduce: @\"type=%s  seed=%d  loop=1\"\n",
2193                               i, name, seed));
2194 
2195                     int c;  // Char from test data
2196                     for (ci = startContext;  ci <= endContext && ci != -1;  ci = nextCP(testText, ci)) {
2197 
2198                         c = testText.codePointAt(ci);
2199                         buffer.append((ci == i) ? " --→" : "    ")
2200                             .append(String.format(" %3d : ", ci))
2201                             .append(!expectedBreaks[ci] ? " . " : " | ")  // Reference break
2202                             .append(!currentBreakData[ci] ? " . " : " | "); // Actual break
2203 
2204                         // BMP or SMP character in hex
2205                         if (c >= 0x10000) {
2206                             buffer.append("\\U").append(String.format("%08x", c));
2207                         } else {
2208                             buffer.append("    \\u").append(String.format("%04x", c));
2209                         }
2210 
2211                         buffer.append(
2212                             String.format(String.format(" %%-%ds", classNameSize),
2213                               mk.classNameFromCodepoint(c)))
2214                             .append(String.format(" %-40s", mk.getAppliedRule(ci)))
2215                             .append(String.format(" %-40s\n", UCharacter.getExtendedName(c)));
2216 
2217                         if (ci >= endContext) { break; }
2218                     }
2219                     errln(buffer.toString());
2220 
2221                     break;
2222                 }
2223             }
2224 
2225             loopCount++;
2226         }
2227     }
2228 
2229     // Test parameters are passed on the command line, or
2230     // via the Eclipse Run Configuration settings, arguments tab, VM parameters.
2231     // For example,
2232     //      -ea -Dseed=554654 -Dloop=1
2233 
2234     @Test
TestCharMonkey()2235     public void TestCharMonkey() {
2236         int loopCount = getIntProperty("loop", isQuick() ? 500 : 10000);
2237         int seed = getIntProperty("seed", 1);
2238 
2239         RBBICharMonkey  m = new RBBICharMonkey();
2240         BreakIterator   bi = BreakIterator.getCharacterInstance(Locale.US);
2241         RunMonkey(bi, m, "char", seed, loopCount);
2242     }
2243 
2244     @Test
TestWordMonkey()2245     public void TestWordMonkey() {
2246         int loopCount = getIntProperty("loop", isQuick() ? 500 : 10000);
2247         int seed = getIntProperty("seed", 1);
2248 
2249         logln("Word Break Monkey Test");
2250         RBBIWordMonkey  m = new RBBIWordMonkey();
2251         BreakIterator   bi = BreakIterator.getWordInstance(Locale.US);
2252         RunMonkey(bi, m, "word", seed, loopCount);
2253     }
2254 
2255     @Test
TestLineMonkey()2256     public void TestLineMonkey() {
2257         int loopCount = getIntProperty("loop", isQuick() ? 500 : 10000);
2258         int seed = getIntProperty("seed", 1);
2259 
2260         logln("Line Break Monkey Test");
2261         RBBILineMonkey  m = new RBBILineMonkey();
2262         BreakIterator   bi = BreakIterator.getLineInstance(Locale.US);
2263         RunMonkey(bi, m, "line", seed, loopCount);
2264     }
2265 
2266     @Test
TestSentMonkey()2267     public void TestSentMonkey() {
2268         int loopCount = getIntProperty("loop", isQuick() ? 500 : 3000);
2269         int seed = getIntProperty("seed", 1);
2270 
2271         logln("Sentence Break Monkey Test");
2272         RBBISentenceMonkey  m = new RBBISentenceMonkey();
2273         BreakIterator   bi = BreakIterator.getSentenceInstance(Locale.US);
2274         RunMonkey(bi, m, "sent", seed, loopCount);
2275     }
2276     //
2277     //  Round-trip monkey tests.
2278     //  Verify that break iterators created from the rule source from the default
2279     //    break iterators still pass the monkey test for the iterator type.
2280     //
2281     //  This is a major test for the Rule Compiler.  The default break iterators are built
2282     //  from pre-compiled binary rule data that was created using ICU4C; these
2283     //  round-trip rule recompile tests verify that the Java rule compiler can
2284     //  rebuild break iterators from the original source rules.
2285     //
2286     @Test
TestRTCharMonkey()2287     public void TestRTCharMonkey() {
2288         int loopCount = getIntProperty("loop", isQuick() ? 200 : 2000);
2289         int seed = getIntProperty("seed", 1);
2290 
2291         RBBICharMonkey  m = new RBBICharMonkey();
2292         BreakIterator   bi = BreakIterator.getCharacterInstance(Locale.US);
2293         String rules = bi.toString();
2294         BreakIterator rtbi = new RuleBasedBreakIterator(rules);
2295         RunMonkey(rtbi, m, "char", seed, loopCount);
2296     }
2297 
2298     @Test
TestRTWordMonkey()2299     public void TestRTWordMonkey() {
2300         int loopCount = getIntProperty("loop", isQuick() ? 200 : 2000);
2301         int seed = getIntProperty("seed", 1);
2302 
2303         logln("Word Break Monkey Test");
2304         RBBIWordMonkey  m = new RBBIWordMonkey();
2305         BreakIterator   bi = BreakIterator.getWordInstance(Locale.US);
2306         String rules = bi.toString();
2307         BreakIterator rtbi = new RuleBasedBreakIterator(rules);
2308         RunMonkey(rtbi, m, "word", seed, loopCount);
2309     }
2310 
2311     @Test
TestRTLineMonkey()2312     public void TestRTLineMonkey() {
2313         int loopCount = getIntProperty("loop", isQuick() ? 200 : 2000);
2314         int seed = getIntProperty("seed", 1);
2315 
2316         logln("Line Break Monkey Test");
2317         RBBILineMonkey  m = new RBBILineMonkey();
2318         BreakIterator   bi = BreakIterator.getLineInstance(Locale.US);
2319         String rules = bi.toString();
2320         BreakIterator rtbi = new RuleBasedBreakIterator(rules);
2321         RunMonkey(rtbi, m, "line", seed, loopCount);
2322     }
2323 
2324     @Test
TestRTSentMonkey()2325     public void TestRTSentMonkey() {
2326         int loopCount = getIntProperty("loop", isQuick() ? 200 : 1000);
2327         int seed = getIntProperty("seed", 1);
2328 
2329         logln("Sentence Break Monkey Test");
2330         RBBISentenceMonkey  m = new RBBISentenceMonkey();
2331         BreakIterator   bi = BreakIterator.getSentenceInstance(Locale.US);
2332         String rules = bi.toString();
2333         BreakIterator rtbi = new RuleBasedBreakIterator(rules);
2334         RunMonkey(rtbi, m, "sent", seed, loopCount);
2335     }
2336 }
2337 
2338