• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  *******************************************************************************
5  * Copyright (C) 2003-2016 International Business Machines Corporation and
6  * others. All Rights Reserved.
7  *******************************************************************************
8  */
9 package com.ibm.icu.dev.test.rbbi;
10 
11 
12 // Monkey testing of RuleBasedBreakIterator.
13 //    The old, original monkey test. TODO: remove
14 //    The new monkey test is class RBBIMonkeyTest.
15 
16 import java.util.ArrayList;
17 import java.util.Arrays;
18 import java.util.List;
19 import java.util.Locale;
20 
21 import org.junit.Test;
22 import org.junit.runner.RunWith;
23 import org.junit.runners.JUnit4;
24 
25 import com.ibm.icu.dev.test.TestFmwk;
26 import com.ibm.icu.lang.UCharacter;
27 import com.ibm.icu.lang.UProperty;
28 import com.ibm.icu.text.BreakIterator;
29 import com.ibm.icu.text.RuleBasedBreakIterator;
30 import com.ibm.icu.text.UTF16;
31 import com.ibm.icu.text.UnicodeSet;
32 
33 
34 /**
35  * Monkey tests for RBBI.  These tests have independent implementations of
36  * the Unicode TR boundary rules, and compare results between these and ICU's
37  * implementation, using random data.
38  *
39  * Tests cover Grapheme Cluster (char), Word and Line breaks
40  *
41  * Ported from ICU4C, original code in file source/test/intltest/rbbitst.cpp
42  *
43  */
44 @RunWith(JUnit4.class)
45 public class RBBITestMonkey extends TestFmwk {
46     //
47     //     class RBBIMonkeyKind
48     //
49     //        Monkey Test for Break Iteration
50     //        Abstract interface class.   Concrete derived classes independently
51     //        implement the break rules for different iterator types.
52     //
53     //        The Monkey Test itself doesn't know which type of break iterator it is
54     //        testing, but works purely in terms of the interface defined here.
55     //
56     abstract static class RBBIMonkeyKind {
RBBIMonkeyKind()57         RBBIMonkeyKind() {
58             fSets = new  ArrayList();
59             fClassNames = new ArrayList();
60             fAppliedRules = new ArrayList();
61         }
62 
63         // Return a List of UnicodeSets, representing the character classes used
64         //   for this type of iterator.
charClasses()65         abstract  List  charClasses();
66 
67         // Set the test text on which subsequent calls to next() will operate
setText(StringBuffer text)68         abstract  void   setText(StringBuffer text);
69 
70         // Find the next break position, starting from the specified position.
71         // Return -1 after reaching end of string.
next(int i)72         abstract   int   next(int i);
73 
74         // Name of each character class, parallel with charClasses. Used for debugging output
75         // of characters.
characterClassNames()76         List<String> characterClassNames() {
77             return fClassNames;
78         }
79 
setAppliedRule(int position, String value)80         void setAppliedRule(int position, String value) {
81             fAppliedRules.set(position, value);
82         }
83 
getAppliedRule(int position)84         String getAppliedRule(int position) {
85             return fAppliedRules.get(position);
86         }
87 
classNameFromCodepoint(int c)88         String classNameFromCodepoint(int c) {
89             // Simply iterate through fSets to find character's class
90             for (int aClassNum = 0; aClassNum < charClasses().size(); aClassNum++) {
91                 UnicodeSet classSet = (UnicodeSet)charClasses().get(aClassNum);
92                 if (classSet.contains(c)) {
93                     return fClassNames.get(aClassNum);
94                 }
95             }
96             return "bad class name";
97         }
98 
maxClassNameSize()99         int maxClassNameSize() {
100             int maxSize = 0;
101             for (int aClassNum = 0; aClassNum < charClasses().size(); aClassNum++) {
102                 if (fClassNames.get(aClassNum).length() > maxSize) {
103                     maxSize = fClassNames.get(aClassNum).length();
104                 }
105             }
106             return maxSize;
107         }
108 
109         // Clear `appliedRules` and fill it with empty strings in the size of test text.
prepareAppliedRules(int size)110         void prepareAppliedRules(int size) {
111             // Remove all the information in the `appliedRules`.
112             fAppliedRules.clear();
113             fAppliedRules.ensureCapacity(size + 1);
114             while (fAppliedRules.size() < size + 1) {
115                 fAppliedRules.add("");
116             }
117         }
118 
119         // A Character Property, one of the constants defined in class UProperty.
120         //   The value of this property will be displayed for the characters
121         //    near any test failure.
122         int   fCharProperty;
123 
124         List fSets;
125         ArrayList<String> fClassNames;
126         ArrayList<String> fAppliedRules;
127     }
128 
129     /**
130      * Monkey test subclass for testing Character (Grapheme Cluster) boundaries.
131      * Note: fPrependSet was empty as of Unicode 6.1; it is added to fSets only when it is non-empty.
132      */
133     static class RBBICharMonkey extends RBBIMonkeyKind {
134         UnicodeSet                fCRLFSet;
135         UnicodeSet                fControlSet;
136         UnicodeSet                fExtendSet;
137         UnicodeSet                fRegionalIndicatorSet;
138         UnicodeSet                fPrependSet;
139         UnicodeSet                fSpacingSet;
140         UnicodeSet                fLSet;
141         UnicodeSet                fVSet;
142         UnicodeSet                fTSet;
143         UnicodeSet                fLVSet;
144         UnicodeSet                fLVTSet;
145         UnicodeSet                fHangulSet;
146         UnicodeSet                fZWJSet;
147         UnicodeSet                fExtendedPictSet;
148         UnicodeSet                fViramaSet;
149         UnicodeSet                fLinkingConsonantSet;
150         UnicodeSet                fExtCccZwjSet;
151         UnicodeSet                fAnySet;
152 
153 
154         StringBuffer              fText;
155 
RBBICharMonkey()156         RBBICharMonkey() {
157             fText       = null;
158             fCharProperty = UProperty.GRAPHEME_CLUSTER_BREAK;
159             fCRLFSet    = new UnicodeSet("[\\r\\n]");
160             fControlSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Control}]");
161             fExtendSet  = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]");
162             fZWJSet     = new UnicodeSet("[\\p{Grapheme_Cluster_Break = ZWJ}]");
163             fRegionalIndicatorSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]");
164             fPrependSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Prepend}]");
165             fSpacingSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = SpacingMark}]");
166             fLSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = L}]");
167             fVSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = V}]");
168             fTSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = T}]");
169             fLVSet      = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LV}]");
170             fLVTSet     = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LVT}]");
171             fHangulSet  = new UnicodeSet();
172             fHangulSet.addAll(fLSet);
173             fHangulSet.addAll(fVSet);
174             fHangulSet.addAll(fTSet);
175             fHangulSet.addAll(fLVSet);
176             fHangulSet.addAll(fLVTSet);
177 
178             fExtendedPictSet  = new UnicodeSet("[:Extended_Pictographic:]");
179             fViramaSet        = new UnicodeSet("[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
180                                                + "\\p{Indic_Syllabic_Category=Virama}]");
181             fLinkingConsonantSet = new UnicodeSet("[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
182                                                   + "\\p{Indic_Syllabic_Category=Consonant}]");
183             fExtCccZwjSet     = new UnicodeSet("[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]");
184             fAnySet           = new UnicodeSet("[\\u0000-\\U0010ffff]");
185 
186 
187             fSets.add(fCRLFSet);               fClassNames.add("CRLF");
188             fSets.add(fControlSet);            fClassNames.add("Control");
189             fSets.add(fExtendSet);             fClassNames.add("Extended");
190             fSets.add(fRegionalIndicatorSet);  fClassNames.add("RegionalIndicator");
191             if (!fPrependSet.isEmpty()) {
192                 fSets.add(fPrependSet);        fClassNames.add("Prepend");
193             }
194             fSets.add(fSpacingSet);            fClassNames.add("Spacing");
195             fSets.add(fHangulSet);             fClassNames.add("Hangul");
196             fSets.add(fAnySet);                fClassNames.add("Any");
197             fSets.add(fZWJSet);                fClassNames.add("ZWJ");
198             fSets.add(fExtendedPictSet);       fClassNames.add("ExtendedPict");
199             fSets.add(fViramaSet);             fClassNames.add("Virama");
200             fSets.add(fLinkingConsonantSet);   fClassNames.add("LinkingConsonant");
201             fSets.add(fExtCccZwjSet);          fClassNames.add("ExtCccZwj");
202         }
203 
204 
205         @Override
setText(StringBuffer s)206         void setText(StringBuffer s) {
207             fText = s;
208             prepareAppliedRules(s.length());
209         }
210 
211         @Override
charClasses()212         List charClasses() {
213             return fSets;
214         }
215 
216         @Override
next(int prevPos)217         int next(int prevPos) {
218             int    /*p0,*/ p1, p2, p3;    // Indices of the significant code points around the
219             //   break position being tested.  The candidate break
220             //   location is before p2.
221 
222             int     breakPos = -1;
223 
224             int   c0, c1, c2, c3;     // The code points at p0, p1, p2 & p3.
225             int   cBase;              // for (X Extend*) patterns, the X character.
226 
227             // Previous break at end of string.  return DONE.
228             if (prevPos >= fText.length()) {
229                 return -1;
230             }
231             /* p0 = */ p1 = p2 = p3 = prevPos;
232             c3 =  UTF16.charAt(fText, prevPos);
233             c0 = c1 = c2 = cBase = 0;
234 
235             // Loop runs once per "significant" character position in the input text.
236             for (;;) {
237                 // Move all of the positions forward in the input string.
238                 /* p0 = p1;*/  c0 = c1;
239                 p1 = p2;  c1 = c2;
240                 p2 = p3;  c2 = c3;
241 
242                 // Advance p3 by one codepoint
243                 p3 = moveIndex32(fText, p3, 1);
244                 c3 = (p3>=fText.length())? -1: UTF16.charAt(fText, p3);
245 
246                 if (p1 == p2) {
247                     // Still warming up the loop.  (won't work with zero length strings, but we don't care)
248                     continue;
249                 }
250                 if (p2 == fText.length()) {
251                     setAppliedRule(p2, "End of String");
252                     break;
253                 }
254 
255                 //     No Extend or Format characters may appear between the CR and LF,
256                 //     which requires the additional check for p2 immediately following p1.
257                 //
258                 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
259                     setAppliedRule(p2, "GB 3   CR x LF");
260                     continue;
261                 }
262 
263                 if (fControlSet.contains(c1) ||
264                         c1 == 0x0D ||
265                         c1 == 0x0A)  {
266                     setAppliedRule(p2, "GB 4   ( Control | CR | LF ) <break>");
267                     break;
268                 }
269 
270                 if (fControlSet.contains(c2) ||
271                         c2 == 0x0D ||
272                         c2 == 0x0A)  {
273                     setAppliedRule(p2, "GB 5   <break>  ( Control | CR | LF )");
274                     break;
275                 }
276 
277 
278                 if (fLSet.contains(c1) &&
279                         (fLSet.contains(c2)  ||
280                                 fVSet.contains(c2)  ||
281                                 fLVSet.contains(c2) ||
282                                 fLVTSet.contains(c2))) {
283                     setAppliedRule(p2, "GB 6   L x ( L | V | LV | LVT )");
284                     continue;
285                 }
286 
287                 if ((fLVSet.contains(c1) || fVSet.contains(c1)) &&
288                         (fVSet.contains(c2) || fTSet.contains(c2)))  {
289                     setAppliedRule(p2, "GB 7   ( LV | V )  x  ( V | T )");
290                     continue;
291                 }
292 
293                 if ((fLVTSet.contains(c1) || fTSet.contains(c1)) &&
294                         fTSet.contains(c2))  {
295                     setAppliedRule(p2, "GB 8   ( LVT | T)  x T");
296                     continue;
297                 }
298 
299                 if (fExtendSet.contains(c2) || fZWJSet.contains(c2))  {
300                     if (!fExtendSet.contains(c1)) {
301                         cBase = c1;
302                     }
303                     setAppliedRule(p2, "GB 9   x (Extend | ZWJ)");
304                     continue;
305                 }
306 
307                 if (fSpacingSet.contains(c2)) {
308                     setAppliedRule(p2, "GB 9a  x  SpacingMark");
309                     continue;
310                 }
311 
312                 if (fPrependSet.contains(c1)) {
313                     setAppliedRule(p2, "GB 9b  Prepend x");
314                     continue;
315                 }
316 
317                 //   Note: Viramas are also included in the ExtCccZwj class.
318                 if (fLinkingConsonantSet.contains(c2)) {
319                     int pi = p1;
320                     boolean sawVirama = false;
321                     while (pi > 0 && fExtCccZwjSet.contains(fText.codePointAt(pi))) {
322                         if (fViramaSet.contains(fText.codePointAt(pi))) {
323                             sawVirama = true;
324                         }
325                         pi = fText.offsetByCodePoints(pi, -1);
326                     }
327                     if (sawVirama && fLinkingConsonantSet.contains(fText.codePointAt(pi))) {
328                         setAppliedRule(p2, "GB 9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant");
329                         continue;
330                     }
331                 }
332 
333                 if (fExtendedPictSet.contains(cBase) && fZWJSet.contains(c1) && fExtendedPictSet.contains(c2) ) {
334                     setAppliedRule(p2, "GB 11  Extended_Pictographic Extend * ZWJ x Extended_Pictographic");
335                     continue;
336                 }
337 
338                 //                  Note: The first if condition is a little tricky. We only need to force
339                 //                      a break if there are three or more contiguous RIs. If there are
340                 //                      only two, a break following will occur via other rules, and will include
341                 //                      any trailing extend characters, which is needed behavior.
342                 if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1)
343                         && fRegionalIndicatorSet.contains(c2)) {
344                     setAppliedRule(p2, "GB 12-13 Regional_Indicator x Regional_Indicator");
345                     break;
346                 }
347                 if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
348                     setAppliedRule(p2, "GB 12-13 Regional_Indicator x Regional_Indicator");
349                     continue;
350                 }
351 
352                 setAppliedRule(p2, "GB 999 Any <break> Any");
353                 break;
354             }
355 
356             breakPos = p2;
357             return breakPos;
358         }
359 
360     }
361 
362     /**
363      *
364      * Word Monkey Test Class
365      *
366      *
367      *
368      */
369     static class RBBIWordMonkey extends RBBIMonkeyKind {
370         StringBuffer              fText;
371 
372         UnicodeSet                fCRSet;
373         UnicodeSet                fLFSet;
374         UnicodeSet                fNewlineSet;
375         UnicodeSet                fRegionalIndicatorSet;
376         UnicodeSet                fKatakanaSet;
377         UnicodeSet                fHebrew_LetterSet;
378         UnicodeSet                fALetterSet;
379         UnicodeSet                fSingle_QuoteSet;
380         UnicodeSet                fDouble_QuoteSet;
381         UnicodeSet                fMidNumLetSet;
382         UnicodeSet                fMidLetterSet;
383         UnicodeSet                fMidNumSet;
384         UnicodeSet                fNumericSet;
385         UnicodeSet                fFormatSet;
386         UnicodeSet                fExtendSet;
387         UnicodeSet                fExtendNumLetSet;
388         UnicodeSet                fWSegSpaceSet;
389         UnicodeSet                fOtherSet;
390         UnicodeSet                fDictionarySet;
391         UnicodeSet                fZWJSet;
392         UnicodeSet                fExtendedPictSet;
393 
RBBIWordMonkey()394         RBBIWordMonkey() {
395             fCharProperty    = UProperty.WORD_BREAK;
396 
397             fCRSet           = new UnicodeSet("[\\p{Word_Break = CR}]");
398             fLFSet           = new UnicodeSet("[\\p{Word_Break = LF}]");
399             fNewlineSet      = new UnicodeSet("[\\p{Word_Break = Newline}]");
400             fRegionalIndicatorSet = new UnicodeSet("[\\p{Word_Break = Regional_Indicator}]");
401             fKatakanaSet     = new UnicodeSet("[\\p{Word_Break = Katakana}]");
402             fHebrew_LetterSet = new UnicodeSet("[\\p{Word_Break = Hebrew_Letter}]");
403             fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter} @]");
404             fSingle_QuoteSet = new UnicodeSet("[\\p{Word_Break = Single_Quote}]");
405             fDouble_QuoteSet = new UnicodeSet("[\\p{Word_Break = Double_Quote}]");
406             fMidNumLetSet    = new UnicodeSet("[\\p{Word_Break = MidNumLet}]");
407             fMidLetterSet    = new UnicodeSet("[\\p{Word_Break = MidLetter} - [\\: \\uFE55 \\uFF1A]]");
408             fMidNumSet       = new UnicodeSet("[\\p{Word_Break = MidNum}]");
409             fNumericSet      = new UnicodeSet("[\\p{Word_Break = Numeric}]");
410             fFormatSet       = new UnicodeSet("[\\p{Word_Break = Format}]");
411             fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]");
412             // There are some sc=Hani characters with WB=Extend.
413             // The break rules need to pick one or the other because
414             // Extend overlapping with something else is messy.
415             // For Unicode 13, we chose to keep U+16FF0 & U+16FF1
416             // in $Han (for $dictionary) and out of $Extend.
417             fExtendSet       = new UnicodeSet("[\\p{Word_Break = Extend}-[:Hani:]]");
418             fWSegSpaceSet    = new UnicodeSet("[\\p{Word_Break = WSegSpace}]");
419             fZWJSet          = new UnicodeSet("[\\p{Word_Break = ZWJ}]");
420             fExtendedPictSet = new UnicodeSet("[:Extended_Pictographic:]");
421 
422             fDictionarySet = new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]");
423             fDictionarySet.addAll(fKatakanaSet);
424             fDictionarySet.addAll(new UnicodeSet("[\\p{LineBreak = Complex_Context}]"));
425 
426             fALetterSet.removeAll(fDictionarySet);
427 
428             fOtherSet        = new UnicodeSet();
429             fOtherSet.complement();
430             fOtherSet.removeAll(fCRSet);
431             fOtherSet.removeAll(fLFSet);
432             fOtherSet.removeAll(fNewlineSet);
433             fOtherSet.removeAll(fALetterSet);
434             fOtherSet.removeAll(fSingle_QuoteSet);
435             fOtherSet.removeAll(fDouble_QuoteSet);
436             fOtherSet.removeAll(fKatakanaSet);
437             fOtherSet.removeAll(fHebrew_LetterSet);
438             fOtherSet.removeAll(fMidLetterSet);
439             fOtherSet.removeAll(fMidNumSet);
440             fOtherSet.removeAll(fNumericSet);
441             fOtherSet.removeAll(fFormatSet);
442             fOtherSet.removeAll(fExtendSet);
443             fOtherSet.removeAll(fExtendNumLetSet);
444             fOtherSet.removeAll(fWSegSpaceSet);
445             fOtherSet.removeAll(fRegionalIndicatorSet);
446             fOtherSet.removeAll(fZWJSet);
447             fOtherSet.removeAll(fExtendedPictSet);
448 
449             // Inhibit dictionary characters from being tested at all.
450             // remove surrogates so as to not generate higher CJK characters
451             fOtherSet.removeAll(new UnicodeSet("[[\\p{LineBreak = Complex_Context}][:Line_Break=Surrogate:]]"));
452             fOtherSet.removeAll(fDictionarySet);
453 
454             fSets.add(fCRSet);                    fClassNames.add("CR");
455             fSets.add(fLFSet);                    fClassNames.add("LF");
456             fSets.add(fNewlineSet);               fClassNames.add("Newline");
457             fSets.add(fRegionalIndicatorSet);     fClassNames.add("RegionalIndicator");
458             fSets.add(fHebrew_LetterSet);         fClassNames.add("Hebrew");
459             fSets.add(fALetterSet);               fClassNames.add("ALetter");
460             //fSets.add(fKatakanaSet);  // Omit Katakana from fSets, which omits Katakana characters
461             // from the test data. They are all in the dictionary set,
462             // which this (old, to be retired) monkey test cannot handle.
463             fSets.add(fSingle_QuoteSet);          fClassNames.add("Single Quote");
464             fSets.add(fDouble_QuoteSet);          fClassNames.add("Double Quote");
465             fSets.add(fMidLetterSet);             fClassNames.add("MidLetter");
466             fSets.add(fMidNumLetSet);             fClassNames.add("MidNumLet");
467             fSets.add(fMidNumSet);                fClassNames.add("MidNum");
468             fSets.add(fNumericSet);               fClassNames.add("Numeric");
469             fSets.add(fFormatSet);                fClassNames.add("Format");
470             fSets.add(fExtendSet);                fClassNames.add("Extend");
471             fSets.add(fExtendNumLetSet);          fClassNames.add("ExtendNumLet");
472             fSets.add(fWSegSpaceSet);             fClassNames.add("WSegSpace");
473             fSets.add(fZWJSet);                   fClassNames.add("ZWJ");
474             fSets.add(fExtendedPictSet);          fClassNames.add("ExtendedPict");
475             fSets.add(fOtherSet);                 fClassNames.add("Other");
476         }
477 
478 
479         @Override
charClasses()480         List  charClasses() {
481             return fSets;
482         }
483 
484         @Override
setText(StringBuffer s)485         void   setText(StringBuffer s) {
486             fText = s;
487             prepareAppliedRules(s.length());
488         }
489 
490         @Override
next(int prevPos)491         int   next(int prevPos) {
492             int    /*p0,*/ p1, p2, p3;      // Indices of the significant code points around the
493             //   break position being tested.  The candidate break
494             //   location is before p2.
495             int     breakPos = -1;
496 
497             int c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
498 
499             // Previous break at end of string.  return DONE.
500             if (prevPos >= fText.length()) {
501                 return -1;
502             }
503             /*p0 =*/ p1 = p2 = p3 = prevPos;
504             c3 = UTF16.charAt(fText, prevPos);
505             c0 = c1 = c2 = 0;
506 
507 
508 
509             // Loop runs once per "significant" character position in the input text.
510             for (;;) {
511                 // Move all of the positions forward in the input string.
512                 /*p0 = p1;*/  c0 = c1;
513                 p1 = p2;  c1 = c2;
514                 p2 = p3;  c2 = c3;
515 
516                 // Advance p3 by    X(Extend | Format)*   Rule 4
517                 //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
518                 do {
519                     p3 = moveIndex32(fText, p3, 1);
520                     c3 = -1;
521                     if (p3>=fText.length()) {
522                         break;
523                     }
524                     c3 = UTF16.charAt(fText, p3);
525                     if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {
526                         break;
527                     }
528                 }
529                 while (setContains(fFormatSet, c3) || setContains(fExtendSet, c3) || setContains(fZWJSet, c3));
530 
531                 if (p1 == p2) {
532                     // Still warming up the loop.  (won't work with zero length strings, but we don't care)
533                     continue;
534                 }
535                 if (p2 == fText.length()) {
536                     // Reached end of string.  Always a break position.
537                     break;
538                 }
539 
540                 //     No Extend or Format characters may appear between the CR and LF,
541                 //     which requires the additional check for p2 immediately following p1.
542                 //
543                 if (c1==0x0D && c2==0x0A) {
544                     setAppliedRule(p2, "WB 3   CR x LF");
545                     continue;
546                 }
547 
548                 //
549                 if (fCRSet.contains(c1) || fLFSet.contains(c1) || fNewlineSet.contains(c1)) {
550                     setAppliedRule(p2, "WB 3a  Break before and after newlines (including CR and LF)");
551                     break;
552                 }
553                 if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {
554                     setAppliedRule(p2, "WB 3a  Break before and after newlines (including CR and LF)");
555                     break;
556                 }
557 
558                 //              Not ignoring extend chars, so peek into input text to
559                 //              get the potential ZWJ, the character immediately preceding c2.
560                 if (fZWJSet.contains(fText.codePointBefore(p2)) && fExtendedPictSet.contains(c2)) {
561                     setAppliedRule(p2, "WB 3c  ZWJ x Extended_Pictographic");
562                     continue;
563                 }
564 
565                 if (fWSegSpaceSet.contains(fText.codePointBefore(p2)) && fWSegSpaceSet.contains(c2)) {
566                     setAppliedRule(p2, "WB 3d  Keep horizontal whitespace together");
567                     continue;
568                 }
569 
570                 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
571                         (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2)))  {
572                     setAppliedRule(p2, "WB 4   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)");
573                     continue;
574                 }
575 
576                 if ( (fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1))   &&
577                         (fMidLetterSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2)) &&
578                         (setContains(fALetterSet, c3) || setContains(fHebrew_LetterSet, c3))) {
579                     setAppliedRule(p2, "WB 6   (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)");
580                     continue;
581                 }
582 
583                 if ((fALetterSet.contains(c0) || fHebrew_LetterSet.contains(c0)) &&
584                         (fMidLetterSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1)) &&
585                         (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) {
586                     setAppliedRule(p2, "WB 7   (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)");
587                     continue;
588                 }
589 
590                 if (fHebrew_LetterSet.contains(c1) && fSingle_QuoteSet.contains(c2)) {
591                     setAppliedRule(p2, "WB 7a  Hebrew_Letter x Single_Quote");
592                     continue;
593                 }
594 
595                 if (fHebrew_LetterSet.contains(c1) && fDouble_QuoteSet.contains(c2) && setContains(fHebrew_LetterSet,c3)) {
596                     setAppliedRule(p2, "WB 7b  Hebrew_Letter x Single_Quote");
597                     continue;
598                 }
599 
600                 if (fHebrew_LetterSet.contains(c0) && fDouble_QuoteSet.contains(c1) && fHebrew_LetterSet.contains(c2)) {
601                     setAppliedRule(p2, "WB 7c  Hebrew_Letter Double_Quote x Hebrew_Letter");
602                     continue;
603                 }
604 
605                 if (fNumericSet.contains(c1) &&
606                         fNumericSet.contains(c2))  {
607                     setAppliedRule(p2, "WB 8   Numeric x Numeric");
608                     continue;
609                 }
610 
611                 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
612                         fNumericSet.contains(c2))  {
613                     setAppliedRule(p2, "WB 9   (ALetter | Hebrew_Letter) x Numeric");
614                     continue;
615                 }
616 
617                 if (fNumericSet.contains(c1) &&
618                         (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2)))  {
619                     setAppliedRule(p2, "WB 10  Numeric x (ALetter | Hebrew_Letter)");
620                     continue;
621                 }
622 
623                 if (fNumericSet.contains(c0) &&
624                         (fMidNumSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1))  &&
625                         fNumericSet.contains(c2)) {
626                     setAppliedRule(p2, "WB 11  Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric");
627                     continue;
628                 }
629 
630                 if (fNumericSet.contains(c1) &&
631                         (fMidNumSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2))  &&
632                         setContains(fNumericSet, c3)) {
633                     setAppliedRule(p2, "WB 12  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric");
634                     continue;
635                 }
636 
637                 //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
638                 //                  all Katakana are handled by the dictionary breaker.
639                 if (fKatakanaSet.contains(c1) &&
640                         fKatakanaSet.contains(c2))  {
641                     setAppliedRule(p2, "WB 13  Katakana x Katakana");
642                     continue;
643                 }
644 
645                 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1) ||fNumericSet.contains(c1) ||
646                         fKatakanaSet.contains(c1) || fExtendNumLetSet.contains(c1)) &&
647                         fExtendNumLetSet.contains(c2)) {
648                     setAppliedRule(p2, "WB 13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet");
649                     continue;
650                 }
651 
652                 if (fExtendNumLetSet.contains(c1) &&
653                         (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2) ||
654                                 fNumericSet.contains(c2) || fKatakanaSet.contains(c2)))  {
655                     setAppliedRule(p2, "WB 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)");
656                     continue;
657                 }
658 
659 
660                 if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1)) {
661                     setAppliedRule(p2, "WB 15-17 Group pairs of Regional Indicators.");
662                     break;
663                 }
664                 if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
665                     setAppliedRule(p2, "WB 15-17 Group pairs of Regional Indicators.");
666                     continue;
667                 }
668 
669                 setAppliedRule(p2, "WB 999");
670                 break;
671             }
672 
673             breakPos = p2;
674             return breakPos;
675         }
676     }
677 
678 
679     static class RBBILineMonkey extends RBBIMonkeyKind {
680         // UnicodeSets for each of the Line Breaking character classes.
681         // Order matches that of Unicode UAX 14, Table 1, which makes it a little easier
682         // to verify that they are all accounted for.
683 
684         // XUnicodeSet is like UnicodeSet, except that the method contains(int codePoint) does not
685         // throw exceptions on out-of-range codePoints. This matches ICU4C behavior.
686         // The LineMonkey test (ported from ICU4C) relies on this behavior, it uses a value of -1
687         // to represent a non-codepoint that is not included in any of the property sets.
688         // This happens for rule 30a.
689         class XUnicodeSet extends UnicodeSet {
XUnicodeSet(String pattern)690             XUnicodeSet(String pattern) { super(pattern); }
XUnicodeSet()691             XUnicodeSet() { super(); }
692             @Override
contains(int codePoint)693             public boolean contains(int codePoint) {
694                 return codePoint < UnicodeSet.MIN_VALUE || codePoint > UnicodeSet.MAX_VALUE ?
695                         false : super.contains(codePoint);
696             }
697         }
698 
        // Declare these variables as XUnicodeSet, not merely as UnicodeSet,
        // so that when we copy a new declaration from C++ (where only UnicodeSet exists),
        // the missing 'X' prefix is visible;
        // and when the prefix is there and we copy a new initializer we get a compiler error.
        // (Otherwise we rely on the caller catching the IAE from using codePoint=-1
        // and failing with a message that tells us what to do.)
        //
        // Unless noted otherwise, each set is initialized in the constructor from the
        // Line_Break property value that its name spells out.
        XUnicodeSet fBK;
        XUnicodeSet fCR;
        XUnicodeSet fLF;
        XUnicodeSet fCM;   // The constructor also folds ZWJ into this set.
        XUnicodeSet fNL;
        XUnicodeSet fSG;   // Built from the surrogate range U+D800..U+DFFF, not from a property.
        XUnicodeSet fWJ;
        XUnicodeSet fZW;
        XUnicodeSet fGL;
        XUnicodeSet fSP;
        XUnicodeSet fB2;
        XUnicodeSet fBA;
        XUnicodeSet fBB;
        XUnicodeSet fHH;   // Starts empty; the constructor adds only U+2010 (hyphen).
        XUnicodeSet fHY;
        XUnicodeSet fCB;
        XUnicodeSet fCL;
        XUnicodeSet fCP;
        XUnicodeSet fEX;
        XUnicodeSet fIN;
        XUnicodeSet fNS;   // The constructor also folds CJ into this set.
        XUnicodeSet fOP;
        XUnicodeSet fQU;
        XUnicodeSet fIS;
        XUnicodeSet fNU;
        XUnicodeSet fPO;
        XUnicodeSet fPR;
        XUnicodeSet fSY;
        XUnicodeSet fAI;
        XUnicodeSet fAL;   // The constructor also folds XX, AI and SG into this set.
        XUnicodeSet fCJ;
        XUnicodeSet fH2;
        XUnicodeSet fH3;
        XUnicodeSet fHL;
        XUnicodeSet fID;
        XUnicodeSet fJL;
        XUnicodeSet fJV;
        XUnicodeSet fJT;
        XUnicodeSet fRI;
        XUnicodeSet fXX;
        XUnicodeSet fEB;
        XUnicodeSet fEM;
        XUnicodeSet fZWJ;
        XUnicodeSet fOP30;   // OP minus East Asian Width F, W and H; used by the LB 30 check.
        XUnicodeSet fCP30;   // CP minus East Asian Width F, W and H; used by the LB 30 check.
        XUnicodeSet fExtPictUnassigned;   // Extended_Pictographic intersected with General_Category Cn.

        StringBuffer  fText;            // Text being scanned; assigned by setText().
        int           fOrigPositions;   // NOTE(review): not referenced in the visible part of this class - confirm it is still needed.
754 
RBBILineMonkey()755         RBBILineMonkey()
756         {
757             fCharProperty  = UProperty.LINE_BREAK;
758 
759             fBK    = new XUnicodeSet("[\\p{Line_Break=BK}]");
760             fCR    = new XUnicodeSet("[\\p{Line_break=CR}]");
761             fLF    = new XUnicodeSet("[\\p{Line_break=LF}]");
762             fCM    = new XUnicodeSet("[\\p{Line_break=CM}]");
763             fNL    = new XUnicodeSet("[\\p{Line_break=NL}]");
764             fSG    = new XUnicodeSet("[\\ud800-\\udfff]");
765             fWJ    = new XUnicodeSet("[\\p{Line_break=WJ}]");
766             fZW    = new XUnicodeSet("[\\p{Line_break=ZW}]");
767             fGL    = new XUnicodeSet("[\\p{Line_break=GL}]");
768             fSP    = new XUnicodeSet("[\\p{Line_break=SP}]");
769             fB2    = new XUnicodeSet("[\\p{Line_break=B2}]");
770             fBA    = new XUnicodeSet("[\\p{Line_break=BA}]");
771             fBB    = new XUnicodeSet("[\\p{Line_break=BB}]");
772             fHH    = new XUnicodeSet();
773             fHY    = new XUnicodeSet("[\\p{Line_break=HY}]");
774             fCB    = new XUnicodeSet("[\\p{Line_break=CB}]");
775             fCL    = new XUnicodeSet("[\\p{Line_break=CL}]");
776             fCP    = new XUnicodeSet("[\\p{Line_break=CP}]");
777             fEX    = new XUnicodeSet("[\\p{Line_break=EX}]");
778             fIN    = new XUnicodeSet("[\\p{Line_break=IN}]");
779             fNS    = new XUnicodeSet("[\\p{Line_break=NS}]");
780             fOP    = new XUnicodeSet("[\\p{Line_break=OP}]");
781             fQU    = new XUnicodeSet("[\\p{Line_break=QU}]");
782             fIS    = new XUnicodeSet("[\\p{Line_break=IS}]");
783             fNU    = new XUnicodeSet("[\\p{Line_break=NU}]");
784             fPO    = new XUnicodeSet("[\\p{Line_break=PO}]");
785             fPR    = new XUnicodeSet("[\\p{Line_break=PR}]");
786             fSY    = new XUnicodeSet("[\\p{Line_break=SY}]");
787             fAI    = new XUnicodeSet("[\\p{Line_break=AI}]");
788             fAL    = new XUnicodeSet("[\\p{Line_break=AL}]");
789             fCJ    = new XUnicodeSet("[\\p{Line_break=CJ}]");
790             fH2    = new XUnicodeSet("[\\p{Line_break=H2}]");
791             fH3    = new XUnicodeSet("[\\p{Line_break=H3}]");
792             fHL    = new XUnicodeSet("[\\p{Line_break=HL}]");
793             fID    = new XUnicodeSet("[\\p{Line_break=ID}]");
794             fJL    = new XUnicodeSet("[\\p{Line_break=JL}]");
795             fJV    = new XUnicodeSet("[\\p{Line_break=JV}]");
796             fJT    = new XUnicodeSet("[\\p{Line_break=JT}]");
797             fRI    = new XUnicodeSet("[\\p{Line_break=RI}]");
798             fXX    = new XUnicodeSet("[\\p{Line_break=XX}]");
799             fEB    = new XUnicodeSet("[\\p{Line_break=EB}]");
800             fEM    = new XUnicodeSet("[\\p{Line_break=EM}]");
801             fZWJ   = new XUnicodeSet("[\\p{Line_break=ZWJ}]");
802             fOP30  = new XUnicodeSet("[\\p{Line_break=OP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]");
803             fCP30  = new XUnicodeSet("[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]");
804             fExtPictUnassigned = new XUnicodeSet("[\\p{Extended_Pictographic}&\\p{Cn}]");
805 
806             // Remove dictionary characters.
807             // The monkey test reference implementation of line break does not replicate the dictionary behavior,
808             // so dictionary characters are omitted from the monkey test data.
809             @SuppressWarnings("unused")
810             UnicodeSet dictionarySet = new UnicodeSet(
811                     "[[:LineBreak = Complex_Context:] & [[:Script = Thai:][:Script = Lao:][:Script = Khmer:] [:script = Myanmar:]]]");
812 
813             fAL.addAll(fXX);     // Default behavior for XX is identical to AL
814             fAL.addAll(fAI);     // Default behavior for AI is identical to AL
815             fAL.addAll(fSG);     // Default behavior for SG (unpaired surrogates) is AL
816 
817             fNS.addAll(fCJ);     // Default behavior for CJ is identical to NS.
818             fCM.addAll(fZWJ);    // ZWJ behaves as a CM.
819 
820             fHH.add('\u2010');   // Hyphen, '‐'
821 
822             fSets.add(fBK);     fClassNames.add("BK");
823             fSets.add(fCR);     fClassNames.add("CR");
824             fSets.add(fLF);     fClassNames.add("LF");
825             fSets.add(fCM);     fClassNames.add("CM");
826             fSets.add(fNL);     fClassNames.add("NL");
827             fSets.add(fWJ);     fClassNames.add("WJ");
828             fSets.add(fZW);     fClassNames.add("ZW");
829             fSets.add(fGL);     fClassNames.add("GL");
830             fSets.add(fSP);     fClassNames.add("SP");
831             fSets.add(fB2);     fClassNames.add("B2");
832             fSets.add(fBA);     fClassNames.add("BA");
833             fSets.add(fBB);     fClassNames.add("BB");
834             fSets.add(fHY);     fClassNames.add("HY");
835             fSets.add(fCB);     fClassNames.add("CB");
836             fSets.add(fCL);     fClassNames.add("CL");
837             fSets.add(fCP);     fClassNames.add("CP");
838             fSets.add(fEX);     fClassNames.add("EX");
839             fSets.add(fIN);     fClassNames.add("IN");
840             fSets.add(fJL);     fClassNames.add("JL");
841             fSets.add(fJT);     fClassNames.add("JT");
842             fSets.add(fJV);     fClassNames.add("JV");
843             fSets.add(fNS);     fClassNames.add("NV");
844             fSets.add(fOP);     fClassNames.add("OP");
845             fSets.add(fQU);     fClassNames.add("QU");
846             fSets.add(fIS);     fClassNames.add("IS");
847             fSets.add(fNU);     fClassNames.add("NU");
848             fSets.add(fPO);     fClassNames.add("PO");
849             fSets.add(fPR);     fClassNames.add("PR");
850             fSets.add(fSY);     fClassNames.add("SY");
851             fSets.add(fAI);     fClassNames.add("AI");
852             fSets.add(fAL);     fClassNames.add("AL");
853             fSets.add(fH2);     fClassNames.add("H2");
854             fSets.add(fH3);     fClassNames.add("H3");
855             fSets.add(fHL);     fClassNames.add("HL");
856             fSets.add(fID);     fClassNames.add("ID");
857             fSets.add(fRI);     fClassNames.add("RI");
858             fSets.add(fSG);     fClassNames.add("SG");
859             fSets.add(fEB);     fClassNames.add("EB");
860             fSets.add(fEM);     fClassNames.add("EM");
861             fSets.add(fZWJ);    fClassNames.add("ZWJ");
862             // TODO: fOP30 & fCP30 overlap with plain fOP. Probably OK, but fOP/CP chars will be over-represented.
863             fSets.add(fOP30);   fClassNames.add("OP30");
864             fSets.add(fCP30);   fClassNames.add("CP30");
865             fSets.add(fExtPictUnassigned); fClassNames.add("fExtPictUnassigned");
866         }
867 
868         @Override
setText(StringBuffer s)869         void setText(StringBuffer s) {
870             fText       = s;
871             prepareAppliedRules(s.length());
872         }
873 
874 
875 
876 
877         @Override
next(int startPos)878         int next(int startPos) {
879             int    pos;       //  Index of the char following a potential break position
880             int    thisChar;  //  Character at above position "pos"
881 
882             int    prevPos;   //  Index of the char preceding a potential break position
883             int    prevChar;  //  Character at above position.  Note that prevChar
884             //                //  and thisChar may not be adjacent because combining
885             //                //  characters between them will be ignored.
886 
887             int    prevPosX2;
888             int    prevCharX2; //  Character before prevChar, more context for LB 21a
889 
890             int    nextPos;   //  Index of the next character following pos.
891             //                //  Usually skips over combining marks.
892             int    tPos;      //  temp value.
893             int    matchVals[]  = null;       // Number  Expression Match Results
894 
895 
896             if (startPos >= fText.length()) {
897                 return -1;
898             }
899 
900 
901             // Initial values for loop.  Loop will run the first time without finding breaks,
902             //                           while the invalid values shift out and the "this" and
903             //                           "prev" positions are filled in with good values.
904             pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
905             thisChar = prevChar  = prevCharX2 =  0;
906             nextPos  = startPos;
907 
908 
909             // Loop runs once per position in the test text, until a break position
910             //  is found.  In each iteration, we are testing for a possible break
911             //  just preceding the character at index "pos".  The character preceding
912             //  this char is at position "prevPos"; because of combining sequences,
913             //  "prevPos" can be arbitrarily far before "pos".
914             for (;;) {
915                 // Advance to the next position to be tested.
916                 prevPosX2  = prevPos;
917                 prevCharX2 = prevChar;
918                 prevPos   = pos;
919                 prevChar  = thisChar;
920                 pos       = nextPos;
921                 nextPos   = moveIndex32(fText, pos, 1);
922 
923                 if (pos >= fText.length()) {
924                     setAppliedRule(pos, "LB 2   Break at end of text");
925                     break;
926                 }
927 
928                 //             We do this rule out-of-order because the adjustment does
                //             not affect the way that rules LB 3 through LB 6 match,
930                 //             and doing it here rather than after LB 6 is substantially
931                 //             simpler when combining sequences do occur.
932 
933 
934                 // LB 9         Keep combining sequences together.
935                 //              advance over any CM class chars at "pos",
936                 //              result is "nextPos" for the following loop iteration.
937                 thisChar  = UTF16.charAt(fText, pos);
938                 if (!(fSP.contains(thisChar) || fBK.contains(thisChar) || thisChar==0x0d ||
939                         thisChar==0x0a || fNL.contains(thisChar) || fZW.contains(thisChar) )) {
940                     for (;;) {
941                         if (nextPos == fText.length()) {
942                             break;
943                         }
944                         int nextChar = UTF16.charAt(fText, nextPos);
945                         if (!fCM.contains(nextChar)) {
946                             break;
947                         }
948                         nextPos = moveIndex32(fText, nextPos, 1);
949                     }
950                 }
951 
952                 // LB 9 Treat X CM* as if it were X
953                 //        No explicit action required.
954 
955                 // LB 10     Treat any remaining combining mark as AL
956                 if (fCM.contains(thisChar)) {
957                     thisChar = 'A';
958                 }
959 
960 
961                 // If the loop is still warming up - if we haven't shifted the initial
962                 //   -1 positions out of prevPos yet - loop back to advance the
963                 //    position in the input without any further looking for breaks.
964                 if (prevPos == -1) {
965                     setAppliedRule(pos, "LB 9   adjust for combining sequences.");
966                     continue;
967                 }
968 
969                 if (fBK.contains(prevChar)) {
970                     setAppliedRule(pos, "LB 4   Always break after hard line breaks");
971                     break;
972                 }
973 
974                 if (fCR.contains(prevChar) && fLF.contains(thisChar)) {
975                     setAppliedRule(pos, "LB 5   Break after CR, LF, NL, but not inside CR LF");
976                     continue;
977                 }
978                 if  (fCR.contains(prevChar) ||
979                         fLF.contains(prevChar) ||
980                         fNL.contains(prevChar))  {
981                     setAppliedRule(pos, "LB 5   Break after CR, LF, NL, but not inside CR LF");
982                     break;
983                 }
984 
985                 if (fBK.contains(thisChar) || fCR.contains(thisChar) ||
986                         fLF.contains(thisChar) || fNL.contains(thisChar) ) {
987                     setAppliedRule(pos, "LB 6   Don't break before hard line breaks");
988                     continue;
989                 }
990 
991 
992                 if (fSP.contains(thisChar)) {
993                     setAppliedRule(pos, "LB 7   Don't break before spaces or zero-width space");
994                     continue;
995                 }
996 
997                 if (fZW.contains(thisChar)) {
998                     setAppliedRule(pos, "LB 7   Don't break before spaces or zero-width space");
999                     continue;
1000                 }
1001 
1002                 //       ZW SP* ÷
1003                 //       Scan backwards from prevChar for SP* ZW
1004                 tPos = prevPos;
1005                 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
1006                     tPos = moveIndex32(fText, tPos, -1);
1007                 }
1008                 if (fZW.contains(UTF16.charAt(fText, tPos))) {
1009                     setAppliedRule(pos, "LB 8   Break after zero width space");
1010                     break;
1011                 }
1012 
1013                 //          Move this test up, before LB8a, because numbers can match a longer sequence that would
1014                 //          also match 8a.  e.g. NU ZWJ IS PO     (ZWJ acts like CM)
1015                 matchVals = LBNumberCheck(fText, prevPos, matchVals);
1016                 if (matchVals[0] != -1) {
1017                     // Matched a number.  But could have been just a single digit, which would
1018                     //    not represent a "no break here" between prevChar and thisChar
1019                     int numEndIdx = matchVals[1];  // idx of first char following num
1020                     if (numEndIdx > pos) {
1021                         // Number match includes at least the two chars being checked
1022                         if (numEndIdx > nextPos) {
1023                             // Number match includes additional chars.  Update pos and nextPos
1024                             //   so that next loop iteration will continue at the end of the number,
1025                             //   checking for breaks between last char in number & whatever follows.
1026                             nextPos = numEndIdx;
1027                             pos     = numEndIdx;
1028                             do {
1029                                 pos = moveIndex32(fText, pos, -1);
1030                                 thisChar = UTF16.charAt(fText, pos);
1031                             }
1032                             while (fCM.contains(thisChar));
1033                         }
1034                         setAppliedRule(pos, "LB 25  Numbers");
1035                         continue;
1036                     }
1037                 }
1038 
1039                 //       The monkey test's way of ignoring combining characters doesn't work
1040                 //       for this rule. ZWJ is also a CM. Need to get the actual character
1041                 //       preceding "thisChar", not ignoring combining marks, possibly ZWJ.
1042                 {
1043                     int prevC = fText.codePointBefore(pos);
1044                     if (fZWJ.contains(prevC)) {
1045                         setAppliedRule(pos, "LB 8a  ZWJ x");
1046                         continue;
1047                     }
1048                 }
1049 
1050                 // appliedRule: "LB 9, 10"; //  Already done, at top of loop.";
1051 
1052 
1053                 //    x  WJ
1054                 //    WJ  x
1055                 if (fWJ.contains(thisChar) || fWJ.contains(prevChar)) {
1056                     setAppliedRule(pos, "LB 11  Do not break before or after WORD JOINER and related characters.");
1057                     continue;
1058                 }
1059 
1060 
1061                 if (fGL.contains(prevChar)) {
1062                     setAppliedRule(pos, "LB 12  GL  x");
1063                     continue;
1064                 }
1065 
1066                 if (!(fSP.contains(prevChar) ||
1067                         fBA.contains(prevChar) ||
1068                         fHY.contains(prevChar)     ) && fGL.contains(thisChar)) {
1069                     setAppliedRule(pos, "LB 12a [^SP BA HY] x GL");
1070                     continue;
1071                 }
1072 
1073                 if (fCL.contains(thisChar) ||
1074                         fCP.contains(thisChar) ||
1075                         fEX.contains(thisChar) ||
1076                         fSY.contains(thisChar)) {
1077                     setAppliedRule(pos, "LB 13  Don't break before closings");
1078                     continue;
1079                 }
1080 
1081                 //       Scan backwards, checking for this sequence.
1082                 //       The OP char could include combining marks, so we actually check for
1083                 //           OP CM* SP* x
1084                 tPos = prevPos;
1085                 if (fSP.contains(prevChar)) {
1086                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
1087                         tPos=moveIndex32(fText, tPos, -1);
1088                     }
1089                 }
1090                 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
1091                     tPos=moveIndex32(fText, tPos, -1);
1092                 }
1093                 if (fOP.contains(UTF16.charAt(fText, tPos))) {
1094                     setAppliedRule(pos, "LB 14  Don't break after OP SP*");
1095                     continue;
1096                 }
1097 
1098                 if (nextPos < fText.length()) {
1099                     int nextChar = fText.codePointAt(nextPos);
1100                     if (fSP.contains(prevChar) && fIS.contains(thisChar) && fNU.contains(nextChar)) {
1101                         setAppliedRule(pos, "LB 14a Break before an IS that begins a number and follows a space");
1102                         break;
1103                     }
1104                 }
1105 
1106                 if (fIS.contains(thisChar)) {
1107                     setAppliedRule(pos, "LB 14b Do not break before numeric separators, even after spaces");
1108                     continue;
1109                 }
1110 
1111                 if (fOP.contains(thisChar)) {
1112                     // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
1113                     tPos = prevPos;
1114                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
1115                         tPos = moveIndex32(fText, tPos, -1);
1116                     }
1117                     while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
1118                         tPos = moveIndex32(fText, tPos, -1);
1119                     }
1120                     if (fQU.contains(UTF16.charAt(fText, tPos))) {
1121                         setAppliedRule(pos, "LB 15  QU SP* x OP");
1122                         continue;
1123                     }
1124                 }
1125 
1126                 if (fNS.contains(thisChar)) {
1127                     tPos = prevPos;
1128                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
1129                         tPos = moveIndex32(fText, tPos, -1);
1130                     }
1131                     while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
1132                         tPos = moveIndex32(fText, tPos, -1);
1133                     }
1134                     if (fCL.contains(UTF16.charAt(fText, tPos)) || fCP.contains(UTF16.charAt(fText, tPos))) {
1135                         setAppliedRule(pos, "LB 16  (CL | CP) SP* x NS");
1136                         continue;
1137                     }
1138                 }
1139 
1140 
1141                 if (fB2.contains(thisChar)) {
1142                     tPos = prevPos;
1143                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
1144                         tPos = moveIndex32(fText, tPos, -1);
1145                     }
1146                     while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
1147                         tPos = moveIndex32(fText, tPos, -1);
1148                     }
1149                     if (fB2.contains(UTF16.charAt(fText, tPos))) {
1150                         setAppliedRule(pos, "LB 17  B2 SP* x B2");
1151                         continue;
1152                     }
1153                 }
1154 
1155                 if (fSP.contains(prevChar)) {
1156                     setAppliedRule(pos, "LB 18  break after space");
1157                     break;
1158                 }
1159 
1160                 //    x   QU
1161                 //    QU  x
1162                 if (fQU.contains(thisChar) || fQU.contains(prevChar)) {
1163                         setAppliedRule(pos, "LB 19");
1164                     continue;
1165                 }
1166 
1167                 if (fCB.contains(thisChar) || fCB.contains(prevChar)) {
1168                     setAppliedRule(pos, "LB 20  Break around a CB");
1169                     break;
1170                 }
1171 
1172                 //           Don't break between Hyphens and letters if a break precedes the hyphen.
1173                 //           Formerly this was a Finnish tailoring.
1174                 //           Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
1175                 //    ^($HY | $HH) $AL;
1176                 if (fAL.contains(thisChar) && (fHY.contains(prevChar) || fHH.contains(prevChar)) &&
1177                         prevPosX2 == -1) {
1178                     setAppliedRule(pos, "LB 20.09");
1179                     continue;
1180                 }
1181 
1182                 if (fBA.contains(thisChar) ||
1183                         fHY.contains(thisChar) ||
1184                         fNS.contains(thisChar) ||
1185                         fBB.contains(prevChar) )   {
1186                     setAppliedRule(pos, "LB 21");
1187                     continue;
1188                 }
1189 
1190                 if (fHL.contains(prevCharX2) && (fHY.contains(prevChar) || fBA.contains(prevChar))) {
1191                     setAppliedRule(pos, "LB 21a HL (HY | BA) x");
1192                     continue;
1193                 }
1194 
1195                 if (fSY.contains(prevChar) && fHL.contains(thisChar)) {
1196                     setAppliedRule(pos, "LB 21b SY x HL");
1197                     continue;
1198                 }
1199 
1200                 if (fIN.contains(thisChar)) {
1201                     setAppliedRule(pos, "LB 22");
1202                     continue;
1203                 }
1204 
1205                 //          (AL | HL) x NU
1206                 //          NU x (AL | HL)
1207                 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && fNU.contains(thisChar)) {
1208                     setAppliedRule(pos, "LB 23");
1209                     continue;
1210                 }
1211                 if (fNU.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
1212                     setAppliedRule(pos, "LB 23");
1213                     continue;
1214                 }
1215 
1216                 // Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
1217                 //      PR x (ID | EB | EM)
1218                 //     (ID | EB | EM) x PO
1219                 if (fPR.contains(prevChar) &&
1220                         (fID.contains(thisChar) || fEB.contains(thisChar) || fEM.contains(thisChar)))  {
1221                     setAppliedRule(pos, "LB 23a");
1222                     continue;
1223                 }
1224                 if ((fID.contains(prevChar) || fEB.contains(prevChar) || fEM.contains(prevChar)) &&
1225                         fPO.contains(thisChar)) {
1226                     setAppliedRule(pos, "LB 23a");
1227                     continue;
1228                 }
1229 
1230                 // Do not break between prefix and letters or ideographs.
1231                 //         (PR | PO) x (AL | HL)
1232                 //         (AL | HL) x (PR | PO)
1233                 if ((fPR.contains(prevChar) || fPO.contains(prevChar)) &&
1234                         (fAL.contains(thisChar) || fHL.contains(thisChar))) {
1235                     setAppliedRule(pos, "LB 24  no break between prefix and letters or ideographs");
1236                     continue;
1237                 }
1238                 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) &&
1239                         (fPR.contains(thisChar) || fPO.contains(thisChar))) {
1240                     setAppliedRule(pos, "LB 24  no break between prefix and letters or ideographs");
1241                     continue;
1242                 }
1243 
1244                 // appliedRule: "LB 25 numbers match"; // moved up, before LB 8a,
1245 
1246                 if (fJL.contains(prevChar) && (fJL.contains(thisChar) ||
1247                         fJV.contains(thisChar) ||
1248                         fH2.contains(thisChar) ||
1249                         fH3.contains(thisChar))) {
1250                     setAppliedRule(pos, "LB 26  Do not break a Korean syllable.");
1251                     continue;
1252                 }
1253 
1254                 if ((fJV.contains(prevChar) || fH2.contains(prevChar))  &&
1255                         (fJV.contains(thisChar) || fJT.contains(thisChar))) {
1256                     setAppliedRule(pos, "LB 26  Do not break a Korean syllable.");
1257                     continue;
1258                 }
1259 
1260                 if ((fJT.contains(prevChar) || fH3.contains(prevChar)) &&
1261                         fJT.contains(thisChar)) {
1262                     setAppliedRule(pos, "LB 26  Do not break a Korean syllable.");
1263                     continue;
1264                 }
1265 
1266                 if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||
1267                         fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&
1268                         fPO.contains(thisChar)) {
1269                     setAppliedRule(pos, "LB 27  Treat a Korean Syllable Block the same as ID.");
1270                     continue;
1271                 }
1272                 if (fPR.contains(prevChar) && (fJL.contains(thisChar) || fJV.contains(thisChar) ||
1273                         fJT.contains(thisChar) || fH2.contains(thisChar) || fH3.contains(thisChar))) {
1274                     setAppliedRule(pos, "LB 27  Treat a Korean Syllable Block the same as ID.");
1275                     continue;
1276                 }
1277 
1278 
1279 
1280                 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
1281                     setAppliedRule(pos, "LB 28  Do not break between alphabetics");
1282                     continue;
1283                 }
1284 
1285                 if (fIS.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
1286                     setAppliedRule(pos, "LB 29  Do not break between numeric punctuation and alphabetics");
1287                     continue;
1288                 }
1289 
1290                 //          (AL | NU) x OP
1291                 //          CP x (AL | NU)
1292                 if ((fAL.contains(prevChar) || fHL.contains(prevChar) || fNU.contains(prevChar)) &&
1293                         fOP30.contains(thisChar)) {
1294                     setAppliedRule(pos, "LB 30  Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.");
1295                     continue;
1296                 }
1297                 if (fCP30.contains(prevChar) &&
1298                         (fAL.contains(thisChar) || fHL.contains(thisChar) || fNU.contains(thisChar))) {
1299                     setAppliedRule(pos, "LB 30  Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.");
1300                     continue;
1301                 }
1302 
1303                 //             RI RI  ÷  RI
1304                 //                RI  x  RI
1305                 if (fRI.contains(prevCharX2) && fRI.contains(prevChar) && fRI.contains(thisChar)) {
1306                     setAppliedRule(pos, "LB 30a Break between pairs of Regional Indicators.");
1307                     break;
1308                 }
1309                 if (fRI.contains(prevChar) && fRI.contains(thisChar)) {
1310                     // Two Regional Indicators have been paired.
1311                     // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
1312                     // following RI. This is a hack.
1313                     thisChar = -1;
1314                     setAppliedRule(pos, "LB 30a Break between pairs of Regional Indicators.");
1315                     continue;
1316                 }
1317 
1318                 // LB30b Do not break between an emoji base (or potential emoji) and an emoji modifier.
1319                 if (fEB.contains(prevChar) && fEM.contains(thisChar)) {
1320                     setAppliedRule(pos, "LB 30b Emoji Base x Emoji Modifier");
1321                     continue;
1322                 }
1323 
1324                 if (fExtPictUnassigned.contains(prevChar) && fEM.contains(thisChar)) {
1325                     setAppliedRule(pos, "LB30b    [\\p{Extended_Pictographic}&\\p{Cn}] × EM");
1326                     continue;
1327                 }
1328 
1329                 // LB 31    Break everywhere else
1330                 setAppliedRule(pos, "LB 31 Break everywhere else");
1331                 break;
1332             }
1333 
1334             return pos;
1335         }
1336 
1337 
1338 
1339         // Match the following regular expression in the input text.
1340         //    ((PR | PO) CM*)? ((OP | HY) CM*)? (IS CM*)? NU CM* ((NU | IS | SY) CM*) * ((CL | CP) CM*)?  (PR | PO) CM*)?
1341         //      0    0   1       4    4    4      5  5              7    7    7    7      9    9    9     11   11    (match states)
1342         //  retVals array  [0]  index of the start of the match, or -1 if no match
1343         //                 [1]  index of first char following the match.
1344         //  Can not use Java regex because need supplementary character support,
1345         //     and because Unicode char properties version must be the same as in
1346         //     the version of ICU being tested.
LBNumberCheck(StringBuffer s, int startIdx, int[] retVals)1347         private int[] LBNumberCheck(StringBuffer s, int startIdx, int[] retVals) {
1348             if (retVals == null) {
1349                 retVals = new int[2];
1350             }
1351             retVals[0]     = -1;  // Indicates no match.
1352             int matchState = 0;
1353             int idx        = startIdx;
1354 
1355             matchLoop: for (idx = startIdx; idx<s.length(); idx = moveIndex32(s, idx, 1)){
1356                 int c = UTF16.charAt(s, idx);
1357                 int cLBType = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK);
1358                 switch (matchState) {
1359                 case 0:
1360                     if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC ||
1361                     cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
1362                         matchState = 1;
1363                         break;
1364                     }
1365                     if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
1366                         matchState = 4;
1367                         break;
1368                     }
1369                     if (cLBType == UCharacter.LineBreak.HYPHEN) {
1370                         matchState = 4;
1371                         break;
1372                     }
1373                     if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) {
1374                         matchState = 5;
1375                         break;
1376                     }
1377                     if (cLBType == UCharacter.LineBreak.NUMERIC) {
1378                         matchState = 7;
1379                         break;
1380                     }
1381                     break matchLoop;   /* No Match  */
1382 
1383                 case 1:
1384                     if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
1385                         matchState = 1;
1386                         break;
1387                     }
1388                     if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
1389                         matchState = 4;
1390                         break;
1391                     }
1392                     if (cLBType == UCharacter.LineBreak.HYPHEN) {
1393                         matchState = 4;
1394                         break;
1395                     }
1396                     if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) {
1397                         matchState = 5;
1398                         break;
1399                     }
1400                     if (cLBType == UCharacter.LineBreak.NUMERIC) {
1401                         matchState = 7;
1402                         break;
1403                     }
1404                     break matchLoop;   /* No Match  */
1405 
1406                 case 4:
1407                     if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
1408                         matchState = 4;
1409                         break;
1410                     }
1411                     if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) {
1412                         matchState = 5;
1413                         break;
1414                     }
1415                     if (cLBType == UCharacter.LineBreak.NUMERIC) {
1416                         matchState = 7;
1417                         break;
1418                     }
1419                     break matchLoop;   /* No Match  */
1420 
1421                 case 5:
1422                     if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
1423                         matchState = 5;
1424                         break;
1425                     }
1426                     if (cLBType == UCharacter.LineBreak.NUMERIC) {
1427                         matchState = 7;
1428                         break;
1429                     }
1430                     break matchLoop;   /* No Match  */
1431 
1432 
1433                 case 7:
1434                     if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
1435                         matchState = 7;
1436                         break;
1437                     }
1438                     if (cLBType == UCharacter.LineBreak.NUMERIC) {
1439                         matchState = 7;
1440                         break;
1441                     }
1442                     if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) {
1443                         matchState = 7;
1444                         break;
1445                     }
1446                     if (cLBType == UCharacter.LineBreak.BREAK_SYMBOLS) {
1447                         matchState = 7;
1448                         break;
1449                     }
1450                     if (cLBType == UCharacter.LineBreak.CLOSE_PUNCTUATION) {
1451                         matchState = 9;
1452                         break;
1453                     }
1454                     if (cLBType == UCharacter.LineBreak.CLOSE_PARENTHESIS) {
1455                         matchState = 9;
1456                         break;
1457                     }
1458                     if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
1459                         matchState = 11;
1460                         break;
1461                     }
1462                     if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
1463                         matchState = 11;
1464                         break;
1465                     }
1466 
1467                     break matchLoop;    // Match Complete.
1468                 case 9:
1469                     if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
1470                         matchState = 9;
1471                         break;
1472                     }
1473                     if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
1474                         matchState = 11;
1475                         break;
1476                     }
1477                     if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
1478                         matchState = 11;
1479                         break;
1480                     }
1481                     break matchLoop;    // Match Complete.
1482                 case 11:
1483                     if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
1484                         matchState = 11;
1485                         break;
1486                     }
1487                     break matchLoop;    // Match Complete.
1488                 }
1489             }
1490             if (matchState >= 7) {
1491                 retVals[0] = startIdx;
1492                 retVals[1] = idx;
1493             }
1494             return retVals;
1495         }
1496 
1497 
1498         @Override
charClasses()1499         List  charClasses() {
1500             return fSets;
1501         }
1502     }
1503 
1504 
1505     /**
1506      *
1507      * Sentence Monkey Test Class
1508      *
1509      *
1510      *
1511      */
1512     static class RBBISentenceMonkey extends RBBIMonkeyKind {
1513         StringBuffer         fText;
1514 
1515         UnicodeSet           fSepSet;
1516         UnicodeSet           fFormatSet;
1517         UnicodeSet           fSpSet;
1518         UnicodeSet           fLowerSet;
1519         UnicodeSet           fUpperSet;
1520         UnicodeSet           fOLetterSet;
1521         UnicodeSet           fNumericSet;
1522         UnicodeSet           fATermSet;
1523         UnicodeSet           fSContinueSet;
1524         UnicodeSet           fSTermSet;
1525         UnicodeSet           fCloseSet;
1526         UnicodeSet           fOtherSet;
1527         UnicodeSet           fExtendSet;
1528 
RBBISentenceMonkey()1529         RBBISentenceMonkey() {
1530             fCharProperty  = UProperty.SENTENCE_BREAK;
1531 
1532             //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
1533             //                       set and made into character classes of their own.  For the monkey impl,
1534             //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
1535             fSepSet          = new UnicodeSet("[\\p{Sentence_Break = Sep} \\u000a \\u000d]");
1536             fFormatSet       = new UnicodeSet("[\\p{Sentence_Break = Format}]");
1537             fSpSet           = new UnicodeSet("[\\p{Sentence_Break = Sp}]");
1538             fLowerSet        = new UnicodeSet("[\\p{Sentence_Break = Lower}]");
1539             fUpperSet        = new UnicodeSet("[\\p{Sentence_Break = Upper}]");
1540             fOLetterSet      = new UnicodeSet("[\\p{Sentence_Break = OLetter}]");
1541             fNumericSet      = new UnicodeSet("[\\p{Sentence_Break = Numeric}]");
1542             fATermSet        = new UnicodeSet("[\\p{Sentence_Break = ATerm}]");
1543             fSContinueSet    = new UnicodeSet("[\\p{Sentence_Break = SContinue}]");
1544             fSTermSet        = new UnicodeSet("[\\p{Sentence_Break = STerm}]");
1545             fCloseSet        = new UnicodeSet("[\\p{Sentence_Break = Close}]");
1546             fExtendSet       = new UnicodeSet("[\\p{Sentence_Break = Extend}]");
1547             fOtherSet        = new UnicodeSet();
1548 
1549 
1550             fOtherSet.complement();
1551             fOtherSet.removeAll(fSepSet);
1552             fOtherSet.removeAll(fFormatSet);
1553             fOtherSet.removeAll(fSpSet);
1554             fOtherSet.removeAll(fLowerSet);
1555             fOtherSet.removeAll(fUpperSet);
1556             fOtherSet.removeAll(fOLetterSet);
1557             fOtherSet.removeAll(fNumericSet);
1558             fOtherSet.removeAll(fATermSet);
1559             fOtherSet.removeAll(fSContinueSet);
1560             fOtherSet.removeAll(fSTermSet);
1561             fOtherSet.removeAll(fCloseSet);
1562             fOtherSet.removeAll(fExtendSet);
1563 
1564             fSets.add(fSepSet);         fClassNames.add("Sep");
1565             fSets.add(fFormatSet);      fClassNames.add("Format");
1566 
1567             fSets.add(fSpSet);          fClassNames.add("Sp");
1568             fSets.add(fLowerSet);       fClassNames.add("Lower");
1569             fSets.add(fUpperSet);       fClassNames.add("Upper");
1570             fSets.add(fOLetterSet);     fClassNames.add("OLetter");
1571             fSets.add(fNumericSet);     fClassNames.add("Numeric");
1572             fSets.add(fATermSet);       fClassNames.add("ATerm");
1573             fSets.add(fSContinueSet);   fClassNames.add("SContinue");
1574             fSets.add(fSTermSet);       fClassNames.add("STerm");
1575             fSets.add(fCloseSet);       fClassNames.add("Close");
1576             fSets.add(fOtherSet);       fClassNames.add("Other");
1577             fSets.add(fExtendSet);      fClassNames.add("Extend");
1578         }
1579 
1580 
1581         @Override
charClasses()1582         List  charClasses() {
1583             return fSets;
1584         }
1585 
1586         @Override
setText(StringBuffer s)1587         void   setText(StringBuffer s) {
1588             fText = s;
1589             prepareAppliedRules(s.length());
1590         }
1591 
1592 
1593         //      moveBack()   Find the "significant" code point preceding the index i.
1594         //      Skips over ($Extend | $Format)*
1595         //
moveBack(int i)1596         private int moveBack(int i) {
1597 
1598             if (i <= 0) {
1599                 return -1;
1600             }
1601 
1602             int      c;
1603             int      j = i;
1604             do {
1605                 j = moveIndex32(fText, j, -1);
1606                 c = UTF16.charAt(fText, j);
1607             }
1608             while (j>0 &&(fFormatSet.contains(c) || fExtendSet.contains(c)));
1609             return j;
1610         }
1611 
1612 
moveForward(int i)1613         int moveForward(int i) {
1614             if (i>=fText.length()) {
1615                 return fText.length();
1616             }
1617             int   c;
1618             int   j = i;
1619             do {
1620                 j = moveIndex32(fText, j, 1);
1621                 c = cAt(j);
1622             }
1623             while (c>=0 && (fFormatSet.contains(c) || fExtendSet.contains(c)));
1624             return j;
1625 
1626         }
1627 
cAt(int pos)1628         int cAt(int pos) {
1629             if (pos<0 || pos>=fText.length()) {
1630                 return -1;
1631             }
1632             return UTF16.charAt(fText, pos);
1633         }
1634 
1635         @Override
next(int prevPos)1636         int   next(int prevPos) {
1637             int    /*p0,*/ p1, p2, p3;      // Indices of the significant code points around the
1638             //   break position being tested.  The candidate break
1639             //   location is before p2.
1640             int     breakPos = -1;
1641 
1642             int c0, c1, c2, c3;         // The code points at p0, p1, p2 & p3.
1643             int c;
1644 
1645             // Prev break at end of string.  return DONE.
1646             if (prevPos >= fText.length()) {
1647                 return -1;
1648             }
1649             /*p0 =*/ p1 = p2 = p3 = prevPos;
1650             c3 = UTF16.charAt(fText, prevPos);
1651             c0 = c1 = c2 = 0;
1652 
1653             // Loop runs once per "significant" character position in the input text.
1654             for (;;) {
1655                 // Move all of the positions forward in the input string.
1656                 /*p0 = p1;*/  c0 = c1;
1657                 p1 = p2;  c1 = c2;
1658                 p2 = p3;  c2 = c3;
1659 
1660                 // Advance p3 by  X(Extend | Format)*   Rule 4
1661                 p3 = moveForward(p3);
1662                 c3 = cAt(p3);
1663 
1664                 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
1665                     setAppliedRule(p2, "SB3   CR x LF");
1666                     continue;
1667                 }
1668 
1669                 if (fSepSet.contains(c1)) {
1670                     p2 = p1+1;   // Separators don't combine with Extend or Format
1671                     setAppliedRule(p2, "SB4   Sep  <break>");
1672                     break;
1673                 }
1674 
1675                 if (p2 >= fText.length()) {
1676                     // Reached end of string.  Always a break position.
1677                     setAppliedRule(p2, "SB4   Sep  <break>");
1678                     break;
1679                 }
1680 
1681                 if (p2 == prevPos) {
1682                     // Still warming up the loop.  (won't work with zero length strings, but we don't care)
1683                     setAppliedRule(p2, "SB4   Sep  <break>");
1684                     continue;
1685                 }
1686 
1687                 if (fATermSet.contains(c1) &&  fNumericSet.contains(c2))  {
1688                     setAppliedRule(p2, "SB6   ATerm x Numeric");
1689                     continue;
1690                 }
1691 
1692                 if ((fUpperSet.contains(c0) || fLowerSet.contains(c0)) &&
1693                         fATermSet.contains(c1) && fUpperSet.contains(c2)) {
1694                     setAppliedRule(p2, "SB7   (Upper | Lower) ATerm  x  Uppper");
1695                     continue;
1696                 }
1697 
1698                 //           Note:  Sterm | ATerm are added to the negated part of the expression by a
1699                 //                  note to the Unicode 5.0 documents.
1700                 int p8 = p1;
1701                 while (p8>0 && fSpSet.contains(cAt(p8))) {
1702                     p8 = moveBack(p8);
1703                 }
1704                 while (p8>0 && fCloseSet.contains(cAt(p8))) {
1705                     p8 = moveBack(p8);
1706                 }
1707                 if (fATermSet.contains(cAt(p8))) {
1708                     p8=p2;
1709                     for (;;) {
1710                         c = cAt(p8);
1711                         if (c==-1 || fOLetterSet.contains(c) || fUpperSet.contains(c) ||
1712                                 fLowerSet.contains(c) || fSepSet.contains(c) ||
1713                                 fATermSet.contains(c) || fSTermSet.contains(c))
1714                         {
1715                             setAppliedRule(p2, "SB8   ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep))* Lower");
1716                             break;
1717                         }
1718                         p8 = moveForward(p8);
1719                     }
1720                     if (p8<fText.length() && fLowerSet.contains(cAt(p8))) {
1721                         setAppliedRule(p2, "SB8   ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep))* Lower");
1722                         continue;
1723                     }
1724                 }
1725 
1726                 if (fSContinueSet.contains(c2) || fSTermSet.contains(c2) || fATermSet.contains(c2)) {
1727                     p8 = p1;
1728                     while (setContains(fSpSet, cAt(p8))) {
1729                         p8 = moveBack(p8);
1730                     }
1731                     while (setContains(fCloseSet, cAt(p8))) {
1732                         p8 = moveBack(p8);
1733                     }
1734                     c = cAt(p8);
1735                     if (setContains(fSTermSet, c) || setContains(fATermSet, c)) {
1736                         setAppliedRule(p2, "SB8a  (STerm | ATerm) Close* Sp* x (SContinue | Sterm | ATerm)");
1737                         continue;
1738                     }
1739                 }
1740 
1741 
1742                 int p9 = p1;
1743                 while (p9>0 && fCloseSet.contains(cAt(p9))) {
1744                     p9 = moveBack(p9);
1745                 }
1746                 c = cAt(p9);
1747                 if ((fSTermSet.contains(c) || fATermSet.contains(c))) {
1748                     if (fCloseSet.contains(c2) || fSpSet.contains(c2) || fSepSet.contains(c2)) {
1749                         setAppliedRule(p2, "SB9   (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)");
1750                         continue;
1751                     }
1752                 }
1753 
1754                 int p10 = p1;
1755                 while (p10>0 && fSpSet.contains(cAt(p10))) {
1756                     p10 = moveBack(p10);
1757                 }
1758                 while (p10>0 && fCloseSet.contains(cAt(p10))) {
1759                     p10 = moveBack(p10);
1760                 }
1761                 if (fSTermSet.contains(cAt(p10)) || fATermSet.contains(cAt(p10))) {
1762                     if (fSpSet.contains(c2) || fSepSet.contains(c2)) {
1763                         setAppliedRule(p2, "SB10  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)");
1764                         continue;
1765                     }
1766                 }
1767 
1768                 int p11 = p1;
1769                 if (p11>0 && fSepSet.contains(cAt(p11))) {
1770                     p11 = moveBack(p11);
1771                 }
1772                 while (p11>0 && fSpSet.contains(cAt(p11))) {
1773                     p11 = moveBack(p11);
1774                 }
1775                 while (p11>0 && fCloseSet.contains(cAt(p11))) {
1776                     p11 = moveBack(p11);
1777                 }
1778                 if (fSTermSet.contains(cAt(p11)) || fATermSet.contains(cAt(p11))) {
1779                     setAppliedRule(p2, "SB11  (STerm | ATerm) Close* Sp*   <break>");
1780                     break;
1781                 }
1782 
1783                 setAppliedRule(p2, "SB12  Any x Any");
1784                 continue;
1785             }
1786             breakPos = p2;
1787             return breakPos;
1788         }
1789     }
1790 
1791 
1792     /**
1793      * Move an index into a string by n code points.
1794      *   Similar to UTF16.moveCodePointOffset, but without the exceptions, which were
1795      *   complicating usage.
1796      * @param s   a Text string
1797      * @param pos The starting code unit index into the text string
1798      * @param amt The amount to adjust the string by.
1799      * @return    The adjusted code unit index, pinned to the string's length, or
1800      *            unchanged if input index was outside of the string.
1801      */
moveIndex32(StringBuffer s, int pos, int amt)1802     static int moveIndex32(StringBuffer s, int pos, int amt) {
1803         int i;
1804         char  c;
1805         if (amt>0) {
1806             for (i=0; i<amt; i++) {
1807                 if (pos >= s.length()) {
1808                     return s.length();
1809                 }
1810                 c = s.charAt(pos);
1811                 pos++;
1812                 if (UTF16.isLeadSurrogate(c) && pos < s.length()) {
1813                     c = s.charAt(pos);
1814                     if (UTF16.isTrailSurrogate(c)) {
1815                         pos++;
1816                     }
1817                 }
1818             }
1819         } else {
1820             for (i=0; i>amt; i--) {
1821                 if (pos <= 0) {
1822                     return 0;
1823                 }
1824                 pos--;
1825                 c = s.charAt(pos);
1826                 if (UTF16.isTrailSurrogate(c) && pos >= 0) {
1827                     c = s.charAt(pos);
1828                     if (UTF16.isLeadSurrogate(c)) {
1829                         pos--;
1830                     }
1831                 }
1832             }
1833         }
1834         return pos;
1835     }
1836 
1837     /**
1838      * No-exceptions form of UnicodeSet.contains(c).
1839      *    Simplifies loops that terminate with an end-of-input character value.
1840      * @param s  A unicode set
1841      * @param c  A code point value
1842      * @return   true if the set contains c.
1843      */
setContains(UnicodeSet s, int c)1844     static boolean setContains(UnicodeSet s, int c) {
1845         if (c<0 || c>UTF16.CODEPOINT_MAX_VALUE ) {
1846             return false;
1847         }
1848         return s.contains(c);
1849     }
1850 
1851 
1852     /**
1853      * return the index of the next code point in the input text.
1854      * @param i the preceding index
1855      */
nextCP(StringBuffer s, int i)1856     static int  nextCP(StringBuffer s, int i) {
1857         if (i == -1) {
1858             // End of Input indication.  Continue to return end value.
1859             return -1;
1860         }
1861         int  retVal = i + 1;
1862         if (retVal > s.length()) {
1863             return -1;
1864         }
1865         int  c = UTF16.charAt(s, i);
1866         if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && UTF16.isLeadSurrogate(s.charAt(i))) {
1867             retVal++;
1868         }
1869         return retVal;
1870     }
1871 
1872 
1873     /**
1874      * random number generator.  Not using Java's built-in Randoms for two reasons:
1875      *    1.  Using this code allows obtaining the same sequences as those from the ICU4C monkey test.
1876      *    2.  We need to get and restore the seed from values occurring in the middle
1877      *        of a long sequence, to more easily reproduce failing cases.
1878      */
1879     private static int m_seed = 1;
m_rand()1880     private static int  m_rand()
1881     {
1882         m_seed = m_seed * 1103515245 + 12345;
1883         return (m_seed >>> 16) % 32768;
1884     }
1885 
1886     // Helper function for formatting error output.
1887     //   Append a string into a fixed-size field in a StringBuffer.
1888     //   Blank-pad the string if it is shorter than the field.
1889     //   Truncate the source string if it is too long.
1890     //
appendToBuf(StringBuffer dest, String src, int fieldLen)1891     private static void appendToBuf(StringBuffer dest, String src, int fieldLen) {
1892         int appendLen = src.length();
1893         if (appendLen >= fieldLen) {
1894             dest.append(src.substring(0, fieldLen));
1895         } else {
1896             dest.append(src);
1897             while (appendLen < fieldLen) {
1898                 dest.append(' ');
1899                 appendLen++;
1900             }
1901         }
1902     }
1903 
1904     // Helper function for formatting error output.
1905     // Display a code point in "\\uxxxx" or "\Uxxxxxxxx" format
1906     @SuppressWarnings("unused")
appendCharToBuf(StringBuffer dest, int c, int fieldLen)1907     private static void appendCharToBuf(StringBuffer dest, int c, int fieldLen) {
1908         String hexChars = "0123456789abcdef";
1909         if (c < 0x10000) {
1910             dest.append("\\u");
1911             for (int bn=12; bn>=0; bn-=4) {
1912                 dest.append(hexChars.charAt(((c)>>bn)&0xf));
1913             }
1914             appendToBuf(dest, " ", fieldLen-6);
1915         } else {
1916             dest.append("\\U");
1917             for (int bn=28; bn>=0; bn-=4) {
1918                 dest.append(hexChars.charAt(((c)>>bn)&0xf));
1919             }
1920             appendToBuf(dest, " ", fieldLen-10);
1921 
1922         }
1923     }
1924 
1925     /**
1926      *  Run a RBBI monkey test.  Common routine, for all break iterator types.
1927      *    Parameters:
1928      *       bi      - the break iterator to use
1929      *       mk      - MonkeyKind, abstraction for obtaining expected results
1930      *       name    - Name of test (char, word, etc.) for use in error messages
1931      *       seed    - Seed for starting random number generator (parameter from user)
1932      *       numIterations
1933      */
RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int numIterations)1934     void RunMonkey(BreakIterator  bi, RBBIMonkeyKind mk, String name, int  seed, int numIterations) {
1935         int              TESTSTRINGLEN = 500;
1936         StringBuffer     testText         = new StringBuffer();
1937         int              numCharClasses;
1938         List             chClasses;
1939         @SuppressWarnings("unused")
1940         int              expectedCount    = 0;
1941         boolean[]        expectedBreaks   = new boolean[TESTSTRINGLEN*2 + 1];
1942         boolean[]        forwardBreaks    = new boolean[TESTSTRINGLEN*2 + 1];
1943         boolean[]        reverseBreaks    = new boolean[TESTSTRINGLEN*2 + 1];
1944         boolean[]        isBoundaryBreaks = new boolean[TESTSTRINGLEN*2 + 1];
1945         boolean[]        followingBreaks  = new boolean[TESTSTRINGLEN*2 + 1];
1946         boolean[]        precedingBreaks  = new boolean[TESTSTRINGLEN*2 + 1];
1947         int              i;
1948         int              loopCount        = 0;
1949         boolean          printTestData    = false;
1950         boolean          printBreaksFromBI = false;
1951 
1952         m_seed = seed;
1953 
1954         numCharClasses = mk.charClasses().size();
1955         chClasses      = mk.charClasses();
1956 
1957         // Verify that the character classes all have at least one member.
1958         for (i=0; i<numCharClasses; i++) {
1959             UnicodeSet s = (UnicodeSet)chClasses.get(i);
1960             if (s == null || s.size() == 0) {
1961                 errln("Character Class " + i + " is null or of zero size.");
1962                 return;
1963             }
1964         }
1965 
1966         //--------------------------------------------------------------------------------------------
1967         //
1968         //  Debugging settings.  Comment out everything in the following block for normal operation
1969         //
1970         //--------------------------------------------------------------------------------------------
1971         // numIterations = -1;
1972         // numIterations = 10000;   // Same as exhaustive.
1973         // RuleBasedBreakIterator_New.fTrace = true;
1974         // m_seed = 859056465;
1975         // TESTSTRINGLEN = 50;
1976         // printTestData = true;
1977         // printBreaksFromBI = true;
1978         // ((RuleBasedBreakIterator_New)bi).dump();
1979 
1980         //--------------------------------------------------------------------------------------------
1981         //
1982         //  End of Debugging settings.
1983         //
1984         //--------------------------------------------------------------------------------------------
1985 
1986         // For minimizing width of class name output.
1987         int classNameSize = mk.maxClassNameSize();
1988 
1989         int  dotsOnLine = 0;
1990         while (loopCount < numIterations || numIterations == -1) {
1991             if (numIterations == -1 && loopCount % 10 == 0) {
1992                 // If test is running in an infinite loop, display a periodic tic so
1993                 //   we can tell that it is making progress.
1994                 System.out.print(".");
1995                 if (dotsOnLine++ >= 80){
1996                     System.out.println();
1997                     dotsOnLine = 0;
1998                 }
1999             }
2000             // Save current random number seed, so that we can recreate the random numbers
2001             //   for this loop iteration in event of an error.
2002             seed = m_seed;
2003 
2004             testText.setLength(0);
2005             // Populate a test string with data.
2006             if (printTestData) {
2007                 System.out.println("Test Data string ...");
2008             }
2009             for (i=0; i<TESTSTRINGLEN; i++) {
2010                 int        aClassNum = m_rand() % numCharClasses;
2011                 UnicodeSet classSet  = (UnicodeSet)chClasses.get(aClassNum);
2012                 int        charIdx   = m_rand() % classSet.size();
2013                 int        c         = classSet.charAt(charIdx);
2014                 if (c < 0) {   // TODO:  deal with sets containing strings.
2015                     errln("c < 0");
2016                 }
2017                 // Do not assemble a supplementary character from randomly generated separate surrogates.
2018                 //   (It could be a dictionary character)
2019                 if (c < 0x10000 && Character.isLowSurrogate((char)c) && testText.length() > 0 &&
2020                         Character.isHighSurrogate(testText.charAt(testText.length()-1))) {
2021                     continue;
2022                 }
2023                 testText.appendCodePoint(c);
2024                 if (printTestData) {
2025                     System.out.print(Integer.toHexString(c) + " ");
2026                 }
2027             }
2028             if (printTestData) {
2029                 System.out.println();
2030             }
2031 
2032             Arrays.fill(expectedBreaks, false);
2033             Arrays.fill(forwardBreaks, false);
2034             Arrays.fill(reverseBreaks, false);
2035             Arrays.fill(isBoundaryBreaks, false);
2036             Arrays.fill(followingBreaks, false);
2037             Arrays.fill(precedingBreaks, false);
2038 
2039             // Calculate the expected results for this test string and reset applied rules.
2040             mk.setText(testText);
2041             expectedCount = 0;
2042             expectedBreaks[0] = true;
2043             int breakPos = 0;
2044             int lastBreakPos = -1;
2045             for (;;) {
2046                 lastBreakPos = breakPos;
2047                 breakPos = mk.next(breakPos);
2048                 if (breakPos == -1) {
2049                     break;
2050                 }
2051                 if (breakPos > testText.length()) {
2052                     errln("breakPos > testText.length()");
2053                 }
2054                 if (lastBreakPos >= breakPos) {
2055                     errln("Next() not increasing.");
2056                     // break;
2057                 }
2058                 expectedBreaks[breakPos] = true;
2059             }
2060 
2061             // Find the break positions using forward iteration
2062             if (printBreaksFromBI) {
2063                 System.out.println("Breaks from BI...");
2064             }
2065             bi.setText(testText.toString());
2066             for (i=bi.first(); i != BreakIterator.DONE; i=bi.next()) {
2067                 if (i < 0 || i > testText.length()) {
2068                     errln(name + " break monkey test: Out of range value returned by breakIterator::next()");
2069                     break;
2070                 }
2071                 if (printBreaksFromBI) {
2072                     System.out.print(Integer.toHexString(i) + " ");
2073                 }
2074                 forwardBreaks[i] = true;
2075             }
2076             if (printBreaksFromBI) {
2077                 System.out.println();
2078             }
2079 
2080             // Find the break positions using reverse iteration
2081             for (i=bi.last(); i != BreakIterator.DONE; i=bi.previous()) {
2082                 if (i < 0 || i > testText.length()) {
2083                     errln(name + " break monkey test: Out of range value returned by breakIterator.next()" + name);
2084                     break;
2085                 }
2086                 reverseBreaks[i] = true;
2087             }
2088 
2089             // Find the break positions using isBoundary() tests.
2090             for (i=0; i<=testText.length(); i++) {
2091                 isBoundaryBreaks[i] = bi.isBoundary(i);
2092             }
2093 
2094             // Find the break positions using the following() function.
2095             lastBreakPos = 0;
2096             followingBreaks[0] = true;
2097             for (i=0; i<testText.length(); i++) {
2098                 breakPos = bi.following(i);
2099                 if (breakPos <= i ||
2100                         breakPos < lastBreakPos ||
2101                         breakPos > testText.length() ||
2102                         breakPos > lastBreakPos && lastBreakPos > i ) {
2103                     errln(name + " break monkey test: " +
2104                             "Out of range value returned by BreakIterator::following().\n" +
2105                             "index=" + i + "following returned=" + breakPos +
2106                             "lastBreak=" + lastBreakPos);
2107                     precedingBreaks[i] = !expectedBreaks[i];   // Forces an error.
2108                 } else {
2109                     followingBreaks[breakPos] = true;
2110                     lastBreakPos = breakPos;
2111                 }
2112             }
2113 
2114             // Find the break positions using the preceding() function.
2115             lastBreakPos = testText.length();
2116             precedingBreaks[testText.length()] = true;
2117             for (i=testText.length(); i>0; i--) {
2118                 breakPos = bi.preceding(i);
2119                 if (breakPos >= i ||
2120                         breakPos > lastBreakPos ||
2121                         breakPos < 0 ||
2122                         breakPos < lastBreakPos && lastBreakPos < i ) {
2123                     errln(name + " break monkey test: " +
2124                             "Out of range value returned by BreakIterator::preceding().\n" +
2125                             "index=" + i + "preceding returned=" + breakPos +
2126                             "lastBreak=" + lastBreakPos);
2127                     precedingBreaks[i] = !expectedBreaks[i];   // Forces an error.
2128                 } else {
2129                     precedingBreaks[breakPos] = true;
2130                     lastBreakPos = breakPos;
2131                 }
2132             }
2133 
2134 
2135 
2136             // Compare the expected and actual results.
2137             for (i=0; i<=testText.length(); i++) {
2138                 String errorType = null;
2139                 boolean[] currentBreakData = null;
2140                 if  (forwardBreaks[i] != expectedBreaks[i]) {
2141                     errorType = "next()";
2142                     currentBreakData = forwardBreaks;
2143                 } else if (reverseBreaks[i] != forwardBreaks[i]) {
2144                     errorType = "previous()";
2145                     currentBreakData = reverseBreaks;
2146                 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
2147                     errorType = "isBoundary()";
2148                     currentBreakData = isBoundaryBreaks;
2149                 } else if (followingBreaks[i] != expectedBreaks[i]) {
2150                     errorType = "following()";
2151                     currentBreakData = followingBreaks;
2152                 } else if (precedingBreaks[i] != expectedBreaks[i]) {
2153                     errorType = "preceding()";
2154                     currentBreakData = precedingBreaks;
2155                 }
2156 
2157                 if (errorType != null) {
2158                     // Format a range of the test text that includes the failure as
2159                     //  a data item that can be included in the rbbi test data file.
2160 
2161                     // Start of the range is the last point where expected and actual results
2162                     //   both agreed that there was a break position.
2163                     int startContext = i;
2164                     int count = 0;
2165                     for (;;) {
2166                         if (startContext==0) { break; }
2167                         startContext --;
2168                         if (expectedBreaks[startContext]) {
2169                             if (count == 2) break;
2170                             count ++;
2171                         }
2172                     }
2173 
2174                     // End of range is two expected breaks past the start position.
2175                     int endContext = i + 1;
2176                     int ci;
2177                     for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
2178                         for (;;) {
2179                             if (endContext >= testText.length()) {break;}
2180                             if (expectedBreaks[endContext-1]) {
2181                                 if (count == 0) break;
2182                                 count --;
2183                             }
2184                             endContext ++;
2185                         }
2186                     }
2187 
2188                     // Formatting of each line includes:
2189                     //   character code
2190                     //   reference break: '|' -> a break, '.' -> no break
2191                     //   actual break:    '|' -> a break, '.' -> no break
2192                     //   (name of character class)
2193                     //   Unicode name of character
2194                     //   '--→' indicates location of the difference.
2195 
2196                     StringBuilder buffer = new StringBuilder();
2197                     buffer.append("\n")
2198                         .append((expectedBreaks[i] ? "Break expected but not found." : "Break found but not expected."))
2199                         .append(
2200                             String.format(" at index %d. Parameters to reproduce: @\"type=%s  seed=%d  loop=1\"\n",
2201                               i, name, seed));
2202 
2203                     int c;  // Char from test data
2204                     for (ci = startContext;  ci <= endContext && ci != -1;  ci = nextCP(testText, ci)) {
2205 
2206                         c = testText.codePointAt(ci);
2207                         buffer.append((ci == i) ? " --→" : "    ")
2208                             .append(String.format(" %3d : ", ci))
2209                             .append(!expectedBreaks[ci] ? " . " : " | ")  // Reference break
2210                             .append(!currentBreakData[ci] ? " . " : " | "); // Actual break
2211 
2212                         // BMP or SMP character in hex
2213                         if (c >= 0x10000) {
2214                             buffer.append("\\U").append(String.format("%08x", c));
2215                         } else {
2216                             buffer.append("    \\u").append(String.format("%04x", c));
2217                         }
2218 
2219                         buffer.append(
2220                             String.format(String.format(" %%-%ds", classNameSize),
2221                               mk.classNameFromCodepoint(c)))
2222                             .append(String.format(" %-40s", mk.getAppliedRule(ci)))
2223                             .append(String.format(" %-40s\n", UCharacter.getExtendedName(c)));
2224 
2225                         if (ci >= endContext) { break; }
2226                     }
2227                     errln(buffer.toString());
2228 
2229                     break;
2230                 }
2231             }
2232 
2233             loopCount++;
2234         }
2235     }
2236 
2237     // Test parameters are passed on the command line, or
2238     // via the Eclipse Run Configuration settings, arguments tab, VM parameters.
2239     // For example,
2240     //      -ea -Dseed=554654 -Dloop=1
2241 
2242     @Test
TestCharMonkey()2243     public void TestCharMonkey() {
2244         int loopCount = getIntProperty("loop", isQuick() ? 500 : 10000);
2245         int seed = getIntProperty("seed", 1);
2246 
2247         RBBICharMonkey  m = new RBBICharMonkey();
2248         BreakIterator   bi = BreakIterator.getCharacterInstance(Locale.US);
2249         RunMonkey(bi, m, "char", seed, loopCount);
2250     }
2251 
2252     @Test
TestWordMonkey()2253     public void TestWordMonkey() {
2254         int loopCount = getIntProperty("loop", isQuick() ? 500 : 10000);
2255         int seed = getIntProperty("seed", 1);
2256 
2257         logln("Word Break Monkey Test");
2258         RBBIWordMonkey  m = new RBBIWordMonkey();
2259         BreakIterator   bi = BreakIterator.getWordInstance(Locale.US);
2260         RunMonkey(bi, m, "word", seed, loopCount);
2261     }
2262 
2263     @Test
TestLineMonkey()2264     public void TestLineMonkey() {
2265         int loopCount = getIntProperty("loop", isQuick() ? 500 : 10000);
2266         int seed = getIntProperty("seed", 1);
2267 
2268         logln("Line Break Monkey Test");
2269         RBBILineMonkey  m = new RBBILineMonkey();
2270         BreakIterator   bi = BreakIterator.getLineInstance(Locale.US);
2271         try {
2272             RunMonkey(bi, m, "line", seed, loopCount);
2273         } catch(IllegalArgumentException e) {
2274             if (e.getMessage().equals("Invalid code point U+-000001")) {
2275                 // Looks like you used class UnicodeSet instead of class XUnicodeSet
2276                 // (note the leading 'X').
2277                 // See the comment before the definition of class XUnicodeSet.
2278                 errln("Probable program error: use XUnicodeSet in RBBILineMonkey code");
2279             } else {
2280                 throw e;
2281             }
2282         }
2283     }
2284 
2285     @Test
TestSentMonkey()2286     public void TestSentMonkey() {
2287         int loopCount = getIntProperty("loop", isQuick() ? 500 : 3000);
2288         int seed = getIntProperty("seed", 1);
2289 
2290         logln("Sentence Break Monkey Test");
2291         RBBISentenceMonkey  m = new RBBISentenceMonkey();
2292         BreakIterator   bi = BreakIterator.getSentenceInstance(Locale.US);
2293         RunMonkey(bi, m, "sent", seed, loopCount);
2294     }
2295     //
2296     //  Round-trip monkey tests.
2297     //  Verify that break iterators created from the rule source from the default
2298     //    break iterators still pass the monkey test for the iterator type.
2299     //
2300     //  This is a major test for the Rule Compiler.  The default break iterators are built
2301     //  from pre-compiled binary rule data that was created using ICU4C; these
2302     //  round-trip rule recompile tests verify that the Java rule compiler can
2303     //  rebuild break iterators from the original source rules.
2304     //
2305     @Test
TestRTCharMonkey()2306     public void TestRTCharMonkey() {
2307         int loopCount = getIntProperty("loop", isQuick() ? 200 : 2000);
2308         int seed = getIntProperty("seed", 1);
2309 
2310         RBBICharMonkey  m = new RBBICharMonkey();
2311         BreakIterator   bi = BreakIterator.getCharacterInstance(Locale.US);
2312         String rules = bi.toString();
2313         BreakIterator rtbi = new RuleBasedBreakIterator(rules);
2314         RunMonkey(rtbi, m, "char", seed, loopCount);
2315     }
2316 
2317     @Test
TestRTWordMonkey()2318     public void TestRTWordMonkey() {
2319         int loopCount = getIntProperty("loop", isQuick() ? 200 : 2000);
2320         int seed = getIntProperty("seed", 1);
2321 
2322         logln("Word Break Monkey Test");
2323         RBBIWordMonkey  m = new RBBIWordMonkey();
2324         BreakIterator   bi = BreakIterator.getWordInstance(Locale.US);
2325         String rules = bi.toString();
2326         BreakIterator rtbi = new RuleBasedBreakIterator(rules);
2327         RunMonkey(rtbi, m, "word", seed, loopCount);
2328     }
2329 
2330     @Test
TestRTLineMonkey()2331     public void TestRTLineMonkey() {
2332         int loopCount = getIntProperty("loop", isQuick() ? 200 : 2000);
2333         int seed = getIntProperty("seed", 1);
2334 
2335         logln("Line Break Monkey Test");
2336         RBBILineMonkey  m = new RBBILineMonkey();
2337         BreakIterator   bi = BreakIterator.getLineInstance(Locale.US);
2338         String rules = bi.toString();
2339         BreakIterator rtbi = new RuleBasedBreakIterator(rules);
2340         try {
2341             RunMonkey(rtbi, m, "line", seed, loopCount);
2342         } catch(IllegalArgumentException e) {
2343             if (e.getMessage().equals("Invalid code point U+-000001")) {
2344                 // Looks like you used class UnicodeSet instead of class XUnicodeSet
2345                 // (note the leading 'X').
2346                 // See the comment before the definition of class XUnicodeSet.
2347                 errln("Probable program error: use XUnicodeSet in RBBILineMonkey code");
2348             } else {
2349                 throw e;
2350             }
2351         }
2352     }
2353 
2354     @Test
TestRTSentMonkey()2355     public void TestRTSentMonkey() {
2356         int loopCount = getIntProperty("loop", isQuick() ? 200 : 1000);
2357         int seed = getIntProperty("seed", 1);
2358 
2359         logln("Sentence Break Monkey Test");
2360         RBBISentenceMonkey  m = new RBBISentenceMonkey();
2361         BreakIterator   bi = BreakIterator.getSentenceInstance(Locale.US);
2362         String rules = bi.toString();
2363         BreakIterator rtbi = new RuleBasedBreakIterator(rules);
2364         RunMonkey(rtbi, m, "sent", seed, loopCount);
2365     }
2366 }
2367 
2368