• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*************************************************************************
4  * Copyright (c) 2016, International Business Machines
5  * Corporation and others. All Rights Reserved.
6  *************************************************************************
7 */
8 #ifndef RBBIMONKEYTEST_H
9 #define RBBIMONKEYTEST_H
10 
11 #include "unicode/utypes.h"
12 
13 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING
14 
15 #include "intltest.h"
16 
17 #include "unicode/rbbi.h"
18 #include "unicode/regex.h"
19 #include "unicode/uniset.h"
20 #include "unicode/unistr.h"
21 #include "unicode/uobject.h"
22 
23 #include "simplethread.h"
24 #include "ucbuf.h"
25 #include "uhash.h"
26 #include "uvector.h"
27 
28 // RBBI Monkey Test. Run break iterators against randomly generated strings, compare results with
29 //                   an independent reference implementation.
30 //
31 //         The monkey test can be run with parameters, e.g.
32 //              intltest rbbi/RBBIMonkeyTest@loop=-1,rules=word.txt
33 //         will run word break testing in an infinite loop.
34 //         Summary of options
35 //               rules=name             Test against the named reference rule file.
36 //                                     Files are found in source/test/testdata/break_rules
37 //               loop=nnn              Loop nnn times. -1 for no limit. loop of 1 is useful for debugging.
38 //               seed=nnnn             Random number generator seed. Allows recreation of a failure.
39 //                                     Error messages include the necessary seed value.
40 //               verbose               Display details of a failure. Useful for debugging. Use with loop=1.
41 //               expansions            Debug option, show expansions of rules and sets.
42 //
43 //  TODO:
44 //     Develop a tailoring format.
45 //     Hook to old tests that use monkey impl to get expected data.
46 //     Remove old tests.
47 
48 class BreakRules;       // Forward declaration
49 class RBBIMonkeyImpl;
50 
51 /**
52  * Test the RuleBasedBreakIterator class giving different rules
53  */
54 class RBBIMonkeyTest: public IntlTest {
55   public:
56     RBBIMonkeyTest();
57     virtual ~RBBIMonkeyTest();
58 
59     void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL ) override;
60     void testMonkey();
61 
62 
63   private:
64     const char *fParams;                  // Copy of user parameters passed in from IntlTest.
65 
66 
67     void testRules(const char *ruleFile);
68     static UBool getIntParam(UnicodeString name, UnicodeString &params, int64_t &val, UErrorCode &status);
69     static UBool getStringParam(UnicodeString name, UnicodeString &params, CharString &dest, UErrorCode &status);
70     static UBool getBoolParam(UnicodeString name, UnicodeString &params, UBool &dest, UErrorCode &status);
71 
72 };
73 
74 // The following classes are internal to the RBBI Monkey Test implementation.
75 
76 
77 
78 //  class CharClass    Represents a single character class from the source break rules.
79 //                     Inherits from UObject because instances are adopted by UHashtable, which ultimately
80 //                     deletes them using hash's object deleter function.
81 
82 class CharClass: public UObject {
83   public:
84     UnicodeString                fName;
85     UnicodeString                fOriginalDef;    // set definition as it appeared in user supplied rules.
86     UnicodeString                fExpandedDef;    // set definition with any embedded named sets replaced by their defs, recursively.
87     LocalPointer<const UnicodeSet>     fSet;
CharClass(const UnicodeString & name,const UnicodeString & originalDef,const UnicodeString & expandedDef,const UnicodeSet * set)88     CharClass(const UnicodeString &name, const UnicodeString &originalDef, const UnicodeString &expandedDef, const UnicodeSet *set) :
89             fName(name), fOriginalDef(originalDef), fExpandedDef(expandedDef), fSet(set) {}
90 };
91 
92 
93 // class BreakRule    represents a single rule from a set of break rules.
94 //                    Each rule has the set definitions expanded, and
95 //                    is compiled to a regular expression.
96 
97 class BreakRule: public UObject {
98   public:
99     BreakRule();
100     ~BreakRule();
101     UnicodeString    fName;                            // Name of the rule.
102     UnicodeString    fRule;                            // Rule expression, excluding the name, as written in user source.
103     UnicodeString    fExpandedRule;                    // Rule expression after expanding the set definitions.
104     LocalPointer<RegexMatcher>  fRuleMatcher;          // Regular expression that matches the rule.
105     bool             fInitialMatchOnly = false;        // True if rule begins with '^', meaning no chaining.
106 };
107 
108 
109 // class BreakRules    represents a complete set of break rules, possibly tailored,
110 //                     compiled from testdata break rules.
111 
112 class BreakRules: public UObject {
113   public:
114     BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status);
115     ~BreakRules();
116 
117     void compileRules(UCHARBUF *rules, UErrorCode &status);
118 
119     const CharClass *getClassForChar(UChar32 c, int32_t *iter=NULL) const;
120 
121 
122     RBBIMonkeyImpl    *fMonkeyImpl;        // Pointer back to the owning MonkeyImpl instance.
123     icu::UVector       fBreakRules;        // Contents are of type (BreakRule *).
124 
125     LocalUHashtablePointer fCharClasses;   // Key is set name (UnicodeString).
126                                            // Value is (CharClass *)
127     LocalPointer<UVector>  fCharClassList; // Char Classes, same contents as fCharClasses values,
128                                            //   but in a vector so they can be accessed by index.
129     UnicodeSet         fDictionarySet;     // Dictionary set, empty if none is defined.
130     Locale             fLocale;
131     UBreakIteratorType fType;
132 
133     CharClass *addCharClass(const UnicodeString &name, const UnicodeString &def, UErrorCode &status);
134     void addRule(const UnicodeString &name, const UnicodeString &def, UErrorCode &status);
135     bool setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status);
136     RuleBasedBreakIterator *createICUBreakIterator(UErrorCode &status);
137 
138     LocalPointer<RegexMatcher> fSetRefsMatcher;
139     LocalPointer<RegexMatcher> fCommentsMatcher;
140     LocalPointer<RegexMatcher> fClassDefMatcher;
141     LocalPointer<RegexMatcher> fRuleDefMatcher;
142 };
143 
144 
145 // class MonkeyTestData    represents a randomly synthesized test data string together
146 //                         with the expected break positions obtained by applying
147 //                         the test break rules.
148 
149 class MonkeyTestData: public UObject {
150   public:
MonkeyTestData()151     MonkeyTestData() {}
~MonkeyTestData()152     ~MonkeyTestData() {}
153     void set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status);
154     void clearActualBreaks();
155     void dump(int32_t around = -1) const;
156 
157     uint32_t               fRandomSeed;        // The initial seed value from the random number genererator.
158     const BreakRules      *fBkRules;           // The break rules used to generate this data.
159     UnicodeString          fString;            // The text.
160     UnicodeString          fExpectedBreaks;    // Breaks as found by the reference rules.
161                                                //     Parallel to fString. Non-zero if break preceding.
162     UnicodeString          fActualBreaks;      // Breaks as found by ICU break iterator.
163     UnicodeString          fRuleForPosition;   // Index into BreakRules.fBreakRules of rule that applied at each position.
164                                                // Also parallel to fString.
165     UnicodeString          f2ndRuleForPos;     // As above. A 2nd rule applies when the preceding rule
166                                                //   didn't cause a break, and a subsequent rule match starts
167                                                //   on the last code point of the preceding match.
168 
169 };
170 
171 
172 
173 
174 // class RBBIMonkeyImpl     holds (some indirectly) everything associated with running a monkey
175 //                          test for one set of break rules.
176 //
177 //                          When running RBBIMonkeyTest with multiple threads, there is a 1:1 correspondence
178 //                          between instances of RBBIMonkeyImpl and threads.
179 //
180 class RBBIMonkeyImpl: public UObject {
181   public:
182     RBBIMonkeyImpl(UErrorCode &status);
183     ~RBBIMonkeyImpl();
184 
185     void setup(const char *ruleFileName, UErrorCode &status);
186 
187     void startTest();
188     void runTest();
189     void join();
190 
191     LocalUCHARBUFPointer                 fRuleCharBuffer;         // source file contents of the reference rules.
192     LocalPointer<BreakRules>             fRuleSet;
193     LocalPointer<RuleBasedBreakIterator> fBI;
194     LocalPointer<MonkeyTestData>         fTestData;
195     IntlTest::icu_rand                   fRandomGenerator;
196     const char                          *fRuleFileName;
197     UBool                                fVerbose;                 // True to do long dump of failing data.
198     int32_t                              fLoopCount;
199 
200     UBool                                fDumpExpansions;          // Debug flag to output epananded form of rules and sets.
201 
202     enum CheckDirection {
203         FORWARD = 1,
204         REVERSE = 2
205     };
206     void clearActualBreaks();
207     void testForwards(UErrorCode &status);
208     void testPrevious(UErrorCode &status);
209     void testFollowing(UErrorCode &status);
210     void testPreceding(UErrorCode &status);
211     void testIsBoundary(UErrorCode &status);
212     void testIsBoundaryRandom(UErrorCode &status);
213     void checkResults(const char *msg, CheckDirection dir, UErrorCode &status);
214 
215     class RBBIMonkeyThread: public SimpleThread {
216       private:
217         RBBIMonkeyImpl *fMonkeyImpl;
218       public:
RBBIMonkeyThread(RBBIMonkeyImpl * impl)219         RBBIMonkeyThread(RBBIMonkeyImpl *impl) : fMonkeyImpl(impl) {}
run()220         void run() U_OVERRIDE { fMonkeyImpl->runTest(); }
221     };
222   private:
223     void openBreakRules(const char *fileName, UErrorCode &status);
224     RBBIMonkeyThread fThread;
225 
226 };
227 
228 #endif /* !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING */
229 
230 #endif  //  RBBIMONKEYTEST_H
231