1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /************************************************************************* 4 * Copyright (c) 2016, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 ************************************************************************* 7 */ 8 #ifndef RBBIMONKEYTEST_H 9 #define RBBIMONKEYTEST_H 10 11 #include "unicode/utypes.h" 12 13 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING 14 15 #include "intltest.h" 16 17 #include "unicode/rbbi.h" 18 #include "unicode/regex.h" 19 #include "unicode/uniset.h" 20 #include "unicode/unistr.h" 21 #include "unicode/uobject.h" 22 23 #include "simplethread.h" 24 #include "ucbuf.h" 25 #include "uhash.h" 26 #include "uvector.h" 27 28 // RBBI Monkey Test. Run break iterators against randomly generated strings, compare results with 29 // an independent reference implementation. 30 // 31 // The monkey test can be run with parameters, e.g. 32 // intltest rbbi/RBBIMonkeyTest@loop=-1,rules=word.txt 33 // will run word break testing in an infinite loop. 34 // Summary of options 35 // rules=name Test against the named reference rule file. 36 // Files are found in source/test/testdata/break_rules 37 // loop=nnn Loop nnn times. -1 for no limit. loop of 1 is useful for debugging. 38 // seed=nnnn Random number generator seed. Allows recreation of a failure. 39 // Error messages include the necessary seed value. 40 // verbose Display details of a failure. Useful for debugging. Use with loop=1. 41 // expansions Debug option, show expansions of rules and sets. 42 // 43 // TODO: 44 // Develop a tailoring format. 45 // Hook to old tests that use monkey impl to get expected data. 46 // Remove old tests. 47 48 class BreakRules; // Forward declaration 49 class RBBIMonkeyImpl; 50 51 /** 52 * Test the RuleBasedBreakIterator class giving different rules 53 */ 54 class RBBIMonkeyTest: public IntlTest { 55 public: 56 RBBIMonkeyTest(); 57 virtual ~RBBIMonkeyTest(); 58 59 void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL ); 60 void testMonkey(); 61 62 63 private: 64 const char *fParams; // Copy of user parameters passed in from IntlTest. 65 66 67 void testRules(const char *ruleFile); 68 static UBool getIntParam(UnicodeString name, UnicodeString ¶ms, int64_t &val, UErrorCode &status); 69 static UBool getStringParam(UnicodeString name, UnicodeString ¶ms, CharString &dest, UErrorCode &status); 70 static UBool getBoolParam(UnicodeString name, UnicodeString ¶ms, UBool &dest, UErrorCode &status); 71 72 }; 73 74 // The following classes are internal to the RBBI Monkey Test implementation. 75 76 77 78 // class CharClass Represents a single character class from the source break rules. 79 // Inherits from UObject because instances are adopted by UHashtable, which ultimately 80 // deletes them using hash's object deleter function. 81 82 class CharClass: public UObject { 83 public: 84 UnicodeString fName; 85 UnicodeString fOriginalDef; // set definition as it appeared in user supplied rules. 86 UnicodeString fExpandedDef; // set definition with any embedded named sets replaced by their defs, recursively. 87 LocalPointer<const UnicodeSet> fSet; CharClass(const UnicodeString & name,const UnicodeString & originalDef,const UnicodeString & expandedDef,const UnicodeSet * set)88 CharClass(const UnicodeString &name, const UnicodeString &originalDef, const UnicodeString &expandedDef, const UnicodeSet *set) : 89 fName(name), fOriginalDef(originalDef), fExpandedDef(expandedDef), fSet(set) {} 90 }; 91 92 93 // class BreakRule represents a single rule from a set of break rules. 94 // Each rule has the set definitions expanded, and 95 // is compiled to a regular expression. 96 97 class BreakRule: public UObject { 98 public: 99 BreakRule(); 100 ~BreakRule(); 101 UnicodeString fName; // Name of the rule. 102 UnicodeString fRule; // Rule expression, excluding the name, as written in user source. 103 UnicodeString fExpandedRule; // Rule expression after expanding the set definitions. 104 LocalPointer<RegexMatcher> fRuleMatcher; // Regular expression that matches the rule. 105 bool fInitialMatchOnly = false; // True if rule begins with '^', meaning no chaining. 106 }; 107 108 109 // class BreakRules represents a complete set of break rules, possibly tailored, 110 // compiled from testdata break rules. 111 112 class BreakRules: public UObject { 113 public: 114 BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status); 115 ~BreakRules(); 116 117 void compileRules(UCHARBUF *rules, UErrorCode &status); 118 119 const CharClass *getClassForChar(UChar32 c, int32_t *iter=NULL) const; 120 121 122 RBBIMonkeyImpl *fMonkeyImpl; // Pointer back to the owning MonkeyImpl instance. 123 icu::UVector fBreakRules; // Contents are of type (BreakRule *). 124 125 LocalUHashtablePointer fCharClasses; // Key is set name (UnicodeString). 126 // Value is (CharClass *) 127 LocalPointer<UVector> fCharClassList; // Char Classes, same contents as fCharClasses values, 128 // but in a vector so they can be accessed by index. 129 UnicodeSet fDictionarySet; // Dictionary set, empty if none is defined. 130 Locale fLocale; 131 UBreakIteratorType fType; 132 133 CharClass *addCharClass(const UnicodeString &name, const UnicodeString &def, UErrorCode &status); 134 void addRule(const UnicodeString &name, const UnicodeString &def, UErrorCode &status); 135 bool setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status); 136 RuleBasedBreakIterator *createICUBreakIterator(UErrorCode &status); 137 138 LocalPointer<RegexMatcher> fSetRefsMatcher; 139 LocalPointer<RegexMatcher> fCommentsMatcher; 140 LocalPointer<RegexMatcher> fClassDefMatcher; 141 LocalPointer<RegexMatcher> fRuleDefMatcher; 142 }; 143 144 145 // class MonkeyTestData represents a randomly synthesized test data string together 146 // with the expected break positions obtained by applying 147 // the test break rules. 148 149 class MonkeyTestData: public UObject { 150 public: MonkeyTestData()151 MonkeyTestData() {} ~MonkeyTestData()152 ~MonkeyTestData() {} 153 void set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status); 154 void clearActualBreaks(); 155 void dump(int32_t around = -1) const; 156 157 uint32_t fRandomSeed; // The initial seed value from the random number genererator. 158 const BreakRules *fBkRules; // The break rules used to generate this data. 159 UnicodeString fString; // The text. 160 UnicodeString fExpectedBreaks; // Breaks as found by the reference rules. 161 // Parallel to fString. Non-zero if break preceding. 162 UnicodeString fActualBreaks; // Breaks as found by ICU break iterator. 163 UnicodeString fRuleForPosition; // Index into BreakRules.fBreakRules of rule that applied at each position. 164 // Also parallel to fString. 165 UnicodeString f2ndRuleForPos; // As above. A 2nd rule applies when the preceding rule 166 // didn't cause a break, and a subsequent rule match starts 167 // on the last code point of the preceding match. 168 169 }; 170 171 172 173 174 // class RBBIMonkeyImpl holds (some indirectly) everything associated with running a monkey 175 // test for one set of break rules. 176 // 177 // When running RBBIMonkeyTest with multiple threads, there is a 1:1 correspondence 178 // between instances of RBBIMonkeyImpl and threads. 179 // 180 class RBBIMonkeyImpl: public UObject { 181 public: 182 RBBIMonkeyImpl(UErrorCode &status); 183 ~RBBIMonkeyImpl(); 184 185 void setup(const char *ruleFileName, UErrorCode &status); 186 187 void startTest(); 188 void runTest(); 189 void join(); 190 191 LocalUCHARBUFPointer fRuleCharBuffer; // source file contents of the reference rules. 192 LocalPointer<BreakRules> fRuleSet; 193 LocalPointer<RuleBasedBreakIterator> fBI; 194 LocalPointer<MonkeyTestData> fTestData; 195 IntlTest::icu_rand fRandomGenerator; 196 const char *fRuleFileName; 197 UBool fVerbose; // True to do long dump of failing data. 198 int32_t fLoopCount; 199 200 UBool fDumpExpansions; // Debug flag to output epananded form of rules and sets. 201 202 enum CheckDirection { 203 FORWARD = 1, 204 REVERSE = 2 205 }; 206 void clearActualBreaks(); 207 void testForwards(UErrorCode &status); 208 void testPrevious(UErrorCode &status); 209 void testFollowing(UErrorCode &status); 210 void testPreceding(UErrorCode &status); 211 void testIsBoundary(UErrorCode &status); 212 void testIsBoundaryRandom(UErrorCode &status); 213 void checkResults(const char *msg, CheckDirection dir, UErrorCode &status); 214 215 class RBBIMonkeyThread: public SimpleThread { 216 private: 217 RBBIMonkeyImpl *fMonkeyImpl; 218 public: RBBIMonkeyThread(RBBIMonkeyImpl * impl)219 RBBIMonkeyThread(RBBIMonkeyImpl *impl) : fMonkeyImpl(impl) {} run()220 void run() U_OVERRIDE { fMonkeyImpl->runTest(); } 221 }; 222 private: 223 void openBreakRules(const char *fileName, UErrorCode &status); 224 RBBIMonkeyThread fThread; 225 226 }; 227 228 #endif /* !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING */ 229 230 #endif // RBBIMONKEYTEST_H 231