1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /************************************************************************* 4 * Copyright (c) 2016, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 ************************************************************************* 7 */ 8 #ifndef RBBIMONKEYTEST_H 9 #define RBBIMONKEYTEST_H 10 11 #include "unicode/utypes.h" 12 13 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING 14 15 #include "intltest.h" 16 17 #include "unicode/rbbi.h" 18 #include "unicode/regex.h" 19 #include "unicode/uniset.h" 20 #include "unicode/unistr.h" 21 #include "unicode/uobject.h" 22 23 #include "simplethread.h" 24 #include "ucbuf.h" 25 #include "uhash.h" 26 #include "uvector.h" 27 28 // 29 // TODO: 30 // Develop a tailoring format. 31 // Hook to old tests that use monkey impl to get expected data. 32 // Remove old tests. 33 34 class BreakRules; // Forward declaration 35 class RBBIMonkeyImpl; 36 37 /** 38 * Test the RuleBasedBreakIterator class giving different rules 39 */ 40 class RBBIMonkeyTest: public IntlTest { 41 public: 42 RBBIMonkeyTest(); 43 virtual ~RBBIMonkeyTest(); 44 45 void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL ); 46 void testMonkey(); 47 48 49 private: 50 const char *fParams; // Copy of user parameters passed in from IntlTest. 51 52 53 void testRules(const char *ruleFile); 54 static UBool getIntParam(UnicodeString name, UnicodeString ¶ms, int64_t &val, UErrorCode &status); 55 static UBool getStringParam(UnicodeString name, UnicodeString ¶ms, CharString &dest, UErrorCode &status); 56 static UBool getBoolParam(UnicodeString name, UnicodeString ¶ms, UBool &dest, UErrorCode &status); 57 58 }; 59 60 // The following classes are internal to the RBBI Monkey Test implementation. 61 62 63 64 // class CharClass Represents a single character class from the source break rules. 65 // Inherits from UObject because instances are adopted by UHashtable, which ultimately 66 // deletes them using hash's object deleter function. 67 68 class CharClass: public UObject { 69 public: 70 UnicodeString fName; 71 UnicodeString fOriginalDef; // set definition as it appeared in user supplied rules. 72 UnicodeString fExpandedDef; // set definition with any embedded named sets replaced by their defs, recursively. 73 LocalPointer<const UnicodeSet> fSet; CharClass(const UnicodeString & name,const UnicodeString & originalDef,const UnicodeString & expandedDef,const UnicodeSet * set)74 CharClass(const UnicodeString &name, const UnicodeString &originalDef, const UnicodeString &expandedDef, const UnicodeSet *set) : 75 fName(name), fOriginalDef(originalDef), fExpandedDef(expandedDef), fSet(set) {} 76 }; 77 78 79 // class BreakRule represents a single rule from a set of break rules. 80 // Each rule has the set definitions expanded, and 81 // is compiled to a regular expression. 82 83 class BreakRule: public UObject { 84 public: 85 BreakRule(); 86 ~BreakRule(); 87 UnicodeString fName; // Name of the rule. 88 UnicodeString fRule; // Rule expression, excluding the name, as written in user source. 89 UnicodeString fExpandedRule; // Rule expression after expanding the set definitions. 90 LocalPointer<RegexMatcher> fRuleMatcher; // Regular expression that matches the rule. 91 }; 92 93 94 // class BreakRules represents a complete set of break rules, possibly tailored, 95 // compiled from testdata break rules. 96 97 class BreakRules: public UObject { 98 public: 99 BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status); 100 ~BreakRules(); 101 102 void compileRules(UCHARBUF *rules, UErrorCode &status); 103 104 const CharClass *getClassForChar(UChar32 c, int32_t *iter=NULL) const; 105 106 107 RBBIMonkeyImpl *fMonkeyImpl; // Pointer back to the owning MonkeyImpl instance. 108 icu::UVector fBreakRules; // Contents are of type (BreakRule *). 109 110 LocalUHashtablePointer fCharClasses; // Key is set name (UnicodeString). 111 // Value is (CharClass *) 112 LocalPointer<UVector> fCharClassList; // Char Classes, same contents as fCharClasses values, 113 // but in a vector so they can be accessed by index. 114 UnicodeSet fDictionarySet; // Dictionary set, empty if none is defined. 115 Locale fLocale; 116 UBreakIteratorType fType; 117 118 CharClass *addCharClass(const UnicodeString &name, const UnicodeString &def, UErrorCode &status); 119 void addRule(const UnicodeString &name, const UnicodeString &def, UErrorCode &status); 120 bool setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status); 121 RuleBasedBreakIterator *createICUBreakIterator(UErrorCode &status); 122 123 LocalPointer<RegexMatcher> fSetRefsMatcher; 124 LocalPointer<RegexMatcher> fCommentsMatcher; 125 LocalPointer<RegexMatcher> fClassDefMatcher; 126 LocalPointer<RegexMatcher> fRuleDefMatcher; 127 }; 128 129 130 // class MonkeyTestData represents a randomly synthesized test data string together 131 // with the expected break positions obtained by applying 132 // the test break rules. 133 134 class MonkeyTestData: public UObject { 135 public: MonkeyTestData()136 MonkeyTestData() {}; ~MonkeyTestData()137 ~MonkeyTestData() {}; 138 void set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status); 139 void clearActualBreaks(); 140 void dump(int32_t around = -1) const; 141 142 uint32_t fRandomSeed; // The initial seed value from the random number genererator. 143 const BreakRules *fBkRules; // The break rules used to generate this data. 144 UnicodeString fString; // The text. 145 UnicodeString fExpectedBreaks; // Breaks as found by the reference rules. 146 // Parallel to fString. Non-zero if break preceding. 147 UnicodeString fActualBreaks; // Breaks as found by ICU break iterator. 148 UnicodeString fRuleForPosition; // Index into BreakRules.fBreakRules of rule that applied at each position. 149 // Also parallel to fString. 150 UnicodeString f2ndRuleForPos; // As above. A 2nd rule applies when the preceding rule 151 // didn't cause a break, and a subsequent rule match starts 152 // on the last code point of the preceding match. 153 154 }; 155 156 157 158 159 // class RBBIMonkeyImpl holds (some indirectly) everything associated with running a monkey 160 // test for one set of break rules. 161 // 162 // When running RBBIMonkeyTest with multiple threads, there is a 1:1 correspondence 163 // between instances of RBBIMonkeyImpl and threads. 164 // 165 class RBBIMonkeyImpl: public UObject { 166 public: 167 RBBIMonkeyImpl(UErrorCode &status); 168 ~RBBIMonkeyImpl(); 169 170 void setup(const char *ruleFileName, UErrorCode &status); 171 172 void startTest(); 173 void runTest(); 174 void join(); 175 176 LocalUCHARBUFPointer fRuleCharBuffer; // source file contents of the reference rules. 177 LocalPointer<BreakRules> fRuleSet; 178 LocalPointer<RuleBasedBreakIterator> fBI; 179 LocalPointer<MonkeyTestData> fTestData; 180 IntlTest::icu_rand fRandomGenerator; 181 const char *fRuleFileName; 182 UBool fVerbose; // True to do long dump of failing data. 183 int32_t fLoopCount; 184 185 UBool fDumpExpansions; // Debug flag to output epananded form of rules and sets. 186 187 enum CheckDirection { 188 FORWARD = 1, 189 REVERSE = 2 190 }; 191 void clearActualBreaks(); 192 void testForwards(UErrorCode &status); 193 void testPrevious(UErrorCode &status); 194 void testFollowing(UErrorCode &status); 195 void testPreceding(UErrorCode &status); 196 void testIsBoundary(UErrorCode &status); 197 void checkResults(const char *msg, CheckDirection dir, UErrorCode &status); 198 199 class RBBIMonkeyThread: public SimpleThread { 200 private: 201 RBBIMonkeyImpl *fMonkeyImpl; 202 public: RBBIMonkeyThread(RBBIMonkeyImpl * impl)203 RBBIMonkeyThread(RBBIMonkeyImpl *impl) : fMonkeyImpl(impl) {}; run()204 void run() U_OVERRIDE { fMonkeyImpl->runTest(); }; 205 }; 206 private: 207 void openBreakRules(const char *fileName, UErrorCode &status); 208 RBBIMonkeyThread fThread; 209 210 }; 211 212 #endif /* !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING */ 213 214 #endif // RBBIMONKEYTEST_H 215