• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*************************************************************************
4  * Copyright (c) 2016, International Business Machines
5  * Corporation and others. All Rights Reserved.
6  *************************************************************************
7 */
8 #ifndef RBBIMONKEYTEST_H
9 #define RBBIMONKEYTEST_H
10 
11 #include "unicode/utypes.h"
12 
13 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING
14 
15 #include "intltest.h"
16 
17 #include "unicode/rbbi.h"
18 #include "unicode/regex.h"
19 #include "unicode/uniset.h"
20 #include "unicode/unistr.h"
21 #include "unicode/uobject.h"
22 
23 #include "simplethread.h"
24 #include "ucbuf.h"
25 #include "uhash.h"
26 #include "uvector.h"
27 
28 //
29 //  TODO:
30 //     Develop a tailoring format.
31 //     Hook to old tests that use monkey impl to get expected data.
32 //     Remove old tests.
33 
34 class BreakRules;       // Forward declaration
35 class RBBIMonkeyImpl;
36 
37 /**
38  * Test the RuleBasedBreakIterator class giving different rules
39  */
40 class RBBIMonkeyTest: public IntlTest {
41   public:
42     RBBIMonkeyTest();
43     virtual ~RBBIMonkeyTest();
44 
45     void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
46     void testMonkey();
47 
48 
49   private:
50     const char *fParams;                  // Copy of user parameters passed in from IntlTest.
51 
52 
53     void testRules(const char *ruleFile);
54     static UBool getIntParam(UnicodeString name, UnicodeString &params, int64_t &val, UErrorCode &status);
55     static UBool getStringParam(UnicodeString name, UnicodeString &params, CharString &dest, UErrorCode &status);
56     static UBool getBoolParam(UnicodeString name, UnicodeString &params, UBool &dest, UErrorCode &status);
57 
58 };
59 
60 // The following classes are internal to the RBBI Monkey Test implementation.
61 
62 
63 
64 //  class CharClass    Represents a single character class from the source break rules.
65 //                     Inherits from UObject because instances are adopted by UHashtable, which ultimately
66 //                     deletes them using hash's object deleter function.
67 
68 class CharClass: public UObject {
69   public:
70     UnicodeString                fName;
71     UnicodeString                fOriginalDef;    // set definition as it appeared in user supplied rules.
72     UnicodeString                fExpandedDef;    // set definition with any embedded named sets replaced by their defs, recursively.
73     LocalPointer<const UnicodeSet>     fSet;
CharClass(const UnicodeString & name,const UnicodeString & originalDef,const UnicodeString & expandedDef,const UnicodeSet * set)74     CharClass(const UnicodeString &name, const UnicodeString &originalDef, const UnicodeString &expandedDef, const UnicodeSet *set) :
75             fName(name), fOriginalDef(originalDef), fExpandedDef(expandedDef), fSet(set) {}
76 };
77 
78 
79 // class BreakRule    represents a single rule from a set of break rules.
80 //                    Each rule has the set definitions expanded, and
81 //                    is compiled to a regular expression.
82 
83 class BreakRule: public UObject {
84   public:
85     BreakRule();
86     ~BreakRule();
87     UnicodeString    fName;                            // Name of the rule.
88     UnicodeString    fRule;                            // Rule expression, excluding the name, as written in user source.
89     UnicodeString    fExpandedRule;                    // Rule expression after expanding the set definitions.
90     LocalPointer<RegexMatcher>  fRuleMatcher;          // Regular expression that matches the rule.
91 };
92 
93 
94 // class BreakRules    represents a complete set of break rules, possibly tailored,
95 //                     compiled from testdata break rules.
96 
97 class BreakRules: public UObject {
98   public:
99     BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status);
100     ~BreakRules();
101 
102     void compileRules(UCHARBUF *rules, UErrorCode &status);
103 
104     const CharClass *getClassForChar(UChar32 c, int32_t *iter=NULL) const;
105 
106 
107     RBBIMonkeyImpl    *fMonkeyImpl;        // Pointer back to the owning MonkeyImpl instance.
108     icu::UVector       fBreakRules;        // Contents are of type (BreakRule *).
109 
110     LocalUHashtablePointer fCharClasses;   // Key is set name (UnicodeString).
111                                            // Value is (CharClass *)
112     LocalPointer<UVector>  fCharClassList; // Char Classes, same contents as fCharClasses values,
113                                            //   but in a vector so they can be accessed by index.
114     UnicodeSet         fDictionarySet;     // Dictionary set, empty if none is defined.
115     Locale             fLocale;
116     UBreakIteratorType fType;
117 
118     CharClass *addCharClass(const UnicodeString &name, const UnicodeString &def, UErrorCode &status);
119     void addRule(const UnicodeString &name, const UnicodeString &def, UErrorCode &status);
120     bool setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status);
121     RuleBasedBreakIterator *createICUBreakIterator(UErrorCode &status);
122 
123     LocalPointer<RegexMatcher> fSetRefsMatcher;
124     LocalPointer<RegexMatcher> fCommentsMatcher;
125     LocalPointer<RegexMatcher> fClassDefMatcher;
126     LocalPointer<RegexMatcher> fRuleDefMatcher;
127 };
128 
129 
130 // class MonkeyTestData    represents a randomly synthesized test data string together
131 //                         with the expected break positions obtained by applying
132 //                         the test break rules.
133 
134 class MonkeyTestData: public UObject {
135   public:
MonkeyTestData()136     MonkeyTestData() {};
~MonkeyTestData()137     ~MonkeyTestData() {};
138     void set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status);
139     void clearActualBreaks();
140     void dump(int32_t around = -1) const;
141 
142     uint32_t               fRandomSeed;        // The initial seed value from the random number genererator.
143     const BreakRules      *fBkRules;           // The break rules used to generate this data.
144     UnicodeString          fString;            // The text.
145     UnicodeString          fExpectedBreaks;    // Breaks as found by the reference rules.
146                                                //     Parallel to fString. Non-zero if break preceding.
147     UnicodeString          fActualBreaks;      // Breaks as found by ICU break iterator.
148     UnicodeString          fRuleForPosition;   // Index into BreakRules.fBreakRules of rule that applied at each position.
149                                                // Also parallel to fString.
150     UnicodeString          f2ndRuleForPos;     // As above. A 2nd rule applies when the preceding rule
151                                                //   didn't cause a break, and a subsequent rule match starts
152                                                //   on the last code point of the preceding match.
153 
154 };
155 
156 
157 
158 
159 // class RBBIMonkeyImpl     holds (some indirectly) everything associated with running a monkey
160 //                          test for one set of break rules.
161 //
162 //                          When running RBBIMonkeyTest with multiple threads, there is a 1:1 correspondence
163 //                          between instances of RBBIMonkeyImpl and threads.
164 //
165 class RBBIMonkeyImpl: public UObject {
166   public:
167     RBBIMonkeyImpl(UErrorCode &status);
168     ~RBBIMonkeyImpl();
169 
170     void setup(const char *ruleFileName, UErrorCode &status);
171 
172     void startTest();
173     void runTest();
174     void join();
175 
176     LocalUCHARBUFPointer                 fRuleCharBuffer;         // source file contents of the reference rules.
177     LocalPointer<BreakRules>             fRuleSet;
178     LocalPointer<RuleBasedBreakIterator> fBI;
179     LocalPointer<MonkeyTestData>         fTestData;
180     IntlTest::icu_rand                   fRandomGenerator;
181     const char                          *fRuleFileName;
182     UBool                                fVerbose;                 // True to do long dump of failing data.
183     int32_t                              fLoopCount;
184 
185     UBool                                fDumpExpansions;          // Debug flag to output epananded form of rules and sets.
186 
187     enum CheckDirection {
188         FORWARD = 1,
189         REVERSE = 2
190     };
191     void clearActualBreaks();
192     void testForwards(UErrorCode &status);
193     void testPrevious(UErrorCode &status);
194     void testFollowing(UErrorCode &status);
195     void testPreceding(UErrorCode &status);
196     void testIsBoundary(UErrorCode &status);
197     void checkResults(const char *msg, CheckDirection dir, UErrorCode &status);
198 
199     class RBBIMonkeyThread: public SimpleThread {
200       private:
201         RBBIMonkeyImpl *fMonkeyImpl;
202       public:
RBBIMonkeyThread(RBBIMonkeyImpl * impl)203         RBBIMonkeyThread(RBBIMonkeyImpl *impl) : fMonkeyImpl(impl) {};
run()204         void run() U_OVERRIDE { fMonkeyImpl->runTest(); };
205     };
206   private:
207     void openBreakRules(const char *fileName, UErrorCode &status);
208     RBBIMonkeyThread fThread;
209 
210 };
211 
212 #endif /* !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING */
213 
214 #endif  //  RBBIMONKEYTEST_H
215