1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * Copyright (c) 2016, International Business Machines Corporation and
5 * others. All Rights Reserved.
6 ********************************************************************/
7
8
9 #include "unicode/utypes.h"
10
11 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING
12
13 #include "rbbimonkeytest.h"
14 #include "unicode/utypes.h"
15 #include "unicode/brkiter.h"
16 #include "unicode/utf16.h"
17 #include "unicode/uniset.h"
18 #include "unicode/unistr.h"
19
20 #include "charstr.h"
21 #include "cmemory.h"
22 #include "cstr.h"
23 #include "uelement.h"
24 #include "uhash.h"
25
26 #include <iostream>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string>
30
31 using namespace icu;
32
33
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)34 void RBBIMonkeyTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* params) {
35 fParams = params; // Work around TESTCASE_AUTO not being able to pass params to test function.
36
37 TESTCASE_AUTO_BEGIN;
38 TESTCASE_AUTO(testMonkey);
39 TESTCASE_AUTO_END;
40 }
41
42 //---------------------------------------------------------------------------------------
43 //
44 // class BreakRule implementation.
45 //
46 //---------------------------------------------------------------------------------------
47
BreakRule()48 BreakRule::BreakRule() // : all field default initialized.
49 {
50 }
51
~BreakRule()52 BreakRule::~BreakRule() {}
53
54
55 //---------------------------------------------------------------------------------------
56 //
57 // class BreakRules implementation.
58 //
59 //---------------------------------------------------------------------------------------
BreakRules(RBBIMonkeyImpl * monkeyImpl,UErrorCode & status)60 BreakRules::BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status) :
61 fMonkeyImpl(monkeyImpl), fBreakRules(status), fType(UBRK_COUNT) {
62 fCharClasses.adoptInstead(uhash_open(uhash_hashUnicodeString,
63 uhash_compareUnicodeString,
64 NULL, // value comparator.
65 &status));
66 if (U_FAILURE(status)) {
67 return;
68 }
69 uhash_setKeyDeleter(fCharClasses.getAlias(), uprv_deleteUObject);
70 uhash_setValueDeleter(fCharClasses.getAlias(), uprv_deleteUObject);
71 fBreakRules.setDeleter(uprv_deleteUObject);
72
73 fCharClassList.adoptInstead(new UVector(status));
74
75 fSetRefsMatcher.adoptInstead(new RegexMatcher(UnicodeString(
76 "(?!(?:\\{|=|\\[:)[ \\t]{0,4})" // Negative look behind for '{' or '=' or '[:'
77 // (the identifier is a unicode property name or value)
78 "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)"), // The char class name
79 0, status));
80
81 // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules.
82 fCommentsMatcher.adoptInstead(new RegexMatcher(UnicodeString(
83 "(^|(?<=;))" // Start either at start of line, or just after a ';' (look-behind for ';')
84 "[ \\t]*+" // Match white space.
85 "(#.*)?+" // Optional # plus whatever follows
86 "\\R$" // new-line at end of line.
87 ), 0, status));
88
89 // Match (initial parse) of a character class definition line.
90 fClassDefMatcher.adoptInstead(new RegexMatcher(UnicodeString(
91 "[ \\t]*" // leading white space
92 "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)" // The char class name
93 "[ \\t]*=[ \\t]*" // =
94 "(?<ClassDef>.*?)" // The char class UnicodeSet expression
95 "[ \\t]*;$"), // ; <end of line>
96 0, status));
97
98 // Match (initial parse) of a break rule line.
99 fRuleDefMatcher.adoptInstead(new RegexMatcher(UnicodeString(
100 "[ \\t]*" // leading white space
101 "(?<RuleName>[A-Za-z_][A-Za-z0-9_.]*)" // The rule name
102 "[ \\t]*:[ \\t]*" // :
103 "(?<RuleDef>.*?)" // The rule definition
104 "[ \\t]*;$"), // ; <end of line>
105 0, status));
106
107 }
108
109
~BreakRules()110 BreakRules::~BreakRules() {}
111
112
addCharClass(const UnicodeString & name,const UnicodeString & definition,UErrorCode & status)113 CharClass *BreakRules::addCharClass(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) {
114
115 // Create the expanded definition for this char class,
116 // replacing any set references with the corresponding definition.
117
118 UnicodeString expandedDef;
119 UnicodeString emptyString;
120 fSetRefsMatcher->reset(definition);
121 while (fSetRefsMatcher->find() && U_SUCCESS(status)) {
122 const UnicodeString name =
123 fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status);
124 CharClass *nameClass = static_cast<CharClass *>(uhash_get(fCharClasses.getAlias(), &name));
125 const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : name;
126
127 fSetRefsMatcher->appendReplacement(expandedDef, emptyString, status);
128 expandedDef.append(expansionForName);
129 }
130 fSetRefsMatcher->appendTail(expandedDef);
131
132 // Verify that the expanded set definition is valid.
133
134 if (fMonkeyImpl->fDumpExpansions) {
135 printf("epandedDef: %s\n", CStr(expandedDef)());
136 }
137
138 LocalPointer<UnicodeSet> s(new UnicodeSet(expandedDef, USET_IGNORE_SPACE, NULL, status), status);
139 if (U_FAILURE(status)) {
140 IntlTest::gTest->errln("%s:%d: error %s creating UnicodeSet %s\n Expanded set definition: %s",
141 __FILE__, __LINE__, u_errorName(status), CStr(name)(), CStr(expandedDef)());
142 return nullptr;
143 }
144 CharClass *cclass = new CharClass(name, definition, expandedDef, s.orphan());
145 CharClass *previousClass = static_cast<CharClass *>(uhash_put(fCharClasses.getAlias(),
146 new UnicodeString(name), // Key, owned by hash table.
147 cclass, // Value, owned by hash table.
148 &status));
149
150 if (previousClass != NULL) {
151 // Duplicate class def.
152 // These are legitimate, they are adjustments of an existing class.
153 // TODO: will need to keep the old around when we handle tailorings.
154 IntlTest::gTest->logln("Redefinition of character class %s\n", CStr(cclass->fName)());
155 delete previousClass;
156 }
157 return cclass;
158 }
159
160
addRule(const UnicodeString & name,const UnicodeString & definition,UErrorCode & status)161 void BreakRules::addRule(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) {
162 LocalPointer<BreakRule> thisRule(new BreakRule);
163 thisRule->fName = name;
164 thisRule->fRule = definition;
165
166 // If the rule name contains embedded digits, pad the first numeric field to a fixed length with leading zeroes,
167 // This gives a numeric sort order that matches Unicode UAX rule numbering conventions.
168 UnicodeString emptyString;
169
170 // Expand the char class definitions within the rule.
171 fSetRefsMatcher->reset(definition);
172 while (fSetRefsMatcher->find() && U_SUCCESS(status)) {
173 const UnicodeString name =
174 fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status);
175 CharClass *nameClass = static_cast<CharClass *>(uhash_get(fCharClasses.getAlias(), &name));
176 if (!nameClass) {
177 IntlTest::gTest->errln("%s:%d char class \"%s\" unrecognized in rule \"%s\"",
178 __FILE__, __LINE__, CStr(name)(), CStr(definition)());
179 }
180 const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : name;
181
182 fSetRefsMatcher->appendReplacement(thisRule->fExpandedRule, emptyString, status);
183 thisRule->fExpandedRule.append(expansionForName);
184 }
185 fSetRefsMatcher->appendTail(thisRule->fExpandedRule);
186
187 // If rule begins with a '^' rule chaining is disallowed.
188 // Strip off the '^' from the rule expression, and set the flag.
189 if (thisRule->fExpandedRule.charAt(0) == u'^') {
190 thisRule->fInitialMatchOnly = true;
191 thisRule->fExpandedRule.remove(0, 1);
192 thisRule->fExpandedRule.trim();
193 }
194
195 // Replace the divide sign (\u00f7) with a regular expression named capture.
196 // When running the rules, a match that includes this group means we found a break position.
197
198 int32_t dividePos = thisRule->fExpandedRule.indexOf((UChar)0x00f7);
199 if (dividePos >= 0) {
200 thisRule->fExpandedRule.replace(dividePos, 1, UnicodeString("(?<BreakPosition>)"));
201 }
202 if (thisRule->fExpandedRule.indexOf((UChar)0x00f7) != -1) {
203 status = U_ILLEGAL_ARGUMENT_ERROR; // TODO: produce a good error message.
204 }
205
206 // UAX break rule set definitions can be empty, just [].
207 // Regular expression set expressions don't accept this. Substitute with [^\u0000-\U0010ffff], which
208 // also matches nothing.
209
210 static const UChar emptySet[] = {(UChar)0x5b, (UChar)0x5d, 0};
211 int32_t where = 0;
212 while ((where = thisRule->fExpandedRule.indexOf(emptySet, 2, 0)) >= 0) {
213 thisRule->fExpandedRule.replace(where, 2, UnicodeString("[^\\u0000-\\U0010ffff]"));
214 }
215 if (fMonkeyImpl->fDumpExpansions) {
216 printf("fExpandedRule: %s\n", CStr(thisRule->fExpandedRule)());
217 }
218
219 // Compile a regular expression for this rule.
220 thisRule->fRuleMatcher.adoptInstead(new RegexMatcher(thisRule->fExpandedRule, UREGEX_COMMENTS | UREGEX_DOTALL, status));
221 if (U_FAILURE(status)) {
222 IntlTest::gTest->errln("%s:%d Error creating regular expression for %s",
223 __FILE__, __LINE__, CStr(thisRule->fExpandedRule)());
224 return;
225 }
226
227 // Put this new rule into the vector of all Rules.
228 fBreakRules.addElement(thisRule.orphan(), status);
229 }
230
231
setKeywordParameter(const UnicodeString & keyword,const UnicodeString & value,UErrorCode & status)232 bool BreakRules::setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status) {
233 if (keyword == UnicodeString("locale")) {
234 CharString localeName;
235 localeName.append(CStr(value)(), -1, status);
236 fLocale = Locale::createFromName(localeName.data());
237 return true;
238 }
239 if (keyword == UnicodeString("type")) {
240 if (value == UnicodeString("grapheme")) {
241 fType = UBRK_CHARACTER;
242 } else if (value == UnicodeString("word")) {
243 fType = UBRK_WORD;
244 } else if (value == UnicodeString("line")) {
245 fType = UBRK_LINE;
246 } else if (value == UnicodeString("sentence")) {
247 fType = UBRK_SENTENCE;
248 } else {
249 IntlTest::gTest->errln("%s:%d Unrecognized break type %s", __FILE__, __LINE__, CStr(value)());
250 }
251 return true;
252 }
253 // TODO: add tailoring base setting here.
254 return false;
255 }
256
createICUBreakIterator(UErrorCode & status)257 RuleBasedBreakIterator *BreakRules::createICUBreakIterator(UErrorCode &status) {
258 if (U_FAILURE(status)) {
259 return NULL;
260 }
261 RuleBasedBreakIterator *bi = NULL;
262 switch(fType) {
263 case UBRK_CHARACTER:
264 bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createCharacterInstance(fLocale, status));
265 break;
266 case UBRK_WORD:
267 bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createWordInstance(fLocale, status));
268 break;
269 case UBRK_LINE:
270 bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createLineInstance(fLocale, status));
271 break;
272 case UBRK_SENTENCE:
273 bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createSentenceInstance(fLocale, status));
274 break;
275 default:
276 IntlTest::gTest->errln("%s:%d Bad break iterator type of %d", __FILE__, __LINE__, fType);
277 status = U_ILLEGAL_ARGUMENT_ERROR;
278 }
279 return bi;
280 }
281
282
compileRules(UCHARBUF * rules,UErrorCode & status)283 void BreakRules::compileRules(UCHARBUF *rules, UErrorCode &status) {
284 if (U_FAILURE(status)) {
285 return;
286 }
287
288 UnicodeString emptyString;
289 for (int32_t lineNumber=0; ;lineNumber++) { // Loop once per input line.
290 if (U_FAILURE(status)) {
291 return;
292 }
293 int32_t lineLength = 0;
294 const UChar *lineBuf = ucbuf_readline(rules, &lineLength, &status);
295 if (lineBuf == NULL) {
296 break;
297 }
298 UnicodeString line(lineBuf, lineLength);
299
300 // Strip comment lines.
301 fCommentsMatcher->reset(line);
302 line = fCommentsMatcher->replaceFirst(emptyString, status);
303 if (line.isEmpty()) {
304 continue;
305 }
306
307 // Recognize character class definition and keyword lines
308 fClassDefMatcher->reset(line);
309 if (fClassDefMatcher->matches(status)) {
310 UnicodeString className = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassName", status), status);
311 UnicodeString classDef = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassDef", status), status);
312 if (fMonkeyImpl->fDumpExpansions) {
313 printf("scanned class: %s = %s\n", CStr(className)(), CStr(classDef)());
314 }
315 if (setKeywordParameter(className, classDef, status)) {
316 // The scanned item was "type = ..." or "locale = ...", etc.
317 // which are not actual character classes.
318 continue;
319 }
320 addCharClass(className, classDef, status);
321 continue;
322 }
323
324 // Recognize rule lines.
325 fRuleDefMatcher->reset(line);
326 if (fRuleDefMatcher->matches(status)) {
327 UnicodeString ruleName = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleName", status), status);
328 UnicodeString ruleDef = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleDef", status), status);
329 if (fMonkeyImpl->fDumpExpansions) {
330 printf("scanned rule: %s : %s\n", CStr(ruleName)(), CStr(ruleDef)());
331 }
332 addRule(ruleName, ruleDef, status);
333 continue;
334 }
335
336 IntlTest::gTest->errln("%s:%d: Unrecognized line in rule file %s: \"%s\"\n",
337 __FILE__, __LINE__, fMonkeyImpl->fRuleFileName, CStr(line)());
338 }
339
340 // Build the vector of char classes, omitting the dictionary class if there is one.
341 // This will be used when constructing the random text to be tested.
342
343 // Also compute the "other" set, consisting of any characters not included in
344 // one or more of the user defined sets.
345
346 UnicodeSet otherSet((UChar32)0, 0x10ffff);
347 int32_t pos = UHASH_FIRST;
348 const UHashElement *el = NULL;
349 while ((el = uhash_nextElement(fCharClasses.getAlias(), &pos)) != NULL) {
350 const UnicodeString *ccName = static_cast<const UnicodeString *>(el->key.pointer);
351 CharClass *cclass = static_cast<CharClass *>(el->value.pointer);
352 // printf(" Adding %s\n", CStr(*ccName)());
353 if (*ccName != cclass->fName) {
354 IntlTest::gTest->errln("%s:%d: internal error, set names (%s, %s) inconsistent.\n",
355 __FILE__, __LINE__, CStr(*ccName)(), CStr(cclass->fName)());
356 }
357 const UnicodeSet *set = cclass->fSet.getAlias();
358 otherSet.removeAll(*set);
359 if (*ccName == UnicodeString("dictionary")) {
360 fDictionarySet = *set;
361 } else {
362 fCharClassList->addElement(cclass, status);
363 }
364 }
365
366 if (!otherSet.isEmpty()) {
367 // fprintf(stderr, "have an other set.\n");
368 UnicodeString pattern;
369 CharClass *cclass = addCharClass(UnicodeString("__Others"), otherSet.toPattern(pattern), status);
370 fCharClassList->addElement(cclass, status);
371 }
372 }
373
374
getClassForChar(UChar32 c,int32_t * iter) const375 const CharClass *BreakRules::getClassForChar(UChar32 c, int32_t *iter) const {
376 int32_t localIter = 0;
377 int32_t &it = iter? *iter : localIter;
378
379 while (it < fCharClassList->size()) {
380 const CharClass *cc = static_cast<const CharClass *>(fCharClassList->elementAt(it));
381 ++it;
382 if (cc->fSet->contains(c)) {
383 return cc;
384 }
385 }
386 return NULL;
387 }
388
389 //---------------------------------------------------------------------------------------
390 //
391 // class MonkeyTestData implementation.
392 //
393 //---------------------------------------------------------------------------------------
394
set(BreakRules * rules,IntlTest::icu_rand & rand,UErrorCode & status)395 void MonkeyTestData::set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status) {
396 const int32_t dataLength = 1000;
397
398 // Fill the test string with random characters.
399 // First randomly pick a char class, then randomly pick a character from that class.
400 // Exclude any characters from the dictionary set.
401
402 // std::cout << "Populating Test Data" << std::endl;
403 fRandomSeed = rand.getSeed(); // Save initial seed for use in error messages,
404 // allowing recreation of failing data.
405 fBkRules = rules;
406 fString.remove();
407 for (int32_t n=0; n<dataLength;) {
408 int charClassIndex = rand() % rules->fCharClassList->size();
409 const CharClass *cclass = static_cast<CharClass *>(rules->fCharClassList->elementAt(charClassIndex));
410 if (cclass->fSet->size() == 0) {
411 // Some rules or tailorings do end up with empty char classes.
412 continue;
413 }
414 int32_t charIndex = rand() % cclass->fSet->size();
415 UChar32 c = cclass->fSet->charAt(charIndex);
416 if (U16_IS_TRAIL(c) && fString.length() > 0 && U16_IS_LEAD(fString.charAt(fString.length()-1))) {
417 // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control.
418 // Don't let random unpaired surrogates combine in the test data because they might
419 // produce an unwanted dictionary character.
420 continue;
421 }
422
423 if (!rules->fDictionarySet.contains(c)) {
424 fString.append(c);
425 ++n;
426 }
427 }
428
429 // Reset each rule matcher regex with this new string.
430 // (Although we are always using the same string object, ICU regular expressions
431 // don't like the underlying string data changing without doing a reset).
432
433 for (int32_t ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) {
434 BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum));
435 rule->fRuleMatcher->reset(fString);
436 }
437
438 // Init the expectedBreaks, actualBreaks and ruleForPosition strings (used as arrays).
439 // Expected and Actual breaks are one longer than the input string; a non-zero value
440 // will indicate a boundary preceding that position.
441
442 clearActualBreaks();
443 fExpectedBreaks = fActualBreaks;
444 fRuleForPosition = fActualBreaks;
445 f2ndRuleForPos = fActualBreaks;
446
447 // Apply reference rules to find the expected breaks.
448
449 fExpectedBreaks.setCharAt(0, (UChar)1); // Force an expected break before the start of the text.
450 // ICU always reports a break there.
451 // The reference rules do not have a means to do so.
452 int32_t strIdx = 0;
453 bool initialMatch = true; // True at start of text, and immediately after each boundary,
454 // for control over rule chaining.
455 while (strIdx < fString.length()) {
456 BreakRule *matchingRule = NULL;
457 UBool hasBreak = FALSE;
458 int32_t ruleNum = 0;
459 int32_t matchStart = 0;
460 int32_t matchEnd = 0;
461 int32_t breakGroup = 0;
462 for (ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) {
463 BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum));
464 if (rule->fInitialMatchOnly && !initialMatch) {
465 // Skip checking this '^' rule. (No rule chaining)
466 continue;
467 }
468 rule->fRuleMatcher->reset();
469 if (rule->fRuleMatcher->lookingAt(strIdx, status)) {
470 // A candidate rule match, check further to see if we take it or continue to check other rules.
471 // Matches of zero or one codepoint count only if they also specify a break.
472 matchStart = rule->fRuleMatcher->start(status);
473 matchEnd = rule->fRuleMatcher->end(status);
474 breakGroup = rule->fRuleMatcher->pattern().groupNumberFromName("BreakPosition", status);
475 hasBreak = U_SUCCESS(status);
476 if (status == U_REGEX_INVALID_CAPTURE_GROUP_NAME) {
477 status = U_ZERO_ERROR;
478 }
479 if (hasBreak || fString.moveIndex32(matchStart, 1) < matchEnd) {
480 matchingRule = rule;
481 break;
482 }
483 }
484 }
485 if (matchingRule == NULL) {
486 // No reference rule matched. This is an error in the rules that should never happen.
487 IntlTest::gTest->errln("%s:%d Trouble with monkey test reference rules at position %d. ",
488 __FILE__, __LINE__, strIdx);
489 dump(strIdx);
490 status = U_INVALID_FORMAT_ERROR;
491 return;
492 }
493 if (matchingRule->fRuleMatcher->group(status).length() == 0) {
494 // Zero length rule match. This is also an error in the rule expressions.
495 IntlTest::gTest->errln("%s:%d Zero length rule match.",
496 __FILE__, __LINE__);
497 status = U_INVALID_FORMAT_ERROR;
498 return;
499 }
500
501 // Record which rule matched over the length of the match.
502 for (int i = matchStart; i < matchEnd; i++) {
503 if (fRuleForPosition.charAt(i) == 0) {
504 fRuleForPosition.setCharAt(i, (UChar)ruleNum);
505 } else {
506 f2ndRuleForPos.setCharAt(i, (UChar)ruleNum);
507 }
508 }
509
510 // Break positions appear in rules as a matching named capture of zero length at the break position,
511 // the adjusted pattern contains (?<BreakPosition>)
512 if (hasBreak) {
513 int32_t breakPos = matchingRule->fRuleMatcher->start(breakGroup, status);
514 if (U_FAILURE(status) || breakPos < 0) {
515 // Rule specified a break, but that break wasn't part of the match, even
516 // though the rule as a whole matched.
517 // Can't happen with regular expressions derived from (equivalent to) ICU break rules.
518 // Shouldn't get here.
519 IntlTest::gTest->errln("%s:%d Internal Rule Error.", __FILE__, __LINE__);
520 status = U_INVALID_FORMAT_ERROR;
521 break;
522 }
523 fExpectedBreaks.setCharAt(breakPos, (UChar)1);
524 // printf("recording break at %d\n", breakPos);
525 // For the next iteration, pick up applying rules immediately after the break,
526 // which may differ from end of the match. The matching rule may have included
527 // context following the boundary that needs to be looked at again.
528 strIdx = matchingRule->fRuleMatcher->end(breakGroup, status);
529 initialMatch = true;
530 } else {
531 // Original rule didn't specify a break.
532 // Continue applying rules starting on the last code point of this match.
533 strIdx = fString.moveIndex32(matchEnd, -1);
534 initialMatch = false;
535 if (strIdx == matchStart) {
536 // Match was only one code point, no progress if we continue.
537 // Shouldn't get here, case is filtered out at top of loop.
538 CharString ruleName;
539 ruleName.appendInvariantChars(matchingRule->fName, status);
540 IntlTest::gTest->errln("%s:%d Rule %s internal error",
541 __FILE__, __LINE__, ruleName.data());
542 status = U_INVALID_FORMAT_ERROR;
543 break;
544 }
545 }
546 if (U_FAILURE(status)) {
547 IntlTest::gTest->errln("%s:%d status = %s. Unexpected failure, perhaps problem internal to test.",
548 __FILE__, __LINE__, u_errorName(status));
549 break;
550 }
551 }
552 }
553
clearActualBreaks()554 void MonkeyTestData::clearActualBreaks() {
555 fActualBreaks.remove();
556 // Actual Breaks length is one longer than the data string length, allowing
557 // for breaks before the first and after the last character in the data.
558 for (int32_t i=0; i<=fString.length(); i++) {
559 fActualBreaks.append((UChar)0);
560 }
561 }
562
dump(int32_t around) const563 void MonkeyTestData::dump(int32_t around) const {
564 printf("\n"
565 " char break Rule Character\n"
566 " pos code class R I name name\n"
567 "---------------------------------------------------------------------------------------------\n");
568
569 int32_t start;
570 int32_t end;
571
572 if (around == -1) {
573 start = 0;
574 end = fString.length();
575 } else {
576 // Display context around a failure.
577 start = fString.moveIndex32(around, -30);
578 end = fString.moveIndex32(around, +30);
579 }
580
581 for (int charIdx = start; charIdx < end; charIdx=fString.moveIndex32(charIdx, 1)) {
582 UErrorCode status = U_ZERO_ERROR;
583 UChar32 c = fString.char32At(charIdx);
584 const CharClass *cc = fBkRules->getClassForChar(c);
585 CharString ccName;
586 ccName.appendInvariantChars(cc->fName, status);
587 CharString ruleName, secondRuleName;
588 const BreakRule *rule = static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(fRuleForPosition.charAt(charIdx)));
589 ruleName.appendInvariantChars(rule->fName, status);
590 if (f2ndRuleForPos.charAt(charIdx) > 0) {
591 const BreakRule *secondRule = static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(f2ndRuleForPos.charAt(charIdx)));
592 secondRuleName.appendInvariantChars(secondRule->fName, status);
593 }
594 char cName[200];
595 u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
596
597 printf(" %4.1d %6.4x %-20s %c %c %-10s %-10s %s\n",
598 charIdx, c, ccName.data(),
599 fExpectedBreaks.charAt(charIdx) ? '*' : '.',
600 fActualBreaks.charAt(charIdx) ? '*' : '.',
601 ruleName.data(), secondRuleName.data(), cName
602 );
603 }
604 }
605
606
607 //---------------------------------------------------------------------------------------
608 //
609 // class RBBIMonkeyImpl
610 //
611 //---------------------------------------------------------------------------------------
612
RBBIMonkeyImpl(UErrorCode & status)613 RBBIMonkeyImpl::RBBIMonkeyImpl(UErrorCode &status) : fDumpExpansions(FALSE), fThread(this) {
614 (void)status; // suppress unused parameter compiler warning.
615 }
616
617
618 // RBBIMonkeyImpl setup does all of the setup for a single rule set - compiling the
619 // reference rules and creating the icu breakiterator to test,
620 // with its type and locale coming from the reference rules.
621
setup(const char * ruleFile,UErrorCode & status)622 void RBBIMonkeyImpl::setup(const char *ruleFile, UErrorCode &status) {
623 fRuleFileName = ruleFile;
624 openBreakRules(ruleFile, status);
625 if (U_FAILURE(status)) {
626 IntlTest::gTest->errln("%s:%d Error %s opening file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile);
627 return;
628 }
629 fRuleSet.adoptInstead(new BreakRules(this, status));
630 fRuleSet->compileRules(fRuleCharBuffer.getAlias(), status);
631 if (U_FAILURE(status)) {
632 IntlTest::gTest->errln("%s:%d Error %s processing file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile);
633 return;
634 }
635 fBI.adoptInstead(fRuleSet->createICUBreakIterator(status));
636 fTestData.adoptInstead(new MonkeyTestData());
637 }
638
639
~RBBIMonkeyImpl()640 RBBIMonkeyImpl::~RBBIMonkeyImpl() {
641 }
642
643
openBreakRules(const char * fileName,UErrorCode & status)644 void RBBIMonkeyImpl::openBreakRules(const char *fileName, UErrorCode &status) {
645 CharString path;
646 path.append(IntlTest::getSourceTestData(status), status);
647 path.append("break_rules" U_FILE_SEP_STRING, status);
648 path.appendPathPart(fileName, status);
649 const char *codePage = "UTF-8";
650 fRuleCharBuffer.adoptInstead(ucbuf_open(path.data(), &codePage, TRUE, FALSE, &status));
651 }
652
653
startTest()654 void RBBIMonkeyImpl::startTest() {
655 fThread.start(); // invokes runTest() in a separate thread.
656 }
657
join()658 void RBBIMonkeyImpl::join() {
659 fThread.join();
660 }
661
662
// Report a monkey test failure at `index`, along with the parameters needed to reproduce it.
// When running verbose, the test data around the failure is dumped as well.
// Sets status to U_INVALID_STATE_ERROR.
// The macro expands in place; it relies on fRuleFileName, fTestData, fVerbose and
// a variable named status being in scope where it is used.
#define MONKEY_ERROR(msg, index) UPRV_BLOCK_MACRO_BEGIN {                                                        \
    IntlTest::gTest->errln("%s:%d %s at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",   \
                    __FILE__, __LINE__, msg, index, fRuleFileName, fTestData->fRandomSeed);                      \
    if (fVerbose) { fTestData->dump(index); }                                                                    \
    status = U_INVALID_STATE_ERROR;                                                                              \
} UPRV_BLOCK_MACRO_END
669
runTest()670 void RBBIMonkeyImpl::runTest() {
671 UErrorCode status = U_ZERO_ERROR;
672 int32_t errorCount = 0;
673 for (int64_t loopCount = 0; fLoopCount < 0 || loopCount < fLoopCount; loopCount++) {
674 status = U_ZERO_ERROR;
675 fTestData->set(fRuleSet.getAlias(), fRandomGenerator, status);
676 if (fBI.isNull()) {
677 IntlTest::gTest->dataerrln("Unable to run test because fBI is null.");
678 return;
679 }
680 // fTestData->dump();
681 testForwards(status);
682 testPrevious(status);
683 testFollowing(status);
684 testPreceding(status);
685 testIsBoundary(status);
686 testIsBoundaryRandom(status);
687
688 if (fLoopCount < 0 && loopCount % 100 == 0) {
689 fprintf(stderr, ".");
690 }
691 if (U_FAILURE(status)) {
692 if (++errorCount > 10) {
693 return;
694 }
695 }
696 }
697 }
698
testForwards(UErrorCode & status)699 void RBBIMonkeyImpl::testForwards(UErrorCode &status) {
700 if (U_FAILURE(status)) {
701 return;
702 }
703 fTestData->clearActualBreaks();
704 fBI->setText(fTestData->fString);
705 int32_t previousBreak = -2;
706 for (int32_t bk=fBI->first(); bk != BreakIterator::DONE; bk=fBI->next()) {
707 if (bk <= previousBreak) {
708 MONKEY_ERROR("Break Iterator Stall", bk);
709 return;
710 }
711 if (bk < 0 || bk > fTestData->fString.length()) {
712 MONKEY_ERROR("Boundary out of bounds", bk);
713 return;
714 }
715 fTestData->fActualBreaks.setCharAt(bk, 1);
716 }
717 checkResults("testForwards", FORWARD, status);
718 }
719
testFollowing(UErrorCode & status)720 void RBBIMonkeyImpl::testFollowing(UErrorCode &status) {
721 if (U_FAILURE(status)) {
722 return;
723 }
724 fTestData->clearActualBreaks();
725 fBI->setText(fTestData->fString);
726 int32_t nextBreak = -1;
727 for (int32_t i=-1 ; i<fTestData->fString.length(); ++i) {
728 int32_t bk = fBI->following(i);
729 if (bk == BreakIterator::DONE && i == fTestData->fString.length()) {
730 continue;
731 }
732 if (bk == nextBreak && bk > i) {
733 // i is in the gap between two breaks.
734 continue;
735 }
736 if (i == nextBreak && bk > nextBreak) {
737 fTestData->fActualBreaks.setCharAt(bk, 1);
738 nextBreak = bk;
739 continue;
740 }
741 MONKEY_ERROR("following(i)", i);
742 return;
743 }
744 checkResults("testFollowing", FORWARD, status);
745 }
746
747
748
testPrevious(UErrorCode & status)749 void RBBIMonkeyImpl::testPrevious(UErrorCode &status) {
750 if (U_FAILURE(status)) {return;}
751
752 fTestData->clearActualBreaks();
753 fBI->setText(fTestData->fString);
754 int32_t previousBreak = INT32_MAX;
755 for (int32_t bk=fBI->last(); bk != BreakIterator::DONE; bk=fBI->previous()) {
756 if (bk >= previousBreak) {
757 MONKEY_ERROR("Break Iterator Stall", bk);
758 return;
759 }
760 if (bk < 0 || bk > fTestData->fString.length()) {
761 MONKEY_ERROR("Boundary out of bounds", bk);
762 return;
763 }
764 fTestData->fActualBreaks.setCharAt(bk, 1);
765 }
766 checkResults("testPrevius", REVERSE, status);
767 }
768
769
testPreceding(UErrorCode & status)770 void RBBIMonkeyImpl::testPreceding(UErrorCode &status) {
771 if (U_FAILURE(status)) {
772 return;
773 }
774 fTestData->clearActualBreaks();
775 fBI->setText(fTestData->fString);
776 int32_t nextBreak = fTestData->fString.length()+1;
777 for (int32_t i=fTestData->fString.length()+1 ; i>=0; --i) {
778 int32_t bk = fBI->preceding(i);
779 // printf("i:%d bk:%d nextBreak:%d\n", i, bk, nextBreak);
780 if (bk == BreakIterator::DONE && i == 0) {
781 continue;
782 }
783 if (bk == nextBreak && bk < i) {
784 // i is in the gap between two breaks.
785 continue;
786 }
787 if (i<fTestData->fString.length() && fTestData->fString.getChar32Start(i) < i) {
788 // i indexes to a trailing surrogate.
789 // Break Iterators treat an index to either half as referring to the supplemental code point,
790 // with preceding going to some preceding code point.
791 if (fBI->preceding(i) != fBI->preceding(fTestData->fString.getChar32Start(i))) {
792 MONKEY_ERROR("preceding of trailing surrogate error", i);
793 }
794 continue;
795 }
796 if (i == nextBreak && bk < nextBreak) {
797 fTestData->fActualBreaks.setCharAt(bk, 1);
798 nextBreak = bk;
799 continue;
800 }
801 MONKEY_ERROR("preceding(i)", i);
802 return;
803 }
804 checkResults("testPreceding", REVERSE, status);
805 }
806
807
testIsBoundary(UErrorCode & status)808 void RBBIMonkeyImpl::testIsBoundary(UErrorCode &status) {
809 if (U_FAILURE(status)) {
810 return;
811 }
812 fTestData->clearActualBreaks();
813 fBI->setText(fTestData->fString);
814 for (int i=fTestData->fString.length(); i>=0; --i) {
815 if (fBI->isBoundary(i)) {
816 fTestData->fActualBreaks.setCharAt(i, 1);
817 }
818 }
819 checkResults("testForwards", FORWARD, status);
820 }
821
testIsBoundaryRandom(UErrorCode & status)822 void RBBIMonkeyImpl::testIsBoundaryRandom(UErrorCode &status) {
823 if (U_FAILURE(status)) {
824 return;
825 }
826 fBI->setText(fTestData->fString);
827
828 int stringLen = fTestData->fString.length();
829 for (int i=stringLen; i>=0; --i) {
830 int strIdx = fRandomGenerator() % stringLen;
831 if (fTestData->fExpectedBreaks.charAt(strIdx) != fBI->isBoundary(strIdx)) {
832 IntlTest::gTest->errln("%s:%d testIsBoundaryRandom failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
833 __FILE__, __LINE__, strIdx, fRuleFileName, fTestData->fRandomSeed);
834 if (fVerbose) {
835 fTestData->dump(i);
836 }
837 status = U_INVALID_STATE_ERROR;
838 break;
839 }
840 }
841 }
842
843
844
checkResults(const char * msg,CheckDirection direction,UErrorCode & status)845 void RBBIMonkeyImpl::checkResults(const char *msg, CheckDirection direction, UErrorCode &status) {
846 if (U_FAILURE(status)) {
847 return;
848 }
849 if (direction == FORWARD) {
850 for (int i=0; i<=fTestData->fString.length(); ++i) {
851 if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) {
852 IntlTest::gTest->errln("%s:%d %s failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
853 __FILE__, __LINE__, msg, i, fRuleFileName, fTestData->fRandomSeed);
854 if (fVerbose) {
855 fTestData->dump(i);
856 }
857 status = U_INVALID_STATE_ERROR; // Prevent the test from continuing, which would likely
858 break; // produce many redundant errors.
859 }
860 }
861 } else {
862 for (int i=fTestData->fString.length(); i>=0; i--) {
863 if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) {
864 IntlTest::gTest->errln("%s:%d %s failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
865 __FILE__, __LINE__, msg, i, fRuleFileName, fTestData->fRandomSeed);
866 if (fVerbose) {
867 fTestData->dump(i);
868 }
869 status = U_INVALID_STATE_ERROR;
870 break;
871 }
872 }
873 }
874 }
875
876
877
878 //---------------------------------------------------------------------------------------
879 //
880 // class RBBIMonkeyTest implementation.
881 //
882 //---------------------------------------------------------------------------------------
RBBIMonkeyTest()883 RBBIMonkeyTest::RBBIMonkeyTest() {
884 }
885
~RBBIMonkeyTest()886 RBBIMonkeyTest::~RBBIMonkeyTest() {
887 }
888
889
890 // params, taken from this->fParams.
891 // rules=file_name Name of file containing the reference rules.
892 // seed=nnnnn Random number starting seed.
893 // Setting the seed allows errors to be reproduced.
894 // loop=nnn Looping count. Controls running time.
895 // -1: run forever.
896 // 0 or greater: run length.
897 // expansions debug option, show expansions of rules and sets.
898 // verbose Display details of the failure.
899 //
900 // Parameters on the intltest command line follow the test name, and are preceded by '@'.
901 // For example,
902 // intltest rbbi/RBBIMonkeyTest/testMonkey@rules=line.txt,loop=-1
903 //
testMonkey()904 void RBBIMonkeyTest::testMonkey() {
905 // printf("Test parameters: %s\n", fParams);
906 UnicodeString params(fParams);
907 UErrorCode status = U_ZERO_ERROR;
908
909 const char *tests[] = {"grapheme.txt", "word.txt", "line.txt", "line_cj.txt", "sentence.txt", "line_normal.txt",
910 "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt",
911 NULL };
912 CharString testNameFromParams;
913 if (getStringParam("rules", params, testNameFromParams, status)) {
914 tests[0] = testNameFromParams.data();
915 tests[1] = NULL;
916 }
917
918 int64_t loopCount = quick? 100 : 5000;
919 getIntParam("loop", params, loopCount, status);
920
921 UBool dumpExpansions = FALSE;
922 getBoolParam("expansions", params, dumpExpansions, status);
923
924 UBool verbose = FALSE;
925 getBoolParam("verbose", params, verbose, status);
926
927 int64_t seed = 0;
928 getIntParam("seed", params, seed, status);
929
930 if (params.length() != 0) {
931 // Options processing did not consume all of the parameters. Something unrecognized was present.
932 CharString unrecognizedParameters;
933 unrecognizedParameters.append(CStr(params)(), -1, status);
934 errln("%s:%d unrecognized test parameter(s) \"%s\"", __FILE__, __LINE__, unrecognizedParameters.data());
935 return;
936 }
937
938 UVector startedTests(status);
939 if (U_FAILURE(status)) {
940 errln("%s:%d: error %s while setting up test.", __FILE__, __LINE__, u_errorName(status));
941 return;
942 }
943
944 // Monkey testing is multi-threaded.
945 // Each set of break rules to be tested is run in a separate thread.
946 // Each thread/set of rules gets a separate RBBIMonkeyImpl object.
947 int32_t i;
948 for (i=0; tests[i] != NULL; ++i) {
949 logln("beginning testing of %s", tests[i]);
950 LocalPointer<RBBIMonkeyImpl> test(new RBBIMonkeyImpl(status));
951 if (U_FAILURE(status)) {
952 dataerrln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
953 break;
954 }
955 test->fDumpExpansions = dumpExpansions;
956 test->fVerbose = verbose;
957 test->fRandomGenerator.seed(static_cast<uint32_t>(seed));
958 test->fLoopCount = static_cast<int32_t>(loopCount);
959 test->setup(tests[i], status);
960 if (U_FAILURE(status)) {
961 dataerrln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
962 break;
963 }
964 test->startTest();
965 startedTests.addElement(test.orphan(), status);
966 if (U_FAILURE(status)) {
967 errln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
968 break;
969 }
970 }
971
972 for (i=0; i<startedTests.size(); ++i) {
973 RBBIMonkeyImpl *test = static_cast<RBBIMonkeyImpl *>(startedTests.elementAt(i));
974 test->join();
975 delete test;
976 }
977 }
978
979
getIntParam(UnicodeString name,UnicodeString & params,int64_t & val,UErrorCode & status)980 UBool RBBIMonkeyTest::getIntParam(UnicodeString name, UnicodeString ¶ms, int64_t &val, UErrorCode &status) {
981 name.append(" *= *(-?\\d+) *,? *");
982 RegexMatcher m(name, params, 0, status);
983 if (m.find()) {
984 // The param exists. Convert the string to an int.
985 CharString str;
986 str.append(CStr(m.group(1, status))(), -1, status);
987 val = strtol(str.data(), NULL, 10);
988
989 // Delete this parameter from the params string.
990 m.reset();
991 params = m.replaceFirst(UnicodeString(), status);
992 return TRUE;
993 }
994 return FALSE;
995 }
996
getStringParam(UnicodeString name,UnicodeString & params,CharString & dest,UErrorCode & status)997 UBool RBBIMonkeyTest::getStringParam(UnicodeString name, UnicodeString ¶ms, CharString &dest, UErrorCode &status) {
998 name.append(" *= *([^ ,]*) *,? *");
999 RegexMatcher m(name, params, 0, status);
1000 if (m.find()) {
1001 // The param exists.
1002 dest.append(CStr(m.group(1, status))(), -1, status);
1003
1004 // Delete this parameter from the params string.
1005 m.reset();
1006 params = m.replaceFirst(UnicodeString(), status);
1007 return TRUE;
1008 }
1009 return FALSE;
1010 }
1011
getBoolParam(UnicodeString name,UnicodeString & params,UBool & dest,UErrorCode & status)1012 UBool RBBIMonkeyTest::getBoolParam(UnicodeString name, UnicodeString ¶ms, UBool &dest, UErrorCode &status) {
1013 name.append("(?: *= *(true|false))? *,? *");
1014 RegexMatcher m(name, params, UREGEX_CASE_INSENSITIVE, status);
1015 if (m.find()) {
1016 if (m.start(1, status) > 0) {
1017 // user option included a value.
1018 dest = m.group(1, status).caseCompare(UnicodeString("true"), U_FOLD_CASE_DEFAULT) == 0;
1019 } else {
1020 // No explicit user value, implies true.
1021 dest = TRUE;
1022 }
1023
1024 // Delete this parameter from the params string.
1025 m.reset();
1026 params = m.replaceFirst(UnicodeString(), status);
1027 return TRUE;
1028 }
1029 return FALSE;
1030 }
1031
1032 #endif /* !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING */
1033