• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /********************************************************************
2  * COPYRIGHT:
3  * Copyright (c) 2002-2010, International Business Machines Corporation and
4  * others. All Rights Reserved.
5  ********************************************************************/
6 
7 //
8 //   regextst.cpp
9 //
10 //      ICU Regular Expressions test, part of intltest.
11 //
12 
13 #include "intltest.h"
14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
15 
16 #include "unicode/regex.h"
17 #include "unicode/uchar.h"
18 #include "unicode/ucnv.h"
19 #include "unicode/ustring.h"
20 #include "regextst.h"
21 #include "uvector.h"
22 #include "util.h"
23 #include <stdlib.h>
24 #include <string.h>
25 #include <stdio.h>
26 
27 #define SUPPORT_MUTATING_INPUT_STRING   0
28 
29 
30 //---------------------------------------------------------------------------
31 //
32 //  Test class boilerplate
33 //
34 //---------------------------------------------------------------------------
RegexTest()35 RegexTest::RegexTest()
36 {
37 }
38 
39 
~RegexTest()40 RegexTest::~RegexTest()
41 {
42 }
43 
44 
45 
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)46 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
47 {
48     if (exec) logln("TestSuite RegexTest: ");
49     switch (index) {
50 
51         case 0: name = "Basic";
52             if (exec) Basic();
53             break;
54         case 1: name = "API_Match";
55             if (exec) API_Match();
56             break;
57         case 2: name = "API_Replace";
58             if (exec) API_Replace();
59             break;
60         case 3: name = "API_Pattern";
61             if (exec) API_Pattern();
62             break;
63         case 4:
64 #if !UCONFIG_NO_FILE_IO
65             name = "Extended";
66             if (exec) Extended();
67 #else
68             name = "skip";
69 #endif
70             break;
71         case 5: name = "Errors";
72             if (exec) Errors();
73             break;
74         case 6: name = "PerlTests";
75             if (exec) PerlTests();
76             break;
77         case 7: name = "Callbacks";
78             if (exec) Callbacks();
79             break;
80         case 8: name = "Bug 6149";
81              if (exec) Bug6149();
82              break;
83         case 9: name = "UTextBasic";
84           if (exec) UTextBasic();
85           break;
86         case 10: name = "API_Match_UTF8";
87           if (exec) API_Match_UTF8();
88           break;
89         case 11: name = "API_Replace_UTF8";
90           if (exec) API_Replace_UTF8();
91           break;
92         case 12: name = "API_Pattern_UTF8";
93           if (exec) API_Pattern_UTF8();
94           break;
95         case 13: name = "PerlTestsUTF8";
96           if (exec) PerlTestsUTF8();
97           break;
98         case 14: name = "PreAllocatedUTextCAPI";
99           if (exec) PreAllocatedUTextCAPI();
100           break;
101         case 15: name = "Bug 7651";
102           if (exec) Bug7651();
103           break;
104 
105         default: name = "";
106             break; //needed to end loop
107     }
108 }
109 
110 
111 //---------------------------------------------------------------------------
112 //
113 //   Error Checking / Reporting macros used in all of the tests.
114 //
115 //---------------------------------------------------------------------------
// REGEX_CHECK_STATUS
//   If the in-scope variable `status` holds a failure code, report it (with the
//   line of the macro invocation) through dataerrln() and return from the
//   enclosing function.
#define REGEX_CHECK_STATUS { \
    if (U_FAILURE(status)) { \
        dataerrln("RegexTest failure at line %d.  status=%s", __LINE__, u_errorName(status)); \
        return; \
    } \
}

// REGEX_ASSERT(expr)
//   Report a test error when expr compares equal to FALSE.  Execution continues.
#define REGEX_ASSERT(expr) { \
    if ((expr)==FALSE) { \
        errln("RegexTest failure at line %d.\n", __LINE__); \
    } \
}

// REGEX_ASSERT_FAIL(expr, errcode)
//   Evaluate expr against a fresh local `status` (which shadows any outer
//   variable of that name) and report through dataerrln() when the resulting
//   status is not errcode.
#define REGEX_ASSERT_FAIL(expr, errcode) { \
    UErrorCode status=U_ZERO_ERROR; \
    (expr); \
    if (status!=errcode) { \
        dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
            __LINE__, u_errorName(errcode), u_errorName(status)); \
    } \
}

// REGEX_CHECK_STATUS_L(line)
//   Like REGEX_CHECK_STATUS, but also prints the supplied originating line,
//   prints the numeric status value, and does not return.
#define REGEX_CHECK_STATUS_L(line) { \
    if (U_FAILURE(status)) { \
        errln("RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); \
    } \
}

// REGEX_ASSERT_L(expr, line)
//   Like REGEX_ASSERT, but also prints the supplied originating line and
//   returns from the enclosing function on failure.
#define REGEX_ASSERT_L(expr, line) { \
    if ((expr)==FALSE) { \
        errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
        return; \
    } \
}
130 
assertUText(const char * expected,UText * actual,const char * file,int line)131 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
132     UErrorCode status = U_ZERO_ERROR;
133     UText expectedText = UTEXT_INITIALIZER;
134     utext_openUTF8(&expectedText, expected, -1, &status);
135     utext_setNativeIndex(actual, 0);
136     if (utext_compare(&expectedText, -1, actual, -1) != 0) {
137         char buf[201 /*21*/];
138         char *bufPtr = buf;
139         UChar32 c = utext_next32From(actual, 0);
140         while (c != U_SENTINEL && bufPtr < buf+200/*20*/) {
141             if (0x20<c && c<0x7e) {
142                 *bufPtr = c;
143             } else {
144                 *bufPtr = '.';
145             }
146             bufPtr++;
147             c = UTEXT_NEXT32(actual);
148         }
149         *bufPtr = 0;
150 
151         errln("Failure at file %s, line %d, expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expected, utext_nativeLength(&expectedText), buf, utext_nativeLength(actual));
152     }
153     utext_close(&expectedText);
154 }
155 
// Calls assertUText() with the file name and line number of the invocation.
#define REGEX_ASSERT_UTEXT(expected, actual) \
    assertUText((expected), (actual), __FILE__, __LINE__)
157 
158 
159 //---------------------------------------------------------------------------
160 //
161 //    REGEX_TESTLM       Macro + invocation function to simplify writing quick tests
162 //                       for the LookingAt() and  Match() functions.
163 //
164 //       usage:
165 //          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
166 //
167 //          The expected results are UBool - TRUE or FALSE.
168 //          The input text is unescaped.  The pattern is not.
169 //
170 //
171 //---------------------------------------------------------------------------
172 
// Runs the lookingAt()/matches() check twice for the same arguments: first
// through doRegexLMTest(), then through doRegexLMTestUTF8().  Both receive the
// line number of the invocation.
#define REGEX_TESTLM(pat, text, looking, match) { \
    doRegexLMTest(pat, text, looking, match, __LINE__); \
    doRegexLMTestUTF8(pat, text, looking, match, __LINE__); \
}
174 
doRegexLMTest(const char * pat,const char * text,UBool looking,UBool match,int32_t line)175 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
176     const UnicodeString pattern(pat, -1, US_INV);
177     const UnicodeString inputText(text, -1, US_INV);
178     UErrorCode          status  = U_ZERO_ERROR;
179     UParseError         pe;
180     RegexPattern        *REPattern = NULL;
181     RegexMatcher        *REMatcher = NULL;
182     UBool               retVal     = TRUE;
183 
184     UnicodeString patString(pat, -1, US_INV);
185     REPattern = RegexPattern::compile(patString, 0, pe, status);
186     if (U_FAILURE(status)) {
187         dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
188             line, u_errorName(status));
189         return FALSE;
190     }
191     if (line==376) { RegexPatternDump(REPattern);}
192 
193     UnicodeString inputString(inputText);
194     UnicodeString unEscapedInput = inputString.unescape();
195     REMatcher = REPattern->matcher(unEscapedInput, status);
196     if (U_FAILURE(status)) {
197         errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
198             line, u_errorName(status));
199         return FALSE;
200     }
201 
202     UBool actualmatch;
203     actualmatch = REMatcher->lookingAt(status);
204     if (U_FAILURE(status)) {
205         errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
206             line, u_errorName(status));
207         retVal =  FALSE;
208     }
209     if (actualmatch != looking) {
210         errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
211         retVal = FALSE;
212     }
213 
214     status = U_ZERO_ERROR;
215     actualmatch = REMatcher->matches(status);
216     if (U_FAILURE(status)) {
217         errln("RegexTest failure in matches() at line %d.  Status = %s\n",
218             line, u_errorName(status));
219         retVal = FALSE;
220     }
221     if (actualmatch != match) {
222         errln("RegexTest: wrong return from matches() at line %d.\n", line);
223         retVal = FALSE;
224     }
225 
226     if (retVal == FALSE) {
227         RegexPatternDump(REPattern);
228     }
229 
230     delete REPattern;
231     delete REMatcher;
232     return retVal;
233 }
234 
235 
doRegexLMTestUTF8(const char * pat,const char * text,UBool looking,UBool match,int32_t line)236 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
237     UText               pattern    = UTEXT_INITIALIZER;
238     int32_t             inputUTF8Length;
239     char                *textChars = NULL;
240     UText               inputText  = UTEXT_INITIALIZER;
241     UErrorCode          status     = U_ZERO_ERROR;
242     UParseError         pe;
243     RegexPattern        *REPattern = NULL;
244     RegexMatcher        *REMatcher = NULL;
245     UBool               retVal     = TRUE;
246 
247     utext_openUTF8(&pattern, pat, -1, &status);
248     REPattern = RegexPattern::compile(&pattern, 0, pe, status);
249     if (U_FAILURE(status)) {
250         dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
251             line, u_errorName(status));
252         return FALSE;
253     }
254 
255     UnicodeString inputString(text, -1, US_INV);
256     UnicodeString unEscapedInput = inputString.unescape();
257     LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
258     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
259 
260     inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
261     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
262         // UTF-8 does not allow unpaired surrogates, so this could actually happen
263         logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
264         return TRUE; // not a failure of the Regex engine
265     }
266     status = U_ZERO_ERROR; // buffer overflow
267     textChars = new char[inputUTF8Length+1];
268     unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
269     utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
270 
271     REMatcher = REPattern->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status);
272     if (U_FAILURE(status)) {
273         errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
274             line, u_errorName(status));
275         return FALSE;
276     }
277 
278     UBool actualmatch;
279     actualmatch = REMatcher->lookingAt(status);
280     if (U_FAILURE(status)) {
281         errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
282             line, u_errorName(status));
283         retVal =  FALSE;
284     }
285     if (actualmatch != looking) {
286         errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
287         retVal = FALSE;
288     }
289 
290     status = U_ZERO_ERROR;
291     actualmatch = REMatcher->matches(status);
292     if (U_FAILURE(status)) {
293         errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
294             line, u_errorName(status));
295         retVal = FALSE;
296     }
297     if (actualmatch != match) {
298         errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
299         retVal = FALSE;
300     }
301 
302     if (retVal == FALSE) {
303         RegexPatternDump(REPattern);
304     }
305 
306     delete REPattern;
307     delete REMatcher;
308     utext_close(&inputText);
309     utext_close(&pattern);
310     delete[] textChars;
311     return retVal;
312 }
313 
314 
315 
316 //---------------------------------------------------------------------------
317 //
318 //    REGEX_ERR       Macro + invocation function to simplify writing tests
319 //                       regex tests for incorrect patterns
320 //
321 //       usage:
322 //          REGEX_ERR("pattern",   expected error line, column, expected status);
323 //
324 //---------------------------------------------------------------------------
// Forwards to regex_err(), adding the line number of the invocation.
// The expansion already ends with a semicolon.
#define REGEX_ERR(pat, line, col, status) \
    regex_err(pat, line, col, status, __LINE__);
326 
regex_err(const char * pat,int32_t errLine,int32_t errCol,UErrorCode expectedStatus,int32_t line)327 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
328                           UErrorCode expectedStatus, int32_t line) {
329     UnicodeString       pattern(pat);
330 
331     UErrorCode          status         = U_ZERO_ERROR;
332     UParseError         pe;
333     RegexPattern        *callerPattern = NULL;
334 
335     //
336     //  Compile the caller's pattern
337     //
338     UnicodeString patString(pat);
339     callerPattern = RegexPattern::compile(patString, 0, pe, status);
340     if (status != expectedStatus) {
341         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
342     } else {
343         if (status != U_ZERO_ERROR) {
344             if (pe.line != errLine || pe.offset != errCol) {
345                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
346                     line, errLine, errCol, pe.line, pe.offset);
347             }
348         }
349     }
350 
351     delete callerPattern;
352 
353     //
354     //  Compile again, using a UTF-8-based UText
355     //
356     UText patternText = UTEXT_INITIALIZER;
357     utext_openUTF8(&patternText, pat, -1, &status);
358     callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
359     if (status != expectedStatus) {
360         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
361     } else {
362         if (status != U_ZERO_ERROR) {
363             if (pe.line != errLine || pe.offset != errCol) {
364                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
365                     line, errLine, errCol, pe.line, pe.offset);
366             }
367         }
368     }
369 
370     delete callerPattern;
371     utext_close(&patternText);
372 }
373 
374 
375 
376 //---------------------------------------------------------------------------
377 //
378 //      Basic      Check for basic functionality of regex pattern matching.
379 //                 Avoid the use of REGEX_FIND test macro, which has
380 //                 substantial dependencies on basic Regex functionality.
381 //
382 //---------------------------------------------------------------------------
Basic()383 void RegexTest::Basic() {
384 
385 
386 //
387 // Debug - slide failing test cases early
388 //
389 #if 0
390     {
391         // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
392         UParseError pe;
393         UErrorCode  status = U_ZERO_ERROR;
394         RegexPattern::compile("^(?:a?b?)*$", 0, pe, status);
395         // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
396         // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
397     }
398     exit(1);
399 #endif
400 
401 
402     //
403     // Pattern with parentheses
404     //
405     REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
406     REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
407     REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);
408 
409     //
410     // Patterns with *
411     //
412     REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
413     REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
414     REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
415     REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
416     REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
417 
418     REGEX_TESTLM("a*", "",  TRUE, TRUE);
419     REGEX_TESTLM("a*", "b", TRUE, FALSE);
420 
421 
422     //
423     //  Patterns with "."
424     //
425     REGEX_TESTLM(".", "abc", TRUE, FALSE);
426     REGEX_TESTLM("...", "abc", TRUE, TRUE);
427     REGEX_TESTLM("....", "abc", FALSE, FALSE);
428     REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
429     REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
430     REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
431     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
432     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
433 
434     //
435     //  Patterns with * applied to chars at end of literal string
436     //
437     REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
438     REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
439 
440     //
441     //  Supplemental chars match as single chars, not a pair of surrogates.
442     //
443     REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
444     REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
445     REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
446 
447 
448     //
449     //  UnicodeSets in the pattern
450     //
451     REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
452     REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
453     REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
454     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
455     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
456     REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
457 
458     REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
459     REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
460     REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
461     REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurences.
462     REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);
463 
464     //
465     //   OR operator in patterns
466     //
467     REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
468     REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
469     REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
470     REGEX_TESTLM("a|b", "b", TRUE, TRUE);
471 
472     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
473     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
474     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
475     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
476     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
477     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
478 
479     //
480     //  +
481     //
482     REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
483     REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
484     REGEX_TESTLM("b+", "", FALSE, FALSE);
485     REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
486     REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
487     REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
488 
489     //
490     //   ?
491     //
492     REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
493     REGEX_TESTLM("ab?", "a", TRUE, TRUE);
494     REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
495     REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
496     REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
497     REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
498     REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
499     REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
500     REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
501 
502     //
503     //  Escape sequences that become single literal chars, handled internally
504     //   by ICU's Unescape.
505     //
506 
507     // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
508     REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
509     REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
510     REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
511     REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
512     REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
513     REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
514     REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
515     REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
516     REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
517 
518     REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
519     REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input
520 
521     // Escape of special chars in patterns
522     REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
523 }
524 
525 
526 //---------------------------------------------------------------------------
527 //
528 //    UTextBasic   Check for quirks that are specific to the UText
529 //                 implementation.
530 //
531 //---------------------------------------------------------------------------
UTextBasic()532 void RegexTest::UTextBasic() {
533     UErrorCode status = U_ZERO_ERROR;
534     UText pattern = UTEXT_INITIALIZER;
535     utext_openUTF8(&pattern, "abc", -1, &status);
536     RegexMatcher matcher(&pattern, 0, status);
537     REGEX_CHECK_STATUS;
538 
539     UText input = UTEXT_INITIALIZER;
540     utext_openUTF8(&input, "abc", -1, &status);
541     REGEX_CHECK_STATUS;
542     matcher.reset(&input);
543     REGEX_CHECK_STATUS;
544     REGEX_ASSERT_UTEXT("abc", matcher.inputText());
545 
546     matcher.reset(matcher.inputText());
547     REGEX_CHECK_STATUS;
548     REGEX_ASSERT_UTEXT("abc", matcher.inputText());
549 
550     utext_close(&pattern);
551     utext_close(&input);
552 }
553 
554 
555 //---------------------------------------------------------------------------
556 //
557 //      API_Match   Test that the API for class RegexMatcher
558 //                  is present and nominally working, but excluding functions
559 //                  implementing replace operations.
560 //
561 //---------------------------------------------------------------------------
API_Match()562 void RegexTest::API_Match() {
563     UParseError         pe;
564     UErrorCode          status=U_ZERO_ERROR;
565     int32_t             flags = 0;
566 
567     //
568     // Debug - slide failing test cases early
569     //
570 #if 0
571     {
572     }
573     return;
574 #endif
575 
576     //
577     // Simple pattern compilation
578     //
579     {
580         UnicodeString       re("abc");
581         RegexPattern        *pat2;
582         pat2 = RegexPattern::compile(re, flags, pe, status);
583         REGEX_CHECK_STATUS;
584 
585         UnicodeString inStr1 = "abcdef this is a test";
586         UnicodeString instr2 = "not abc";
587         UnicodeString empty  = "";
588 
589 
590         //
591         // Matcher creation and reset.
592         //
593         RegexMatcher *m1 = pat2->matcher(inStr1, status);
594         REGEX_CHECK_STATUS;
595         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
596         REGEX_ASSERT(m1->input() == inStr1);
597         m1->reset(instr2);
598         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
599         REGEX_ASSERT(m1->input() == instr2);
600         m1->reset(inStr1);
601         REGEX_ASSERT(m1->input() == inStr1);
602         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
603         m1->reset(empty);
604         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
605         REGEX_ASSERT(m1->input() == empty);
606         REGEX_ASSERT(&m1->pattern() == pat2);
607 
608         //
609         //  reset(pos, status)
610         //
611         m1->reset(inStr1);
612         m1->reset(4, status);
613         REGEX_CHECK_STATUS;
614         REGEX_ASSERT(m1->input() == inStr1);
615         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
616 
617         m1->reset(-1, status);
618         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
619         status = U_ZERO_ERROR;
620 
621         m1->reset(0, status);
622         REGEX_CHECK_STATUS;
623         status = U_ZERO_ERROR;
624 
625         int32_t len = m1->input().length();
626         m1->reset(len-1, status);
627         REGEX_CHECK_STATUS;
628         status = U_ZERO_ERROR;
629 
630         m1->reset(len, status);
631         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
632         status = U_ZERO_ERROR;
633 
634         //
635         // match(pos, status)
636         //
637         m1->reset(instr2);
638         REGEX_ASSERT(m1->matches(4, status) == TRUE);
639         m1->reset();
640         REGEX_ASSERT(m1->matches(3, status) == FALSE);
641         m1->reset();
642         REGEX_ASSERT(m1->matches(5, status) == FALSE);
643         REGEX_ASSERT(m1->matches(4, status) == TRUE);
644         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
645         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
646 
647         // Match() at end of string should fail, but should not
648         //  be an error.
649         status = U_ZERO_ERROR;
650         len = m1->input().length();
651         REGEX_ASSERT(m1->matches(len, status) == FALSE);
652         REGEX_CHECK_STATUS;
653 
654         // Match beyond end of string should fail with an error.
655         status = U_ZERO_ERROR;
656         REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
657         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
658 
659         // Successful match at end of string.
660         {
661             status = U_ZERO_ERROR;
662             RegexMatcher m("A?", 0, status);  // will match zero length string.
663             REGEX_CHECK_STATUS;
664             m.reset(inStr1);
665             len = inStr1.length();
666             REGEX_ASSERT(m.matches(len, status) == TRUE);
667             REGEX_CHECK_STATUS;
668             m.reset(empty);
669             REGEX_ASSERT(m.matches(0, status) == TRUE);
670             REGEX_CHECK_STATUS;
671         }
672 
673 
674         //
675         // lookingAt(pos, status)
676         //
677         status = U_ZERO_ERROR;
678         m1->reset(instr2);  // "not abc"
679         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
680         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
681         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
682         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
683         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
684         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
685         status = U_ZERO_ERROR;
686         len = m1->input().length();
687         REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
688         REGEX_CHECK_STATUS;
689         REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
690         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
691 
692         delete m1;
693         delete pat2;
694     }
695 
696 
697     //
698     // Capture Group.
699     //     RegexMatcher::start();
700     //     RegexMatcher::end();
701     //     RegexMatcher::groupCount();
702     //
703     {
704         int32_t             flags=0;
705         UParseError         pe;
706         UErrorCode          status=U_ZERO_ERROR;
707 
708         UnicodeString       re("01(23(45)67)(.*)");
709         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
710         REGEX_CHECK_STATUS;
711         UnicodeString data = "0123456789";
712 
713         RegexMatcher *matcher = pat->matcher(data, status);
714         REGEX_CHECK_STATUS;
715         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
716         static const int32_t matchStarts[] = {0,  2, 4, 8};
717         static const int32_t matchEnds[]   = {10, 8, 6, 10};
718         int32_t i;
719         for (i=0; i<4; i++) {
720             int32_t actualStart = matcher->start(i, status);
721             REGEX_CHECK_STATUS;
722             if (actualStart != matchStarts[i]) {
723                 errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
724                     __LINE__, i, matchStarts[i], actualStart);
725             }
726             int32_t actualEnd = matcher->end(i, status);
727             REGEX_CHECK_STATUS;
728             if (actualEnd != matchEnds[i]) {
729                 errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
730                     __LINE__, i, matchEnds[i], actualEnd);
731             }
732         }
733 
734         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
735         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
736 
737         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
738         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
739         matcher->reset();
740         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
741 
742         matcher->lookingAt(status);
743         REGEX_ASSERT(matcher->group(status)    == "0123456789");
744         REGEX_ASSERT(matcher->group(0, status) == "0123456789");
745         REGEX_ASSERT(matcher->group(1, status) == "234567"    );
746         REGEX_ASSERT(matcher->group(2, status) == "45"        );
747         REGEX_ASSERT(matcher->group(3, status) == "89"        );
748         REGEX_CHECK_STATUS;
749         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
750         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
751         matcher->reset();
752         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
753 
754         delete matcher;
755         delete pat;
756 
757     }
758 
759     //
760     //  find
761     //
762     {
763         int32_t             flags=0;
764         UParseError         pe;
765         UErrorCode          status=U_ZERO_ERROR;
766 
767         UnicodeString       re("abc");
768         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
769         REGEX_CHECK_STATUS;
770         UnicodeString data = ".abc..abc...abc..";
771         //                    012345678901234567
772 
773         RegexMatcher *matcher = pat->matcher(data, status);
774         REGEX_CHECK_STATUS;
775         REGEX_ASSERT(matcher->find());
776         REGEX_ASSERT(matcher->start(status) == 1);
777         REGEX_ASSERT(matcher->find());
778         REGEX_ASSERT(matcher->start(status) == 6);
779         REGEX_ASSERT(matcher->find());
780         REGEX_ASSERT(matcher->start(status) == 12);
781         REGEX_ASSERT(matcher->find() == FALSE);
782         REGEX_ASSERT(matcher->find() == FALSE);
783 
784         matcher->reset();
785         REGEX_ASSERT(matcher->find());
786         REGEX_ASSERT(matcher->start(status) == 1);
787 
788         REGEX_ASSERT(matcher->find(0, status));
789         REGEX_ASSERT(matcher->start(status) == 1);
790         REGEX_ASSERT(matcher->find(1, status));
791         REGEX_ASSERT(matcher->start(status) == 1);
792         REGEX_ASSERT(matcher->find(2, status));
793         REGEX_ASSERT(matcher->start(status) == 6);
794         REGEX_ASSERT(matcher->find(12, status));
795         REGEX_ASSERT(matcher->start(status) == 12);
796         REGEX_ASSERT(matcher->find(13, status) == FALSE);
797         REGEX_ASSERT(matcher->find(16, status) == FALSE);
798         REGEX_ASSERT(matcher->find(17, status) == FALSE);
799         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
800 
801         status = U_ZERO_ERROR;
802         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
803         status = U_ZERO_ERROR;
804         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
805 
806         REGEX_ASSERT(matcher->groupCount() == 0);
807 
808         delete matcher;
809         delete pat;
810     }
811 
812 
813     //
814     //  find, with \G in pattern (true if at the end of a previous match).
815     //
816     {
817         int32_t             flags=0;
818         UParseError         pe;
819         UErrorCode          status=U_ZERO_ERROR;
820 
821         UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
822         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
823         REGEX_CHECK_STATUS;
824         UnicodeString data = ".abcabc.abc..";
825         //                    012345678901234567
826 
827         RegexMatcher *matcher = pat->matcher(data, status);
828         REGEX_CHECK_STATUS;
829         REGEX_ASSERT(matcher->find());
830         REGEX_ASSERT(matcher->start(status) == 0);
831         REGEX_ASSERT(matcher->start(1, status) == -1);
832         REGEX_ASSERT(matcher->start(2, status) == 1);
833 
834         REGEX_ASSERT(matcher->find());
835         REGEX_ASSERT(matcher->start(status) == 4);
836         REGEX_ASSERT(matcher->start(1, status) == 4);
837         REGEX_ASSERT(matcher->start(2, status) == -1);
838         REGEX_CHECK_STATUS;
839 
840         delete matcher;
841         delete pat;
842     }
843 
844     //
845     //   find with zero length matches, match position should bump ahead
846     //     to prevent loops.
847     //
848     {
849         int32_t                 i;
850         UErrorCode          status=U_ZERO_ERROR;
851         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
852                                                       //   using an always-true look-ahead.
853         REGEX_CHECK_STATUS;
854         UnicodeString s("    ");
855         m.reset(s);
856         for (i=0; ; i++) {
857             if (m.find() == FALSE) {
858                 break;
859             }
860             REGEX_ASSERT(m.start(status) == i);
861             REGEX_ASSERT(m.end(status) == i);
862         }
863         REGEX_ASSERT(i==5);
864 
865         // Check that the bump goes over surrogate pairs OK
866         s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
867         s = s.unescape();
868         m.reset(s);
869         for (i=0; ; i+=2) {
870             if (m.find() == FALSE) {
871                 break;
872             }
873             REGEX_ASSERT(m.start(status) == i);
874             REGEX_ASSERT(m.end(status) == i);
875         }
876         REGEX_ASSERT(i==10);
877     }
878     {
879         // find() loop breaking test.
880         //        with pattern of /.?/, should see a series of one char matches, then a single
881         //        match of zero length at the end of the input string.
882         int32_t                 i;
883         UErrorCode          status=U_ZERO_ERROR;
884         RegexMatcher        m(".?", 0, status);
885         REGEX_CHECK_STATUS;
886         UnicodeString s("    ");
887         m.reset(s);
888         for (i=0; ; i++) {
889             if (m.find() == FALSE) {
890                 break;
891             }
892             REGEX_ASSERT(m.start(status) == i);
893             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
894         }
895         REGEX_ASSERT(i==5);
896     }
897 
898 
899     //
900     // Matchers with no input string behave as if they had an empty input string.
901     //
902 
903     {
904         UErrorCode status = U_ZERO_ERROR;
905         RegexMatcher  m(".?", 0, status);
906         REGEX_CHECK_STATUS;
907         REGEX_ASSERT(m.find());
908         REGEX_ASSERT(m.start(status) == 0);
909         REGEX_ASSERT(m.input() == "");
910     }
911     {
912         UErrorCode status = U_ZERO_ERROR;
913         RegexPattern  *p = RegexPattern::compile(".", 0, status);
914         RegexMatcher  *m = p->matcher(status);
915         REGEX_CHECK_STATUS;
916 
917         REGEX_ASSERT(m->find() == FALSE);
918         REGEX_ASSERT(m->input() == "");
919         delete m;
920         delete p;
921     }
922 
923     //
924     // Regions
925     //
926     {
927         UErrorCode status = U_ZERO_ERROR;
928         UnicodeString testString("This is test data");
929         RegexMatcher m(".*", testString,  0, status);
930         REGEX_CHECK_STATUS;
931         REGEX_ASSERT(m.regionStart() == 0);
932         REGEX_ASSERT(m.regionEnd() == testString.length());
933         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
934         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
935 
936         m.region(2,4, status);
937         REGEX_CHECK_STATUS;
938         REGEX_ASSERT(m.matches(status));
939         REGEX_ASSERT(m.start(status)==2);
940         REGEX_ASSERT(m.end(status)==4);
941         REGEX_CHECK_STATUS;
942 
943         m.reset();
944         REGEX_ASSERT(m.regionStart() == 0);
945         REGEX_ASSERT(m.regionEnd() == testString.length());
946 
947         UnicodeString shorterString("short");
948         m.reset(shorterString);
949         REGEX_ASSERT(m.regionStart() == 0);
950         REGEX_ASSERT(m.regionEnd() == shorterString.length());
951 
952         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
953         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
954         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
955         REGEX_ASSERT(&m == &m.reset());
956         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
957 
958         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
959         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
960         REGEX_ASSERT(&m == &m.reset());
961         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
962 
963         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
964         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
965         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
966         REGEX_ASSERT(&m == &m.reset());
967         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
968 
969         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
970         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
971         REGEX_ASSERT(&m == &m.reset());
972         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
973 
974     }
975 
976     //
977     // hitEnd() and requireEnd()
978     //
979     {
980         UErrorCode status = U_ZERO_ERROR;
981         UnicodeString testString("aabb");
982         RegexMatcher m1(".*", testString,  0, status);
983         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
984         REGEX_ASSERT(m1.hitEnd() == TRUE);
985         REGEX_ASSERT(m1.requireEnd() == FALSE);
986         REGEX_CHECK_STATUS;
987 
988         status = U_ZERO_ERROR;
989         RegexMatcher m2("a*", testString, 0, status);
990         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
991         REGEX_ASSERT(m2.hitEnd() == FALSE);
992         REGEX_ASSERT(m2.requireEnd() == FALSE);
993         REGEX_CHECK_STATUS;
994 
995         status = U_ZERO_ERROR;
996         RegexMatcher m3(".*$", testString, 0, status);
997         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
998         REGEX_ASSERT(m3.hitEnd() == TRUE);
999         REGEX_ASSERT(m3.requireEnd() == TRUE);
1000         REGEX_CHECK_STATUS;
1001     }
1002 
1003 
1004     //
1005     // Compilation error on reset with UChar *
1006     //   These were a hazard that people were stumbling over with runtime errors.
1007     //   Changed them to compiler errors by adding private methods that more closely
1008     //   matched the incorrect use of the functions.
1009     //
1010 #if 0
1011     {
1012         UErrorCode status = U_ZERO_ERROR;
1013         UChar ucharString[20];
1014         RegexMatcher m(".", 0, status);
1015         m.reset(ucharString);  // should not compile.
1016 
1017         RegexPattern *p = RegexPattern::compile(".", 0, status);
1018         RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
1019 
1020         RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
1021     }
1022 #endif
1023 
1024     //
1025     //  Time Outs.
1026     //       Note:  These tests will need to be changed when the regexp engine is
1027     //              able to detect and cut short the exponential time behavior on
1028     //              this type of match.
1029     //
1030     {
1031         UErrorCode status = U_ZERO_ERROR;
1032         //    Enough 'a's in the string to cause the match to time out.
1033         //       (Each additional 'a' doubles the time)
1034         UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1035         RegexMatcher matcher("(a+)+b", testString, 0, status);
1036         REGEX_CHECK_STATUS;
1037         REGEX_ASSERT(matcher.getTimeLimit() == 0);
1038         matcher.setTimeLimit(100, status);
1039         REGEX_ASSERT(matcher.getTimeLimit() == 100);
1040         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1041         REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1042     }
1043     {
1044         UErrorCode status = U_ZERO_ERROR;
1045         //   Few enough 'a's to slip in under the time limit.
1046         UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1047         RegexMatcher matcher("(a+)+b", testString, 0, status);
1048         REGEX_CHECK_STATUS;
1049         matcher.setTimeLimit(100, status);
1050         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1051         REGEX_CHECK_STATUS;
1052     }
1053 
1054     //
1055     //  Stack Limits
1056     //
1057     {
1058         UErrorCode status = U_ZERO_ERROR;
1059         UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
1060 
1061         // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1062         //   of the '+', and makes the stack frames larger.
1063         RegexMatcher matcher("(A)+A$", testString, 0, status);
1064 
1065         // With the default stack, this match should fail to run
1066         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1067         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1068 
1069         // With unlimited stack, it should run
1070         status = U_ZERO_ERROR;
1071         matcher.setStackLimit(0, status);
1072         REGEX_CHECK_STATUS;
1073         REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1074         REGEX_CHECK_STATUS;
1075         REGEX_ASSERT(matcher.getStackLimit() == 0);
1076 
1077         // With a limited stack, the match should fail
1078         status = U_ZERO_ERROR;
1079         matcher.setStackLimit(10000, status);
1080         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1081         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1082         REGEX_ASSERT(matcher.getStackLimit() == 10000);
1083     }
1084 
1085         // A pattern that doesn't save state should work with
1086         //   a minimal sized stack
1087     {
1088         UErrorCode status = U_ZERO_ERROR;
1089         UnicodeString testString = "abc";
1090         RegexMatcher matcher("abc", testString, 0, status);
1091         REGEX_CHECK_STATUS;
1092         matcher.setStackLimit(30, status);
1093         REGEX_CHECK_STATUS;
1094         REGEX_ASSERT(matcher.matches(status) == TRUE);
1095         REGEX_CHECK_STATUS;
1096         REGEX_ASSERT(matcher.getStackLimit() == 30);
1097 
1098         // Negative stack sizes should fail
1099         status = U_ZERO_ERROR;
1100         matcher.setStackLimit(1000, status);
1101         REGEX_CHECK_STATUS;
1102         matcher.setStackLimit(-1, status);
1103         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1104         REGEX_ASSERT(matcher.getStackLimit() == 1000);
1105     }
1106 
1107 
1108 }
1109 
1110 
1111 
1112 
1113 
1114 
1115 //---------------------------------------------------------------------------
1116 //
1117 //      API_Replace        API test for class RegexMatcher, testing the
1118 //                         Replace family of functions.
1119 //
1120 //---------------------------------------------------------------------------
API_Replace()1121 void RegexTest::API_Replace() {
1122     //
1123     //  Replace
1124     //
1125     int32_t             flags=0;
1126     UParseError         pe;
1127     UErrorCode          status=U_ZERO_ERROR;
1128 
1129     UnicodeString       re("abc");
1130     RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1131     REGEX_CHECK_STATUS;
1132     UnicodeString data = ".abc..abc...abc..";
1133     //                    012345678901234567
1134     RegexMatcher *matcher = pat->matcher(data, status);
1135 
1136     //
1137     //  Plain vanilla matches.
1138     //
1139     UnicodeString  dest;
1140     dest = matcher->replaceFirst("yz", status);
1141     REGEX_CHECK_STATUS;
1142     REGEX_ASSERT(dest == ".yz..abc...abc..");
1143 
1144     dest = matcher->replaceAll("yz", status);
1145     REGEX_CHECK_STATUS;
1146     REGEX_ASSERT(dest == ".yz..yz...yz..");
1147 
1148     //
1149     //  Plain vanilla non-matches.
1150     //
1151     UnicodeString d2 = ".abx..abx...abx..";
1152     matcher->reset(d2);
1153     dest = matcher->replaceFirst("yz", status);
1154     REGEX_CHECK_STATUS;
1155     REGEX_ASSERT(dest == ".abx..abx...abx..");
1156 
1157     dest = matcher->replaceAll("yz", status);
1158     REGEX_CHECK_STATUS;
1159     REGEX_ASSERT(dest == ".abx..abx...abx..");
1160 
1161     //
1162     // Empty source string
1163     //
1164     UnicodeString d3 = "";
1165     matcher->reset(d3);
1166     dest = matcher->replaceFirst("yz", status);
1167     REGEX_CHECK_STATUS;
1168     REGEX_ASSERT(dest == "");
1169 
1170     dest = matcher->replaceAll("yz", status);
1171     REGEX_CHECK_STATUS;
1172     REGEX_ASSERT(dest == "");
1173 
1174     //
1175     // Empty substitution string
1176     //
1177     matcher->reset(data);              // ".abc..abc...abc.."
1178     dest = matcher->replaceFirst("", status);
1179     REGEX_CHECK_STATUS;
1180     REGEX_ASSERT(dest == "...abc...abc..");
1181 
1182     dest = matcher->replaceAll("", status);
1183     REGEX_CHECK_STATUS;
1184     REGEX_ASSERT(dest == "........");
1185 
1186     //
1187     // match whole string
1188     //
1189     UnicodeString d4 = "abc";
1190     matcher->reset(d4);
1191     dest = matcher->replaceFirst("xyz", status);
1192     REGEX_CHECK_STATUS;
1193     REGEX_ASSERT(dest == "xyz");
1194 
1195     dest = matcher->replaceAll("xyz", status);
1196     REGEX_CHECK_STATUS;
1197     REGEX_ASSERT(dest == "xyz");
1198 
1199     //
1200     // Capture Group, simple case
1201     //
1202     UnicodeString       re2("a(..)");
1203     RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1204     REGEX_CHECK_STATUS;
1205     UnicodeString d5 = "abcdefg";
1206     RegexMatcher *matcher2 = pat2->matcher(d5, status);
1207     REGEX_CHECK_STATUS;
1208     dest = matcher2->replaceFirst("$1$1", status);
1209     REGEX_CHECK_STATUS;
1210     REGEX_ASSERT(dest == "bcbcdefg");
1211 
1212     dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1213     REGEX_CHECK_STATUS;
1214     REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1215 
1216     dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1217     REGEX_CHECK_STATUS;
1218     REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
1219 
1220     UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1221     replacement = replacement.unescape();
1222     dest = matcher2->replaceFirst(replacement, status);
1223     REGEX_CHECK_STATUS;
1224     REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1225 
1226     REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1227 
1228 
1229     //
1230     // Replacement String with \u hex escapes
1231     //
1232     {
1233         UnicodeString  src = "abc 1 abc 2 abc 3";
1234         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1235         matcher->reset(src);
1236         UnicodeString  result = matcher->replaceAll(substitute, status);
1237         REGEX_CHECK_STATUS;
1238         REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1239     }
1240     {
1241         UnicodeString  src = "abc !";
1242         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1243         matcher->reset(src);
1244         UnicodeString  result = matcher->replaceAll(substitute, status);
1245         REGEX_CHECK_STATUS;
1246         UnicodeString expected = UnicodeString("--");
1247         expected.append((UChar32)0x10000);
1248         expected.append("-- !");
1249         REGEX_ASSERT(result == expected);
1250     }
1251     // TODO:  need more through testing of capture substitutions.
1252 
1253     // Bug 4057
1254     //
1255     {
1256         status = U_ZERO_ERROR;
1257         UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1258         RegexMatcher m("ss(.*?)ee", 0, status);
1259         REGEX_CHECK_STATUS;
1260         UnicodeString result;
1261 
1262         // Multiple finds do NOT bump up the previous appendReplacement postion.
1263         m.reset(s);
1264         m.find();
1265         m.find();
1266         m.appendReplacement(result, "ooh", status);
1267         REGEX_CHECK_STATUS;
1268         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1269 
1270         // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1271         status = U_ZERO_ERROR;
1272         result.truncate(0);
1273         m.reset(10, status);
1274         m.find();
1275         m.find();
1276         m.appendReplacement(result, "ooh", status);
1277         REGEX_CHECK_STATUS;
1278         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1279 
1280         // find() at interior of string, appendReplacemnt still starts at beginning.
1281         status = U_ZERO_ERROR;
1282         result.truncate(0);
1283         m.reset();
1284         m.find(10, status);
1285         m.find();
1286         m.appendReplacement(result, "ooh", status);
1287         REGEX_CHECK_STATUS;
1288         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1289 
1290         m.appendTail(result);
1291         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1292 
1293     }
1294 
1295     delete matcher2;
1296     delete pat2;
1297     delete matcher;
1298     delete pat;
1299 }
1300 
1301 
1302 //---------------------------------------------------------------------------
1303 //
1304 //      API_Pattern       Test that the API for class RegexPattern is
1305 //                        present and nominally working.
1306 //
1307 //---------------------------------------------------------------------------
API_Pattern()1308 void RegexTest::API_Pattern() {
1309     RegexPattern        pata;    // Test default constructor to not crash.
1310     RegexPattern        patb;
1311 
1312     REGEX_ASSERT(pata == patb);
1313     REGEX_ASSERT(pata == pata);
1314 
1315     UnicodeString re1("abc[a-l][m-z]");
1316     UnicodeString re2("def");
1317     UErrorCode    status = U_ZERO_ERROR;
1318     UParseError   pe;
1319 
1320     RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
1321     RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
1322     REGEX_CHECK_STATUS;
1323     REGEX_ASSERT(*pat1 == *pat1);
1324     REGEX_ASSERT(*pat1 != pata);
1325 
1326     // Assign
1327     patb = *pat1;
1328     REGEX_ASSERT(patb == *pat1);
1329 
1330     // Copy Construct
1331     RegexPattern patc(*pat1);
1332     REGEX_ASSERT(patc == *pat1);
1333     REGEX_ASSERT(patb == patc);
1334     REGEX_ASSERT(pat1 != pat2);
1335     patb = *pat2;
1336     REGEX_ASSERT(patb != patc);
1337     REGEX_ASSERT(patb == *pat2);
1338 
1339     // Compile with no flags.
1340     RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
1341     REGEX_ASSERT(*pat1a == *pat1);
1342 
1343     REGEX_ASSERT(pat1a->flags() == 0);
1344 
1345     // Compile with different flags should be not equal
1346     RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1347     REGEX_CHECK_STATUS;
1348 
1349     REGEX_ASSERT(*pat1b != *pat1a);
1350     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1351     REGEX_ASSERT(pat1a->flags() == 0);
1352     delete pat1b;
1353 
1354     // clone
1355     RegexPattern *pat1c = pat1->clone();
1356     REGEX_ASSERT(*pat1c == *pat1);
1357     REGEX_ASSERT(*pat1c != *pat2);
1358 
1359     delete pat1c;
1360     delete pat1a;
1361     delete pat1;
1362     delete pat2;
1363 
1364 
1365     //
1366     //   Verify that a matcher created from a cloned pattern works.
1367     //     (Jitterbug 3423)
1368     //
1369     {
1370         UErrorCode     status     = U_ZERO_ERROR;
1371         RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1372         RegexPattern  *pClone     = pSource->clone();
1373         delete         pSource;
1374         RegexMatcher  *mFromClone = pClone->matcher(status);
1375         REGEX_CHECK_STATUS;
1376         UnicodeString s = "Hello World";
1377         mFromClone->reset(s);
1378         REGEX_ASSERT(mFromClone->find() == TRUE);
1379         REGEX_ASSERT(mFromClone->group(status) == "Hello");
1380         REGEX_ASSERT(mFromClone->find() == TRUE);
1381         REGEX_ASSERT(mFromClone->group(status) == "World");
1382         REGEX_ASSERT(mFromClone->find() == FALSE);
1383         delete mFromClone;
1384         delete pClone;
1385     }
1386 
1387     //
1388     //   matches convenience API
1389     //
1390     REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1391     REGEX_CHECK_STATUS;
1392     REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1393     REGEX_CHECK_STATUS;
1394     REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1395     REGEX_CHECK_STATUS;
1396     REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1397     REGEX_CHECK_STATUS;
1398     REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1399     REGEX_CHECK_STATUS;
1400     status = U_INDEX_OUTOFBOUNDS_ERROR;
1401     REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1402     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1403 
1404 
1405     //
1406     // Split()
1407     //
1408     status = U_ZERO_ERROR;
1409     pat1 = RegexPattern::compile(" +",  pe, status);
1410     REGEX_CHECK_STATUS;
1411     UnicodeString  fields[10];
1412 
1413     int32_t n;
1414     n = pat1->split("Now is the time", fields, 10, status);
1415     REGEX_CHECK_STATUS;
1416     REGEX_ASSERT(n==4);
1417     REGEX_ASSERT(fields[0]=="Now");
1418     REGEX_ASSERT(fields[1]=="is");
1419     REGEX_ASSERT(fields[2]=="the");
1420     REGEX_ASSERT(fields[3]=="time");
1421     REGEX_ASSERT(fields[4]=="");
1422 
1423     n = pat1->split("Now is the time", fields, 2, status);
1424     REGEX_CHECK_STATUS;
1425     REGEX_ASSERT(n==2);
1426     REGEX_ASSERT(fields[0]=="Now");
1427     REGEX_ASSERT(fields[1]=="is the time");
1428     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
1429 
1430     fields[1] = "*";
1431     status = U_ZERO_ERROR;
1432     n = pat1->split("Now is the time", fields, 1, status);
1433     REGEX_CHECK_STATUS;
1434     REGEX_ASSERT(n==1);
1435     REGEX_ASSERT(fields[0]=="Now is the time");
1436     REGEX_ASSERT(fields[1]=="*");
1437     status = U_ZERO_ERROR;
1438 
1439     n = pat1->split("    Now       is the time   ", fields, 10, status);
1440     REGEX_CHECK_STATUS;
1441     REGEX_ASSERT(n==5);
1442     REGEX_ASSERT(fields[0]=="");
1443     REGEX_ASSERT(fields[1]=="Now");
1444     REGEX_ASSERT(fields[2]=="is");
1445     REGEX_ASSERT(fields[3]=="the");
1446     REGEX_ASSERT(fields[4]=="time");
1447     REGEX_ASSERT(fields[5]=="");
1448 
1449     n = pat1->split("     ", fields, 10, status);
1450     REGEX_CHECK_STATUS;
1451     REGEX_ASSERT(n==1);
1452     REGEX_ASSERT(fields[0]=="");
1453 
1454     fields[0] = "foo";
1455     n = pat1->split("", fields, 10, status);
1456     REGEX_CHECK_STATUS;
1457     REGEX_ASSERT(n==0);
1458     REGEX_ASSERT(fields[0]=="foo");
1459 
1460     delete pat1;
1461 
1462     //  split, with a pattern with (capture)
1463     pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
1464     REGEX_CHECK_STATUS;
1465 
1466     status = U_ZERO_ERROR;
1467     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1468     REGEX_CHECK_STATUS;
1469     REGEX_ASSERT(n==6);
1470     REGEX_ASSERT(fields[0]=="");
1471     REGEX_ASSERT(fields[1]=="a");
1472     REGEX_ASSERT(fields[2]=="Now is ");
1473     REGEX_ASSERT(fields[3]=="b");
1474     REGEX_ASSERT(fields[4]=="the time");
1475     REGEX_ASSERT(fields[5]=="c");
1476     REGEX_ASSERT(fields[6]=="");
1477     REGEX_ASSERT(status==U_ZERO_ERROR);
1478 
1479     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
1480     REGEX_CHECK_STATUS;
1481     REGEX_ASSERT(n==6);
1482     REGEX_ASSERT(fields[0]=="  ");
1483     REGEX_ASSERT(fields[1]=="a");
1484     REGEX_ASSERT(fields[2]=="Now is ");
1485     REGEX_ASSERT(fields[3]=="b");
1486     REGEX_ASSERT(fields[4]=="the time");
1487     REGEX_ASSERT(fields[5]=="c");
1488     REGEX_ASSERT(fields[6]=="");
1489 
1490     status = U_ZERO_ERROR;
1491     fields[6] = "foo";
1492     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
1493     REGEX_CHECK_STATUS;
1494     REGEX_ASSERT(n==6);
1495     REGEX_ASSERT(fields[0]=="  ");
1496     REGEX_ASSERT(fields[1]=="a");
1497     REGEX_ASSERT(fields[2]=="Now is ");
1498     REGEX_ASSERT(fields[3]=="b");
1499     REGEX_ASSERT(fields[4]=="the time");
1500     REGEX_ASSERT(fields[5]=="c");
1501     REGEX_ASSERT(fields[6]=="foo");
1502 
1503     status = U_ZERO_ERROR;
1504     fields[5] = "foo";
1505     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
1506     REGEX_CHECK_STATUS;
1507     REGEX_ASSERT(n==5);
1508     REGEX_ASSERT(fields[0]=="  ");
1509     REGEX_ASSERT(fields[1]=="a");
1510     REGEX_ASSERT(fields[2]=="Now is ");
1511     REGEX_ASSERT(fields[3]=="b");
1512     REGEX_ASSERT(fields[4]=="the time<c>");
1513     REGEX_ASSERT(fields[5]=="foo");
1514 
1515     status = U_ZERO_ERROR;
1516     fields[5] = "foo";
1517     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
1518     REGEX_CHECK_STATUS;
1519     REGEX_ASSERT(n==5);
1520     REGEX_ASSERT(fields[0]=="  ");
1521     REGEX_ASSERT(fields[1]=="a");
1522     REGEX_ASSERT(fields[2]=="Now is ");
1523     REGEX_ASSERT(fields[3]=="b");
1524     REGEX_ASSERT(fields[4]=="the time");
1525     REGEX_ASSERT(fields[5]=="foo");
1526 
1527     status = U_ZERO_ERROR;
1528     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
1529     REGEX_CHECK_STATUS;
1530     REGEX_ASSERT(n==4);
1531     REGEX_ASSERT(fields[0]=="  ");
1532     REGEX_ASSERT(fields[1]=="a");
1533     REGEX_ASSERT(fields[2]=="Now is ");
1534     REGEX_ASSERT(fields[3]=="the time<c>");
1535     status = U_ZERO_ERROR;
1536     delete pat1;
1537 
1538     pat1 = RegexPattern::compile("([-,])",  pe, status);
1539     REGEX_CHECK_STATUS;
1540     n = pat1->split("1-10,20", fields, 10, status);
1541     REGEX_CHECK_STATUS;
1542     REGEX_ASSERT(n==5);
1543     REGEX_ASSERT(fields[0]=="1");
1544     REGEX_ASSERT(fields[1]=="-");
1545     REGEX_ASSERT(fields[2]=="10");
1546     REGEX_ASSERT(fields[3]==",");
1547     REGEX_ASSERT(fields[4]=="20");
1548     delete pat1;
1549 
1550 
1551     //
1552     // RegexPattern::pattern()
1553     //
1554     pat1 = new RegexPattern();
1555     REGEX_ASSERT(pat1->pattern() == "");
1556     delete pat1;
1557 
1558     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1559     REGEX_CHECK_STATUS;
1560     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1561     delete pat1;
1562 
1563 
1564     //
1565     // classID functions
1566     //
1567     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1568     REGEX_CHECK_STATUS;
1569     REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1570     REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1571     UnicodeString Hello("Hello, world.");
1572     RegexMatcher *m = pat1->matcher(Hello, status);
1573     REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1574     REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1575     REGEX_ASSERT(m->getDynamicClassID() != NULL);
1576     delete m;
1577     delete pat1;
1578 
1579 }
1580 
1581 //---------------------------------------------------------------------------
1582 //
1583 //      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
1584 //                       is present and working, but excluding functions
1585 //                       implementing replace operations.
1586 //
1587 //---------------------------------------------------------------------------
API_Match_UTF8()1588 void RegexTest::API_Match_UTF8() {
1589     UParseError         pe;
1590     UErrorCode          status=U_ZERO_ERROR;
1591     int32_t             flags = 0;
1592 
1593     //
1594     // Debug - slide failing test cases early
1595     //
1596 #if 0
1597     {
1598     }
1599     return;
1600 #endif
1601 
1602     //
1603     // Simple pattern compilation
1604     //
1605     {
1606         UText               re = UTEXT_INITIALIZER;
1607         utext_openUTF8(&re, "abc", -1, &status);
1608         RegexPattern        *pat2;
1609         pat2 = RegexPattern::compile(&re, flags, pe, status);
1610         REGEX_CHECK_STATUS;
1611 
1612         UText input1 = UTEXT_INITIALIZER;
1613         UText input2 = UTEXT_INITIALIZER;
1614         UText empty  = UTEXT_INITIALIZER;
1615         utext_openUTF8(&input1, "abcdef this is a test", -1, &status);
1616         utext_openUTF8(&input2, "not abc", -1, &status);
1617         utext_openUChars(&empty, NULL, 0, &status);
1618 
1619         int32_t input1Len = strlen("abcdef this is a test");
1620         int32_t input2Len = strlen("not abc");
1621 
1622 
1623         //
1624         // Matcher creation and reset.
1625         //
1626         RegexMatcher *m1 = pat2->matcher(&input1, RegexPattern::PATTERN_IS_UTEXT, status);
1627         REGEX_CHECK_STATUS;
1628         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1629         REGEX_ASSERT_UTEXT("abcdef this is a test", m1->inputText());
1630         m1->reset(&input2);
1631         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1632         REGEX_ASSERT_UTEXT("not abc", m1->inputText());
1633         m1->reset(&input1);
1634         REGEX_ASSERT_UTEXT("abcdef this is a test", m1->inputText());
1635         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1636         m1->reset(&empty);
1637         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1638         REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1639 
1640         //
1641         //  reset(pos, status)
1642         //
1643         m1->reset(&input1);
1644         m1->reset(4, status);
1645         REGEX_CHECK_STATUS;
1646         REGEX_ASSERT_UTEXT("abcdef this is a test", m1->inputText());
1647         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1648 
1649         m1->reset(-1, status);
1650         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1651         status = U_ZERO_ERROR;
1652 
1653         m1->reset(0, status);
1654         REGEX_CHECK_STATUS;
1655         status = U_ZERO_ERROR;
1656 
1657         m1->reset(input1Len-1, status);
1658         REGEX_CHECK_STATUS;
1659         status = U_ZERO_ERROR;
1660 
1661         m1->reset(input1Len, status);
1662         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1663         status = U_ZERO_ERROR;
1664 
1665         //
1666         // match(pos, status)
1667         //
1668         m1->reset(&input2);
1669         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1670         m1->reset();
1671         REGEX_ASSERT(m1->matches(3, status) == FALSE);
1672         m1->reset();
1673         REGEX_ASSERT(m1->matches(5, status) == FALSE);
1674         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1675         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1676         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1677 
1678         // Match() at end of string should fail, but should not
1679         //  be an error.
1680         status = U_ZERO_ERROR;
1681         REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1682         REGEX_CHECK_STATUS;
1683 
1684         // Match beyond end of string should fail with an error.
1685         status = U_ZERO_ERROR;
1686         REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1687         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1688 
1689         // Successful match at end of string.
1690         {
1691             status = U_ZERO_ERROR;
1692             RegexMatcher m("A?", 0, status);  // will match zero length string.
1693             REGEX_CHECK_STATUS;
1694             m.reset(&input1);
1695             REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1696             REGEX_CHECK_STATUS;
1697             m.reset(&empty);
1698             REGEX_ASSERT(m.matches(0, status) == TRUE);
1699             REGEX_CHECK_STATUS;
1700         }
1701 
1702 
1703         //
1704         // lookingAt(pos, status)
1705         //
1706         status = U_ZERO_ERROR;
1707         m1->reset(&input2);  // "not abc"
1708         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1709         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1710         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1711         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1712         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1713         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1714         status = U_ZERO_ERROR;
1715         REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1716         REGEX_CHECK_STATUS;
1717         REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1718         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1719 
1720         delete m1;
1721         delete pat2;
1722 
1723         utext_close(&re);
1724         utext_close(&input1);
1725         utext_close(&input2);
1726         utext_close(&empty);
1727     }
1728 
1729 
1730     //
1731     // Capture Group.
1732     //     RegexMatcher::start();
1733     //     RegexMatcher::end();
1734     //     RegexMatcher::groupCount();
1735     //
1736     {
1737         int32_t             flags=0;
1738         UParseError         pe;
1739         UErrorCode          status=U_ZERO_ERROR;
1740         UText               re=UTEXT_INITIALIZER;
1741         utext_openUTF8(&re, "01(23(45)67)(.*)", -1, &status);
1742 
1743         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1744         REGEX_CHECK_STATUS;
1745 
1746         UText input = UTEXT_INITIALIZER;
1747         utext_openUTF8(&input, "0123456789", -1, &status);
1748 
1749         RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status);
1750         REGEX_CHECK_STATUS;
1751         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
1752         static const int32_t matchStarts[] = {0,  2, 4, 8};
1753         static const int32_t matchEnds[]   = {10, 8, 6, 10};
1754         int32_t i;
1755         for (i=0; i<4; i++) {
1756             int32_t actualStart = matcher->start(i, status);
1757             REGEX_CHECK_STATUS;
1758             if (actualStart != matchStarts[i]) {
1759                 errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
1760                     __LINE__, i, matchStarts[i], actualStart);
1761             }
1762             int32_t actualEnd = matcher->end(i, status);
1763             REGEX_CHECK_STATUS;
1764             if (actualEnd != matchEnds[i]) {
1765                 errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
1766                     __LINE__, i, matchEnds[i], actualEnd);
1767             }
1768         }
1769 
1770         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
1771         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
1772 
1773         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1774         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
1775         matcher->reset();
1776         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
1777 
1778         matcher->lookingAt(status);
1779 
1780         UnicodeString dest;
1781         UText destText = UTEXT_INITIALIZER;
1782         utext_openUnicodeString(&destText, &dest, &status);
1783         UText *result;
1784 
1785         result = matcher->group((UText *)NULL, RegexMatcher::MATCHER_DEST_IS_UTEXT, status);
1786         REGEX_CHECK_STATUS;
1787         REGEX_ASSERT_UTEXT("0123456789", result);
1788         utext_close(result);
1789         result = matcher->group(&destText, RegexMatcher::MATCHER_DEST_IS_UTEXT, status);
1790         REGEX_CHECK_STATUS;
1791         REGEX_ASSERT(result == &destText);
1792         REGEX_ASSERT_UTEXT("0123456789", result);
1793 
1794         result = matcher->group(0, NULL, status);
1795         REGEX_CHECK_STATUS;
1796         REGEX_ASSERT_UTEXT("0123456789", result);
1797         utext_close(result);
1798         result = matcher->group(0, &destText, status);
1799         REGEX_CHECK_STATUS;
1800         REGEX_ASSERT(result == &destText);
1801         REGEX_ASSERT_UTEXT("0123456789", result);
1802 
1803         result = matcher->group(1, NULL, status);
1804         REGEX_CHECK_STATUS;
1805         REGEX_ASSERT_UTEXT("234567", result);
1806         utext_close(result);
1807         result = matcher->group(1, &destText, status);
1808         REGEX_CHECK_STATUS;
1809         REGEX_ASSERT(result == &destText);
1810         REGEX_ASSERT_UTEXT("234567", result);
1811 
1812         result = matcher->group(2, NULL, status);
1813         REGEX_CHECK_STATUS;
1814         REGEX_ASSERT_UTEXT("45", result);
1815         utext_close(result);
1816         result = matcher->group(2, &destText, status);
1817         REGEX_CHECK_STATUS;
1818         REGEX_ASSERT(result == &destText);
1819         REGEX_ASSERT_UTEXT("45", result);
1820 
1821         result = matcher->group(3, NULL, status);
1822         REGEX_CHECK_STATUS;
1823         REGEX_ASSERT_UTEXT("89", result);
1824         utext_close(result);
1825         result = matcher->group(3, &destText, status);
1826         REGEX_CHECK_STATUS;
1827         REGEX_ASSERT(result == &destText);
1828         REGEX_ASSERT_UTEXT("89", result);
1829 
1830         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1831         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
1832         matcher->reset();
1833         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
1834 
1835         delete matcher;
1836         delete pat;
1837 
1838         utext_close(&destText);
1839         utext_close(&input);
1840         utext_close(&re);
1841     }
1842 
1843     //
1844     //  find
1845     //
1846     {
1847         int32_t             flags=0;
1848         UParseError         pe;
1849         UErrorCode          status=U_ZERO_ERROR;
1850         UText               re=UTEXT_INITIALIZER;
1851         utext_openUTF8(&re, "abc", -1, &status);
1852 
1853         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1854         REGEX_CHECK_STATUS;
1855         UText input = UTEXT_INITIALIZER;
1856         utext_openUTF8(&input, ".abc..abc...abc..", -1, &status);
1857         //                      012345678901234567
1858 
1859         RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status);
1860         REGEX_CHECK_STATUS;
1861         REGEX_ASSERT(matcher->find());
1862         REGEX_ASSERT(matcher->start(status) == 1);
1863         REGEX_ASSERT(matcher->find());
1864         REGEX_ASSERT(matcher->start(status) == 6);
1865         REGEX_ASSERT(matcher->find());
1866         REGEX_ASSERT(matcher->start(status) == 12);
1867         REGEX_ASSERT(matcher->find() == FALSE);
1868         REGEX_ASSERT(matcher->find() == FALSE);
1869 
1870         matcher->reset();
1871         REGEX_ASSERT(matcher->find());
1872         REGEX_ASSERT(matcher->start(status) == 1);
1873 
1874         REGEX_ASSERT(matcher->find(0, status));
1875         REGEX_ASSERT(matcher->start(status) == 1);
1876         REGEX_ASSERT(matcher->find(1, status));
1877         REGEX_ASSERT(matcher->start(status) == 1);
1878         REGEX_ASSERT(matcher->find(2, status));
1879         REGEX_ASSERT(matcher->start(status) == 6);
1880         REGEX_ASSERT(matcher->find(12, status));
1881         REGEX_ASSERT(matcher->start(status) == 12);
1882         REGEX_ASSERT(matcher->find(13, status) == FALSE);
1883         REGEX_ASSERT(matcher->find(16, status) == FALSE);
1884         REGEX_ASSERT(matcher->find(17, status) == FALSE);
1885         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
1886 
1887         status = U_ZERO_ERROR;
1888         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1889         status = U_ZERO_ERROR;
1890         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
1891 
1892         REGEX_ASSERT(matcher->groupCount() == 0);
1893 
1894         delete matcher;
1895         delete pat;
1896 
1897         utext_close(&input);
1898         utext_close(&re);
1899     }
1900 
1901 
1902     //
1903     //  find, with \G in pattern (true if at the end of a previous match).
1904     //
1905     {
1906         int32_t             flags=0;
1907         UParseError         pe;
1908         UErrorCode          status=U_ZERO_ERROR;
1909         UText               re=UTEXT_INITIALIZER;
1910         utext_openUTF8(&re, ".*?(?:(\\Gabc)|(abc))", -1, &status);
1911 
1912         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1913 
1914         REGEX_CHECK_STATUS;
1915         UText input = UTEXT_INITIALIZER;
1916         utext_openUTF8(&input, ".abcabc.abc..", -1, &status);
1917         //                      012345678901234567
1918 
1919         RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status);
1920         REGEX_CHECK_STATUS;
1921         REGEX_ASSERT(matcher->find());
1922         REGEX_ASSERT(matcher->start(status) == 0);
1923         REGEX_ASSERT(matcher->start(1, status) == -1);
1924         REGEX_ASSERT(matcher->start(2, status) == 1);
1925 
1926         REGEX_ASSERT(matcher->find());
1927         REGEX_ASSERT(matcher->start(status) == 4);
1928         REGEX_ASSERT(matcher->start(1, status) == 4);
1929         REGEX_ASSERT(matcher->start(2, status) == -1);
1930         REGEX_CHECK_STATUS;
1931 
1932         delete matcher;
1933         delete pat;
1934 
1935         utext_close(&input);
1936         utext_close(&re);
1937     }
1938 
1939     //
1940     //   find with zero length matches, match position should bump ahead
1941     //     to prevent loops.
1942     //
1943     {
1944         int32_t                 i;
1945         UErrorCode          status=U_ZERO_ERROR;
1946         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
1947                                                       //   using an always-true look-ahead.
1948         REGEX_CHECK_STATUS;
1949         UText s = UTEXT_INITIALIZER;
1950         utext_openUTF8(&s, "    ", -1, &status);
1951         m.reset(&s);
1952         for (i=0; ; i++) {
1953             if (m.find() == FALSE) {
1954                 break;
1955             }
1956             REGEX_ASSERT(m.start(status) == i);
1957             REGEX_ASSERT(m.end(status) == i);
1958         }
1959         REGEX_ASSERT(i==5);
1960 
1961         // Check that the bump goes over characters outside the BMP OK
1962         // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
1963         unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
1964         utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
1965         m.reset(&s);
1966         for (i=0; ; i+=2) {
1967             if (m.find() == FALSE) {
1968                 break;
1969             }
1970             REGEX_ASSERT(m.start(status) == i);
1971             REGEX_ASSERT(m.end(status) == i);
1972         }
1973         REGEX_ASSERT(i==10);
1974 
1975         utext_close(&s);
1976     }
1977     {
1978         // find() loop breaking test.
1979         //        with pattern of /.?/, should see a series of one char matches, then a single
1980         //        match of zero length at the end of the input string.
1981         int32_t                 i;
1982         UErrorCode          status=U_ZERO_ERROR;
1983         RegexMatcher        m(".?", 0, status);
1984         REGEX_CHECK_STATUS;
1985         UText s = UTEXT_INITIALIZER;
1986         utext_openUTF8(&s, "    ", -1, &status);
1987         m.reset(&s);
1988         for (i=0; ; i++) {
1989             if (m.find() == FALSE) {
1990                 break;
1991             }
1992             REGEX_ASSERT(m.start(status) == i);
1993             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1994         }
1995         REGEX_ASSERT(i==5);
1996 
1997         utext_close(&s);
1998     }
1999 
2000 
2001     //
2002     // Matchers with no input string behave as if they had an empty input string.
2003     //
2004 
2005     {
2006         UErrorCode status = U_ZERO_ERROR;
2007         RegexMatcher  m(".?", 0, status);
2008         REGEX_CHECK_STATUS;
2009         REGEX_ASSERT(m.find());
2010         REGEX_ASSERT(m.start(status) == 0);
2011         REGEX_ASSERT(m.input() == "");
2012     }
2013     {
2014         UErrorCode status = U_ZERO_ERROR;
2015         RegexPattern  *p = RegexPattern::compile(".", 0, status);
2016         RegexMatcher  *m = p->matcher(status);
2017         REGEX_CHECK_STATUS;
2018 
2019         REGEX_ASSERT(m->find() == FALSE);
2020         REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2021         delete m;
2022         delete p;
2023     }
2024 
2025     //
2026     // Regions
2027     //
2028     {
2029         UErrorCode status = U_ZERO_ERROR;
2030         UText testPattern = UTEXT_INITIALIZER;
2031         UText testText    = UTEXT_INITIALIZER;
2032         utext_openUTF8(&testPattern, ".*", -1, &status);
2033         utext_openUTF8(&testText, "This is test data", -1, &status);
2034 
2035         RegexMatcher m(&testPattern, &testText, 0, status);
2036         REGEX_CHECK_STATUS;
2037         REGEX_ASSERT(m.regionStart() == 0);
2038         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2039         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2040         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2041 
2042         m.region(2,4, status);
2043         REGEX_CHECK_STATUS;
2044         REGEX_ASSERT(m.matches(status));
2045         REGEX_ASSERT(m.start(status)==2);
2046         REGEX_ASSERT(m.end(status)==4);
2047         REGEX_CHECK_STATUS;
2048 
2049         m.reset();
2050         REGEX_ASSERT(m.regionStart() == 0);
2051         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2052 
2053         utext_openUTF8(&testText, "short", -1, &status);
2054         m.reset(&testText);
2055         REGEX_ASSERT(m.regionStart() == 0);
2056         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2057 
2058         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2059         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2060         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2061         REGEX_ASSERT(&m == &m.reset());
2062         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2063 
2064         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2065         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2066         REGEX_ASSERT(&m == &m.reset());
2067         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2068 
2069         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2070         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2071         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2072         REGEX_ASSERT(&m == &m.reset());
2073         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2074 
2075         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2076         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2077         REGEX_ASSERT(&m == &m.reset());
2078         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2079 
2080         utext_close(&testText);
2081         utext_close(&testPattern);
2082     }
2083 
2084     //
2085     // hitEnd() and requireEnd()
2086     //
2087     {
2088         UErrorCode status = U_ZERO_ERROR;
2089         UText testPattern = UTEXT_INITIALIZER;
2090         UText testText    = UTEXT_INITIALIZER;
2091         utext_openUTF8(&testPattern, ".*", -1, &status);
2092         utext_openUTF8(&testText, "aabb", -1, &status);
2093 
2094         RegexMatcher m1(&testPattern, &testText,  0, status);
2095         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2096         REGEX_ASSERT(m1.hitEnd() == TRUE);
2097         REGEX_ASSERT(m1.requireEnd() == FALSE);
2098         REGEX_CHECK_STATUS;
2099 
2100         status = U_ZERO_ERROR;
2101         utext_openUTF8(&testPattern, "a*", -1, &status);
2102         RegexMatcher m2(&testPattern, &testText, 0, status);
2103         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2104         REGEX_ASSERT(m2.hitEnd() == FALSE);
2105         REGEX_ASSERT(m2.requireEnd() == FALSE);
2106         REGEX_CHECK_STATUS;
2107 
2108         status = U_ZERO_ERROR;
2109         utext_openUTF8(&testPattern, ".*$", -1, &status);
2110         RegexMatcher m3(&testPattern, &testText, 0, status);
2111         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2112         REGEX_ASSERT(m3.hitEnd() == TRUE);
2113         REGEX_ASSERT(m3.requireEnd() == TRUE);
2114         REGEX_CHECK_STATUS;
2115 
2116         utext_close(&testText);
2117         utext_close(&testPattern);
2118     }
2119 }
2120 
2121 
2122 //---------------------------------------------------------------------------
2123 //
2124 //      API_Replace_UTF8   API test for class RegexMatcher, testing the
2125 //                         Replace family of functions.
2126 //
2127 //---------------------------------------------------------------------------
API_Replace_UTF8()2128 void RegexTest::API_Replace_UTF8() {
2129     //
2130     //  Replace
2131     //
2132     int32_t             flags=0;
2133     UParseError         pe;
2134     UErrorCode          status=U_ZERO_ERROR;
2135 
2136     UText               re=UTEXT_INITIALIZER;
2137     utext_openUTF8(&re, "abc", -1, &status);
2138     RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2139     REGEX_CHECK_STATUS;
2140 
2141     char data[] = ".abc..abc...abc..";
2142     //             012345678901234567
2143     UText dataText = UTEXT_INITIALIZER;
2144     utext_openUTF8(&dataText, data, -1, &status);
2145     RegexMatcher *matcher = pat->matcher(&dataText, RegexPattern::PATTERN_IS_UTEXT, status);
2146 
2147     //
2148     //  Plain vanilla matches.
2149     //
2150     UnicodeString  dest;
2151     UText destText = UTEXT_INITIALIZER;
2152     utext_openUnicodeString(&destText, &dest, &status);
2153     UText *result;
2154 
2155     UText replText = UTEXT_INITIALIZER;
2156 
2157     utext_openUTF8(&replText, "yz", -1, &status);
2158     result = matcher->replaceFirst(&replText, NULL, status);
2159     REGEX_CHECK_STATUS;
2160     REGEX_ASSERT_UTEXT(".yz..abc...abc..", result);
2161     utext_close(result);
2162     result = matcher->replaceFirst(&replText, &destText, status);
2163     REGEX_CHECK_STATUS;
2164     REGEX_ASSERT(result == &destText);
2165     REGEX_ASSERT_UTEXT(".yz..abc...abc..", result);
2166 
2167     result = matcher->replaceAll(&replText, NULL, status);
2168     REGEX_CHECK_STATUS;
2169     REGEX_ASSERT_UTEXT(".yz..yz...yz..", result);
2170     utext_close(result);
2171 
2172     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2173     result = matcher->replaceAll(&replText, &destText, status);
2174     REGEX_CHECK_STATUS;
2175     REGEX_ASSERT(result == &destText);
2176     REGEX_ASSERT_UTEXT(".yz..yz...yz..", result);
2177 
2178     //
2179     //  Plain vanilla non-matches.
2180     //
2181     utext_openUTF8(&dataText, ".abx..abx...abx..", -1, &status);
2182     matcher->reset(&dataText);
2183 
2184     result = matcher->replaceFirst(&replText, NULL, status);
2185     REGEX_CHECK_STATUS;
2186     REGEX_ASSERT_UTEXT(".abx..abx...abx..", result);
2187     utext_close(result);
2188     result = matcher->replaceFirst(&replText, &destText, status);
2189     REGEX_CHECK_STATUS;
2190     REGEX_ASSERT(result == &destText);
2191     REGEX_ASSERT_UTEXT(".abx..abx...abx..", result);
2192 
2193     result = matcher->replaceAll(&replText, NULL, status);
2194     REGEX_CHECK_STATUS;
2195     REGEX_ASSERT_UTEXT(".abx..abx...abx..", result);
2196     utext_close(result);
2197     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2198     result = matcher->replaceAll(&replText, &destText, status);
2199     REGEX_CHECK_STATUS;
2200     REGEX_ASSERT(result == &destText);
2201     REGEX_ASSERT_UTEXT(".abx..abx...abx..", result);
2202 
2203     //
2204     // Empty source string
2205     //
2206     utext_openUTF8(&dataText, NULL, 0, &status);
2207     matcher->reset(&dataText);
2208 
2209     result = matcher->replaceFirst(&replText, NULL, status);
2210     REGEX_CHECK_STATUS;
2211     REGEX_ASSERT_UTEXT("", result);
2212     utext_close(result);
2213     result = matcher->replaceFirst(&replText, &destText, status);
2214     REGEX_CHECK_STATUS;
2215     REGEX_ASSERT(result == &destText);
2216     REGEX_ASSERT_UTEXT("", result);
2217 
2218     result = matcher->replaceAll(&replText, NULL, status);
2219     REGEX_CHECK_STATUS;
2220     REGEX_ASSERT_UTEXT("", result);
2221     utext_close(result);
2222     result = matcher->replaceAll(&replText, &destText, status);
2223     REGEX_CHECK_STATUS;
2224     REGEX_ASSERT(result == &destText);
2225     REGEX_ASSERT_UTEXT("", result);
2226 
2227     //
2228     // Empty substitution string
2229     //
2230     utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2231     matcher->reset(&dataText);
2232 
2233     utext_openUTF8(&replText, NULL, 0, &status);
2234     result = matcher->replaceFirst(&replText, NULL, status);
2235     REGEX_CHECK_STATUS;
2236     REGEX_ASSERT_UTEXT("...abc...abc..", result);
2237     utext_close(result);
2238     result = matcher->replaceFirst(&replText, &destText, status);
2239     REGEX_CHECK_STATUS;
2240     REGEX_ASSERT(result == &destText);
2241     REGEX_ASSERT_UTEXT("...abc...abc..", result);
2242 
2243     result = matcher->replaceAll(&replText, NULL, status);
2244     REGEX_CHECK_STATUS;
2245     REGEX_ASSERT_UTEXT("........", result);
2246     utext_close(result);
2247     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2248     result = matcher->replaceAll(&replText, &destText, status);
2249     REGEX_CHECK_STATUS;
2250     REGEX_ASSERT(result == &destText);
2251     REGEX_ASSERT_UTEXT("........", result);
2252 
2253     //
2254     // match whole string
2255     //
2256     utext_openUTF8(&dataText, "abc", -1, &status);
2257     matcher->reset(&dataText);
2258 
2259     utext_openUTF8(&replText, "xyz", -1, &status);
2260     result = matcher->replaceFirst(&replText, NULL, status);
2261     REGEX_CHECK_STATUS;
2262     REGEX_ASSERT_UTEXT("xyz", result);
2263     utext_close(result);
2264     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2265     result = matcher->replaceFirst(&replText, &destText, status);
2266     REGEX_CHECK_STATUS;
2267     REGEX_ASSERT(result == &destText);
2268     REGEX_ASSERT_UTEXT("xyz", result);
2269 
2270     result = matcher->replaceAll(&replText, NULL, status);
2271     REGEX_CHECK_STATUS;
2272     REGEX_ASSERT_UTEXT("xyz", result);
2273     utext_close(result);
2274     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2275     result = matcher->replaceAll(&replText, &destText, status);
2276     REGEX_CHECK_STATUS;
2277     REGEX_ASSERT(result == &destText);
2278     REGEX_ASSERT_UTEXT("xyz", result);
2279 
2280     //
2281     // Capture Group, simple case
2282     //
2283     utext_openUTF8(&re, "a(..)", -1, &status);
2284     RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2285     REGEX_CHECK_STATUS;
2286 
2287     utext_openUTF8(&dataText, "abcdefg", -1, &status);
2288     RegexMatcher *matcher2 = pat2->matcher(&dataText, RegexPattern::PATTERN_IS_UTEXT, status);
2289     REGEX_CHECK_STATUS;
2290 
2291     utext_openUTF8(&replText, "$1$1", -1, &status);
2292     result = matcher2->replaceFirst(&replText, NULL, status);
2293     REGEX_CHECK_STATUS;
2294     REGEX_ASSERT_UTEXT("bcbcdefg", result);
2295     utext_close(result);
2296     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2297     result = matcher2->replaceFirst(&replText, &destText, status);
2298     REGEX_CHECK_STATUS;
2299     REGEX_ASSERT(result == &destText);
2300     REGEX_ASSERT_UTEXT("bcbcdefg", result);
2301 
2302     utext_openUTF8(&replText, "The value of \\$1 is $1.", -1, &status);
2303     result = matcher2->replaceFirst(&replText, NULL, status);
2304     REGEX_CHECK_STATUS;
2305     REGEX_ASSERT_UTEXT("The value of $1 is bc.defg", result);
2306     utext_close(result);
2307     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2308     result = matcher2->replaceFirst(&replText, &destText, status);
2309     REGEX_CHECK_STATUS;
2310     REGEX_ASSERT(result == &destText);
2311     REGEX_ASSERT_UTEXT("The value of $1 is bc.defg", result);
2312 
2313     utext_openUTF8(&replText, "$ by itself, no group number $$$", -1, &status);
2314     result = matcher2->replaceFirst(&replText, NULL, status);
2315     REGEX_CHECK_STATUS;
2316     REGEX_ASSERT_UTEXT("$ by itself, no group number $$$defg", result);
2317     utext_close(result);
2318     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2319     result = matcher2->replaceFirst(&replText, &destText, status);
2320     REGEX_CHECK_STATUS;
2321     REGEX_ASSERT(result == &destText);
2322     REGEX_ASSERT_UTEXT("$ by itself, no group number $$$defg", result);
2323 
2324     unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2325     //                                 012345678901234567890123456
2326     supplDigitChars[22] = 0xF0;
2327     supplDigitChars[23] = 0x9D;
2328     supplDigitChars[24] = 0x9F;
2329     supplDigitChars[25] = 0x8F;
2330     utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2331 
2332     result = matcher2->replaceFirst(&replText, NULL, status);
2333     REGEX_CHECK_STATUS;
2334     REGEX_ASSERT_UTEXT("Supplemental Digit 1 bc.defg", result);
2335     utext_close(result);
2336     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2337     result = matcher2->replaceFirst(&replText, &destText, status);
2338     REGEX_CHECK_STATUS;
2339     REGEX_ASSERT(result == &destText);
2340     REGEX_ASSERT_UTEXT("Supplemental Digit 1 bc.defg", result);
2341 
2342     utext_openUTF8(&replText, "bad capture group number $5...", -1, &status);
2343     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2344 //    REGEX_ASSERT_UTEXT("abcdefg", result);
2345     utext_close(result);
2346     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2347     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2348     REGEX_ASSERT(result == &destText);
2349 //    REGEX_ASSERT_UTEXT("abcdefg", result);
2350 
2351     //
2352     // Replacement String with \u hex escapes
2353     //
2354     {
2355         utext_openUTF8(&dataText, "abc 1 abc 2 abc 3", -1, &status);
2356         utext_openUTF8(&replText, "--\\u0043--", -1, &status);
2357         matcher->reset(&dataText);
2358 
2359         result = matcher->replaceAll(&replText, NULL, status);
2360         REGEX_CHECK_STATUS;
2361         REGEX_ASSERT_UTEXT("--C-- 1 --C-- 2 --C-- 3", result);
2362         utext_close(result);
2363         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2364         result = matcher->replaceAll(&replText, &destText, status);
2365         REGEX_CHECK_STATUS;
2366         REGEX_ASSERT(result == &destText);
2367         REGEX_ASSERT_UTEXT("--C-- 1 --C-- 2 --C-- 3", result);
2368     }
2369     {
2370         utext_openUTF8(&dataText, "abc !", -1, &status);
2371         utext_openUTF8(&replText, "--\\U00010000--", -1, &status);
2372         matcher->reset(&dataText);
2373 
2374         unsigned char expected[] = "--xxxx-- !"; // \U00010000, "LINEAR B SYLLABLE B008 A"
2375         //                          0123456789
2376         expected[2] = 0xF0;
2377         expected[3] = 0x90;
2378         expected[4] = 0x80;
2379         expected[5] = 0x80;
2380 
2381         result = matcher->replaceAll(&replText, NULL, status);
2382         REGEX_CHECK_STATUS;
2383         REGEX_ASSERT_UTEXT((char *)expected, result);
2384         utext_close(result);
2385         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2386         result = matcher->replaceAll(&replText, &destText, status);
2387         REGEX_CHECK_STATUS;
2388         REGEX_ASSERT(result == &destText);
2389         REGEX_ASSERT_UTEXT((char *)expected, result);
2390     }
2391     // TODO:  need more through testing of capture substitutions.
2392 
2393     // Bug 4057
2394     //
2395     {
2396         status = U_ZERO_ERROR;
2397         utext_openUTF8(&re, "ss(.*?)ee", -1, &status);
2398         utext_openUTF8(&dataText, "The matches start with ss and end with ee ss stuff ee fin", -1, &status);
2399         utext_openUTF8(&replText, "ooh", -1, &status);
2400 
2401         RegexMatcher m(&re, 0, status);
2402         REGEX_CHECK_STATUS;
2403 
2404         UnicodeString result;
2405         UText resultText = UTEXT_INITIALIZER;
2406         utext_openUnicodeString(&resultText, &result, &status);
2407 
2408         // Multiple finds do NOT bump up the previous appendReplacement postion.
2409         m.reset(&dataText);
2410         m.find();
2411         m.find();
2412         m.appendReplacement(&resultText, &replText, status);
2413         REGEX_CHECK_STATUS;
2414         REGEX_ASSERT_UTEXT("The matches start with ss and end with ee ooh", &resultText);
2415 
2416         // After a reset into the interior of a string, appendReplacement still starts at beginning.
2417         status = U_ZERO_ERROR;
2418         result.truncate(0);
2419         utext_openUnicodeString(&resultText, &result, &status);
2420         m.reset(10, status);
2421         m.find();
2422         m.find();
2423         m.appendReplacement(&resultText, &replText, status);
2424         REGEX_CHECK_STATUS;
2425         REGEX_ASSERT_UTEXT("The matches start with ss and end with ee ooh", &resultText);
2426 
2427         // find() at interior of string, appendReplacement still starts at beginning.
2428         status = U_ZERO_ERROR;
2429         result.truncate(0);
2430         utext_openUnicodeString(&resultText, &result, &status);
2431         m.reset();
2432         m.find(10, status);
2433         m.find();
2434         m.appendReplacement(&resultText, &replText, status);
2435         REGEX_CHECK_STATUS;
2436         REGEX_ASSERT_UTEXT("The matches start with ss and end with ee ooh", &resultText);
2437 
2438         m.appendTail(&resultText);
2439         REGEX_ASSERT_UTEXT("The matches start with ss and end with ee ooh fin", &resultText);
2440 
2441         utext_close(&resultText);
2442     }
2443 
2444     delete matcher2;
2445     delete pat2;
2446     delete matcher;
2447     delete pat;
2448 
2449     utext_close(&dataText);
2450     utext_close(&replText);
2451     utext_close(&destText);
2452     utext_close(&re);
2453 }
2454 
2455 
2456 //---------------------------------------------------------------------------
2457 //
2458 //      API_Pattern_UTF8  Test that the API for class RegexPattern is
2459 //                        present and nominally working.
2460 //
2461 //---------------------------------------------------------------------------
API_Pattern_UTF8()2462 void RegexTest::API_Pattern_UTF8() {
2463     RegexPattern        pata;    // Test default constructor to not crash.
2464     RegexPattern        patb;
2465 
2466     REGEX_ASSERT(pata == patb);
2467     REGEX_ASSERT(pata == pata);
2468 
2469     UText         re1 = UTEXT_INITIALIZER;
2470     UText         re2 = UTEXT_INITIALIZER;
2471     UErrorCode    status = U_ZERO_ERROR;
2472     UParseError   pe;
2473 
2474     utext_openUTF8(&re1, "abc[a-l][m-z]", -1, &status);
2475     utext_openUTF8(&re2, "def", -1, &status);
2476 
2477     RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2478     RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2479     REGEX_CHECK_STATUS;
2480     REGEX_ASSERT(*pat1 == *pat1);
2481     REGEX_ASSERT(*pat1 != pata);
2482 
2483     // Assign
2484     patb = *pat1;
2485     REGEX_ASSERT(patb == *pat1);
2486 
2487     // Copy Construct
2488     RegexPattern patc(*pat1);
2489     REGEX_ASSERT(patc == *pat1);
2490     REGEX_ASSERT(patb == patc);
2491     REGEX_ASSERT(pat1 != pat2);
2492     patb = *pat2;
2493     REGEX_ASSERT(patb != patc);
2494     REGEX_ASSERT(patb == *pat2);
2495 
2496     // Compile with no flags.
2497     RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
2498     REGEX_ASSERT(*pat1a == *pat1);
2499 
2500     REGEX_ASSERT(pat1a->flags() == 0);
2501 
2502     // Compile with different flags should be not equal
2503     RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2504     REGEX_CHECK_STATUS;
2505 
2506     REGEX_ASSERT(*pat1b != *pat1a);
2507     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2508     REGEX_ASSERT(pat1a->flags() == 0);
2509     delete pat1b;
2510 
2511     // clone
2512     RegexPattern *pat1c = pat1->clone();
2513     REGEX_ASSERT(*pat1c == *pat1);
2514     REGEX_ASSERT(*pat1c != *pat2);
2515 
2516     delete pat1c;
2517     delete pat1a;
2518     delete pat1;
2519     delete pat2;
2520 
2521     utext_close(&re1);
2522     utext_close(&re2);
2523 
2524 
2525     //
2526     //   Verify that a matcher created from a cloned pattern works.
2527     //     (Jitterbug 3423)
2528     //
2529     {
2530         UErrorCode     status     = U_ZERO_ERROR;
2531         UText          pattern    = UTEXT_INITIALIZER;
2532         utext_openUTF8(&pattern, "\\p{L}+", -1, &status);
2533 
2534         RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
2535         RegexPattern  *pClone     = pSource->clone();
2536         delete         pSource;
2537         RegexMatcher  *mFromClone = pClone->matcher(status);
2538         REGEX_CHECK_STATUS;
2539 
2540         UText          input      = UTEXT_INITIALIZER;
2541         utext_openUTF8(&input, "Hello World", -1, &status);
2542         mFromClone->reset(&input);
2543         REGEX_ASSERT(mFromClone->find() == TRUE);
2544         REGEX_ASSERT(mFromClone->group(status) == "Hello");
2545         REGEX_ASSERT(mFromClone->find() == TRUE);
2546         REGEX_ASSERT(mFromClone->group(status) == "World");
2547         REGEX_ASSERT(mFromClone->find() == FALSE);
2548         delete mFromClone;
2549         delete pClone;
2550 
2551         utext_close(&input);
2552         utext_close(&pattern);
2553     }
2554 
2555     //
2556     //   matches convenience API
2557     //
2558     {
2559         UErrorCode status  = U_ZERO_ERROR;
2560         UText      pattern = UTEXT_INITIALIZER;
2561         UText      input   = UTEXT_INITIALIZER;
2562 
2563         utext_openUTF8(&input, "random input", -1, &status);
2564 
2565         utext_openUTF8(&pattern, ".*", -1, &status);
2566         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2567         REGEX_CHECK_STATUS;
2568 
2569         utext_openUTF8(&pattern, "abc", -1, &status);
2570         REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2571         REGEX_CHECK_STATUS;
2572 
2573         utext_openUTF8(&pattern, ".*nput", -1, &status);
2574         REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2575         REGEX_CHECK_STATUS;
2576 
2577         utext_openUTF8(&pattern, "random input", -1, &status);
2578         REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2579         REGEX_CHECK_STATUS;
2580 
2581         utext_openUTF8(&pattern, ".*u", -1, &status);
2582         REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2583         REGEX_CHECK_STATUS;
2584 
2585         utext_openUTF8(&input, "abc", -1, &status);
2586         utext_openUTF8(&pattern, "abc", -1, &status);
2587         status = U_INDEX_OUTOFBOUNDS_ERROR;
2588         REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2589         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2590 
2591         utext_close(&input);
2592         utext_close(&pattern);
2593     }
2594 
2595 
2596     //
2597     // Split()
2598     //
2599     status = U_ZERO_ERROR;
2600     utext_openUTF8(&re1, " +", -1, &status);
2601     pat1 = RegexPattern::compile(&re1, pe, status);
2602     REGEX_CHECK_STATUS;
2603     UnicodeString  fields[10];
2604 
2605     int32_t n;
2606     n = pat1->split("Now is the time", fields, 10, status);
2607     REGEX_CHECK_STATUS;
2608     REGEX_ASSERT(n==4);
2609     REGEX_ASSERT(fields[0]=="Now");
2610     REGEX_ASSERT(fields[1]=="is");
2611     REGEX_ASSERT(fields[2]=="the");
2612     REGEX_ASSERT(fields[3]=="time");
2613     REGEX_ASSERT(fields[4]=="");
2614 
2615     n = pat1->split("Now is the time", fields, 2, status);
2616     REGEX_CHECK_STATUS;
2617     REGEX_ASSERT(n==2);
2618     REGEX_ASSERT(fields[0]=="Now");
2619     REGEX_ASSERT(fields[1]=="is the time");
2620     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
2621 
2622     fields[1] = "*";
2623     status = U_ZERO_ERROR;
2624     n = pat1->split("Now is the time", fields, 1, status);
2625     REGEX_CHECK_STATUS;
2626     REGEX_ASSERT(n==1);
2627     REGEX_ASSERT(fields[0]=="Now is the time");
2628     REGEX_ASSERT(fields[1]=="*");
2629     status = U_ZERO_ERROR;
2630 
2631     n = pat1->split("    Now       is the time   ", fields, 10, status);
2632     REGEX_CHECK_STATUS;
2633     REGEX_ASSERT(n==5);
2634     REGEX_ASSERT(fields[0]=="");
2635     REGEX_ASSERT(fields[1]=="Now");
2636     REGEX_ASSERT(fields[2]=="is");
2637     REGEX_ASSERT(fields[3]=="the");
2638     REGEX_ASSERT(fields[4]=="time");
2639     REGEX_ASSERT(fields[5]=="");
2640 
2641     n = pat1->split("     ", fields, 10, status);
2642     REGEX_CHECK_STATUS;
2643     REGEX_ASSERT(n==1);
2644     REGEX_ASSERT(fields[0]=="");
2645 
2646     fields[0] = "foo";
2647     n = pat1->split("", fields, 10, status);
2648     REGEX_CHECK_STATUS;
2649     REGEX_ASSERT(n==0);
2650     REGEX_ASSERT(fields[0]=="foo");
2651 
2652     delete pat1;
2653 
2654     //  split, with a pattern with (capture)
2655     utext_openUTF8(&re1, "<(\\w*)>", -1, &status);
2656     pat1 = RegexPattern::compile(&re1,  pe, status);
2657     REGEX_CHECK_STATUS;
2658 
2659     status = U_ZERO_ERROR;
2660     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
2661     REGEX_CHECK_STATUS;
2662     REGEX_ASSERT(n==6);
2663     REGEX_ASSERT(fields[0]=="");
2664     REGEX_ASSERT(fields[1]=="a");
2665     REGEX_ASSERT(fields[2]=="Now is ");
2666     REGEX_ASSERT(fields[3]=="b");
2667     REGEX_ASSERT(fields[4]=="the time");
2668     REGEX_ASSERT(fields[5]=="c");
2669     REGEX_ASSERT(fields[6]=="");
2670     REGEX_ASSERT(status==U_ZERO_ERROR);
2671 
2672     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
2673     REGEX_CHECK_STATUS;
2674     REGEX_ASSERT(n==6);
2675     REGEX_ASSERT(fields[0]=="  ");
2676     REGEX_ASSERT(fields[1]=="a");
2677     REGEX_ASSERT(fields[2]=="Now is ");
2678     REGEX_ASSERT(fields[3]=="b");
2679     REGEX_ASSERT(fields[4]=="the time");
2680     REGEX_ASSERT(fields[5]=="c");
2681     REGEX_ASSERT(fields[6]=="");
2682 
2683     status = U_ZERO_ERROR;
2684     fields[6] = "foo";
2685     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
2686     REGEX_CHECK_STATUS;
2687     REGEX_ASSERT(n==6);
2688     REGEX_ASSERT(fields[0]=="  ");
2689     REGEX_ASSERT(fields[1]=="a");
2690     REGEX_ASSERT(fields[2]=="Now is ");
2691     REGEX_ASSERT(fields[3]=="b");
2692     REGEX_ASSERT(fields[4]=="the time");
2693     REGEX_ASSERT(fields[5]=="c");
2694     REGEX_ASSERT(fields[6]=="foo");
2695 
2696     status = U_ZERO_ERROR;
2697     fields[5] = "foo";
2698     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
2699     REGEX_CHECK_STATUS;
2700     REGEX_ASSERT(n==5);
2701     REGEX_ASSERT(fields[0]=="  ");
2702     REGEX_ASSERT(fields[1]=="a");
2703     REGEX_ASSERT(fields[2]=="Now is ");
2704     REGEX_ASSERT(fields[3]=="b");
2705     REGEX_ASSERT(fields[4]=="the time<c>");
2706     REGEX_ASSERT(fields[5]=="foo");
2707 
2708     status = U_ZERO_ERROR;
2709     fields[5] = "foo";
2710     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
2711     REGEX_CHECK_STATUS;
2712     REGEX_ASSERT(n==5);
2713     REGEX_ASSERT(fields[0]=="  ");
2714     REGEX_ASSERT(fields[1]=="a");
2715     REGEX_ASSERT(fields[2]=="Now is ");
2716     REGEX_ASSERT(fields[3]=="b");
2717     REGEX_ASSERT(fields[4]=="the time");
2718     REGEX_ASSERT(fields[5]=="foo");
2719 
2720     status = U_ZERO_ERROR;
2721     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
2722     REGEX_CHECK_STATUS;
2723     REGEX_ASSERT(n==4);
2724     REGEX_ASSERT(fields[0]=="  ");
2725     REGEX_ASSERT(fields[1]=="a");
2726     REGEX_ASSERT(fields[2]=="Now is ");
2727     REGEX_ASSERT(fields[3]=="the time<c>");
2728     status = U_ZERO_ERROR;
2729     delete pat1;
2730 
2731     utext_openUTF8(&re1, "([-,])", -1, &status);
2732     pat1 = RegexPattern::compile(&re1, pe, status);
2733     REGEX_CHECK_STATUS;
2734     n = pat1->split("1-10,20", fields, 10, status);
2735     REGEX_CHECK_STATUS;
2736     REGEX_ASSERT(n==5);
2737     REGEX_ASSERT(fields[0]=="1");
2738     REGEX_ASSERT(fields[1]=="-");
2739     REGEX_ASSERT(fields[2]=="10");
2740     REGEX_ASSERT(fields[3]==",");
2741     REGEX_ASSERT(fields[4]=="20");
2742     delete pat1;
2743 
2744 
2745     //
2746     // RegexPattern::pattern() and patternText()
2747     //
2748     pat1 = new RegexPattern();
2749     REGEX_ASSERT(pat1->pattern() == "");
2750     REGEX_ASSERT_UTEXT("", pat1->patternText());
2751     delete pat1;
2752 
2753     utext_openUTF8(&re1, "(Hello, world)*", -1, &status);
2754     pat1 = RegexPattern::compile(&re1, pe, status);
2755     REGEX_CHECK_STATUS;
2756     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
2757     REGEX_ASSERT_UTEXT("(Hello, world)*", pat1->patternText());
2758     delete pat1;
2759 
2760     utext_close(&re1);
2761 }
2762 
2763 
2764 //---------------------------------------------------------------------------
2765 //
2766 //      Extended       A more thorough check for features of regex patterns
2767 //                     The test cases are in a separate data file,
2768 //                       source/tests/testdata/regextst.txt
2769 //                     A description of the test data format is included in that file.
2770 //
2771 //---------------------------------------------------------------------------
2772 
2773 const char *
getPath(char buffer[2048],const char * filename)2774 RegexTest::getPath(char buffer[2048], const char *filename) {
2775     UErrorCode status=U_ZERO_ERROR;
2776     const char *testDataDirectory = IntlTest::getSourceTestData(status);
2777     if (U_FAILURE(status)) {
2778         errln("ERROR: loadTestData() failed - %s", u_errorName(status));
2779         return NULL;
2780     }
2781 
2782     strcpy(buffer, testDataDirectory);
2783     strcat(buffer, filename);
2784     return buffer;
2785 }
2786 
Extended()2787 void RegexTest::Extended() {
2788     char tdd[2048];
2789     const char *srcPath;
2790     UErrorCode  status  = U_ZERO_ERROR;
2791     int32_t     lineNum = 0;
2792 
2793     //
2794     //  Open and read the test data file.
2795     //
2796     srcPath=getPath(tdd, "regextst.txt");
2797     if(srcPath==NULL) {
2798         return; /* something went wrong, error already output */
2799     }
2800 
2801     int32_t    len;
2802     UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
2803     if (U_FAILURE(status)) {
2804         return; /* something went wrong, error already output */
2805     }
2806 
2807     //
2808     //  Put the test data into a UnicodeString
2809     //
2810     UnicodeString testString(FALSE, testData, len);
2811 
2812     RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
2813     RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
2814     RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
2815 
2816     RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
2817     UnicodeString   testPattern;   // The pattern for test from the test file.
2818     UnicodeString   testFlags;     // the flags   for a test.
2819     UnicodeString   matchString;   // The marked up string to be used as input
2820 
2821     if (U_FAILURE(status)){
2822         dataerrln("Construct RegexMatcher() error.");
2823         delete [] testData;
2824         return;
2825     }
2826 
2827     //
2828     //  Loop over the test data file, once per line.
2829     //
2830     while (lineMat.find()) {
2831         lineNum++;
2832         if (U_FAILURE(status)) {
2833             errln("line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
2834         }
2835 
2836         status = U_ZERO_ERROR;
2837         UnicodeString testLine = lineMat.group(1, status);
2838         if (testLine.length() == 0) {
2839             continue;
2840         }
2841 
2842         //
2843         // Parse the test line.  Skip blank and comment only lines.
2844         // Separate out the three main fields - pattern, flags, target.
2845         //
2846 
2847         commentMat.reset(testLine);
2848         if (commentMat.lookingAt(status)) {
2849             // This line is a comment, or blank.
2850             continue;
2851         }
2852 
2853         //
2854         //  Pull out the pattern field, remove it from the test file line.
2855         //
2856         quotedStuffMat.reset(testLine);
2857         if (quotedStuffMat.lookingAt(status)) {
2858             testPattern = quotedStuffMat.group(2, status);
2859             testLine.remove(0, quotedStuffMat.end(0, status));
2860         } else {
2861             errln("Bad pattern (missing quotes?) at test file line %d", lineNum);
2862             continue;
2863         }
2864 
2865 
2866         //
2867         //  Pull out the flags from the test file line.
2868         //
2869         flagsMat.reset(testLine);
2870         flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
2871         testFlags = flagsMat.group(1, status);
2872         if (flagsMat.group(2, status).length() > 0) {
2873             errln("Bad Match flag at line %d. Scanning %c\n",
2874                 lineNum, flagsMat.group(2, status).charAt(0));
2875             continue;
2876         }
2877         testLine.remove(0, flagsMat.end(0, status));
2878 
2879         //
2880         //  Pull out the match string, as a whole.
2881         //    We'll process the <tags> later.
2882         //
2883         quotedStuffMat.reset(testLine);
2884         if (quotedStuffMat.lookingAt(status)) {
2885             matchString = quotedStuffMat.group(2, status);
2886             testLine.remove(0, quotedStuffMat.end(0, status));
2887         } else {
2888             errln("Bad match string at test file line %d", lineNum);
2889             continue;
2890         }
2891 
2892         //
2893         //  The only thing left from the input line should be an optional trailing comment.
2894         //
2895         commentMat.reset(testLine);
2896         if (commentMat.lookingAt(status) == FALSE) {
2897             errln("Line %d: unexpected characters at end of test line.", lineNum);
2898             continue;
2899         }
2900 
2901         //
2902         //  Run the test
2903         //
2904         regex_find(testPattern, testFlags, matchString, lineNum);
2905     }
2906 
2907     delete [] testData;
2908 
2909 }
2910 
2911 
2912 
2913 //---------------------------------------------------------------------------
2914 //
2915 //    regex_find(pattern, flags, inputString, lineNumber)
2916 //
2917 //         Function to run a single test from the Extended (data driven) tests.
2918 //         See file test/testdata/regextst.txt for a description of the
2919 //         pattern and inputString fields, and the allowed flags.
2920 //         lineNumber is the source line in regextst.txt of the test.
2921 //
2922 //---------------------------------------------------------------------------
2923 
2924 
2925 //  Set a value into a UVector at position specified by a decimal number in
2926 //   a UnicodeString.   This is a utility function needed by the actual test function,
2927 //   which follows.
set(UVector & vec,int32_t val,UnicodeString index)2928 static void set(UVector &vec, int32_t val, UnicodeString index) {
2929     UErrorCode  status=U_ZERO_ERROR;
2930     int32_t  idx = 0;
2931     for (int32_t i=0; i<index.length(); i++) {
2932         int32_t d=u_charDigitValue(index.charAt(i));
2933         if (d<0) {return;}
2934         idx = idx*10 + d;
2935     }
2936     while (vec.size()<idx+1) {vec.addElement(-1, status);}
2937     vec.setElementAt(val, idx);
2938 }
2939 
regex_find(const UnicodeString & pattern,const UnicodeString & flags,const UnicodeString & inputString,int32_t line)2940 void RegexTest::regex_find(const UnicodeString &pattern,
2941                            const UnicodeString &flags,
2942                            const UnicodeString &inputString,
2943                            int32_t line) {
2944     UnicodeString       unEscapedInput;
2945     UnicodeString       deTaggedInput;
2946 
2947     int32_t             patternUTF8Length,      inputUTF8Length;
2948     char                *patternChars  = NULL, *inputChars = NULL;
2949     UText               patternText    = UTEXT_INITIALIZER;
2950     UText               inputText      = UTEXT_INITIALIZER;
2951     UConverter          *UTF8Converter = NULL;
2952 
2953     UErrorCode          status         = U_ZERO_ERROR;
2954     UParseError         pe;
2955     RegexPattern        *parsePat      = NULL;
2956     RegexMatcher        *parseMatcher  = NULL;
2957     RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
2958     RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
2959     UVector             groupStarts(status);
2960     UVector             groupEnds(status);
2961     UBool               isMatch        = FALSE, isUTF8Match = FALSE;
2962     UBool               failed         = FALSE;
2963     int32_t             numFinds;
2964     int32_t             i;
2965     UBool               useMatchesFunc   = FALSE;
2966     UBool               useLookingAtFunc = FALSE;
2967     int32_t             regionStart      = -1;
2968     int32_t             regionEnd        = -1;
2969 
2970     //
2971     //  Compile the caller's pattern
2972     //
2973     uint32_t bflags = 0;
2974     if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
2975         bflags |= UREGEX_CASE_INSENSITIVE;
2976     }
2977     if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
2978         bflags |= UREGEX_COMMENTS;
2979     }
2980     if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
2981         bflags |= UREGEX_DOTALL;
2982     }
2983     if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
2984         bflags |= UREGEX_MULTILINE;
2985     }
2986 
2987     if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
2988         bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
2989     }
2990     if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
2991         bflags |= UREGEX_UNIX_LINES;
2992     }
2993 
2994 
2995     callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
2996     if (status != U_ZERO_ERROR) {
2997         #if UCONFIG_NO_BREAK_ITERATION==1
2998         // 'v' test flag means that the test pattern should not compile if ICU was configured
2999         //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3000         if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3001             goto cleanupAndReturn;
3002         }
3003         #endif
3004         if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3005             // Expected pattern compilation error.
3006             if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3007                 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3008             }
3009             goto cleanupAndReturn;
3010         } else {
3011             // Unexpected pattern compilation error.
3012             errln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3013             goto cleanupAndReturn;
3014         }
3015     }
3016 
3017     UTF8Converter = ucnv_open("UTF8", &status);
3018     ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3019 
3020     patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3021     status = U_ZERO_ERROR; // buffer overflow
3022     patternChars = new char[patternUTF8Length+1];
3023     pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3024     utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3025 
3026     if (status == U_ZERO_ERROR) {
3027         UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3028 
3029         if (status != U_ZERO_ERROR) {
3030 #if UCONFIG_NO_BREAK_ITERATION==1
3031             // 'v' test flag means that the test pattern should not compile if ICU was configured
3032             //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3033             if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3034                 goto cleanupAndReturn;
3035             }
3036 #endif
3037             if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3038                 // Expected pattern compilation error.
3039                 if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3040                     logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3041                 }
3042                 goto cleanupAndReturn;
3043             } else {
3044                 // Unexpected pattern compilation error.
3045                 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3046                 goto cleanupAndReturn;
3047             }
3048         }
3049     }
3050 
3051     if (UTF8Pattern == NULL) {
3052         // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3053         logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for line %d", line);
3054         status = U_ZERO_ERROR;
3055     }
3056 
3057     if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
3058         RegexPatternDump(callerPattern);
3059     }
3060 
3061     if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
3062         errln("Expected, but did not get, a pattern compilation error.");
3063         goto cleanupAndReturn;
3064     }
3065 
3066 
3067     //
3068     // Number of times find() should be called on the test string, default to 1
3069     //
3070     numFinds = 1;
3071     for (i=2; i<=9; i++) {
3072         if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
3073             if (numFinds != 1) {
3074                 errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
3075                 goto cleanupAndReturn;
3076             }
3077             numFinds = i;
3078         }
3079     }
3080 
3081     // 'M' flag.  Use matches() instead of find()
3082     if (flags.indexOf((UChar)0x4d) >= 0) {
3083         useMatchesFunc = TRUE;
3084     }
3085     if (flags.indexOf((UChar)0x4c) >= 0) {
3086         useLookingAtFunc = TRUE;
3087     }
3088 
3089     //
3090     //  Find the tags in the input data, remove them, and record the group boundary
3091     //    positions.
3092     //
3093     parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3094     REGEX_CHECK_STATUS_L(line);
3095 
3096     unEscapedInput = inputString.unescape();
3097     parseMatcher = parsePat->matcher(unEscapedInput, status);
3098     REGEX_CHECK_STATUS_L(line);
3099     while(parseMatcher->find()) {
3100         parseMatcher->appendReplacement(deTaggedInput, "", status);
3101         REGEX_CHECK_STATUS;
3102         UnicodeString groupNum = parseMatcher->group(2, status);
3103         if (groupNum == "r") {
3104             // <r> or </r>, a region specification within the string
3105             if (parseMatcher->group(1, status) == "/") {
3106                 regionEnd = deTaggedInput.length();
3107             } else {
3108                 regionStart = deTaggedInput.length();
3109             }
3110         } else {
3111             // <digits> or </digits>, a group match boundary tag.
3112             if (parseMatcher->group(1, status) == "/") {
3113                 set(groupEnds, deTaggedInput.length(), groupNum);
3114             } else {
3115                 set(groupStarts, deTaggedInput.length(), groupNum);
3116             }
3117         }
3118     }
3119     parseMatcher->appendTail(deTaggedInput);
3120     REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3121     if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3122       errln("mismatched <r> tags");
3123       failed = TRUE;
3124       goto cleanupAndReturn;
3125     }
3126 
3127 
3128     //
3129     //  Configure the matcher according to the flags specified with this test.
3130     //
3131     matcher = callerPattern->matcher(deTaggedInput, status);
3132     REGEX_CHECK_STATUS_L(line);
3133     if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3134         matcher->setTrace(TRUE);
3135     }
3136 
3137     if (UTF8Pattern != NULL) {
3138         inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3139         status = U_ZERO_ERROR; // buffer overflow
3140         inputChars = new char[inputUTF8Length+1];
3141         deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3142         utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3143 
3144         if (status == U_ZERO_ERROR) {
3145             UTF8Matcher = UTF8Pattern->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status);
3146             REGEX_CHECK_STATUS_L(line);
3147         }
3148 
3149         if (UTF8Matcher == NULL) {
3150             // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3151             logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for line %d", line);
3152             status = U_ZERO_ERROR;
3153         }
3154     }
3155 
3156     if (regionStart>=0) {
3157        matcher->region(regionStart, regionEnd, status);
3158        REGEX_CHECK_STATUS_L(line);
3159        if (UTF8Matcher != NULL) {
3160            UTF8Matcher->region(regionStart, regionEnd, status);
3161            REGEX_CHECK_STATUS_L(line);
3162        }
3163     }
3164     if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
3165         matcher->useAnchoringBounds(FALSE);
3166         if (UTF8Matcher != NULL) {
3167             UTF8Matcher->useAnchoringBounds(FALSE);
3168         }
3169     }
3170     if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
3171         matcher->useTransparentBounds(TRUE);
3172         if (UTF8Matcher != NULL) {
3173             UTF8Matcher->useTransparentBounds(TRUE);
3174         }
3175     }
3176 
3177 
3178 
3179     //
3180     // Do a find on the de-tagged input using the caller's pattern
3181     //     TODO: error on count>1 and not find().
3182     //           error on both matches() and lookingAt().
3183     //
3184     for (i=0; i<numFinds; i++) {
3185         if (useMatchesFunc) {
3186             isMatch = matcher->matches(status);
3187             if (UTF8Matcher != NULL) {
3188                isUTF8Match = UTF8Matcher->matches(status);
3189             }
3190         } else  if (useLookingAtFunc) {
3191             isMatch = matcher->lookingAt(status);
3192             if (UTF8Matcher != NULL) {
3193                 isUTF8Match = UTF8Matcher->lookingAt(status);
3194             }
3195         } else {
3196             isMatch = matcher->find();
3197             if (UTF8Matcher != NULL) {
3198                 isUTF8Match = UTF8Matcher->find();
3199             }
3200         }
3201     }
3202     matcher->setTrace(FALSE);
3203 
3204     //
3205     // Match up the groups from the find() with the groups from the tags
3206     //
3207 
3208     // number of tags should match number of groups from find operation.
3209     // matcher->groupCount does not include group 0, the entire match, hence the +1.
3210     //   G option in test means that capture group data is not available in the
3211     //     expected results, so the check needs to be suppressed.
3212     if (isMatch == FALSE && groupStarts.size() != 0) {
3213         errln("Error at line %d:  Match expected, but none found.", line);
3214         failed = TRUE;
3215         goto cleanupAndReturn;
3216     } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3217         errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
3218         failed = TRUE;
3219         goto cleanupAndReturn;
3220     }
3221 
3222     if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3223         // Only check for match / no match.  Don't check capture groups.
3224         if (isMatch && groupStarts.size() == 0) {
3225             errln("Error at line %d:  No match expected, but one found.", line);
3226             failed = TRUE;
3227         } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
3228             errln("Error at line %d:  No match expected, but one found. (UTF8)", line);
3229             failed = TRUE;
3230         }
3231         goto cleanupAndReturn;
3232     }
3233 
3234     REGEX_CHECK_STATUS_L(line);
3235     for (i=0; i<=matcher->groupCount(); i++) {
3236         int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3237         if (matcher->start(i, status) != expectedStart) {
3238             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
3239                 line, i, expectedStart, matcher->start(i, status));
3240             failed = TRUE;
3241             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3242         } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStart) {
3243             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
3244                   line, i, expectedStart, UTF8Matcher->start(i, status));
3245             failed = TRUE;
3246             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3247         }
3248 
3249         int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3250         if (matcher->end(i, status) != expectedEnd) {
3251             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
3252                 line, i, expectedEnd, matcher->end(i, status));
3253             failed = TRUE;
3254             // Error on end position;  keep going; real error is probably yet to come as group
3255             //   end positions work from end of the input data towards the front.
3256         } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEnd) {
3257             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
3258                   line, i, expectedEnd, UTF8Matcher->end(i, status));
3259             failed = TRUE;
3260             // Error on end position;  keep going; real error is probably yet to come as group
3261             //   end positions work from end of the input data towards the front.
3262         }
3263     }
3264     if ( matcher->groupCount()+1 < groupStarts.size()) {
3265         errln("Error at line %d: Expected %d capture groups, found %d.",
3266             line, groupStarts.size()-1, matcher->groupCount());
3267         failed = TRUE;
3268         }
3269     else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3270         errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3271               line, groupStarts.size()-1, UTF8Matcher->groupCount());
3272         failed = TRUE;
3273     }
3274 
3275     if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3276         matcher->requireEnd() == TRUE) {
3277         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
3278         failed = TRUE;
3279     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3280         UTF8Matcher->requireEnd() == TRUE) {
3281         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3282         failed = TRUE;
3283     }
3284 
3285     if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
3286         matcher->requireEnd() == FALSE) {
3287         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
3288         failed = TRUE;
3289     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3290         UTF8Matcher->requireEnd() == FALSE) {
3291         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3292         failed = TRUE;
3293     }
3294 
3295     if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3296         matcher->hitEnd() == TRUE) {
3297         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
3298         failed = TRUE;
3299     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3300                UTF8Matcher->hitEnd() == TRUE) {
3301         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3302         failed = TRUE;
3303     }
3304 
3305     if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3306         matcher->hitEnd() == FALSE) {
3307         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
3308         failed = TRUE;
3309     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3310                UTF8Matcher->hitEnd() == FALSE) {
3311         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3312         failed = TRUE;
3313     }
3314 
3315 
3316 cleanupAndReturn:
3317     if (failed) {
3318         infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
3319             +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
3320         // callerPattern->dump();
3321     }
3322     delete parseMatcher;
3323     delete parsePat;
3324     delete UTF8Matcher;
3325     delete UTF8Pattern;
3326     delete matcher;
3327     delete callerPattern;
3328 
3329     utext_close(&inputText);
3330     delete[] inputChars;
3331     utext_close(&patternText);
3332     delete[] patternChars;
3333     ucnv_close(UTF8Converter);
3334 }
3335 
3336 
3337 
3338 
3339 //---------------------------------------------------------------------------
3340 //
3341 //      Errors     Check for error handling in patterns.
3342 //
3343 //---------------------------------------------------------------------------
Errors()3344 void RegexTest::Errors() {
3345     // \escape sequences that aren't implemented yet.
3346     //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3347 
3348     // Missing close parentheses
3349     REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3350     REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3351     REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3352 
3353     // Extra close paren
3354     REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3355     REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3356     REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3357 
3358     // Look-ahead, Look-behind
3359     //  TODO:  add tests for unbounded length look-behinds.
3360     REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
3361 
3362     // Attempt to use non-default flags
3363     {
3364         UParseError   pe;
3365         UErrorCode    status = U_ZERO_ERROR;
3366         int32_t       flags  = UREGEX_CANON_EQ |
3367                                UREGEX_COMMENTS         | UREGEX_DOTALL   |
3368                                UREGEX_MULTILINE;
3369         RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3370         REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3371         delete pat1;
3372     }
3373 
3374 
3375     // Quantifiers are allowed only after something that can be quantified.
3376     REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3377     REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3378     REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3379 
3380     // Mal-formed {min,max} quantifiers
3381     REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3382     REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3383     REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3384     REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3385     REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3386     REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3387     REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
3388     REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
3389     REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3390 
3391     // Ticket 5389
3392     REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3393 
3394     // Invalid Back Reference \0
3395     //    For ICU 3.8 and earlier
3396     //    For ICU versions newer than 3.8, \0 introduces an octal escape.
3397     //
3398     REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3399 
3400 }
3401 
3402 
3403 //-------------------------------------------------------------------------------
3404 //
3405 //  Read a text data file, convert it to UChars, and return the data
3406 //    in one big UChar * buffer, which the caller must delete.
3407 //
3408 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int32_t & ulen,const char * defEncoding,UErrorCode & status)3409 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3410                                      const char *defEncoding, UErrorCode &status) {
3411     UChar       *retPtr  = NULL;
3412     char        *fileBuf = NULL;
3413     UConverter* conv     = NULL;
3414     FILE        *f       = NULL;
3415 
3416     ulen = 0;
3417     if (U_FAILURE(status)) {
3418         return retPtr;
3419     }
3420 
3421     //
3422     //  Open the file.
3423     //
3424     f = fopen(fileName, "rb");
3425     if (f == 0) {
3426         dataerrln("Error opening test data file %s\n", fileName);
3427         status = U_FILE_ACCESS_ERROR;
3428         return NULL;
3429     }
3430     //
3431     //  Read it in
3432     //
3433     int32_t            fileSize;
3434     int32_t            amt_read;
3435 
3436     fseek( f, 0, SEEK_END);
3437     fileSize = ftell(f);
3438     fileBuf = new char[fileSize];
3439     fseek(f, 0, SEEK_SET);
3440     amt_read = fread(fileBuf, 1, fileSize, f);
3441     if (amt_read != fileSize || fileSize <= 0) {
3442         errln("Error reading test data file.");
3443         goto cleanUpAndReturn;
3444     }
3445 
3446     //
3447     // Look for a Unicode Signature (BOM) on the data just read
3448     //
3449     int32_t        signatureLength;
3450     const char *   fileBufC;
3451     const char*    encoding;
3452 
3453     fileBufC = fileBuf;
3454     encoding = ucnv_detectUnicodeSignature(
3455         fileBuf, fileSize, &signatureLength, &status);
3456     if(encoding!=NULL ){
3457         fileBufC  += signatureLength;
3458         fileSize  -= signatureLength;
3459     } else {
3460         encoding = defEncoding;
3461         if (strcmp(encoding, "utf-8") == 0) {
3462             errln("file %s is missing its BOM", fileName);
3463         }
3464     }
3465 
3466     //
3467     // Open a converter to take the rule file to UTF-16
3468     //
3469     conv = ucnv_open(encoding, &status);
3470     if (U_FAILURE(status)) {
3471         goto cleanUpAndReturn;
3472     }
3473 
3474     //
3475     // Convert the rules to UChar.
3476     //  Preflight first to determine required buffer size.
3477     //
3478     ulen = ucnv_toUChars(conv,
3479         NULL,           //  dest,
3480         0,              //  destCapacity,
3481         fileBufC,
3482         fileSize,
3483         &status);
3484     if (status == U_BUFFER_OVERFLOW_ERROR) {
3485         // Buffer Overflow is expected from the preflight operation.
3486         status = U_ZERO_ERROR;
3487 
3488         retPtr = new UChar[ulen+1];
3489         ucnv_toUChars(conv,
3490             retPtr,       //  dest,
3491             ulen+1,
3492             fileBufC,
3493             fileSize,
3494             &status);
3495     }
3496 
3497 cleanUpAndReturn:
3498     fclose(f);
3499     delete[] fileBuf;
3500     ucnv_close(conv);
3501     if (U_FAILURE(status)) {
3502         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3503         delete retPtr;
3504         retPtr = 0;
3505         ulen   = 0;
3506     };
3507     return retPtr;
3508 }
3509 
3510 
3511 //-------------------------------------------------------------------------------
3512 //
3513 //   PerlTests  - Run Perl's regular expression tests
3514 //                The input file for this test is re_tests, the standard regular
3515 //                expression test data distributed with the Perl source code.
3516 //
3517 //                Here is Perl's description of the test data file:
3518 //
3519 //        # The tests are in a separate file 't/op/re_tests'.
3520 //        # Each line in that file is a separate test.
3521 //        # There are five columns, separated by tabs.
3522 //        #
3523 //        # Column 1 contains the pattern, optionally enclosed in C<''>.
3524 //        # Modifiers can be put after the closing C<'>.
3525 //        #
3526 //        # Column 2 contains the string to be matched.
3527 //        #
3528 //        # Column 3 contains the expected result:
3529 //        #     y   expect a match
3530 //        #     n   expect no match
3531 //        #     c   expect an error
3532 //        # B   test exposes a known bug in Perl, should be skipped
3533 //        # b   test exposes a known bug in Perl, should be skipped if noamp
3534 //        #
3535 //        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3536 //        #
3537 //        # Column 4 contains a string, usually C<$&>.
3538 //        #
3539 //        # Column 5 contains the expected result of double-quote
3540 //        # interpolating that string after the match, or start of error message.
3541 //        #
3542 //        # Column 6, if present, contains a reason why the test is skipped.
3543 //        # This is printed with "skipped", for harness to pick up.
3544 //        #
3545 //        # \n in the tests are interpolated, as are variables of the form ${\w+}.
3546 //        #
3547 //        # If you want to add a regular expression test that can't be expressed
3548 //        # in this format, don't add it here: put it in op/pat.t instead.
3549 //
3550 //        For ICU, if field 3 contains an 'i', the test will be skipped.
//        The test exposes some known incompatibility between ICU and Perl regexps.
3552 //        (The i is in addition to whatever was there before.)
3553 //
3554 //-------------------------------------------------------------------------------
PerlTests()3555 void RegexTest::PerlTests() {
3556     char tdd[2048];
3557     const char *srcPath;
3558     UErrorCode  status = U_ZERO_ERROR;
3559     UParseError pe;
3560 
3561     //
3562     //  Open and read the test data file.
3563     //
3564     srcPath=getPath(tdd, "re_tests.txt");
3565     if(srcPath==NULL) {
3566         return; /* something went wrong, error already output */
3567     }
3568 
3569     int32_t    len;
3570     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3571     if (U_FAILURE(status)) {
3572         return; /* something went wrong, error already output */
3573     }
3574 
3575     //
3576     //  Put the test data into a UnicodeString
3577     //
3578     UnicodeString testDataString(FALSE, testData, len);
3579 
3580     //
3581     //  Regex to break the input file into lines, and strip the new lines.
3582     //     One line per match, capture group one is the desired data.
3583     //
3584     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
3585     if (U_FAILURE(status)) {
3586         dataerrln("RegexPattern::compile() error");
3587         return;
3588     }
3589     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
3590 
3591     //
3592     //  Regex to split a test file line into fields.
3593     //    There are six fields, separated by tabs.
3594     //
3595     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
3596 
3597     //
3598     //  Regex to identify test patterns with flag settings, and to separate them.
3599     //    Test patterns with flags look like 'pattern'i
3600     //    Test patterns without flags are not quoted:   pattern
3601     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
3602     //
3603     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
3604     RegexMatcher* flagMat = flagPat->matcher(status);
3605 
3606     //
3607     // The Perl tests reference several perl-isms, which are evaluated/substituted
3608     //   in the test data.  Not being perl, this must be done explicitly.  Here
3609     //   are string constants and REs for these constructs.
3610     //
3611     UnicodeString nulnulSrc("${nulnul}");
3612     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
3613     nulnul = nulnul.unescape();
3614 
3615     UnicodeString ffffSrc("${ffff}");
3616     UnicodeString ffff("\\uffff", -1, US_INV);
3617     ffff = ffff.unescape();
3618 
3619     //  regexp for $-[0], $+[2], etc.
3620     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
3621     RegexMatcher *groupsMat = groupsPat->matcher(status);
3622 
3623     //  regexp for $0, $1, $2, etc.
3624     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
3625     RegexMatcher *cgMat = cgPat->matcher(status);
3626 
3627 
3628     //
3629     // Main Loop for the Perl Tests, runs once per line from the
3630     //   test data file.
3631     //
3632     int32_t  lineNum = 0;
3633     int32_t  skippedUnimplementedCount = 0;
3634     while (lineMat->find()) {
3635         lineNum++;
3636 
3637         //
3638         //  Get a line, break it into its fields, do the Perl
3639         //    variable substitutions.
3640         //
3641         UnicodeString line = lineMat->group(1, status);
3642         UnicodeString fields[7];
3643         fieldPat->split(line, fields, 7, status);
3644 
3645         flagMat->reset(fields[0]);
3646         flagMat->matches(status);
3647         UnicodeString pattern  = flagMat->group(2, status);
3648         pattern.findAndReplace("${bang}", "!");
3649         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
3650         pattern.findAndReplace(ffffSrc, ffff);
3651 
3652         //
3653         //  Identify patterns that include match flag settings,
3654         //    split off the flags, remove the extra quotes.
3655         //
3656         UnicodeString flagStr = flagMat->group(3, status);
3657         if (U_FAILURE(status)) {
3658             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3659             return;
3660         }
3661         int32_t flags = 0;
3662         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
3663         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
3664         const UChar UChar_m = 0x6d;
3665         const UChar UChar_x = 0x78;
3666         const UChar UChar_y = 0x79;
3667         if (flagStr.indexOf(UChar_i) != -1) {
3668             flags |= UREGEX_CASE_INSENSITIVE;
3669         }
3670         if (flagStr.indexOf(UChar_m) != -1) {
3671             flags |= UREGEX_MULTILINE;
3672         }
3673         if (flagStr.indexOf(UChar_x) != -1) {
3674             flags |= UREGEX_COMMENTS;
3675         }
3676 
3677         //
3678         // Compile the test pattern.
3679         //
3680         status = U_ZERO_ERROR;
3681         RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
3682         if (status == U_REGEX_UNIMPLEMENTED) {
3683             //
3684             // Test of a feature that is planned for ICU, but not yet implemented.
3685             //   skip the test.
3686             skippedUnimplementedCount++;
3687             delete testPat;
3688             status = U_ZERO_ERROR;
3689             continue;
3690         }
3691 
3692         if (U_FAILURE(status)) {
3693             // Some tests are supposed to generate errors.
3694             //   Only report an error for tests that are supposed to succeed.
3695             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
3696                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
3697             {
3698                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
3699             }
3700             status = U_ZERO_ERROR;
3701             delete testPat;
3702             continue;
3703         }
3704 
3705         if (fields[2].indexOf(UChar_i) >= 0) {
3706             // ICU should skip this test.
3707             delete testPat;
3708             continue;
3709         }
3710 
3711         if (fields[2].indexOf(UChar_c) >= 0) {
3712             // This pattern should have caused a compilation error, but didn't/
3713             errln("line %d: Expected a pattern compile error, got success.", lineNum);
3714             delete testPat;
3715             continue;
3716         }
3717 
3718         //
3719         // replace the Perl variables that appear in some of the
3720         //   match data strings.
3721         //
3722         UnicodeString matchString = fields[1];
3723         matchString.findAndReplace(nulnulSrc, nulnul);
3724         matchString.findAndReplace(ffffSrc,   ffff);
3725 
3726         // Replace any \n in the match string with an actual new-line char.
3727         //  Don't do full unescape, as this unescapes more than Perl does, which
3728         //  causes other spurious failures in the tests.
3729         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
3730 
3731 
3732 
3733         //
3734         // Run the test, check for expected match/don't match result.
3735         //
3736         RegexMatcher *testMat = testPat->matcher(matchString, status);
3737         UBool found = testMat->find();
3738         UBool expected = FALSE;
3739         if (fields[2].indexOf(UChar_y) >=0) {
3740             expected = TRUE;
3741         }
3742         if (expected != found) {
3743             errln("line %d: Expected %smatch, got %smatch",
3744                 lineNum, expected?"":"no ", found?"":"no " );
3745             continue;
3746         }
3747 
3748         // Don't try to check expected results if there is no match.
3749         //   (Some have stuff in the expected fields)
3750         if (!found) {
3751             delete testMat;
3752             delete testPat;
3753             continue;
3754         }
3755 
3756         //
3757         // Interpret the Perl expression from the fourth field of the data file,
3758         // building up an ICU string from the results of the ICU match.
3759         //   The Perl expression will contain references to the results of
3760         //     a regex match, including the matched string, capture group strings,
3761         //     group starting and ending indicies, etc.
3762         //
3763         UnicodeString resultString;
3764         UnicodeString perlExpr = fields[3];
3765 #if SUPPORT_MUTATING_INPUT_STRING
3766         groupsMat->reset(perlExpr);
3767         cgMat->reset(perlExpr);
3768 #endif
3769 
3770         while (perlExpr.length() > 0) {
3771 #if !SUPPORT_MUTATING_INPUT_STRING
3772             //  Perferred usage.  Reset after any modification to input string.
3773             groupsMat->reset(perlExpr);
3774             cgMat->reset(perlExpr);
3775 #endif
3776 
3777             if (perlExpr.startsWith("$&")) {
3778                 resultString.append(testMat->group(status));
3779                 perlExpr.remove(0, 2);
3780             }
3781 
3782             else if (groupsMat->lookingAt(status)) {
3783                 // $-[0]   $+[2]  etc.
3784                 UnicodeString digitString = groupsMat->group(2, status);
3785                 int32_t t = 0;
3786                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
3787                 UnicodeString plusOrMinus = groupsMat->group(1, status);
3788                 int32_t matchPosition;
3789                 if (plusOrMinus.compare("+") == 0) {
3790                     matchPosition = testMat->end(groupNum, status);
3791                 } else {
3792                     matchPosition = testMat->start(groupNum, status);
3793                 }
3794                 if (matchPosition != -1) {
3795                     ICU_Utility::appendNumber(resultString, matchPosition);
3796                 }
3797                 perlExpr.remove(0, groupsMat->end(status));
3798             }
3799 
3800             else if (cgMat->lookingAt(status)) {
3801                 // $1, $2, $3, etc.
3802                 UnicodeString digitString = cgMat->group(1, status);
3803                 int32_t t = 0;
3804                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
3805                 if (U_SUCCESS(status)) {
3806                     resultString.append(testMat->group(groupNum, status));
3807                     status = U_ZERO_ERROR;
3808                 }
3809                 perlExpr.remove(0, cgMat->end(status));
3810             }
3811 
3812             else if (perlExpr.startsWith("@-")) {
3813                 int32_t i;
3814                 for (i=0; i<=testMat->groupCount(); i++) {
3815                     if (i>0) {
3816                         resultString.append(" ");
3817                     }
3818                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
3819                 }
3820                 perlExpr.remove(0, 2);
3821             }
3822 
3823             else if (perlExpr.startsWith("@+")) {
3824                 int32_t i;
3825                 for (i=0; i<=testMat->groupCount(); i++) {
3826                     if (i>0) {
3827                         resultString.append(" ");
3828                     }
3829                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
3830                 }
3831                 perlExpr.remove(0, 2);
3832             }
3833 
3834             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
3835                                                      //           or as an escaped sequence (e.g. \n)
3836                 if (perlExpr.length() > 1) {
3837                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
3838                 }
3839                 UChar c = perlExpr.charAt(0);
3840                 switch (c) {
3841                 case 'n':   c = '\n'; break;
3842                 // add any other escape sequences that show up in the test expected results.
3843                 }
3844                 resultString.append(c);
3845                 perlExpr.remove(0, 1);
3846             }
3847 
3848             else  {
3849                 // Any characters from the perl expression that we don't explicitly
3850                 //  recognize before here are assumed to be literals and copied
3851                 //  as-is to the expected results.
3852                 resultString.append(perlExpr.charAt(0));
3853                 perlExpr.remove(0, 1);
3854             }
3855 
3856             if (U_FAILURE(status)) {
3857                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
3858                 break;
3859             }
3860         }
3861 
3862         //
3863         // Expected Results Compare
3864         //
3865         UnicodeString expectedS(fields[4]);
3866         expectedS.findAndReplace(nulnulSrc, nulnul);
3867         expectedS.findAndReplace(ffffSrc,   ffff);
3868         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
3869 
3870 
3871         if (expectedS.compare(resultString) != 0) {
3872             err("Line %d: Incorrect perl expression results.", lineNum);
3873             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
3874         }
3875 
3876         delete testMat;
3877         delete testPat;
3878     }
3879 
3880     //
3881     // All done.  Clean up allocated stuff.
3882     //
3883     delete cgMat;
3884     delete cgPat;
3885 
3886     delete groupsMat;
3887     delete groupsPat;
3888 
3889     delete flagMat;
3890     delete flagPat;
3891 
3892     delete lineMat;
3893     delete linePat;
3894 
3895     delete fieldPat;
3896     delete [] testData;
3897 
3898 
3899     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
3900 
3901 }
3902 
3903 
3904 //-------------------------------------------------------------------------------
3905 //
3906 //   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
3907 //                  (instead of using UnicodeStrings) to test the alternate engine.
3908 //                  The input file for this test is re_tests, the standard regular
3909 //                  expression test data distributed with the Perl source code.
3910 //                  See PerlTests() for more information.
3911 //
3912 //-------------------------------------------------------------------------------
PerlTestsUTF8()3913 void RegexTest::PerlTestsUTF8() {
3914     char tdd[2048];
3915     const char *srcPath;
3916     UErrorCode  status = U_ZERO_ERROR;
3917     UParseError pe;
3918     LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
3919     UText       patternText = UTEXT_INITIALIZER;
3920     char       *patternChars = NULL;
3921     int32_t     patternLength;
3922     int32_t     patternCapacity = 0;
3923     UText       inputText = UTEXT_INITIALIZER;
3924     char       *inputChars = NULL;
3925     int32_t     inputLength;
3926     int32_t     inputCapacity = 0;
3927 
3928     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3929 
3930     //
3931     //  Open and read the test data file.
3932     //
3933     srcPath=getPath(tdd, "re_tests.txt");
3934     if(srcPath==NULL) {
3935         return; /* something went wrong, error already output */
3936     }
3937 
3938     int32_t    len;
3939     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3940     if (U_FAILURE(status)) {
3941         return; /* something went wrong, error already output */
3942     }
3943 
3944     //
3945     //  Put the test data into a UnicodeString
3946     //
3947     UnicodeString testDataString(FALSE, testData, len);
3948 
3949     //
3950     //  Regex to break the input file into lines, and strip the new lines.
3951     //     One line per match, capture group one is the desired data.
3952     //
3953     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
3954     if (U_FAILURE(status)) {
3955         dataerrln("RegexPattern::compile() error");
3956         return;
3957     }
3958     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
3959 
3960     //
3961     //  Regex to split a test file line into fields.
3962     //    There are six fields, separated by tabs.
3963     //
3964     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
3965 
3966     //
3967     //  Regex to identify test patterns with flag settings, and to separate them.
3968     //    Test patterns with flags look like 'pattern'i
3969     //    Test patterns without flags are not quoted:   pattern
3970     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
3971     //
3972     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
3973     RegexMatcher* flagMat = flagPat->matcher(status);
3974 
3975     //
3976     // The Perl tests reference several perl-isms, which are evaluated/substituted
3977     //   in the test data.  Not being perl, this must be done explicitly.  Here
3978     //   are string constants and REs for these constructs.
3979     //
3980     UnicodeString nulnulSrc("${nulnul}");
3981     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
3982     nulnul = nulnul.unescape();
3983 
3984     UnicodeString ffffSrc("${ffff}");
3985     UnicodeString ffff("\\uffff", -1, US_INV);
3986     ffff = ffff.unescape();
3987 
3988     //  regexp for $-[0], $+[2], etc.
3989     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
3990     RegexMatcher *groupsMat = groupsPat->matcher(status);
3991 
3992     //  regexp for $0, $1, $2, etc.
3993     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
3994     RegexMatcher *cgMat = cgPat->matcher(status);
3995 
3996 
3997     //
3998     // Main Loop for the Perl Tests, runs once per line from the
3999     //   test data file.
4000     //
4001     int32_t  lineNum = 0;
4002     int32_t  skippedUnimplementedCount = 0;
4003     while (lineMat->find()) {
4004         lineNum++;
4005 
4006         //
4007         //  Get a line, break it into its fields, do the Perl
4008         //    variable substitutions.
4009         //
4010         UnicodeString line = lineMat->group(1, status);
4011         UnicodeString fields[7];
4012         fieldPat->split(line, fields, 7, status);
4013 
4014         flagMat->reset(fields[0]);
4015         flagMat->matches(status);
4016         UnicodeString pattern  = flagMat->group(2, status);
4017         pattern.findAndReplace("${bang}", "!");
4018         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4019         pattern.findAndReplace(ffffSrc, ffff);
4020 
4021         //
4022         //  Identify patterns that include match flag settings,
4023         //    split off the flags, remove the extra quotes.
4024         //
4025         UnicodeString flagStr = flagMat->group(3, status);
4026         if (U_FAILURE(status)) {
4027             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4028             return;
4029         }
4030         int32_t flags = 0;
4031         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4032         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4033         const UChar UChar_m = 0x6d;
4034         const UChar UChar_x = 0x78;
4035         const UChar UChar_y = 0x79;
4036         if (flagStr.indexOf(UChar_i) != -1) {
4037             flags |= UREGEX_CASE_INSENSITIVE;
4038         }
4039         if (flagStr.indexOf(UChar_m) != -1) {
4040             flags |= UREGEX_MULTILINE;
4041         }
4042         if (flagStr.indexOf(UChar_x) != -1) {
4043             flags |= UREGEX_COMMENTS;
4044         }
4045 
4046         //
4047         // Put the pattern in a UTF-8 UText
4048         //
4049         status = U_ZERO_ERROR;
4050         patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4051         if (status == U_BUFFER_OVERFLOW_ERROR) {
4052             status = U_ZERO_ERROR;
4053             delete[] patternChars;
4054             patternCapacity = patternLength + 1;
4055             patternChars = new char[patternCapacity];
4056             pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4057         }
4058         utext_openUTF8(&patternText, patternChars, patternLength, &status);
4059 
4060         //
4061         // Compile the test pattern.
4062         //
4063         RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4064         if (status == U_REGEX_UNIMPLEMENTED) {
4065             //
4066             // Test of a feature that is planned for ICU, but not yet implemented.
4067             //   skip the test.
4068             skippedUnimplementedCount++;
4069             delete testPat;
4070             status = U_ZERO_ERROR;
4071             continue;
4072         }
4073 
4074         if (U_FAILURE(status)) {
4075             // Some tests are supposed to generate errors.
4076             //   Only report an error for tests that are supposed to succeed.
4077             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4078                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4079             {
4080                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4081             }
4082             status = U_ZERO_ERROR;
4083             delete testPat;
4084             continue;
4085         }
4086 
4087         if (fields[2].indexOf(UChar_i) >= 0) {
4088             // ICU should skip this test.
4089             delete testPat;
4090             continue;
4091         }
4092 
4093         if (fields[2].indexOf(UChar_c) >= 0) {
4094             // This pattern should have caused a compilation error, but didn't/
4095             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4096             delete testPat;
4097             continue;
4098         }
4099 
4100 
4101         //
4102         // replace the Perl variables that appear in some of the
4103         //   match data strings.
4104         //
4105         UnicodeString matchString = fields[1];
4106         matchString.findAndReplace(nulnulSrc, nulnul);
4107         matchString.findAndReplace(ffffSrc,   ffff);
4108 
4109         // Replace any \n in the match string with an actual new-line char.
4110         //  Don't do full unescape, as this unescapes more than Perl does, which
4111         //  causes other spurious failures in the tests.
4112         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4113 
4114         //
4115         // Put the input in a UTF-8 UText
4116         //
4117         status = U_ZERO_ERROR;
4118         inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4119         if (status == U_BUFFER_OVERFLOW_ERROR) {
4120             status = U_ZERO_ERROR;
4121             delete[] inputChars;
4122             inputCapacity = inputLength + 1;
4123             inputChars = new char[inputCapacity];
4124             matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4125         }
4126         utext_openUTF8(&inputText, inputChars, inputLength, &status);
4127 
4128         //
4129         // Run the test, check for expected match/don't match result.
4130         //
4131         RegexMatcher *testMat = testPat->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status);
4132         UBool found = testMat->find();
4133         UBool expected = FALSE;
4134         if (fields[2].indexOf(UChar_y) >=0) {
4135             expected = TRUE;
4136         }
4137         if (expected != found) {
4138             errln("line %d: Expected %smatch, got %smatch",
4139                 lineNum, expected?"":"no ", found?"":"no " );
4140             continue;
4141         }
4142 
4143         // Don't try to check expected results if there is no match.
4144         //   (Some have stuff in the expected fields)
4145         if (!found) {
4146             delete testMat;
4147             delete testPat;
4148             continue;
4149         }
4150 
4151         //
4152         // Interpret the Perl expression from the fourth field of the data file,
4153         // building up an ICU string from the results of the ICU match.
4154         //   The Perl expression will contain references to the results of
4155         //     a regex match, including the matched string, capture group strings,
4156         //     group starting and ending indicies, etc.
4157         //
4158         UnicodeString resultString;
4159         UnicodeString perlExpr = fields[3];
4160 
4161         while (perlExpr.length() > 0) {
4162             groupsMat->reset(perlExpr);
4163             cgMat->reset(perlExpr);
4164 
4165             if (perlExpr.startsWith("$&")) {
4166                 resultString.append(testMat->group(status));
4167                 perlExpr.remove(0, 2);
4168             }
4169 
4170             else if (groupsMat->lookingAt(status)) {
4171                 // $-[0]   $+[2]  etc.
4172                 UnicodeString digitString = groupsMat->group(2, status);
4173                 int32_t t = 0;
4174                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4175                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4176                 int32_t matchPosition;
4177                 if (plusOrMinus.compare("+") == 0) {
4178                     matchPosition = testMat->end(groupNum, status);
4179                 } else {
4180                     matchPosition = testMat->start(groupNum, status);
4181                 }
4182                 if (matchPosition != -1) {
4183                     ICU_Utility::appendNumber(resultString, matchPosition);
4184                 }
4185                 perlExpr.remove(0, groupsMat->end(status));
4186             }
4187 
4188             else if (cgMat->lookingAt(status)) {
4189                 // $1, $2, $3, etc.
4190                 UnicodeString digitString = cgMat->group(1, status);
4191                 int32_t t = 0;
4192                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4193                 if (U_SUCCESS(status)) {
4194                     resultString.append(testMat->group(groupNum, status));
4195                     status = U_ZERO_ERROR;
4196                 }
4197                 perlExpr.remove(0, cgMat->end(status));
4198             }
4199 
4200             else if (perlExpr.startsWith("@-")) {
4201                 int32_t i;
4202                 for (i=0; i<=testMat->groupCount(); i++) {
4203                     if (i>0) {
4204                         resultString.append(" ");
4205                     }
4206                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4207                 }
4208                 perlExpr.remove(0, 2);
4209             }
4210 
4211             else if (perlExpr.startsWith("@+")) {
4212                 int32_t i;
4213                 for (i=0; i<=testMat->groupCount(); i++) {
4214                     if (i>0) {
4215                         resultString.append(" ");
4216                     }
4217                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4218                 }
4219                 perlExpr.remove(0, 2);
4220             }
4221 
4222             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4223                                                      //           or as an escaped sequence (e.g. \n)
4224                 if (perlExpr.length() > 1) {
4225                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4226                 }
4227                 UChar c = perlExpr.charAt(0);
4228                 switch (c) {
4229                 case 'n':   c = '\n'; break;
4230                 // add any other escape sequences that show up in the test expected results.
4231                 }
4232                 resultString.append(c);
4233                 perlExpr.remove(0, 1);
4234             }
4235 
4236             else  {
4237                 // Any characters from the perl expression that we don't explicitly
4238                 //  recognize before here are assumed to be literals and copied
4239                 //  as-is to the expected results.
4240                 resultString.append(perlExpr.charAt(0));
4241                 perlExpr.remove(0, 1);
4242             }
4243 
4244             if (U_FAILURE(status)) {
4245                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4246                 break;
4247             }
4248         }
4249 
4250         //
4251         // Expected Results Compare
4252         //
4253         UnicodeString expectedS(fields[4]);
4254         expectedS.findAndReplace(nulnulSrc, nulnul);
4255         expectedS.findAndReplace(ffffSrc,   ffff);
4256         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4257 
4258 
4259         if (expectedS.compare(resultString) != 0) {
4260             err("Line %d: Incorrect perl expression results.", lineNum);
4261             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4262         }
4263 
4264         delete testMat;
4265         delete testPat;
4266     }
4267 
4268     //
4269     // All done.  Clean up allocated stuff.
4270     //
4271     delete cgMat;
4272     delete cgPat;
4273 
4274     delete groupsMat;
4275     delete groupsPat;
4276 
4277     delete flagMat;
4278     delete flagPat;
4279 
4280     delete lineMat;
4281     delete linePat;
4282 
4283     delete fieldPat;
4284     delete [] testData;
4285 
4286     utext_close(&patternText);
4287     utext_close(&inputText);
4288 
4289     delete [] patternChars;
4290     delete [] inputChars;
4291 
4292 
4293     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4294 
4295 }
4296 
4297 
4298 //--------------------------------------------------------------
4299 //
4300 //  Bug6149   Verify limits to heap expansion for backtrack stack.
4301 //             Use this pattern,
4302 //                 "(a?){1,}"
4303 //             The zero-length match will repeat forever.
4304 //                (That this goes into a loop is another bug)
4305 //
4306 //---------------------------------------------------------------
Bug6149()4307 void RegexTest::Bug6149() {
4308     UnicodeString pattern("(a?){1,}");
4309     UnicodeString s("xyz");
4310     uint32_t flags = 0;
4311     UErrorCode status = U_ZERO_ERROR;
4312 
4313     RegexMatcher  matcher(pattern, s, flags, status);
4314     UBool result = false;
4315     REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4316     REGEX_ASSERT(result == FALSE);
4317  }
4318 
4319 
4320 //
4321 //   Callbacks()    Test the callback function.
4322 //                  When set, callbacks occur periodically during matching operations,
4323 //                  giving the application code the ability to abort the operation
4324 //                  before its normal completion.
4325 //
4326 
4327 struct callBackContext {
4328     RegexTest        *test;
4329     int32_t          maxCalls;
4330     int32_t          numCalls;
4331     int32_t          lastSteps;
resetcallBackContext4332     void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4333 };
4334 
4335 U_CDECL_BEGIN
4336 static UBool U_CALLCONV
testCallBackFn(const void * context,int32_t steps)4337 testCallBackFn(const void *context, int32_t steps) {
4338     callBackContext  *info = (callBackContext *)context;
4339     if (info->lastSteps+1 != steps) {
4340         info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
4341     }
4342     info->lastSteps = steps;
4343     info->numCalls++;
4344     return (info->numCalls < info->maxCalls);
4345 }
4346 U_CDECL_END
4347 
Callbacks()4348 void RegexTest::Callbacks() {
4349    {
4350         // Getter returns NULLs if no callback has been set
4351 
4352         //   The variables that the getter will fill in.
4353         //   Init to non-null values so that the action of the getter can be seen.
4354         const void          *returnedContext = &returnedContext;
4355         URegexMatchCallback *returnedFn = &testCallBackFn;
4356 
4357         UErrorCode status = U_ZERO_ERROR;
4358         RegexMatcher matcher("x", 0, status);
4359         REGEX_CHECK_STATUS;
4360         matcher.getMatchCallback(returnedFn, returnedContext, status);
4361         REGEX_CHECK_STATUS;
4362         REGEX_ASSERT(returnedFn == NULL);
4363         REGEX_ASSERT(returnedContext == NULL);
4364     }
4365 
4366    {
4367         // Set and Get work
4368         callBackContext cbInfo = {this, 0, 0, 0};
4369         const void          *returnedContext;
4370         URegexMatchCallback *returnedFn;
4371         UErrorCode status = U_ZERO_ERROR;
4372         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
4373         REGEX_CHECK_STATUS;
4374         matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4375         REGEX_CHECK_STATUS;
4376         matcher.getMatchCallback(returnedFn, returnedContext, status);
4377         REGEX_CHECK_STATUS;
4378         REGEX_ASSERT(returnedFn == testCallBackFn);
4379         REGEX_ASSERT(returnedContext == &cbInfo);
4380 
4381         // A short-running match shouldn't invoke the callback
4382         status = U_ZERO_ERROR;
4383         cbInfo.reset(1);
4384         UnicodeString s = "xxx";
4385         matcher.reset(s);
4386         REGEX_ASSERT(matcher.matches(status));
4387         REGEX_CHECK_STATUS;
4388         REGEX_ASSERT(cbInfo.numCalls == 0);
4389 
4390         // A medium-length match that runs long enough to invoke the
4391         //   callback, but not so long that the callback aborts it.
4392         status = U_ZERO_ERROR;
4393         cbInfo.reset(4);
4394         s = "aaaaaaaaaaaaaaaaaaab";
4395         matcher.reset(s);
4396         REGEX_ASSERT(matcher.matches(status)==FALSE);
4397         REGEX_CHECK_STATUS;
4398         REGEX_ASSERT(cbInfo.numCalls > 0);
4399 
4400         // A longer running match that the callback function will abort.
4401         status = U_ZERO_ERROR;
4402         cbInfo.reset(4);
4403         s = "aaaaaaaaaaaaaaaaaaaaaaab";
4404         matcher.reset(s);
4405         REGEX_ASSERT(matcher.matches(status)==FALSE);
4406         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4407         REGEX_ASSERT(cbInfo.numCalls == 4);
4408     }
4409 
4410 
4411 }
4412 
4413 
4414 //---------------------------------------------------------------------------
4415 //
4416 //    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
4417 //                             UTexts. The pure-C implementation of UText
4418 //                             has no mutable backing stores, but we can
4419 //                             use UnicodeString here to test the functionality.
4420 //
4421 //---------------------------------------------------------------------------
PreAllocatedUTextCAPI()4422 void RegexTest::PreAllocatedUTextCAPI () {
4423     UErrorCode           status = U_ZERO_ERROR;
4424     URegularExpression  *re;
4425     UText                patternText = UTEXT_INITIALIZER;
4426     UnicodeString        buffer;
4427     UText                bufferText = UTEXT_INITIALIZER;
4428 
4429     utext_openUnicodeString(&bufferText, &buffer, &status);
4430 
4431     /*
4432      *  getText() and getUText()
4433      */
4434     {
4435         UText  text1 = UTEXT_INITIALIZER;
4436         UText  text2 = UTEXT_INITIALIZER;
4437         UChar  text2Chars[20];
4438         UText  *resultText;
4439 
4440         status = U_ZERO_ERROR;
4441         utext_openUTF8(&text1, "abcccd", -1, &status);
4442         utext_openUTF8(&text2, "abcccxd", -1, &status);
4443         u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
4444         utext_openUChars(&text2, text2Chars, -1, &status);
4445 
4446         utext_openUTF8(&patternText, "abc*d", -1, &status);
4447         re = uregex_openUText(&patternText, 0, NULL, &status);
4448 
4449         /* First set a UText */
4450         uregex_setUText(re, &text1, &status);
4451         resultText = uregex_getUText(re, &bufferText, &status);
4452         REGEX_CHECK_STATUS;
4453         REGEX_ASSERT(resultText == &bufferText);
4454         utext_setNativeIndex(resultText, 0);
4455         utext_setNativeIndex(&text1, 0);
4456         REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
4457 
4458         resultText = uregex_getUText(re, &bufferText, &status);
4459         REGEX_CHECK_STATUS;
4460         REGEX_ASSERT(resultText == &bufferText);
4461         utext_setNativeIndex(resultText, 0);
4462         utext_setNativeIndex(&text1, 0);
4463         REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
4464 
4465         /* Then set a UChar * */
4466         uregex_setText(re, text2Chars, 7, &status);
4467         resultText = uregex_getUText(re, &bufferText, &status);
4468         REGEX_CHECK_STATUS;
4469         REGEX_ASSERT(resultText == &bufferText);
4470         utext_setNativeIndex(resultText, 0);
4471         utext_setNativeIndex(&text2, 0);
4472         REGEX_ASSERT(utext_compare(resultText, -1, &text2, -1) == 0);
4473 
4474         uregex_close(re);
4475         utext_close(&text1);
4476         utext_close(&text2);
4477     }
4478 
4479     /*
4480      *  group()
4481      */
4482     {
4483         UChar    text1[80];
4484         UText   *actual;
4485         UBool    result;
4486         u_uastrncpy(text1, "noise abc interior def, and this is off the end",  sizeof(text1)/2);
4487 
4488         status = U_ZERO_ERROR;
4489         re = uregex_openC("abc(.*?)def", 0, NULL, &status);
4490         REGEX_CHECK_STATUS;
4491 
4492         uregex_setText(re, text1, -1, &status);
4493         result = uregex_find(re, 0, &status);
4494         REGEX_ASSERT(result==TRUE);
4495 
4496         /*  Capture Group 0, the full match.  Should succeed.  */
4497         status = U_ZERO_ERROR;
4498         actual = uregex_groupUText(re, 0, &bufferText, &status);
4499         REGEX_CHECK_STATUS;
4500         REGEX_ASSERT(actual == &bufferText);
4501         REGEX_ASSERT_UTEXT("abc interior def", actual);
4502 
4503         /*  Capture group #1.  Should succeed. */
4504         status = U_ZERO_ERROR;
4505         actual = uregex_groupUText(re, 1, &bufferText, &status);
4506         REGEX_CHECK_STATUS;
4507         REGEX_ASSERT(actual == &bufferText);
4508         REGEX_ASSERT_UTEXT(" interior ", actual);
4509 
4510         /*  Capture group out of range.  Error. */
4511         status = U_ZERO_ERROR;
4512         actual = uregex_groupUText(re, 2, &bufferText, &status);
4513         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
4514         REGEX_ASSERT(actual == &bufferText);
4515 
4516         uregex_close(re);
4517 
4518     }
4519 
4520     /*
4521      *  replaceFirst()
4522      */
4523     {
4524         UChar    text1[80];
4525         UChar    text2[80];
4526         UText    replText = UTEXT_INITIALIZER;
4527         UText   *result;
4528 
4529         status = U_ZERO_ERROR;
4530         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
4531         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
4532         utext_openUTF8(&replText, "<$1>", -1, &status);
4533 
4534         re = uregex_openC("x(.*?)x", 0, NULL, &status);
4535         REGEX_CHECK_STATUS;
4536 
4537         /*  Normal case, with match */
4538         uregex_setText(re, text1, -1, &status);
4539         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4540         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
4541         REGEX_CHECK_STATUS;
4542         REGEX_ASSERT(result == &bufferText);
4543         REGEX_ASSERT_UTEXT("Replace <aa> x1x x...x.", result);
4544 
4545         /* No match.  Text should copy to output with no changes.  */
4546         uregex_setText(re, text2, -1, &status);
4547         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4548         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
4549         REGEX_CHECK_STATUS;
4550         REGEX_ASSERT(result == &bufferText);
4551         REGEX_ASSERT_UTEXT("No match here.", result);
4552 
4553         /* Unicode escapes */
4554         uregex_setText(re, text1, -1, &status);
4555         utext_openUTF8(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status);
4556         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4557         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
4558         REGEX_CHECK_STATUS;
4559         REGEX_ASSERT(result == &bufferText);
4560         REGEX_ASSERT_UTEXT("Replace \\AaaB$a x1x x...x.", result);
4561 
4562         uregex_close(re);
4563         utext_close(&replText);
4564     }
4565 
4566 
4567     /*
4568      *  replaceAll()
4569      */
4570     {
4571         UChar    text1[80];
4572         UChar    text2[80];
4573         UText    replText = UTEXT_INITIALIZER;
4574         UText   *result;
4575 
4576         status = U_ZERO_ERROR;
4577         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
4578         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
4579         utext_openUTF8(&replText, "<$1>", -1, &status);
4580 
4581         re = uregex_openC("x(.*?)x", 0, NULL, &status);
4582         REGEX_CHECK_STATUS;
4583 
4584         /*  Normal case, with match */
4585         uregex_setText(re, text1, -1, &status);
4586         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4587         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
4588         REGEX_CHECK_STATUS;
4589         REGEX_ASSERT(result == &bufferText);
4590         REGEX_ASSERT_UTEXT("Replace <aa> <1> <...>.", result);
4591 
4592         /* No match.  Text should copy to output with no changes.  */
4593         uregex_setText(re, text2, -1, &status);
4594         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4595         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
4596         REGEX_CHECK_STATUS;
4597         REGEX_ASSERT(result == &bufferText);
4598         REGEX_ASSERT_UTEXT("No match here.", result);
4599 
4600         uregex_close(re);
4601         utext_close(&replText);
4602     }
4603 
4604 
4605     /*
4606      *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
4607      *   so we don't need to test it here.
4608      */
4609 
4610     utext_close(&bufferText);
4611     utext_close(&patternText);
4612 }
4613 
4614 //--------------------------------------------------------------
4615 //
4616 //  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
4617 //
4618 //---------------------------------------------------------------
Bug7651()4619 void RegexTest::Bug7651() {
4620     UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
4621     //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
4622     UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
4623     UnicodeString s("#ff @abcd This is test");
4624     RegexPattern  *REPattern = NULL;
4625     RegexMatcher  *REMatcher = NULL;
4626     UErrorCode status = U_ZERO_ERROR;
4627     UParseError pe;
4628 
4629     REPattern = RegexPattern::compile(pattern1, 0, pe, status);
4630     REGEX_CHECK_STATUS;
4631     REMatcher = REPattern->matcher(s, status);
4632     REGEX_CHECK_STATUS;
4633     REGEX_ASSERT(REMatcher->find());
4634     REGEX_ASSERT(REMatcher->start(status) == 0);
4635     delete REPattern;
4636     delete REMatcher;
4637     status = U_ZERO_ERROR;
4638 
4639     REPattern = RegexPattern::compile(pattern2, 0, pe, status);
4640     REGEX_CHECK_STATUS;
4641     REMatcher = REPattern->matcher(s, status);
4642     REGEX_CHECK_STATUS;
4643     REGEX_ASSERT(REMatcher->find());
4644     REGEX_ASSERT(REMatcher->start(status) == 0);
4645     delete REPattern;
4646     delete REMatcher;
4647     status = U_ZERO_ERROR;
4648  }
4649 
4650 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
4651 
4652