• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /********************************************************************
2  * COPYRIGHT:
3  * Copyright (c) 2002-2011, International Business Machines Corporation and
4  * others. All Rights Reserved.
5  ********************************************************************/
6 
7 //
8 //   regextst.cpp
9 //
10 //      ICU Regular Expressions test, part of intltest.
11 //
12 
13 /*
14      NOTE!!
15 
16      PLEASE be careful about ASCII assumptions in this test.
17      This test is one of the worst repeat offenders.
18      If you have questions, contact someone on the ICU PMC
19      who has access to an EBCDIC system.
20 
21  */
22 
23 #include "intltest.h"
24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
25 
26 #include "unicode/regex.h"
27 #include "unicode/uchar.h"
28 #include "unicode/ucnv.h"
29 #include "unicode/ustring.h"
30 #include "regextst.h"
31 #include "uvector.h"
32 #include "util.h"
33 #include <stdlib.h>
34 #include <string.h>
35 #include <stdio.h>
36 #include "cstring.h"
37 #include "uinvchar.h"
38 
39 #define SUPPORT_MUTATING_INPUT_STRING   0
40 
41 //---------------------------------------------------------------------------
42 //
43 //  Test class boilerplate
44 //
45 //---------------------------------------------------------------------------
RegexTest()46 RegexTest::RegexTest()
47 {
48 }
49 
50 
~RegexTest()51 RegexTest::~RegexTest()
52 {
53 }
54 
55 
56 
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)57 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
58 {
59     if (exec) logln("TestSuite RegexTest: ");
60     switch (index) {
61 
62         case 0: name = "Basic";
63             if (exec) Basic();
64             break;
65         case 1: name = "API_Match";
66             if (exec) API_Match();
67             break;
68         case 2: name = "API_Replace";
69             if (exec) API_Replace();
70             break;
71         case 3: name = "API_Pattern";
72             if (exec) API_Pattern();
73             break;
74         case 4:
75 #if !UCONFIG_NO_FILE_IO
76             name = "Extended";
77             if (exec) Extended();
78 #else
79             name = "skip";
80 #endif
81             break;
82         case 5: name = "Errors";
83             if (exec) Errors();
84             break;
85         case 6: name = "PerlTests";
86             if (exec) PerlTests();
87             break;
88         case 7: name = "Callbacks";
89             if (exec) Callbacks();
90             break;
91         case 8: name = "FindProgressCallbacks";
92             if (exec) FindProgressCallbacks();
93             break;
94         case 9: name = "Bug 6149";
95              if (exec) Bug6149();
96              break;
97         case 10: name = "UTextBasic";
98           if (exec) UTextBasic();
99           break;
100         case 11: name = "API_Match_UTF8";
101           if (exec) API_Match_UTF8();
102           break;
103         case 12: name = "API_Replace_UTF8";
104           if (exec) API_Replace_UTF8();
105           break;
106         case 13: name = "API_Pattern_UTF8";
107           if (exec) API_Pattern_UTF8();
108           break;
109         case 14: name = "PerlTestsUTF8";
110           if (exec) PerlTestsUTF8();
111           break;
112         case 15: name = "PreAllocatedUTextCAPI";
113           if (exec) PreAllocatedUTextCAPI();
114           break;
115         case 16: name = "Bug 7651";
116              if (exec) Bug7651();
117              break;
118         case 17: name = "Bug 7740";
119             if (exec) Bug7740();
120             break;
121         case 18: name = "Bug 8479";
122             if (exec) Bug8479();
123             break;
124         case 19: name = "Bug 7029";
125             if (exec) Bug7029();
126             break;
127         case 20: name = "CheckInvBufSize";
128             if (exec) CheckInvBufSize();
129             break;
130 
131         default: name = "";
132             break; //needed to end loop
133     }
134 }
135 
136 
137 
138 /**
139  * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
140  * into ASCII.
141  * @see utext_openUTF8
142  */
143 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
144 
145 //---------------------------------------------------------------------------
146 //
147 //   Error Checking / Reporting macros used in all of the tests.
148 //
149 //---------------------------------------------------------------------------
150 
utextToPrintable(char * buf,int32_t bufLen,UText * text)151 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
152   int64_t oldIndex = utext_getNativeIndex(text);
153   utext_setNativeIndex(text, 0);
154   char *bufPtr = buf;
155   UChar32 c = utext_next32From(text, 0);
156   while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
157     if (0x000020<=c && c<0x00007e) {
158       *bufPtr = c;
159     } else {
160 #if 0
161       sprintf(bufPtr,"U+%04X", c);
162       bufPtr+= strlen(bufPtr)-1;
163 #else
164       *bufPtr = '%';
165 #endif
166     }
167     bufPtr++;
168     c = UTEXT_NEXT32(text);
169   }
170   *bufPtr = 0;
171 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
172   char *ebuf = (char*)malloc(bufLen);
173   uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
174   uprv_strncpy(buf, ebuf, bufLen);
175   free((void*)ebuf);
176 #endif
177   utext_setNativeIndex(text, oldIndex);
178 }
179 
toHex(int32_t i)180 static inline UChar toHex(int32_t i) {
181     return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10)));
182 }
183 
/**
 * Appends an escaped copy of s to result and returns result.
 *
 * Code units in the range 0x01..0x7F are appended unchanged; every other
 * code unit (including 0) is appended as a backslash, 'u', and four
 * upper-case hex digits.
 *
 * @param s       the string to escape.
 * @param result  the string that the escaped text is appended to.
 * @return        a reference to result.
 */
static UnicodeString& escape(const UnicodeString& s, UnicodeString& result) {
    int32_t pos = 0;
    while (pos < s.length()) {
        const UChar unit = s[pos];
        ++pos;
        if ((unit > 0) && (unit <= (UChar)0x7F)) {
            result += unit;
            continue;
        }
        result += (UChar)0x5c;   // backslash
        result += (UChar)0x75;   // 'u'
        // Emit the four nibbles, most significant first.
        for (int32_t shift = 12; shift >= 0; shift -= 4) {
            result += toHex((unit >> shift) & 0xF);
        }
    }
    return result;
}
200 
201 static char ASSERT_BUF[1024];
202 
extractToAssertBuf(const UnicodeString & message)203 static const char* extractToAssertBuf(const UnicodeString& message) {
204   if(message.length()==0) {
205     strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
206   } else {
207     UnicodeString buf;
208     escape(message, buf);
209     if(buf.length()==0) {
210       strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
211     } else {
212       buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
213       if(ASSERT_BUF[0]==0) {
214         ASSERT_BUF[0]=0;
215         for(int32_t i=0;i<buf.length();i++) {
216           UChar ch = buf[i];
217           sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
218         }
219       }
220     }
221   }
222   ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
223   return ASSERT_BUF;
224 }
225 
226 
// Logs a printable rendering of a UText, tagged with the call site and the
// spelling of the argument expression.
#define REGEX_VERBOSE_TEXT(text) { \
    char buf[200]; \
    utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text); \
    logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf); \
}

// Reports a data error and returns from the enclosing (void) function when
// the in-scope variable `status` holds a failure code.
#define REGEX_CHECK_STATUS { \
    if (U_FAILURE(status)) { \
        dataerrln("%s:%d: RegexTest failure.  status=%s", \
                  __FILE__, __LINE__, u_errorName(status)); \
        return; \
    } \
}

// Reports an error, without returning, when expr evaluates to FALSE.
#define REGEX_ASSERT(expr) { \
    if ((expr)==FALSE) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr); \
    } \
}

// Evaluates expr with a fresh local `status` (shadowing any outer one) and
// reports a data error unless that status ends up equal to errcode.
#define REGEX_ASSERT_FAIL(expr, errcode) { \
    UErrorCode status=U_ZERO_ERROR; \
    (expr); \
    if (status!=errcode) { \
        dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
                  __LINE__, u_errorName(errcode), u_errorName(status)); \
    } \
}

// Like REGEX_CHECK_STATUS, but also reports a caller-supplied line number
// and does not return.
#define REGEX_CHECK_STATUS_L(line) { \
    if (U_FAILURE(status)) { \
        errln("RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); \
    } \
}

// Like REGEX_ASSERT, but reports a caller-supplied line number and returns
// from the enclosing (void) function on failure.
#define REGEX_ASSERT_L(expr, line) { \
    if ((expr)==FALSE) { \
        errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
        return; \
    } \
}

// Reports an error when ustr does not compare equal to inv; the message
// shows an escaped rendering of ustr alongside inv.
#define REGEX_ASSERT_UNISTR(ustr,inv) { \
    if (!(ustr==inv)) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s,%s) failed \n", __FILE__, __LINE__, extractToAssertBuf(ustr),inv); \
    } \
}
245 
246 /**
247  * @param expected expected text in UTF-8 (not platform) codepage
248  */
assertUText(const char * expected,UText * actual,const char * file,int line)249 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
250     UErrorCode status = U_ZERO_ERROR;
251     UText expectedText = UTEXT_INITIALIZER;
252     utext_openUTF8(&expectedText, expected, -1, &status);
253     if(U_FAILURE(status)) {
254       errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
255       return;
256     }
257     if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
258       errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
259       return;
260     }
261     utext_setNativeIndex(actual, 0);
262     if (utext_compare(&expectedText, -1, actual, -1) != 0) {
263         char buf[201 /*21*/];
264         char expectedBuf[201];
265         utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
266         utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
267         errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
268     }
269     utext_close(&expectedText);
270 }
271 /**
272  * @param expected invariant (platform local text) input
273  */
274 
assertUTextInvariant(const char * expected,UText * actual,const char * file,int line)275 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
276     UErrorCode status = U_ZERO_ERROR;
277     UText expectedText = UTEXT_INITIALIZER;
278     regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
279     if(U_FAILURE(status)) {
280       errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
281       return;
282     }
283     utext_setNativeIndex(actual, 0);
284     if (utext_compare(&expectedText, -1, actual, -1) != 0) {
285         char buf[201 /*21*/];
286         char expectedBuf[201];
287         utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
288         utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
289         errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
290     }
291     utext_close(&expectedText);
292 }
293 
294 /**
295  * Assumes utf-8 input
296  */
297 #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
298 /**
299  * Assumes Invariant input
300  */
301 #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
302 
303 /**
304  * This buffer ( inv_buf ) is used to hold the UTF-8 strings
305  * passed into utext_openUTF8. An error will be given if
306  * INV_BUFSIZ is too small.  It's only used on EBCDIC systems.
307  */
308 
309 #define INV_BUFSIZ 2048 /* increase this if too small */
310 
311 static int32_t inv_next=0;
312 
313 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
314 static char inv_buf[INV_BUFSIZ];
315 #endif
316 
regextst_openUTF8FromInvariant(UText * ut,const char * inv,int64_t length,UErrorCode * status)317 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
318   if(length==-1) length=strlen(inv);
319 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
320   inv_next+=length;
321   return utext_openUTF8(ut, inv, length, status);
322 #else
323   if(inv_next+length+1>INV_BUFSIZ) {
324     fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
325             __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
326     *status = U_MEMORY_ALLOCATION_ERROR;
327     return NULL;
328   }
329 
330   unsigned char *buf = (unsigned char*)inv_buf+inv_next;
331   uprv_aestrncpy(buf, (const uint8_t*)inv, length);
332   inv_next+=length;
333 
334 #if 0
335   fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
336 #endif
337 
338   return utext_openUTF8(ut, (const char*)buf, length, status);
339 #endif
340 }
341 
342 
343 //---------------------------------------------------------------------------
344 //
345 //    REGEX_TESTLM       Macro + invocation function to simplify writing quick tests
346 //                       for the LookingAt() and  Match() functions.
347 //
348 //       usage:
349 //          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
350 //
351 //          The expected results are UBool - TRUE or FALSE.
352 //          The input text is unescaped.  The pattern is not.
353 //
354 //
355 //---------------------------------------------------------------------------
356 
357 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
358 
doRegexLMTest(const char * pat,const char * text,UBool looking,UBool match,int32_t line)359 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
360     const UnicodeString pattern(pat, -1, US_INV);
361     const UnicodeString inputText(text, -1, US_INV);
362     UErrorCode          status  = U_ZERO_ERROR;
363     UParseError         pe;
364     RegexPattern        *REPattern = NULL;
365     RegexMatcher        *REMatcher = NULL;
366     UBool               retVal     = TRUE;
367 
368     UnicodeString patString(pat, -1, US_INV);
369     REPattern = RegexPattern::compile(patString, 0, pe, status);
370     if (U_FAILURE(status)) {
371         dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
372             line, u_errorName(status));
373         return FALSE;
374     }
375     if (line==376) { RegexPatternDump(REPattern);}
376 
377     UnicodeString inputString(inputText);
378     UnicodeString unEscapedInput = inputString.unescape();
379     REMatcher = REPattern->matcher(unEscapedInput, status);
380     if (U_FAILURE(status)) {
381         errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
382             line, u_errorName(status));
383         return FALSE;
384     }
385 
386     UBool actualmatch;
387     actualmatch = REMatcher->lookingAt(status);
388     if (U_FAILURE(status)) {
389         errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
390             line, u_errorName(status));
391         retVal =  FALSE;
392     }
393     if (actualmatch != looking) {
394         errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
395         retVal = FALSE;
396     }
397 
398     status = U_ZERO_ERROR;
399     actualmatch = REMatcher->matches(status);
400     if (U_FAILURE(status)) {
401         errln("RegexTest failure in matches() at line %d.  Status = %s\n",
402             line, u_errorName(status));
403         retVal = FALSE;
404     }
405     if (actualmatch != match) {
406         errln("RegexTest: wrong return from matches() at line %d.\n", line);
407         retVal = FALSE;
408     }
409 
410     if (retVal == FALSE) {
411         RegexPatternDump(REPattern);
412     }
413 
414     delete REPattern;
415     delete REMatcher;
416     return retVal;
417 }
418 
419 
doRegexLMTestUTF8(const char * pat,const char * text,UBool looking,UBool match,int32_t line)420 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
421     UText               pattern    = UTEXT_INITIALIZER;
422     int32_t             inputUTF8Length;
423     char                *textChars = NULL;
424     UText               inputText  = UTEXT_INITIALIZER;
425     UErrorCode          status     = U_ZERO_ERROR;
426     UParseError         pe;
427     RegexPattern        *REPattern = NULL;
428     RegexMatcher        *REMatcher = NULL;
429     UBool               retVal     = TRUE;
430 
431     regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
432     REPattern = RegexPattern::compile(&pattern, 0, pe, status);
433     if (U_FAILURE(status)) {
434         dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
435             line, u_errorName(status));
436         return FALSE;
437     }
438 
439     UnicodeString inputString(text, -1, US_INV);
440     UnicodeString unEscapedInput = inputString.unescape();
441     LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
442     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
443 
444     inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
445     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
446         // UTF-8 does not allow unpaired surrogates, so this could actually happen
447         logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
448         return TRUE; // not a failure of the Regex engine
449     }
450     status = U_ZERO_ERROR; // buffer overflow
451     textChars = new char[inputUTF8Length+1];
452     unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
453     utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
454 
455     REMatcher = &REPattern->matcher(status)->reset(&inputText);
456     if (U_FAILURE(status)) {
457         errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
458             line, u_errorName(status));
459         return FALSE;
460     }
461 
462     UBool actualmatch;
463     actualmatch = REMatcher->lookingAt(status);
464     if (U_FAILURE(status)) {
465         errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
466             line, u_errorName(status));
467         retVal =  FALSE;
468     }
469     if (actualmatch != looking) {
470         errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
471         retVal = FALSE;
472     }
473 
474     status = U_ZERO_ERROR;
475     actualmatch = REMatcher->matches(status);
476     if (U_FAILURE(status)) {
477         errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
478             line, u_errorName(status));
479         retVal = FALSE;
480     }
481     if (actualmatch != match) {
482         errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
483         retVal = FALSE;
484     }
485 
486     if (retVal == FALSE) {
487         RegexPatternDump(REPattern);
488     }
489 
490     delete REPattern;
491     delete REMatcher;
492     utext_close(&inputText);
493     utext_close(&pattern);
494     delete[] textChars;
495     return retVal;
496 }
497 
498 
499 
500 //---------------------------------------------------------------------------
501 //
//    REGEX_ERR       Macro + invocation function to simplify writing
//                       regex tests for incorrect patterns
504 //
505 //       usage:
506 //          REGEX_ERR("pattern",   expected error line, column, expected status);
507 //
508 //---------------------------------------------------------------------------
509 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
510 
regex_err(const char * pat,int32_t errLine,int32_t errCol,UErrorCode expectedStatus,int32_t line)511 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
512                           UErrorCode expectedStatus, int32_t line) {
513     UnicodeString       pattern(pat);
514 
515     UErrorCode          status         = U_ZERO_ERROR;
516     UParseError         pe;
517     RegexPattern        *callerPattern = NULL;
518 
519     //
520     //  Compile the caller's pattern
521     //
522     UnicodeString patString(pat);
523     callerPattern = RegexPattern::compile(patString, 0, pe, status);
524     if (status != expectedStatus) {
525         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
526     } else {
527         if (status != U_ZERO_ERROR) {
528             if (pe.line != errLine || pe.offset != errCol) {
529                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
530                     line, errLine, errCol, pe.line, pe.offset);
531             }
532         }
533     }
534 
535     delete callerPattern;
536 
537     //
538     //  Compile again, using a UTF-8-based UText
539     //
540     UText patternText = UTEXT_INITIALIZER;
541     regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
542     callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
543     if (status != expectedStatus) {
544         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
545     } else {
546         if (status != U_ZERO_ERROR) {
547             if (pe.line != errLine || pe.offset != errCol) {
548                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
549                     line, errLine, errCol, pe.line, pe.offset);
550             }
551         }
552     }
553 
554     delete callerPattern;
555     utext_close(&patternText);
556 }
557 
558 
559 
560 //---------------------------------------------------------------------------
561 //
562 //      Basic      Check for basic functionality of regex pattern matching.
563 //                 Avoid the use of REGEX_FIND test macro, which has
564 //                 substantial dependencies on basic Regex functionality.
565 //
566 //---------------------------------------------------------------------------
Basic()567 void RegexTest::Basic() {
568 
569 
570 //
571 // Debug - slide failing test cases early
572 //
573 #if 0
574     {
575         // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
576         UParseError pe;
577         UErrorCode  status = U_ZERO_ERROR;
578         RegexPattern::compile("^(?:a?b?)*$", 0, pe, status);
579         // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
580         // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
581     }
582     exit(1);
583 #endif
584 
585 
586     //
587     // Pattern with parentheses
588     //
589     REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
590     REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
591     REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);
592 
593     //
594     // Patterns with *
595     //
596     REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
597     REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
598     REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
599     REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
600     REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
601 
602     REGEX_TESTLM("a*", "",  TRUE, TRUE);
603     REGEX_TESTLM("a*", "b", TRUE, FALSE);
604 
605 
606     //
607     //  Patterns with "."
608     //
609     REGEX_TESTLM(".", "abc", TRUE, FALSE);
610     REGEX_TESTLM("...", "abc", TRUE, TRUE);
611     REGEX_TESTLM("....", "abc", FALSE, FALSE);
612     REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
613     REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
614     REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
615     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
616     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
617 
618     //
619     //  Patterns with * applied to chars at end of literal string
620     //
621     REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
622     REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
623 
624     //
625     //  Supplemental chars match as single chars, not a pair of surrogates.
626     //
627     REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
628     REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
629     REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
630 
631 
632     //
633     //  UnicodeSets in the pattern
634     //
635     REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
636     REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
637     REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
638     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
639     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
640     REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
641 
642     REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
643     REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
644     REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
645     REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurences.
646     REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);
647 
648     //
649     //   OR operator in patterns
650     //
651     REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
652     REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
653     REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
654     REGEX_TESTLM("a|b", "b", TRUE, TRUE);
655 
656     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
657     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
658     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
659     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
660     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
661     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
662 
663     //
664     //  +
665     //
666     REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
667     REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
668     REGEX_TESTLM("b+", "", FALSE, FALSE);
669     REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
670     REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
671     REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
672 
673     //
674     //   ?
675     //
676     REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
677     REGEX_TESTLM("ab?", "a", TRUE, TRUE);
678     REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
679     REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
680     REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
681     REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
682     REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
683     REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
684     REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
685 
686     //
687     //  Escape sequences that become single literal chars, handled internally
688     //   by ICU's Unescape.
689     //
690 
691     // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
692     REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
693     REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
694     REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
695     REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
696     REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
697     REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
698     REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
699     REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
700     REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
701 
702     REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
703     REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input
704 
705     // Escape of special chars in patterns
706     REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
707 }
708 
709 
710 //---------------------------------------------------------------------------
711 //
712 //    UTextBasic   Check for quirks that are specific to the UText
713 //                 implementation.
714 //
715 //---------------------------------------------------------------------------
UTextBasic()716 void RegexTest::UTextBasic() {
717     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
718     UErrorCode status = U_ZERO_ERROR;
719     UText pattern = UTEXT_INITIALIZER;
720     utext_openUTF8(&pattern, str_abc, -1, &status);
721     RegexMatcher matcher(&pattern, 0, status);
722     REGEX_CHECK_STATUS;
723 
724     UText input = UTEXT_INITIALIZER;
725     utext_openUTF8(&input, str_abc, -1, &status);
726     REGEX_CHECK_STATUS;
727     matcher.reset(&input);
728     REGEX_CHECK_STATUS;
729     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
730 
731     matcher.reset(matcher.inputText());
732     REGEX_CHECK_STATUS;
733     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
734 
735     utext_close(&pattern);
736     utext_close(&input);
737 }
738 
739 
740 //---------------------------------------------------------------------------
741 //
742 //      API_Match   Test that the API for class RegexMatcher
743 //                  is present and nominally working, but excluding functions
744 //                  implementing replace operations.
745 //
746 //---------------------------------------------------------------------------
API_Match()747 void RegexTest::API_Match() {
748     UParseError         pe;
749     UErrorCode          status=U_ZERO_ERROR;
750     int32_t             flags = 0;
751 
752     //
753     // Debug - slide failing test cases early
754     //
755 #if 0
756     {
757     }
758     return;
759 #endif
760 
761     //
762     // Simple pattern compilation
763     //
764     {
765         UnicodeString       re("abc");
766         RegexPattern        *pat2;
767         pat2 = RegexPattern::compile(re, flags, pe, status);
768         REGEX_CHECK_STATUS;
769 
770         UnicodeString inStr1 = "abcdef this is a test";
771         UnicodeString instr2 = "not abc";
772         UnicodeString empty  = "";
773 
774 
775         //
776         // Matcher creation and reset.
777         //
778         RegexMatcher *m1 = pat2->matcher(inStr1, status);
779         REGEX_CHECK_STATUS;
780         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
781         REGEX_ASSERT(m1->input() == inStr1);
782         m1->reset(instr2);
783         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
784         REGEX_ASSERT(m1->input() == instr2);
785         m1->reset(inStr1);
786         REGEX_ASSERT(m1->input() == inStr1);
787         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
788         m1->reset(empty);
789         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
790         REGEX_ASSERT(m1->input() == empty);
791         REGEX_ASSERT(&m1->pattern() == pat2);
792 
793         //
794         //  reset(pos, status)
795         //
796         m1->reset(inStr1);
797         m1->reset(4, status);
798         REGEX_CHECK_STATUS;
799         REGEX_ASSERT(m1->input() == inStr1);
800         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
801 
802         m1->reset(-1, status);
803         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
804         status = U_ZERO_ERROR;
805 
806         m1->reset(0, status);
807         REGEX_CHECK_STATUS;
808         status = U_ZERO_ERROR;
809 
810         int32_t len = m1->input().length();
811         m1->reset(len-1, status);
812         REGEX_CHECK_STATUS;
813         status = U_ZERO_ERROR;
814 
815         m1->reset(len, status);
816         REGEX_CHECK_STATUS;
817         status = U_ZERO_ERROR;
818 
819         m1->reset(len+1, status);
820         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
821         status = U_ZERO_ERROR;
822 
823         //
824         // match(pos, status)
825         //
826         m1->reset(instr2);
827         REGEX_ASSERT(m1->matches(4, status) == TRUE);
828         m1->reset();
829         REGEX_ASSERT(m1->matches(3, status) == FALSE);
830         m1->reset();
831         REGEX_ASSERT(m1->matches(5, status) == FALSE);
832         REGEX_ASSERT(m1->matches(4, status) == TRUE);
833         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
834         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
835 
836         // Match() at end of string should fail, but should not
837         //  be an error.
838         status = U_ZERO_ERROR;
839         len = m1->input().length();
840         REGEX_ASSERT(m1->matches(len, status) == FALSE);
841         REGEX_CHECK_STATUS;
842 
843         // Match beyond end of string should fail with an error.
844         status = U_ZERO_ERROR;
845         REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
846         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
847 
848         // Successful match at end of string.
849         {
850             status = U_ZERO_ERROR;
851             RegexMatcher m("A?", 0, status);  // will match zero length string.
852             REGEX_CHECK_STATUS;
853             m.reset(inStr1);
854             len = inStr1.length();
855             REGEX_ASSERT(m.matches(len, status) == TRUE);
856             REGEX_CHECK_STATUS;
857             m.reset(empty);
858             REGEX_ASSERT(m.matches(0, status) == TRUE);
859             REGEX_CHECK_STATUS;
860         }
861 
862 
863         //
864         // lookingAt(pos, status)
865         //
866         status = U_ZERO_ERROR;
867         m1->reset(instr2);  // "not abc"
868         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
869         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
870         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
871         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
872         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
873         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
874         status = U_ZERO_ERROR;
875         len = m1->input().length();
876         REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
877         REGEX_CHECK_STATUS;
878         REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
879         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
880 
881         delete m1;
882         delete pat2;
883     }
884 
885 
886     //
887     // Capture Group.
888     //     RegexMatcher::start();
889     //     RegexMatcher::end();
890     //     RegexMatcher::groupCount();
891     //
892     {
893         int32_t             flags=0;
894         UParseError         pe;
895         UErrorCode          status=U_ZERO_ERROR;
896 
897         UnicodeString       re("01(23(45)67)(.*)");
898         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
899         REGEX_CHECK_STATUS;
900         UnicodeString data = "0123456789";
901 
902         RegexMatcher *matcher = pat->matcher(data, status);
903         REGEX_CHECK_STATUS;
904         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
905         static const int32_t matchStarts[] = {0,  2, 4, 8};
906         static const int32_t matchEnds[]   = {10, 8, 6, 10};
907         int32_t i;
908         for (i=0; i<4; i++) {
909             int32_t actualStart = matcher->start(i, status);
910             REGEX_CHECK_STATUS;
911             if (actualStart != matchStarts[i]) {
912                 errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
913                     __LINE__, i, matchStarts[i], actualStart);
914             }
915             int32_t actualEnd = matcher->end(i, status);
916             REGEX_CHECK_STATUS;
917             if (actualEnd != matchEnds[i]) {
918                 errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
919                     __LINE__, i, matchEnds[i], actualEnd);
920             }
921         }
922 
923         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
924         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
925 
926         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
927         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
928         matcher->reset();
929         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
930 
931         matcher->lookingAt(status);
932         REGEX_ASSERT(matcher->group(status)    == "0123456789");
933         REGEX_ASSERT(matcher->group(0, status) == "0123456789");
934         REGEX_ASSERT(matcher->group(1, status) == "234567"    );
935         REGEX_ASSERT(matcher->group(2, status) == "45"        );
936         REGEX_ASSERT(matcher->group(3, status) == "89"        );
937         REGEX_CHECK_STATUS;
938         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
939         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
940         matcher->reset();
941         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
942 
943         delete matcher;
944         delete pat;
945 
946     }
947 
948     //
949     //  find
950     //
951     {
952         int32_t             flags=0;
953         UParseError         pe;
954         UErrorCode          status=U_ZERO_ERROR;
955 
956         UnicodeString       re("abc");
957         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
958         REGEX_CHECK_STATUS;
959         UnicodeString data = ".abc..abc...abc..";
960         //                    012345678901234567
961 
962         RegexMatcher *matcher = pat->matcher(data, status);
963         REGEX_CHECK_STATUS;
964         REGEX_ASSERT(matcher->find());
965         REGEX_ASSERT(matcher->start(status) == 1);
966         REGEX_ASSERT(matcher->find());
967         REGEX_ASSERT(matcher->start(status) == 6);
968         REGEX_ASSERT(matcher->find());
969         REGEX_ASSERT(matcher->start(status) == 12);
970         REGEX_ASSERT(matcher->find() == FALSE);
971         REGEX_ASSERT(matcher->find() == FALSE);
972 
973         matcher->reset();
974         REGEX_ASSERT(matcher->find());
975         REGEX_ASSERT(matcher->start(status) == 1);
976 
977         REGEX_ASSERT(matcher->find(0, status));
978         REGEX_ASSERT(matcher->start(status) == 1);
979         REGEX_ASSERT(matcher->find(1, status));
980         REGEX_ASSERT(matcher->start(status) == 1);
981         REGEX_ASSERT(matcher->find(2, status));
982         REGEX_ASSERT(matcher->start(status) == 6);
983         REGEX_ASSERT(matcher->find(12, status));
984         REGEX_ASSERT(matcher->start(status) == 12);
985         REGEX_ASSERT(matcher->find(13, status) == FALSE);
986         REGEX_ASSERT(matcher->find(16, status) == FALSE);
987         REGEX_ASSERT(matcher->find(17, status) == FALSE);
988         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
989 
990         status = U_ZERO_ERROR;
991         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
992         status = U_ZERO_ERROR;
993         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
994 
995         REGEX_ASSERT(matcher->groupCount() == 0);
996 
997         delete matcher;
998         delete pat;
999     }
1000 
1001 
1002     //
1003     //  find, with \G in pattern (true if at the end of a previous match).
1004     //
1005     {
1006         int32_t             flags=0;
1007         UParseError         pe;
1008         UErrorCode          status=U_ZERO_ERROR;
1009 
1010         UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
1011         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1012         REGEX_CHECK_STATUS;
1013         UnicodeString data = ".abcabc.abc..";
1014         //                    012345678901234567
1015 
1016         RegexMatcher *matcher = pat->matcher(data, status);
1017         REGEX_CHECK_STATUS;
1018         REGEX_ASSERT(matcher->find());
1019         REGEX_ASSERT(matcher->start(status) == 0);
1020         REGEX_ASSERT(matcher->start(1, status) == -1);
1021         REGEX_ASSERT(matcher->start(2, status) == 1);
1022 
1023         REGEX_ASSERT(matcher->find());
1024         REGEX_ASSERT(matcher->start(status) == 4);
1025         REGEX_ASSERT(matcher->start(1, status) == 4);
1026         REGEX_ASSERT(matcher->start(2, status) == -1);
1027         REGEX_CHECK_STATUS;
1028 
1029         delete matcher;
1030         delete pat;
1031     }
1032 
1033     //
1034     //   find with zero length matches, match position should bump ahead
1035     //     to prevent loops.
1036     //
1037     {
1038         int32_t                 i;
1039         UErrorCode          status=U_ZERO_ERROR;
1040         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
1041                                                       //   using an always-true look-ahead.
1042         REGEX_CHECK_STATUS;
1043         UnicodeString s("    ");
1044         m.reset(s);
1045         for (i=0; ; i++) {
1046             if (m.find() == FALSE) {
1047                 break;
1048             }
1049             REGEX_ASSERT(m.start(status) == i);
1050             REGEX_ASSERT(m.end(status) == i);
1051         }
1052         REGEX_ASSERT(i==5);
1053 
1054         // Check that the bump goes over surrogate pairs OK
1055         s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1056         s = s.unescape();
1057         m.reset(s);
1058         for (i=0; ; i+=2) {
1059             if (m.find() == FALSE) {
1060                 break;
1061             }
1062             REGEX_ASSERT(m.start(status) == i);
1063             REGEX_ASSERT(m.end(status) == i);
1064         }
1065         REGEX_ASSERT(i==10);
1066     }
1067     {
1068         // find() loop breaking test.
1069         //        with pattern of /.?/, should see a series of one char matches, then a single
1070         //        match of zero length at the end of the input string.
1071         int32_t                 i;
1072         UErrorCode          status=U_ZERO_ERROR;
1073         RegexMatcher        m(".?", 0, status);
1074         REGEX_CHECK_STATUS;
1075         UnicodeString s("    ");
1076         m.reset(s);
1077         for (i=0; ; i++) {
1078             if (m.find() == FALSE) {
1079                 break;
1080             }
1081             REGEX_ASSERT(m.start(status) == i);
1082             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1083         }
1084         REGEX_ASSERT(i==5);
1085     }
1086 
1087 
1088     //
1089     // Matchers with no input string behave as if they had an empty input string.
1090     //
1091 
1092     {
1093         UErrorCode status = U_ZERO_ERROR;
1094         RegexMatcher  m(".?", 0, status);
1095         REGEX_CHECK_STATUS;
1096         REGEX_ASSERT(m.find());
1097         REGEX_ASSERT(m.start(status) == 0);
1098         REGEX_ASSERT(m.input() == "");
1099     }
1100     {
1101         UErrorCode status = U_ZERO_ERROR;
1102         RegexPattern  *p = RegexPattern::compile(".", 0, status);
1103         RegexMatcher  *m = p->matcher(status);
1104         REGEX_CHECK_STATUS;
1105 
1106         REGEX_ASSERT(m->find() == FALSE);
1107         REGEX_ASSERT(m->input() == "");
1108         delete m;
1109         delete p;
1110     }
1111 
1112     //
1113     // Regions
1114     //
1115     {
1116         UErrorCode status = U_ZERO_ERROR;
1117         UnicodeString testString("This is test data");
1118         RegexMatcher m(".*", testString,  0, status);
1119         REGEX_CHECK_STATUS;
1120         REGEX_ASSERT(m.regionStart() == 0);
1121         REGEX_ASSERT(m.regionEnd() == testString.length());
1122         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1123         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1124 
1125         m.region(2,4, status);
1126         REGEX_CHECK_STATUS;
1127         REGEX_ASSERT(m.matches(status));
1128         REGEX_ASSERT(m.start(status)==2);
1129         REGEX_ASSERT(m.end(status)==4);
1130         REGEX_CHECK_STATUS;
1131 
1132         m.reset();
1133         REGEX_ASSERT(m.regionStart() == 0);
1134         REGEX_ASSERT(m.regionEnd() == testString.length());
1135 
1136         UnicodeString shorterString("short");
1137         m.reset(shorterString);
1138         REGEX_ASSERT(m.regionStart() == 0);
1139         REGEX_ASSERT(m.regionEnd() == shorterString.length());
1140 
1141         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1142         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1143         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1144         REGEX_ASSERT(&m == &m.reset());
1145         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1146 
1147         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1148         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1149         REGEX_ASSERT(&m == &m.reset());
1150         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1151 
1152         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1153         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1154         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1155         REGEX_ASSERT(&m == &m.reset());
1156         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1157 
1158         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1159         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1160         REGEX_ASSERT(&m == &m.reset());
1161         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1162 
1163     }
1164 
1165     //
1166     // hitEnd() and requireEnd()
1167     //
1168     {
1169         UErrorCode status = U_ZERO_ERROR;
1170         UnicodeString testString("aabb");
1171         RegexMatcher m1(".*", testString,  0, status);
1172         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1173         REGEX_ASSERT(m1.hitEnd() == TRUE);
1174         REGEX_ASSERT(m1.requireEnd() == FALSE);
1175         REGEX_CHECK_STATUS;
1176 
1177         status = U_ZERO_ERROR;
1178         RegexMatcher m2("a*", testString, 0, status);
1179         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1180         REGEX_ASSERT(m2.hitEnd() == FALSE);
1181         REGEX_ASSERT(m2.requireEnd() == FALSE);
1182         REGEX_CHECK_STATUS;
1183 
1184         status = U_ZERO_ERROR;
1185         RegexMatcher m3(".*$", testString, 0, status);
1186         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1187         REGEX_ASSERT(m3.hitEnd() == TRUE);
1188         REGEX_ASSERT(m3.requireEnd() == TRUE);
1189         REGEX_CHECK_STATUS;
1190     }
1191 
1192 
1193     //
1194     // Compilation error on reset with UChar *
1195     //   These were a hazard that people were stumbling over with runtime errors.
1196     //   Changed them to compiler errors by adding private methods that more closely
1197     //   matched the incorrect use of the functions.
1198     //
1199 #if 0
1200     {
1201         UErrorCode status = U_ZERO_ERROR;
1202         UChar ucharString[20];
1203         RegexMatcher m(".", 0, status);
1204         m.reset(ucharString);  // should not compile.
1205 
1206         RegexPattern *p = RegexPattern::compile(".", 0, status);
1207         RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
1208 
1209         RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
1210     }
1211 #endif
1212 
1213     //
1214     //  Time Outs.
1215     //       Note:  These tests will need to be changed when the regexp engine is
1216     //              able to detect and cut short the exponential time behavior on
1217     //              this type of match.
1218     //
1219     {
1220         UErrorCode status = U_ZERO_ERROR;
1221         //    Enough 'a's in the string to cause the match to time out.
1222         //       (Each additional 'a' doubles the time)
1223         UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1224         RegexMatcher matcher("(a+)+b", testString, 0, status);
1225         REGEX_CHECK_STATUS;
1226         REGEX_ASSERT(matcher.getTimeLimit() == 0);
1227         matcher.setTimeLimit(100, status);
1228         REGEX_ASSERT(matcher.getTimeLimit() == 100);
1229         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1230         REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1231     }
1232     {
1233         UErrorCode status = U_ZERO_ERROR;
1234         //   Few enough 'a's to slip in under the time limit.
1235         UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1236         RegexMatcher matcher("(a+)+b", testString, 0, status);
1237         REGEX_CHECK_STATUS;
1238         matcher.setTimeLimit(100, status);
1239         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1240         REGEX_CHECK_STATUS;
1241     }
1242 
1243     //
1244     //  Stack Limits
1245     //
1246     {
1247         UErrorCode status = U_ZERO_ERROR;
1248         UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
1249 
1250         // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1251         //   of the '+', and makes the stack frames larger.
1252         RegexMatcher matcher("(A)+A$", testString, 0, status);
1253 
1254         // With the default stack, this match should fail to run
1255         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1256         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1257 
1258         // With unlimited stack, it should run
1259         status = U_ZERO_ERROR;
1260         matcher.setStackLimit(0, status);
1261         REGEX_CHECK_STATUS;
1262         REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1263         REGEX_CHECK_STATUS;
1264         REGEX_ASSERT(matcher.getStackLimit() == 0);
1265 
1266         // With a limited stack, the match should fail
1267         status = U_ZERO_ERROR;
1268         matcher.setStackLimit(10000, status);
1269         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1270         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1271         REGEX_ASSERT(matcher.getStackLimit() == 10000);
1272     }
1273 
1274         // A pattern that doesn't save state should work with
1275         //   a minimal sized stack
1276     {
1277         UErrorCode status = U_ZERO_ERROR;
1278         UnicodeString testString = "abc";
1279         RegexMatcher matcher("abc", testString, 0, status);
1280         REGEX_CHECK_STATUS;
1281         matcher.setStackLimit(30, status);
1282         REGEX_CHECK_STATUS;
1283         REGEX_ASSERT(matcher.matches(status) == TRUE);
1284         REGEX_CHECK_STATUS;
1285         REGEX_ASSERT(matcher.getStackLimit() == 30);
1286 
1287         // Negative stack sizes should fail
1288         status = U_ZERO_ERROR;
1289         matcher.setStackLimit(1000, status);
1290         REGEX_CHECK_STATUS;
1291         matcher.setStackLimit(-1, status);
1292         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1293         REGEX_ASSERT(matcher.getStackLimit() == 1000);
1294     }
1295 
1296 
1297 }
1298 
1299 
1300 
1301 
1302 
1303 
1304 //---------------------------------------------------------------------------
1305 //
1306 //      API_Replace        API test for class RegexMatcher, testing the
1307 //                         Replace family of functions.
1308 //
1309 //---------------------------------------------------------------------------
API_Replace()1310 void RegexTest::API_Replace() {
1311     //
1312     //  Replace
1313     //
1314     int32_t             flags=0;
1315     UParseError         pe;
1316     UErrorCode          status=U_ZERO_ERROR;
1317 
1318     UnicodeString       re("abc");
1319     RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1320     REGEX_CHECK_STATUS;
1321     UnicodeString data = ".abc..abc...abc..";
1322     //                    012345678901234567
1323     RegexMatcher *matcher = pat->matcher(data, status);
1324 
1325     //
1326     //  Plain vanilla matches.
1327     //
1328     UnicodeString  dest;
1329     dest = matcher->replaceFirst("yz", status);
1330     REGEX_CHECK_STATUS;
1331     REGEX_ASSERT(dest == ".yz..abc...abc..");
1332 
1333     dest = matcher->replaceAll("yz", status);
1334     REGEX_CHECK_STATUS;
1335     REGEX_ASSERT(dest == ".yz..yz...yz..");
1336 
1337     //
1338     //  Plain vanilla non-matches.
1339     //
1340     UnicodeString d2 = ".abx..abx...abx..";
1341     matcher->reset(d2);
1342     dest = matcher->replaceFirst("yz", status);
1343     REGEX_CHECK_STATUS;
1344     REGEX_ASSERT(dest == ".abx..abx...abx..");
1345 
1346     dest = matcher->replaceAll("yz", status);
1347     REGEX_CHECK_STATUS;
1348     REGEX_ASSERT(dest == ".abx..abx...abx..");
1349 
1350     //
1351     // Empty source string
1352     //
1353     UnicodeString d3 = "";
1354     matcher->reset(d3);
1355     dest = matcher->replaceFirst("yz", status);
1356     REGEX_CHECK_STATUS;
1357     REGEX_ASSERT(dest == "");
1358 
1359     dest = matcher->replaceAll("yz", status);
1360     REGEX_CHECK_STATUS;
1361     REGEX_ASSERT(dest == "");
1362 
1363     //
1364     // Empty substitution string
1365     //
1366     matcher->reset(data);              // ".abc..abc...abc.."
1367     dest = matcher->replaceFirst("", status);
1368     REGEX_CHECK_STATUS;
1369     REGEX_ASSERT(dest == "...abc...abc..");
1370 
1371     dest = matcher->replaceAll("", status);
1372     REGEX_CHECK_STATUS;
1373     REGEX_ASSERT(dest == "........");
1374 
1375     //
1376     // match whole string
1377     //
1378     UnicodeString d4 = "abc";
1379     matcher->reset(d4);
1380     dest = matcher->replaceFirst("xyz", status);
1381     REGEX_CHECK_STATUS;
1382     REGEX_ASSERT(dest == "xyz");
1383 
1384     dest = matcher->replaceAll("xyz", status);
1385     REGEX_CHECK_STATUS;
1386     REGEX_ASSERT(dest == "xyz");
1387 
1388     //
1389     // Capture Group, simple case
1390     //
1391     UnicodeString       re2("a(..)");
1392     RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1393     REGEX_CHECK_STATUS;
1394     UnicodeString d5 = "abcdefg";
1395     RegexMatcher *matcher2 = pat2->matcher(d5, status);
1396     REGEX_CHECK_STATUS;
1397     dest = matcher2->replaceFirst("$1$1", status);
1398     REGEX_CHECK_STATUS;
1399     REGEX_ASSERT(dest == "bcbcdefg");
1400 
1401     dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1402     REGEX_CHECK_STATUS;
1403     REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1404 
1405     dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1406     REGEX_CHECK_STATUS;
1407     REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
1408 
1409     UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1410     replacement = replacement.unescape();
1411     dest = matcher2->replaceFirst(replacement, status);
1412     REGEX_CHECK_STATUS;
1413     REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1414 
1415     REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1416 
1417 
1418     //
1419     // Replacement String with \u hex escapes
1420     //
1421     {
1422         UnicodeString  src = "abc 1 abc 2 abc 3";
1423         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1424         matcher->reset(src);
1425         UnicodeString  result = matcher->replaceAll(substitute, status);
1426         REGEX_CHECK_STATUS;
1427         REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1428     }
1429     {
1430         UnicodeString  src = "abc !";
1431         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1432         matcher->reset(src);
1433         UnicodeString  result = matcher->replaceAll(substitute, status);
1434         REGEX_CHECK_STATUS;
1435         UnicodeString expected = UnicodeString("--");
1436         expected.append((UChar32)0x10000);
1437         expected.append("-- !");
1438         REGEX_ASSERT(result == expected);
1439     }
1440     // TODO:  need more through testing of capture substitutions.
1441 
1442     // Bug 4057
1443     //
1444     {
1445         status = U_ZERO_ERROR;
1446         UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1447         RegexMatcher m("ss(.*?)ee", 0, status);
1448         REGEX_CHECK_STATUS;
1449         UnicodeString result;
1450 
1451         // Multiple finds do NOT bump up the previous appendReplacement postion.
1452         m.reset(s);
1453         m.find();
1454         m.find();
1455         m.appendReplacement(result, "ooh", status);
1456         REGEX_CHECK_STATUS;
1457         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1458 
1459         // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1460         status = U_ZERO_ERROR;
1461         result.truncate(0);
1462         m.reset(10, status);
1463         m.find();
1464         m.find();
1465         m.appendReplacement(result, "ooh", status);
1466         REGEX_CHECK_STATUS;
1467         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1468 
1469         // find() at interior of string, appendReplacemnt still starts at beginning.
1470         status = U_ZERO_ERROR;
1471         result.truncate(0);
1472         m.reset();
1473         m.find(10, status);
1474         m.find();
1475         m.appendReplacement(result, "ooh", status);
1476         REGEX_CHECK_STATUS;
1477         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1478 
1479         m.appendTail(result);
1480         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1481 
1482     }
1483 
1484     delete matcher2;
1485     delete pat2;
1486     delete matcher;
1487     delete pat;
1488 }
1489 
1490 
1491 //---------------------------------------------------------------------------
1492 //
1493 //      API_Pattern       Test that the API for class RegexPattern is
1494 //                        present and nominally working.
1495 //
1496 //---------------------------------------------------------------------------
API_Pattern()1497 void RegexTest::API_Pattern() {
1498     RegexPattern        pata;    // Test default constructor to not crash.
1499     RegexPattern        patb;
1500 
1501     REGEX_ASSERT(pata == patb);
1502     REGEX_ASSERT(pata == pata);
1503 
1504     UnicodeString re1("abc[a-l][m-z]");
1505     UnicodeString re2("def");
1506     UErrorCode    status = U_ZERO_ERROR;
1507     UParseError   pe;
1508 
1509     RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
1510     RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
1511     REGEX_CHECK_STATUS;
1512     REGEX_ASSERT(*pat1 == *pat1);
1513     REGEX_ASSERT(*pat1 != pata);
1514 
1515     // Assign
1516     patb = *pat1;
1517     REGEX_ASSERT(patb == *pat1);
1518 
1519     // Copy Construct
1520     RegexPattern patc(*pat1);
1521     REGEX_ASSERT(patc == *pat1);
1522     REGEX_ASSERT(patb == patc);
1523     REGEX_ASSERT(pat1 != pat2);
1524     patb = *pat2;
1525     REGEX_ASSERT(patb != patc);
1526     REGEX_ASSERT(patb == *pat2);
1527 
1528     // Compile with no flags.
1529     RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
1530     REGEX_ASSERT(*pat1a == *pat1);
1531 
1532     REGEX_ASSERT(pat1a->flags() == 0);
1533 
1534     // Compile with different flags should be not equal
1535     RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1536     REGEX_CHECK_STATUS;
1537 
1538     REGEX_ASSERT(*pat1b != *pat1a);
1539     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1540     REGEX_ASSERT(pat1a->flags() == 0);
1541     delete pat1b;
1542 
1543     // clone
1544     RegexPattern *pat1c = pat1->clone();
1545     REGEX_ASSERT(*pat1c == *pat1);
1546     REGEX_ASSERT(*pat1c != *pat2);
1547 
1548     delete pat1c;
1549     delete pat1a;
1550     delete pat1;
1551     delete pat2;
1552 
1553 
1554     //
1555     //   Verify that a matcher created from a cloned pattern works.
1556     //     (Jitterbug 3423)
1557     //
1558     {
1559         UErrorCode     status     = U_ZERO_ERROR;
1560         RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1561         RegexPattern  *pClone     = pSource->clone();
1562         delete         pSource;
1563         RegexMatcher  *mFromClone = pClone->matcher(status);
1564         REGEX_CHECK_STATUS;
1565         UnicodeString s = "Hello World";
1566         mFromClone->reset(s);
1567         REGEX_ASSERT(mFromClone->find() == TRUE);
1568         REGEX_ASSERT(mFromClone->group(status) == "Hello");
1569         REGEX_ASSERT(mFromClone->find() == TRUE);
1570         REGEX_ASSERT(mFromClone->group(status) == "World");
1571         REGEX_ASSERT(mFromClone->find() == FALSE);
1572         delete mFromClone;
1573         delete pClone;
1574     }
1575 
1576     //
1577     //   matches convenience API
1578     //
1579     REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1580     REGEX_CHECK_STATUS;
1581     REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1582     REGEX_CHECK_STATUS;
1583     REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1584     REGEX_CHECK_STATUS;
1585     REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1586     REGEX_CHECK_STATUS;
1587     REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1588     REGEX_CHECK_STATUS;
1589     status = U_INDEX_OUTOFBOUNDS_ERROR;
1590     REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1591     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1592 
1593 
1594     //
1595     // Split()
1596     //
1597     status = U_ZERO_ERROR;
1598     pat1 = RegexPattern::compile(" +",  pe, status);
1599     REGEX_CHECK_STATUS;
1600     UnicodeString  fields[10];
1601 
1602     int32_t n;
1603     n = pat1->split("Now is the time", fields, 10, status);
1604     REGEX_CHECK_STATUS;
1605     REGEX_ASSERT(n==4);
1606     REGEX_ASSERT(fields[0]=="Now");
1607     REGEX_ASSERT(fields[1]=="is");
1608     REGEX_ASSERT(fields[2]=="the");
1609     REGEX_ASSERT(fields[3]=="time");
1610     REGEX_ASSERT(fields[4]=="");
1611 
1612     n = pat1->split("Now is the time", fields, 2, status);
1613     REGEX_CHECK_STATUS;
1614     REGEX_ASSERT(n==2);
1615     REGEX_ASSERT(fields[0]=="Now");
1616     REGEX_ASSERT(fields[1]=="is the time");
1617     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
1618 
1619     fields[1] = "*";
1620     status = U_ZERO_ERROR;
1621     n = pat1->split("Now is the time", fields, 1, status);
1622     REGEX_CHECK_STATUS;
1623     REGEX_ASSERT(n==1);
1624     REGEX_ASSERT(fields[0]=="Now is the time");
1625     REGEX_ASSERT(fields[1]=="*");
1626     status = U_ZERO_ERROR;
1627 
1628     n = pat1->split("    Now       is the time   ", fields, 10, status);
1629     REGEX_CHECK_STATUS;
1630     REGEX_ASSERT(n==6);
1631     REGEX_ASSERT(fields[0]=="");
1632     REGEX_ASSERT(fields[1]=="Now");
1633     REGEX_ASSERT(fields[2]=="is");
1634     REGEX_ASSERT(fields[3]=="the");
1635     REGEX_ASSERT(fields[4]=="time");
1636     REGEX_ASSERT(fields[5]=="");
1637 
1638     n = pat1->split("     ", fields, 10, status);
1639     REGEX_CHECK_STATUS;
1640     REGEX_ASSERT(n==2);
1641     REGEX_ASSERT(fields[0]=="");
1642     REGEX_ASSERT(fields[1]=="");
1643 
1644     fields[0] = "foo";
1645     n = pat1->split("", fields, 10, status);
1646     REGEX_CHECK_STATUS;
1647     REGEX_ASSERT(n==0);
1648     REGEX_ASSERT(fields[0]=="foo");
1649 
1650     delete pat1;
1651 
1652     //  split, with a pattern with (capture)
1653     pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
1654     REGEX_CHECK_STATUS;
1655 
1656     status = U_ZERO_ERROR;
1657     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1658     REGEX_CHECK_STATUS;
1659     REGEX_ASSERT(n==7);
1660     REGEX_ASSERT(fields[0]=="");
1661     REGEX_ASSERT(fields[1]=="a");
1662     REGEX_ASSERT(fields[2]=="Now is ");
1663     REGEX_ASSERT(fields[3]=="b");
1664     REGEX_ASSERT(fields[4]=="the time");
1665     REGEX_ASSERT(fields[5]=="c");
1666     REGEX_ASSERT(fields[6]=="");
1667     REGEX_ASSERT(status==U_ZERO_ERROR);
1668 
1669     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
1670     REGEX_CHECK_STATUS;
1671     REGEX_ASSERT(n==7);
1672     REGEX_ASSERT(fields[0]=="  ");
1673     REGEX_ASSERT(fields[1]=="a");
1674     REGEX_ASSERT(fields[2]=="Now is ");
1675     REGEX_ASSERT(fields[3]=="b");
1676     REGEX_ASSERT(fields[4]=="the time");
1677     REGEX_ASSERT(fields[5]=="c");
1678     REGEX_ASSERT(fields[6]=="");
1679 
1680     status = U_ZERO_ERROR;
1681     fields[6] = "foo";
1682     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
1683     REGEX_CHECK_STATUS;
1684     REGEX_ASSERT(n==6);
1685     REGEX_ASSERT(fields[0]=="  ");
1686     REGEX_ASSERT(fields[1]=="a");
1687     REGEX_ASSERT(fields[2]=="Now is ");
1688     REGEX_ASSERT(fields[3]=="b");
1689     REGEX_ASSERT(fields[4]=="the time");
1690     REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
1691     REGEX_ASSERT(fields[6]=="foo");
1692 
1693     status = U_ZERO_ERROR;
1694     fields[5] = "foo";
1695     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
1696     REGEX_CHECK_STATUS;
1697     REGEX_ASSERT(n==5);
1698     REGEX_ASSERT(fields[0]=="  ");
1699     REGEX_ASSERT(fields[1]=="a");
1700     REGEX_ASSERT(fields[2]=="Now is ");
1701     REGEX_ASSERT(fields[3]=="b");
1702     REGEX_ASSERT(fields[4]=="the time<c>");
1703     REGEX_ASSERT(fields[5]=="foo");
1704 
1705     status = U_ZERO_ERROR;
1706     fields[5] = "foo";
1707     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
1708     REGEX_CHECK_STATUS;
1709     REGEX_ASSERT(n==5);
1710     REGEX_ASSERT(fields[0]=="  ");
1711     REGEX_ASSERT(fields[1]=="a");
1712     REGEX_ASSERT(fields[2]=="Now is ");
1713     REGEX_ASSERT(fields[3]=="b");
1714     REGEX_ASSERT(fields[4]=="the time");
1715     REGEX_ASSERT(fields[5]=="foo");
1716 
1717     status = U_ZERO_ERROR;
1718     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
1719     REGEX_CHECK_STATUS;
1720     REGEX_ASSERT(n==4);
1721     REGEX_ASSERT(fields[0]=="  ");
1722     REGEX_ASSERT(fields[1]=="a");
1723     REGEX_ASSERT(fields[2]=="Now is ");
1724     REGEX_ASSERT(fields[3]=="the time<c>");
1725     status = U_ZERO_ERROR;
1726     delete pat1;
1727 
1728     pat1 = RegexPattern::compile("([-,])",  pe, status);
1729     REGEX_CHECK_STATUS;
1730     n = pat1->split("1-10,20", fields, 10, status);
1731     REGEX_CHECK_STATUS;
1732     REGEX_ASSERT(n==5);
1733     REGEX_ASSERT(fields[0]=="1");
1734     REGEX_ASSERT(fields[1]=="-");
1735     REGEX_ASSERT(fields[2]=="10");
1736     REGEX_ASSERT(fields[3]==",");
1737     REGEX_ASSERT(fields[4]=="20");
1738     delete pat1;
1739 
1740     // Test split of string with empty trailing fields
1741     pat1 = RegexPattern::compile(",", pe, status);
1742     REGEX_CHECK_STATUS;
1743     n = pat1->split("a,b,c,", fields, 10, status);
1744     REGEX_CHECK_STATUS;
1745     REGEX_ASSERT(n==4);
1746     REGEX_ASSERT(fields[0]=="a");
1747     REGEX_ASSERT(fields[1]=="b");
1748     REGEX_ASSERT(fields[2]=="c");
1749     REGEX_ASSERT(fields[3]=="");
1750 
1751     n = pat1->split("a,,,", fields, 10, status);
1752     REGEX_CHECK_STATUS;
1753     REGEX_ASSERT(n==4);
1754     REGEX_ASSERT(fields[0]=="a");
1755     REGEX_ASSERT(fields[1]=="");
1756     REGEX_ASSERT(fields[2]=="");
1757     REGEX_ASSERT(fields[3]=="");
1758     delete pat1;
1759 
1760     // Split Separator with zero length match.
1761     pat1 = RegexPattern::compile(":?", pe, status);
1762     REGEX_CHECK_STATUS;
1763     n = pat1->split("abc", fields, 10, status);
1764     REGEX_CHECK_STATUS;
1765     REGEX_ASSERT(n==5);
1766     REGEX_ASSERT(fields[0]=="");
1767     REGEX_ASSERT(fields[1]=="a");
1768     REGEX_ASSERT(fields[2]=="b");
1769     REGEX_ASSERT(fields[3]=="c");
1770     REGEX_ASSERT(fields[4]=="");
1771 
1772     delete pat1;
1773 
1774     //
1775     // RegexPattern::pattern()
1776     //
1777     pat1 = new RegexPattern();
1778     REGEX_ASSERT(pat1->pattern() == "");
1779     delete pat1;
1780 
1781     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1782     REGEX_CHECK_STATUS;
1783     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1784     delete pat1;
1785 
1786 
1787     //
1788     // classID functions
1789     //
1790     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1791     REGEX_CHECK_STATUS;
1792     REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1793     REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1794     UnicodeString Hello("Hello, world.");
1795     RegexMatcher *m = pat1->matcher(Hello, status);
1796     REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1797     REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1798     REGEX_ASSERT(m->getDynamicClassID() != NULL);
1799     delete m;
1800     delete pat1;
1801 
1802 }
1803 
1804 //---------------------------------------------------------------------------
1805 //
1806 //      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
1807 //                       is present and working, but excluding functions
1808 //                       implementing replace operations.
1809 //
1810 //---------------------------------------------------------------------------
API_Match_UTF8()1811 void RegexTest::API_Match_UTF8() {
    // Exercises the UText-based entry points of RegexPattern / RegexMatcher:
    // pattern compilation, reset(), matches(), lookingAt(), capture groups,
    // find(), regions and hitEnd()/requireEnd().  No replace functions are
    // called here.
    //
    // The body is a sequence of independent braced sections.  Most of them open
    // their own UText objects and close them with utext_close() at the end of
    // the section.  The REGEX_* macros used throughout are defined elsewhere in
    // this file.
    //
    // Most strings handed to utext_openUTF8() are spelled as explicit byte
    // arrays - presumably so that the bytes are UTF-8 regardless of the
    // compiler's execution character set (see the ASCII/EBCDIC note at the top
    // of the file); confirm before replacing them with plain literals.
1812     UParseError         pe;
1813     UErrorCode          status=U_ZERO_ERROR;
1814     int32_t             flags = 0;
1815 
1816     //
1817     // Debug aid - disabled by the "#if 0".  When enabled, only the code placed
1818     //   inside the braces runs, because the function returns right after it.
1819 #if 0
1820     {
1821     }
1822     return;
1823 #endif
1824 
1825     //
1826     // Simple pattern compilation
1827     //
1828     {
1829         UText               re = UTEXT_INITIALIZER;
1830         regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1831         REGEX_VERBOSE_TEXT(&re);
1832         RegexPattern        *pat2;
1833         pat2 = RegexPattern::compile(&re, flags, pe, status);
1834         REGEX_CHECK_STATUS;
1835 
1836         UText input1 = UTEXT_INITIALIZER;
1837         UText input2 = UTEXT_INITIALIZER;
1838         UText empty  = UTEXT_INITIALIZER;
1839         regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1840         REGEX_VERBOSE_TEXT(&input1);
1841         regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1842         REGEX_VERBOSE_TEXT(&input2);
1843         utext_openUChars(&empty, NULL, 0, &status);
1844 
        // Lengths of the two literals; used below as the end-of-input index
        // passed to reset(), matches() and lookingAt().
1845         int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1846         int32_t input2Len = strlen("not abc");
1847 
1848 
1849         //
1850         // Matcher creation and reset.
1851         //
        // matcher(status) returns a pointer; reset(&input1) attaches the text
        // and returns the matcher by reference, hence the leading '&'.
1852         RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1853         REGEX_CHECK_STATUS;
1854         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1855         const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1856         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1857         m1->reset(&input2);
1858         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1859         const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1860         REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1861         m1->reset(&input1);
1862         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1863         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1864         m1->reset(&empty);
1865         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1866         REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1867 
1868         //
1869         //  reset(pos, status)
1870         //    Positions 0 .. input1Len are accepted; -1 and input1Len+1 must report
        //    U_INDEX_OUTOFBOUNDS_ERROR.
1871         m1->reset(&input1);
1872         m1->reset(4, status);
1873         REGEX_CHECK_STATUS;
1874         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
        // lookingAt() without a position is still expected to match after
        // reset(4) - presumably because it starts from the beginning of the
        // input rather than from the reset position; confirm against the API.
1875         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1876 
1877         m1->reset(-1, status);
1878         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1879         status = U_ZERO_ERROR;
1880 
1881         m1->reset(0, status);
1882         REGEX_CHECK_STATUS;
1883         status = U_ZERO_ERROR;
1884 
1885         m1->reset(input1Len-1, status);
1886         REGEX_CHECK_STATUS;
1887         status = U_ZERO_ERROR;
1888 
1889         m1->reset(input1Len, status);
1890         REGEX_CHECK_STATUS;
1891         status = U_ZERO_ERROR;
1892 
1893         m1->reset(input1Len+1, status);
1894         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1895         status = U_ZERO_ERROR;
1896 
1897         //
1898         // match(pos, status)
1899         //   input2 is "not abc": the only "abc" starts at index 4 and ends the text.
1900         m1->reset(&input2);
1901         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1902         m1->reset();
1903         REGEX_ASSERT(m1->matches(3, status) == FALSE);
1904         m1->reset();
1905         REGEX_ASSERT(m1->matches(5, status) == FALSE);
1906         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1907         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1908         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1909 
1910         // Match() at end of string should fail, but should not
1911         //  be an error.
1912         status = U_ZERO_ERROR;
1913         REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1914         REGEX_CHECK_STATUS;
1915 
1916         // Match beyond end of string should fail with an error.
1917         status = U_ZERO_ERROR;
1918         REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1919         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1920 
1921         // Successful match at end of string.
1922         {
1923             status = U_ZERO_ERROR;
1924             RegexMatcher m("A?", 0, status);  // will match zero length string.
1925             REGEX_CHECK_STATUS;
1926             m.reset(&input1);
1927             REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1928             REGEX_CHECK_STATUS;
1929             m.reset(&empty);
1930             REGEX_ASSERT(m.matches(0, status) == TRUE);
1931             REGEX_CHECK_STATUS;
1932         }
1933 
1934 
1935         //
1936         // lookingAt(pos, status)
1937         //   Same position checks as for matches() above, again on "not abc".
1938         status = U_ZERO_ERROR;
1939         m1->reset(&input2);  // "not abc"
1940         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1941         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1942         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1943         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1944         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1945         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1946         status = U_ZERO_ERROR;
1947         REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1948         REGEX_CHECK_STATUS;
1949         REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1950         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1951 
1952         delete m1;
1953         delete pat2;
1954 
1955         utext_close(&re);
1956         utext_close(&input1);
1957         utext_close(&input2);
1958         utext_close(&empty);
1959     }
1960 
1961 
1962     //
1963     // Capture Group.
1964     //     RegexMatcher::start();
1965     //     RegexMatcher::end();
1966     //     RegexMatcher::groupCount();
1967     //
1968     {
        // These locals shadow the function-level flags / pe / status.
1969         int32_t             flags=0;
1970         UParseError         pe;
1971         UErrorCode          status=U_ZERO_ERROR;
1972         UText               re=UTEXT_INITIALIZER;
1973         const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1974         utext_openUTF8(&re, str_01234567_pat, -1, &status);
1975 
1976         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1977         REGEX_CHECK_STATUS;
1978 
1979         UText input = UTEXT_INITIALIZER;
1980         const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1981         utext_openUTF8(&input, str_0123456789, -1, &status);
1982 
1983         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
1984         REGEX_CHECK_STATUS;
1985         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
        // Expected [start, end) per group on "0123456789":
        //   0 = whole match, 1 = "234567", 2 = "45", 3 = "89".
1986         static const int32_t matchStarts[] = {0,  2, 4, 8};
1987         static const int32_t matchEnds[]   = {10, 8, 6, 10};
1988         int32_t i;
1989         for (i=0; i<4; i++) {
1990             int32_t actualStart = matcher->start(i, status);
1991             REGEX_CHECK_STATUS;
1992             if (actualStart != matchStarts[i]) {
1993                 errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
1994                       __FILE__, __LINE__, i, matchStarts[i], actualStart);
1995             }
1996             int32_t actualEnd = matcher->end(i, status);
1997             REGEX_CHECK_STATUS;
1998             if (actualEnd != matchEnds[i]) {
1999                 errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
2000                       __FILE__, __LINE__, i, matchEnds[i], actualEnd);
2001             }
2002         }
2003 
        // start()/end() without a group number must agree with group 0.
2004         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
2005         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
2006 
        // Group numbers outside 0..3 are index errors; asking for a group
        // after reset() (no match attempted yet) is an invalid-state error.
2007         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2008         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2009         matcher->reset();
2010         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
2011 
        // Match again so that the group() calls below have a match to report.
2012         matcher->lookingAt(status);
2013 
2014         UnicodeString dest;
2015         UText destText = UTEXT_INITIALIZER;
2016         utext_openUnicodeString(&destText, &dest, &status);
2017         UText *result;
2018         //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2019         //  Test shallow-clone API.  With a NULL destination the returned UText is
        //  a separate object that this test closes itself; with &destText the
        //  call is expected to return &destText.
2020         int64_t   group_len;
2021         result = matcher->group((UText *)NULL, group_len, status);
2022         REGEX_CHECK_STATUS;
2023         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2024         utext_close(result);
2025         result = matcher->group(0, &destText, group_len, status);
2026         REGEX_CHECK_STATUS;
2027         REGEX_ASSERT(result == &destText);
2028         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2029         //  destText is now immutable, reopen it
2030         utext_close(&destText);
2031         utext_openUnicodeString(&destText, &dest, &status);
2032 
        // For each group 0..3: fetch once into a fresh UText (NULL destination)
        // and once into destText, checking the extracted text both times.
2033         result = matcher->group(0, NULL, status);
2034         REGEX_CHECK_STATUS;
2035         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2036         utext_close(result);
2037         result = matcher->group(0, &destText, status);
2038         REGEX_CHECK_STATUS;
2039         REGEX_ASSERT(result == &destText);
2040         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2041 
2042         result = matcher->group(1, NULL, status);
2043         REGEX_CHECK_STATUS;
2044         const char str_234567[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */
2045         REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
2046         utext_close(result);
2047         result = matcher->group(1, &destText, status);
2048         REGEX_CHECK_STATUS;
2049         REGEX_ASSERT(result == &destText);
2050         REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
2051 
2052         result = matcher->group(2, NULL, status);
2053         REGEX_CHECK_STATUS;
2054         const char str_45[] = { 0x34, 0x35, 0x00 }; /* 45 */
2055         REGEX_ASSERT_UTEXT_UTF8(str_45, result);
2056         utext_close(result);
2057         result = matcher->group(2, &destText, status);
2058         REGEX_CHECK_STATUS;
2059         REGEX_ASSERT(result == &destText);
2060         REGEX_ASSERT_UTEXT_UTF8(str_45, result);
2061 
2062         result = matcher->group(3, NULL, status);
2063         REGEX_CHECK_STATUS;
2064         const char str_89[] = { 0x38, 0x39, 0x00 }; /* 89 */
2065         REGEX_ASSERT_UTEXT_UTF8(str_89, result);
2066         utext_close(result);
2067         result = matcher->group(3, &destText, status);
2068         REGEX_CHECK_STATUS;
2069         REGEX_ASSERT(result == &destText);
2070         REGEX_ASSERT_UTEXT_UTF8(str_89, result);
2071 
        // Same error expectations as for start() above.
2072         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2073         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2074         matcher->reset();
2075         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2076 
2077         delete matcher;
2078         delete pat;
2079 
2080         utext_close(&destText);
2081         utext_close(&input);
2082         utext_close(&re);
2083     }
2084 
2085     //
2086     //  find
2087     //
2088     {
2089         int32_t             flags=0;
2090         UParseError         pe;
2091         UErrorCode          status=U_ZERO_ERROR;
2092         UText               re=UTEXT_INITIALIZER;
2093         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2094         utext_openUTF8(&re, str_abc, -1, &status);
2095 
2096         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2097         REGEX_CHECK_STATUS;
2098         UText input = UTEXT_INITIALIZER;
2099         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2100         utext_openUTF8(&input, str_abcabcabc, -1, &status);
2101         // The input is 17 bytes long (indexes 0..16); "abc" occurs at indexes 1, 6 and 12.
2102 
2103         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2104         REGEX_CHECK_STATUS;
2105         REGEX_ASSERT(matcher->find());
2106         REGEX_ASSERT(matcher->start(status) == 1);
2107         REGEX_ASSERT(matcher->find());
2108         REGEX_ASSERT(matcher->start(status) == 6);
2109         REGEX_ASSERT(matcher->find());
2110         REGEX_ASSERT(matcher->start(status) == 12);
2111         REGEX_ASSERT(matcher->find() == FALSE);
2112         REGEX_ASSERT(matcher->find() == FALSE);
2113 
2114         matcher->reset();
2115         REGEX_ASSERT(matcher->find());
2116         REGEX_ASSERT(matcher->start(status) == 1);
2117 
        // find(pos, status): the search starts at pos, so the reported start is
        // the first occurrence of "abc" at or after pos.
2118         REGEX_ASSERT(matcher->find(0, status));
2119         REGEX_ASSERT(matcher->start(status) == 1);
2120         REGEX_ASSERT(matcher->find(1, status));
2121         REGEX_ASSERT(matcher->start(status) == 1);
2122         REGEX_ASSERT(matcher->find(2, status));
2123         REGEX_ASSERT(matcher->start(status) == 6);
2124         REGEX_ASSERT(matcher->find(12, status));
2125         REGEX_ASSERT(matcher->start(status) == 12);
2126         REGEX_ASSERT(matcher->find(13, status) == FALSE);
2127         REGEX_ASSERT(matcher->find(16, status) == FALSE);
        // 17 is the end of the input: no match there, but not an index error.
2128         REGEX_ASSERT(matcher->find(17, status) == FALSE);
2129         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2130 
2131         status = U_ZERO_ERROR;
2132         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2133         status = U_ZERO_ERROR;
2134         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2135 
2136         REGEX_ASSERT(matcher->groupCount() == 0);
2137 
2138         delete matcher;
2139         delete pat;
2140 
2141         utext_close(&input);
2142         utext_close(&re);
2143     }
2144 
2145 
2146     //
2147     //  find, with \G in pattern (true if at the end of a previous match).
2148     //
2149     {
2150         int32_t             flags=0;
2151         UParseError         pe;
2152         UErrorCode          status=U_ZERO_ERROR;
2153         UText               re=UTEXT_INITIALIZER;
2154         const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2155         utext_openUTF8(&re, str_Gabcabc, -1, &status);
2156 
2157         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2158 
2159         REGEX_CHECK_STATUS;
2160         UText input = UTEXT_INITIALIZER;
2161         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2162         utext_openUTF8(&input, str_abcabcabc, -1, &status);
2163         // In this input the first "abc" is at index 1 and the second follows it directly at index 4.
2164 
2165         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2166         REGEX_CHECK_STATUS;
        // First find(): the match starts at 0 and the plain (abc) alternative
        // (group 2) takes part at index 1; group 1 (\Gabc) is unset (-1).
2167         REGEX_ASSERT(matcher->find());
2168         REGEX_ASSERT(matcher->start(status) == 0);
2169         REGEX_ASSERT(matcher->start(1, status) == -1);
2170         REGEX_ASSERT(matcher->start(2, status) == 1);
2171 
        // Second find(): starts where the first match ended (index 4), so this
        // time the \Gabc alternative (group 1) matches and group 2 is unset.
2172         REGEX_ASSERT(matcher->find());
2173         REGEX_ASSERT(matcher->start(status) == 4);
2174         REGEX_ASSERT(matcher->start(1, status) == 4);
2175         REGEX_ASSERT(matcher->start(2, status) == -1);
2176         REGEX_CHECK_STATUS;
2177 
2178         delete matcher;
2179         delete pat;
2180 
2181         utext_close(&input);
2182         utext_close(&re);
2183     }
2184 
2185     //
2186     //   find with zero length matches, match position should bump ahead
2187     //     to prevent loops.
2188     //
2189     {
2190         int32_t                 i;
2191         UErrorCode          status=U_ZERO_ERROR;
2192         RegexMatcher        m("(?= ?)", 0, status);   // This pattern produces zero-length matches anywhere,
2193                                                       //   using an always-true look-ahead.
2194         REGEX_CHECK_STATUS;
2195         UText s = UTEXT_INITIALIZER;
        // NOTE(review): this literal is handed to utext_openUTF8() as-is, unlike
        // the hex-encoded arrays used in the sections above - confirm that this
        // is acceptable on non-ASCII platforms.
2196         utext_openUTF8(&s, "    ", -1, &status);
2197         m.reset(&s);
        // Four characters: empty matches are expected at 0, 1, 2, 3 and at the
        // end of the text (4), so the loop is left with i == 5.
2198         for (i=0; ; i++) {
2199             if (m.find() == FALSE) {
2200                 break;
2201             }
2202             REGEX_ASSERT(m.start(status) == i);
2203             REGEX_ASSERT(m.end(status) == i);
2204         }
2205         REGEX_ASSERT(i==5);
2206 
2207         // Check that the bump goes over characters outside the BMP OK
2208         // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2209         unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2210         utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2211         m.reset(&s);
        // Each character occupies 4 bytes, so the expected match positions
        // advance by 4 (0, 4, 8, 12, 16) and the loop is left with i == 20.
2212         for (i=0; ; i+=4) {
2213             if (m.find() == FALSE) {
2214                 break;
2215             }
2216             REGEX_ASSERT(m.start(status) == i);
2217             REGEX_ASSERT(m.end(status) == i);
2218         }
2219         REGEX_ASSERT(i==20);
2220 
2221         utext_close(&s);
2222     }
2223     {
2224         // find() loop breaking test.
2225         //        with pattern of /.?/, should see a series of one char matches, then a single
2226         //        match of zero length at the end of the input string.
2227         int32_t                 i;
2228         UErrorCode          status=U_ZERO_ERROR;
2229         RegexMatcher        m(".?", 0, status);
2230         REGEX_CHECK_STATUS;
2231         UText s = UTEXT_INITIALIZER;
2232         utext_openUTF8(&s, "    ", -1, &status);
2233         m.reset(&s);
2234         for (i=0; ; i++) {
2235             if (m.find() == FALSE) {
2236                 break;
2237             }
2238             REGEX_ASSERT(m.start(status) == i);
            // Matches 0..3 consume one character; the fifth (i == 4) is empty.
2239             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2240         }
2241         REGEX_ASSERT(i==5);
2242 
2243         utext_close(&s);
2244     }
2245 
2246 
2247     //
2248     // Matchers with no input string behave as if they had an empty input string.
2249     //
2250 
2251     {
        // ".?" can match the empty string, so find() succeeds at index 0.
2252         UErrorCode status = U_ZERO_ERROR;
2253         RegexMatcher  m(".?", 0, status);
2254         REGEX_CHECK_STATUS;
2255         REGEX_ASSERT(m.find());
2256         REGEX_ASSERT(m.start(status) == 0);
2257         REGEX_ASSERT(m.input() == "");
2258     }
2259     {
        // "." needs one character, so find() must fail when there is no input.
2260         UErrorCode status = U_ZERO_ERROR;
2261         RegexPattern  *p = RegexPattern::compile(".", 0, status);
2262         RegexMatcher  *m = p->matcher(status);
2263         REGEX_CHECK_STATUS;
2264 
2265         REGEX_ASSERT(m->find() == FALSE);
2266         REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2267         delete m;
2268         delete p;
2269     }
2270 
2271     //
2272     // Regions
2273     //
2274     {
2275         UErrorCode status = U_ZERO_ERROR;
2276         UText testPattern = UTEXT_INITIALIZER;
2277         UText testText    = UTEXT_INITIALIZER;
2278         regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2279         REGEX_VERBOSE_TEXT(&testPattern);
2280         regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2281         REGEX_VERBOSE_TEXT(&testText);
2282 
        // A freshly constructed matcher: the region covers the whole input,
        // bounds are anchoring and not transparent.
2283         RegexMatcher m(&testPattern, &testText, 0, status);
2284         REGEX_CHECK_STATUS;
2285         REGEX_ASSERT(m.regionStart() == 0);
2286         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2287         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2288         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2289 
        // With the region narrowed to [2,4), ".*" must match exactly that span.
2290         m.region(2,4, status);
2291         REGEX_CHECK_STATUS;
2292         REGEX_ASSERT(m.matches(status));
2293         REGEX_ASSERT(m.start(status)==2);
2294         REGEX_ASSERT(m.end(status)==4);
2295         REGEX_CHECK_STATUS;
2296 
        // reset() is expected to restore the region to the full input.
2297         m.reset();
2298         REGEX_ASSERT(m.regionStart() == 0);
2299         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2300 
        // Re-open testText over a different string; reset(&testText) must
        // pick up the new length as the region end.
2301         regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2302         REGEX_VERBOSE_TEXT(&testText);
2303         m.reset(&testText);
2304         REGEX_ASSERT(m.regionStart() == 0);
2305         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2306 
        // The bounds setters and reset() return the matcher itself, and the
        // anchoring / transparent settings must survive a reset().
2307         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2308         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2309         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2310         REGEX_ASSERT(&m == &m.reset());
2311         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2312 
2313         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2314         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2315         REGEX_ASSERT(&m == &m.reset());
2316         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2317 
2318         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2319         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2320         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2321         REGEX_ASSERT(&m == &m.reset());
2322         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2323 
2324         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2325         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2326         REGEX_ASSERT(&m == &m.reset());
2327         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2328 
2329         utext_close(&testText);
2330         utext_close(&testPattern);
2331     }
2332 
2333     //
2334     // hitEnd() and requireEnd()
2335     //
2336     {
2337         UErrorCode status = U_ZERO_ERROR;
2338         UText testPattern = UTEXT_INITIALIZER;
2339         UText testText    = UTEXT_INITIALIZER;
2340         const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2341         const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2342         utext_openUTF8(&testPattern, str_, -1, &status);
2343         utext_openUTF8(&testText, str_aabb, -1, &status);
2344 
        // ".*" on "aabb": the match runs to the end of the input (hitEnd), but
        // requireEnd is expected to be FALSE.
2345         RegexMatcher m1(&testPattern, &testText,  0, status);
2346         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2347         REGEX_ASSERT(m1.hitEnd() == TRUE);
2348         REGEX_ASSERT(m1.requireEnd() == FALSE);
2349         REGEX_CHECK_STATUS;
2350 
        // "a*" on "aabb": the match stops at the first 'b', so neither flag is set.
        // testPattern is re-opened over the new pattern bytes for each matcher.
2351         status = U_ZERO_ERROR;
2352         const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2353         utext_openUTF8(&testPattern, str_a, -1, &status);
2354         RegexMatcher m2(&testPattern, &testText, 0, status);
2355         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2356         REGEX_ASSERT(m2.hitEnd() == FALSE);
2357         REGEX_ASSERT(m2.requireEnd() == FALSE);
2358         REGEX_CHECK_STATUS;
2359 
        // ".*$" on "aabb": the trailing '$' makes the match depend on the end
        // of input, so both hitEnd and requireEnd are expected to be TRUE.
2360         status = U_ZERO_ERROR;
2361         const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2362         utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2363         RegexMatcher m3(&testPattern, &testText, 0, status);
2364         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2365         REGEX_ASSERT(m3.hitEnd() == TRUE);
2366         REGEX_ASSERT(m3.requireEnd() == TRUE);
2367         REGEX_CHECK_STATUS;
2368 
2369         utext_close(&testText);
2370         utext_close(&testPattern);
2371     }
2372 }
2373 
2374 
2375 //---------------------------------------------------------------------------
2376 //
2377 //      API_Replace_UTF8   API test for class RegexMatcher, testing the
2378 //                         Replace family of functions.
2379 //
2380 //---------------------------------------------------------------------------
API_Replace_UTF8()2381 void RegexTest::API_Replace_UTF8() {
2382     //
2383     //  Replace
2384     //
2385     int32_t             flags=0;
2386     UParseError         pe;
2387     UErrorCode          status=U_ZERO_ERROR;
2388 
2389     UText               re=UTEXT_INITIALIZER;
2390     regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2391     REGEX_VERBOSE_TEXT(&re);
2392     RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2393     REGEX_CHECK_STATUS;
2394 
2395     char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2396     //             012345678901234567
2397     UText dataText = UTEXT_INITIALIZER;
2398     utext_openUTF8(&dataText, data, -1, &status);
2399     REGEX_CHECK_STATUS;
2400     REGEX_VERBOSE_TEXT(&dataText);
2401     RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2402 
2403     //
2404     //  Plain vanilla matches.
2405     //
2406     UnicodeString  dest;
2407     UText destText = UTEXT_INITIALIZER;
2408     utext_openUnicodeString(&destText, &dest, &status);
2409     UText *result;
2410 
2411     UText replText = UTEXT_INITIALIZER;
2412 
2413     const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2414     utext_openUTF8(&replText, str_yz, -1, &status);
2415     REGEX_VERBOSE_TEXT(&replText);
2416     result = matcher->replaceFirst(&replText, NULL, status);
2417     REGEX_CHECK_STATUS;
2418     const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2419     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2420     utext_close(result);
2421     result = matcher->replaceFirst(&replText, &destText, status);
2422     REGEX_CHECK_STATUS;
2423     REGEX_ASSERT(result == &destText);
2424     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2425 
2426     result = matcher->replaceAll(&replText, NULL, status);
2427     REGEX_CHECK_STATUS;
2428     const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2429     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2430     utext_close(result);
2431 
2432     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2433     result = matcher->replaceAll(&replText, &destText, status);
2434     REGEX_CHECK_STATUS;
2435     REGEX_ASSERT(result == &destText);
2436     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2437 
2438     //
2439     //  Plain vanilla non-matches.
2440     //
2441     const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2442     utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2443     matcher->reset(&dataText);
2444 
2445     result = matcher->replaceFirst(&replText, NULL, status);
2446     REGEX_CHECK_STATUS;
2447     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2448     utext_close(result);
2449     result = matcher->replaceFirst(&replText, &destText, status);
2450     REGEX_CHECK_STATUS;
2451     REGEX_ASSERT(result == &destText);
2452     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2453 
2454     result = matcher->replaceAll(&replText, NULL, status);
2455     REGEX_CHECK_STATUS;
2456     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2457     utext_close(result);
2458     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2459     result = matcher->replaceAll(&replText, &destText, status);
2460     REGEX_CHECK_STATUS;
2461     REGEX_ASSERT(result == &destText);
2462     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2463 
2464     //
2465     // Empty source string
2466     //
2467     utext_openUTF8(&dataText, NULL, 0, &status);
2468     matcher->reset(&dataText);
2469 
2470     result = matcher->replaceFirst(&replText, NULL, status);
2471     REGEX_CHECK_STATUS;
2472     REGEX_ASSERT_UTEXT_UTF8("", result);
2473     utext_close(result);
2474     result = matcher->replaceFirst(&replText, &destText, status);
2475     REGEX_CHECK_STATUS;
2476     REGEX_ASSERT(result == &destText);
2477     REGEX_ASSERT_UTEXT_UTF8("", result);
2478 
2479     result = matcher->replaceAll(&replText, NULL, status);
2480     REGEX_CHECK_STATUS;
2481     REGEX_ASSERT_UTEXT_UTF8("", result);
2482     utext_close(result);
2483     result = matcher->replaceAll(&replText, &destText, status);
2484     REGEX_CHECK_STATUS;
2485     REGEX_ASSERT(result == &destText);
2486     REGEX_ASSERT_UTEXT_UTF8("", result);
2487 
2488     //
2489     // Empty substitution string
2490     //
2491     utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2492     matcher->reset(&dataText);
2493 
2494     utext_openUTF8(&replText, NULL, 0, &status);
2495     result = matcher->replaceFirst(&replText, NULL, status);
2496     REGEX_CHECK_STATUS;
2497     const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2498     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2499     utext_close(result);
2500     result = matcher->replaceFirst(&replText, &destText, status);
2501     REGEX_CHECK_STATUS;
2502     REGEX_ASSERT(result == &destText);
2503     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2504 
2505     result = matcher->replaceAll(&replText, NULL, status);
2506     REGEX_CHECK_STATUS;
2507     const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2508     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2509     utext_close(result);
2510     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2511     result = matcher->replaceAll(&replText, &destText, status);
2512     REGEX_CHECK_STATUS;
2513     REGEX_ASSERT(result == &destText);
2514     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2515 
2516     //
2517     // match whole string
2518     //
2519     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2520     utext_openUTF8(&dataText, str_abc, -1, &status);
2521     matcher->reset(&dataText);
2522 
2523     const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2524     utext_openUTF8(&replText, str_xyz, -1, &status);
2525     result = matcher->replaceFirst(&replText, NULL, status);
2526     REGEX_CHECK_STATUS;
2527     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2528     utext_close(result);
2529     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2530     result = matcher->replaceFirst(&replText, &destText, status);
2531     REGEX_CHECK_STATUS;
2532     REGEX_ASSERT(result == &destText);
2533     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2534 
2535     result = matcher->replaceAll(&replText, NULL, status);
2536     REGEX_CHECK_STATUS;
2537     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2538     utext_close(result);
2539     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2540     result = matcher->replaceAll(&replText, &destText, status);
2541     REGEX_CHECK_STATUS;
2542     REGEX_ASSERT(result == &destText);
2543     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2544 
2545     //
2546     // Capture Group, simple case
2547     //
2548     const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2549     utext_openUTF8(&re, str_add, -1, &status);
2550     RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2551     REGEX_CHECK_STATUS;
2552 
2553     const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2554     utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2555     RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2556     REGEX_CHECK_STATUS;
2557 
2558     const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2559     utext_openUTF8(&replText, str_11, -1, &status);
2560     result = matcher2->replaceFirst(&replText, NULL, status);
2561     REGEX_CHECK_STATUS;
2562     const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2563     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2564     utext_close(result);
2565     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2566     result = matcher2->replaceFirst(&replText, &destText, status);
2567     REGEX_CHECK_STATUS;
2568     REGEX_ASSERT(result == &destText);
2569     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2570 
2571     const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2572     utext_openUTF8(&replText, str_v, -1, &status);
2573     REGEX_VERBOSE_TEXT(&replText);
2574     result = matcher2->replaceFirst(&replText, NULL, status);
2575     REGEX_CHECK_STATUS;
2576     const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2577     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2578     utext_close(result);
2579     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2580     result = matcher2->replaceFirst(&replText, &destText, status);
2581     REGEX_CHECK_STATUS;
2582     REGEX_ASSERT(result == &destText);
2583     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2584 
2585     const char str_byitselfnogroupnumber[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x00 }; /* $ by itself, no group number $$$ */
2586     utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2587     result = matcher2->replaceFirst(&replText, NULL, status);
2588     REGEX_CHECK_STATUS;
2589     const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2590     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2591     utext_close(result);
2592     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2593     result = matcher2->replaceFirst(&replText, &destText, status);
2594     REGEX_CHECK_STATUS;
2595     REGEX_ASSERT(result == &destText);
2596     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2597 
2598     unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2599     //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2600     //                                 012345678901234567890123456
2601     supplDigitChars[22] = 0xF0;
2602     supplDigitChars[23] = 0x9D;
2603     supplDigitChars[24] = 0x9F;
2604     supplDigitChars[25] = 0x8F;
2605     utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2606 
2607     result = matcher2->replaceFirst(&replText, NULL, status);
2608     REGEX_CHECK_STATUS;
2609     const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2610     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2611     utext_close(result);
2612     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2613     result = matcher2->replaceFirst(&replText, &destText, status);
2614     REGEX_CHECK_STATUS;
2615     REGEX_ASSERT(result == &destText);
2616     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2617     const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5..." */
2618     utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2619     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2620 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2621     utext_close(result);
2622     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2623     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2624     REGEX_ASSERT(result == &destText);
2625 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2626 
2627     //
2628     // Replacement String with \u hex escapes
2629     //
2630     {
2631       const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2632       const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2633         utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2634         utext_openUTF8(&replText, str_u0043, -1, &status);
2635         matcher->reset(&dataText);
2636 
2637         result = matcher->replaceAll(&replText, NULL, status);
2638         REGEX_CHECK_STATUS;
2639         const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2640         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2641         utext_close(result);
2642         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2643         result = matcher->replaceAll(&replText, &destText, status);
2644         REGEX_CHECK_STATUS;
2645         REGEX_ASSERT(result == &destText);
2646         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2647     }
2648     {
2649       const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2650         utext_openUTF8(&dataText, str_abc, -1, &status);
2651         const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2652         utext_openUTF8(&replText, str_U00010000, -1, &status);
2653         matcher->reset(&dataText);
2654 
2655         unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2656         //                          0123456789
2657         expected[2] = 0xF0;
2658         expected[3] = 0x90;
2659         expected[4] = 0x80;
2660         expected[5] = 0x80;
2661 
2662         result = matcher->replaceAll(&replText, NULL, status);
2663         REGEX_CHECK_STATUS;
2664         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2665         utext_close(result);
2666         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2667         result = matcher->replaceAll(&replText, &destText, status);
2668         REGEX_CHECK_STATUS;
2669         REGEX_ASSERT(result == &destText);
2670         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2671     }
2672     // TODO:  need more through testing of capture substitutions.
2673 
2674     // Bug 4057
2675     //
2676     {
2677         status = U_ZERO_ERROR;
2678 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2679 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2680 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2681         utext_openUTF8(&re, str_ssee, -1, &status);
2682         utext_openUTF8(&dataText, str_blah, -1, &status);
2683         utext_openUTF8(&replText, str_ooh, -1, &status);
2684 
2685         RegexMatcher m(&re, 0, status);
2686         REGEX_CHECK_STATUS;
2687 
2688         UnicodeString result;
2689         UText resultText = UTEXT_INITIALIZER;
2690         utext_openUnicodeString(&resultText, &result, &status);
2691 
2692         // Multiple finds do NOT bump up the previous appendReplacement postion.
2693         m.reset(&dataText);
2694         m.find();
2695         m.find();
2696         m.appendReplacement(&resultText, &replText, status);
2697         REGEX_CHECK_STATUS;
2698         const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2699         REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2700 
2701         // After a reset into the interior of a string, appendReplacement still starts at beginning.
2702         status = U_ZERO_ERROR;
2703         result.truncate(0);
2704         utext_openUnicodeString(&resultText, &result, &status);
2705         m.reset(10, status);
2706         m.find();
2707         m.find();
2708         m.appendReplacement(&resultText, &replText, status);
2709         REGEX_CHECK_STATUS;
2710         const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2711         REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2712 
2713         // find() at interior of string, appendReplacement still starts at beginning.
2714         status = U_ZERO_ERROR;
2715         result.truncate(0);
2716         utext_openUnicodeString(&resultText, &result, &status);
2717         m.reset();
2718         m.find(10, status);
2719         m.find();
2720         m.appendReplacement(&resultText, &replText, status);
2721         REGEX_CHECK_STATUS;
2722         const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2723         REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2724 
2725         m.appendTail(&resultText, status);
2726         const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2727         REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2728 
2729         utext_close(&resultText);
2730     }
2731 
2732     delete matcher2;
2733     delete pat2;
2734     delete matcher;
2735     delete pat;
2736 
2737     utext_close(&dataText);
2738     utext_close(&replText);
2739     utext_close(&destText);
2740     utext_close(&re);
2741 }
2742 
2743 
2744 //---------------------------------------------------------------------------
2745 //
2746 //      API_Pattern_UTF8  Test that the API for class RegexPattern is
2747 //                        present and nominally working.
2748 //
2749 //---------------------------------------------------------------------------
API_Pattern_UTF8()2750 void RegexTest::API_Pattern_UTF8() {
2751     RegexPattern        pata;    // Test default constructor to not crash.
2752     RegexPattern        patb;
2753 
2754     REGEX_ASSERT(pata == patb);
2755     REGEX_ASSERT(pata == pata);
2756 
2757     UText         re1 = UTEXT_INITIALIZER;
2758     UText         re2 = UTEXT_INITIALIZER;
2759     UErrorCode    status = U_ZERO_ERROR;
2760     UParseError   pe;
2761 
2762     const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2763     const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2764     utext_openUTF8(&re1, str_abcalmz, -1, &status);
2765     utext_openUTF8(&re2, str_def, -1, &status);
2766 
2767     RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2768     RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2769     REGEX_CHECK_STATUS;
2770     REGEX_ASSERT(*pat1 == *pat1);
2771     REGEX_ASSERT(*pat1 != pata);
2772 
2773     // Assign
2774     patb = *pat1;
2775     REGEX_ASSERT(patb == *pat1);
2776 
2777     // Copy Construct
2778     RegexPattern patc(*pat1);
2779     REGEX_ASSERT(patc == *pat1);
2780     REGEX_ASSERT(patb == patc);
2781     REGEX_ASSERT(pat1 != pat2);
2782     patb = *pat2;
2783     REGEX_ASSERT(patb != patc);
2784     REGEX_ASSERT(patb == *pat2);
2785 
2786     // Compile with no flags.
2787     RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
2788     REGEX_ASSERT(*pat1a == *pat1);
2789 
2790     REGEX_ASSERT(pat1a->flags() == 0);
2791 
2792     // Compile with different flags should be not equal
2793     RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2794     REGEX_CHECK_STATUS;
2795 
2796     REGEX_ASSERT(*pat1b != *pat1a);
2797     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2798     REGEX_ASSERT(pat1a->flags() == 0);
2799     delete pat1b;
2800 
2801     // clone
2802     RegexPattern *pat1c = pat1->clone();
2803     REGEX_ASSERT(*pat1c == *pat1);
2804     REGEX_ASSERT(*pat1c != *pat2);
2805 
2806     delete pat1c;
2807     delete pat1a;
2808     delete pat1;
2809     delete pat2;
2810 
2811     utext_close(&re1);
2812     utext_close(&re2);
2813 
2814 
2815     //
2816     //   Verify that a matcher created from a cloned pattern works.
2817     //     (Jitterbug 3423)
2818     //
2819     {
2820         UErrorCode     status     = U_ZERO_ERROR;
2821         UText          pattern    = UTEXT_INITIALIZER;
2822         const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2823         utext_openUTF8(&pattern, str_pL, -1, &status);
2824 
2825         RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
2826         RegexPattern  *pClone     = pSource->clone();
2827         delete         pSource;
2828         RegexMatcher  *mFromClone = pClone->matcher(status);
2829         REGEX_CHECK_STATUS;
2830 
2831         UText          input      = UTEXT_INITIALIZER;
2832         const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2833         utext_openUTF8(&input, str_HelloWorld, -1, &status);
2834         mFromClone->reset(&input);
2835         REGEX_ASSERT(mFromClone->find() == TRUE);
2836         REGEX_ASSERT(mFromClone->group(status) == "Hello");
2837         REGEX_ASSERT(mFromClone->find() == TRUE);
2838         REGEX_ASSERT(mFromClone->group(status) == "World");
2839         REGEX_ASSERT(mFromClone->find() == FALSE);
2840         delete mFromClone;
2841         delete pClone;
2842 
2843         utext_close(&input);
2844         utext_close(&pattern);
2845     }
2846 
2847     //
2848     //   matches convenience API
2849     //
2850     {
2851         UErrorCode status  = U_ZERO_ERROR;
2852         UText      pattern = UTEXT_INITIALIZER;
2853         UText      input   = UTEXT_INITIALIZER;
2854 
2855         const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2856         utext_openUTF8(&input, str_randominput, -1, &status);
2857 
2858         const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2859         utext_openUTF8(&pattern, str_dotstar, -1, &status);
2860         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2861         REGEX_CHECK_STATUS;
2862 
2863         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2864         utext_openUTF8(&pattern, str_abc, -1, &status);
2865         REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2866         REGEX_CHECK_STATUS;
2867 
2868         const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2869         utext_openUTF8(&pattern, str_nput, -1, &status);
2870         REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2871         REGEX_CHECK_STATUS;
2872 
2873         utext_openUTF8(&pattern, str_randominput, -1, &status);
2874         REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2875         REGEX_CHECK_STATUS;
2876 
2877         const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2878         utext_openUTF8(&pattern, str_u, -1, &status);
2879         REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2880         REGEX_CHECK_STATUS;
2881 
2882         utext_openUTF8(&input, str_abc, -1, &status);
2883         utext_openUTF8(&pattern, str_abc, -1, &status);
2884         status = U_INDEX_OUTOFBOUNDS_ERROR;
2885         REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2886         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2887 
2888         utext_close(&input);
2889         utext_close(&pattern);
2890     }
2891 
2892 
2893     //
2894     // Split()
2895     //
2896     status = U_ZERO_ERROR;
2897     const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
2898     utext_openUTF8(&re1, str_spaceplus, -1, &status);
2899     pat1 = RegexPattern::compile(&re1, pe, status);
2900     REGEX_CHECK_STATUS;
2901     UnicodeString  fields[10];
2902 
2903     int32_t n;
2904     n = pat1->split("Now is the time", fields, 10, status);
2905     REGEX_CHECK_STATUS;
2906     REGEX_ASSERT(n==4);
2907     REGEX_ASSERT(fields[0]=="Now");
2908     REGEX_ASSERT(fields[1]=="is");
2909     REGEX_ASSERT(fields[2]=="the");
2910     REGEX_ASSERT(fields[3]=="time");
2911     REGEX_ASSERT(fields[4]=="");
2912 
2913     n = pat1->split("Now is the time", fields, 2, status);
2914     REGEX_CHECK_STATUS;
2915     REGEX_ASSERT(n==2);
2916     REGEX_ASSERT(fields[0]=="Now");
2917     REGEX_ASSERT(fields[1]=="is the time");
2918     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
2919 
2920     fields[1] = "*";
2921     status = U_ZERO_ERROR;
2922     n = pat1->split("Now is the time", fields, 1, status);
2923     REGEX_CHECK_STATUS;
2924     REGEX_ASSERT(n==1);
2925     REGEX_ASSERT(fields[0]=="Now is the time");
2926     REGEX_ASSERT(fields[1]=="*");
2927     status = U_ZERO_ERROR;
2928 
2929     n = pat1->split("    Now       is the time   ", fields, 10, status);
2930     REGEX_CHECK_STATUS;
2931     REGEX_ASSERT(n==6);
2932     REGEX_ASSERT(fields[0]=="");
2933     REGEX_ASSERT(fields[1]=="Now");
2934     REGEX_ASSERT(fields[2]=="is");
2935     REGEX_ASSERT(fields[3]=="the");
2936     REGEX_ASSERT(fields[4]=="time");
2937     REGEX_ASSERT(fields[5]=="");
2938     REGEX_ASSERT(fields[6]=="");
2939 
2940     fields[2] = "*";
2941     n = pat1->split("     ", fields, 10, status);
2942     REGEX_CHECK_STATUS;
2943     REGEX_ASSERT(n==2);
2944     REGEX_ASSERT(fields[0]=="");
2945     REGEX_ASSERT(fields[1]=="");
2946     REGEX_ASSERT(fields[2]=="*");
2947 
2948     fields[0] = "foo";
2949     n = pat1->split("", fields, 10, status);
2950     REGEX_CHECK_STATUS;
2951     REGEX_ASSERT(n==0);
2952     REGEX_ASSERT(fields[0]=="foo");
2953 
2954     delete pat1;
2955 
2956     //  split, with a pattern with (capture)
2957     regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
2958     pat1 = RegexPattern::compile(&re1,  pe, status);
2959     REGEX_CHECK_STATUS;
2960 
2961     status = U_ZERO_ERROR;
2962     fields[6] = fields[7] = "*";
2963     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
2964     REGEX_CHECK_STATUS;
2965     REGEX_ASSERT(n==7);
2966     REGEX_ASSERT(fields[0]=="");
2967     REGEX_ASSERT(fields[1]=="a");
2968     REGEX_ASSERT(fields[2]=="Now is ");
2969     REGEX_ASSERT(fields[3]=="b");
2970     REGEX_ASSERT(fields[4]=="the time");
2971     REGEX_ASSERT(fields[5]=="c");
2972     REGEX_ASSERT(fields[6]=="");
2973     REGEX_ASSERT(fields[7]=="*");
2974     REGEX_ASSERT(status==U_ZERO_ERROR);
2975 
2976     fields[6] = fields[7] = "*";
2977     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
2978     REGEX_CHECK_STATUS;
2979     REGEX_ASSERT(n==7);
2980     REGEX_ASSERT(fields[0]=="  ");
2981     REGEX_ASSERT(fields[1]=="a");
2982     REGEX_ASSERT(fields[2]=="Now is ");
2983     REGEX_ASSERT(fields[3]=="b");
2984     REGEX_ASSERT(fields[4]=="the time");
2985     REGEX_ASSERT(fields[5]=="c");
2986     REGEX_ASSERT(fields[6]=="");
2987     REGEX_ASSERT(fields[7]=="*");
2988 
2989     status = U_ZERO_ERROR;
2990     fields[6] = "foo";
2991     n = pat1->split("  <a>Now is <b>the time<c> ", fields, 6, status);
2992     REGEX_CHECK_STATUS;
2993     REGEX_ASSERT(n==6);
2994     REGEX_ASSERT(fields[0]=="  ");
2995     REGEX_ASSERT(fields[1]=="a");
2996     REGEX_ASSERT(fields[2]=="Now is ");
2997     REGEX_ASSERT(fields[3]=="b");
2998     REGEX_ASSERT(fields[4]=="the time");
2999     REGEX_ASSERT(fields[5]==" ");
3000     REGEX_ASSERT(fields[6]=="foo");
3001 
3002     status = U_ZERO_ERROR;
3003     fields[5] = "foo";
3004     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
3005     REGEX_CHECK_STATUS;
3006     REGEX_ASSERT(n==5);
3007     REGEX_ASSERT(fields[0]=="  ");
3008     REGEX_ASSERT(fields[1]=="a");
3009     REGEX_ASSERT(fields[2]=="Now is ");
3010     REGEX_ASSERT(fields[3]=="b");
3011     REGEX_ASSERT(fields[4]=="the time<c>");
3012     REGEX_ASSERT(fields[5]=="foo");
3013 
3014     status = U_ZERO_ERROR;
3015     fields[5] = "foo";
3016     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
3017     REGEX_CHECK_STATUS;
3018     REGEX_ASSERT(n==5);
3019     REGEX_ASSERT(fields[0]=="  ");
3020     REGEX_ASSERT(fields[1]=="a");
3021     REGEX_ASSERT(fields[2]=="Now is ");
3022     REGEX_ASSERT(fields[3]=="b");
3023     REGEX_ASSERT(fields[4]=="the time");
3024     REGEX_ASSERT(fields[5]=="foo");
3025 
3026     status = U_ZERO_ERROR;
3027     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
3028     REGEX_CHECK_STATUS;
3029     REGEX_ASSERT(n==4);
3030     REGEX_ASSERT(fields[0]=="  ");
3031     REGEX_ASSERT(fields[1]=="a");
3032     REGEX_ASSERT(fields[2]=="Now is ");
3033     REGEX_ASSERT(fields[3]=="the time<c>");
3034     status = U_ZERO_ERROR;
3035     delete pat1;
3036 
3037     regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3038     pat1 = RegexPattern::compile(&re1, pe, status);
3039     REGEX_CHECK_STATUS;
3040     n = pat1->split("1-10,20", fields, 10, status);
3041     REGEX_CHECK_STATUS;
3042     REGEX_ASSERT(n==5);
3043     REGEX_ASSERT(fields[0]=="1");
3044     REGEX_ASSERT(fields[1]=="-");
3045     REGEX_ASSERT(fields[2]=="10");
3046     REGEX_ASSERT(fields[3]==",");
3047     REGEX_ASSERT(fields[4]=="20");
3048     delete pat1;
3049 
3050 
3051     //
3052     // RegexPattern::pattern() and patternText()
3053     //
3054     pat1 = new RegexPattern();
3055     REGEX_ASSERT(pat1->pattern() == "");
3056     REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3057     delete pat1;
3058     const char *helloWorldInvariant = "(Hello, world)*";
3059     regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3060     pat1 = RegexPattern::compile(&re1, pe, status);
3061     REGEX_CHECK_STATUS;
3062     REGEX_ASSERT_UNISTR(pat1->pattern(),"(Hello, world)*");
3063     REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3064     delete pat1;
3065 
3066     utext_close(&re1);
3067 }
3068 
3069 
3070 //---------------------------------------------------------------------------
3071 //
3072 //      Extended       A more thorough check for features of regex patterns
3073 //                     The test cases are in a separate data file,
3074 //                       source/tests/testdata/regextst.txt
3075 //                     A description of the test data format is included in that file.
3076 //
3077 //---------------------------------------------------------------------------
3078 
3079 const char *
getPath(char buffer[2048],const char * filename)3080 RegexTest::getPath(char buffer[2048], const char *filename) {
3081     UErrorCode status=U_ZERO_ERROR;
3082     const char *testDataDirectory = IntlTest::getSourceTestData(status);
3083     if (U_FAILURE(status)) {
3084         errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3085         return NULL;
3086     }
3087 
3088     strcpy(buffer, testDataDirectory);
3089     strcat(buffer, filename);
3090     return buffer;
3091 }
3092 
Extended()3093 void RegexTest::Extended() {
3094     char tdd[2048];
3095     const char *srcPath;
3096     UErrorCode  status  = U_ZERO_ERROR;
3097     int32_t     lineNum = 0;
3098 
3099     //
3100     //  Open and read the test data file.
3101     //
3102     srcPath=getPath(tdd, "regextst.txt");
3103     if(srcPath==NULL) {
3104         return; /* something went wrong, error already output */
3105     }
3106 
3107     int32_t    len;
3108     UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3109     if (U_FAILURE(status)) {
3110         return; /* something went wrong, error already output */
3111     }
3112 
3113     //
3114     //  Put the test data into a UnicodeString
3115     //
3116     UnicodeString testString(FALSE, testData, len);
3117 
3118     RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3119     RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3120     RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3121 
3122     RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3123     UnicodeString   testPattern;   // The pattern for test from the test file.
3124     UnicodeString   testFlags;     // the flags   for a test.
3125     UnicodeString   matchString;   // The marked up string to be used as input
3126 
3127     if (U_FAILURE(status)){
3128         dataerrln("Construct RegexMatcher() error.");
3129         delete [] testData;
3130         return;
3131     }
3132 
3133     //
3134     //  Loop over the test data file, once per line.
3135     //
3136     while (lineMat.find()) {
3137         lineNum++;
3138         if (U_FAILURE(status)) {
3139           errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3140         }
3141 
3142         status = U_ZERO_ERROR;
3143         UnicodeString testLine = lineMat.group(1, status);
3144         if (testLine.length() == 0) {
3145             continue;
3146         }
3147 
3148         //
3149         // Parse the test line.  Skip blank and comment only lines.
3150         // Separate out the three main fields - pattern, flags, target.
3151         //
3152 
3153         commentMat.reset(testLine);
3154         if (commentMat.lookingAt(status)) {
3155             // This line is a comment, or blank.
3156             continue;
3157         }
3158 
3159         //
3160         //  Pull out the pattern field, remove it from the test file line.
3161         //
3162         quotedStuffMat.reset(testLine);
3163         if (quotedStuffMat.lookingAt(status)) {
3164             testPattern = quotedStuffMat.group(2, status);
3165             testLine.remove(0, quotedStuffMat.end(0, status));
3166         } else {
3167             errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3168             continue;
3169         }
3170 
3171 
3172         //
3173         //  Pull out the flags from the test file line.
3174         //
3175         flagsMat.reset(testLine);
3176         flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
3177         testFlags = flagsMat.group(1, status);
3178         if (flagsMat.group(2, status).length() > 0) {
3179             errln("Bad Match flag at line %d. Scanning %c\n",
3180                 lineNum, flagsMat.group(2, status).charAt(0));
3181             continue;
3182         }
3183         testLine.remove(0, flagsMat.end(0, status));
3184 
3185         //
3186         //  Pull out the match string, as a whole.
3187         //    We'll process the <tags> later.
3188         //
3189         quotedStuffMat.reset(testLine);
3190         if (quotedStuffMat.lookingAt(status)) {
3191             matchString = quotedStuffMat.group(2, status);
3192             testLine.remove(0, quotedStuffMat.end(0, status));
3193         } else {
3194             errln("Bad match string at test file line %d", lineNum);
3195             continue;
3196         }
3197 
3198         //
3199         //  The only thing left from the input line should be an optional trailing comment.
3200         //
3201         commentMat.reset(testLine);
3202         if (commentMat.lookingAt(status) == FALSE) {
3203             errln("Line %d: unexpected characters at end of test line.", lineNum);
3204             continue;
3205         }
3206 
3207         //
3208         //  Run the test
3209         //
3210         regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3211     }
3212 
3213     delete [] testData;
3214 
3215 }
3216 
3217 
3218 
3219 //---------------------------------------------------------------------------
3220 //
3221 //    regex_find(pattern, flags, inputString, srcPath, lineNumber)
3222 //
3223 //         Function to run a single test from the Extended (data driven) tests.
3224 //         See file test/testdata/regextst.txt for a description of the
3225 //         pattern and inputString fields, and the allowed flags.
3226 //         srcPath and lineNumber give the test data file and the source line of the test within it.
3227 //
3228 //---------------------------------------------------------------------------
3229 
3230 
3231 //  Set a value into a UVector at position specified by a decimal number in
3232 //   a UnicodeString.   This is a utility function needed by the actual test function,
3233 //   which follows.
set(UVector & vec,int32_t val,UnicodeString index)3234 static void set(UVector &vec, int32_t val, UnicodeString index) {
3235     UErrorCode  status=U_ZERO_ERROR;
3236     int32_t  idx = 0;
3237     for (int32_t i=0; i<index.length(); i++) {
3238         int32_t d=u_charDigitValue(index.charAt(i));
3239         if (d<0) {return;}
3240         idx = idx*10 + d;
3241     }
3242     while (vec.size()<idx+1) {vec.addElement(-1, status);}
3243     vec.setElementAt(val, idx);
3244 }
3245 
setInt(UVector & vec,int32_t val,int32_t idx)3246 static void setInt(UVector &vec, int32_t val, int32_t idx) {
3247     UErrorCode  status=U_ZERO_ERROR;
3248     while (vec.size()<idx+1) {vec.addElement(-1, status);}
3249     vec.setElementAt(val, idx);
3250 }
3251 
utextOffsetToNative(UText * utext,int32_t unistrOffset,int32_t & nativeIndex)3252 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3253 {
3254     UBool couldFind = TRUE;
3255     UTEXT_SETNATIVEINDEX(utext, 0);
3256     int32_t i = 0;
3257     while (i < unistrOffset) {
3258         UChar32 c = UTEXT_NEXT32(utext);
3259         if (c != U_SENTINEL) {
3260             i += U16_LENGTH(c);
3261         } else {
3262             couldFind = FALSE;
3263             break;
3264         }
3265     }
3266     nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3267     return couldFind;
3268 }
3269 
3270 
regex_find(const UnicodeString & pattern,const UnicodeString & flags,const UnicodeString & inputString,const char * srcPath,int32_t line)3271 void RegexTest::regex_find(const UnicodeString &pattern,
3272                            const UnicodeString &flags,
3273                            const UnicodeString &inputString,
3274                            const char *srcPath,
3275                            int32_t line) {
3276     UnicodeString       unEscapedInput;
3277     UnicodeString       deTaggedInput;
3278 
3279     int32_t             patternUTF8Length,      inputUTF8Length;
3280     char                *patternChars  = NULL, *inputChars = NULL;
3281     UText               patternText    = UTEXT_INITIALIZER;
3282     UText               inputText      = UTEXT_INITIALIZER;
3283     UConverter          *UTF8Converter = NULL;
3284 
3285     UErrorCode          status         = U_ZERO_ERROR;
3286     UParseError         pe;
3287     RegexPattern        *parsePat      = NULL;
3288     RegexMatcher        *parseMatcher  = NULL;
3289     RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
3290     RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
3291     UVector             groupStarts(status);
3292     UVector             groupEnds(status);
3293     UVector             groupStartsUTF8(status);
3294     UVector             groupEndsUTF8(status);
3295     UBool               isMatch        = FALSE, isUTF8Match = FALSE;
3296     UBool               failed         = FALSE;
3297     int32_t             numFinds;
3298     int32_t             i;
3299     UBool               useMatchesFunc   = FALSE;
3300     UBool               useLookingAtFunc = FALSE;
3301     int32_t             regionStart      = -1;
3302     int32_t             regionEnd        = -1;
3303     int32_t             regionStartUTF8  = -1;
3304     int32_t             regionEndUTF8    = -1;
3305 
3306 
3307     //
3308     //  Compile the caller's pattern
3309     //
3310     uint32_t bflags = 0;
3311     if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
3312         bflags |= UREGEX_CASE_INSENSITIVE;
3313     }
3314     if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
3315         bflags |= UREGEX_COMMENTS;
3316     }
3317     if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
3318         bflags |= UREGEX_DOTALL;
3319     }
3320     if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
3321         bflags |= UREGEX_MULTILINE;
3322     }
3323 
3324     if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3325         bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3326     }
3327     if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3328         bflags |= UREGEX_UNIX_LINES;
3329     }
3330 
3331 
3332     callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3333     if (status != U_ZERO_ERROR) {
3334         #if UCONFIG_NO_BREAK_ITERATION==1
3335         // 'v' test flag means that the test pattern should not compile if ICU was configured
3336         //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3337         if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3338             goto cleanupAndReturn;
3339         }
3340         #endif
3341         if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3342             // Expected pattern compilation error.
3343             if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3344                 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3345             }
3346             goto cleanupAndReturn;
3347         } else {
3348             // Unexpected pattern compilation error.
3349             dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3350             goto cleanupAndReturn;
3351         }
3352     }
3353 
3354     UTF8Converter = ucnv_open("UTF8", &status);
3355     ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3356 
3357     patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3358     status = U_ZERO_ERROR; // buffer overflow
3359     patternChars = new char[patternUTF8Length+1];
3360     pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3361     utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3362 
3363     if (status == U_ZERO_ERROR) {
3364         UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3365 
3366         if (status != U_ZERO_ERROR) {
3367 #if UCONFIG_NO_BREAK_ITERATION==1
3368             // 'v' test flag means that the test pattern should not compile if ICU was configured
3369             //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3370             if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3371                 goto cleanupAndReturn;
3372             }
3373 #endif
3374             if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3375                 // Expected pattern compilation error.
3376                 if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3377                     logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3378                 }
3379                 goto cleanupAndReturn;
3380             } else {
3381                 // Unexpected pattern compilation error.
3382                 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3383                 goto cleanupAndReturn;
3384             }
3385         }
3386     }
3387 
3388     if (UTF8Pattern == NULL) {
3389         // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3390         logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3391         status = U_ZERO_ERROR;
3392     }
3393 
3394     if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
3395         RegexPatternDump(callerPattern);
3396     }
3397 
3398     if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
3399         errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3400         goto cleanupAndReturn;
3401     }
3402 
3403 
3404     //
3405     // Number of times find() should be called on the test string, default to 1
3406     //
3407     numFinds = 1;
3408     for (i=2; i<=9; i++) {
3409         if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
3410             if (numFinds != 1) {
3411                 errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
3412                 goto cleanupAndReturn;
3413             }
3414             numFinds = i;
3415         }
3416     }
3417 
3418     // 'M' flag.  Use matches() instead of find()
3419     if (flags.indexOf((UChar)0x4d) >= 0) {
3420         useMatchesFunc = TRUE;
3421     }
3422     if (flags.indexOf((UChar)0x4c) >= 0) {
3423         useLookingAtFunc = TRUE;
3424     }
3425 
3426     //
3427     //  Find the tags in the input data, remove them, and record the group boundary
3428     //    positions.
3429     //
3430     parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3431     REGEX_CHECK_STATUS_L(line);
3432 
3433     unEscapedInput = inputString.unescape();
3434     parseMatcher = parsePat->matcher(unEscapedInput, status);
3435     REGEX_CHECK_STATUS_L(line);
3436     while(parseMatcher->find()) {
3437         parseMatcher->appendReplacement(deTaggedInput, "", status);
3438         REGEX_CHECK_STATUS;
3439         UnicodeString groupNum = parseMatcher->group(2, status);
3440         if (groupNum == "r") {
3441             // <r> or </r>, a region specification within the string
3442             if (parseMatcher->group(1, status) == "/") {
3443                 regionEnd = deTaggedInput.length();
3444             } else {
3445                 regionStart = deTaggedInput.length();
3446             }
3447         } else {
3448             // <digits> or </digits>, a group match boundary tag.
3449             if (parseMatcher->group(1, status) == "/") {
3450                 set(groupEnds, deTaggedInput.length(), groupNum);
3451             } else {
3452                 set(groupStarts, deTaggedInput.length(), groupNum);
3453             }
3454         }
3455     }
3456     parseMatcher->appendTail(deTaggedInput);
3457     REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3458     if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3459       errln("mismatched <r> tags");
3460       failed = TRUE;
3461       goto cleanupAndReturn;
3462     }
3463 
3464     //
3465     //  Configure the matcher according to the flags specified with this test.
3466     //
3467     matcher = callerPattern->matcher(deTaggedInput, status);
3468     REGEX_CHECK_STATUS_L(line);
3469     if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3470         matcher->setTrace(TRUE);
3471     }
3472 
3473     if (UTF8Pattern != NULL) {
3474         inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3475         status = U_ZERO_ERROR; // buffer overflow
3476         inputChars = new char[inputUTF8Length+1];
3477         deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3478         utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3479 
3480         if (status == U_ZERO_ERROR) {
3481             UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3482             REGEX_CHECK_STATUS_L(line);
3483         }
3484 
3485         if (UTF8Matcher == NULL) {
3486             // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3487           logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3488             status = U_ZERO_ERROR;
3489         }
3490     }
3491 
3492     //
3493     //  Generate native indices for UTF8 versions of region and capture group info
3494     //
3495     if (UTF8Matcher != NULL) {
3496         if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3497         if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3498 
3499         //  Fill out the native index UVector info.
3500         //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3501         for (i=0; i<groupStarts.size(); i++) {
3502             int32_t  start = groupStarts.elementAti(i);
3503             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3504             if (start >= 0) {
3505                 int32_t  startUTF8;
3506                 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3507                     errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
3508                     failed = TRUE;
3509                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3510                 }
3511                 setInt(groupStartsUTF8, startUTF8, i);
3512             }
3513 
3514             int32_t  end = groupEnds.elementAti(i);
3515             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3516             if (end >= 0) {
3517                 int32_t  endUTF8;
3518                 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3519                     errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
3520                     failed = TRUE;
3521                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3522                 }
3523                 setInt(groupEndsUTF8, endUTF8, i);
3524             }
3525         }
3526     }
3527 
3528     if (regionStart>=0) {
3529        matcher->region(regionStart, regionEnd, status);
3530        REGEX_CHECK_STATUS_L(line);
3531        if (UTF8Matcher != NULL) {
3532            UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3533            REGEX_CHECK_STATUS_L(line);
3534        }
3535     }
3536     if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
3537         matcher->useAnchoringBounds(FALSE);
3538         if (UTF8Matcher != NULL) {
3539             UTF8Matcher->useAnchoringBounds(FALSE);
3540         }
3541     }
3542     if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
3543         matcher->useTransparentBounds(TRUE);
3544         if (UTF8Matcher != NULL) {
3545             UTF8Matcher->useTransparentBounds(TRUE);
3546         }
3547     }
3548 
3549 
3550 
3551     //
3552     // Do a find on the de-tagged input using the caller's pattern
3553     //     TODO: error on count>1 and not find().
3554     //           error on both matches() and lookingAt().
3555     //
3556     for (i=0; i<numFinds; i++) {
3557         if (useMatchesFunc) {
3558             isMatch = matcher->matches(status);
3559             if (UTF8Matcher != NULL) {
3560                isUTF8Match = UTF8Matcher->matches(status);
3561             }
3562         } else  if (useLookingAtFunc) {
3563             isMatch = matcher->lookingAt(status);
3564             if (UTF8Matcher != NULL) {
3565                 isUTF8Match = UTF8Matcher->lookingAt(status);
3566             }
3567         } else {
3568             isMatch = matcher->find();
3569             if (UTF8Matcher != NULL) {
3570                 isUTF8Match = UTF8Matcher->find();
3571             }
3572         }
3573     }
3574     matcher->setTrace(FALSE);
3575 
3576     //
3577     // Match up the groups from the find() with the groups from the tags
3578     //
3579 
3580     // number of tags should match number of groups from find operation.
3581     // matcher->groupCount does not include group 0, the entire match, hence the +1.
3582     //   G option in test means that capture group data is not available in the
3583     //     expected results, so the check needs to be suppressed.
3584     if (isMatch == FALSE && groupStarts.size() != 0) {
3585         dataerrln("Error at line %d:  Match expected, but none found.", line);
3586         failed = TRUE;
3587         goto cleanupAndReturn;
3588     } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3589         errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
3590         failed = TRUE;
3591         goto cleanupAndReturn;
3592     }
3593 
3594     if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3595         // Only check for match / no match.  Don't check capture groups.
3596         if (isMatch && groupStarts.size() == 0) {
3597             errln("Error at line %d:  No match expected, but one found.", line);
3598             failed = TRUE;
3599         } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
3600             errln("Error at line %d:  No match expected, but one found. (UTF8)", line);
3601             failed = TRUE;
3602         }
3603         goto cleanupAndReturn;
3604     }
3605 
3606     REGEX_CHECK_STATUS_L(line);
3607     for (i=0; i<=matcher->groupCount(); i++) {
3608         int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3609         int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3610         if (matcher->start(i, status) != expectedStart) {
3611             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
3612                 line, i, expectedStart, matcher->start(i, status));
3613             failed = TRUE;
3614             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3615         } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3616             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
3617                   line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3618             failed = TRUE;
3619             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3620         }
3621 
3622         int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3623         int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3624         if (matcher->end(i, status) != expectedEnd) {
3625             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
3626                 line, i, expectedEnd, matcher->end(i, status));
3627             failed = TRUE;
3628             // Error on end position;  keep going; real error is probably yet to come as group
3629             //   end positions work from end of the input data towards the front.
3630         } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3631             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
3632                   line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3633             failed = TRUE;
3634             // Error on end position;  keep going; real error is probably yet to come as group
3635             //   end positions work from end of the input data towards the front.
3636         }
3637     }
3638     if ( matcher->groupCount()+1 < groupStarts.size()) {
3639         errln("Error at line %d: Expected %d capture groups, found %d.",
3640             line, groupStarts.size()-1, matcher->groupCount());
3641         failed = TRUE;
3642         }
3643     else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3644         errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3645               line, groupStarts.size()-1, UTF8Matcher->groupCount());
3646         failed = TRUE;
3647     }
3648 
3649     if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3650         matcher->requireEnd() == TRUE) {
3651         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
3652         failed = TRUE;
3653     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3654         UTF8Matcher->requireEnd() == TRUE) {
3655         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3656         failed = TRUE;
3657     }
3658 
3659     if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
3660         matcher->requireEnd() == FALSE) {
3661         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
3662         failed = TRUE;
3663     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3664         UTF8Matcher->requireEnd() == FALSE) {
3665         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3666         failed = TRUE;
3667     }
3668 
3669     if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3670         matcher->hitEnd() == TRUE) {
3671         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
3672         failed = TRUE;
3673     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3674                UTF8Matcher->hitEnd() == TRUE) {
3675         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3676         failed = TRUE;
3677     }
3678 
3679     if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3680         matcher->hitEnd() == FALSE) {
3681         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
3682         failed = TRUE;
3683     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3684                UTF8Matcher->hitEnd() == FALSE) {
3685         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3686         failed = TRUE;
3687     }
3688 
3689 
3690 cleanupAndReturn:
3691     if (failed) {
3692         infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
3693             +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
3694         // callerPattern->dump();
3695     }
3696     delete parseMatcher;
3697     delete parsePat;
3698     delete UTF8Matcher;
3699     delete UTF8Pattern;
3700     delete matcher;
3701     delete callerPattern;
3702 
3703     utext_close(&inputText);
3704     delete[] inputChars;
3705     utext_close(&patternText);
3706     delete[] patternChars;
3707     ucnv_close(UTF8Converter);
3708 }
3709 
3710 
3711 
3712 
3713 //---------------------------------------------------------------------------
3714 //
3715 //      Errors     Check for error handling in patterns.
3716 //
3717 //---------------------------------------------------------------------------
Errors()3718 void RegexTest::Errors() {
3719     // \escape sequences that aren't implemented yet.
3720     //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3721 
3722     // Missing close parentheses
3723     REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3724     REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3725     REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3726 
3727     // Extra close paren
3728     REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3729     REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3730     REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3731 
3732     // Look-ahead, Look-behind
3733     //  TODO:  add tests for unbounded length look-behinds.
3734     REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
3735 
3736     // Attempt to use non-default flags
3737     {
3738         UParseError   pe;
3739         UErrorCode    status = U_ZERO_ERROR;
3740         int32_t       flags  = UREGEX_CANON_EQ |
3741                                UREGEX_COMMENTS         | UREGEX_DOTALL   |
3742                                UREGEX_MULTILINE;
3743         RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3744         REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3745         delete pat1;
3746     }
3747 
3748 
3749     // Quantifiers are allowed only after something that can be quantified.
3750     REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3751     REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3752     REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3753 
3754     // Mal-formed {min,max} quantifiers
3755     REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3756     REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3757     REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3758     REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3759     REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3760     REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3761     REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
3762     REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
3763     REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3764 
3765     // Ticket 5389
3766     REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3767 
3768     // Invalid Back Reference \0
3769     //    For ICU 3.8 and earlier
3770     //    For ICU versions newer than 3.8, \0 introduces an octal escape.
3771     //
3772     REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3773 
3774 }
3775 
3776 
3777 //-------------------------------------------------------------------------------
3778 //
3779 //  Read a text data file, convert it to UChars, and return the data
3780 //    in one big UChar * buffer, which the caller must delete.
3781 //
3782 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int32_t & ulen,const char * defEncoding,UErrorCode & status)3783 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3784                                      const char *defEncoding, UErrorCode &status) {
3785     UChar       *retPtr  = NULL;
3786     char        *fileBuf = NULL;
3787     UConverter* conv     = NULL;
3788     FILE        *f       = NULL;
3789 
3790     ulen = 0;
3791     if (U_FAILURE(status)) {
3792         return retPtr;
3793     }
3794 
3795     //
3796     //  Open the file.
3797     //
3798     f = fopen(fileName, "rb");
3799     if (f == 0) {
3800         dataerrln("Error opening test data file %s\n", fileName);
3801         status = U_FILE_ACCESS_ERROR;
3802         return NULL;
3803     }
3804     //
3805     //  Read it in
3806     //
3807     int32_t            fileSize;
3808     int32_t            amt_read;
3809 
3810     fseek( f, 0, SEEK_END);
3811     fileSize = ftell(f);
3812     fileBuf = new char[fileSize];
3813     fseek(f, 0, SEEK_SET);
3814     amt_read = fread(fileBuf, 1, fileSize, f);
3815     if (amt_read != fileSize || fileSize <= 0) {
3816         errln("Error reading test data file.");
3817         goto cleanUpAndReturn;
3818     }
3819 
3820     //
3821     // Look for a Unicode Signature (BOM) on the data just read
3822     //
3823     int32_t        signatureLength;
3824     const char *   fileBufC;
3825     const char*    encoding;
3826 
3827     fileBufC = fileBuf;
3828     encoding = ucnv_detectUnicodeSignature(
3829         fileBuf, fileSize, &signatureLength, &status);
3830     if(encoding!=NULL ){
3831         fileBufC  += signatureLength;
3832         fileSize  -= signatureLength;
3833     } else {
3834         encoding = defEncoding;
3835         if (strcmp(encoding, "utf-8") == 0) {
3836             errln("file %s is missing its BOM", fileName);
3837         }
3838     }
3839 
3840     //
3841     // Open a converter to take the rule file to UTF-16
3842     //
3843     conv = ucnv_open(encoding, &status);
3844     if (U_FAILURE(status)) {
3845         goto cleanUpAndReturn;
3846     }
3847 
3848     //
3849     // Convert the rules to UChar.
3850     //  Preflight first to determine required buffer size.
3851     //
3852     ulen = ucnv_toUChars(conv,
3853         NULL,           //  dest,
3854         0,              //  destCapacity,
3855         fileBufC,
3856         fileSize,
3857         &status);
3858     if (status == U_BUFFER_OVERFLOW_ERROR) {
3859         // Buffer Overflow is expected from the preflight operation.
3860         status = U_ZERO_ERROR;
3861 
3862         retPtr = new UChar[ulen+1];
3863         ucnv_toUChars(conv,
3864             retPtr,       //  dest,
3865             ulen+1,
3866             fileBufC,
3867             fileSize,
3868             &status);
3869     }
3870 
3871 cleanUpAndReturn:
3872     fclose(f);
3873     delete[] fileBuf;
3874     ucnv_close(conv);
3875     if (U_FAILURE(status)) {
3876         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3877         delete []retPtr;
3878         retPtr = 0;
3879         ulen   = 0;
3880     };
3881     return retPtr;
3882 }
3883 
3884 
3885 //-------------------------------------------------------------------------------
3886 //
3887 //   PerlTests  - Run Perl's regular expression tests
3888 //                The input file for this test is re_tests, the standard regular
3889 //                expression test data distributed with the Perl source code.
3890 //
3891 //                Here is Perl's description of the test data file:
3892 //
3893 //        # The tests are in a separate file 't/op/re_tests'.
3894 //        # Each line in that file is a separate test.
3895 //        # There are five columns, separated by tabs.
3896 //        #
3897 //        # Column 1 contains the pattern, optionally enclosed in C<''>.
3898 //        # Modifiers can be put after the closing C<'>.
3899 //        #
3900 //        # Column 2 contains the string to be matched.
3901 //        #
3902 //        # Column 3 contains the expected result:
3903 //        #     y   expect a match
3904 //        #     n   expect no match
3905 //        #     c   expect an error
3906 //        # B   test exposes a known bug in Perl, should be skipped
3907 //        # b   test exposes a known bug in Perl, should be skipped if noamp
3908 //        #
3909 //        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3910 //        #
3911 //        # Column 4 contains a string, usually C<$&>.
3912 //        #
3913 //        # Column 5 contains the expected result of double-quote
3914 //        # interpolating that string after the match, or start of error message.
3915 //        #
3916 //        # Column 6, if present, contains a reason why the test is skipped.
3917 //        # This is printed with "skipped", for harness to pick up.
3918 //        #
3919 //        # \n in the tests are interpolated, as are variables of the form ${\w+}.
3920 //        #
3921 //        # If you want to add a regular expression test that can't be expressed
3922 //        # in this format, don't add it here: put it in op/pat.t instead.
3923 //
3924 //        For ICU, if field 3 contains an 'i', the test will be skipped.
3925 //        The test exposes some known incompatibility between ICU and Perl regexps.
3926 //        (The i is in addition to whatever was there before.)
3927 //
3928 //-------------------------------------------------------------------------------
PerlTests()3929 void RegexTest::PerlTests() {
3930     char tdd[2048];
3931     const char *srcPath;
3932     UErrorCode  status = U_ZERO_ERROR;
3933     UParseError pe;
3934 
3935     //
3936     //  Open and read the test data file.
3937     //
3938     srcPath=getPath(tdd, "re_tests.txt");
3939     if(srcPath==NULL) {
3940         return; /* something went wrong, error already output */
3941     }
3942 
3943     int32_t    len;
3944     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3945     if (U_FAILURE(status)) {
3946         return; /* something went wrong, error already output */
3947     }
3948 
3949     //
3950     //  Put the test data into a UnicodeString
3951     //
3952     UnicodeString testDataString(FALSE, testData, len);
3953 
3954     //
3955     //  Regex to break the input file into lines, and strip the new lines.
3956     //     One line per match, capture group one is the desired data.
3957     //
3958     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
3959     if (U_FAILURE(status)) {
3960         dataerrln("RegexPattern::compile() error");
3961         return;
3962     }
3963     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
3964 
3965     //
3966     //  Regex to split a test file line into fields.
3967     //    There are six fields, separated by tabs.
3968     //
3969     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
3970 
3971     //
3972     //  Regex to identify test patterns with flag settings, and to separate them.
3973     //    Test patterns with flags look like 'pattern'i
3974     //    Test patterns without flags are not quoted:   pattern
3975     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
3976     //
3977     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
3978     RegexMatcher* flagMat = flagPat->matcher(status);
3979 
3980     //
3981     // The Perl tests reference several perl-isms, which are evaluated/substituted
3982     //   in the test data.  Not being perl, this must be done explicitly.  Here
3983     //   are string constants and REs for these constructs.
3984     //
3985     UnicodeString nulnulSrc("${nulnul}");
3986     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
3987     nulnul = nulnul.unescape();
3988 
3989     UnicodeString ffffSrc("${ffff}");
3990     UnicodeString ffff("\\uffff", -1, US_INV);
3991     ffff = ffff.unescape();
3992 
3993     //  regexp for $-[0], $+[2], etc.
3994     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
3995     RegexMatcher *groupsMat = groupsPat->matcher(status);
3996 
3997     //  regexp for $0, $1, $2, etc.
3998     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
3999     RegexMatcher *cgMat = cgPat->matcher(status);
4000 
4001 
4002     //
4003     // Main Loop for the Perl Tests, runs once per line from the
4004     //   test data file.
4005     //
4006     int32_t  lineNum = 0;
4007     int32_t  skippedUnimplementedCount = 0;
4008     while (lineMat->find()) {
4009         lineNum++;
4010 
4011         //
4012         //  Get a line, break it into its fields, do the Perl
4013         //    variable substitutions.
4014         //
4015         UnicodeString line = lineMat->group(1, status);
4016         UnicodeString fields[7];
4017         fieldPat->split(line, fields, 7, status);
4018 
4019         flagMat->reset(fields[0]);
4020         flagMat->matches(status);
4021         UnicodeString pattern  = flagMat->group(2, status);
4022         pattern.findAndReplace("${bang}", "!");
4023         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4024         pattern.findAndReplace(ffffSrc, ffff);
4025 
4026         //
4027         //  Identify patterns that include match flag settings,
4028         //    split off the flags, remove the extra quotes.
4029         //
4030         UnicodeString flagStr = flagMat->group(3, status);
4031         if (U_FAILURE(status)) {
4032             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4033             return;
4034         }
4035         int32_t flags = 0;
4036         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4037         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4038         const UChar UChar_m = 0x6d;
4039         const UChar UChar_x = 0x78;
4040         const UChar UChar_y = 0x79;
4041         if (flagStr.indexOf(UChar_i) != -1) {
4042             flags |= UREGEX_CASE_INSENSITIVE;
4043         }
4044         if (flagStr.indexOf(UChar_m) != -1) {
4045             flags |= UREGEX_MULTILINE;
4046         }
4047         if (flagStr.indexOf(UChar_x) != -1) {
4048             flags |= UREGEX_COMMENTS;
4049         }
4050 
4051         //
4052         // Compile the test pattern.
4053         //
4054         status = U_ZERO_ERROR;
4055         RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4056         if (status == U_REGEX_UNIMPLEMENTED) {
4057             //
4058             // Test of a feature that is planned for ICU, but not yet implemented.
4059             //   skip the test.
4060             skippedUnimplementedCount++;
4061             delete testPat;
4062             status = U_ZERO_ERROR;
4063             continue;
4064         }
4065 
4066         if (U_FAILURE(status)) {
4067             // Some tests are supposed to generate errors.
4068             //   Only report an error for tests that are supposed to succeed.
4069             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4070                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4071             {
4072                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4073             }
4074             status = U_ZERO_ERROR;
4075             delete testPat;
4076             continue;
4077         }
4078 
4079         if (fields[2].indexOf(UChar_i) >= 0) {
4080             // ICU should skip this test.
4081             delete testPat;
4082             continue;
4083         }
4084 
4085         if (fields[2].indexOf(UChar_c) >= 0) {
4086             // This pattern should have caused a compilation error, but didn't/
4087             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4088             delete testPat;
4089             continue;
4090         }
4091 
4092         //
4093         // replace the Perl variables that appear in some of the
4094         //   match data strings.
4095         //
4096         UnicodeString matchString = fields[1];
4097         matchString.findAndReplace(nulnulSrc, nulnul);
4098         matchString.findAndReplace(ffffSrc,   ffff);
4099 
4100         // Replace any \n in the match string with an actual new-line char.
4101         //  Don't do full unescape, as this unescapes more than Perl does, which
4102         //  causes other spurious failures in the tests.
4103         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4104 
4105 
4106 
4107         //
4108         // Run the test, check for expected match/don't match result.
4109         //
4110         RegexMatcher *testMat = testPat->matcher(matchString, status);
4111         UBool found = testMat->find();
4112         UBool expected = FALSE;
4113         if (fields[2].indexOf(UChar_y) >=0) {
4114             expected = TRUE;
4115         }
4116         if (expected != found) {
4117             errln("line %d: Expected %smatch, got %smatch",
4118                 lineNum, expected?"":"no ", found?"":"no " );
4119             continue;
4120         }
4121 
4122         // Don't try to check expected results if there is no match.
4123         //   (Some have stuff in the expected fields)
4124         if (!found) {
4125             delete testMat;
4126             delete testPat;
4127             continue;
4128         }
4129 
4130         //
4131         // Interpret the Perl expression from the fourth field of the data file,
4132         // building up an ICU string from the results of the ICU match.
4133         //   The Perl expression will contain references to the results of
4134         //     a regex match, including the matched string, capture group strings,
4135         //     group starting and ending indicies, etc.
4136         //
4137         UnicodeString resultString;
4138         UnicodeString perlExpr = fields[3];
4139 #if SUPPORT_MUTATING_INPUT_STRING
4140         groupsMat->reset(perlExpr);
4141         cgMat->reset(perlExpr);
4142 #endif
4143 
4144         while (perlExpr.length() > 0) {
4145 #if !SUPPORT_MUTATING_INPUT_STRING
4146             //  Perferred usage.  Reset after any modification to input string.
4147             groupsMat->reset(perlExpr);
4148             cgMat->reset(perlExpr);
4149 #endif
4150 
4151             if (perlExpr.startsWith("$&")) {
4152                 resultString.append(testMat->group(status));
4153                 perlExpr.remove(0, 2);
4154             }
4155 
4156             else if (groupsMat->lookingAt(status)) {
4157                 // $-[0]   $+[2]  etc.
4158                 UnicodeString digitString = groupsMat->group(2, status);
4159                 int32_t t = 0;
4160                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4161                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4162                 int32_t matchPosition;
4163                 if (plusOrMinus.compare("+") == 0) {
4164                     matchPosition = testMat->end(groupNum, status);
4165                 } else {
4166                     matchPosition = testMat->start(groupNum, status);
4167                 }
4168                 if (matchPosition != -1) {
4169                     ICU_Utility::appendNumber(resultString, matchPosition);
4170                 }
4171                 perlExpr.remove(0, groupsMat->end(status));
4172             }
4173 
4174             else if (cgMat->lookingAt(status)) {
4175                 // $1, $2, $3, etc.
4176                 UnicodeString digitString = cgMat->group(1, status);
4177                 int32_t t = 0;
4178                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4179                 if (U_SUCCESS(status)) {
4180                     resultString.append(testMat->group(groupNum, status));
4181                     status = U_ZERO_ERROR;
4182                 }
4183                 perlExpr.remove(0, cgMat->end(status));
4184             }
4185 
4186             else if (perlExpr.startsWith("@-")) {
4187                 int32_t i;
4188                 for (i=0; i<=testMat->groupCount(); i++) {
4189                     if (i>0) {
4190                         resultString.append(" ");
4191                     }
4192                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4193                 }
4194                 perlExpr.remove(0, 2);
4195             }
4196 
4197             else if (perlExpr.startsWith("@+")) {
4198                 int32_t i;
4199                 for (i=0; i<=testMat->groupCount(); i++) {
4200                     if (i>0) {
4201                         resultString.append(" ");
4202                     }
4203                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4204                 }
4205                 perlExpr.remove(0, 2);
4206             }
4207 
4208             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4209                                                      //           or as an escaped sequence (e.g. \n)
4210                 if (perlExpr.length() > 1) {
4211                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4212                 }
4213                 UChar c = perlExpr.charAt(0);
4214                 switch (c) {
4215                 case 'n':   c = '\n'; break;
4216                 // add any other escape sequences that show up in the test expected results.
4217                 }
4218                 resultString.append(c);
4219                 perlExpr.remove(0, 1);
4220             }
4221 
4222             else  {
4223                 // Any characters from the perl expression that we don't explicitly
4224                 //  recognize before here are assumed to be literals and copied
4225                 //  as-is to the expected results.
4226                 resultString.append(perlExpr.charAt(0));
4227                 perlExpr.remove(0, 1);
4228             }
4229 
4230             if (U_FAILURE(status)) {
4231                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4232                 break;
4233             }
4234         }
4235 
4236         //
4237         // Expected Results Compare
4238         //
4239         UnicodeString expectedS(fields[4]);
4240         expectedS.findAndReplace(nulnulSrc, nulnul);
4241         expectedS.findAndReplace(ffffSrc,   ffff);
4242         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4243 
4244 
4245         if (expectedS.compare(resultString) != 0) {
4246             err("Line %d: Incorrect perl expression results.", lineNum);
4247             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4248         }
4249 
4250         delete testMat;
4251         delete testPat;
4252     }
4253 
4254     //
4255     // All done.  Clean up allocated stuff.
4256     //
4257     delete cgMat;
4258     delete cgPat;
4259 
4260     delete groupsMat;
4261     delete groupsPat;
4262 
4263     delete flagMat;
4264     delete flagPat;
4265 
4266     delete lineMat;
4267     delete linePat;
4268 
4269     delete fieldPat;
4270     delete [] testData;
4271 
4272 
4273     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4274 
4275 }
4276 
4277 
4278 //-------------------------------------------------------------------------------
4279 //
4280 //   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
4281 //                  (instead of using UnicodeStrings) to test the alternate engine.
4282 //                  The input file for this test is re_tests, the standard regular
4283 //                  expression test data distributed with the Perl source code.
4284 //                  See PerlTests() for more information.
4285 //
4286 //-------------------------------------------------------------------------------
PerlTestsUTF8()4287 void RegexTest::PerlTestsUTF8() {
4288     char tdd[2048];
4289     const char *srcPath;
4290     UErrorCode  status = U_ZERO_ERROR;
4291     UParseError pe;
4292     LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4293     UText       patternText = UTEXT_INITIALIZER;
4294     char       *patternChars = NULL;
4295     int32_t     patternLength;
4296     int32_t     patternCapacity = 0;
4297     UText       inputText = UTEXT_INITIALIZER;
4298     char       *inputChars = NULL;
4299     int32_t     inputLength;
4300     int32_t     inputCapacity = 0;
4301 
4302     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4303 
4304     //
4305     //  Open and read the test data file.
4306     //
4307     srcPath=getPath(tdd, "re_tests.txt");
4308     if(srcPath==NULL) {
4309         return; /* something went wrong, error already output */
4310     }
4311 
4312     int32_t    len;
4313     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4314     if (U_FAILURE(status)) {
4315         return; /* something went wrong, error already output */
4316     }
4317 
4318     //
4319     //  Put the test data into a UnicodeString
4320     //
4321     UnicodeString testDataString(FALSE, testData, len);
4322 
4323     //
4324     //  Regex to break the input file into lines, and strip the new lines.
4325     //     One line per match, capture group one is the desired data.
4326     //
4327     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4328     if (U_FAILURE(status)) {
4329         dataerrln("RegexPattern::compile() error");
4330         return;
4331     }
4332     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4333 
4334     //
4335     //  Regex to split a test file line into fields.
4336     //    There are six fields, separated by tabs.
4337     //
4338     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4339 
4340     //
4341     //  Regex to identify test patterns with flag settings, and to separate them.
4342     //    Test patterns with flags look like 'pattern'i
4343     //    Test patterns without flags are not quoted:   pattern
4344     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4345     //
4346     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4347     RegexMatcher* flagMat = flagPat->matcher(status);
4348 
4349     //
4350     // The Perl tests reference several perl-isms, which are evaluated/substituted
4351     //   in the test data.  Not being perl, this must be done explicitly.  Here
4352     //   are string constants and REs for these constructs.
4353     //
4354     UnicodeString nulnulSrc("${nulnul}");
4355     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4356     nulnul = nulnul.unescape();
4357 
4358     UnicodeString ffffSrc("${ffff}");
4359     UnicodeString ffff("\\uffff", -1, US_INV);
4360     ffff = ffff.unescape();
4361 
4362     //  regexp for $-[0], $+[2], etc.
4363     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4364     RegexMatcher *groupsMat = groupsPat->matcher(status);
4365 
4366     //  regexp for $0, $1, $2, etc.
4367     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4368     RegexMatcher *cgMat = cgPat->matcher(status);
4369 
4370 
4371     //
4372     // Main Loop for the Perl Tests, runs once per line from the
4373     //   test data file.
4374     //
4375     int32_t  lineNum = 0;
4376     int32_t  skippedUnimplementedCount = 0;
4377     while (lineMat->find()) {
4378         lineNum++;
4379 
4380         //
4381         //  Get a line, break it into its fields, do the Perl
4382         //    variable substitutions.
4383         //
4384         UnicodeString line = lineMat->group(1, status);
4385         UnicodeString fields[7];
4386         fieldPat->split(line, fields, 7, status);
4387 
4388         flagMat->reset(fields[0]);
4389         flagMat->matches(status);
4390         UnicodeString pattern  = flagMat->group(2, status);
4391         pattern.findAndReplace("${bang}", "!");
4392         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4393         pattern.findAndReplace(ffffSrc, ffff);
4394 
4395         //
4396         //  Identify patterns that include match flag settings,
4397         //    split off the flags, remove the extra quotes.
4398         //
4399         UnicodeString flagStr = flagMat->group(3, status);
4400         if (U_FAILURE(status)) {
4401             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4402             return;
4403         }
4404         int32_t flags = 0;
4405         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4406         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4407         const UChar UChar_m = 0x6d;
4408         const UChar UChar_x = 0x78;
4409         const UChar UChar_y = 0x79;
4410         if (flagStr.indexOf(UChar_i) != -1) {
4411             flags |= UREGEX_CASE_INSENSITIVE;
4412         }
4413         if (flagStr.indexOf(UChar_m) != -1) {
4414             flags |= UREGEX_MULTILINE;
4415         }
4416         if (flagStr.indexOf(UChar_x) != -1) {
4417             flags |= UREGEX_COMMENTS;
4418         }
4419 
4420         //
4421         // Put the pattern in a UTF-8 UText
4422         //
4423         status = U_ZERO_ERROR;
4424         patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4425         if (status == U_BUFFER_OVERFLOW_ERROR) {
4426             status = U_ZERO_ERROR;
4427             delete[] patternChars;
4428             patternCapacity = patternLength + 1;
4429             patternChars = new char[patternCapacity];
4430             pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4431         }
4432         utext_openUTF8(&patternText, patternChars, patternLength, &status);
4433 
4434         //
4435         // Compile the test pattern.
4436         //
4437         RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4438         if (status == U_REGEX_UNIMPLEMENTED) {
4439             //
4440             // Test of a feature that is planned for ICU, but not yet implemented.
4441             //   skip the test.
4442             skippedUnimplementedCount++;
4443             delete testPat;
4444             status = U_ZERO_ERROR;
4445             continue;
4446         }
4447 
4448         if (U_FAILURE(status)) {
4449             // Some tests are supposed to generate errors.
4450             //   Only report an error for tests that are supposed to succeed.
4451             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4452                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4453             {
4454                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4455             }
4456             status = U_ZERO_ERROR;
4457             delete testPat;
4458             continue;
4459         }
4460 
4461         if (fields[2].indexOf(UChar_i) >= 0) {
4462             // ICU should skip this test.
4463             delete testPat;
4464             continue;
4465         }
4466 
4467         if (fields[2].indexOf(UChar_c) >= 0) {
4468             // This pattern should have caused a compilation error, but didn't/
4469             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4470             delete testPat;
4471             continue;
4472         }
4473 
4474 
4475         //
4476         // replace the Perl variables that appear in some of the
4477         //   match data strings.
4478         //
4479         UnicodeString matchString = fields[1];
4480         matchString.findAndReplace(nulnulSrc, nulnul);
4481         matchString.findAndReplace(ffffSrc,   ffff);
4482 
4483         // Replace any \n in the match string with an actual new-line char.
4484         //  Don't do full unescape, as this unescapes more than Perl does, which
4485         //  causes other spurious failures in the tests.
4486         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4487 
4488         //
4489         // Put the input in a UTF-8 UText
4490         //
4491         status = U_ZERO_ERROR;
4492         inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4493         if (status == U_BUFFER_OVERFLOW_ERROR) {
4494             status = U_ZERO_ERROR;
4495             delete[] inputChars;
4496             inputCapacity = inputLength + 1;
4497             inputChars = new char[inputCapacity];
4498             matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4499         }
4500         utext_openUTF8(&inputText, inputChars, inputLength, &status);
4501 
4502         //
4503         // Run the test, check for expected match/don't match result.
4504         //
4505         RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4506         UBool found = testMat->find();
4507         UBool expected = FALSE;
4508         if (fields[2].indexOf(UChar_y) >=0) {
4509             expected = TRUE;
4510         }
4511         if (expected != found) {
4512             errln("line %d: Expected %smatch, got %smatch",
4513                 lineNum, expected?"":"no ", found?"":"no " );
4514             continue;
4515         }
4516 
4517         // Don't try to check expected results if there is no match.
4518         //   (Some have stuff in the expected fields)
4519         if (!found) {
4520             delete testMat;
4521             delete testPat;
4522             continue;
4523         }
4524 
4525         //
4526         // Interpret the Perl expression from the fourth field of the data file,
4527         // building up an ICU string from the results of the ICU match.
4528         //   The Perl expression will contain references to the results of
4529         //     a regex match, including the matched string, capture group strings,
4530         //     group starting and ending indicies, etc.
4531         //
4532         UnicodeString resultString;
4533         UnicodeString perlExpr = fields[3];
4534 
4535         while (perlExpr.length() > 0) {
4536             groupsMat->reset(perlExpr);
4537             cgMat->reset(perlExpr);
4538 
4539             if (perlExpr.startsWith("$&")) {
4540                 resultString.append(testMat->group(status));
4541                 perlExpr.remove(0, 2);
4542             }
4543 
4544             else if (groupsMat->lookingAt(status)) {
4545                 // $-[0]   $+[2]  etc.
4546                 UnicodeString digitString = groupsMat->group(2, status);
4547                 int32_t t = 0;
4548                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4549                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4550                 int32_t matchPosition;
4551                 if (plusOrMinus.compare("+") == 0) {
4552                     matchPosition = testMat->end(groupNum, status);
4553                 } else {
4554                     matchPosition = testMat->start(groupNum, status);
4555                 }
4556                 if (matchPosition != -1) {
4557                     ICU_Utility::appendNumber(resultString, matchPosition);
4558                 }
4559                 perlExpr.remove(0, groupsMat->end(status));
4560             }
4561 
4562             else if (cgMat->lookingAt(status)) {
4563                 // $1, $2, $3, etc.
4564                 UnicodeString digitString = cgMat->group(1, status);
4565                 int32_t t = 0;
4566                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4567                 if (U_SUCCESS(status)) {
4568                     resultString.append(testMat->group(groupNum, status));
4569                     status = U_ZERO_ERROR;
4570                 }
4571                 perlExpr.remove(0, cgMat->end(status));
4572             }
4573 
4574             else if (perlExpr.startsWith("@-")) {
4575                 int32_t i;
4576                 for (i=0; i<=testMat->groupCount(); i++) {
4577                     if (i>0) {
4578                         resultString.append(" ");
4579                     }
4580                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4581                 }
4582                 perlExpr.remove(0, 2);
4583             }
4584 
4585             else if (perlExpr.startsWith("@+")) {
4586                 int32_t i;
4587                 for (i=0; i<=testMat->groupCount(); i++) {
4588                     if (i>0) {
4589                         resultString.append(" ");
4590                     }
4591                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4592                 }
4593                 perlExpr.remove(0, 2);
4594             }
4595 
4596             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4597                                                      //           or as an escaped sequence (e.g. \n)
4598                 if (perlExpr.length() > 1) {
4599                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4600                 }
4601                 UChar c = perlExpr.charAt(0);
4602                 switch (c) {
4603                 case 'n':   c = '\n'; break;
4604                 // add any other escape sequences that show up in the test expected results.
4605                 }
4606                 resultString.append(c);
4607                 perlExpr.remove(0, 1);
4608             }
4609 
4610             else  {
4611                 // Any characters from the perl expression that we don't explicitly
4612                 //  recognize before here are assumed to be literals and copied
4613                 //  as-is to the expected results.
4614                 resultString.append(perlExpr.charAt(0));
4615                 perlExpr.remove(0, 1);
4616             }
4617 
4618             if (U_FAILURE(status)) {
4619                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4620                 break;
4621             }
4622         }
4623 
4624         //
4625         // Expected Results Compare
4626         //
4627         UnicodeString expectedS(fields[4]);
4628         expectedS.findAndReplace(nulnulSrc, nulnul);
4629         expectedS.findAndReplace(ffffSrc,   ffff);
4630         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4631 
4632 
4633         if (expectedS.compare(resultString) != 0) {
4634             err("Line %d: Incorrect perl expression results.", lineNum);
4635             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4636         }
4637 
4638         delete testMat;
4639         delete testPat;
4640     }
4641 
4642     //
4643     // All done.  Clean up allocated stuff.
4644     //
4645     delete cgMat;
4646     delete cgPat;
4647 
4648     delete groupsMat;
4649     delete groupsPat;
4650 
4651     delete flagMat;
4652     delete flagPat;
4653 
4654     delete lineMat;
4655     delete linePat;
4656 
4657     delete fieldPat;
4658     delete [] testData;
4659 
4660     utext_close(&patternText);
4661     utext_close(&inputText);
4662 
4663     delete [] patternChars;
4664     delete [] inputChars;
4665 
4666 
4667     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4668 
4669 }
4670 
4671 
4672 //--------------------------------------------------------------
4673 //
4674 //  Bug6149   Verify limits to heap expansion for backtrack stack.
4675 //             Use this pattern,
4676 //                 "(a?){1,}"
4677 //             The zero-length match will repeat forever.
4678 //                (That this goes into a loop is another bug)
4679 //
4680 //---------------------------------------------------------------
Bug6149()4681 void RegexTest::Bug6149() {
4682     UnicodeString pattern("(a?){1,}");
4683     UnicodeString s("xyz");
4684     uint32_t flags = 0;
4685     UErrorCode status = U_ZERO_ERROR;
4686 
4687     RegexMatcher  matcher(pattern, s, flags, status);
4688     UBool result = false;
4689     REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4690     REGEX_ASSERT(result == FALSE);
4691  }
4692 
4693 
4694 //
4695 //   Callbacks()    Test the callback function.
4696 //                  When set, callbacks occur periodically during matching operations,
4697 //                  giving the application code the ability to abort the operation
4698 //                  before its normal completion.
4699 //
4700 
4701 struct callBackContext {
4702     RegexTest        *test;
4703     int32_t          maxCalls;
4704     int32_t          numCalls;
4705     int32_t          lastSteps;
resetcallBackContext4706     void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4707 };
4708 
4709 U_CDECL_BEGIN
4710 static UBool U_CALLCONV
testCallBackFn(const void * context,int32_t steps)4711 testCallBackFn(const void *context, int32_t steps) {
4712     callBackContext  *info = (callBackContext *)context;
4713     if (info->lastSteps+1 != steps) {
4714         info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
4715     }
4716     info->lastSteps = steps;
4717     info->numCalls++;
4718     return (info->numCalls < info->maxCalls);
4719 }
4720 U_CDECL_END
4721 
Callbacks()4722 void RegexTest::Callbacks() {
4723    {
4724         // Getter returns NULLs if no callback has been set
4725 
4726         //   The variables that the getter will fill in.
4727         //   Init to non-null values so that the action of the getter can be seen.
4728         const void          *returnedContext = &returnedContext;
4729         URegexMatchCallback *returnedFn = &testCallBackFn;
4730 
4731         UErrorCode status = U_ZERO_ERROR;
4732         RegexMatcher matcher("x", 0, status);
4733         REGEX_CHECK_STATUS;
4734         matcher.getMatchCallback(returnedFn, returnedContext, status);
4735         REGEX_CHECK_STATUS;
4736         REGEX_ASSERT(returnedFn == NULL);
4737         REGEX_ASSERT(returnedContext == NULL);
4738     }
4739 
4740    {
4741         // Set and Get work
4742         callBackContext cbInfo = {this, 0, 0, 0};
4743         const void          *returnedContext;
4744         URegexMatchCallback *returnedFn;
4745         UErrorCode status = U_ZERO_ERROR;
4746         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
4747         REGEX_CHECK_STATUS;
4748         matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4749         REGEX_CHECK_STATUS;
4750         matcher.getMatchCallback(returnedFn, returnedContext, status);
4751         REGEX_CHECK_STATUS;
4752         REGEX_ASSERT(returnedFn == testCallBackFn);
4753         REGEX_ASSERT(returnedContext == &cbInfo);
4754 
4755         // A short-running match shouldn't invoke the callback
4756         status = U_ZERO_ERROR;
4757         cbInfo.reset(1);
4758         UnicodeString s = "xxx";
4759         matcher.reset(s);
4760         REGEX_ASSERT(matcher.matches(status));
4761         REGEX_CHECK_STATUS;
4762         REGEX_ASSERT(cbInfo.numCalls == 0);
4763 
4764         // A medium-length match that runs long enough to invoke the
4765         //   callback, but not so long that the callback aborts it.
4766         status = U_ZERO_ERROR;
4767         cbInfo.reset(4);
4768         s = "aaaaaaaaaaaaaaaaaaab";
4769         matcher.reset(s);
4770         REGEX_ASSERT(matcher.matches(status)==FALSE);
4771         REGEX_CHECK_STATUS;
4772         REGEX_ASSERT(cbInfo.numCalls > 0);
4773 
4774         // A longer running match that the callback function will abort.
4775         status = U_ZERO_ERROR;
4776         cbInfo.reset(4);
4777         s = "aaaaaaaaaaaaaaaaaaaaaaab";
4778         matcher.reset(s);
4779         REGEX_ASSERT(matcher.matches(status)==FALSE);
4780         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4781         REGEX_ASSERT(cbInfo.numCalls == 4);
4782     }
4783 
4784 
4785 }
4786 
4787 
4788 //
4789 //   FindProgressCallbacks()    Test the find "progress" callback function.
4790 //                  When set, the find progress callback will be invoked during find operations
4791 //                  after each return from a match attempt, giving the application the opportunity
4792 //                  to terminate a long-running find operation before its normal completion.
4793 //
4794 
4795 struct progressCallBackContext {
4796     RegexTest        *test;
4797     int64_t          lastIndex;
4798     int32_t          maxCalls;
4799     int32_t          numCalls;
resetprogressCallBackContext4800     void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4801 };
4802 
4803 U_CDECL_BEGIN
4804 static UBool U_CALLCONV
testProgressCallBackFn(const void * context,int64_t matchIndex)4805 testProgressCallBackFn(const void *context, int64_t matchIndex) {
4806     progressCallBackContext  *info = (progressCallBackContext *)context;
4807     info->numCalls++;
4808     info->lastIndex = matchIndex;
4809 //    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4810     return (info->numCalls < info->maxCalls);
4811 }
4812 U_CDECL_END
4813 
FindProgressCallbacks()4814 void RegexTest::FindProgressCallbacks() {
4815    {
4816         // Getter returns NULLs if no callback has been set
4817 
4818         //   The variables that the getter will fill in.
4819         //   Init to non-null values so that the action of the getter can be seen.
4820         const void                  *returnedContext = &returnedContext;
4821         URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;
4822 
4823         UErrorCode status = U_ZERO_ERROR;
4824         RegexMatcher matcher("x", 0, status);
4825         REGEX_CHECK_STATUS;
4826         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4827         REGEX_CHECK_STATUS;
4828         REGEX_ASSERT(returnedFn == NULL);
4829         REGEX_ASSERT(returnedContext == NULL);
4830     }
4831 
4832    {
4833         // Set and Get work
4834         progressCallBackContext cbInfo = {this, 0, 0, 0};
4835         const void                  *returnedContext;
4836         URegexFindProgressCallback  *returnedFn;
4837         UErrorCode status = U_ZERO_ERROR;
4838         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
4839         REGEX_CHECK_STATUS;
4840         matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4841         REGEX_CHECK_STATUS;
4842         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4843         REGEX_CHECK_STATUS;
4844         REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4845         REGEX_ASSERT(returnedContext == &cbInfo);
4846 
4847         // A short-running match should NOT invoke the callback.
4848         status = U_ZERO_ERROR;
4849         cbInfo.reset(100);
4850         UnicodeString s = "abxxx";
4851         matcher.reset(s);
4852 #if 0
4853         matcher.setTrace(TRUE);
4854 #endif
4855         REGEX_ASSERT(matcher.find(0, status));
4856         REGEX_CHECK_STATUS;
4857         REGEX_ASSERT(cbInfo.numCalls == 0);
4858 
4859         // A medium running match that causes matcher.find() to invoke our callback for each index.
4860         status = U_ZERO_ERROR;
4861         s = "aaaaaaaaaaaaaaaaaaab";
4862         cbInfo.reset(s.length()); //  Some upper limit for number of calls that is greater than size of our input string
4863         matcher.reset(s);
4864         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4865         REGEX_CHECK_STATUS;
4866         REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4867 
4868         // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4869         status = U_ZERO_ERROR;
4870         UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4871         cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
4872         matcher.reset(s1);
4873         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4874         REGEX_CHECK_STATUS;
4875         REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4876 
4877 #if 0
4878         // Now a match that will succeed, but after an interruption
4879         status = U_ZERO_ERROR;
4880         UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4881         cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
4882         matcher.reset(s2);
4883         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4884         REGEX_CHECK_STATUS;
4885         // Now retry the match from where left off
4886         cbInfo.maxCalls = 100; //  No callback limit
4887         REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
4888         REGEX_CHECK_STATUS;
4889 #endif
4890     }
4891 
4892 
4893 }
4894 
4895 
4896 //---------------------------------------------------------------------------
4897 //
4898 //    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
4899 //                             UTexts. The pure-C implementation of UText
4900 //                             has no mutable backing stores, but we can
4901 //                             use UnicodeString here to test the functionality.
4902 //
4903 //---------------------------------------------------------------------------
PreAllocatedUTextCAPI()4904 void RegexTest::PreAllocatedUTextCAPI () {
4905     UErrorCode           status = U_ZERO_ERROR;
4906     URegularExpression  *re;
4907     UText                patternText = UTEXT_INITIALIZER;
4908     UnicodeString        buffer;
4909     UText                bufferText = UTEXT_INITIALIZER;
4910 
4911     utext_openUnicodeString(&bufferText, &buffer, &status);
4912 
4913     /*
4914      *  getText() and getUText()
4915      */
4916     {
4917         UText  text1 = UTEXT_INITIALIZER;
4918         UText  text2 = UTEXT_INITIALIZER;
4919         UChar  text2Chars[20];
4920         UText  *resultText;
4921 
4922         status = U_ZERO_ERROR;
4923         regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
4924         regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
4925         u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
4926         utext_openUChars(&text2, text2Chars, -1, &status);
4927 
4928         regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
4929         re = uregex_openUText(&patternText, 0, NULL, &status);
4930 
4931         /* First set a UText */
4932         uregex_setUText(re, &text1, &status);
4933         resultText = uregex_getUText(re, &bufferText, &status);
4934         REGEX_CHECK_STATUS;
4935         REGEX_ASSERT(resultText == &bufferText);
4936         utext_setNativeIndex(resultText, 0);
4937         utext_setNativeIndex(&text1, 0);
4938         REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
4939 
4940         resultText = uregex_getUText(re, &bufferText, &status);
4941         REGEX_CHECK_STATUS;
4942         REGEX_ASSERT(resultText == &bufferText);
4943         utext_setNativeIndex(resultText, 0);
4944         utext_setNativeIndex(&text1, 0);
4945         REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
4946 
4947         /* Then set a UChar * */
4948         uregex_setText(re, text2Chars, 7, &status);
4949         resultText = uregex_getUText(re, &bufferText, &status);
4950         REGEX_CHECK_STATUS;
4951         REGEX_ASSERT(resultText == &bufferText);
4952         utext_setNativeIndex(resultText, 0);
4953         utext_setNativeIndex(&text2, 0);
4954         REGEX_ASSERT(utext_compare(resultText, -1, &text2, -1) == 0);
4955 
4956         uregex_close(re);
4957         utext_close(&text1);
4958         utext_close(&text2);
4959     }
4960 
4961     /*
4962      *  group()
4963      */
4964     {
4965         UChar    text1[80];
4966         UText   *actual;
4967         UBool    result;
4968         u_uastrncpy(text1, "noise abc interior def, and this is off the end",  sizeof(text1)/2);
4969 
4970         status = U_ZERO_ERROR;
4971         re = uregex_openC("abc(.*?)def", 0, NULL, &status);
4972         REGEX_CHECK_STATUS;
4973 
4974         uregex_setText(re, text1, -1, &status);
4975         result = uregex_find(re, 0, &status);
4976         REGEX_ASSERT(result==TRUE);
4977 
4978         /*  Capture Group 0, the full match.  Should succeed.  */
4979         status = U_ZERO_ERROR;
4980         actual = uregex_groupUTextDeep(re, 0, &bufferText, &status);
4981         REGEX_CHECK_STATUS;
4982         REGEX_ASSERT(actual == &bufferText);
4983         REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual);
4984 
4985         /*  Capture group #1.  Should succeed. */
4986         status = U_ZERO_ERROR;
4987         actual = uregex_groupUTextDeep(re, 1, &bufferText, &status);
4988         REGEX_CHECK_STATUS;
4989         REGEX_ASSERT(actual == &bufferText);
4990         REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual);
4991 
4992         /*  Capture group out of range.  Error. */
4993         status = U_ZERO_ERROR;
4994         actual = uregex_groupUTextDeep(re, 2, &bufferText, &status);
4995         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
4996         REGEX_ASSERT(actual == &bufferText);
4997 
4998         uregex_close(re);
4999 
5000     }
5001 
5002     /*
5003      *  replaceFirst()
5004      */
5005     {
5006         UChar    text1[80];
5007         UChar    text2[80];
5008         UText    replText = UTEXT_INITIALIZER;
5009         UText   *result;
5010 
5011         status = U_ZERO_ERROR;
5012         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
5013         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
5014         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5015 
5016         re = uregex_openC("x(.*?)x", 0, NULL, &status);
5017         REGEX_CHECK_STATUS;
5018 
5019         /*  Normal case, with match */
5020         uregex_setText(re, text1, -1, &status);
5021         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5022         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5023         REGEX_CHECK_STATUS;
5024         REGEX_ASSERT(result == &bufferText);
5025         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5026 
5027         /* No match.  Text should copy to output with no changes.  */
5028         uregex_setText(re, text2, -1, &status);
5029         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5030         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5031         REGEX_CHECK_STATUS;
5032         REGEX_ASSERT(result == &bufferText);
5033         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5034 
5035         /* Unicode escapes */
5036         uregex_setText(re, text1, -1, &status);
5037         regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status);
5038         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5039         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5040         REGEX_CHECK_STATUS;
5041         REGEX_ASSERT(result == &bufferText);
5042         REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5043 
5044         uregex_close(re);
5045         utext_close(&replText);
5046     }
5047 
5048 
5049     /*
5050      *  replaceAll()
5051      */
5052     {
5053         UChar    text1[80];
5054         UChar    text2[80];
5055         UText    replText = UTEXT_INITIALIZER;
5056         UText   *result;
5057 
5058         status = U_ZERO_ERROR;
5059         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
5060         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
5061         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5062 
5063         re = uregex_openC("x(.*?)x", 0, NULL, &status);
5064         REGEX_CHECK_STATUS;
5065 
5066         /*  Normal case, with match */
5067         uregex_setText(re, text1, -1, &status);
5068         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5069         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5070         REGEX_CHECK_STATUS;
5071         REGEX_ASSERT(result == &bufferText);
5072         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5073 
5074         /* No match.  Text should copy to output with no changes.  */
5075         uregex_setText(re, text2, -1, &status);
5076         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5077         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5078         REGEX_CHECK_STATUS;
5079         REGEX_ASSERT(result == &bufferText);
5080         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5081 
5082         uregex_close(re);
5083         utext_close(&replText);
5084     }
5085 
5086 
5087     /*
5088      *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5089      *   so we don't need to test it here.
5090      */
5091 
5092     utext_close(&bufferText);
5093     utext_close(&patternText);
5094 }
5095 
5096 //--------------------------------------------------------------
5097 //
5098 //  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
5099 //
5100 //---------------------------------------------------------------
Bug7651()5101 void RegexTest::Bug7651() {
5102     UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5103     //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5104     //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5105     UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5106     UnicodeString s("#ff @abcd This is test");
5107     RegexPattern  *REPattern = NULL;
5108     RegexMatcher  *REMatcher = NULL;
5109     UErrorCode status = U_ZERO_ERROR;
5110     UParseError pe;
5111 
5112     REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5113     REGEX_CHECK_STATUS;
5114     REMatcher = REPattern->matcher(s, status);
5115     REGEX_CHECK_STATUS;
5116     REGEX_ASSERT(REMatcher->find());
5117     REGEX_ASSERT(REMatcher->start(status) == 0);
5118     delete REPattern;
5119     delete REMatcher;
5120     status = U_ZERO_ERROR;
5121 
5122     REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5123     REGEX_CHECK_STATUS;
5124     REMatcher = REPattern->matcher(s, status);
5125     REGEX_CHECK_STATUS;
5126     REGEX_ASSERT(REMatcher->find());
5127     REGEX_ASSERT(REMatcher->start(status) == 0);
5128     delete REPattern;
5129     delete REMatcher;
5130     status = U_ZERO_ERROR;
5131  }
5132 
Bug7740()5133 void RegexTest::Bug7740() {
5134     UErrorCode status = U_ZERO_ERROR;
5135     UnicodeString pattern = "(a)";
5136     UnicodeString text = "abcdef";
5137     RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5138     REGEX_CHECK_STATUS;
5139     REGEX_ASSERT(m->lookingAt(status));
5140     REGEX_CHECK_STATUS;
5141     status = U_ILLEGAL_ARGUMENT_ERROR;
5142     UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
5143     REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5144     REGEX_ASSERT(s == "");
5145     delete m;
5146 }
5147 
5148 // Bug 8479:  was crashing whith a Bogus UnicodeString as input.
5149 
Bug8479()5150 void RegexTest::Bug8479() {
5151     UErrorCode status = U_ZERO_ERROR;
5152 
5153     RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5154     REGEX_CHECK_STATUS;
5155     if (U_SUCCESS(status))
5156     {
5157         UnicodeString str;
5158         str.setToBogus();
5159         pMatcher->reset(str);
5160         status = U_ZERO_ERROR;
5161         pMatcher->matches(status);
5162         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5163         delete pMatcher;
5164     }
5165 }
5166 
5167 
5168 // Bug 7029
Bug7029()5169 void RegexTest::Bug7029() {
5170     UErrorCode status = U_ZERO_ERROR;
5171 
5172     RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5173     UnicodeString text = "abc.def";
5174     UnicodeString splits[10];
5175     REGEX_CHECK_STATUS;
5176     int32_t numFields = pMatcher->split(text, splits, 10, status);
5177     REGEX_CHECK_STATUS;
5178     REGEX_ASSERT(numFields == 8);
5179     delete pMatcher;
5180 }
5181 
CheckInvBufSize()5182 void RegexTest::CheckInvBufSize() {
5183   if(inv_next>=INV_BUFSIZ) {
5184     errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5185           __FILE__, INV_BUFSIZ, inv_next);
5186   } else {
5187     logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5188   }
5189 }
5190 
5191 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
5192 
5193