• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4  * COPYRIGHT:
5  * Copyright (c) 2002-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  ********************************************************************/
8 
9 //
10 //   regextst.cpp
11 //
12 //      ICU Regular Expressions test, part of intltest.
13 //
14 
15 /*
16      NOTE!!
17 
18      PLEASE be careful about ASCII assumptions in this test.
19      This test is one of the worst repeat offenders.
20      If you have questions, contact someone on the ICU PMC
21      who has access to an EBCDIC system.
22 
23  */
24 
25 #include "intltest.h"
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
27 
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <string.h>
31 
32 #include "unicode/localpointer.h"
33 #include "unicode/regex.h"
34 #include "unicode/uchar.h"
35 #include "unicode/ucnv.h"
36 #include "unicode/uniset.h"
37 #include "unicode/uregex.h"
38 #include "unicode/usetiter.h"
39 #include "unicode/ustring.h"
40 #include "unicode/utext.h"
41 #include "unicode/utf16.h"
42 #include "cstr.h"
43 #include "regextst.h"
44 #include "regexcmp.h"
45 #include "uvector.h"
46 #include "util.h"
47 #include "cmemory.h"
48 #include "cstring.h"
49 #include "uinvchar.h"
50 
51 #define SUPPORT_MUTATING_INPUT_STRING   0
52 
53 //---------------------------------------------------------------------------
54 //
55 //  Test class boilerplate
56 //
57 //---------------------------------------------------------------------------
RegexTest()58 RegexTest::RegexTest()
59 {
60 }
61 
62 
~RegexTest()63 RegexTest::~RegexTest()
64 {
65 }
66 
67 
68 
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)69 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
70 {
71     if (exec) logln("TestSuite RegexTest: ");
72     TESTCASE_AUTO_BEGIN;
73     TESTCASE_AUTO(Basic);
74     TESTCASE_AUTO(API_Match);
75     TESTCASE_AUTO(API_Replace);
76     TESTCASE_AUTO(API_Pattern);
77 #if !UCONFIG_NO_FILE_IO
78     TESTCASE_AUTO(Extended);
79 #endif
80     TESTCASE_AUTO(Errors);
81     TESTCASE_AUTO(PerlTests);
82     TESTCASE_AUTO(Callbacks);
83     TESTCASE_AUTO(FindProgressCallbacks);
84     TESTCASE_AUTO(Bug6149);
85     TESTCASE_AUTO(UTextBasic);
86     TESTCASE_AUTO(API_Match_UTF8);
87     TESTCASE_AUTO(API_Replace_UTF8);
88     TESTCASE_AUTO(API_Pattern_UTF8);
89     TESTCASE_AUTO(PerlTestsUTF8);
90     TESTCASE_AUTO(PreAllocatedUTextCAPI);
91     TESTCASE_AUTO(Bug7651);
92     TESTCASE_AUTO(Bug7740);
93     TESTCASE_AUTO(Bug8479);
94     TESTCASE_AUTO(Bug7029);
95     TESTCASE_AUTO(CheckInvBufSize);
96     TESTCASE_AUTO(Bug9283);
97     TESTCASE_AUTO(Bug10459);
98     TESTCASE_AUTO(TestCaseInsensitiveStarters);
99     TESTCASE_AUTO(TestBug11049);
100     TESTCASE_AUTO(TestBug11371);
101     TESTCASE_AUTO(TestBug11480);
102     TESTCASE_AUTO(NamedCapture);
103     TESTCASE_AUTO(NamedCaptureLimits);
104     TESTCASE_AUTO(TestBug12884);
105     TESTCASE_AUTO(TestBug13631);
106     TESTCASE_AUTO_END;
107 }
108 
109 
110 /**
111  * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
112  * into ASCII.
113  * @see utext_openUTF8
114  */
115 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
116 
117 //---------------------------------------------------------------------------
118 //
119 //   Error Checking / Reporting macros used in all of the tests.
120 //
121 //---------------------------------------------------------------------------
122 
utextToPrintable(char * buf,int32_t bufLen,UText * text)123 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
124   int64_t oldIndex = utext_getNativeIndex(text);
125   utext_setNativeIndex(text, 0);
126   char *bufPtr = buf;
127   UChar32 c = utext_next32From(text, 0);
128   while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
129     if (0x000020<=c && c<0x00007e) {
130       *bufPtr = c;
131     } else {
132 #if 0
133       sprintf(bufPtr,"U+%04X", c);
134       bufPtr+= strlen(bufPtr)-1;
135 #else
136       *bufPtr = '%';
137 #endif
138     }
139     bufPtr++;
140     c = UTEXT_NEXT32(text);
141   }
142   *bufPtr = 0;
143 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
144   char *ebuf = (char*)malloc(bufLen);
145   uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
146   uprv_strncpy(buf, ebuf, bufLen);
147   free((void*)ebuf);
148 #endif
149   utext_setNativeIndex(text, oldIndex);
150 }
151 
152 
153 static char ASSERT_BUF[1024];
154 
extractToAssertBuf(const UnicodeString & message)155 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
156   if(message.length()==0) {
157     strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
158   } else {
159     UnicodeString buf;
160     IntlTest::prettify(message,buf);
161     if(buf.length()==0) {
162       strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
163     } else {
164       buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
165       if(ASSERT_BUF[0]==0) {
166         ASSERT_BUF[0]=0;
167         for(int32_t i=0;i<buf.length();i++) {
168           UChar ch = buf[i];
169           sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
170         }
171       }
172     }
173   }
174   ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
175   return ASSERT_BUF;
176 }
177 
// Logs the printable form of a UText, tagged with the source location and the
// spelling of the argument expression.
#define REGEX_VERBOSE_TEXT(text) { \
    char buf[200]; \
    utextToPrintable(buf,UPRV_LENGTHOF(buf),text); \
    logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf); \
}

// Reports a failure and returns from the enclosing (void) function when the
// local variable `status` holds a failure code.
#define REGEX_CHECK_STATUS { \
    if (U_FAILURE(status)) { \
        dataerrln("%s:%d: RegexTest failure.  status=%s", \
                  __FILE__, __LINE__, u_errorName(status)); \
        return; \
    } \
}

// Reports a failure when expr evaluates to FALSE; execution continues.
#define REGEX_ASSERT(expr) { \
    if ((expr)==FALSE) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr); \
    } \
}

// Evaluates expr against a fresh, block-local `status` and reports a failure
// unless that status ends up equal to errcode.
#define REGEX_ASSERT_FAIL(expr, errcode) { \
    UErrorCode status=U_ZERO_ERROR; \
    (expr); \
    if (status!=errcode) { \
        dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
                  __LINE__, u_errorName(errcode), u_errorName(status)); \
    } \
}

// Like REGEX_CHECK_STATUS but also prints the caller-supplied line, and does
// not return.
#define REGEX_CHECK_STATUS_L(line) { \
    if (U_FAILURE(status)) { \
        errln("RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); \
    } \
}

// Like REGEX_ASSERT but prints the caller-supplied line and returns from the
// enclosing (void) function on failure.
#define REGEX_ASSERT_L(expr, line) { \
    if ((expr)==FALSE) { \
        errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
        return; \
    } \
}

// expected: const char * , restricted to invariant characters.
// actual: const UnicodeString &
#define REGEX_ASSERT_UNISTR(expected, actual) { \
    if (UnicodeString(expected, -1, US_INV) != (actual)) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \
              __FILE__, __LINE__, expected, extractToAssertBuf(actual)); \
    } \
}
201 
202 
testUTextEqual(UText * uta,UText * utb)203 static UBool testUTextEqual(UText *uta, UText *utb) {
204     UChar32 ca = 0;
205     UChar32 cb = 0;
206     utext_setNativeIndex(uta, 0);
207     utext_setNativeIndex(utb, 0);
208     do {
209         ca = utext_next32(uta);
210         cb = utext_next32(utb);
211         if (ca != cb) {
212             break;
213         }
214     } while (ca != U_SENTINEL);
215     return ca == cb;
216 }
217 
218 
219 /**
220  * @param expected expected text in UTF-8 (not platform) codepage
221  */
assertUText(const char * expected,UText * actual,const char * file,int line)222 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
223     UErrorCode status = U_ZERO_ERROR;
224     UText expectedText = UTEXT_INITIALIZER;
225     utext_openUTF8(&expectedText, expected, -1, &status);
226     if(U_FAILURE(status)) {
227       errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
228       return;
229     }
230     if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
231       errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
232       return;
233     }
234     utext_setNativeIndex(actual, 0);
235     if (!testUTextEqual(&expectedText, actual)) {
236         char buf[201 /*21*/];
237         char expectedBuf[201];
238         utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
239         utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
240         errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
241     }
242     utext_close(&expectedText);
243 }
244 /**
245  * @param expected invariant (platform local text) input
246  */
247 
assertUTextInvariant(const char * expected,UText * actual,const char * file,int line)248 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
249     UErrorCode status = U_ZERO_ERROR;
250     UText expectedText = UTEXT_INITIALIZER;
251     regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
252     if(U_FAILURE(status)) {
253       errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
254       return;
255     }
256     utext_setNativeIndex(actual, 0);
257     if (!testUTextEqual(&expectedText, actual)) {
258         char buf[201 /*21*/];
259         char expectedBuf[201];
260         utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
261         utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
262         errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
263     }
264     utext_close(&expectedText);
265 }
266 
/**
 * Checks a UText against expected text.  Assumes utf-8 input.
 * Forwards to assertUText() with the location of the call site.
 */
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) \
    assertUText((expected), (actual), __FILE__, __LINE__)

/**
 * Checks a UText against expected text.  Assumes Invariant input.
 * Forwards to assertUTextInvariant() with the location of the call site.
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) \
    assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
275 
276 /**
277  * This buffer ( inv_buf ) is used to hold the UTF-8 strings
278  * passed into utext_openUTF8. An error will be given if
279  * INV_BUFSIZ is too small.  It's only used on EBCDIC systems.
280  */
281 
282 #define INV_BUFSIZ 2048 /* increase this if too small */
283 
284 static int64_t inv_next=0;
285 
286 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
287 static char inv_buf[INV_BUFSIZ];
288 #endif
289 
regextst_openUTF8FromInvariant(UText * ut,const char * inv,int64_t length,UErrorCode * status)290 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
291   if(length==-1) length=strlen(inv);
292 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
293   inv_next+=length;
294   return utext_openUTF8(ut, inv, length, status);
295 #else
296   if(inv_next+length+1>INV_BUFSIZ) {
297     fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
298             __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
299     *status = U_MEMORY_ALLOCATION_ERROR;
300     return NULL;
301   }
302 
303   unsigned char *buf = (unsigned char*)inv_buf+inv_next;
304   uprv_aestrncpy(buf, (const uint8_t*)inv, length);
305   inv_next+=length;
306 
307 #if 0
308   fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
309 #endif
310 
311   return utext_openUTF8(ut, (const char*)buf, length, status);
312 #endif
313 }
314 
315 
316 //---------------------------------------------------------------------------
317 //
318 //    REGEX_TESTLM       Macro + invocation function to simplify writing quick tests
319 //                       for the LookingAt() and  Match() functions.
320 //
321 //       usage:
322 //          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
323 //
324 //          The expected results are UBool - TRUE or FALSE.
325 //          The input text is unescaped.  The pattern is not.
326 //
327 //
328 //---------------------------------------------------------------------------
329 
330 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
331 
doRegexLMTest(const char * pat,const char * text,UBool looking,UBool match,int32_t line)332 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
333     const UnicodeString pattern(pat, -1, US_INV);
334     const UnicodeString inputText(text, -1, US_INV);
335     UErrorCode          status  = U_ZERO_ERROR;
336     UParseError         pe;
337     RegexPattern        *REPattern = NULL;
338     RegexMatcher        *REMatcher = NULL;
339     UBool               retVal     = TRUE;
340 
341     UnicodeString patString(pat, -1, US_INV);
342     REPattern = RegexPattern::compile(patString, 0, pe, status);
343     if (U_FAILURE(status)) {
344         dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
345             line, u_errorName(status));
346         return FALSE;
347     }
348     if (line==376) { REPattern->dumpPattern();}
349 
350     UnicodeString inputString(inputText);
351     UnicodeString unEscapedInput = inputString.unescape();
352     REMatcher = REPattern->matcher(unEscapedInput, status);
353     if (U_FAILURE(status)) {
354         errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
355             line, u_errorName(status));
356         return FALSE;
357     }
358 
359     UBool actualmatch;
360     actualmatch = REMatcher->lookingAt(status);
361     if (U_FAILURE(status)) {
362         errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
363             line, u_errorName(status));
364         retVal =  FALSE;
365     }
366     if (actualmatch != looking) {
367         errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
368         retVal = FALSE;
369     }
370 
371     status = U_ZERO_ERROR;
372     actualmatch = REMatcher->matches(status);
373     if (U_FAILURE(status)) {
374         errln("RegexTest failure in matches() at line %d.  Status = %s\n",
375             line, u_errorName(status));
376         retVal = FALSE;
377     }
378     if (actualmatch != match) {
379         errln("RegexTest: wrong return from matches() at line %d.\n", line);
380         retVal = FALSE;
381     }
382 
383     if (retVal == FALSE) {
384         REPattern->dumpPattern();
385     }
386 
387     delete REPattern;
388     delete REMatcher;
389     return retVal;
390 }
391 
392 
doRegexLMTestUTF8(const char * pat,const char * text,UBool looking,UBool match,int32_t line)393 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
394     UText               pattern    = UTEXT_INITIALIZER;
395     int32_t             inputUTF8Length;
396     char                *textChars = NULL;
397     UText               inputText  = UTEXT_INITIALIZER;
398     UErrorCode          status     = U_ZERO_ERROR;
399     UParseError         pe;
400     RegexPattern        *REPattern = NULL;
401     RegexMatcher        *REMatcher = NULL;
402     UBool               retVal     = TRUE;
403 
404     regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
405     REPattern = RegexPattern::compile(&pattern, 0, pe, status);
406     if (U_FAILURE(status)) {
407         dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
408             line, u_errorName(status));
409         return FALSE;
410     }
411 
412     UnicodeString inputString(text, -1, US_INV);
413     UnicodeString unEscapedInput = inputString.unescape();
414     LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
415     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
416 
417     inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
418     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
419         // UTF-8 does not allow unpaired surrogates, so this could actually happen
420         logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
421         return TRUE; // not a failure of the Regex engine
422     }
423     status = U_ZERO_ERROR; // buffer overflow
424     textChars = new char[inputUTF8Length+1];
425     unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
426     utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
427 
428     REMatcher = &REPattern->matcher(status)->reset(&inputText);
429     if (U_FAILURE(status)) {
430         errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
431             line, u_errorName(status));
432         return FALSE;
433     }
434 
435     UBool actualmatch;
436     actualmatch = REMatcher->lookingAt(status);
437     if (U_FAILURE(status)) {
438         errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
439             line, u_errorName(status));
440         retVal =  FALSE;
441     }
442     if (actualmatch != looking) {
443         errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
444         retVal = FALSE;
445     }
446 
447     status = U_ZERO_ERROR;
448     actualmatch = REMatcher->matches(status);
449     if (U_FAILURE(status)) {
450         errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
451             line, u_errorName(status));
452         retVal = FALSE;
453     }
454     if (actualmatch != match) {
455         errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
456         retVal = FALSE;
457     }
458 
459     if (retVal == FALSE) {
460         REPattern->dumpPattern();
461     }
462 
463     delete REPattern;
464     delete REMatcher;
465     utext_close(&inputText);
466     utext_close(&pattern);
467     delete[] textChars;
468     return retVal;
469 }
470 
471 
472 
473 //---------------------------------------------------------------------------
474 //
475 //    REGEX_ERR       Macro + invocation function to simplify writing tests
476 //                       regex tests for incorrect patterns
477 //
478 //       usage:
479 //          REGEX_ERR("pattern",   expected error line, column, expected status);
480 //
481 //---------------------------------------------------------------------------
482 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
483 
regex_err(const char * pat,int32_t errLine,int32_t errCol,UErrorCode expectedStatus,int32_t line)484 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
485                           UErrorCode expectedStatus, int32_t line) {
486     UnicodeString       pattern(pat);
487 
488     UErrorCode          status         = U_ZERO_ERROR;
489     UParseError         pe;
490     RegexPattern        *callerPattern = NULL;
491 
492     //
493     //  Compile the caller's pattern
494     //
495     UnicodeString patString(pat);
496     callerPattern = RegexPattern::compile(patString, 0, pe, status);
497     if (status != expectedStatus) {
498         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
499     } else {
500         if (status != U_ZERO_ERROR) {
501             if (pe.line != errLine || pe.offset != errCol) {
502                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
503                     line, errLine, errCol, pe.line, pe.offset);
504             }
505         }
506     }
507 
508     delete callerPattern;
509 
510     //
511     //  Compile again, using a UTF-8-based UText
512     //
513     UText patternText = UTEXT_INITIALIZER;
514     regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
515     callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
516     if (status != expectedStatus) {
517         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
518     } else {
519         if (status != U_ZERO_ERROR) {
520             if (pe.line != errLine || pe.offset != errCol) {
521                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
522                     line, errLine, errCol, pe.line, pe.offset);
523             }
524         }
525     }
526 
527     delete callerPattern;
528     utext_close(&patternText);
529 }
530 
531 
532 
533 //---------------------------------------------------------------------------
534 //
535 //      Basic      Check for basic functionality of regex pattern matching.
536 //                 Avoid the use of REGEX_FIND test macro, which has
537 //                 substantial dependencies on basic Regex functionality.
538 //
539 //---------------------------------------------------------------------------
Basic()540 void RegexTest::Basic() {
541 
542 
543 //
544 // Debug - slide failing test cases early
545 //
546 #if 0
547     {
548         // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
549         UParseError pe;
550         UErrorCode  status = U_ZERO_ERROR;
551         RegexPattern *pattern;
552         pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
553         pattern->dumpPattern();
554         RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
555         UBool result = m->find();
556         printf("result = %d\n", result);
557         // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
558         // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
559     }
560     exit(1);
561 #endif
562 
563 
564     //
565     // Pattern with parentheses
566     //
567     REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
568     REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
569     REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);
570 
571     //
572     // Patterns with *
573     //
574     REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
575     REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
576     REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
577     REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
578     REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
579 
580     REGEX_TESTLM("a*", "",  TRUE, TRUE);
581     REGEX_TESTLM("a*", "b", TRUE, FALSE);
582 
583 
584     //
585     //  Patterns with "."
586     //
587     REGEX_TESTLM(".", "abc", TRUE, FALSE);
588     REGEX_TESTLM("...", "abc", TRUE, TRUE);
589     REGEX_TESTLM("....", "abc", FALSE, FALSE);
590     REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
591     REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
592     REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
593     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
594     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
595 
596     //
597     //  Patterns with * applied to chars at end of literal string
598     //
599     REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
600     REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
601 
602     //
603     //  Supplemental chars match as single chars, not a pair of surrogates.
604     //
605     REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
606     REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
607     REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
608 
609 
610     //
611     //  UnicodeSets in the pattern
612     //
613     REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
614     REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
615     REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
616     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
617     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
618     REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
619 
620     REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
621     REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
622     REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
623     REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurences.
624     REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);
625 
626     //
627     //   OR operator in patterns
628     //
629     REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
630     REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
631     REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
632     REGEX_TESTLM("a|b", "b", TRUE, TRUE);
633 
634     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
635     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
636     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
637     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
638     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
639     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
640 
641     //
642     //  +
643     //
644     REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
645     REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
646     REGEX_TESTLM("b+", "", FALSE, FALSE);
647     REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
648     REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
649     REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
650 
651     //
652     //   ?
653     //
654     REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
655     REGEX_TESTLM("ab?", "a", TRUE, TRUE);
656     REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
657     REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
658     REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
659     REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
660     REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
661     REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
662     REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
663 
664     //
665     //  Escape sequences that become single literal chars, handled internally
666     //   by ICU's Unescape.
667     //
668 
669     // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
670     REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
671     REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
672     REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
673     REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
674     REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
675     REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
676     REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
677     REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
678     REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
679 
680     REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
681     REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input
682 
683     // Escape of special chars in patterns
684     REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
685 }
686 
687 
688 //---------------------------------------------------------------------------
689 //
690 //    UTextBasic   Check for quirks that are specific to the UText
691 //                 implementation.
692 //
693 //---------------------------------------------------------------------------
UTextBasic()694 void RegexTest::UTextBasic() {
695     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
696     UErrorCode status = U_ZERO_ERROR;
697     UText pattern = UTEXT_INITIALIZER;
698     utext_openUTF8(&pattern, str_abc, -1, &status);
699     RegexMatcher matcher(&pattern, 0, status);
700     REGEX_CHECK_STATUS;
701 
702     UText input = UTEXT_INITIALIZER;
703     utext_openUTF8(&input, str_abc, -1, &status);
704     REGEX_CHECK_STATUS;
705     matcher.reset(&input);
706     REGEX_CHECK_STATUS;
707     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
708 
709     matcher.reset(matcher.inputText());
710     REGEX_CHECK_STATUS;
711     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
712 
713     utext_close(&pattern);
714     utext_close(&input);
715 }
716 
717 
718 //---------------------------------------------------------------------------
719 //
720 //      API_Match   Test that the API for class RegexMatcher
721 //                  is present and nominally working, but excluding functions
722 //                  implementing replace operations.
723 //
724 //---------------------------------------------------------------------------
API_Match()725 void RegexTest::API_Match() {
726     UParseError         pe;
727     UErrorCode          status=U_ZERO_ERROR;
728     int32_t             flags = 0;
729 
730     //
731     // Debug - slide failing test cases early
732     //
733 #if 0
734     {
735     }
736     return;
737 #endif
738 
739     //
740     // Simple pattern compilation
741     //
742     {
743         UnicodeString       re("abc");
744         RegexPattern        *pat2;
745         pat2 = RegexPattern::compile(re, flags, pe, status);
746         REGEX_CHECK_STATUS;
747 
748         UnicodeString inStr1 = "abcdef this is a test";
749         UnicodeString instr2 = "not abc";
750         UnicodeString empty  = "";
751 
752 
753         //
754         // Matcher creation and reset.
755         //
756         RegexMatcher *m1 = pat2->matcher(inStr1, status);
757         REGEX_CHECK_STATUS;
758         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
759         REGEX_ASSERT(m1->input() == inStr1);
760         m1->reset(instr2);
761         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
762         REGEX_ASSERT(m1->input() == instr2);
763         m1->reset(inStr1);
764         REGEX_ASSERT(m1->input() == inStr1);
765         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
766         m1->reset(empty);
767         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
768         REGEX_ASSERT(m1->input() == empty);
769         REGEX_ASSERT(&m1->pattern() == pat2);
770 
771         //
772         //  reset(pos, status)
773         //
774         m1->reset(inStr1);
775         m1->reset(4, status);
776         REGEX_CHECK_STATUS;
777         REGEX_ASSERT(m1->input() == inStr1);
778         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
779 
780         m1->reset(-1, status);
781         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
782         status = U_ZERO_ERROR;
783 
784         m1->reset(0, status);
785         REGEX_CHECK_STATUS;
786         status = U_ZERO_ERROR;
787 
788         int32_t len = m1->input().length();
789         m1->reset(len-1, status);
790         REGEX_CHECK_STATUS;
791         status = U_ZERO_ERROR;
792 
793         m1->reset(len, status);
794         REGEX_CHECK_STATUS;
795         status = U_ZERO_ERROR;
796 
797         m1->reset(len+1, status);
798         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
799         status = U_ZERO_ERROR;
800 
801         //
802         // match(pos, status)
803         //
804         m1->reset(instr2);
805         REGEX_ASSERT(m1->matches(4, status) == TRUE);
806         m1->reset();
807         REGEX_ASSERT(m1->matches(3, status) == FALSE);
808         m1->reset();
809         REGEX_ASSERT(m1->matches(5, status) == FALSE);
810         REGEX_ASSERT(m1->matches(4, status) == TRUE);
811         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
812         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
813 
814         // Match() at end of string should fail, but should not
815         //  be an error.
816         status = U_ZERO_ERROR;
817         len = m1->input().length();
818         REGEX_ASSERT(m1->matches(len, status) == FALSE);
819         REGEX_CHECK_STATUS;
820 
821         // Match beyond end of string should fail with an error.
822         status = U_ZERO_ERROR;
823         REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
824         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
825 
826         // Successful match at end of string.
827         {
828             status = U_ZERO_ERROR;
829             RegexMatcher m("A?", 0, status);  // will match zero length string.
830             REGEX_CHECK_STATUS;
831             m.reset(inStr1);
832             len = inStr1.length();
833             REGEX_ASSERT(m.matches(len, status) == TRUE);
834             REGEX_CHECK_STATUS;
835             m.reset(empty);
836             REGEX_ASSERT(m.matches(0, status) == TRUE);
837             REGEX_CHECK_STATUS;
838         }
839 
840 
841         //
842         // lookingAt(pos, status)
843         //
844         status = U_ZERO_ERROR;
845         m1->reset(instr2);  // "not abc"
846         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
847         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
848         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
849         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
850         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
851         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
852         status = U_ZERO_ERROR;
853         len = m1->input().length();
854         REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
855         REGEX_CHECK_STATUS;
856         REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
857         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
858 
859         delete m1;
860         delete pat2;
861     }
862 
863 
864     //
865     // Capture Group.
866     //     RegexMatcher::start();
867     //     RegexMatcher::end();
868     //     RegexMatcher::groupCount();
869     //
870     {
871         int32_t             flags=0;
872         UParseError         pe;
873         UErrorCode          status=U_ZERO_ERROR;
874 
875         UnicodeString       re("01(23(45)67)(.*)");
876         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
877         REGEX_CHECK_STATUS;
878         UnicodeString data = "0123456789";
879 
880         RegexMatcher *matcher = pat->matcher(data, status);
881         REGEX_CHECK_STATUS;
882         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
883         static const int32_t matchStarts[] = {0,  2, 4, 8};
884         static const int32_t matchEnds[]   = {10, 8, 6, 10};
885         int32_t i;
886         for (i=0; i<4; i++) {
887             int32_t actualStart = matcher->start(i, status);
888             REGEX_CHECK_STATUS;
889             if (actualStart != matchStarts[i]) {
890                 errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
891                     __LINE__, i, matchStarts[i], actualStart);
892             }
893             int32_t actualEnd = matcher->end(i, status);
894             REGEX_CHECK_STATUS;
895             if (actualEnd != matchEnds[i]) {
896                 errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
897                     __LINE__, i, matchEnds[i], actualEnd);
898             }
899         }
900 
901         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
902         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
903 
904         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
905         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
906         matcher->reset();
907         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
908 
909         matcher->lookingAt(status);
910         REGEX_ASSERT(matcher->group(status)    == "0123456789");
911         REGEX_ASSERT(matcher->group(0, status) == "0123456789");
912         REGEX_ASSERT(matcher->group(1, status) == "234567"    );
913         REGEX_ASSERT(matcher->group(2, status) == "45"        );
914         REGEX_ASSERT(matcher->group(3, status) == "89"        );
915         REGEX_CHECK_STATUS;
916         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
917         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
918         matcher->reset();
919         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
920 
921         delete matcher;
922         delete pat;
923 
924     }
925 
926     //
927     //  find
928     //
929     {
930         int32_t             flags=0;
931         UParseError         pe;
932         UErrorCode          status=U_ZERO_ERROR;
933 
934         UnicodeString       re("abc");
935         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
936         REGEX_CHECK_STATUS;
937         UnicodeString data = ".abc..abc...abc..";
938         //                    012345678901234567
939 
940         RegexMatcher *matcher = pat->matcher(data, status);
941         REGEX_CHECK_STATUS;
942         REGEX_ASSERT(matcher->find());
943         REGEX_ASSERT(matcher->start(status) == 1);
944         REGEX_ASSERT(matcher->find());
945         REGEX_ASSERT(matcher->start(status) == 6);
946         REGEX_ASSERT(matcher->find());
947         REGEX_ASSERT(matcher->start(status) == 12);
948         REGEX_ASSERT(matcher->find() == FALSE);
949         REGEX_ASSERT(matcher->find() == FALSE);
950 
951         matcher->reset();
952         REGEX_ASSERT(matcher->find());
953         REGEX_ASSERT(matcher->start(status) == 1);
954 
955         REGEX_ASSERT(matcher->find(0, status));
956         REGEX_ASSERT(matcher->start(status) == 1);
957         REGEX_ASSERT(matcher->find(1, status));
958         REGEX_ASSERT(matcher->start(status) == 1);
959         REGEX_ASSERT(matcher->find(2, status));
960         REGEX_ASSERT(matcher->start(status) == 6);
961         REGEX_ASSERT(matcher->find(12, status));
962         REGEX_ASSERT(matcher->start(status) == 12);
963         REGEX_ASSERT(matcher->find(13, status) == FALSE);
964         REGEX_ASSERT(matcher->find(16, status) == FALSE);
965         REGEX_ASSERT(matcher->find(17, status) == FALSE);
966         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
967 
968         status = U_ZERO_ERROR;
969         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
970         status = U_ZERO_ERROR;
971         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
972 
973         REGEX_ASSERT(matcher->groupCount() == 0);
974 
975         delete matcher;
976         delete pat;
977     }
978 
979 
980     //
981     //  find, with \G in pattern (true if at the end of a previous match).
982     //
983     {
984         int32_t             flags=0;
985         UParseError         pe;
986         UErrorCode          status=U_ZERO_ERROR;
987 
988         UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
989         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
990         REGEX_CHECK_STATUS;
991         UnicodeString data = ".abcabc.abc..";
992         //                    012345678901234567
993 
994         RegexMatcher *matcher = pat->matcher(data, status);
995         REGEX_CHECK_STATUS;
996         REGEX_ASSERT(matcher->find());
997         REGEX_ASSERT(matcher->start(status) == 0);
998         REGEX_ASSERT(matcher->start(1, status) == -1);
999         REGEX_ASSERT(matcher->start(2, status) == 1);
1000 
1001         REGEX_ASSERT(matcher->find());
1002         REGEX_ASSERT(matcher->start(status) == 4);
1003         REGEX_ASSERT(matcher->start(1, status) == 4);
1004         REGEX_ASSERT(matcher->start(2, status) == -1);
1005         REGEX_CHECK_STATUS;
1006 
1007         delete matcher;
1008         delete pat;
1009     }
1010 
1011     //
1012     //   find with zero length matches, match position should bump ahead
1013     //     to prevent loops.
1014     //
1015     {
1016         int32_t                 i;
1017         UErrorCode          status=U_ZERO_ERROR;
1018         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
1019                                                       //   using an always-true look-ahead.
1020         REGEX_CHECK_STATUS;
1021         UnicodeString s("    ");
1022         m.reset(s);
1023         for (i=0; ; i++) {
1024             if (m.find() == FALSE) {
1025                 break;
1026             }
1027             REGEX_ASSERT(m.start(status) == i);
1028             REGEX_ASSERT(m.end(status) == i);
1029         }
1030         REGEX_ASSERT(i==5);
1031 
1032         // Check that the bump goes over surrogate pairs OK
1033         s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1034         s = s.unescape();
1035         m.reset(s);
1036         for (i=0; ; i+=2) {
1037             if (m.find() == FALSE) {
1038                 break;
1039             }
1040             REGEX_ASSERT(m.start(status) == i);
1041             REGEX_ASSERT(m.end(status) == i);
1042         }
1043         REGEX_ASSERT(i==10);
1044     }
1045     {
1046         // find() loop breaking test.
1047         //        with pattern of /.?/, should see a series of one char matches, then a single
1048         //        match of zero length at the end of the input string.
1049         int32_t                 i;
1050         UErrorCode          status=U_ZERO_ERROR;
1051         RegexMatcher        m(".?", 0, status);
1052         REGEX_CHECK_STATUS;
1053         UnicodeString s("    ");
1054         m.reset(s);
1055         for (i=0; ; i++) {
1056             if (m.find() == FALSE) {
1057                 break;
1058             }
1059             REGEX_ASSERT(m.start(status) == i);
1060             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1061         }
1062         REGEX_ASSERT(i==5);
1063     }
1064 
1065 
1066     //
1067     // Matchers with no input string behave as if they had an empty input string.
1068     //
1069 
1070     {
1071         UErrorCode status = U_ZERO_ERROR;
1072         RegexMatcher  m(".?", 0, status);
1073         REGEX_CHECK_STATUS;
1074         REGEX_ASSERT(m.find());
1075         REGEX_ASSERT(m.start(status) == 0);
1076         REGEX_ASSERT(m.input() == "");
1077     }
1078     {
1079         UErrorCode status = U_ZERO_ERROR;
1080         RegexPattern  *p = RegexPattern::compile(".", 0, status);
1081         RegexMatcher  *m = p->matcher(status);
1082         REGEX_CHECK_STATUS;
1083 
1084         REGEX_ASSERT(m->find() == FALSE);
1085         REGEX_ASSERT(m->input() == "");
1086         delete m;
1087         delete p;
1088     }
1089 
1090     //
1091     // Regions
1092     //
1093     {
1094         UErrorCode status = U_ZERO_ERROR;
1095         UnicodeString testString("This is test data");
1096         RegexMatcher m(".*", testString,  0, status);
1097         REGEX_CHECK_STATUS;
1098         REGEX_ASSERT(m.regionStart() == 0);
1099         REGEX_ASSERT(m.regionEnd() == testString.length());
1100         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1101         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1102 
1103         m.region(2,4, status);
1104         REGEX_CHECK_STATUS;
1105         REGEX_ASSERT(m.matches(status));
1106         REGEX_ASSERT(m.start(status)==2);
1107         REGEX_ASSERT(m.end(status)==4);
1108         REGEX_CHECK_STATUS;
1109 
1110         m.reset();
1111         REGEX_ASSERT(m.regionStart() == 0);
1112         REGEX_ASSERT(m.regionEnd() == testString.length());
1113 
1114         UnicodeString shorterString("short");
1115         m.reset(shorterString);
1116         REGEX_ASSERT(m.regionStart() == 0);
1117         REGEX_ASSERT(m.regionEnd() == shorterString.length());
1118 
1119         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1120         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1121         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1122         REGEX_ASSERT(&m == &m.reset());
1123         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1124 
1125         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1126         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1127         REGEX_ASSERT(&m == &m.reset());
1128         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1129 
1130         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1131         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1132         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1133         REGEX_ASSERT(&m == &m.reset());
1134         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1135 
1136         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1137         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1138         REGEX_ASSERT(&m == &m.reset());
1139         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1140 
1141     }
1142 
1143     //
1144     // hitEnd() and requireEnd()
1145     //
1146     {
1147         UErrorCode status = U_ZERO_ERROR;
1148         UnicodeString testString("aabb");
1149         RegexMatcher m1(".*", testString,  0, status);
1150         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1151         REGEX_ASSERT(m1.hitEnd() == TRUE);
1152         REGEX_ASSERT(m1.requireEnd() == FALSE);
1153         REGEX_CHECK_STATUS;
1154 
1155         status = U_ZERO_ERROR;
1156         RegexMatcher m2("a*", testString, 0, status);
1157         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1158         REGEX_ASSERT(m2.hitEnd() == FALSE);
1159         REGEX_ASSERT(m2.requireEnd() == FALSE);
1160         REGEX_CHECK_STATUS;
1161 
1162         status = U_ZERO_ERROR;
1163         RegexMatcher m3(".*$", testString, 0, status);
1164         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1165         REGEX_ASSERT(m3.hitEnd() == TRUE);
1166         REGEX_ASSERT(m3.requireEnd() == TRUE);
1167         REGEX_CHECK_STATUS;
1168     }
1169 
1170 
1171     //
1172     // Compilation error on reset with UChar *
1173     //   These were a hazard that people were stumbling over with runtime errors.
1174     //   Changed them to compiler errors by adding private methods that more closely
1175     //   matched the incorrect use of the functions.
1176     //
1177 #if 0
1178     {
1179         UErrorCode status = U_ZERO_ERROR;
1180         UChar ucharString[20];
1181         RegexMatcher m(".", 0, status);
1182         m.reset(ucharString);  // should not compile.
1183 
1184         RegexPattern *p = RegexPattern::compile(".", 0, status);
1185         RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
1186 
1187         RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
1188     }
1189 #endif
1190 
1191     //
1192     //  Time Outs.
1193     //       Note:  These tests will need to be changed when the regexp engine is
1194     //              able to detect and cut short the exponential time behavior on
1195     //              this type of match.
1196     //
1197     {
1198         UErrorCode status = U_ZERO_ERROR;
1199         //    Enough 'a's in the string to cause the match to time out.
        //       (Each additional 'a' doubles the time)
1201         UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1202         RegexMatcher matcher("(a+)+b", testString, 0, status);
1203         REGEX_CHECK_STATUS;
1204         REGEX_ASSERT(matcher.getTimeLimit() == 0);
1205         matcher.setTimeLimit(100, status);
1206         REGEX_ASSERT(matcher.getTimeLimit() == 100);
1207         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1208         REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1209     }
1210     {
1211         UErrorCode status = U_ZERO_ERROR;
1212         //   Few enough 'a's to slip in under the time limit.
1213         UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1214         RegexMatcher matcher("(a+)+b", testString, 0, status);
1215         REGEX_CHECK_STATUS;
1216         matcher.setTimeLimit(100, status);
1217         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1218         REGEX_CHECK_STATUS;
1219     }
1220 
1221     //
1222     //  Stack Limits
1223     //
1224     {
1225         UErrorCode status = U_ZERO_ERROR;
1226         UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
1227 
1228         // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1229         //   of the '+', and makes the stack frames larger.
1230         RegexMatcher matcher("(A)+A$", testString, 0, status);
1231 
1232         // With the default stack, this match should fail to run
1233         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1234         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1235 
1236         // With unlimited stack, it should run
1237         status = U_ZERO_ERROR;
1238         matcher.setStackLimit(0, status);
1239         REGEX_CHECK_STATUS;
1240         REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1241         REGEX_CHECK_STATUS;
1242         REGEX_ASSERT(matcher.getStackLimit() == 0);
1243 
        // With a limited stack, the match should fail
1245         status = U_ZERO_ERROR;
1246         matcher.setStackLimit(10000, status);
1247         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1248         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1249         REGEX_ASSERT(matcher.getStackLimit() == 10000);
1250     }
1251 
1252         // A pattern that doesn't save state should work with
1253         //   a minimal sized stack
1254     {
1255         UErrorCode status = U_ZERO_ERROR;
1256         UnicodeString testString = "abc";
1257         RegexMatcher matcher("abc", testString, 0, status);
1258         REGEX_CHECK_STATUS;
1259         matcher.setStackLimit(30, status);
1260         REGEX_CHECK_STATUS;
1261         REGEX_ASSERT(matcher.matches(status) == TRUE);
1262         REGEX_CHECK_STATUS;
1263         REGEX_ASSERT(matcher.getStackLimit() == 30);
1264 
1265         // Negative stack sizes should fail
1266         status = U_ZERO_ERROR;
1267         matcher.setStackLimit(1000, status);
1268         REGEX_CHECK_STATUS;
1269         matcher.setStackLimit(-1, status);
1270         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1271         REGEX_ASSERT(matcher.getStackLimit() == 1000);
1272     }
1273 
1274 
1275 }
1276 
1277 
1278 
1279 
1280 
1281 
1282 //---------------------------------------------------------------------------
1283 //
1284 //      API_Replace        API test for class RegexMatcher, testing the
1285 //                         Replace family of functions.
1286 //
1287 //---------------------------------------------------------------------------
API_Replace()1288 void RegexTest::API_Replace() {
1289     //
1290     //  Replace
1291     //
1292     int32_t             flags=0;
1293     UParseError         pe;
1294     UErrorCode          status=U_ZERO_ERROR;
1295 
1296     UnicodeString       re("abc");
1297     RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1298     REGEX_CHECK_STATUS;
1299     UnicodeString data = ".abc..abc...abc..";
1300     //                    012345678901234567
1301     RegexMatcher *matcher = pat->matcher(data, status);
1302 
1303     //
1304     //  Plain vanilla matches.
1305     //
1306     UnicodeString  dest;
1307     dest = matcher->replaceFirst("yz", status);
1308     REGEX_CHECK_STATUS;
1309     REGEX_ASSERT(dest == ".yz..abc...abc..");
1310 
1311     dest = matcher->replaceAll("yz", status);
1312     REGEX_CHECK_STATUS;
1313     REGEX_ASSERT(dest == ".yz..yz...yz..");
1314 
1315     //
1316     //  Plain vanilla non-matches.
1317     //
1318     UnicodeString d2 = ".abx..abx...abx..";
1319     matcher->reset(d2);
1320     dest = matcher->replaceFirst("yz", status);
1321     REGEX_CHECK_STATUS;
1322     REGEX_ASSERT(dest == ".abx..abx...abx..");
1323 
1324     dest = matcher->replaceAll("yz", status);
1325     REGEX_CHECK_STATUS;
1326     REGEX_ASSERT(dest == ".abx..abx...abx..");
1327 
1328     //
1329     // Empty source string
1330     //
1331     UnicodeString d3 = "";
1332     matcher->reset(d3);
1333     dest = matcher->replaceFirst("yz", status);
1334     REGEX_CHECK_STATUS;
1335     REGEX_ASSERT(dest == "");
1336 
1337     dest = matcher->replaceAll("yz", status);
1338     REGEX_CHECK_STATUS;
1339     REGEX_ASSERT(dest == "");
1340 
1341     //
1342     // Empty substitution string
1343     //
1344     matcher->reset(data);              // ".abc..abc...abc.."
1345     dest = matcher->replaceFirst("", status);
1346     REGEX_CHECK_STATUS;
1347     REGEX_ASSERT(dest == "...abc...abc..");
1348 
1349     dest = matcher->replaceAll("", status);
1350     REGEX_CHECK_STATUS;
1351     REGEX_ASSERT(dest == "........");
1352 
1353     //
1354     // match whole string
1355     //
1356     UnicodeString d4 = "abc";
1357     matcher->reset(d4);
1358     dest = matcher->replaceFirst("xyz", status);
1359     REGEX_CHECK_STATUS;
1360     REGEX_ASSERT(dest == "xyz");
1361 
1362     dest = matcher->replaceAll("xyz", status);
1363     REGEX_CHECK_STATUS;
1364     REGEX_ASSERT(dest == "xyz");
1365 
1366     //
1367     // Capture Group, simple case
1368     //
1369     UnicodeString       re2("a(..)");
1370     RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1371     REGEX_CHECK_STATUS;
1372     UnicodeString d5 = "abcdefg";
1373     RegexMatcher *matcher2 = pat2->matcher(d5, status);
1374     REGEX_CHECK_STATUS;
1375     dest = matcher2->replaceFirst("$1$1", status);
1376     REGEX_CHECK_STATUS;
1377     REGEX_ASSERT(dest == "bcbcdefg");
1378 
1379     dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1380     REGEX_CHECK_STATUS;
1381     REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1382 
1383     dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1384     REGEX_ASSERT(U_FAILURE(status));
1385     status = U_ZERO_ERROR;
1386 
1387     UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1388     replacement = replacement.unescape();
1389     dest = matcher2->replaceFirst(replacement, status);
1390     REGEX_CHECK_STATUS;
1391     REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1392 
1393     REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1394 
1395 
1396     //
1397     // Replacement String with \u hex escapes
1398     //
1399     {
1400         UnicodeString  src = "abc 1 abc 2 abc 3";
1401         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1402         matcher->reset(src);
1403         UnicodeString  result = matcher->replaceAll(substitute, status);
1404         REGEX_CHECK_STATUS;
1405         REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1406     }
1407     {
1408         UnicodeString  src = "abc !";
1409         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1410         matcher->reset(src);
1411         UnicodeString  result = matcher->replaceAll(substitute, status);
1412         REGEX_CHECK_STATUS;
1413         UnicodeString expected = UnicodeString("--");
1414         expected.append((UChar32)0x10000);
1415         expected.append("-- !");
1416         REGEX_ASSERT(result == expected);
1417     }
1418     // TODO:  need more through testing of capture substitutions.
1419 
1420     // Bug 4057
1421     //
1422     {
1423         status = U_ZERO_ERROR;
1424         UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1425         RegexMatcher m("ss(.*?)ee", 0, status);
1426         REGEX_CHECK_STATUS;
1427         UnicodeString result;
1428 
1429         // Multiple finds do NOT bump up the previous appendReplacement postion.
1430         m.reset(s);
1431         m.find();
1432         m.find();
1433         m.appendReplacement(result, "ooh", status);
1434         REGEX_CHECK_STATUS;
1435         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1436 
1437         // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1438         status = U_ZERO_ERROR;
1439         result.truncate(0);
1440         m.reset(10, status);
1441         m.find();
1442         m.find();
1443         m.appendReplacement(result, "ooh", status);
1444         REGEX_CHECK_STATUS;
1445         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1446 
1447         // find() at interior of string, appendReplacemnt still starts at beginning.
1448         status = U_ZERO_ERROR;
1449         result.truncate(0);
1450         m.reset();
1451         m.find(10, status);
1452         m.find();
1453         m.appendReplacement(result, "ooh", status);
1454         REGEX_CHECK_STATUS;
1455         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1456 
1457         m.appendTail(result);
1458         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1459 
1460     }
1461 
1462     delete matcher2;
1463     delete pat2;
1464     delete matcher;
1465     delete pat;
1466 }
1467 
1468 
1469 //---------------------------------------------------------------------------
1470 //
1471 //      API_Pattern       Test that the API for class RegexPattern is
1472 //                        present and nominally working.
1473 //
1474 //---------------------------------------------------------------------------
API_Pattern()1475 void RegexTest::API_Pattern() {
1476     RegexPattern        pata;    // Test default constructor to not crash.
1477     RegexPattern        patb;
1478 
1479     REGEX_ASSERT(pata == patb);
1480     REGEX_ASSERT(pata == pata);
1481 
1482     UnicodeString re1("abc[a-l][m-z]");
1483     UnicodeString re2("def");
1484     UErrorCode    status = U_ZERO_ERROR;
1485     UParseError   pe;
1486 
1487     RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
1488     RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
1489     REGEX_CHECK_STATUS;
1490     REGEX_ASSERT(*pat1 == *pat1);
1491     REGEX_ASSERT(*pat1 != pata);
1492 
1493     // Assign
1494     patb = *pat1;
1495     REGEX_ASSERT(patb == *pat1);
1496 
1497     // Copy Construct
1498     RegexPattern patc(*pat1);
1499     REGEX_ASSERT(patc == *pat1);
1500     REGEX_ASSERT(patb == patc);
1501     REGEX_ASSERT(pat1 != pat2);
1502     patb = *pat2;
1503     REGEX_ASSERT(patb != patc);
1504     REGEX_ASSERT(patb == *pat2);
1505 
1506     // Compile with no flags.
1507     RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
1508     REGEX_ASSERT(*pat1a == *pat1);
1509 
1510     REGEX_ASSERT(pat1a->flags() == 0);
1511 
1512     // Compile with different flags should be not equal
1513     RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1514     REGEX_CHECK_STATUS;
1515 
1516     REGEX_ASSERT(*pat1b != *pat1a);
1517     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1518     REGEX_ASSERT(pat1a->flags() == 0);
1519     delete pat1b;
1520 
1521     // clone
1522     RegexPattern *pat1c = pat1->clone();
1523     REGEX_ASSERT(*pat1c == *pat1);
1524     REGEX_ASSERT(*pat1c != *pat2);
1525 
1526     delete pat1c;
1527     delete pat1a;
1528     delete pat1;
1529     delete pat2;
1530 
1531 
1532     //
1533     //   Verify that a matcher created from a cloned pattern works.
1534     //     (Jitterbug 3423)
1535     //
1536     {
1537         UErrorCode     status     = U_ZERO_ERROR;
1538         RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1539         RegexPattern  *pClone     = pSource->clone();
1540         delete         pSource;
1541         RegexMatcher  *mFromClone = pClone->matcher(status);
1542         REGEX_CHECK_STATUS;
1543         UnicodeString s = "Hello World";
1544         mFromClone->reset(s);
1545         REGEX_ASSERT(mFromClone->find() == TRUE);
1546         REGEX_ASSERT(mFromClone->group(status) == "Hello");
1547         REGEX_ASSERT(mFromClone->find() == TRUE);
1548         REGEX_ASSERT(mFromClone->group(status) == "World");
1549         REGEX_ASSERT(mFromClone->find() == FALSE);
1550         delete mFromClone;
1551         delete pClone;
1552     }
1553 
1554     //
1555     //   matches convenience API
1556     //
1557     REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1558     REGEX_CHECK_STATUS;
1559     REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1560     REGEX_CHECK_STATUS;
1561     REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1562     REGEX_CHECK_STATUS;
1563     REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1564     REGEX_CHECK_STATUS;
1565     REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1566     REGEX_CHECK_STATUS;
1567     status = U_INDEX_OUTOFBOUNDS_ERROR;
1568     REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1569     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1570 
1571 
1572     //
1573     // Split()
1574     //
1575     status = U_ZERO_ERROR;
1576     pat1 = RegexPattern::compile(" +",  pe, status);
1577     REGEX_CHECK_STATUS;
1578     UnicodeString  fields[10];
1579 
1580     int32_t n;
1581     n = pat1->split("Now is the time", fields, 10, status);
1582     REGEX_CHECK_STATUS;
1583     REGEX_ASSERT(n==4);
1584     REGEX_ASSERT(fields[0]=="Now");
1585     REGEX_ASSERT(fields[1]=="is");
1586     REGEX_ASSERT(fields[2]=="the");
1587     REGEX_ASSERT(fields[3]=="time");
1588     REGEX_ASSERT(fields[4]=="");
1589 
1590     n = pat1->split("Now is the time", fields, 2, status);
1591     REGEX_CHECK_STATUS;
1592     REGEX_ASSERT(n==2);
1593     REGEX_ASSERT(fields[0]=="Now");
1594     REGEX_ASSERT(fields[1]=="is the time");
1595     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
1596 
1597     fields[1] = "*";
1598     status = U_ZERO_ERROR;
1599     n = pat1->split("Now is the time", fields, 1, status);
1600     REGEX_CHECK_STATUS;
1601     REGEX_ASSERT(n==1);
1602     REGEX_ASSERT(fields[0]=="Now is the time");
1603     REGEX_ASSERT(fields[1]=="*");
1604     status = U_ZERO_ERROR;
1605 
1606     n = pat1->split("    Now       is the time   ", fields, 10, status);
1607     REGEX_CHECK_STATUS;
1608     REGEX_ASSERT(n==6);
1609     REGEX_ASSERT(fields[0]=="");
1610     REGEX_ASSERT(fields[1]=="Now");
1611     REGEX_ASSERT(fields[2]=="is");
1612     REGEX_ASSERT(fields[3]=="the");
1613     REGEX_ASSERT(fields[4]=="time");
1614     REGEX_ASSERT(fields[5]=="");
1615 
1616     n = pat1->split("     ", fields, 10, status);
1617     REGEX_CHECK_STATUS;
1618     REGEX_ASSERT(n==2);
1619     REGEX_ASSERT(fields[0]=="");
1620     REGEX_ASSERT(fields[1]=="");
1621 
1622     fields[0] = "foo";
1623     n = pat1->split("", fields, 10, status);
1624     REGEX_CHECK_STATUS;
1625     REGEX_ASSERT(n==0);
1626     REGEX_ASSERT(fields[0]=="foo");
1627 
1628     delete pat1;
1629 
1630     //  split, with a pattern with (capture)
1631     pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
1632     REGEX_CHECK_STATUS;
1633 
1634     status = U_ZERO_ERROR;
1635     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1636     REGEX_CHECK_STATUS;
1637     REGEX_ASSERT(n==7);
1638     REGEX_ASSERT(fields[0]=="");
1639     REGEX_ASSERT(fields[1]=="a");
1640     REGEX_ASSERT(fields[2]=="Now is ");
1641     REGEX_ASSERT(fields[3]=="b");
1642     REGEX_ASSERT(fields[4]=="the time");
1643     REGEX_ASSERT(fields[5]=="c");
1644     REGEX_ASSERT(fields[6]=="");
1645     REGEX_ASSERT(status==U_ZERO_ERROR);
1646 
1647     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
1648     REGEX_CHECK_STATUS;
1649     REGEX_ASSERT(n==7);
1650     REGEX_ASSERT(fields[0]=="  ");
1651     REGEX_ASSERT(fields[1]=="a");
1652     REGEX_ASSERT(fields[2]=="Now is ");
1653     REGEX_ASSERT(fields[3]=="b");
1654     REGEX_ASSERT(fields[4]=="the time");
1655     REGEX_ASSERT(fields[5]=="c");
1656     REGEX_ASSERT(fields[6]=="");
1657 
1658     status = U_ZERO_ERROR;
1659     fields[6] = "foo";
1660     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
1661     REGEX_CHECK_STATUS;
1662     REGEX_ASSERT(n==6);
1663     REGEX_ASSERT(fields[0]=="  ");
1664     REGEX_ASSERT(fields[1]=="a");
1665     REGEX_ASSERT(fields[2]=="Now is ");
1666     REGEX_ASSERT(fields[3]=="b");
1667     REGEX_ASSERT(fields[4]=="the time");
1668     REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
1669     REGEX_ASSERT(fields[6]=="foo");
1670 
1671     status = U_ZERO_ERROR;
1672     fields[5] = "foo";
1673     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
1674     REGEX_CHECK_STATUS;
1675     REGEX_ASSERT(n==5);
1676     REGEX_ASSERT(fields[0]=="  ");
1677     REGEX_ASSERT(fields[1]=="a");
1678     REGEX_ASSERT(fields[2]=="Now is ");
1679     REGEX_ASSERT(fields[3]=="b");
1680     REGEX_ASSERT(fields[4]=="the time<c>");
1681     REGEX_ASSERT(fields[5]=="foo");
1682 
1683     status = U_ZERO_ERROR;
1684     fields[5] = "foo";
1685     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
1686     REGEX_CHECK_STATUS;
1687     REGEX_ASSERT(n==5);
1688     REGEX_ASSERT(fields[0]=="  ");
1689     REGEX_ASSERT(fields[1]=="a");
1690     REGEX_ASSERT(fields[2]=="Now is ");
1691     REGEX_ASSERT(fields[3]=="b");
1692     REGEX_ASSERT(fields[4]=="the time");
1693     REGEX_ASSERT(fields[5]=="foo");
1694 
1695     status = U_ZERO_ERROR;
1696     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
1697     REGEX_CHECK_STATUS;
1698     REGEX_ASSERT(n==4);
1699     REGEX_ASSERT(fields[0]=="  ");
1700     REGEX_ASSERT(fields[1]=="a");
1701     REGEX_ASSERT(fields[2]=="Now is ");
1702     REGEX_ASSERT(fields[3]=="the time<c>");
1703     status = U_ZERO_ERROR;
1704     delete pat1;
1705 
1706     pat1 = RegexPattern::compile("([-,])",  pe, status);
1707     REGEX_CHECK_STATUS;
1708     n = pat1->split("1-10,20", fields, 10, status);
1709     REGEX_CHECK_STATUS;
1710     REGEX_ASSERT(n==5);
1711     REGEX_ASSERT(fields[0]=="1");
1712     REGEX_ASSERT(fields[1]=="-");
1713     REGEX_ASSERT(fields[2]=="10");
1714     REGEX_ASSERT(fields[3]==",");
1715     REGEX_ASSERT(fields[4]=="20");
1716     delete pat1;
1717 
1718     // Test split of string with empty trailing fields
1719     pat1 = RegexPattern::compile(",", pe, status);
1720     REGEX_CHECK_STATUS;
1721     n = pat1->split("a,b,c,", fields, 10, status);
1722     REGEX_CHECK_STATUS;
1723     REGEX_ASSERT(n==4);
1724     REGEX_ASSERT(fields[0]=="a");
1725     REGEX_ASSERT(fields[1]=="b");
1726     REGEX_ASSERT(fields[2]=="c");
1727     REGEX_ASSERT(fields[3]=="");
1728 
1729     n = pat1->split("a,,,", fields, 10, status);
1730     REGEX_CHECK_STATUS;
1731     REGEX_ASSERT(n==4);
1732     REGEX_ASSERT(fields[0]=="a");
1733     REGEX_ASSERT(fields[1]=="");
1734     REGEX_ASSERT(fields[2]=="");
1735     REGEX_ASSERT(fields[3]=="");
1736     delete pat1;
1737 
1738     // Split Separator with zero length match.
1739     pat1 = RegexPattern::compile(":?", pe, status);
1740     REGEX_CHECK_STATUS;
1741     n = pat1->split("abc", fields, 10, status);
1742     REGEX_CHECK_STATUS;
1743     REGEX_ASSERT(n==5);
1744     REGEX_ASSERT(fields[0]=="");
1745     REGEX_ASSERT(fields[1]=="a");
1746     REGEX_ASSERT(fields[2]=="b");
1747     REGEX_ASSERT(fields[3]=="c");
1748     REGEX_ASSERT(fields[4]=="");
1749 
1750     delete pat1;
1751 
1752     //
1753     // RegexPattern::pattern()
1754     //
1755     pat1 = new RegexPattern();
1756     REGEX_ASSERT(pat1->pattern() == "");
1757     delete pat1;
1758 
1759     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1760     REGEX_CHECK_STATUS;
1761     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1762     delete pat1;
1763 
1764 
1765     //
1766     // classID functions
1767     //
1768     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1769     REGEX_CHECK_STATUS;
1770     REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1771     REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1772     UnicodeString Hello("Hello, world.");
1773     RegexMatcher *m = pat1->matcher(Hello, status);
1774     REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1775     REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1776     REGEX_ASSERT(m->getDynamicClassID() != NULL);
1777     delete m;
1778     delete pat1;
1779 
1780 }
1781 
1782 //---------------------------------------------------------------------------
1783 //
1784 //      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
1785 //                       is present and working, but excluding functions
1786 //                       implementing replace operations.
1787 //
1788 //---------------------------------------------------------------------------
API_Match_UTF8()1789 void RegexTest::API_Match_UTF8() {
1790     UParseError         pe;
1791     UErrorCode          status=U_ZERO_ERROR;
1792     int32_t             flags = 0;
1793 
1794     //
1795     // Debug - slide failing test cases early
1796     //
1797 #if 0
1798     {
1799     }
1800     return;
1801 #endif
1802 
1803     //
1804     // Simple pattern compilation
1805     //
1806     {
1807         UText               re = UTEXT_INITIALIZER;
1808         regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1809         REGEX_VERBOSE_TEXT(&re);
1810         RegexPattern        *pat2;
1811         pat2 = RegexPattern::compile(&re, flags, pe, status);
1812         REGEX_CHECK_STATUS;
1813 
1814         UText input1 = UTEXT_INITIALIZER;
1815         UText input2 = UTEXT_INITIALIZER;
1816         UText empty  = UTEXT_INITIALIZER;
1817         regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1818         REGEX_VERBOSE_TEXT(&input1);
1819         regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1820         REGEX_VERBOSE_TEXT(&input2);
1821         utext_openUChars(&empty, NULL, 0, &status);
1822 
1823         int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1824         int32_t input2Len = strlen("not abc");
1825 
1826 
1827         //
1828         // Matcher creation and reset.
1829         //
1830         RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1831         REGEX_CHECK_STATUS;
1832         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1833         const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1834         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1835         m1->reset(&input2);
1836         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1837         const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1838         REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1839         m1->reset(&input1);
1840         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1841         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1842         m1->reset(&empty);
1843         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1844         REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1845 
1846         //
1847         //  reset(pos, status)
1848         //
1849         m1->reset(&input1);
1850         m1->reset(4, status);
1851         REGEX_CHECK_STATUS;
1852         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1853         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1854 
1855         m1->reset(-1, status);
1856         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1857         status = U_ZERO_ERROR;
1858 
1859         m1->reset(0, status);
1860         REGEX_CHECK_STATUS;
1861         status = U_ZERO_ERROR;
1862 
1863         m1->reset(input1Len-1, status);
1864         REGEX_CHECK_STATUS;
1865         status = U_ZERO_ERROR;
1866 
1867         m1->reset(input1Len, status);
1868         REGEX_CHECK_STATUS;
1869         status = U_ZERO_ERROR;
1870 
1871         m1->reset(input1Len+1, status);
1872         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1873         status = U_ZERO_ERROR;
1874 
1875         //
1876         // match(pos, status)
1877         //
1878         m1->reset(&input2);
1879         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1880         m1->reset();
1881         REGEX_ASSERT(m1->matches(3, status) == FALSE);
1882         m1->reset();
1883         REGEX_ASSERT(m1->matches(5, status) == FALSE);
1884         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1885         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1886         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1887 
1888         // Match() at end of string should fail, but should not
1889         //  be an error.
1890         status = U_ZERO_ERROR;
1891         REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1892         REGEX_CHECK_STATUS;
1893 
1894         // Match beyond end of string should fail with an error.
1895         status = U_ZERO_ERROR;
1896         REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1897         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1898 
1899         // Successful match at end of string.
1900         {
1901             status = U_ZERO_ERROR;
1902             RegexMatcher m("A?", 0, status);  // will match zero length string.
1903             REGEX_CHECK_STATUS;
1904             m.reset(&input1);
1905             REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1906             REGEX_CHECK_STATUS;
1907             m.reset(&empty);
1908             REGEX_ASSERT(m.matches(0, status) == TRUE);
1909             REGEX_CHECK_STATUS;
1910         }
1911 
1912 
1913         //
1914         // lookingAt(pos, status)
1915         //
1916         status = U_ZERO_ERROR;
1917         m1->reset(&input2);  // "not abc"
1918         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1919         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1920         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1921         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1922         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1923         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1924         status = U_ZERO_ERROR;
1925         REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1926         REGEX_CHECK_STATUS;
1927         REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1928         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1929 
1930         delete m1;
1931         delete pat2;
1932 
1933         utext_close(&re);
1934         utext_close(&input1);
1935         utext_close(&input2);
1936         utext_close(&empty);
1937     }
1938 
1939 
1940     //
1941     // Capture Group.
1942     //     RegexMatcher::start();
1943     //     RegexMatcher::end();
1944     //     RegexMatcher::groupCount();
1945     //
1946     {
1947         int32_t             flags=0;
1948         UParseError         pe;
1949         UErrorCode          status=U_ZERO_ERROR;
1950         UText               re=UTEXT_INITIALIZER;
1951         const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1952         utext_openUTF8(&re, str_01234567_pat, -1, &status);
1953 
1954         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1955         REGEX_CHECK_STATUS;
1956 
1957         UText input = UTEXT_INITIALIZER;
1958         const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1959         utext_openUTF8(&input, str_0123456789, -1, &status);
1960 
1961         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
1962         REGEX_CHECK_STATUS;
1963         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
1964         static const int32_t matchStarts[] = {0,  2, 4, 8};
1965         static const int32_t matchEnds[]   = {10, 8, 6, 10};
1966         int32_t i;
1967         for (i=0; i<4; i++) {
1968             int32_t actualStart = matcher->start(i, status);
1969             REGEX_CHECK_STATUS;
1970             if (actualStart != matchStarts[i]) {
1971                 errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
1972                       __FILE__, __LINE__, i, matchStarts[i], actualStart);
1973             }
1974             int32_t actualEnd = matcher->end(i, status);
1975             REGEX_CHECK_STATUS;
1976             if (actualEnd != matchEnds[i]) {
1977                 errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
1978                       __FILE__, __LINE__, i, matchEnds[i], actualEnd);
1979             }
1980         }
1981 
1982         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
1983         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
1984 
1985         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1986         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
1987         matcher->reset();
1988         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
1989 
1990         matcher->lookingAt(status);
1991 
1992         UnicodeString dest;
1993         UText destText = UTEXT_INITIALIZER;
1994         utext_openUnicodeString(&destText, &dest, &status);
1995         UText *result;
1996         //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1997         //  Test shallow-clone API
1998         int64_t   group_len;
1999         result = matcher->group((UText *)NULL, group_len, status);
2000         REGEX_CHECK_STATUS;
2001         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2002         utext_close(result);
2003         result = matcher->group(0, &destText, group_len, status);
2004         REGEX_CHECK_STATUS;
2005         REGEX_ASSERT(result == &destText);
2006         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2007         //  destText is now immutable, reopen it
2008         utext_close(&destText);
2009         utext_openUnicodeString(&destText, &dest, &status);
2010 
2011         int64_t length;
2012         result = matcher->group(0, NULL, length, status);
2013         REGEX_CHECK_STATUS;
2014         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2015         utext_close(result);
2016         result = matcher->group(0, &destText, length, status);
2017         REGEX_CHECK_STATUS;
2018         REGEX_ASSERT(result == &destText);
2019         REGEX_ASSERT(utext_getNativeIndex(result) == 0);
2020         REGEX_ASSERT(length == 10);
2021         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2022 
2023         // Capture Group 1 == "234567"
2024         result = matcher->group(1, NULL, length, status);
2025         REGEX_CHECK_STATUS;
2026         REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2027         REGEX_ASSERT(length == 6);
2028         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2029         utext_close(result);
2030 
2031         result = matcher->group(1, &destText, length, status);
2032         REGEX_CHECK_STATUS;
2033         REGEX_ASSERT(result == &destText);
2034         REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2035         REGEX_ASSERT(length == 6);
2036         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2037         utext_close(result);
2038 
2039         // Capture Group 2 == "45"
2040         result = matcher->group(2, NULL, length, status);
2041         REGEX_CHECK_STATUS;
2042         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2043         REGEX_ASSERT(length == 2);
2044         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2045         utext_close(result);
2046 
2047         result = matcher->group(2, &destText, length, status);
2048         REGEX_CHECK_STATUS;
2049         REGEX_ASSERT(result == &destText);
2050         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2051         REGEX_ASSERT(length == 2);
2052         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2053         utext_close(result);
2054 
2055         // Capture Group 3 == "89"
2056         result = matcher->group(3, NULL, length, status);
2057         REGEX_CHECK_STATUS;
2058         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2059         REGEX_ASSERT(length == 2);
2060         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2061         utext_close(result);
2062 
2063         result = matcher->group(3, &destText, length, status);
2064         REGEX_CHECK_STATUS;
2065         REGEX_ASSERT(result == &destText);
2066         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2067         REGEX_ASSERT(length == 2);
2068         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2069         utext_close(result);
2070 
2071         // Capture Group number out of range.
2072         status = U_ZERO_ERROR;
2073         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2074         status = U_ZERO_ERROR;
2075         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2076         status = U_ZERO_ERROR;
2077         matcher->reset();
2078         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2079 
2080         delete matcher;
2081         delete pat;
2082 
2083         utext_close(&destText);
2084         utext_close(&input);
2085         utext_close(&re);
2086     }
2087 
2088     //
2089     //  find
2090     //
2091     {
2092         int32_t             flags=0;
2093         UParseError         pe;
2094         UErrorCode          status=U_ZERO_ERROR;
2095         UText               re=UTEXT_INITIALIZER;
2096         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2097         utext_openUTF8(&re, str_abc, -1, &status);
2098 
2099         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2100         REGEX_CHECK_STATUS;
2101         UText input = UTEXT_INITIALIZER;
2102         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2103         utext_openUTF8(&input, str_abcabcabc, -1, &status);
2104         //                      012345678901234567
2105 
2106         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2107         REGEX_CHECK_STATUS;
2108         REGEX_ASSERT(matcher->find());
2109         REGEX_ASSERT(matcher->start(status) == 1);
2110         REGEX_ASSERT(matcher->find());
2111         REGEX_ASSERT(matcher->start(status) == 6);
2112         REGEX_ASSERT(matcher->find());
2113         REGEX_ASSERT(matcher->start(status) == 12);
2114         REGEX_ASSERT(matcher->find() == FALSE);
2115         REGEX_ASSERT(matcher->find() == FALSE);
2116 
2117         matcher->reset();
2118         REGEX_ASSERT(matcher->find());
2119         REGEX_ASSERT(matcher->start(status) == 1);
2120 
2121         REGEX_ASSERT(matcher->find(0, status));
2122         REGEX_ASSERT(matcher->start(status) == 1);
2123         REGEX_ASSERT(matcher->find(1, status));
2124         REGEX_ASSERT(matcher->start(status) == 1);
2125         REGEX_ASSERT(matcher->find(2, status));
2126         REGEX_ASSERT(matcher->start(status) == 6);
2127         REGEX_ASSERT(matcher->find(12, status));
2128         REGEX_ASSERT(matcher->start(status) == 12);
2129         REGEX_ASSERT(matcher->find(13, status) == FALSE);
2130         REGEX_ASSERT(matcher->find(16, status) == FALSE);
2131         REGEX_ASSERT(matcher->find(17, status) == FALSE);
2132         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2133 
2134         status = U_ZERO_ERROR;
2135         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2136         status = U_ZERO_ERROR;
2137         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2138 
2139         REGEX_ASSERT(matcher->groupCount() == 0);
2140 
2141         delete matcher;
2142         delete pat;
2143 
2144         utext_close(&input);
2145         utext_close(&re);
2146     }
2147 
2148 
2149     //
2150     //  find, with \G in pattern (true if at the end of a previous match).
2151     //
2152     {
2153         int32_t             flags=0;
2154         UParseError         pe;
2155         UErrorCode          status=U_ZERO_ERROR;
2156         UText               re=UTEXT_INITIALIZER;
2157         const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2158         utext_openUTF8(&re, str_Gabcabc, -1, &status);
2159 
2160         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2161 
2162         REGEX_CHECK_STATUS;
2163         UText input = UTEXT_INITIALIZER;
2164         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2165         utext_openUTF8(&input, str_abcabcabc, -1, &status);
2166         //                      012345678901234567
2167 
2168         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2169         REGEX_CHECK_STATUS;
2170         REGEX_ASSERT(matcher->find());
2171         REGEX_ASSERT(matcher->start(status) == 0);
2172         REGEX_ASSERT(matcher->start(1, status) == -1);
2173         REGEX_ASSERT(matcher->start(2, status) == 1);
2174 
2175         REGEX_ASSERT(matcher->find());
2176         REGEX_ASSERT(matcher->start(status) == 4);
2177         REGEX_ASSERT(matcher->start(1, status) == 4);
2178         REGEX_ASSERT(matcher->start(2, status) == -1);
2179         REGEX_CHECK_STATUS;
2180 
2181         delete matcher;
2182         delete pat;
2183 
2184         utext_close(&input);
2185         utext_close(&re);
2186     }
2187 
2188     //
2189     //   find with zero length matches, match position should bump ahead
2190     //     to prevent loops.
2191     //
2192     {
2193         int32_t                 i;
2194         UErrorCode          status=U_ZERO_ERROR;
2195         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
2196                                                       //   using an always-true look-ahead.
2197         REGEX_CHECK_STATUS;
2198         UText s = UTEXT_INITIALIZER;
2199         utext_openUTF8(&s, "    ", -1, &status);
2200         m.reset(&s);
2201         for (i=0; ; i++) {
2202             if (m.find() == FALSE) {
2203                 break;
2204             }
2205             REGEX_ASSERT(m.start(status) == i);
2206             REGEX_ASSERT(m.end(status) == i);
2207         }
2208         REGEX_ASSERT(i==5);
2209 
2210         // Check that the bump goes over characters outside the BMP OK
2211         // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2212         unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2213         utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2214         m.reset(&s);
2215         for (i=0; ; i+=4) {
2216             if (m.find() == FALSE) {
2217                 break;
2218             }
2219             REGEX_ASSERT(m.start(status) == i);
2220             REGEX_ASSERT(m.end(status) == i);
2221         }
2222         REGEX_ASSERT(i==20);
2223 
2224         utext_close(&s);
2225     }
2226     {
2227         // find() loop breaking test.
2228         //        with pattern of /.?/, should see a series of one char matches, then a single
2229         //        match of zero length at the end of the input string.
2230         int32_t                 i;
2231         UErrorCode          status=U_ZERO_ERROR;
2232         RegexMatcher        m(".?", 0, status);
2233         REGEX_CHECK_STATUS;
2234         UText s = UTEXT_INITIALIZER;
2235         utext_openUTF8(&s, "    ", -1, &status);
2236         m.reset(&s);
2237         for (i=0; ; i++) {
2238             if (m.find() == FALSE) {
2239                 break;
2240             }
2241             REGEX_ASSERT(m.start(status) == i);
2242             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2243         }
2244         REGEX_ASSERT(i==5);
2245 
2246         utext_close(&s);
2247     }
2248 
2249 
2250     //
2251     // Matchers with no input string behave as if they had an empty input string.
2252     //
2253 
2254     {
2255         UErrorCode status = U_ZERO_ERROR;
2256         RegexMatcher  m(".?", 0, status);
2257         REGEX_CHECK_STATUS;
2258         REGEX_ASSERT(m.find());
2259         REGEX_ASSERT(m.start(status) == 0);
2260         REGEX_ASSERT(m.input() == "");
2261     }
2262     {
2263         UErrorCode status = U_ZERO_ERROR;
2264         RegexPattern  *p = RegexPattern::compile(".", 0, status);
2265         RegexMatcher  *m = p->matcher(status);
2266         REGEX_CHECK_STATUS;
2267 
2268         REGEX_ASSERT(m->find() == FALSE);
2269         REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2270         delete m;
2271         delete p;
2272     }
2273 
2274     //
2275     // Regions
2276     //
2277     {
2278         UErrorCode status = U_ZERO_ERROR;
2279         UText testPattern = UTEXT_INITIALIZER;
2280         UText testText    = UTEXT_INITIALIZER;
2281         regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2282         REGEX_VERBOSE_TEXT(&testPattern);
2283         regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2284         REGEX_VERBOSE_TEXT(&testText);
2285 
2286         RegexMatcher m(&testPattern, &testText, 0, status);
2287         REGEX_CHECK_STATUS;
2288         REGEX_ASSERT(m.regionStart() == 0);
2289         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2290         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2291         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2292 
2293         m.region(2,4, status);
2294         REGEX_CHECK_STATUS;
2295         REGEX_ASSERT(m.matches(status));
2296         REGEX_ASSERT(m.start(status)==2);
2297         REGEX_ASSERT(m.end(status)==4);
2298         REGEX_CHECK_STATUS;
2299 
2300         m.reset();
2301         REGEX_ASSERT(m.regionStart() == 0);
2302         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2303 
2304         regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2305         REGEX_VERBOSE_TEXT(&testText);
2306         m.reset(&testText);
2307         REGEX_ASSERT(m.regionStart() == 0);
2308         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2309 
2310         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2311         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2312         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2313         REGEX_ASSERT(&m == &m.reset());
2314         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2315 
2316         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2317         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2318         REGEX_ASSERT(&m == &m.reset());
2319         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2320 
2321         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2322         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2323         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2324         REGEX_ASSERT(&m == &m.reset());
2325         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2326 
2327         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2328         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2329         REGEX_ASSERT(&m == &m.reset());
2330         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2331 
2332         utext_close(&testText);
2333         utext_close(&testPattern);
2334     }
2335 
2336     //
2337     // hitEnd() and requireEnd()
2338     //
2339     {
2340         UErrorCode status = U_ZERO_ERROR;
2341         UText testPattern = UTEXT_INITIALIZER;
2342         UText testText    = UTEXT_INITIALIZER;
2343         const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2344         const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2345         utext_openUTF8(&testPattern, str_, -1, &status);
2346         utext_openUTF8(&testText, str_aabb, -1, &status);
2347 
2348         RegexMatcher m1(&testPattern, &testText,  0, status);
2349         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2350         REGEX_ASSERT(m1.hitEnd() == TRUE);
2351         REGEX_ASSERT(m1.requireEnd() == FALSE);
2352         REGEX_CHECK_STATUS;
2353 
2354         status = U_ZERO_ERROR;
2355         const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2356         utext_openUTF8(&testPattern, str_a, -1, &status);
2357         RegexMatcher m2(&testPattern, &testText, 0, status);
2358         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2359         REGEX_ASSERT(m2.hitEnd() == FALSE);
2360         REGEX_ASSERT(m2.requireEnd() == FALSE);
2361         REGEX_CHECK_STATUS;
2362 
2363         status = U_ZERO_ERROR;
2364         const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2365         utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2366         RegexMatcher m3(&testPattern, &testText, 0, status);
2367         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2368         REGEX_ASSERT(m3.hitEnd() == TRUE);
2369         REGEX_ASSERT(m3.requireEnd() == TRUE);
2370         REGEX_CHECK_STATUS;
2371 
2372         utext_close(&testText);
2373         utext_close(&testPattern);
2374     }
2375 }
2376 
2377 
2378 //---------------------------------------------------------------------------
2379 //
2380 //      API_Replace_UTF8   API test for class RegexMatcher, testing the
2381 //                         Replace family of functions.
2382 //
2383 //---------------------------------------------------------------------------
API_Replace_UTF8()2384 void RegexTest::API_Replace_UTF8() {
    // Exercises the UText overloads of RegexMatcher::replaceFirst() and replaceAll(),
    // followed by appendReplacement()/appendTail() (Bug 4057 section at the end).
    //
    // Most replace cases are run twice:
    //   - with a NULL destination, where the returned UText is closed by the test, and
    //   - with &destText as destination, where the returned pointer is asserted to be &destText.
    //
    // Patterns, inputs and expected results are written as explicit byte arrays; the
    // trailing comment on each array shows the text those bytes spell. Presumably this
    // keeps the test independent of the compiler's execution character set (see the
    // ASCII/EBCDIC note at the top of the file) -- TODO confirm.
    //
    // REGEX_CHECK_STATUS, REGEX_ASSERT, REGEX_ASSERT_FAIL, REGEX_ASSERT_UTEXT_UTF8 and
    // REGEX_VERBOSE_TEXT are macros defined elsewhere in this file.
2385     //
2386     //  Replace
2387     //
2388     int32_t             flags=0;
2389     UParseError         pe;
2390     UErrorCode          status=U_ZERO_ERROR;
2391 
2392     UText               re=UTEXT_INITIALIZER;
2393     regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2394     REGEX_VERBOSE_TEXT(&re);
2395     RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2396     REGEX_CHECK_STATUS;
2397 
2398     char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2399     //             012345678901234567
2400     UText dataText = UTEXT_INITIALIZER;
2401     utext_openUTF8(&dataText, data, -1, &status);
2402     REGEX_CHECK_STATUS;
2403     REGEX_VERBOSE_TEXT(&dataText);
    // 'matcher' (pattern "abc") is reused, via reset(), by every section up to the
    // capture-group tests and again by the \u escape tests further down.
2404     RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2405 
2406     //
2407     //  Plain vanilla matches.
2408     //
    // destText is a UText view onto 'dest'; it is the caller-supplied destination
    // used by the second run of each case.
2409     UnicodeString  dest;
2410     UText destText = UTEXT_INITIALIZER;
2411     utext_openUnicodeString(&destText, &dest, &status);
2412     UText *result;
2413 
2414     UText replText = UTEXT_INITIALIZER;
2415 
2416     const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2417     utext_openUTF8(&replText, str_yz, -1, &status);
2418     REGEX_VERBOSE_TEXT(&replText);
2419     result = matcher->replaceFirst(&replText, NULL, status);
2420     REGEX_CHECK_STATUS;
2421     const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2422     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2423     utext_close(result);
2424     result = matcher->replaceFirst(&replText, &destText, status);
2425     REGEX_CHECK_STATUS;
2426     REGEX_ASSERT(result == &destText);
2427     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2428 
2429     result = matcher->replaceAll(&replText, NULL, status);
2430     REGEX_CHECK_STATUS;
2431     const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2432     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2433     utext_close(result);
2434 
    // Empty destText (replace its whole native range with nothing) before reusing it
    // as the destination; the same idiom recurs throughout this function.
2435     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2436     result = matcher->replaceAll(&replText, &destText, status);
2437     REGEX_CHECK_STATUS;
2438     REGEX_ASSERT(result == &destText);
2439     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2440 
2441     //
2442     //  Plain vanilla non-matches.
2443     //
    // With no "abc" in the input, every replace call is expected to yield the input unchanged.
2444     const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2445     utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2446     matcher->reset(&dataText);
2447 
2448     result = matcher->replaceFirst(&replText, NULL, status);
2449     REGEX_CHECK_STATUS;
2450     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2451     utext_close(result);
2452     result = matcher->replaceFirst(&replText, &destText, status);
2453     REGEX_CHECK_STATUS;
2454     REGEX_ASSERT(result == &destText);
2455     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2456 
2457     result = matcher->replaceAll(&replText, NULL, status);
2458     REGEX_CHECK_STATUS;
2459     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2460     utext_close(result);
2461     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2462     result = matcher->replaceAll(&replText, &destText, status);
2463     REGEX_CHECK_STATUS;
2464     REGEX_ASSERT(result == &destText);
2465     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2466 
2467     //
2468     // Empty source string
2469     //
    // The input is opened from a NULL pointer with length 0; all results are expected to be empty.
2470     utext_openUTF8(&dataText, NULL, 0, &status);
2471     matcher->reset(&dataText);
2472 
2473     result = matcher->replaceFirst(&replText, NULL, status);
2474     REGEX_CHECK_STATUS;
2475     REGEX_ASSERT_UTEXT_UTF8("", result);
2476     utext_close(result);
2477     result = matcher->replaceFirst(&replText, &destText, status);
2478     REGEX_CHECK_STATUS;
2479     REGEX_ASSERT(result == &destText);
2480     REGEX_ASSERT_UTEXT_UTF8("", result);
2481 
2482     result = matcher->replaceAll(&replText, NULL, status);
2483     REGEX_CHECK_STATUS;
2484     REGEX_ASSERT_UTEXT_UTF8("", result);
2485     utext_close(result);
2486     result = matcher->replaceAll(&replText, &destText, status);
2487     REGEX_CHECK_STATUS;
2488     REGEX_ASSERT(result == &destText);
2489     REGEX_ASSERT_UTEXT_UTF8("", result);
2490 
2491     //
2492     // Empty substitution string
2493     //
    // Replacing with an empty string deletes the matched "abc" occurrences.
2494     utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2495     matcher->reset(&dataText);
2496 
2497     utext_openUTF8(&replText, NULL, 0, &status);
2498     result = matcher->replaceFirst(&replText, NULL, status);
2499     REGEX_CHECK_STATUS;
2500     const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2501     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2502     utext_close(result);
2503     result = matcher->replaceFirst(&replText, &destText, status);
2504     REGEX_CHECK_STATUS;
2505     REGEX_ASSERT(result == &destText);
2506     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2507 
2508     result = matcher->replaceAll(&replText, NULL, status);
2509     REGEX_CHECK_STATUS;
2510     const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2511     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2512     utext_close(result);
2513     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2514     result = matcher->replaceAll(&replText, &destText, status);
2515     REGEX_CHECK_STATUS;
2516     REGEX_ASSERT(result == &destText);
2517     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2518 
2519     //
2520     // match whole string
2521     //
2522     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2523     utext_openUTF8(&dataText, str_abc, -1, &status);
2524     matcher->reset(&dataText);
2525 
2526     const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2527     utext_openUTF8(&replText, str_xyz, -1, &status);
2528     result = matcher->replaceFirst(&replText, NULL, status);
2529     REGEX_CHECK_STATUS;
2530     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2531     utext_close(result);
2532     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2533     result = matcher->replaceFirst(&replText, &destText, status);
2534     REGEX_CHECK_STATUS;
2535     REGEX_ASSERT(result == &destText);
2536     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2537 
2538     result = matcher->replaceAll(&replText, NULL, status);
2539     REGEX_CHECK_STATUS;
2540     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2541     utext_close(result);
2542     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2543     result = matcher->replaceAll(&replText, &destText, status);
2544     REGEX_CHECK_STATUS;
2545     REGEX_ASSERT(result == &destText);
2546     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2547 
2548     //
2549     // Capture Group, simple case
2550     //
    // A second pattern/matcher pair ("a(..)" over "abcdefg") is used for all the
    // capture-group substitution cases below; group 1 captures "bc".
2551     const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2552     utext_openUTF8(&re, str_add, -1, &status);
2553     RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2554     REGEX_CHECK_STATUS;
2555 
2556     const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2557     utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2558     RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2559     REGEX_CHECK_STATUS;
2560 
2561     const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2562     utext_openUTF8(&replText, str_11, -1, &status);
2563     result = matcher2->replaceFirst(&replText, NULL, status);
2564     REGEX_CHECK_STATUS;
2565     const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2566     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2567     utext_close(result);
2568     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2569     result = matcher2->replaceFirst(&replText, &destText, status);
2570     REGEX_CHECK_STATUS;
2571     REGEX_ASSERT(result == &destText);
2572     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2573 
    // Escaped "\$1" must come through as a literal "$1", while the unescaped "$1" is
    // replaced by the captured text (see the expected string below).
2574     const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2575     utext_openUTF8(&replText, str_v, -1, &status);
2576     REGEX_VERBOSE_TEXT(&replText);
2577     result = matcher2->replaceFirst(&replText, NULL, status);
2578     REGEX_CHECK_STATUS;
2579     const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2580     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2581     utext_close(result);
2582     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2583     result = matcher2->replaceFirst(&replText, &destText, status);
2584     REGEX_CHECK_STATUS;
2585     REGEX_ASSERT(result == &destText);
2586     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2587 
    // Escaped dollar signs that are not followed by a group number.
2588     const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2589                0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2590                0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2591     utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2592     result = matcher2->replaceFirst(&replText, NULL, status);
2593     REGEX_CHECK_STATUS;
2594     const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2595     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2596     utext_close(result);
2597     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2598     result = matcher2->replaceFirst(&replText, &destText, status);
2599     REGEX_CHECK_STATUS;
2600     REGEX_ASSERT(result == &destText);
2601     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2602 
    // The "xxxx" placeholder at indexes 22..25 is overwritten with F0 9D 9F 8F, the
    // UTF-8 encoding of U+1D7CF, so the replacement text reads "... $<U+1D7CF>.".
    // The expected output below shows that sequence being substituted with group 1 ("bc").
2603     unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2604     //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2605     //                                 012345678901234567890123456
2606     supplDigitChars[22] = 0xF0;
2607     supplDigitChars[23] = 0x9D;
2608     supplDigitChars[24] = 0x9F;
2609     supplDigitChars[25] = 0x8F;
2610     utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2611 
2612     result = matcher2->replaceFirst(&replText, NULL, status);
2613     REGEX_CHECK_STATUS;
2614     const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2615     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2616     utext_close(result);
2617     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2618     result = matcher2->replaceFirst(&replText, &destText, status);
2619     REGEX_CHECK_STATUS;
2620     REGEX_ASSERT(result == &destText);
2621     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
    // The pattern has only one capture group, so "$5" is an invalid reference and the
    // call is expected to end with U_INDEX_OUTOFBOUNDS_ERROR.
    // NOTE(review): REGEX_ASSERT_FAIL is defined elsewhere; the later calls that rely on
    // 'status' suggest it restores a success status afterwards -- confirm.
2622     const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5... */
2623     utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2624     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2625 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2626     utext_close(result);
2627     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2628     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2629     REGEX_ASSERT(result == &destText);
2630 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2631 
2632     //
2633     // Replacement String with \u hex escapes
2634     //
    // These two blocks go back to the first matcher (pattern "abc").
2635     {
2636       const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2637       const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2638         utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2639         utext_openUTF8(&replText, str_u0043, -1, &status);
2640         matcher->reset(&dataText);
2641 
2642         result = matcher->replaceAll(&replText, NULL, status);
2643         REGEX_CHECK_STATUS;
2644         const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2645         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2646         utext_close(result);
2647         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2648         result = matcher->replaceAll(&replText, &destText, status);
2649         REGEX_CHECK_STATUS;
2650         REGEX_ASSERT(result == &destText);
2651         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2652     }
2653     {
        // This str_abc ("abc !") shadows the function-level str_abc ("abc") inside this block.
2654       const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2655         utext_openUTF8(&dataText, str_abc, -1, &status);
2656         const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2657         utext_openUTF8(&replText, str_U00010000, -1, &status);
2658         matcher->reset(&dataText);
2659 
        // Bytes 2..5 of the expected text are patched to F0 90 80 80, the UTF-8 encoding of U+10000.
2660         unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2661         //                          0123456789
2662         expected[2] = 0xF0;
2663         expected[3] = 0x90;
2664         expected[4] = 0x80;
2665         expected[5] = 0x80;
2666 
2667         result = matcher->replaceAll(&replText, NULL, status);
2668         REGEX_CHECK_STATUS;
2669         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2670         utext_close(result);
2671         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2672         result = matcher->replaceAll(&replText, &destText, status);
2673         REGEX_CHECK_STATUS;
2674         REGEX_ASSERT(result == &destText);
2675         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2676     }
2677     // TODO:  need more thorough testing of capture substitutions.
2678 
2679     // Bug 4057
2680     //
    // appendReplacement()/appendTail() with a stack-allocated matcher for "ss(.*?)ee".
    // In all three variants two find() calls precede a single appendReplacement(), and
    // the appended text is expected to start at the beginning of the input.
2681     {
2682         status = U_ZERO_ERROR;
2683 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2684 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2685 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2686         utext_openUTF8(&re, str_ssee, -1, &status);
2687         utext_openUTF8(&dataText, str_blah, -1, &status);
2688         utext_openUTF8(&replText, str_ooh, -1, &status);
2689 
2690         RegexMatcher m(&re, 0, status);
2691         REGEX_CHECK_STATUS;
2692 
        // Note: this UnicodeString 'result' shadows the function-level UText *result
        // for the remainder of this block.
2693         UnicodeString result;
2694         UText resultText = UTEXT_INITIALIZER;
2695         utext_openUnicodeString(&resultText, &result, &status);
2696 
2697         // Multiple finds do NOT bump up the previous appendReplacement position.
2698         m.reset(&dataText);
2699         m.find();
2700         m.find();
2701         m.appendReplacement(&resultText, &replText, status);
2702         REGEX_CHECK_STATUS;
2703         const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2704         REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2705 
2706         // After a reset into the interior of a string, appendReplacement still starts at beginning.
2707         status = U_ZERO_ERROR;
        // Clear the backing string, then re-open the UText over it.
2708         result.truncate(0);
2709         utext_openUnicodeString(&resultText, &result, &status);
2710         m.reset(10, status);
2711         m.find();
2712         m.find();
2713         m.appendReplacement(&resultText, &replText, status);
2714         REGEX_CHECK_STATUS;
2715         const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2716         REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2717 
2718         // find() at interior of string, appendReplacement still starts at beginning.
2719         status = U_ZERO_ERROR;
2720         result.truncate(0);
2721         utext_openUnicodeString(&resultText, &result, &status);
2722         m.reset();
2723         m.find(10, status);
2724         m.find();
2725         m.appendReplacement(&resultText, &replText, status);
2726         REGEX_CHECK_STATUS;
2727         const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2728         REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2729 
        // appendTail() adds the remainder of the input (" fin") after the last match.
        // NOTE(review): status is not checked after this call.
2730         m.appendTail(&resultText, status);
2731         const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2732         REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2733 
2734         utext_close(&resultText);
2735     }
2736 
    // Cleanup: each matcher is deleted before the pattern it was created from, then the
    // stack UTexts are closed.
2737     delete matcher2;
2738     delete pat2;
2739     delete matcher;
2740     delete pat;
2741 
2742     utext_close(&dataText);
2743     utext_close(&replText);
2744     utext_close(&destText);
2745     utext_close(&re);
2746 }
2747 
2748 
2749 //---------------------------------------------------------------------------
2750 //
2751 //      API_Pattern_UTF8  Test that the API for class RegexPattern is
2752 //                        present and nominally working.
2753 //
2754 //---------------------------------------------------------------------------
API_Pattern_UTF8()2755 void RegexTest::API_Pattern_UTF8() {
2756     RegexPattern        pata;    // Test default constructor to not crash.
2757     RegexPattern        patb;
2758 
2759     REGEX_ASSERT(pata == patb);
2760     REGEX_ASSERT(pata == pata);
2761 
2762     UText         re1 = UTEXT_INITIALIZER;
2763     UText         re2 = UTEXT_INITIALIZER;
2764     UErrorCode    status = U_ZERO_ERROR;
2765     UParseError   pe;
2766 
2767     const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2768     const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2769     utext_openUTF8(&re1, str_abcalmz, -1, &status);
2770     utext_openUTF8(&re2, str_def, -1, &status);
2771 
2772     RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2773     RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2774     REGEX_CHECK_STATUS;
2775     REGEX_ASSERT(*pat1 == *pat1);
2776     REGEX_ASSERT(*pat1 != pata);
2777 
2778     // Assign
2779     patb = *pat1;
2780     REGEX_ASSERT(patb == *pat1);
2781 
2782     // Copy Construct
2783     RegexPattern patc(*pat1);
2784     REGEX_ASSERT(patc == *pat1);
2785     REGEX_ASSERT(patb == patc);
2786     REGEX_ASSERT(pat1 != pat2);
2787     patb = *pat2;
2788     REGEX_ASSERT(patb != patc);
2789     REGEX_ASSERT(patb == *pat2);
2790 
2791     // Compile with no flags.
2792     RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
2793     REGEX_ASSERT(*pat1a == *pat1);
2794 
2795     REGEX_ASSERT(pat1a->flags() == 0);
2796 
2797     // Compile with different flags should be not equal
2798     RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2799     REGEX_CHECK_STATUS;
2800 
2801     REGEX_ASSERT(*pat1b != *pat1a);
2802     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2803     REGEX_ASSERT(pat1a->flags() == 0);
2804     delete pat1b;
2805 
2806     // clone
2807     RegexPattern *pat1c = pat1->clone();
2808     REGEX_ASSERT(*pat1c == *pat1);
2809     REGEX_ASSERT(*pat1c != *pat2);
2810 
2811     delete pat1c;
2812     delete pat1a;
2813     delete pat1;
2814     delete pat2;
2815 
2816     utext_close(&re1);
2817     utext_close(&re2);
2818 
2819 
2820     //
2821     //   Verify that a matcher created from a cloned pattern works.
2822     //     (Jitterbug 3423)
2823     //
2824     {
2825         UErrorCode     status     = U_ZERO_ERROR;
2826         UText          pattern    = UTEXT_INITIALIZER;
2827         const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2828         utext_openUTF8(&pattern, str_pL, -1, &status);
2829 
2830         RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
2831         RegexPattern  *pClone     = pSource->clone();
2832         delete         pSource;
2833         RegexMatcher  *mFromClone = pClone->matcher(status);
2834         REGEX_CHECK_STATUS;
2835 
2836         UText          input      = UTEXT_INITIALIZER;
2837         const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2838         utext_openUTF8(&input, str_HelloWorld, -1, &status);
2839         mFromClone->reset(&input);
2840         REGEX_ASSERT(mFromClone->find() == TRUE);
2841         REGEX_ASSERT(mFromClone->group(status) == "Hello");
2842         REGEX_ASSERT(mFromClone->find() == TRUE);
2843         REGEX_ASSERT(mFromClone->group(status) == "World");
2844         REGEX_ASSERT(mFromClone->find() == FALSE);
2845         delete mFromClone;
2846         delete pClone;
2847 
2848         utext_close(&input);
2849         utext_close(&pattern);
2850     }
2851 
2852     //
2853     //   matches convenience API
2854     //
2855     {
2856         UErrorCode status  = U_ZERO_ERROR;
2857         UText      pattern = UTEXT_INITIALIZER;
2858         UText      input   = UTEXT_INITIALIZER;
2859 
2860         const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2861         utext_openUTF8(&input, str_randominput, -1, &status);
2862 
2863         const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2864         utext_openUTF8(&pattern, str_dotstar, -1, &status);
2865         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2866         REGEX_CHECK_STATUS;
2867 
2868         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2869         utext_openUTF8(&pattern, str_abc, -1, &status);
2870         REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2871         REGEX_CHECK_STATUS;
2872 
2873         const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2874         utext_openUTF8(&pattern, str_nput, -1, &status);
2875         REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2876         REGEX_CHECK_STATUS;
2877 
2878         utext_openUTF8(&pattern, str_randominput, -1, &status);
2879         REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2880         REGEX_CHECK_STATUS;
2881 
2882         const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2883         utext_openUTF8(&pattern, str_u, -1, &status);
2884         REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2885         REGEX_CHECK_STATUS;
2886 
2887         utext_openUTF8(&input, str_abc, -1, &status);
2888         utext_openUTF8(&pattern, str_abc, -1, &status);
2889         status = U_INDEX_OUTOFBOUNDS_ERROR;
2890         REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2891         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2892 
2893         utext_close(&input);
2894         utext_close(&pattern);
2895     }
2896 
2897 
2898     //
2899     // Split()
2900     //
2901     status = U_ZERO_ERROR;
2902     const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
2903     utext_openUTF8(&re1, str_spaceplus, -1, &status);
2904     pat1 = RegexPattern::compile(&re1, pe, status);
2905     REGEX_CHECK_STATUS;
2906     UnicodeString  fields[10];
2907 
2908     int32_t n;
2909     n = pat1->split("Now is the time", fields, 10, status);
2910     REGEX_CHECK_STATUS;
2911     REGEX_ASSERT(n==4);
2912     REGEX_ASSERT(fields[0]=="Now");
2913     REGEX_ASSERT(fields[1]=="is");
2914     REGEX_ASSERT(fields[2]=="the");
2915     REGEX_ASSERT(fields[3]=="time");
2916     REGEX_ASSERT(fields[4]=="");
2917 
2918     n = pat1->split("Now is the time", fields, 2, status);
2919     REGEX_CHECK_STATUS;
2920     REGEX_ASSERT(n==2);
2921     REGEX_ASSERT(fields[0]=="Now");
2922     REGEX_ASSERT(fields[1]=="is the time");
2923     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
2924 
2925     fields[1] = "*";
2926     status = U_ZERO_ERROR;
2927     n = pat1->split("Now is the time", fields, 1, status);
2928     REGEX_CHECK_STATUS;
2929     REGEX_ASSERT(n==1);
2930     REGEX_ASSERT(fields[0]=="Now is the time");
2931     REGEX_ASSERT(fields[1]=="*");
2932     status = U_ZERO_ERROR;
2933 
2934     n = pat1->split("    Now       is the time   ", fields, 10, status);
2935     REGEX_CHECK_STATUS;
2936     REGEX_ASSERT(n==6);
2937     REGEX_ASSERT(fields[0]=="");
2938     REGEX_ASSERT(fields[1]=="Now");
2939     REGEX_ASSERT(fields[2]=="is");
2940     REGEX_ASSERT(fields[3]=="the");
2941     REGEX_ASSERT(fields[4]=="time");
2942     REGEX_ASSERT(fields[5]=="");
2943     REGEX_ASSERT(fields[6]=="");
2944 
2945     fields[2] = "*";
2946     n = pat1->split("     ", fields, 10, status);
2947     REGEX_CHECK_STATUS;
2948     REGEX_ASSERT(n==2);
2949     REGEX_ASSERT(fields[0]=="");
2950     REGEX_ASSERT(fields[1]=="");
2951     REGEX_ASSERT(fields[2]=="*");
2952 
2953     fields[0] = "foo";
2954     n = pat1->split("", fields, 10, status);
2955     REGEX_CHECK_STATUS;
2956     REGEX_ASSERT(n==0);
2957     REGEX_ASSERT(fields[0]=="foo");
2958 
2959     delete pat1;
2960 
2961     //  split, with a pattern with (capture)
2962     regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
2963     pat1 = RegexPattern::compile(&re1,  pe, status);
2964     REGEX_CHECK_STATUS;
2965 
2966     status = U_ZERO_ERROR;
2967     fields[6] = fields[7] = "*";
2968     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
2969     REGEX_CHECK_STATUS;
2970     REGEX_ASSERT(n==7);
2971     REGEX_ASSERT(fields[0]=="");
2972     REGEX_ASSERT(fields[1]=="a");
2973     REGEX_ASSERT(fields[2]=="Now is ");
2974     REGEX_ASSERT(fields[3]=="b");
2975     REGEX_ASSERT(fields[4]=="the time");
2976     REGEX_ASSERT(fields[5]=="c");
2977     REGEX_ASSERT(fields[6]=="");
2978     REGEX_ASSERT(fields[7]=="*");
2979     REGEX_ASSERT(status==U_ZERO_ERROR);
2980 
2981     fields[6] = fields[7] = "*";
2982     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
2983     REGEX_CHECK_STATUS;
2984     REGEX_ASSERT(n==7);
2985     REGEX_ASSERT(fields[0]=="  ");
2986     REGEX_ASSERT(fields[1]=="a");
2987     REGEX_ASSERT(fields[2]=="Now is ");
2988     REGEX_ASSERT(fields[3]=="b");
2989     REGEX_ASSERT(fields[4]=="the time");
2990     REGEX_ASSERT(fields[5]=="c");
2991     REGEX_ASSERT(fields[6]=="");
2992     REGEX_ASSERT(fields[7]=="*");
2993 
2994     status = U_ZERO_ERROR;
2995     fields[6] = "foo";
2996     n = pat1->split("  <a>Now is <b>the time<c> ", fields, 6, status);
2997     REGEX_CHECK_STATUS;
2998     REGEX_ASSERT(n==6);
2999     REGEX_ASSERT(fields[0]=="  ");
3000     REGEX_ASSERT(fields[1]=="a");
3001     REGEX_ASSERT(fields[2]=="Now is ");
3002     REGEX_ASSERT(fields[3]=="b");
3003     REGEX_ASSERT(fields[4]=="the time");
3004     REGEX_ASSERT(fields[5]==" ");
3005     REGEX_ASSERT(fields[6]=="foo");
3006 
3007     status = U_ZERO_ERROR;
3008     fields[5] = "foo";
3009     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
3010     REGEX_CHECK_STATUS;
3011     REGEX_ASSERT(n==5);
3012     REGEX_ASSERT(fields[0]=="  ");
3013     REGEX_ASSERT(fields[1]=="a");
3014     REGEX_ASSERT(fields[2]=="Now is ");
3015     REGEX_ASSERT(fields[3]=="b");
3016     REGEX_ASSERT(fields[4]=="the time<c>");
3017     REGEX_ASSERT(fields[5]=="foo");
3018 
3019     status = U_ZERO_ERROR;
3020     fields[5] = "foo";
3021     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
3022     REGEX_CHECK_STATUS;
3023     REGEX_ASSERT(n==5);
3024     REGEX_ASSERT(fields[0]=="  ");
3025     REGEX_ASSERT(fields[1]=="a");
3026     REGEX_ASSERT(fields[2]=="Now is ");
3027     REGEX_ASSERT(fields[3]=="b");
3028     REGEX_ASSERT(fields[4]=="the time");
3029     REGEX_ASSERT(fields[5]=="foo");
3030 
3031     status = U_ZERO_ERROR;
3032     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
3033     REGEX_CHECK_STATUS;
3034     REGEX_ASSERT(n==4);
3035     REGEX_ASSERT(fields[0]=="  ");
3036     REGEX_ASSERT(fields[1]=="a");
3037     REGEX_ASSERT(fields[2]=="Now is ");
3038     REGEX_ASSERT(fields[3]=="the time<c>");
3039     status = U_ZERO_ERROR;
3040     delete pat1;
3041 
3042     regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3043     pat1 = RegexPattern::compile(&re1, pe, status);
3044     REGEX_CHECK_STATUS;
3045     n = pat1->split("1-10,20", fields, 10, status);
3046     REGEX_CHECK_STATUS;
3047     REGEX_ASSERT(n==5);
3048     REGEX_ASSERT(fields[0]=="1");
3049     REGEX_ASSERT(fields[1]=="-");
3050     REGEX_ASSERT(fields[2]=="10");
3051     REGEX_ASSERT(fields[3]==",");
3052     REGEX_ASSERT(fields[4]=="20");
3053     delete pat1;
3054 
3055 
3056     //
3057     // split of a UText based string, with library allocating output UTexts.
3058     //
3059     {
3060         status = U_ZERO_ERROR;
3061         RegexMatcher matcher(UnicodeString("(:)"), 0, status);
3062         UnicodeString stringToSplit("first:second:third");
3063         UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
3064         REGEX_CHECK_STATUS;
3065 
3066         UText *splits[10] = {NULL};
3067         int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
3068         REGEX_CHECK_STATUS;
3069         REGEX_ASSERT(numFields == 5);
3070         REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
3071         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
3072         REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
3073         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
3074         REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
3075         REGEX_ASSERT(splits[5] == NULL);
3076 
3077         for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
3078             if (splits[i]) {
3079                 utext_close(splits[i]);
3080                 splits[i] = NULL;
3081             }
3082         }
3083         utext_close(textToSplit);
3084     }
3085 
3086 
3087     //
3088     // RegexPattern::pattern() and patternText()
3089     //
3090     pat1 = new RegexPattern();
3091     REGEX_ASSERT(pat1->pattern() == "");
3092     REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3093     delete pat1;
3094     const char *helloWorldInvariant = "(Hello, world)*";
3095     regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3096     pat1 = RegexPattern::compile(&re1, pe, status);
3097     REGEX_CHECK_STATUS;
3098     REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
3099     REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3100     delete pat1;
3101 
3102     utext_close(&re1);
3103 }
3104 
3105 
3106 //---------------------------------------------------------------------------
3107 //
3108 //      Extended       A more thorough check for features of regex patterns
3109 //                     The test cases are in a separate data file,
3110 //                       source/test/testdata/regextst.txt
3111 //                     A description of the test data format is included in that file.
3112 //
3113 //---------------------------------------------------------------------------
3114 
3115 const char *
getPath(char buffer[2048],const char * filename)3116 RegexTest::getPath(char buffer[2048], const char *filename) {
3117     UErrorCode status=U_ZERO_ERROR;
3118     const char *testDataDirectory = IntlTest::getSourceTestData(status);
3119     if (U_FAILURE(status)) {
3120         errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3121         return NULL;
3122     }
3123 
3124     strcpy(buffer, testDataDirectory);
3125     strcat(buffer, filename);
3126     return buffer;
3127 }
3128 
Extended()3129 void RegexTest::Extended() {
3130     char tdd[2048];
3131     const char *srcPath;
3132     UErrorCode  status  = U_ZERO_ERROR;
3133     int32_t     lineNum = 0;
3134 
3135     //
3136     //  Open and read the test data file.
3137     //
3138     srcPath=getPath(tdd, "regextst.txt");
3139     if(srcPath==NULL) {
3140         return; /* something went wrong, error already output */
3141     }
3142 
3143     int32_t    len;
3144     UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3145     if (U_FAILURE(status)) {
3146         return; /* something went wrong, error already output */
3147     }
3148 
3149     //
3150     //  Put the test data into a UnicodeString
3151     //
3152     UnicodeString testString(FALSE, testData, len);
3153 
3154     RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3155     RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3156     RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3157 
3158     RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3159     UnicodeString   testPattern;   // The pattern for test from the test file.
3160     UnicodeString   testFlags;     // the flags   for a test.
3161     UnicodeString   matchString;   // The marked up string to be used as input
3162 
3163     if (U_FAILURE(status)){
3164         dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
3165         delete [] testData;
3166         return;
3167     }
3168 
3169     //
3170     //  Loop over the test data file, once per line.
3171     //
3172     while (lineMat.find()) {
3173         lineNum++;
3174         if (U_FAILURE(status)) {
3175           errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3176         }
3177 
3178         status = U_ZERO_ERROR;
3179         UnicodeString testLine = lineMat.group(1, status);
3180         if (testLine.length() == 0) {
3181             continue;
3182         }
3183 
3184         //
3185         // Parse the test line.  Skip blank and comment only lines.
3186         // Separate out the three main fields - pattern, flags, target.
3187         //
3188 
3189         commentMat.reset(testLine);
3190         if (commentMat.lookingAt(status)) {
3191             // This line is a comment, or blank.
3192             continue;
3193         }
3194 
3195         //
3196         //  Pull out the pattern field, remove it from the test file line.
3197         //
3198         quotedStuffMat.reset(testLine);
3199         if (quotedStuffMat.lookingAt(status)) {
3200             testPattern = quotedStuffMat.group(2, status);
3201             testLine.remove(0, quotedStuffMat.end(0, status));
3202         } else {
3203             errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3204             continue;
3205         }
3206 
3207 
3208         //
3209         //  Pull out the flags from the test file line.
3210         //
3211         flagsMat.reset(testLine);
3212         flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
3213         testFlags = flagsMat.group(1, status);
3214         if (flagsMat.group(2, status).length() > 0) {
3215             errln("Bad Match flag at line %d. Scanning %c\n",
3216                 lineNum, flagsMat.group(2, status).charAt(0));
3217             continue;
3218         }
3219         testLine.remove(0, flagsMat.end(0, status));
3220 
3221         //
3222         //  Pull out the match string, as a whole.
3223         //    We'll process the <tags> later.
3224         //
3225         quotedStuffMat.reset(testLine);
3226         if (quotedStuffMat.lookingAt(status)) {
3227             matchString = quotedStuffMat.group(2, status);
3228             testLine.remove(0, quotedStuffMat.end(0, status));
3229         } else {
3230             errln("Bad match string at test file line %d", lineNum);
3231             continue;
3232         }
3233 
3234         //
3235         //  The only thing left from the input line should be an optional trailing comment.
3236         //
3237         commentMat.reset(testLine);
3238         if (commentMat.lookingAt(status) == FALSE) {
3239             errln("Line %d: unexpected characters at end of test line.", lineNum);
3240             continue;
3241         }
3242 
3243         //
3244         //  Run the test
3245         //
3246         regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3247     }
3248 
3249     delete [] testData;
3250 
3251 }
3252 
3253 
3254 
3255 //---------------------------------------------------------------------------
3256 //
3257 //    regex_find(pattern, flags, inputString, lineNumber)
3258 //
3259 //         Function to run a single test from the Extended (data driven) tests.
3260 //         See file test/testdata/regextst.txt for a description of the
3261 //         pattern and inputString fields, and the allowed flags.
3262 //         lineNumber is the source line in regextst.txt of the test.
3263 //
3264 //---------------------------------------------------------------------------
3265 
3266 
3267 //  Set a value into a UVector at position specified by a decimal number in
3268 //   a UnicodeString.   This is a utility function needed by the actual test function,
3269 //   which follows.
set(UVector & vec,int32_t val,UnicodeString index)3270 static void set(UVector &vec, int32_t val, UnicodeString index) {
3271     UErrorCode  status=U_ZERO_ERROR;
3272     int32_t  idx = 0;
3273     for (int32_t i=0; i<index.length(); i++) {
3274         int32_t d=u_charDigitValue(index.charAt(i));
3275         if (d<0) {return;}
3276         idx = idx*10 + d;
3277     }
3278     while (vec.size()<idx+1) {vec.addElement(-1, status);}
3279     vec.setElementAt(val, idx);
3280 }
3281 
setInt(UVector & vec,int32_t val,int32_t idx)3282 static void setInt(UVector &vec, int32_t val, int32_t idx) {
3283     UErrorCode  status=U_ZERO_ERROR;
3284     while (vec.size()<idx+1) {vec.addElement(-1, status);}
3285     vec.setElementAt(val, idx);
3286 }
3287 
utextOffsetToNative(UText * utext,int32_t unistrOffset,int32_t & nativeIndex)3288 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3289 {
3290     UBool couldFind = TRUE;
3291     UTEXT_SETNATIVEINDEX(utext, 0);
3292     int32_t i = 0;
3293     while (i < unistrOffset) {
3294         UChar32 c = UTEXT_NEXT32(utext);
3295         if (c != U_SENTINEL) {
3296             i += U16_LENGTH(c);
3297         } else {
3298             couldFind = FALSE;
3299             break;
3300         }
3301     }
3302     nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3303     return couldFind;
3304 }
3305 
3306 
regex_find(const UnicodeString & pattern,const UnicodeString & flags,const UnicodeString & inputString,const char * srcPath,int32_t line)3307 void RegexTest::regex_find(const UnicodeString &pattern,
3308                            const UnicodeString &flags,
3309                            const UnicodeString &inputString,
3310                            const char *srcPath,
3311                            int32_t line) {
3312     UnicodeString       unEscapedInput;
3313     UnicodeString       deTaggedInput;
3314 
3315     int32_t             patternUTF8Length,      inputUTF8Length;
3316     char                *patternChars  = NULL, *inputChars = NULL;
3317     UText               patternText    = UTEXT_INITIALIZER;
3318     UText               inputText      = UTEXT_INITIALIZER;
3319     UConverter          *UTF8Converter = NULL;
3320 
3321     UErrorCode          status         = U_ZERO_ERROR;
3322     UParseError         pe;
3323     RegexPattern        *parsePat      = NULL;
3324     RegexMatcher        *parseMatcher  = NULL;
3325     RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
3326     RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
3327     UVector             groupStarts(status);
3328     UVector             groupEnds(status);
3329     UVector             groupStartsUTF8(status);
3330     UVector             groupEndsUTF8(status);
3331     UBool               isMatch        = FALSE, isUTF8Match = FALSE;
3332     UBool               failed         = FALSE;
3333     int32_t             numFinds;
3334     int32_t             i;
3335     UBool               useMatchesFunc   = FALSE;
3336     UBool               useLookingAtFunc = FALSE;
3337     int32_t             regionStart      = -1;
3338     int32_t             regionEnd        = -1;
3339     int32_t             regionStartUTF8  = -1;
3340     int32_t             regionEndUTF8    = -1;
3341 
3342 
3343     //
3344     //  Compile the caller's pattern
3345     //
3346     uint32_t bflags = 0;
3347     if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
3348         bflags |= UREGEX_CASE_INSENSITIVE;
3349     }
3350     if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
3351         bflags |= UREGEX_COMMENTS;
3352     }
3353     if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
3354         bflags |= UREGEX_DOTALL;
3355     }
3356     if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
3357         bflags |= UREGEX_MULTILINE;
3358     }
3359 
3360     if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3361         bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3362     }
3363     if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3364         bflags |= UREGEX_UNIX_LINES;
3365     }
3366     if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3367         bflags |= UREGEX_LITERAL;
3368     }
3369 
3370 
3371     callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3372     if (status != U_ZERO_ERROR) {
3373         #if UCONFIG_NO_BREAK_ITERATION==1
3374         // 'v' test flag means that the test pattern should not compile if ICU was configured
3375         //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3376         if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3377             goto cleanupAndReturn;
3378         }
3379         #endif
3380         if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3381             // Expected pattern compilation error.
3382             if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3383                 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3384             }
3385             goto cleanupAndReturn;
3386         } else {
3387             // Unexpected pattern compilation error.
3388             dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3389             goto cleanupAndReturn;
3390         }
3391     }
3392 
3393     UTF8Converter = ucnv_open("UTF8", &status);
3394     ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3395 
3396     patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3397     status = U_ZERO_ERROR; // buffer overflow
3398     patternChars = new char[patternUTF8Length+1];
3399     pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3400     utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3401 
3402     if (status == U_ZERO_ERROR) {
3403         UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3404 
3405         if (status != U_ZERO_ERROR) {
3406 #if UCONFIG_NO_BREAK_ITERATION==1
3407             // 'v' test flag means that the test pattern should not compile if ICU was configured
3408             //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3409             if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3410                 goto cleanupAndReturn;
3411             }
3412 #endif
3413             if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3414                 // Expected pattern compilation error.
3415                 if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3416                     logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3417                 }
3418                 goto cleanupAndReturn;
3419             } else {
3420                 // Unexpected pattern compilation error.
3421                 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3422                 goto cleanupAndReturn;
3423             }
3424         }
3425     }
3426 
3427     if (UTF8Pattern == NULL) {
3428         // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3429         logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3430         status = U_ZERO_ERROR;
3431     }
3432 
3433     if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
3434         callerPattern->dumpPattern();
3435     }
3436 
3437     if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
3438         errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3439         goto cleanupAndReturn;
3440     }
3441 
3442 
3443     //
3444     // Number of times find() should be called on the test string, default to 1
3445     //
3446     numFinds = 1;
3447     for (i=2; i<=9; i++) {
3448         if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
3449             if (numFinds != 1) {
3450                 errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
3451                 goto cleanupAndReturn;
3452             }
3453             numFinds = i;
3454         }
3455     }
3456 
3457     // 'M' flag.  Use matches() instead of find()
3458     if (flags.indexOf((UChar)0x4d) >= 0) {
3459         useMatchesFunc = TRUE;
3460     }
3461     if (flags.indexOf((UChar)0x4c) >= 0) {
3462         useLookingAtFunc = TRUE;
3463     }
3464 
3465     //
3466     //  Find the tags in the input data, remove them, and record the group boundary
3467     //    positions.
3468     //
3469     parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3470     REGEX_CHECK_STATUS_L(line);
3471 
3472     unEscapedInput = inputString.unescape();
3473     parseMatcher = parsePat->matcher(unEscapedInput, status);
3474     REGEX_CHECK_STATUS_L(line);
3475     while(parseMatcher->find()) {
3476         parseMatcher->appendReplacement(deTaggedInput, "", status);
3477         REGEX_CHECK_STATUS;
3478         UnicodeString groupNum = parseMatcher->group(2, status);
3479         if (groupNum == "r") {
3480             // <r> or </r>, a region specification within the string
3481             if (parseMatcher->group(1, status) == "/") {
3482                 regionEnd = deTaggedInput.length();
3483             } else {
3484                 regionStart = deTaggedInput.length();
3485             }
3486         } else {
3487             // <digits> or </digits>, a group match boundary tag.
3488             if (parseMatcher->group(1, status) == "/") {
3489                 set(groupEnds, deTaggedInput.length(), groupNum);
3490             } else {
3491                 set(groupStarts, deTaggedInput.length(), groupNum);
3492             }
3493         }
3494     }
3495     parseMatcher->appendTail(deTaggedInput);
3496     REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3497     if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3498       errln("mismatched <r> tags");
3499       failed = TRUE;
3500       goto cleanupAndReturn;
3501     }
3502 
3503     //
3504     //  Configure the matcher according to the flags specified with this test.
3505     //
3506     matcher = callerPattern->matcher(deTaggedInput, status);
3507     REGEX_CHECK_STATUS_L(line);
3508     if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3509         matcher->setTrace(TRUE);
3510     }
3511 
3512     if (UTF8Pattern != NULL) {
3513         inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3514         status = U_ZERO_ERROR; // buffer overflow
3515         inputChars = new char[inputUTF8Length+1];
3516         deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3517         utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3518 
3519         if (status == U_ZERO_ERROR) {
3520             UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3521             REGEX_CHECK_STATUS_L(line);
3522         }
3523 
3524         if (UTF8Matcher == NULL) {
3525             // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3526             logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3527             status = U_ZERO_ERROR;
3528         }
3529     }
3530 
3531     //
3532     //  Generate native indices for UTF8 versions of region and capture group info
3533     //
3534     if (UTF8Matcher != NULL) {
3535         if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3536             UTF8Matcher->setTrace(TRUE);
3537         }
3538         if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3539         if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3540 
3541         //  Fill out the native index UVector info.
3542         //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3543         for (i=0; i<groupStarts.size(); i++) {
3544             int32_t  start = groupStarts.elementAti(i);
3545             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3546             if (start >= 0) {
3547                 int32_t  startUTF8;
3548                 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3549                     errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
3550                     failed = TRUE;
3551                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3552                 }
3553                 setInt(groupStartsUTF8, startUTF8, i);
3554             }
3555 
3556             int32_t  end = groupEnds.elementAti(i);
3557             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3558             if (end >= 0) {
3559                 int32_t  endUTF8;
3560                 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3561                     errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
3562                     failed = TRUE;
3563                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3564                 }
3565                 setInt(groupEndsUTF8, endUTF8, i);
3566             }
3567         }
3568     }
3569 
3570     if (regionStart>=0) {
3571        matcher->region(regionStart, regionEnd, status);
3572        REGEX_CHECK_STATUS_L(line);
3573        if (UTF8Matcher != NULL) {
3574            UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3575            REGEX_CHECK_STATUS_L(line);
3576        }
3577     }
3578     if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
3579         matcher->useAnchoringBounds(FALSE);
3580         if (UTF8Matcher != NULL) {
3581             UTF8Matcher->useAnchoringBounds(FALSE);
3582         }
3583     }
3584     if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
3585         matcher->useTransparentBounds(TRUE);
3586         if (UTF8Matcher != NULL) {
3587             UTF8Matcher->useTransparentBounds(TRUE);
3588         }
3589     }
3590 
3591 
3592 
3593     //
3594     // Do a find on the de-tagged input using the caller's pattern
3595     //     TODO: error on count>1 and not find().
3596     //           error on both matches() and lookingAt().
3597     //
3598     for (i=0; i<numFinds; i++) {
3599         if (useMatchesFunc) {
3600             isMatch = matcher->matches(status);
3601             if (UTF8Matcher != NULL) {
3602                isUTF8Match = UTF8Matcher->matches(status);
3603             }
3604         } else  if (useLookingAtFunc) {
3605             isMatch = matcher->lookingAt(status);
3606             if (UTF8Matcher != NULL) {
3607                 isUTF8Match = UTF8Matcher->lookingAt(status);
3608             }
3609         } else {
3610             isMatch = matcher->find();
3611             if (UTF8Matcher != NULL) {
3612                 isUTF8Match = UTF8Matcher->find();
3613             }
3614         }
3615     }
3616     matcher->setTrace(FALSE);
3617     if (UTF8Matcher) {
3618         UTF8Matcher->setTrace(FALSE);
3619     }
3620     if (U_FAILURE(status)) {
3621         errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3622     }
3623 
3624     //
3625     // Match up the groups from the find() with the groups from the tags
3626     //
3627 
3628     // number of tags should match number of groups from find operation.
3629     // matcher->groupCount does not include group 0, the entire match, hence the +1.
3630     //   G option in test means that capture group data is not available in the
3631     //     expected results, so the check needs to be suppressed.
3632     if (isMatch == FALSE && groupStarts.size() != 0) {
3633         dataerrln("Error at line %d:  Match expected, but none found.", line);
3634         failed = TRUE;
3635         goto cleanupAndReturn;
3636     } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3637         errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
3638         failed = TRUE;
3639         goto cleanupAndReturn;
3640     }
3641     if (isMatch && groupStarts.size() == 0) {
3642         errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status));
3643         failed = TRUE;
3644     }
3645     if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) {
3646         errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status));
3647         failed = TRUE;
3648     }
3649 
3650     if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3651         // Only check for match / no match.  Don't check capture groups.
3652         goto cleanupAndReturn;
3653     }
3654 
3655     REGEX_CHECK_STATUS_L(line);
3656     for (i=0; i<=matcher->groupCount(); i++) {
3657         int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3658         int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3659         if (matcher->start(i, status) != expectedStart) {
3660             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
3661                 line, i, expectedStart, matcher->start(i, status));
3662             failed = TRUE;
3663             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3664         } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3665             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
3666                   line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3667             failed = TRUE;
3668             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3669         }
3670 
3671         int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3672         int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3673         if (matcher->end(i, status) != expectedEnd) {
3674             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
3675                 line, i, expectedEnd, matcher->end(i, status));
3676             failed = TRUE;
3677             // Error on end position;  keep going; real error is probably yet to come as group
3678             //   end positions work from end of the input data towards the front.
3679         } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3680             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
3681                   line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3682             failed = TRUE;
3683             // Error on end position;  keep going; real error is probably yet to come as group
3684             //   end positions work from end of the input data towards the front.
3685         }
3686     }
3687     if ( matcher->groupCount()+1 < groupStarts.size()) {
3688         errln("Error at line %d: Expected %d capture groups, found %d.",
3689             line, groupStarts.size()-1, matcher->groupCount());
3690         failed = TRUE;
3691         }
3692     else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3693         errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3694               line, groupStarts.size()-1, UTF8Matcher->groupCount());
3695         failed = TRUE;
3696     }
3697 
3698     if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3699         matcher->requireEnd() == TRUE) {
3700         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
3701         failed = TRUE;
3702     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3703         UTF8Matcher->requireEnd() == TRUE) {
3704         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3705         failed = TRUE;
3706     }
3707 
3708     if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
3709         matcher->requireEnd() == FALSE) {
3710         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
3711         failed = TRUE;
3712     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3713         UTF8Matcher->requireEnd() == FALSE) {
3714         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3715         failed = TRUE;
3716     }
3717 
3718     if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3719         matcher->hitEnd() == TRUE) {
3720         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
3721         failed = TRUE;
3722     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3723                UTF8Matcher->hitEnd() == TRUE) {
3724         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3725         failed = TRUE;
3726     }
3727 
3728     if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3729         matcher->hitEnd() == FALSE) {
3730         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
3731         failed = TRUE;
3732     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3733                UTF8Matcher->hitEnd() == FALSE) {
3734         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3735         failed = TRUE;
3736     }
3737 
3738 
3739 cleanupAndReturn:
3740     if (failed) {
3741         infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
3742             +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
3743         // callerPattern->dump();
3744     }
3745     delete parseMatcher;
3746     delete parsePat;
3747     delete UTF8Matcher;
3748     delete UTF8Pattern;
3749     delete matcher;
3750     delete callerPattern;
3751 
3752     utext_close(&inputText);
3753     delete[] inputChars;
3754     utext_close(&patternText);
3755     delete[] patternChars;
3756     ucnv_close(UTF8Converter);
3757 }
3758 
3759 
3760 
3761 
3762 //---------------------------------------------------------------------------
3763 //
3764 //      Errors     Check for error handling in patterns.
3765 //
3766 //---------------------------------------------------------------------------
Errors()3767 void RegexTest::Errors() {
3768     // \escape sequences that aren't implemented yet.
3769     //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3770 
3771     // Missing close parentheses
3772     REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3773     REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3774     REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3775 
3776     // Extra close paren
3777     REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3778     REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3779     REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3780 
3781     // Look-ahead, Look-behind
3782     //  TODO:  add tests for unbounded length look-behinds.
3783     REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
3784 
3785     // Attempt to use non-default flags
3786     {
3787         UParseError   pe;
3788         UErrorCode    status = U_ZERO_ERROR;
3789         int32_t       flags  = UREGEX_CANON_EQ |
3790                                UREGEX_COMMENTS         | UREGEX_DOTALL   |
3791                                UREGEX_MULTILINE;
3792         RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3793         REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3794         delete pat1;
3795     }
3796 
3797 
3798     // Quantifiers are allowed only after something that can be quantified.
3799     REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3800     REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3801     REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3802 
3803     // Mal-formed {min,max} quantifiers
3804     REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3805     REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3806     REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3807     REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3808     REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3809     REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3810     REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
3811     REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
3812     REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3813 
3814     // Ticket 5389
3815     REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3816 
3817     // Invalid Back Reference \0
3818     //    For ICU 3.8 and earlier
3819     //    For ICU versions newer than 3.8, \0 introduces an octal escape.
3820     //
3821     REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3822 
3823 }
3824 
3825 
3826 //-------------------------------------------------------------------------------
3827 //
3828 //  Read a text data file, convert it to UChars, and return the data
3829 //    in one big UChar * buffer, which the caller must release with delete [].
3830 //
3831 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int32_t & ulen,const char * defEncoding,UErrorCode & status)3832 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3833                                      const char *defEncoding, UErrorCode &status) {
3834     UChar       *retPtr  = NULL;
3835     char        *fileBuf = NULL;
3836     UConverter* conv     = NULL;
3837     FILE        *f       = NULL;
3838 
3839     ulen = 0;
3840     if (U_FAILURE(status)) {
3841         return retPtr;
3842     }
3843 
3844     //
3845     //  Open the file.
3846     //
3847     f = fopen(fileName, "rb");
3848     if (f == 0) {
3849         dataerrln("Error opening test data file %s\n", fileName);
3850         status = U_FILE_ACCESS_ERROR;
3851         return NULL;
3852     }
3853     //
3854     //  Read it in
3855     //
3856     int32_t            fileSize;
3857     int32_t            amt_read;
3858 
3859     fseek( f, 0, SEEK_END);
3860     fileSize = ftell(f);
3861     fileBuf = new char[fileSize];
3862     fseek(f, 0, SEEK_SET);
3863     amt_read = fread(fileBuf, 1, fileSize, f);
3864     if (amt_read != fileSize || fileSize <= 0) {
3865         errln("Error reading test data file.");
3866         goto cleanUpAndReturn;
3867     }
3868 
3869     //
3870     // Look for a Unicode Signature (BOM) on the data just read
3871     //
3872     int32_t        signatureLength;
3873     const char *   fileBufC;
3874     const char*    encoding;
3875 
3876     fileBufC = fileBuf;
3877     encoding = ucnv_detectUnicodeSignature(
3878         fileBuf, fileSize, &signatureLength, &status);
3879     if(encoding!=NULL ){
3880         fileBufC  += signatureLength;
3881         fileSize  -= signatureLength;
3882     } else {
3883         encoding = defEncoding;
3884         if (strcmp(encoding, "utf-8") == 0) {
3885             errln("file %s is missing its BOM", fileName);
3886         }
3887     }
3888 
3889     //
3890     // Open a converter to take the rule file to UTF-16
3891     //
3892     conv = ucnv_open(encoding, &status);
3893     if (U_FAILURE(status)) {
3894         goto cleanUpAndReturn;
3895     }
3896 
3897     //
3898     // Convert the rules to UChar.
3899     //  Preflight first to determine required buffer size.
3900     //
3901     ulen = ucnv_toUChars(conv,
3902         NULL,           //  dest,
3903         0,              //  destCapacity,
3904         fileBufC,
3905         fileSize,
3906         &status);
3907     if (status == U_BUFFER_OVERFLOW_ERROR) {
3908         // Buffer Overflow is expected from the preflight operation.
3909         status = U_ZERO_ERROR;
3910 
3911         retPtr = new UChar[ulen+1];
3912         ucnv_toUChars(conv,
3913             retPtr,       //  dest,
3914             ulen+1,
3915             fileBufC,
3916             fileSize,
3917             &status);
3918     }
3919 
3920 cleanUpAndReturn:
3921     fclose(f);
3922     delete[] fileBuf;
3923     ucnv_close(conv);
3924     if (U_FAILURE(status)) {
3925         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3926         delete []retPtr;
3927         retPtr = 0;
3928         ulen   = 0;
3929     };
3930     return retPtr;
3931 }
3932 
3933 
3934 //-------------------------------------------------------------------------------
3935 //
3936 //   PerlTests  - Run Perl's regular expression tests
3937 //                The input file for this test is re_tests, the standard regular
3938 //                expression test data distributed with the Perl source code.
3939 //
3940 //                Here is Perl's description of the test data file:
3941 //
3942 //        # The tests are in a separate file 't/op/re_tests'.
3943 //        # Each line in that file is a separate test.
3944 //        # There are five columns, separated by tabs.
3945 //        #
3946 //        # Column 1 contains the pattern, optionally enclosed in C<''>.
3947 //        # Modifiers can be put after the closing C<'>.
3948 //        #
3949 //        # Column 2 contains the string to be matched.
3950 //        #
3951 //        # Column 3 contains the expected result:
3952 //        #     y   expect a match
3953 //        #     n   expect no match
3954 //        #     c   expect an error
3955 //        # B   test exposes a known bug in Perl, should be skipped
3956 //        # b   test exposes a known bug in Perl, should be skipped if noamp
3957 //        #
3958 //        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3959 //        #
3960 //        # Column 4 contains a string, usually C<$&>.
3961 //        #
3962 //        # Column 5 contains the expected result of double-quote
3963 //        # interpolating that string after the match, or start of error message.
3964 //        #
3965 //        # Column 6, if present, contains a reason why the test is skipped.
3966 //        # This is printed with "skipped", for harness to pick up.
3967 //        #
3968 //        # \n in the tests are interpolated, as are variables of the form ${\w+}.
3969 //        #
3970 //        # If you want to add a regular expression test that can't be expressed
3971 //        # in this format, don't add it here: put it in op/pat.t instead.
3972 //
3973 //        For ICU, if field 3 contains an 'i', the test will be skipped.
3974 //        The test exposes some known incompatibility between ICU and Perl regexps.
3975 //        (The i is in addition to whatever was there before.)
3976 //
3977 //-------------------------------------------------------------------------------
PerlTests()3978 void RegexTest::PerlTests() {
3979     char tdd[2048];
3980     const char *srcPath;
3981     UErrorCode  status = U_ZERO_ERROR;
3982     UParseError pe;
3983 
3984     //
3985     //  Open and read the test data file.
3986     //
3987     srcPath=getPath(tdd, "re_tests.txt");
3988     if(srcPath==NULL) {
3989         return; /* something went wrong, error already output */
3990     }
3991 
3992     int32_t    len;
3993     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3994     if (U_FAILURE(status)) {
3995         return; /* something went wrong, error already output */
3996     }
3997 
3998     //
3999     //  Put the test data into a UnicodeString
4000     //
4001     UnicodeString testDataString(FALSE, testData, len);
4002 
4003     //
4004     //  Regex to break the input file into lines, and strip the new lines.
4005     //     One line per match, capture group one is the desired data.
4006     //
4007     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4008     if (U_FAILURE(status)) {
4009         dataerrln("RegexPattern::compile() error");
4010         return;
4011     }
4012     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4013 
4014     //
4015     //  Regex to split a test file line into fields.
4016     //    There are six fields, separated by tabs.
4017     //
4018     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4019 
4020     //
4021     //  Regex to identify test patterns with flag settings, and to separate them.
4022     //    Test patterns with flags look like 'pattern'i
4023     //    Test patterns without flags are not quoted:   pattern
4024     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4025     //
4026     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4027     RegexMatcher* flagMat = flagPat->matcher(status);
4028 
4029     //
4030     // The Perl tests reference several perl-isms, which are evaluated/substituted
4031     //   in the test data.  Not being perl, this must be done explicitly.  Here
4032     //   are string constants and REs for these constructs.
4033     //
4034     UnicodeString nulnulSrc("${nulnul}");
4035     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4036     nulnul = nulnul.unescape();
4037 
4038     UnicodeString ffffSrc("${ffff}");
4039     UnicodeString ffff("\\uffff", -1, US_INV);
4040     ffff = ffff.unescape();
4041 
4042     //  regexp for $-[0], $+[2], etc.
4043     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4044     RegexMatcher *groupsMat = groupsPat->matcher(status);
4045 
4046     //  regexp for $0, $1, $2, etc.
4047     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4048     RegexMatcher *cgMat = cgPat->matcher(status);
4049 
4050 
4051     //
4052     // Main Loop for the Perl Tests, runs once per line from the
4053     //   test data file.
4054     //
4055     int32_t  lineNum = 0;
4056     int32_t  skippedUnimplementedCount = 0;
4057     while (lineMat->find()) {
4058         lineNum++;
4059 
4060         //
4061         //  Get a line, break it into its fields, do the Perl
4062         //    variable substitutions.
4063         //
4064         UnicodeString line = lineMat->group(1, status);
4065         UnicodeString fields[7];
4066         fieldPat->split(line, fields, 7, status);
4067 
4068         flagMat->reset(fields[0]);
4069         flagMat->matches(status);
4070         UnicodeString pattern  = flagMat->group(2, status);
4071         pattern.findAndReplace("${bang}", "!");
4072         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4073         pattern.findAndReplace(ffffSrc, ffff);
4074 
4075         //
4076         //  Identify patterns that include match flag settings,
4077         //    split off the flags, remove the extra quotes.
4078         //
4079         UnicodeString flagStr = flagMat->group(3, status);
4080         if (U_FAILURE(status)) {
4081             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4082             return;
4083         }
4084         int32_t flags = 0;
4085         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4086         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4087         const UChar UChar_m = 0x6d;
4088         const UChar UChar_x = 0x78;
4089         const UChar UChar_y = 0x79;
4090         if (flagStr.indexOf(UChar_i) != -1) {
4091             flags |= UREGEX_CASE_INSENSITIVE;
4092         }
4093         if (flagStr.indexOf(UChar_m) != -1) {
4094             flags |= UREGEX_MULTILINE;
4095         }
4096         if (flagStr.indexOf(UChar_x) != -1) {
4097             flags |= UREGEX_COMMENTS;
4098         }
4099 
4100         //
4101         // Compile the test pattern.
4102         //
4103         status = U_ZERO_ERROR;
4104         RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4105         if (status == U_REGEX_UNIMPLEMENTED) {
4106             //
4107             // Test of a feature that is planned for ICU, but not yet implemented.
4108             //   skip the test.
4109             skippedUnimplementedCount++;
4110             delete testPat;
4111             status = U_ZERO_ERROR;
4112             continue;
4113         }
4114 
4115         if (U_FAILURE(status)) {
4116             // Some tests are supposed to generate errors.
4117             //   Only report an error for tests that are supposed to succeed.
4118             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4119                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4120             {
4121                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4122             }
4123             status = U_ZERO_ERROR;
4124             delete testPat;
4125             continue;
4126         }
4127 
4128         if (fields[2].indexOf(UChar_i) >= 0) {
4129             // ICU should skip this test.
4130             delete testPat;
4131             continue;
4132         }
4133 
4134         if (fields[2].indexOf(UChar_c) >= 0) {
4135             // This pattern should have caused a compilation error, but didn't/
4136             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4137             delete testPat;
4138             continue;
4139         }
4140 
4141         //
4142         // replace the Perl variables that appear in some of the
4143         //   match data strings.
4144         //
4145         UnicodeString matchString = fields[1];
4146         matchString.findAndReplace(nulnulSrc, nulnul);
4147         matchString.findAndReplace(ffffSrc,   ffff);
4148 
4149         // Replace any \n in the match string with an actual new-line char.
4150         //  Don't do full unescape, as this unescapes more than Perl does, which
4151         //  causes other spurious failures in the tests.
4152         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4153 
4154 
4155 
4156         //
4157         // Run the test, check for expected match/don't match result.
4158         //
4159         RegexMatcher *testMat = testPat->matcher(matchString, status);
4160         UBool found = testMat->find();
4161         UBool expected = FALSE;
4162         if (fields[2].indexOf(UChar_y) >=0) {
4163             expected = TRUE;
4164         }
4165         if (expected != found) {
4166             errln("line %d: Expected %smatch, got %smatch",
4167                 lineNum, expected?"":"no ", found?"":"no " );
4168             continue;
4169         }
4170 
4171         // Don't try to check expected results if there is no match.
4172         //   (Some have stuff in the expected fields)
4173         if (!found) {
4174             delete testMat;
4175             delete testPat;
4176             continue;
4177         }
4178 
4179         //
4180         // Interpret the Perl expression from the fourth field of the data file,
4181         // building up an ICU string from the results of the ICU match.
4182         //   The Perl expression will contain references to the results of
4183         //     a regex match, including the matched string, capture group strings,
4184         //     group starting and ending indicies, etc.
4185         //
4186         UnicodeString resultString;
4187         UnicodeString perlExpr = fields[3];
4188 #if SUPPORT_MUTATING_INPUT_STRING
4189         groupsMat->reset(perlExpr);
4190         cgMat->reset(perlExpr);
4191 #endif
4192 
4193         while (perlExpr.length() > 0) {
4194 #if !SUPPORT_MUTATING_INPUT_STRING
4195             //  Perferred usage.  Reset after any modification to input string.
4196             groupsMat->reset(perlExpr);
4197             cgMat->reset(perlExpr);
4198 #endif
4199 
4200             if (perlExpr.startsWith("$&")) {
4201                 resultString.append(testMat->group(status));
4202                 perlExpr.remove(0, 2);
4203             }
4204 
4205             else if (groupsMat->lookingAt(status)) {
4206                 // $-[0]   $+[2]  etc.
4207                 UnicodeString digitString = groupsMat->group(2, status);
4208                 int32_t t = 0;
4209                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4210                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4211                 int32_t matchPosition;
4212                 if (plusOrMinus.compare("+") == 0) {
4213                     matchPosition = testMat->end(groupNum, status);
4214                 } else {
4215                     matchPosition = testMat->start(groupNum, status);
4216                 }
4217                 if (matchPosition != -1) {
4218                     ICU_Utility::appendNumber(resultString, matchPosition);
4219                 }
4220                 perlExpr.remove(0, groupsMat->end(status));
4221             }
4222 
4223             else if (cgMat->lookingAt(status)) {
4224                 // $1, $2, $3, etc.
4225                 UnicodeString digitString = cgMat->group(1, status);
4226                 int32_t t = 0;
4227                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4228                 if (U_SUCCESS(status)) {
4229                     resultString.append(testMat->group(groupNum, status));
4230                     status = U_ZERO_ERROR;
4231                 }
4232                 perlExpr.remove(0, cgMat->end(status));
4233             }
4234 
4235             else if (perlExpr.startsWith("@-")) {
4236                 int32_t i;
4237                 for (i=0; i<=testMat->groupCount(); i++) {
4238                     if (i>0) {
4239                         resultString.append(" ");
4240                     }
4241                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4242                 }
4243                 perlExpr.remove(0, 2);
4244             }
4245 
4246             else if (perlExpr.startsWith("@+")) {
4247                 int32_t i;
4248                 for (i=0; i<=testMat->groupCount(); i++) {
4249                     if (i>0) {
4250                         resultString.append(" ");
4251                     }
4252                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4253                 }
4254                 perlExpr.remove(0, 2);
4255             }
4256 
4257             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4258                                                      //           or as an escaped sequence (e.g. \n)
4259                 if (perlExpr.length() > 1) {
4260                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4261                 }
4262                 UChar c = perlExpr.charAt(0);
4263                 switch (c) {
4264                 case 'n':   c = '\n'; break;
4265                 // add any other escape sequences that show up in the test expected results.
4266                 }
4267                 resultString.append(c);
4268                 perlExpr.remove(0, 1);
4269             }
4270 
4271             else  {
4272                 // Any characters from the perl expression that we don't explicitly
4273                 //  recognize before here are assumed to be literals and copied
4274                 //  as-is to the expected results.
4275                 resultString.append(perlExpr.charAt(0));
4276                 perlExpr.remove(0, 1);
4277             }
4278 
4279             if (U_FAILURE(status)) {
4280                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4281                 break;
4282             }
4283         }
4284 
4285         //
4286         // Expected Results Compare
4287         //
4288         UnicodeString expectedS(fields[4]);
4289         expectedS.findAndReplace(nulnulSrc, nulnul);
4290         expectedS.findAndReplace(ffffSrc,   ffff);
4291         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4292 
4293 
4294         if (expectedS.compare(resultString) != 0) {
4295             err("Line %d: Incorrect perl expression results.", lineNum);
4296             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4297         }
4298 
4299         delete testMat;
4300         delete testPat;
4301     }
4302 
4303     //
4304     // All done.  Clean up allocated stuff.
4305     //
4306     delete cgMat;
4307     delete cgPat;
4308 
4309     delete groupsMat;
4310     delete groupsPat;
4311 
4312     delete flagMat;
4313     delete flagPat;
4314 
4315     delete lineMat;
4316     delete linePat;
4317 
4318     delete fieldPat;
4319     delete [] testData;
4320 
4321 
4322     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4323 
4324 }
4325 
4326 
4327 //-------------------------------------------------------------------------------
4328 //
4329 //   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
4330 //                  (instead of using UnicodeStrings) to test the alternate engine.
4331 //                  The input file for this test is re_tests, the standard regular
4332 //                  expression test data distributed with the Perl source code.
4333 //                  See PerlTests() for more information.
4334 //
4335 //-------------------------------------------------------------------------------
PerlTestsUTF8()4336 void RegexTest::PerlTestsUTF8() {
4337     char tdd[2048];
4338     const char *srcPath;
4339     UErrorCode  status = U_ZERO_ERROR;
4340     UParseError pe;
4341     LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4342     UText       patternText = UTEXT_INITIALIZER;
4343     char       *patternChars = NULL;
4344     int32_t     patternLength;
4345     int32_t     patternCapacity = 0;
4346     UText       inputText = UTEXT_INITIALIZER;
4347     char       *inputChars = NULL;
4348     int32_t     inputLength;
4349     int32_t     inputCapacity = 0;
4350 
4351     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4352 
4353     //
4354     //  Open and read the test data file.
4355     //
4356     srcPath=getPath(tdd, "re_tests.txt");
4357     if(srcPath==NULL) {
4358         return; /* something went wrong, error already output */
4359     }
4360 
4361     int32_t    len;
4362     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4363     if (U_FAILURE(status)) {
4364         return; /* something went wrong, error already output */
4365     }
4366 
4367     //
4368     //  Put the test data into a UnicodeString
4369     //
4370     UnicodeString testDataString(FALSE, testData, len);
4371 
4372     //
4373     //  Regex to break the input file into lines, and strip the new lines.
4374     //     One line per match, capture group one is the desired data.
4375     //
4376     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4377     if (U_FAILURE(status)) {
4378         dataerrln("RegexPattern::compile() error");
4379         return;
4380     }
4381     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4382 
4383     //
4384     //  Regex to split a test file line into fields.
4385     //    There are six fields, separated by tabs.
4386     //
4387     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4388 
4389     //
4390     //  Regex to identify test patterns with flag settings, and to separate them.
4391     //    Test patterns with flags look like 'pattern'i
4392     //    Test patterns without flags are not quoted:   pattern
4393     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4394     //
4395     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4396     RegexMatcher* flagMat = flagPat->matcher(status);
4397 
4398     //
4399     // The Perl tests reference several perl-isms, which are evaluated/substituted
4400     //   in the test data.  Not being perl, this must be done explicitly.  Here
4401     //   are string constants and REs for these constructs.
4402     //
4403     UnicodeString nulnulSrc("${nulnul}");
4404     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4405     nulnul = nulnul.unescape();
4406 
4407     UnicodeString ffffSrc("${ffff}");
4408     UnicodeString ffff("\\uffff", -1, US_INV);
4409     ffff = ffff.unescape();
4410 
4411     //  regexp for $-[0], $+[2], etc.
4412     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4413     RegexMatcher *groupsMat = groupsPat->matcher(status);
4414 
4415     //  regexp for $0, $1, $2, etc.
4416     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4417     RegexMatcher *cgMat = cgPat->matcher(status);
4418 
4419 
4420     //
4421     // Main Loop for the Perl Tests, runs once per line from the
4422     //   test data file.
4423     //
4424     int32_t  lineNum = 0;
4425     int32_t  skippedUnimplementedCount = 0;
4426     while (lineMat->find()) {
4427         lineNum++;
4428 
4429         //
4430         //  Get a line, break it into its fields, do the Perl
4431         //    variable substitutions.
4432         //
4433         UnicodeString line = lineMat->group(1, status);
4434         UnicodeString fields[7];
4435         fieldPat->split(line, fields, 7, status);
4436 
4437         flagMat->reset(fields[0]);
4438         flagMat->matches(status);
4439         UnicodeString pattern  = flagMat->group(2, status);
4440         pattern.findAndReplace("${bang}", "!");
4441         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4442         pattern.findAndReplace(ffffSrc, ffff);
4443 
4444         //
4445         //  Identify patterns that include match flag settings,
4446         //    split off the flags, remove the extra quotes.
4447         //
4448         UnicodeString flagStr = flagMat->group(3, status);
4449         if (U_FAILURE(status)) {
4450             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4451             return;
4452         }
4453         int32_t flags = 0;
4454         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4455         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4456         const UChar UChar_m = 0x6d;
4457         const UChar UChar_x = 0x78;
4458         const UChar UChar_y = 0x79;
4459         if (flagStr.indexOf(UChar_i) != -1) {
4460             flags |= UREGEX_CASE_INSENSITIVE;
4461         }
4462         if (flagStr.indexOf(UChar_m) != -1) {
4463             flags |= UREGEX_MULTILINE;
4464         }
4465         if (flagStr.indexOf(UChar_x) != -1) {
4466             flags |= UREGEX_COMMENTS;
4467         }
4468 
4469         //
4470         // Put the pattern in a UTF-8 UText
4471         //
4472         status = U_ZERO_ERROR;
4473         patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4474         if (status == U_BUFFER_OVERFLOW_ERROR) {
4475             status = U_ZERO_ERROR;
4476             delete[] patternChars;
4477             patternCapacity = patternLength + 1;
4478             patternChars = new char[patternCapacity];
4479             pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4480         }
4481         utext_openUTF8(&patternText, patternChars, patternLength, &status);
4482 
4483         //
4484         // Compile the test pattern.
4485         //
4486         RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4487         if (status == U_REGEX_UNIMPLEMENTED) {
4488             //
4489             // Test of a feature that is planned for ICU, but not yet implemented.
4490             //   skip the test.
4491             skippedUnimplementedCount++;
4492             delete testPat;
4493             status = U_ZERO_ERROR;
4494             continue;
4495         }
4496 
4497         if (U_FAILURE(status)) {
4498             // Some tests are supposed to generate errors.
4499             //   Only report an error for tests that are supposed to succeed.
4500             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4501                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4502             {
4503                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4504             }
4505             status = U_ZERO_ERROR;
4506             delete testPat;
4507             continue;
4508         }
4509 
4510         if (fields[2].indexOf(UChar_i) >= 0) {
4511             // ICU should skip this test.
4512             delete testPat;
4513             continue;
4514         }
4515 
4516         if (fields[2].indexOf(UChar_c) >= 0) {
4517             // This pattern should have caused a compilation error, but didn't/
4518             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4519             delete testPat;
4520             continue;
4521         }
4522 
4523 
4524         //
4525         // replace the Perl variables that appear in some of the
4526         //   match data strings.
4527         //
4528         UnicodeString matchString = fields[1];
4529         matchString.findAndReplace(nulnulSrc, nulnul);
4530         matchString.findAndReplace(ffffSrc,   ffff);
4531 
4532         // Replace any \n in the match string with an actual new-line char.
4533         //  Don't do full unescape, as this unescapes more than Perl does, which
4534         //  causes other spurious failures in the tests.
4535         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4536 
4537         //
4538         // Put the input in a UTF-8 UText
4539         //
4540         status = U_ZERO_ERROR;
4541         inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4542         if (status == U_BUFFER_OVERFLOW_ERROR) {
4543             status = U_ZERO_ERROR;
4544             delete[] inputChars;
4545             inputCapacity = inputLength + 1;
4546             inputChars = new char[inputCapacity];
4547             matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4548         }
4549         utext_openUTF8(&inputText, inputChars, inputLength, &status);
4550 
4551         //
4552         // Run the test, check for expected match/don't match result.
4553         //
4554         RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4555         UBool found = testMat->find();
4556         UBool expected = FALSE;
4557         if (fields[2].indexOf(UChar_y) >=0) {
4558             expected = TRUE;
4559         }
4560         if (expected != found) {
4561             errln("line %d: Expected %smatch, got %smatch",
4562                 lineNum, expected?"":"no ", found?"":"no " );
4563             continue;
4564         }
4565 
4566         // Don't try to check expected results if there is no match.
4567         //   (Some have stuff in the expected fields)
4568         if (!found) {
4569             delete testMat;
4570             delete testPat;
4571             continue;
4572         }
4573 
4574         //
4575         // Interpret the Perl expression from the fourth field of the data file,
4576         // building up an ICU string from the results of the ICU match.
4577         //   The Perl expression will contain references to the results of
4578         //     a regex match, including the matched string, capture group strings,
4579         //     group starting and ending indicies, etc.
4580         //
4581         UnicodeString resultString;
4582         UnicodeString perlExpr = fields[3];
4583 
4584         while (perlExpr.length() > 0) {
4585             groupsMat->reset(perlExpr);
4586             cgMat->reset(perlExpr);
4587 
4588             if (perlExpr.startsWith("$&")) {
4589                 resultString.append(testMat->group(status));
4590                 perlExpr.remove(0, 2);
4591             }
4592 
4593             else if (groupsMat->lookingAt(status)) {
4594                 // $-[0]   $+[2]  etc.
4595                 UnicodeString digitString = groupsMat->group(2, status);
4596                 int32_t t = 0;
4597                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4598                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4599                 int32_t matchPosition;
4600                 if (plusOrMinus.compare("+") == 0) {
4601                     matchPosition = testMat->end(groupNum, status);
4602                 } else {
4603                     matchPosition = testMat->start(groupNum, status);
4604                 }
4605                 if (matchPosition != -1) {
4606                     ICU_Utility::appendNumber(resultString, matchPosition);
4607                 }
4608                 perlExpr.remove(0, groupsMat->end(status));
4609             }
4610 
4611             else if (cgMat->lookingAt(status)) {
4612                 // $1, $2, $3, etc.
4613                 UnicodeString digitString = cgMat->group(1, status);
4614                 int32_t t = 0;
4615                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4616                 if (U_SUCCESS(status)) {
4617                     resultString.append(testMat->group(groupNum, status));
4618                     status = U_ZERO_ERROR;
4619                 }
4620                 perlExpr.remove(0, cgMat->end(status));
4621             }
4622 
4623             else if (perlExpr.startsWith("@-")) {
4624                 int32_t i;
4625                 for (i=0; i<=testMat->groupCount(); i++) {
4626                     if (i>0) {
4627                         resultString.append(" ");
4628                     }
4629                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4630                 }
4631                 perlExpr.remove(0, 2);
4632             }
4633 
4634             else if (perlExpr.startsWith("@+")) {
4635                 int32_t i;
4636                 for (i=0; i<=testMat->groupCount(); i++) {
4637                     if (i>0) {
4638                         resultString.append(" ");
4639                     }
4640                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4641                 }
4642                 perlExpr.remove(0, 2);
4643             }
4644 
4645             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4646                                                      //           or as an escaped sequence (e.g. \n)
4647                 if (perlExpr.length() > 1) {
4648                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4649                 }
4650                 UChar c = perlExpr.charAt(0);
4651                 switch (c) {
4652                 case 'n':   c = '\n'; break;
4653                 // add any other escape sequences that show up in the test expected results.
4654                 }
4655                 resultString.append(c);
4656                 perlExpr.remove(0, 1);
4657             }
4658 
4659             else  {
4660                 // Any characters from the perl expression that we don't explicitly
4661                 //  recognize before here are assumed to be literals and copied
4662                 //  as-is to the expected results.
4663                 resultString.append(perlExpr.charAt(0));
4664                 perlExpr.remove(0, 1);
4665             }
4666 
4667             if (U_FAILURE(status)) {
4668                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4669                 break;
4670             }
4671         }
4672 
4673         //
4674         // Expected Results Compare
4675         //
4676         UnicodeString expectedS(fields[4]);
4677         expectedS.findAndReplace(nulnulSrc, nulnul);
4678         expectedS.findAndReplace(ffffSrc,   ffff);
4679         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4680 
4681 
4682         if (expectedS.compare(resultString) != 0) {
4683             err("Line %d: Incorrect perl expression results.", lineNum);
4684             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4685         }
4686 
4687         delete testMat;
4688         delete testPat;
4689     }
4690 
4691     //
4692     // All done.  Clean up allocated stuff.
4693     //
4694     delete cgMat;
4695     delete cgPat;
4696 
4697     delete groupsMat;
4698     delete groupsPat;
4699 
4700     delete flagMat;
4701     delete flagPat;
4702 
4703     delete lineMat;
4704     delete linePat;
4705 
4706     delete fieldPat;
4707     delete [] testData;
4708 
4709     utext_close(&patternText);
4710     utext_close(&inputText);
4711 
4712     delete [] patternChars;
4713     delete [] inputChars;
4714 
4715 
4716     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4717 
4718 }
4719 
4720 
//--------------------------------------------------------------
//
//  Bug6149   Verify limits to heap expansion for backtrack stack.
//             Use this pattern,
//                 "(a?){1,8000000}"
//             Note: the upper bound was originally unbounded, but unbounded
//                   loops now have loop-breaking enabled.
//                   This test is likely to be fragile, as further optimizations stop
//                   more cases of pointless looping in the match engine.
//
//---------------------------------------------------------------
Bug6149()4731 void RegexTest::Bug6149() {
4732     UnicodeString pattern("(a?){1,8000000}");
4733     UnicodeString s("xyz");
4734     uint32_t flags = 0;
4735     UErrorCode status = U_ZERO_ERROR;
4736 
4737     RegexMatcher  matcher(pattern, s, flags, status);
4738     UBool result = false;
4739     REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4740     REGEX_ASSERT(result == FALSE);
4741  }
4742 
4743 
//
//   Callbacks()    Test the callback function.
//                  When set, callbacks occur periodically during matching operations,
//                  giving the application code the ability to abort the operation
//                  before its normal completion.
//
4750 
4751 struct callBackContext {
4752     RegexTest        *test;
4753     int32_t          maxCalls;
4754     int32_t          numCalls;
4755     int32_t          lastSteps;
resetcallBackContext4756     void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4757 };
4758 
4759 U_CDECL_BEGIN
4760 static UBool U_CALLCONV
testCallBackFn(const void * context,int32_t steps)4761 testCallBackFn(const void *context, int32_t steps) {
4762     callBackContext  *info = (callBackContext *)context;
4763     if (info->lastSteps+1 != steps) {
4764         info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
4765     }
4766     info->lastSteps = steps;
4767     info->numCalls++;
4768     return (info->numCalls < info->maxCalls);
4769 }
4770 U_CDECL_END
4771 
Callbacks()4772 void RegexTest::Callbacks() {
4773    {
4774         // Getter returns NULLs if no callback has been set
4775 
4776         //   The variables that the getter will fill in.
4777         //   Init to non-null values so that the action of the getter can be seen.
4778         const void          *returnedContext = &returnedContext;
4779         URegexMatchCallback *returnedFn = &testCallBackFn;
4780 
4781         UErrorCode status = U_ZERO_ERROR;
4782         RegexMatcher matcher("x", 0, status);
4783         REGEX_CHECK_STATUS;
4784         matcher.getMatchCallback(returnedFn, returnedContext, status);
4785         REGEX_CHECK_STATUS;
4786         REGEX_ASSERT(returnedFn == NULL);
4787         REGEX_ASSERT(returnedContext == NULL);
4788     }
4789 
4790    {
4791         // Set and Get work
4792         callBackContext cbInfo = {this, 0, 0, 0};
4793         const void          *returnedContext;
4794         URegexMatchCallback *returnedFn;
4795         UErrorCode status = U_ZERO_ERROR;
4796         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
4797         REGEX_CHECK_STATUS;
4798         matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4799         REGEX_CHECK_STATUS;
4800         matcher.getMatchCallback(returnedFn, returnedContext, status);
4801         REGEX_CHECK_STATUS;
4802         REGEX_ASSERT(returnedFn == testCallBackFn);
4803         REGEX_ASSERT(returnedContext == &cbInfo);
4804 
4805         // A short-running match shouldn't invoke the callback
4806         status = U_ZERO_ERROR;
4807         cbInfo.reset(1);
4808         UnicodeString s = "xxx";
4809         matcher.reset(s);
4810         REGEX_ASSERT(matcher.matches(status));
4811         REGEX_CHECK_STATUS;
4812         REGEX_ASSERT(cbInfo.numCalls == 0);
4813 
4814         // A medium-length match that runs long enough to invoke the
4815         //   callback, but not so long that the callback aborts it.
4816         status = U_ZERO_ERROR;
4817         cbInfo.reset(4);
4818         s = "aaaaaaaaaaaaaaaaaaab";
4819         matcher.reset(s);
4820         REGEX_ASSERT(matcher.matches(status)==FALSE);
4821         REGEX_CHECK_STATUS;
4822         REGEX_ASSERT(cbInfo.numCalls > 0);
4823 
4824         // A longer running match that the callback function will abort.
4825         status = U_ZERO_ERROR;
4826         cbInfo.reset(4);
4827         s = "aaaaaaaaaaaaaaaaaaaaaaab";
4828         matcher.reset(s);
4829         REGEX_ASSERT(matcher.matches(status)==FALSE);
4830         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4831         REGEX_ASSERT(cbInfo.numCalls == 4);
4832 
4833         // A longer running find that the callback function will abort.
4834         status = U_ZERO_ERROR;
4835         cbInfo.reset(4);
4836         s = "aaaaaaaaaaaaaaaaaaaaaaab";
4837         matcher.reset(s);
4838         REGEX_ASSERT(matcher.find(status)==FALSE);
4839         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4840         REGEX_ASSERT(cbInfo.numCalls == 4);
4841     }
4842 
4843 
4844 }
4845 
4846 
//
//   FindProgressCallbacks()    Test the find "progress" callback function.
//                  When set, the find progress callback will be invoked during a find operation
//                  after each return from a match attempt, giving the application the opportunity
//                  to terminate a long-running find operation before its normal completion.
//
4853 
4854 struct progressCallBackContext {
4855     RegexTest        *test;
4856     int64_t          lastIndex;
4857     int32_t          maxCalls;
4858     int32_t          numCalls;
resetprogressCallBackContext4859     void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4860 };
4861 
4862 // call-back function for find().
4863 // Return TRUE to continue the find().
4864 // Return FALSE to stop the find().
4865 U_CDECL_BEGIN
4866 static UBool U_CALLCONV
testProgressCallBackFn(const void * context,int64_t matchIndex)4867 testProgressCallBackFn(const void *context, int64_t matchIndex) {
4868     progressCallBackContext  *info = (progressCallBackContext *)context;
4869     info->numCalls++;
4870     info->lastIndex = matchIndex;
4871 //    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4872     return (info->numCalls < info->maxCalls);
4873 }
4874 U_CDECL_END
4875 
FindProgressCallbacks()4876 void RegexTest::FindProgressCallbacks() {
4877    {
4878         // Getter returns NULLs if no callback has been set
4879 
4880         //   The variables that the getter will fill in.
4881         //   Init to non-null values so that the action of the getter can be seen.
4882         const void                  *returnedContext = &returnedContext;
4883         URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;
4884 
4885         UErrorCode status = U_ZERO_ERROR;
4886         RegexMatcher matcher("x", 0, status);
4887         REGEX_CHECK_STATUS;
4888         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4889         REGEX_CHECK_STATUS;
4890         REGEX_ASSERT(returnedFn == NULL);
4891         REGEX_ASSERT(returnedContext == NULL);
4892     }
4893 
4894    {
4895         // Set and Get work
4896         progressCallBackContext cbInfo = {this, 0, 0, 0};
4897         const void                  *returnedContext;
4898         URegexFindProgressCallback  *returnedFn;
4899         UErrorCode status = U_ZERO_ERROR;
4900         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
4901         REGEX_CHECK_STATUS;
4902         matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4903         REGEX_CHECK_STATUS;
4904         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4905         REGEX_CHECK_STATUS;
4906         REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4907         REGEX_ASSERT(returnedContext == &cbInfo);
4908 
4909         // A find that matches on the initial position does NOT invoke the callback.
4910         status = U_ZERO_ERROR;
4911         cbInfo.reset(100);
4912         UnicodeString s = "aaxxx";
4913         matcher.reset(s);
4914 #if 0
4915         matcher.setTrace(TRUE);
4916 #endif
4917         REGEX_ASSERT(matcher.find(0, status));
4918         REGEX_CHECK_STATUS;
4919         REGEX_ASSERT(cbInfo.numCalls == 0);
4920 
4921         // A medium running find() that causes matcher.find() to invoke our callback for each index,
4922         //   but not so many times that we interrupt the operation.
4923         status = U_ZERO_ERROR;
4924         s = "aaaaaaaaaaaaaaaaaaab";
4925         cbInfo.reset(s.length()); //  Some upper limit for number of calls that is greater than size of our input string
4926         matcher.reset(s);
4927         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4928         REGEX_CHECK_STATUS;
4929         REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4930 
4931         // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4932         status = U_ZERO_ERROR;
4933         UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4934         cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
4935         matcher.reset(s1);
4936         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4937         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4938         REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4939 
4940         // Now a match that will succeed, but after an interruption
4941         status = U_ZERO_ERROR;
4942         UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4943         cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
4944         matcher.reset(s2);
4945         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4946         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4947         // Now retry the match from where left off
4948         cbInfo.maxCalls = 100; //  No callback limit
4949         status = U_ZERO_ERROR;
4950         REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
4951         REGEX_CHECK_STATUS;
4952     }
4953 
4954 
4955 }
4956 
4957 
4958 //---------------------------------------------------------------------------
4959 //
4960 //    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
4961 //                             UTexts. The pure-C implementation of UText
4962 //                             has no mutable backing stores, but we can
4963 //                             use UnicodeString here to test the functionality.
4964 //
4965 //---------------------------------------------------------------------------
PreAllocatedUTextCAPI()4966 void RegexTest::PreAllocatedUTextCAPI () {
4967     UErrorCode           status = U_ZERO_ERROR;
4968     URegularExpression  *re;
4969     UText                patternText = UTEXT_INITIALIZER;
4970     UnicodeString        buffer;
4971     UText                bufferText = UTEXT_INITIALIZER;
4972 
4973     utext_openUnicodeString(&bufferText, &buffer, &status);
4974 
4975     /*
4976      *  getText() and getUText()
4977      */
4978     {
4979         UText  text1 = UTEXT_INITIALIZER;
4980         UText  text2 = UTEXT_INITIALIZER;
4981         UChar  text2Chars[20];
4982         UText  *resultText;
4983 
4984         status = U_ZERO_ERROR;
4985         regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
4986         regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
4987         u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
4988         utext_openUChars(&text2, text2Chars, -1, &status);
4989 
4990         regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
4991         re = uregex_openUText(&patternText, 0, NULL, &status);
4992 
4993         /* First set a UText */
4994         uregex_setUText(re, &text1, &status);
4995         resultText = uregex_getUText(re, &bufferText, &status);
4996         REGEX_CHECK_STATUS;
4997         REGEX_ASSERT(resultText == &bufferText);
4998         utext_setNativeIndex(resultText, 0);
4999         utext_setNativeIndex(&text1, 0);
5000         REGEX_ASSERT(testUTextEqual(resultText, &text1));
5001 
5002         resultText = uregex_getUText(re, &bufferText, &status);
5003         REGEX_CHECK_STATUS;
5004         REGEX_ASSERT(resultText == &bufferText);
5005         utext_setNativeIndex(resultText, 0);
5006         utext_setNativeIndex(&text1, 0);
5007         REGEX_ASSERT(testUTextEqual(resultText, &text1));
5008 
5009         /* Then set a UChar * */
5010         uregex_setText(re, text2Chars, 7, &status);
5011         resultText = uregex_getUText(re, &bufferText, &status);
5012         REGEX_CHECK_STATUS;
5013         REGEX_ASSERT(resultText == &bufferText);
5014         utext_setNativeIndex(resultText, 0);
5015         utext_setNativeIndex(&text2, 0);
5016         REGEX_ASSERT(testUTextEqual(resultText, &text2));
5017 
5018         uregex_close(re);
5019         utext_close(&text1);
5020         utext_close(&text2);
5021     }
5022 
5023     /*
5024      *  group()
5025      */
5026     {
5027         UChar    text1[80];
5028         UText   *actual;
5029         UBool    result;
5030         int64_t  length = 0;
5031 
5032         u_uastrncpy(text1, "noise abc interior def, and this is off the end",  UPRV_LENGTHOF(text1));
5033         //                  012345678901234567890123456789012345678901234567
5034         //                  0         1         2         3         4
5035 
5036         status = U_ZERO_ERROR;
5037         re = uregex_openC("abc(.*?)def", 0, NULL, &status);
5038         REGEX_CHECK_STATUS;
5039 
5040         uregex_setText(re, text1, -1, &status);
5041         result = uregex_find(re, 0, &status);
5042         REGEX_ASSERT(result==TRUE);
5043 
5044         /*  Capture Group 0, the full match.  Should succeed. "abc interior def" */
5045         status = U_ZERO_ERROR;
5046         actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
5047         REGEX_CHECK_STATUS;
5048         REGEX_ASSERT(actual == &bufferText);
5049         REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
5050         REGEX_ASSERT(length == 16);
5051         REGEX_ASSERT(utext_nativeLength(actual) == 47);
5052 
5053         /*  Capture group #1.  Should succeed, matching " interior ". */
5054         status = U_ZERO_ERROR;
5055         actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
5056         REGEX_CHECK_STATUS;
5057         REGEX_ASSERT(actual == &bufferText);
5058         REGEX_ASSERT(utext_getNativeIndex(actual) == 9);   // position of " interior "
5059         REGEX_ASSERT(length == 10);
5060         REGEX_ASSERT(utext_nativeLength(actual) == 47);
5061 
5062         /*  Capture group out of range.  Error. */
5063         status = U_ZERO_ERROR;
5064         actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
5065         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5066         REGEX_ASSERT(actual == &bufferText);
5067         uregex_close(re);
5068 
5069     }
5070 
5071     /*
5072      *  replaceFirst()
5073      */
5074     {
5075         UChar    text1[80];
5076         UChar    text2[80];
5077         UText    replText = UTEXT_INITIALIZER;
5078         UText   *result;
5079         status = U_ZERO_ERROR;
5080         utext_openUnicodeString(&bufferText, &buffer, &status);
5081 
5082         status = U_ZERO_ERROR;
5083         u_uastrncpy(text1, "Replace xaax x1x x...x.",  UPRV_LENGTHOF(text1));
5084         u_uastrncpy(text2, "No match here.",  UPRV_LENGTHOF(text2)/2);
5085         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5086 
5087         re = uregex_openC("x(.*?)x", 0, NULL, &status);
5088         REGEX_CHECK_STATUS;
5089 
5090         /*  Normal case, with match */
5091         uregex_setText(re, text1, -1, &status);
5092         REGEX_CHECK_STATUS;
5093         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5094         REGEX_CHECK_STATUS;
5095         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5096         REGEX_CHECK_STATUS;
5097         REGEX_ASSERT(result == &bufferText);
5098         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5099 
5100         /* No match.  Text should copy to output with no changes.  */
5101         uregex_setText(re, text2, -1, &status);
5102         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5103         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5104         REGEX_CHECK_STATUS;
5105         REGEX_ASSERT(result == &bufferText);
5106         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5107 
5108         /* Unicode escapes */
5109         uregex_setText(re, text1, -1, &status);
5110         regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
5111         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5112         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5113         REGEX_CHECK_STATUS;
5114         REGEX_ASSERT(result == &bufferText);
5115         REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5116 
5117         uregex_close(re);
5118         utext_close(&replText);
5119     }
5120 
5121 
5122     /*
5123      *  replaceAll()
5124      */
5125     {
5126         UChar    text1[80];
5127         UChar    text2[80];
5128         UText    replText = UTEXT_INITIALIZER;
5129         UText   *result;
5130 
5131         status = U_ZERO_ERROR;
5132         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
5133         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
5134         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5135 
5136         re = uregex_openC("x(.*?)x", 0, NULL, &status);
5137         REGEX_CHECK_STATUS;
5138 
5139         /*  Normal case, with match */
5140         uregex_setText(re, text1, -1, &status);
5141         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5142         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5143         REGEX_CHECK_STATUS;
5144         REGEX_ASSERT(result == &bufferText);
5145         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5146 
5147         /* No match.  Text should copy to output with no changes.  */
5148         uregex_setText(re, text2, -1, &status);
5149         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5150         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5151         REGEX_CHECK_STATUS;
5152         REGEX_ASSERT(result == &bufferText);
5153         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5154 
5155         uregex_close(re);
5156         utext_close(&replText);
5157     }
5158 
5159 
5160     /*
5161      *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5162      *   so we don't need to test it here.
5163      */
5164 
5165     utext_close(&bufferText);
5166     utext_close(&patternText);
5167 }
5168 
5169 
5170 //--------------------------------------------------------------
5171 //
5172 //  NamedCapture   Check basic named capture group functionality
5173 //
5174 //--------------------------------------------------------------
NamedCapture()5175 void RegexTest::NamedCapture() {
5176     UErrorCode status = U_ZERO_ERROR;
5177     RegexPattern *pat = RegexPattern::compile(UnicodeString(
5178             "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
5179     REGEX_CHECK_STATUS;
5180     int32_t group = pat->groupNumberFromName("five", -1, status);
5181     REGEX_CHECK_STATUS;
5182     REGEX_ASSERT(5 == group);
5183     group = pat->groupNumberFromName("three", -1, status);
5184     REGEX_CHECK_STATUS;
5185     REGEX_ASSERT(3 == group);
5186 
5187     status = U_ZERO_ERROR;
5188     group = pat->groupNumberFromName(UnicodeString("six"), status);
5189     REGEX_CHECK_STATUS;
5190     REGEX_ASSERT(6 == group);
5191 
5192     status = U_ZERO_ERROR;
5193     group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
5194     U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5195 
5196     status = U_ZERO_ERROR;
5197 
5198     // After copying a pattern, named capture should still work in the copy.
5199     RegexPattern *copiedPat = new RegexPattern(*pat);
5200     REGEX_ASSERT(*copiedPat == *pat);
5201     delete pat; pat = NULL;  // Delete original, copy should have no references back to it.
5202 
5203     group = copiedPat->groupNumberFromName("five", -1, status);
5204     REGEX_CHECK_STATUS;
5205     REGEX_ASSERT(5 == group);
5206     group = copiedPat->groupNumberFromName("three", -1, status);
5207     REGEX_CHECK_STATUS;
5208     REGEX_ASSERT(3 == group);
5209     delete copiedPat;
5210 
5211     // ReplaceAll with named capture group.
5212     status = U_ZERO_ERROR;
5213     UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
5214     RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
5215     REGEX_CHECK_STATUS;
5216     // m.pattern().dumpPattern();
5217     UnicodeString replacedText = m->replaceAll("'${mid}'", status);
5218     REGEX_CHECK_STATUS;
5219     REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
5220     delete m;
5221 
5222     // ReplaceAll, allowed capture group numbers.
5223     text = UnicodeString("abcmxyz");
5224     m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
5225     REGEX_CHECK_STATUS;
5226 
5227     status = U_ZERO_ERROR;
5228     replacedText  = m->replaceAll(UnicodeString("<$0>"), status);   // group 0, full match, is allowed.
5229     REGEX_CHECK_STATUS;
5230     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
5231 
5232     status = U_ZERO_ERROR;
5233     replacedText  = m->replaceAll(UnicodeString("<$1>"), status);      // group 1 by number.
5234     REGEX_CHECK_STATUS;
5235     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5236 
5237     status = U_ZERO_ERROR;
5238     replacedText  = m->replaceAll(UnicodeString("<${one}>"), status);   // group 1 by name.
5239     REGEX_CHECK_STATUS;
5240     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5241 
5242     status = U_ZERO_ERROR;
5243     replacedText  = m->replaceAll(UnicodeString("<$2>"), status);   // group 2.
5244     REGEX_CHECK_STATUS;
5245     REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
5246 
5247     status = U_ZERO_ERROR;
5248     replacedText  = m->replaceAll(UnicodeString("<$3>"), status);
5249     REGEX_CHECK_STATUS;
5250     REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
5251 
5252     status = U_ZERO_ERROR;
5253     replacedText  = m->replaceAll(UnicodeString("<$4>"), status);
5254     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5255 
5256     status = U_ZERO_ERROR;
5257     replacedText  = m->replaceAll(UnicodeString("<$04>"), status);      // group 0, leading 0,
5258     REGEX_CHECK_STATUS;                                                 //    trailing out-of-range 4 passes through.
5259     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
5260 
5261     status = U_ZERO_ERROR;
5262     replacedText  = m->replaceAll(UnicodeString("<$000016>"), status);  // Consume leading zeroes. Don't consume digits
5263     REGEX_CHECK_STATUS;                                                 //   that push group num out of range.
5264     REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText);              //   This is group 1.
5265 
5266     status = U_ZERO_ERROR;
5267     replacedText  = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
5268     REGEX_CHECK_STATUS;
5269     REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
5270 
5271     status = U_ZERO_ERROR;
5272     replacedText  = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
5273     REGEX_CHECK_STATUS;
5274     REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
5275 
5276     status = U_ZERO_ERROR;
5277     replacedText  = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
5278     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5279 
5280     status = U_ZERO_ERROR;
5281     replacedText  = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
5282     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5283 
5284     status = U_ZERO_ERROR;
5285     replacedText  = m->replaceAll(UnicodeString("<${one"), status);
5286     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5287 
5288     status = U_ZERO_ERROR;
5289     replacedText  = m->replaceAll(UnicodeString("$not a capture group"), status);
5290     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5291 
5292     delete m;
5293 
5294     // Repeat the above replaceAll() tests using the plain C API, which
5295     //  has a separate implementation internally.
5296     //  TODO: factor out the test data.
5297 
5298     status = U_ZERO_ERROR;
5299     URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
5300     REGEX_CHECK_STATUS;
5301     text = UnicodeString("abcmxyz");
5302     uregex_setText(re, text.getBuffer(), text.length(), &status);
5303     REGEX_CHECK_STATUS;
5304 
5305     UChar resultBuf[100];
5306     int32_t resultLength;
5307     UnicodeString repl;
5308 
5309     status = U_ZERO_ERROR;
5310     repl = UnicodeString("<$0>");
5311     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5312     REGEX_CHECK_STATUS;
5313     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
5314 
5315     status = U_ZERO_ERROR;
5316     repl = UnicodeString("<$1>");
5317     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5318     REGEX_CHECK_STATUS;
5319     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5320 
5321     status = U_ZERO_ERROR;
5322     repl = UnicodeString("<${one}>");
5323     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5324     REGEX_CHECK_STATUS;
5325     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5326 
5327     status = U_ZERO_ERROR;
5328     repl = UnicodeString("<$2>");
5329     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5330     REGEX_CHECK_STATUS;
5331     REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
5332 
5333     status = U_ZERO_ERROR;
5334     repl = UnicodeString("<$3>");
5335     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5336     REGEX_CHECK_STATUS;
5337     REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
5338 
5339     status = U_ZERO_ERROR;
5340     repl = UnicodeString("<$4>");
5341     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5342     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5343 
5344     status = U_ZERO_ERROR;
5345     repl = UnicodeString("<$04>");
5346     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5347     REGEX_CHECK_STATUS;
5348     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
5349 
5350     status = U_ZERO_ERROR;
5351     repl = UnicodeString("<$000016>");
5352     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5353     REGEX_CHECK_STATUS;
5354     REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
5355 
5356     status = U_ZERO_ERROR;
5357     repl = UnicodeString("<$3$2$1${one}>");
5358     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5359     REGEX_CHECK_STATUS;
5360     REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
5361 
5362     status = U_ZERO_ERROR;
5363     repl = UnicodeString("$3$2$1${one}");
5364     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5365     REGEX_CHECK_STATUS;
5366     REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
5367 
5368     status = U_ZERO_ERROR;
5369     repl = UnicodeString("<${noSuchName}>");
5370     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5371     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5372 
5373     status = U_ZERO_ERROR;
5374     repl = UnicodeString("<${invalid-name}>");
5375     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5376     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5377 
5378     status = U_ZERO_ERROR;
5379     repl = UnicodeString("<${one");
5380     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5381     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5382 
5383     status = U_ZERO_ERROR;
5384     repl = UnicodeString("$not a capture group");
5385     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5386     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5387 
5388     uregex_close(re);
5389 }
5390 
5391 //--------------------------------------------------------------
5392 //
5393 //  NamedCaptureLimits   Patterns with huge numbers of named capture groups.
5394 //                       The point is not so much what the exact limit is,
5395 //                       but that a largish number doesn't hit bad non-linear performance,
5396 //                       and that exceeding the limit fails cleanly.
5397 //
5398 //--------------------------------------------------------------
NamedCaptureLimits()5399 void RegexTest::NamedCaptureLimits() {
5400     if (quick) {
5401         logln("Skipping test. Runs in exhuastive mode only.");
5402         return;
5403     }
5404     const int32_t goodLimit = 1000000;     // Pattern w this many groups builds successfully.
5405     const int32_t failLimit = 10000000;    // Pattern exceeds internal limits, fails to compile.
5406     char nnbuf[100];
5407     UnicodeString pattern;
5408     int32_t nn;
5409 
5410     for (nn=1; nn<goodLimit; nn++) {
5411         sprintf(nnbuf, "(?<nn%d>)", nn);
5412         pattern.append(UnicodeString(nnbuf, -1, US_INV));
5413     }
5414     UErrorCode status = U_ZERO_ERROR;
5415     RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
5416     REGEX_CHECK_STATUS;
5417     for (nn=1; nn<goodLimit; nn++) {
5418         sprintf(nnbuf, "nn%d", nn);
5419         int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
5420         REGEX_ASSERT(nn == groupNum);
5421         if (nn != groupNum) {
5422             break;
5423         }
5424     }
5425     delete pat;
5426 
5427     pattern.remove();
5428     for (nn=1; nn<failLimit; nn++) {
5429         sprintf(nnbuf, "(?<nn%d>)", nn);
5430         pattern.append(UnicodeString(nnbuf, -1, US_INV));
5431     }
5432     status = U_ZERO_ERROR;
5433     pat = RegexPattern::compile(pattern, 0, status);
5434     REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
5435     delete pat;
5436 }
5437 
5438 
5439 //--------------------------------------------------------------
5440 //
5441 //  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
5442 //
5443 //---------------------------------------------------------------
Bug7651()5444 void RegexTest::Bug7651() {
5445     UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5446     //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5447     //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5448     UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5449     UnicodeString s("#ff @abcd This is test");
5450     RegexPattern  *REPattern = NULL;
5451     RegexMatcher  *REMatcher = NULL;
5452     UErrorCode status = U_ZERO_ERROR;
5453     UParseError pe;
5454 
5455     REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5456     REGEX_CHECK_STATUS;
5457     REMatcher = REPattern->matcher(s, status);
5458     REGEX_CHECK_STATUS;
5459     REGEX_ASSERT(REMatcher->find());
5460     REGEX_ASSERT(REMatcher->start(status) == 0);
5461     delete REPattern;
5462     delete REMatcher;
5463     status = U_ZERO_ERROR;
5464 
5465     REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5466     REGEX_CHECK_STATUS;
5467     REMatcher = REPattern->matcher(s, status);
5468     REGEX_CHECK_STATUS;
5469     REGEX_ASSERT(REMatcher->find());
5470     REGEX_ASSERT(REMatcher->start(status) == 0);
5471     delete REPattern;
5472     delete REMatcher;
5473     status = U_ZERO_ERROR;
5474  }
5475 
Bug7740()5476 void RegexTest::Bug7740() {
5477     UErrorCode status = U_ZERO_ERROR;
5478     UnicodeString pattern = "(a)";
5479     UnicodeString text = "abcdef";
5480     RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5481     REGEX_CHECK_STATUS;
5482     REGEX_ASSERT(m->lookingAt(status));
5483     REGEX_CHECK_STATUS;
5484     status = U_ILLEGAL_ARGUMENT_ERROR;
5485     UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
5486     REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5487     REGEX_ASSERT(s == "");
5488     delete m;
5489 }
5490 
5491 // Bug 8479:  was crashing whith a Bogus UnicodeString as input.
5492 
Bug8479()5493 void RegexTest::Bug8479() {
5494     UErrorCode status = U_ZERO_ERROR;
5495 
5496     RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5497     REGEX_CHECK_STATUS;
5498     if (U_SUCCESS(status))
5499     {
5500         UnicodeString str;
5501         str.setToBogus();
5502         pMatcher->reset(str);
5503         status = U_ZERO_ERROR;
5504         pMatcher->matches(status);
5505         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5506         delete pMatcher;
5507     }
5508 }
5509 
5510 
5511 // Bug 7029
Bug7029()5512 void RegexTest::Bug7029() {
5513     UErrorCode status = U_ZERO_ERROR;
5514 
5515     RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5516     UnicodeString text = "abc.def";
5517     UnicodeString splits[10];
5518     REGEX_CHECK_STATUS;
5519     int32_t numFields = pMatcher->split(text, splits, 10, status);
5520     REGEX_CHECK_STATUS;
5521     REGEX_ASSERT(numFields == 8);
5522     delete pMatcher;
5523 }
5524 
5525 // Bug 9283
5526 //   This test is checking for the existance of any supplemental characters that case-fold
5527 //   to a bmp character.
5528 //
5529 //   At the time of this writing there are none. If any should appear in a subsequent release
5530 //   of Unicode, the code in regular expressions compilation that determines the longest
5531 //   posssible match for a literal string  will need to be enhanced.
5532 //
5533 //   See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5534 //   for details on what to do in case of a failure of this test.
5535 //
Bug9283()5536 void RegexTest::Bug9283() {
5537 #if !UCONFIG_NO_NORMALIZATION
5538     UErrorCode status = U_ZERO_ERROR;
5539     UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5540     REGEX_CHECK_STATUS;
5541     int32_t index;
5542     UChar32 c;
5543     for (index=0; ; index++) {
5544         c = supplementalsWithCaseFolding.charAt(index);
5545         if (c == -1) {
5546             break;
5547         }
5548         UnicodeString cf = UnicodeString(c).foldCase();
5549         REGEX_ASSERT(cf.length() >= 2);
5550     }
5551 #endif /* #if !UCONFIG_NO_NORMALIZATION */
5552 }
5553 
5554 
CheckInvBufSize()5555 void RegexTest::CheckInvBufSize() {
5556   if(inv_next>=INV_BUFSIZ) {
5557     errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5558           __FILE__, INV_BUFSIZ, inv_next);
5559   } else {
5560     logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5561   }
5562 }
5563 
5564 
Bug10459()5565 void RegexTest::Bug10459() {
5566     UErrorCode status = U_ZERO_ERROR;
5567     UnicodeString patternString("(txt)");
5568     UnicodeString txtString("txt");
5569 
5570     UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
5571     REGEX_CHECK_STATUS;
5572     UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
5573     REGEX_CHECK_STATUS;
5574 
5575     URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
5576     REGEX_CHECK_STATUS;
5577 
5578     uregex_setUText(icu_re, utext_txt, &status);
5579     REGEX_CHECK_STATUS;
5580 
5581     // The bug was that calling uregex_group() before doing a matching operation
5582     //   was causing a segfault. Only for Regular Expressions created from UText.
5583     //   It should set an U_REGEX_INVALID_STATE.
5584 
5585     UChar buf[100];
5586     int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
5587     REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
5588     REGEX_ASSERT(len == 0);
5589 
5590     uregex_close(icu_re);
5591     utext_close(utext_pat);
5592     utext_close(utext_txt);
5593 }
5594 
TestCaseInsensitiveStarters()5595 void RegexTest::TestCaseInsensitiveStarters() {
5596     // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5597     //  become stale because of new Unicode characters.
5598     // If it is stale, rerun the generation tool
5599     //    svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5600     // and replace the embedded data in i18n/regexcmp.cpp
5601 
5602     for (UChar32 cp=0; cp<=0x10ffff; cp++) {
5603         if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
5604             continue;
5605         }
5606         UnicodeSet s(cp, cp);
5607         s.closeOver(USET_CASE_INSENSITIVE);
5608         UnicodeSetIterator setIter(s);
5609         while (setIter.next()) {
5610             if (!setIter.isString()) {
5611                 continue;
5612             }
5613             const UnicodeString &str = setIter.getString();
5614             UChar32 firstChar = str.char32At(0);
5615             UnicodeSet starters;
5616             RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
5617             if (!starters.contains(cp)) {
5618                 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
5619                 return;
5620             }
5621         }
5622     }
5623 }
5624 
5625 
TestBug11049()5626 void RegexTest::TestBug11049() {
5627     // Original bug report: pattern with match start consisting of one of several individual characters,
5628     //  and the text being matched ending with a supplementary character. find() would read past the
5629     //  end of the input text when searching for potential match starting points.
5630 
5631     // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5632     // detect the bad read.
5633 
5634     TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
5635     TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
5636 
5637     // Test again with a pattern starting with a single character,
5638     // which takes a different code path than starting with an OR expression,
5639     // but with similar logic.
5640     TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
5641     TestCase11049("C", "string matches at end C", TRUE, __LINE__);
5642 }
5643 
5644 // Run a single test case from TestBug11049(). Internal function.
TestCase11049(const char * pattern,const char * data,UBool expectMatch,int32_t lineNumber)5645 void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
5646     UErrorCode status = U_ZERO_ERROR;
5647     UnicodeString patternString = UnicodeString(pattern).unescape();
5648     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5649 
5650     UnicodeString dataString = UnicodeString(data).unescape();
5651     UChar *exactBuffer = new UChar[dataString.length()];
5652     dataString.extract(exactBuffer, dataString.length(), status);
5653     UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
5654 
5655     LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
5656     REGEX_CHECK_STATUS;
5657     matcher->reset(ut);
5658     UBool result = matcher->find();
5659     if (result != expectMatch) {
5660         errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5661               __FILE__, lineNumber, expectMatch, result, pattern, data);
5662     }
5663 
5664     // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5665     //   off-by-one on find() with match at the last code point.
5666     //   Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5667     //   because string.unescape() will only shrink it.
5668     char * utf8Buffer = new char[uprv_strlen(data)+1];
5669     u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status);
5670     REGEX_CHECK_STATUS;
5671     ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
5672     REGEX_CHECK_STATUS;
5673     matcher->reset(ut);
5674     result = matcher->find();
5675     if (result != expectMatch) {
5676         errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5677               __FILE__, lineNumber, expectMatch, result, pattern, data);
5678     }
5679     delete [] utf8Buffer;
5680 
5681     utext_close(ut);
5682     delete [] exactBuffer;
5683 }
5684 
5685 
TestBug11371()5686 void RegexTest::TestBug11371() {
5687     if (quick) {
5688         logln("Skipping test. Runs in exhuastive mode only.");
5689         return;
5690     }
5691     UErrorCode status = U_ZERO_ERROR;
5692     UnicodeString patternString;
5693 
5694     for (int i=0; i<8000000; i++) {
5695         patternString.append(UnicodeString("()"));
5696     }
5697     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5698     if (status != U_REGEX_PATTERN_TOO_BIG) {
5699         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5700               __FILE__, __LINE__, u_errorName(status));
5701     }
5702 
5703     status = U_ZERO_ERROR;
5704     patternString = "(";
5705     for (int i=0; i<20000000; i++) {
5706         patternString.append(UnicodeString("A++"));
5707     }
5708     patternString.append(UnicodeString("){0}B++"));
5709     LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
5710     if (status != U_REGEX_PATTERN_TOO_BIG) {
5711         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5712               __FILE__, __LINE__, u_errorName(status));
5713     }
5714 
5715     // Pattern with too much string data, such that string indexes overflow operand data field size
5716     // in compiled instruction.
5717     status = U_ZERO_ERROR;
5718     patternString = "";
5719     while (patternString.length() < 0x00ffffff) {
5720         patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5721     }
5722     patternString.append(UnicodeString("X? trailing string"));
5723     LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
5724     if (status != U_REGEX_PATTERN_TOO_BIG) {
5725         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5726               __FILE__, __LINE__, u_errorName(status));
5727     }
5728 }
5729 
TestBug11480()5730 void RegexTest::TestBug11480() {
5731     // C API, get capture group of a group that does not participate in the match.
5732     //        (Returns a zero length string, with nul termination,
5733     //         indistinguishable from a group with a zero length match.)
5734 
5735     UErrorCode status = U_ZERO_ERROR;
5736     URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
5737     REGEX_CHECK_STATUS;
5738     UnicodeString text = UNICODE_STRING_SIMPLE("A");
5739     uregex_setText(re, text.getBuffer(), text.length(), &status);
5740     REGEX_CHECK_STATUS;
5741     REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
5742     UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
5743     int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
5744     REGEX_ASSERT(length == 0);
5745     REGEX_ASSERT(buf[0] == 13);
5746     REGEX_ASSERT(buf[1] == 0);
5747     REGEX_ASSERT(buf[2] == 13);
5748     uregex_close(re);
5749 
5750     // UText C++ API, length of match is 0 for non-participating matches.
5751     UText ut = UTEXT_INITIALIZER;
5752     utext_openUnicodeString(&ut, &text, &status);
5753     RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status);
5754     REGEX_CHECK_STATUS;
5755     matcher.reset(&ut);
5756     REGEX_ASSERT(matcher.lookingAt(0, status));
5757 
5758     // UText C++ API, Capture group 1 matches "A", position 0, length 1.
5759     int64_t groupLen = -666;
5760     UText group = UTEXT_INITIALIZER;
5761     matcher.group(1, &group, groupLen, status);
5762     REGEX_CHECK_STATUS;
5763     REGEX_ASSERT(groupLen == 1);
5764     REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
5765 
5766     // Capture group 2, the (B), does not participate in the match.
5767     matcher.group(2, &group, groupLen, status);
5768     REGEX_CHECK_STATUS;
5769     REGEX_ASSERT(groupLen == 0);
5770     REGEX_ASSERT(matcher.start(2, status) == -1);
5771     REGEX_CHECK_STATUS;
5772 }
5773 
TestBug12884()5774 void RegexTest::TestBug12884() {
5775     // setTimeLimit() was not effective for empty sub-patterns with large {minimum counts}
5776     UnicodeString pattern(u"(((((((){120}){11}){11}){11}){80}){11}){4}");
5777     UnicodeString text(u"hello");
5778     UErrorCode status = U_ZERO_ERROR;
5779     RegexMatcher m(pattern, text, 0, status);
5780     REGEX_CHECK_STATUS;
5781     m.setTimeLimit(5, status);
5782     m.find(status);
5783     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5784 
5785     // Non-greedy loops. They take a different code path during matching.
5786     UnicodeString ngPattern(u"(((((((){120}?){11}?){11}?){11}?){80}?){11}?){4}?");
5787     status = U_ZERO_ERROR;
5788     RegexMatcher ngM(ngPattern, text, 0, status);
5789     REGEX_CHECK_STATUS;
5790     ngM.setTimeLimit(5, status);
5791     ngM.find(status);
5792     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5793 
5794     // UText, wrapping non-UTF-16 text, also takes a different execution path.
5795     const char *text8 = u8"¿Qué es Unicode?  Unicode proporciona un número único para cada"
5796                           "carácter, sin importar la plataforma, sin importar el programa,"
5797                           "sin importar el idioma.";
5798     status = U_ZERO_ERROR;
5799     LocalUTextPointer ut(utext_openUTF8(NULL, text8, -1, &status));
5800     REGEX_CHECK_STATUS;
5801     m.reset(ut.getAlias());
5802     m.find(status);
5803     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5804 
5805     status = U_ZERO_ERROR;
5806     ngM.reset(ut.getAlias());
5807     ngM.find(status);
5808     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5809 }
5810 
5811 // Bug 13631. A find() of a pattern with a zero length look-behind assertions
5812 //            can cause a read past the end of the input text.
5813 //            The failure is seen when running this test with Clang's Addresss Sanitizer.
5814 
TestBug13631()5815 void RegexTest::TestBug13631() {
5816     const UChar *pats[] = { u"(?<!^)",
5817                             u"(?<=^)",
5818                             nullptr
5819                           };
5820     for (const UChar **pat=pats; *pat; ++pat) {
5821         UErrorCode status = U_ZERO_ERROR;
5822         UnicodeString upat(*pat);
5823         RegexMatcher matcher(upat, 0, status);
5824         const UChar s =u'a';
5825         UText *ut = utext_openUChars(nullptr, &s, 1, &status);
5826         REGEX_CHECK_STATUS;
5827         matcher.reset(ut);
5828         while (matcher.find()) {
5829         }
5830         utext_close(ut);
5831     }
5832 }
5833 
5834 
5835 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
5836