• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4  * COPYRIGHT:
5  * Copyright (c) 2002-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  ********************************************************************/
8 
9 //
10 //   regextst.cpp
11 //
12 //      ICU Regular Expressions test, part of intltest.
13 //
14 
15 /*
16      NOTE!!
17 
18      PLEASE be careful about ASCII assumptions in this test.
19      This test is one of the worst repeat offenders.
20      If you have questions, contact someone on the ICU PMC
21      who has access to an EBCDIC system.
22 
23  */
24 
25 #include "intltest.h"
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
27 
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <string.h>
31 
32 #include "unicode/localpointer.h"
33 #include "unicode/regex.h"
34 #include "unicode/uchar.h"
35 #include "unicode/ucnv.h"
36 #include "unicode/uniset.h"
37 #include "unicode/uregex.h"
38 #include "unicode/usetiter.h"
39 #include "unicode/ustring.h"
40 #include "unicode/utext.h"
41 #include "unicode/utf16.h"
42 #include "cstr.h"
43 #include "regextst.h"
44 #include "regexcmp.h"
45 #include "uvector.h"
46 #include "util.h"
47 #include "cmemory.h"
48 #include "cstring.h"
49 #include "uinvchar.h"
50 
51 #define SUPPORT_MUTATING_INPUT_STRING   0
52 
53 //---------------------------------------------------------------------------
54 //
55 //  Test class boilerplate
56 //
57 //---------------------------------------------------------------------------
RegexTest()58 RegexTest::RegexTest()
59 {
60 }
61 
62 
~RegexTest()63 RegexTest::~RegexTest()
64 {
65 }
66 
67 
68 
/**
 * Test dispatcher for the RegexTest suite.
 *
 * Logs a suite banner when exec is true, then lists every test function of
 * this class through the TESTCASE_AUTO macros.
 *
 * NOTE(review): index, exec and name are consumed by the TESTCASE_AUTO macros
 * (declared in intltest.h, not visible here); presumably a test's index is
 * derived from its position in this list, so keep the order stable - confirm
 * against the macro definitions before reordering.
 *
 * The final char* parameter is unused.
 */
void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
{
    if (exec) logln("TestSuite RegexTest: ");
    TESTCASE_AUTO_BEGIN;
    TESTCASE_AUTO(Basic);
    TESTCASE_AUTO(API_Match);
    TESTCASE_AUTO(API_Replace);
    TESTCASE_AUTO(API_Pattern);
    // Extended is only compiled in when file I/O is available.
#if !UCONFIG_NO_FILE_IO
    TESTCASE_AUTO(Extended);
#endif
    TESTCASE_AUTO(Errors);
    TESTCASE_AUTO(PerlTests);
    TESTCASE_AUTO(Callbacks);
    TESTCASE_AUTO(FindProgressCallbacks);
    TESTCASE_AUTO(Bug6149);
    TESTCASE_AUTO(UTextBasic);
    TESTCASE_AUTO(API_Match_UTF8);
    TESTCASE_AUTO(API_Replace_UTF8);
    TESTCASE_AUTO(API_Pattern_UTF8);
    TESTCASE_AUTO(PerlTestsUTF8);
    TESTCASE_AUTO(PreAllocatedUTextCAPI);
    TESTCASE_AUTO(Bug7651);
    TESTCASE_AUTO(Bug7740);
    TESTCASE_AUTO(Bug8479);
    TESTCASE_AUTO(Bug7029);
    TESTCASE_AUTO(CheckInvBufSize);
    TESTCASE_AUTO(Bug9283);
    TESTCASE_AUTO(Bug10459);
    TESTCASE_AUTO(TestCaseInsensitiveStarters);
    TESTCASE_AUTO(TestBug11049);
    TESTCASE_AUTO(TestBug11371);
    TESTCASE_AUTO(TestBug11480);
    TESTCASE_AUTO(NamedCapture);
    TESTCASE_AUTO(NamedCaptureLimits);
    TESTCASE_AUTO(TestBug12884);
    TESTCASE_AUTO(TestBug13631);
    TESTCASE_AUTO(TestBug13632);
    TESTCASE_AUTO(TestBug20359);
    TESTCASE_AUTO(TestBug20863);
    TESTCASE_AUTO_END;
}
111 
112 
113 /**
114  * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
115  * into ASCII.
116  * @see utext_openUTF8
117  */
118 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
119 
120 //---------------------------------------------------------------------------
121 //
122 //   Error Checking / Reporting macros used in all of the tests.
123 //
124 //---------------------------------------------------------------------------
125 
utextToPrintable(char * buf,int32_t bufLen,UText * text)126 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
127   int64_t oldIndex = utext_getNativeIndex(text);
128   utext_setNativeIndex(text, 0);
129   char *bufPtr = buf;
130   UChar32 c = utext_next32From(text, 0);
131   while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
132     if (0x000020<=c && c<0x00007e) {
133       *bufPtr = c;
134     } else {
135 #if 0
136       sprintf(bufPtr,"U+%04X", c);
137       bufPtr+= strlen(bufPtr)-1;
138 #else
139       *bufPtr = '%';
140 #endif
141     }
142     bufPtr++;
143     c = UTEXT_NEXT32(text);
144   }
145   *bufPtr = 0;
146 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
147   char *ebuf = (char*)malloc(bufLen);
148   uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
149   uprv_strncpy(buf, ebuf, bufLen);
150   free((void*)ebuf);
151 #endif
152   utext_setNativeIndex(text, oldIndex);
153 }
154 
155 
156 static char ASSERT_BUF[1024];
157 
extractToAssertBuf(const UnicodeString & message)158 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
159   if(message.length()==0) {
160     strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
161   } else {
162     UnicodeString buf;
163     IntlTest::prettify(message,buf);
164     if(buf.length()==0) {
165       strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
166     } else {
167       buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
168       if(ASSERT_BUF[0]==0) {
169         ASSERT_BUF[0]=0;
170         for(int32_t i=0;i<buf.length();i++) {
171           UChar ch = buf[i];
172           sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
173         }
174       }
175     }
176   }
177   ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
178   return ASSERT_BUF;
179 }
180 
// Logs the contents of a UText, rendered printable by utextToPrintable() into
// a 200-byte local buffer, together with the source location and the text of
// the macro argument.
#define REGEX_VERBOSE_TEXT(text) UPRV_BLOCK_MACRO_BEGIN { \
    char buf[200]; \
    utextToPrintable(buf,UPRV_LENGTHOF(buf),text); \
    logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf); \
} UPRV_BLOCK_MACRO_END

// If the in-scope variable `status` holds a failure code, reports it through
// dataerrln() and returns from the enclosing function (which must therefore
// return void).
#define REGEX_CHECK_STATUS UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(status)) { \
        dataerrln("%s:%d: RegexTest failure.  status=%s", \
                  __FILE__, __LINE__, u_errorName(status)); \
        return; \
    } \
} UPRV_BLOCK_MACRO_END

// Reports a test error when expr compares equal to FALSE. Execution of the
// enclosing function continues afterwards.
#define REGEX_ASSERT(expr) UPRV_BLOCK_MACRO_BEGIN { \
    if ((expr)==FALSE) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr); \
    } \
} UPRV_BLOCK_MACRO_END

// Evaluates expr with a fresh local `status` (initialised to U_ZERO_ERROR,
// shadowing any outer variable of that name) and reports through dataerrln()
// if the resulting status differs from errcode. expr is expected to refer to
// `status` by name.
#define REGEX_ASSERT_FAIL(expr, errcode) UPRV_BLOCK_MACRO_BEGIN { \
    UErrorCode status=U_ZERO_ERROR; \
    (expr); \
    if (status!=errcode) { \
        dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
                  __LINE__, u_errorName(errcode), u_errorName(status)); \
    } \
} UPRV_BLOCK_MACRO_END

// Like REGEX_CHECK_STATUS, but for helper functions: also reports the caller
// supplied line number, prints the numeric status value, and does not return.
#define REGEX_CHECK_STATUS_L(line) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(status)) { \
        errln("RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); \
    } \
} UPRV_BLOCK_MACRO_END

// Like REGEX_ASSERT, but for helper functions: also reports the caller
// supplied line number, and returns from the enclosing (void) function when
// the assertion fails.
#define REGEX_ASSERT_L(expr, line) UPRV_BLOCK_MACRO_BEGIN { \
    if ((expr)==FALSE) { \
        errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
        return; \
    } \
} UPRV_BLOCK_MACRO_END

// Reports a test error when `actual` differs from `expected`.
// expected: const char * , restricted to invariant characters.
// actual: const UnicodeString &
#define REGEX_ASSERT_UNISTR(expected, actual) UPRV_BLOCK_MACRO_BEGIN { \
    if (UnicodeString(expected, -1, US_INV) != (actual)) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n",  \
              __FILE__, __LINE__, expected, extractToAssertBuf(actual)); \
    } \
} UPRV_BLOCK_MACRO_END
231 
232 
testUTextEqual(UText * uta,UText * utb)233 static UBool testUTextEqual(UText *uta, UText *utb) {
234     UChar32 ca = 0;
235     UChar32 cb = 0;
236     utext_setNativeIndex(uta, 0);
237     utext_setNativeIndex(utb, 0);
238     do {
239         ca = utext_next32(uta);
240         cb = utext_next32(utb);
241         if (ca != cb) {
242             break;
243         }
244     } while (ca != U_SENTINEL);
245     return ca == cb;
246 }
247 
248 
249 /**
250  * @param expected expected text in UTF-8 (not platform) codepage
251  */
assertUText(const char * expected,UText * actual,const char * file,int line)252 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
253     UErrorCode status = U_ZERO_ERROR;
254     UText expectedText = UTEXT_INITIALIZER;
255     utext_openUTF8(&expectedText, expected, -1, &status);
256     if(U_FAILURE(status)) {
257       errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
258       return;
259     }
260     if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
261       errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
262       return;
263     }
264     utext_setNativeIndex(actual, 0);
265     if (!testUTextEqual(&expectedText, actual)) {
266         char buf[201 /*21*/];
267         char expectedBuf[201];
268         utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
269         utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
270         errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
271     }
272     utext_close(&expectedText);
273 }
274 /**
275  * @param expected invariant (platform local text) input
276  */
277 
assertUTextInvariant(const char * expected,UText * actual,const char * file,int line)278 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
279     UErrorCode status = U_ZERO_ERROR;
280     UText expectedText = UTEXT_INITIALIZER;
281     regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
282     if(U_FAILURE(status)) {
283       errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
284       return;
285     }
286     utext_setNativeIndex(actual, 0);
287     if (!testUTextEqual(&expectedText, actual)) {
288         char buf[201 /*21*/];
289         char expectedBuf[201];
290         utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
291         utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
292         errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
293     }
294     utext_close(&expectedText);
295 }
296 
/**
 * Checks that the UText `actual` holds the text `expected`, reporting a test
 * error with the current file and line if it does not.
 * Assumes utf-8 input
 */
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
/**
 * Checks that the UText `actual` holds the text `expected`, reporting a test
 * error with the current file and line if it does not.
 * Assumes Invariant input
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
305 
306 /**
307  * This buffer ( inv_buf ) is used to hold the UTF-8 strings
308  * passed into utext_openUTF8. An error will be given if
309  * INV_BUFSIZ is too small.  It's only used on EBCDIC systems.
310  */
311 
312 #define INV_BUFSIZ 2048 /* increase this if too small */
313 
314 static int64_t inv_next=0;
315 
316 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
317 static char inv_buf[INV_BUFSIZ];
318 #endif
319 
regextst_openUTF8FromInvariant(UText * ut,const char * inv,int64_t length,UErrorCode * status)320 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
321   if(length==-1) length=strlen(inv);
322 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
323   inv_next+=length;
324   return utext_openUTF8(ut, inv, length, status);
325 #else
326   if(inv_next+length+1>INV_BUFSIZ) {
327     fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
328             __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
329     *status = U_MEMORY_ALLOCATION_ERROR;
330     return NULL;
331   }
332 
333   unsigned char *buf = (unsigned char*)inv_buf+inv_next;
334   uprv_aestrncpy(buf, (const uint8_t*)inv, length);
335   inv_next+=length;
336 
337 #if 0
338   fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
339 #endif
340 
341   return utext_openUTF8(ut, (const char*)buf, length, status);
342 #endif
343 }
344 
345 
346 //---------------------------------------------------------------------------
347 //
348 //    REGEX_TESTLM       Macro + invocation function to simplify writing quick tests
349 //                       for the LookingAt() and  Match() functions.
350 //
351 //       usage:
352 //          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
353 //
354 //          The expected results are UBool - TRUE or FALSE.
355 //          The input text is unescaped.  The pattern is not.
356 //
357 //
358 //---------------------------------------------------------------------------
359 
360 #define REGEX_TESTLM(pat, text, looking, match) UPRV_BLOCK_MACRO_BEGIN { \
361     doRegexLMTest(pat, text, looking, match, __LINE__); \
362     doRegexLMTestUTF8(pat, text, looking, match, __LINE__); \
363 } UPRV_BLOCK_MACRO_END
364 
doRegexLMTest(const char * pat,const char * text,UBool looking,UBool match,int32_t line)365 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
366     const UnicodeString pattern(pat, -1, US_INV);
367     const UnicodeString inputText(text, -1, US_INV);
368     UErrorCode          status  = U_ZERO_ERROR;
369     UParseError         pe;
370     RegexPattern        *REPattern = NULL;
371     RegexMatcher        *REMatcher = NULL;
372     UBool               retVal     = TRUE;
373 
374     UnicodeString patString(pat, -1, US_INV);
375     REPattern = RegexPattern::compile(patString, 0, pe, status);
376     if (U_FAILURE(status)) {
377         dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
378             line, u_errorName(status));
379         return FALSE;
380     }
381     if (line==376) { REPattern->dumpPattern();}
382 
383     UnicodeString inputString(inputText);
384     UnicodeString unEscapedInput = inputString.unescape();
385     REMatcher = REPattern->matcher(unEscapedInput, status);
386     if (U_FAILURE(status)) {
387         errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
388             line, u_errorName(status));
389         return FALSE;
390     }
391 
392     UBool actualmatch;
393     actualmatch = REMatcher->lookingAt(status);
394     if (U_FAILURE(status)) {
395         errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
396             line, u_errorName(status));
397         retVal =  FALSE;
398     }
399     if (actualmatch != looking) {
400         errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
401         retVal = FALSE;
402     }
403 
404     status = U_ZERO_ERROR;
405     actualmatch = REMatcher->matches(status);
406     if (U_FAILURE(status)) {
407         errln("RegexTest failure in matches() at line %d.  Status = %s\n",
408             line, u_errorName(status));
409         retVal = FALSE;
410     }
411     if (actualmatch != match) {
412         errln("RegexTest: wrong return from matches() at line %d.\n", line);
413         retVal = FALSE;
414     }
415 
416     if (retVal == FALSE) {
417         REPattern->dumpPattern();
418     }
419 
420     delete REPattern;
421     delete REMatcher;
422     return retVal;
423 }
424 
425 
doRegexLMTestUTF8(const char * pat,const char * text,UBool looking,UBool match,int32_t line)426 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
427     UText               pattern    = UTEXT_INITIALIZER;
428     int32_t             inputUTF8Length;
429     char                *textChars = NULL;
430     UText               inputText  = UTEXT_INITIALIZER;
431     UErrorCode          status     = U_ZERO_ERROR;
432     UParseError         pe;
433     RegexPattern        *REPattern = NULL;
434     RegexMatcher        *REMatcher = NULL;
435     UBool               retVal     = TRUE;
436 
437     regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
438     REPattern = RegexPattern::compile(&pattern, 0, pe, status);
439     if (U_FAILURE(status)) {
440         dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
441             line, u_errorName(status));
442         return FALSE;
443     }
444 
445     UnicodeString inputString(text, -1, US_INV);
446     UnicodeString unEscapedInput = inputString.unescape();
447     LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
448     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
449 
450     inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
451     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
452         // UTF-8 does not allow unpaired surrogates, so this could actually happen
453         logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
454         return TRUE; // not a failure of the Regex engine
455     }
456     status = U_ZERO_ERROR; // buffer overflow
457     textChars = new char[inputUTF8Length+1];
458     unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
459     utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
460 
461     REMatcher = &REPattern->matcher(status)->reset(&inputText);
462     if (U_FAILURE(status)) {
463         errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
464             line, u_errorName(status));
465         return FALSE;
466     }
467 
468     UBool actualmatch;
469     actualmatch = REMatcher->lookingAt(status);
470     if (U_FAILURE(status)) {
471         errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
472             line, u_errorName(status));
473         retVal =  FALSE;
474     }
475     if (actualmatch != looking) {
476         errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
477         retVal = FALSE;
478     }
479 
480     status = U_ZERO_ERROR;
481     actualmatch = REMatcher->matches(status);
482     if (U_FAILURE(status)) {
483         errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
484             line, u_errorName(status));
485         retVal = FALSE;
486     }
487     if (actualmatch != match) {
488         errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
489         retVal = FALSE;
490     }
491 
492     if (retVal == FALSE) {
493         REPattern->dumpPattern();
494     }
495 
496     delete REPattern;
497     delete REMatcher;
498     utext_close(&inputText);
499     utext_close(&pattern);
500     delete[] textChars;
501     return retVal;
502 }
503 
504 
505 
506 //---------------------------------------------------------------------------
507 //
508 //    REGEX_ERR       Macro + invocation function to simplify writing tests
509 //                       regex tests for incorrect patterns
510 //
511 //       usage:
512 //          REGEX_ERR("pattern",   expected error line, column, expected status);
513 //
514 //---------------------------------------------------------------------------
515 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__)
516 
regex_err(const char * pat,int32_t errLine,int32_t errCol,UErrorCode expectedStatus,int32_t line)517 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
518                           UErrorCode expectedStatus, int32_t line) {
519     UnicodeString       pattern(pat);
520 
521     UErrorCode          status         = U_ZERO_ERROR;
522     UParseError         pe;
523     RegexPattern        *callerPattern = NULL;
524 
525     //
526     //  Compile the caller's pattern
527     //
528     UnicodeString patString(pat);
529     callerPattern = RegexPattern::compile(patString, 0, pe, status);
530     if (status != expectedStatus) {
531         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
532     } else {
533         if (status != U_ZERO_ERROR) {
534             if (pe.line != errLine || pe.offset != errCol) {
535                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
536                     line, errLine, errCol, pe.line, pe.offset);
537             }
538         }
539     }
540 
541     delete callerPattern;
542 
543     //
544     //  Compile again, using a UTF-8-based UText
545     //
546     UText patternText = UTEXT_INITIALIZER;
547     regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
548     callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
549     if (status != expectedStatus) {
550         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
551     } else {
552         if (status != U_ZERO_ERROR) {
553             if (pe.line != errLine || pe.offset != errCol) {
554                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
555                     line, errLine, errCol, pe.line, pe.offset);
556             }
557         }
558     }
559 
560     delete callerPattern;
561     utext_close(&patternText);
562 }
563 
564 
565 
566 //---------------------------------------------------------------------------
567 //
568 //      Basic      Check for basic functionality of regex pattern matching.
569 //                 Avoid the use of REGEX_FIND test macro, which has
570 //                 substantial dependencies on basic Regex functionality.
571 //
572 //---------------------------------------------------------------------------
Basic()573 void RegexTest::Basic() {
574 
575 
576 //
577 // Debug - slide failing test cases early
578 //
579 #if 0
580     {
581         // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
582         UParseError pe;
583         UErrorCode  status = U_ZERO_ERROR;
584         RegexPattern *pattern;
585         pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
586         pattern->dumpPattern();
587         RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
588         UBool result = m->find();
589         printf("result = %d\n", result);
590         // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
591         // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
592     }
593     exit(1);
594 #endif
595 
596 
597     //
598     // Pattern with parentheses
599     //
600     REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
601     REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
602     REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);
603 
604     //
605     // Patterns with *
606     //
607     REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
608     REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
609     REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
610     REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
611     REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
612 
613     REGEX_TESTLM("a*", "",  TRUE, TRUE);
614     REGEX_TESTLM("a*", "b", TRUE, FALSE);
615 
616 
617     //
618     //  Patterns with "."
619     //
620     REGEX_TESTLM(".", "abc", TRUE, FALSE);
621     REGEX_TESTLM("...", "abc", TRUE, TRUE);
622     REGEX_TESTLM("....", "abc", FALSE, FALSE);
623     REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
624     REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
625     REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
626     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
627     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
628 
629     //
630     //  Patterns with * applied to chars at end of literal string
631     //
632     REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
633     REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
634 
635     //
636     //  Supplemental chars match as single chars, not a pair of surrogates.
637     //
638     REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
639     REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
640     REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
641 
642 
643     //
644     //  UnicodeSets in the pattern
645     //
646     REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
647     REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
648     REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
649     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
650     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
651     REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
652 
653     REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
654     REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
655     REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
656     REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurences.
657     REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);
658 
659     //
660     //   OR operator in patterns
661     //
662     REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
663     REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
664     REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
665     REGEX_TESTLM("a|b", "b", TRUE, TRUE);
666 
667     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
668     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
669     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
670     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
671     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
672     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
673 
674     //
675     //  +
676     //
677     REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
678     REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
679     REGEX_TESTLM("b+", "", FALSE, FALSE);
680     REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
681     REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
682     REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
683 
684     //
685     //   ?
686     //
687     REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
688     REGEX_TESTLM("ab?", "a", TRUE, TRUE);
689     REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
690     REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
691     REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
692     REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
693     REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
694     REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
695     REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
696 
697     //
698     //  Escape sequences that become single literal chars, handled internally
699     //   by ICU's Unescape.
700     //
701 
702     // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
703     REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
704     REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
705     REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
706     REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
707     REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
708     REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
709     REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
710     REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
711     REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
712 
713     REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
714     REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input
715 
716     // Escape of special chars in patterns
717     REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
718 }
719 
720 
721 //---------------------------------------------------------------------------
722 //
723 //    UTextBasic   Check for quirks that are specific to the UText
724 //                 implementation.
725 //
726 //---------------------------------------------------------------------------
UTextBasic()727 void RegexTest::UTextBasic() {
728     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
729     UErrorCode status = U_ZERO_ERROR;
730     UText pattern = UTEXT_INITIALIZER;
731     utext_openUTF8(&pattern, str_abc, -1, &status);
732     RegexMatcher matcher(&pattern, 0, status);
733     REGEX_CHECK_STATUS;
734 
735     UText input = UTEXT_INITIALIZER;
736     utext_openUTF8(&input, str_abc, -1, &status);
737     REGEX_CHECK_STATUS;
738     matcher.reset(&input);
739     REGEX_CHECK_STATUS;
740     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
741 
742     matcher.reset(matcher.inputText());
743     REGEX_CHECK_STATUS;
744     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
745 
746     utext_close(&pattern);
747     utext_close(&input);
748 }
749 
750 
751 //---------------------------------------------------------------------------
752 //
753 //      API_Match   Test that the API for class RegexMatcher
754 //                  is present and nominally working, but excluding functions
755 //                  implementing replace operations.
756 //
757 //---------------------------------------------------------------------------
API_Match()758 void RegexTest::API_Match() {
759     UParseError         pe;
760     UErrorCode          status=U_ZERO_ERROR;
761     int32_t             flags = 0;
762 
763     //
764     // Debug - slide failing test cases early
765     //
766 #if 0
767     {
768     }
769     return;
770 #endif
771 
772     //
773     // Simple pattern compilation
774     //
775     {
776         UnicodeString       re("abc");
777         RegexPattern        *pat2;
778         pat2 = RegexPattern::compile(re, flags, pe, status);
779         REGEX_CHECK_STATUS;
780 
781         UnicodeString inStr1 = "abcdef this is a test";
782         UnicodeString instr2 = "not abc";
783         UnicodeString empty  = "";
784 
785 
786         //
787         // Matcher creation and reset.
788         //
789         RegexMatcher *m1 = pat2->matcher(inStr1, status);
790         REGEX_CHECK_STATUS;
791         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
792         REGEX_ASSERT(m1->input() == inStr1);
793         m1->reset(instr2);
794         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
795         REGEX_ASSERT(m1->input() == instr2);
796         m1->reset(inStr1);
797         REGEX_ASSERT(m1->input() == inStr1);
798         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
799         m1->reset(empty);
800         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
801         REGEX_ASSERT(m1->input() == empty);
802         REGEX_ASSERT(&m1->pattern() == pat2);
803 
804         //
805         //  reset(pos, status)
806         //
807         m1->reset(inStr1);
808         m1->reset(4, status);
809         REGEX_CHECK_STATUS;
810         REGEX_ASSERT(m1->input() == inStr1);
811         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
812 
813         m1->reset(-1, status);
814         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
815         status = U_ZERO_ERROR;
816 
817         m1->reset(0, status);
818         REGEX_CHECK_STATUS;
819         status = U_ZERO_ERROR;
820 
821         int32_t len = m1->input().length();
822         m1->reset(len-1, status);
823         REGEX_CHECK_STATUS;
824         status = U_ZERO_ERROR;
825 
826         m1->reset(len, status);
827         REGEX_CHECK_STATUS;
828         status = U_ZERO_ERROR;
829 
830         m1->reset(len+1, status);
831         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
832         status = U_ZERO_ERROR;
833 
834         //
835         // match(pos, status)
836         //
837         m1->reset(instr2);
838         REGEX_ASSERT(m1->matches(4, status) == TRUE);
839         m1->reset();
840         REGEX_ASSERT(m1->matches(3, status) == FALSE);
841         m1->reset();
842         REGEX_ASSERT(m1->matches(5, status) == FALSE);
843         REGEX_ASSERT(m1->matches(4, status) == TRUE);
844         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
845         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
846 
847         // Match() at end of string should fail, but should not
848         //  be an error.
849         status = U_ZERO_ERROR;
850         len = m1->input().length();
851         REGEX_ASSERT(m1->matches(len, status) == FALSE);
852         REGEX_CHECK_STATUS;
853 
854         // Match beyond end of string should fail with an error.
855         status = U_ZERO_ERROR;
856         REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
857         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
858 
859         // Successful match at end of string.
860         {
861             status = U_ZERO_ERROR;
862             RegexMatcher m("A?", 0, status);  // will match zero length string.
863             REGEX_CHECK_STATUS;
864             m.reset(inStr1);
865             len = inStr1.length();
866             REGEX_ASSERT(m.matches(len, status) == TRUE);
867             REGEX_CHECK_STATUS;
868             m.reset(empty);
869             REGEX_ASSERT(m.matches(0, status) == TRUE);
870             REGEX_CHECK_STATUS;
871         }
872 
873 
874         //
875         // lookingAt(pos, status)
876         //
877         status = U_ZERO_ERROR;
878         m1->reset(instr2);  // "not abc"
879         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
880         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
881         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
882         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
883         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
884         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
885         status = U_ZERO_ERROR;
886         len = m1->input().length();
887         REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
888         REGEX_CHECK_STATUS;
889         REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
890         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
891 
892         delete m1;
893         delete pat2;
894     }
895 
896 
897     //
898     // Capture Group.
899     //     RegexMatcher::start();
900     //     RegexMatcher::end();
901     //     RegexMatcher::groupCount();
902     //
903     {
904         int32_t             flags=0;
905         UParseError         pe;
906         UErrorCode          status=U_ZERO_ERROR;
907 
908         UnicodeString       re("01(23(45)67)(.*)");
909         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
910         REGEX_CHECK_STATUS;
911         UnicodeString data = "0123456789";
912 
913         RegexMatcher *matcher = pat->matcher(data, status);
914         REGEX_CHECK_STATUS;
915         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
916         static const int32_t matchStarts[] = {0,  2, 4, 8};
917         static const int32_t matchEnds[]   = {10, 8, 6, 10};
918         int32_t i;
919         for (i=0; i<4; i++) {
920             int32_t actualStart = matcher->start(i, status);
921             REGEX_CHECK_STATUS;
922             if (actualStart != matchStarts[i]) {
923                 errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
924                     __LINE__, i, matchStarts[i], actualStart);
925             }
926             int32_t actualEnd = matcher->end(i, status);
927             REGEX_CHECK_STATUS;
928             if (actualEnd != matchEnds[i]) {
929                 errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
930                     __LINE__, i, matchEnds[i], actualEnd);
931             }
932         }
933 
934         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
935         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
936 
937         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
938         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
939         matcher->reset();
940         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
941 
942         matcher->lookingAt(status);
943         REGEX_ASSERT(matcher->group(status)    == "0123456789");
944         REGEX_ASSERT(matcher->group(0, status) == "0123456789");
945         REGEX_ASSERT(matcher->group(1, status) == "234567"    );
946         REGEX_ASSERT(matcher->group(2, status) == "45"        );
947         REGEX_ASSERT(matcher->group(3, status) == "89"        );
948         REGEX_CHECK_STATUS;
949         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
950         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
951         matcher->reset();
952         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
953 
954         delete matcher;
955         delete pat;
956 
957     }
958 
959     //
960     //  find
961     //
962     {
963         int32_t             flags=0;
964         UParseError         pe;
965         UErrorCode          status=U_ZERO_ERROR;
966 
967         UnicodeString       re("abc");
968         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
969         REGEX_CHECK_STATUS;
970         UnicodeString data = ".abc..abc...abc..";
971         //                    012345678901234567
972 
973         RegexMatcher *matcher = pat->matcher(data, status);
974         REGEX_CHECK_STATUS;
975         REGEX_ASSERT(matcher->find());
976         REGEX_ASSERT(matcher->start(status) == 1);
977         REGEX_ASSERT(matcher->find());
978         REGEX_ASSERT(matcher->start(status) == 6);
979         REGEX_ASSERT(matcher->find());
980         REGEX_ASSERT(matcher->start(status) == 12);
981         REGEX_ASSERT(matcher->find() == FALSE);
982         REGEX_ASSERT(matcher->find() == FALSE);
983 
984         matcher->reset();
985         REGEX_ASSERT(matcher->find());
986         REGEX_ASSERT(matcher->start(status) == 1);
987 
988         REGEX_ASSERT(matcher->find(0, status));
989         REGEX_ASSERT(matcher->start(status) == 1);
990         REGEX_ASSERT(matcher->find(1, status));
991         REGEX_ASSERT(matcher->start(status) == 1);
992         REGEX_ASSERT(matcher->find(2, status));
993         REGEX_ASSERT(matcher->start(status) == 6);
994         REGEX_ASSERT(matcher->find(12, status));
995         REGEX_ASSERT(matcher->start(status) == 12);
996         REGEX_ASSERT(matcher->find(13, status) == FALSE);
997         REGEX_ASSERT(matcher->find(16, status) == FALSE);
998         REGEX_ASSERT(matcher->find(17, status) == FALSE);
999         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
1000 
1001         status = U_ZERO_ERROR;
1002         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1003         status = U_ZERO_ERROR;
1004         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
1005 
1006         REGEX_ASSERT(matcher->groupCount() == 0);
1007 
1008         delete matcher;
1009         delete pat;
1010     }
1011 
1012 
1013     //
1014     //  find, with \G in pattern (true if at the end of a previous match).
1015     //
1016     {
1017         int32_t             flags=0;
1018         UParseError         pe;
1019         UErrorCode          status=U_ZERO_ERROR;
1020 
1021         UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
1022         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1023         REGEX_CHECK_STATUS;
1024         UnicodeString data = ".abcabc.abc..";
1025         //                    012345678901234567
1026 
1027         RegexMatcher *matcher = pat->matcher(data, status);
1028         REGEX_CHECK_STATUS;
1029         REGEX_ASSERT(matcher->find());
1030         REGEX_ASSERT(matcher->start(status) == 0);
1031         REGEX_ASSERT(matcher->start(1, status) == -1);
1032         REGEX_ASSERT(matcher->start(2, status) == 1);
1033 
1034         REGEX_ASSERT(matcher->find());
1035         REGEX_ASSERT(matcher->start(status) == 4);
1036         REGEX_ASSERT(matcher->start(1, status) == 4);
1037         REGEX_ASSERT(matcher->start(2, status) == -1);
1038         REGEX_CHECK_STATUS;
1039 
1040         delete matcher;
1041         delete pat;
1042     }
1043 
1044     //
1045     //   find with zero length matches, match position should bump ahead
1046     //     to prevent loops.
1047     //
1048     {
1049         int32_t                 i;
1050         UErrorCode          status=U_ZERO_ERROR;
1051         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
1052                                                       //   using an always-true look-ahead.
1053         REGEX_CHECK_STATUS;
1054         UnicodeString s("    ");
1055         m.reset(s);
1056         for (i=0; ; i++) {
1057             if (m.find() == FALSE) {
1058                 break;
1059             }
1060             REGEX_ASSERT(m.start(status) == i);
1061             REGEX_ASSERT(m.end(status) == i);
1062         }
1063         REGEX_ASSERT(i==5);
1064 
1065         // Check that the bump goes over surrogate pairs OK
1066         s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1067         s = s.unescape();
1068         m.reset(s);
1069         for (i=0; ; i+=2) {
1070             if (m.find() == FALSE) {
1071                 break;
1072             }
1073             REGEX_ASSERT(m.start(status) == i);
1074             REGEX_ASSERT(m.end(status) == i);
1075         }
1076         REGEX_ASSERT(i==10);
1077     }
1078     {
1079         // find() loop breaking test.
1080         //        with pattern of /.?/, should see a series of one char matches, then a single
1081         //        match of zero length at the end of the input string.
1082         int32_t                 i;
1083         UErrorCode          status=U_ZERO_ERROR;
1084         RegexMatcher        m(".?", 0, status);
1085         REGEX_CHECK_STATUS;
1086         UnicodeString s("    ");
1087         m.reset(s);
1088         for (i=0; ; i++) {
1089             if (m.find() == FALSE) {
1090                 break;
1091             }
1092             REGEX_ASSERT(m.start(status) == i);
1093             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1094         }
1095         REGEX_ASSERT(i==5);
1096     }
1097 
1098 
1099     //
1100     // Matchers with no input string behave as if they had an empty input string.
1101     //
1102 
1103     {
1104         UErrorCode status = U_ZERO_ERROR;
1105         RegexMatcher  m(".?", 0, status);
1106         REGEX_CHECK_STATUS;
1107         REGEX_ASSERT(m.find());
1108         REGEX_ASSERT(m.start(status) == 0);
1109         REGEX_ASSERT(m.input() == "");
1110     }
1111     {
1112         UErrorCode status = U_ZERO_ERROR;
1113         RegexPattern  *p = RegexPattern::compile(".", 0, status);
1114         RegexMatcher  *m = p->matcher(status);
1115         REGEX_CHECK_STATUS;
1116 
1117         REGEX_ASSERT(m->find() == FALSE);
1118         REGEX_ASSERT(m->input() == "");
1119         delete m;
1120         delete p;
1121     }
1122 
1123     //
1124     // Regions
1125     //
1126     {
1127         UErrorCode status = U_ZERO_ERROR;
1128         UnicodeString testString("This is test data");
1129         RegexMatcher m(".*", testString,  0, status);
1130         REGEX_CHECK_STATUS;
1131         REGEX_ASSERT(m.regionStart() == 0);
1132         REGEX_ASSERT(m.regionEnd() == testString.length());
1133         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1134         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1135 
1136         m.region(2,4, status);
1137         REGEX_CHECK_STATUS;
1138         REGEX_ASSERT(m.matches(status));
1139         REGEX_ASSERT(m.start(status)==2);
1140         REGEX_ASSERT(m.end(status)==4);
1141         REGEX_CHECK_STATUS;
1142 
1143         m.reset();
1144         REGEX_ASSERT(m.regionStart() == 0);
1145         REGEX_ASSERT(m.regionEnd() == testString.length());
1146 
1147         UnicodeString shorterString("short");
1148         m.reset(shorterString);
1149         REGEX_ASSERT(m.regionStart() == 0);
1150         REGEX_ASSERT(m.regionEnd() == shorterString.length());
1151 
1152         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1153         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1154         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1155         REGEX_ASSERT(&m == &m.reset());
1156         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1157 
1158         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1159         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1160         REGEX_ASSERT(&m == &m.reset());
1161         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1162 
1163         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1164         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1165         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1166         REGEX_ASSERT(&m == &m.reset());
1167         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1168 
1169         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1170         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1171         REGEX_ASSERT(&m == &m.reset());
1172         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1173 
1174     }
1175 
1176     //
1177     // hitEnd() and requireEnd()
1178     //
1179     {
1180         UErrorCode status = U_ZERO_ERROR;
1181         UnicodeString testString("aabb");
1182         RegexMatcher m1(".*", testString,  0, status);
1183         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1184         REGEX_ASSERT(m1.hitEnd() == TRUE);
1185         REGEX_ASSERT(m1.requireEnd() == FALSE);
1186         REGEX_CHECK_STATUS;
1187 
1188         status = U_ZERO_ERROR;
1189         RegexMatcher m2("a*", testString, 0, status);
1190         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1191         REGEX_ASSERT(m2.hitEnd() == FALSE);
1192         REGEX_ASSERT(m2.requireEnd() == FALSE);
1193         REGEX_CHECK_STATUS;
1194 
1195         status = U_ZERO_ERROR;
1196         RegexMatcher m3(".*$", testString, 0, status);
1197         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1198         REGEX_ASSERT(m3.hitEnd() == TRUE);
1199         REGEX_ASSERT(m3.requireEnd() == TRUE);
1200         REGEX_CHECK_STATUS;
1201     }
1202 
1203 
1204     //
1205     // Compilation error on reset with UChar *
1206     //   These were a hazard that people were stumbling over with runtime errors.
1207     //   Changed them to compiler errors by adding private methods that more closely
1208     //   matched the incorrect use of the functions.
1209     //
1210 #if 0
1211     {
1212         UErrorCode status = U_ZERO_ERROR;
1213         UChar ucharString[20];
1214         RegexMatcher m(".", 0, status);
1215         m.reset(ucharString);  // should not compile.
1216 
1217         RegexPattern *p = RegexPattern::compile(".", 0, status);
1218         RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
1219 
1220         RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
1221     }
1222 #endif
1223 
1224     //
1225     //  Time Outs.
1226     //       Note:  These tests will need to be changed when the regexp engine is
1227     //              able to detect and cut short the exponential time behavior on
1228     //              this type of match.
1229     //
1230     {
1231         UErrorCode status = U_ZERO_ERROR;
1232         //    Enough 'a's in the string to cause the match to time out.
1233         //       (Each additional 'a' doubles the time)
1234         UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1235         RegexMatcher matcher("(a+)+b", testString, 0, status);
1236         REGEX_CHECK_STATUS;
1237         REGEX_ASSERT(matcher.getTimeLimit() == 0);
1238         matcher.setTimeLimit(100, status);
1239         REGEX_ASSERT(matcher.getTimeLimit() == 100);
1240         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1241         REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1242     }
1243     {
1244         UErrorCode status = U_ZERO_ERROR;
1245         //   Few enough 'a's to slip in under the time limit.
1246         UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1247         RegexMatcher matcher("(a+)+b", testString, 0, status);
1248         REGEX_CHECK_STATUS;
1249         matcher.setTimeLimit(100, status);
1250         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1251         REGEX_CHECK_STATUS;
1252     }
1253 
1254     //
1255     //  Stack Limits
1256     //
1257     {
1258         UErrorCode status = U_ZERO_ERROR;
1259         UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
1260 
1261         // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1262         //   of the '+', and makes the stack frames larger.
1263         RegexMatcher matcher("(A)+A$", testString, 0, status);
1264 
1265         // With the default stack, this match should fail to run
1266         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1267         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1268 
1269         // With unlimited stack, it should run
1270         status = U_ZERO_ERROR;
1271         matcher.setStackLimit(0, status);
1272         REGEX_CHECK_STATUS;
1273         REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1274         REGEX_CHECK_STATUS;
1275         REGEX_ASSERT(matcher.getStackLimit() == 0);
1276 
1277         // With a limited stack, the match should fail
1278         status = U_ZERO_ERROR;
1279         matcher.setStackLimit(10000, status);
1280         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1281         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1282         REGEX_ASSERT(matcher.getStackLimit() == 10000);
1283     }
1284 
1285         // A pattern that doesn't save state should work with
1286         //   a minimal sized stack
1287     {
1288         UErrorCode status = U_ZERO_ERROR;
1289         UnicodeString testString = "abc";
1290         RegexMatcher matcher("abc", testString, 0, status);
1291         REGEX_CHECK_STATUS;
1292         matcher.setStackLimit(30, status);
1293         REGEX_CHECK_STATUS;
1294         REGEX_ASSERT(matcher.matches(status) == TRUE);
1295         REGEX_CHECK_STATUS;
1296         REGEX_ASSERT(matcher.getStackLimit() == 30);
1297 
1298         // Negative stack sizes should fail
1299         status = U_ZERO_ERROR;
1300         matcher.setStackLimit(1000, status);
1301         REGEX_CHECK_STATUS;
1302         matcher.setStackLimit(-1, status);
1303         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1304         REGEX_ASSERT(matcher.getStackLimit() == 1000);
1305     }
1306 
1307 
1308 }
1309 
1310 
1311 
1312 
1313 
1314 
1315 //---------------------------------------------------------------------------
1316 //
1317 //      API_Replace        API test for class RegexMatcher, testing the
1318 //                         Replace family of functions.
1319 //
1320 //---------------------------------------------------------------------------
API_Replace()1321 void RegexTest::API_Replace() {
1322     //
1323     //  Replace
1324     //
1325     int32_t             flags=0;
1326     UParseError         pe;
1327     UErrorCode          status=U_ZERO_ERROR;
1328 
1329     UnicodeString       re("abc");
1330     RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1331     REGEX_CHECK_STATUS;
1332     UnicodeString data = ".abc..abc...abc..";
1333     //                    012345678901234567
1334     RegexMatcher *matcher = pat->matcher(data, status);
1335 
1336     //
1337     //  Plain vanilla matches.
1338     //
1339     UnicodeString  dest;
1340     dest = matcher->replaceFirst("yz", status);
1341     REGEX_CHECK_STATUS;
1342     REGEX_ASSERT(dest == ".yz..abc...abc..");
1343 
1344     dest = matcher->replaceAll("yz", status);
1345     REGEX_CHECK_STATUS;
1346     REGEX_ASSERT(dest == ".yz..yz...yz..");
1347 
1348     //
1349     //  Plain vanilla non-matches.
1350     //
1351     UnicodeString d2 = ".abx..abx...abx..";
1352     matcher->reset(d2);
1353     dest = matcher->replaceFirst("yz", status);
1354     REGEX_CHECK_STATUS;
1355     REGEX_ASSERT(dest == ".abx..abx...abx..");
1356 
1357     dest = matcher->replaceAll("yz", status);
1358     REGEX_CHECK_STATUS;
1359     REGEX_ASSERT(dest == ".abx..abx...abx..");
1360 
1361     //
1362     // Empty source string
1363     //
1364     UnicodeString d3 = "";
1365     matcher->reset(d3);
1366     dest = matcher->replaceFirst("yz", status);
1367     REGEX_CHECK_STATUS;
1368     REGEX_ASSERT(dest == "");
1369 
1370     dest = matcher->replaceAll("yz", status);
1371     REGEX_CHECK_STATUS;
1372     REGEX_ASSERT(dest == "");
1373 
1374     //
1375     // Empty substitution string
1376     //
1377     matcher->reset(data);              // ".abc..abc...abc.."
1378     dest = matcher->replaceFirst("", status);
1379     REGEX_CHECK_STATUS;
1380     REGEX_ASSERT(dest == "...abc...abc..");
1381 
1382     dest = matcher->replaceAll("", status);
1383     REGEX_CHECK_STATUS;
1384     REGEX_ASSERT(dest == "........");
1385 
1386     //
1387     // match whole string
1388     //
1389     UnicodeString d4 = "abc";
1390     matcher->reset(d4);
1391     dest = matcher->replaceFirst("xyz", status);
1392     REGEX_CHECK_STATUS;
1393     REGEX_ASSERT(dest == "xyz");
1394 
1395     dest = matcher->replaceAll("xyz", status);
1396     REGEX_CHECK_STATUS;
1397     REGEX_ASSERT(dest == "xyz");
1398 
1399     //
1400     // Capture Group, simple case
1401     //
1402     UnicodeString       re2("a(..)");
1403     RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1404     REGEX_CHECK_STATUS;
1405     UnicodeString d5 = "abcdefg";
1406     RegexMatcher *matcher2 = pat2->matcher(d5, status);
1407     REGEX_CHECK_STATUS;
1408     dest = matcher2->replaceFirst("$1$1", status);
1409     REGEX_CHECK_STATUS;
1410     REGEX_ASSERT(dest == "bcbcdefg");
1411 
1412     dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1413     REGEX_CHECK_STATUS;
1414     REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1415 
1416     dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1417     REGEX_ASSERT(U_FAILURE(status));
1418     status = U_ZERO_ERROR;
1419 
1420     UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1421     replacement = replacement.unescape();
1422     dest = matcher2->replaceFirst(replacement, status);
1423     REGEX_CHECK_STATUS;
1424     REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1425 
1426     REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1427 
1428 
1429     //
1430     // Replacement String with \u hex escapes
1431     //
1432     {
1433         UnicodeString  src = "abc 1 abc 2 abc 3";
1434         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1435         matcher->reset(src);
1436         UnicodeString  result = matcher->replaceAll(substitute, status);
1437         REGEX_CHECK_STATUS;
1438         REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1439     }
1440     {
1441         UnicodeString  src = "abc !";
1442         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1443         matcher->reset(src);
1444         UnicodeString  result = matcher->replaceAll(substitute, status);
1445         REGEX_CHECK_STATUS;
1446         UnicodeString expected = UnicodeString("--");
1447         expected.append((UChar32)0x10000);
1448         expected.append("-- !");
1449         REGEX_ASSERT(result == expected);
1450     }
1451     // TODO:  need more through testing of capture substitutions.
1452 
1453     // Bug 4057
1454     //
1455     {
1456         status = U_ZERO_ERROR;
1457         UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1458         RegexMatcher m("ss(.*?)ee", 0, status);
1459         REGEX_CHECK_STATUS;
1460         UnicodeString result;
1461 
1462         // Multiple finds do NOT bump up the previous appendReplacement postion.
1463         m.reset(s);
1464         m.find();
1465         m.find();
1466         m.appendReplacement(result, "ooh", status);
1467         REGEX_CHECK_STATUS;
1468         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1469 
1470         // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1471         status = U_ZERO_ERROR;
1472         result.truncate(0);
1473         m.reset(10, status);
1474         m.find();
1475         m.find();
1476         m.appendReplacement(result, "ooh", status);
1477         REGEX_CHECK_STATUS;
1478         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1479 
1480         // find() at interior of string, appendReplacemnt still starts at beginning.
1481         status = U_ZERO_ERROR;
1482         result.truncate(0);
1483         m.reset();
1484         m.find(10, status);
1485         m.find();
1486         m.appendReplacement(result, "ooh", status);
1487         REGEX_CHECK_STATUS;
1488         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1489 
1490         m.appendTail(result);
1491         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1492 
1493     }
1494 
1495     delete matcher2;
1496     delete pat2;
1497     delete matcher;
1498     delete pat;
1499 }
1500 
1501 
1502 //---------------------------------------------------------------------------
1503 //
1504 //      API_Pattern       Test that the API for class RegexPattern is
1505 //                        present and nominally working.
1506 //
1507 //---------------------------------------------------------------------------
API_Pattern()1508 void RegexTest::API_Pattern() {
1509     RegexPattern        pata;    // Test default constructor to not crash.
1510     RegexPattern        patb;
1511 
1512     REGEX_ASSERT(pata == patb);
1513     REGEX_ASSERT(pata == pata);
1514 
1515     UnicodeString re1("abc[a-l][m-z]");
1516     UnicodeString re2("def");
1517     UErrorCode    status = U_ZERO_ERROR;
1518     UParseError   pe;
1519 
1520     RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
1521     RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
1522     REGEX_CHECK_STATUS;
1523     REGEX_ASSERT(*pat1 == *pat1);
1524     REGEX_ASSERT(*pat1 != pata);
1525 
1526     // Assign
1527     patb = *pat1;
1528     REGEX_ASSERT(patb == *pat1);
1529 
1530     // Copy Construct
1531     RegexPattern patc(*pat1);
1532     REGEX_ASSERT(patc == *pat1);
1533     REGEX_ASSERT(patb == patc);
1534     REGEX_ASSERT(pat1 != pat2);
1535     patb = *pat2;
1536     REGEX_ASSERT(patb != patc);
1537     REGEX_ASSERT(patb == *pat2);
1538 
1539     // Compile with no flags.
1540     RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
1541     REGEX_ASSERT(*pat1a == *pat1);
1542 
1543     REGEX_ASSERT(pat1a->flags() == 0);
1544 
1545     // Compile with different flags should be not equal
1546     RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1547     REGEX_CHECK_STATUS;
1548 
1549     REGEX_ASSERT(*pat1b != *pat1a);
1550     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1551     REGEX_ASSERT(pat1a->flags() == 0);
1552     delete pat1b;
1553 
1554     // clone
1555     RegexPattern *pat1c = pat1->clone();
1556     REGEX_ASSERT(*pat1c == *pat1);
1557     REGEX_ASSERT(*pat1c != *pat2);
1558 
1559     delete pat1c;
1560     delete pat1a;
1561     delete pat1;
1562     delete pat2;
1563 
1564 
1565     //
1566     //   Verify that a matcher created from a cloned pattern works.
1567     //     (Jitterbug 3423)
1568     //
1569     {
1570         UErrorCode     status     = U_ZERO_ERROR;
1571         RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1572         RegexPattern  *pClone     = pSource->clone();
1573         delete         pSource;
1574         RegexMatcher  *mFromClone = pClone->matcher(status);
1575         REGEX_CHECK_STATUS;
1576         UnicodeString s = "Hello World";
1577         mFromClone->reset(s);
1578         REGEX_ASSERT(mFromClone->find() == TRUE);
1579         REGEX_ASSERT(mFromClone->group(status) == "Hello");
1580         REGEX_ASSERT(mFromClone->find() == TRUE);
1581         REGEX_ASSERT(mFromClone->group(status) == "World");
1582         REGEX_ASSERT(mFromClone->find() == FALSE);
1583         delete mFromClone;
1584         delete pClone;
1585     }
1586 
1587     //
1588     //   matches convenience API
1589     //
1590     REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1591     REGEX_CHECK_STATUS;
1592     REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1593     REGEX_CHECK_STATUS;
1594     REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1595     REGEX_CHECK_STATUS;
1596     REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1597     REGEX_CHECK_STATUS;
1598     REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1599     REGEX_CHECK_STATUS;
1600     status = U_INDEX_OUTOFBOUNDS_ERROR;
1601     REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1602     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1603 
1604 
1605     //
1606     // Split()
1607     //
1608     status = U_ZERO_ERROR;
1609     pat1 = RegexPattern::compile(" +",  pe, status);
1610     REGEX_CHECK_STATUS;
1611     UnicodeString  fields[10];
1612 
1613     int32_t n;
1614     n = pat1->split("Now is the time", fields, 10, status);
1615     REGEX_CHECK_STATUS;
1616     REGEX_ASSERT(n==4);
1617     REGEX_ASSERT(fields[0]=="Now");
1618     REGEX_ASSERT(fields[1]=="is");
1619     REGEX_ASSERT(fields[2]=="the");
1620     REGEX_ASSERT(fields[3]=="time");
1621     REGEX_ASSERT(fields[4]=="");
1622 
1623     n = pat1->split("Now is the time", fields, 2, status);
1624     REGEX_CHECK_STATUS;
1625     REGEX_ASSERT(n==2);
1626     REGEX_ASSERT(fields[0]=="Now");
1627     REGEX_ASSERT(fields[1]=="is the time");
1628     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
1629 
1630     fields[1] = "*";
1631     status = U_ZERO_ERROR;
1632     n = pat1->split("Now is the time", fields, 1, status);
1633     REGEX_CHECK_STATUS;
1634     REGEX_ASSERT(n==1);
1635     REGEX_ASSERT(fields[0]=="Now is the time");
1636     REGEX_ASSERT(fields[1]=="*");
1637     status = U_ZERO_ERROR;
1638 
1639     n = pat1->split("    Now       is the time   ", fields, 10, status);
1640     REGEX_CHECK_STATUS;
1641     REGEX_ASSERT(n==6);
1642     REGEX_ASSERT(fields[0]=="");
1643     REGEX_ASSERT(fields[1]=="Now");
1644     REGEX_ASSERT(fields[2]=="is");
1645     REGEX_ASSERT(fields[3]=="the");
1646     REGEX_ASSERT(fields[4]=="time");
1647     REGEX_ASSERT(fields[5]=="");
1648 
1649     n = pat1->split("     ", fields, 10, status);
1650     REGEX_CHECK_STATUS;
1651     REGEX_ASSERT(n==2);
1652     REGEX_ASSERT(fields[0]=="");
1653     REGEX_ASSERT(fields[1]=="");
1654 
1655     fields[0] = "foo";
1656     n = pat1->split("", fields, 10, status);
1657     REGEX_CHECK_STATUS;
1658     REGEX_ASSERT(n==0);
1659     REGEX_ASSERT(fields[0]=="foo");
1660 
1661     delete pat1;
1662 
1663     //  split, with a pattern with (capture)
1664     pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
1665     REGEX_CHECK_STATUS;
1666 
1667     status = U_ZERO_ERROR;
1668     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1669     REGEX_CHECK_STATUS;
1670     REGEX_ASSERT(n==7);
1671     REGEX_ASSERT(fields[0]=="");
1672     REGEX_ASSERT(fields[1]=="a");
1673     REGEX_ASSERT(fields[2]=="Now is ");
1674     REGEX_ASSERT(fields[3]=="b");
1675     REGEX_ASSERT(fields[4]=="the time");
1676     REGEX_ASSERT(fields[5]=="c");
1677     REGEX_ASSERT(fields[6]=="");
1678     REGEX_ASSERT(status==U_ZERO_ERROR);
1679 
1680     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
1681     REGEX_CHECK_STATUS;
1682     REGEX_ASSERT(n==7);
1683     REGEX_ASSERT(fields[0]=="  ");
1684     REGEX_ASSERT(fields[1]=="a");
1685     REGEX_ASSERT(fields[2]=="Now is ");
1686     REGEX_ASSERT(fields[3]=="b");
1687     REGEX_ASSERT(fields[4]=="the time");
1688     REGEX_ASSERT(fields[5]=="c");
1689     REGEX_ASSERT(fields[6]=="");
1690 
1691     status = U_ZERO_ERROR;
1692     fields[6] = "foo";
1693     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
1694     REGEX_CHECK_STATUS;
1695     REGEX_ASSERT(n==6);
1696     REGEX_ASSERT(fields[0]=="  ");
1697     REGEX_ASSERT(fields[1]=="a");
1698     REGEX_ASSERT(fields[2]=="Now is ");
1699     REGEX_ASSERT(fields[3]=="b");
1700     REGEX_ASSERT(fields[4]=="the time");
1701     REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
1702     REGEX_ASSERT(fields[6]=="foo");
1703 
1704     status = U_ZERO_ERROR;
1705     fields[5] = "foo";
1706     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
1707     REGEX_CHECK_STATUS;
1708     REGEX_ASSERT(n==5);
1709     REGEX_ASSERT(fields[0]=="  ");
1710     REGEX_ASSERT(fields[1]=="a");
1711     REGEX_ASSERT(fields[2]=="Now is ");
1712     REGEX_ASSERT(fields[3]=="b");
1713     REGEX_ASSERT(fields[4]=="the time<c>");
1714     REGEX_ASSERT(fields[5]=="foo");
1715 
1716     status = U_ZERO_ERROR;
1717     fields[5] = "foo";
1718     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
1719     REGEX_CHECK_STATUS;
1720     REGEX_ASSERT(n==5);
1721     REGEX_ASSERT(fields[0]=="  ");
1722     REGEX_ASSERT(fields[1]=="a");
1723     REGEX_ASSERT(fields[2]=="Now is ");
1724     REGEX_ASSERT(fields[3]=="b");
1725     REGEX_ASSERT(fields[4]=="the time");
1726     REGEX_ASSERT(fields[5]=="foo");
1727 
1728     status = U_ZERO_ERROR;
1729     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
1730     REGEX_CHECK_STATUS;
1731     REGEX_ASSERT(n==4);
1732     REGEX_ASSERT(fields[0]=="  ");
1733     REGEX_ASSERT(fields[1]=="a");
1734     REGEX_ASSERT(fields[2]=="Now is ");
1735     REGEX_ASSERT(fields[3]=="the time<c>");
1736     status = U_ZERO_ERROR;
1737     delete pat1;
1738 
1739     pat1 = RegexPattern::compile("([-,])",  pe, status);
1740     REGEX_CHECK_STATUS;
1741     n = pat1->split("1-10,20", fields, 10, status);
1742     REGEX_CHECK_STATUS;
1743     REGEX_ASSERT(n==5);
1744     REGEX_ASSERT(fields[0]=="1");
1745     REGEX_ASSERT(fields[1]=="-");
1746     REGEX_ASSERT(fields[2]=="10");
1747     REGEX_ASSERT(fields[3]==",");
1748     REGEX_ASSERT(fields[4]=="20");
1749     delete pat1;
1750 
1751     // Test split of string with empty trailing fields
1752     pat1 = RegexPattern::compile(",", pe, status);
1753     REGEX_CHECK_STATUS;
1754     n = pat1->split("a,b,c,", fields, 10, status);
1755     REGEX_CHECK_STATUS;
1756     REGEX_ASSERT(n==4);
1757     REGEX_ASSERT(fields[0]=="a");
1758     REGEX_ASSERT(fields[1]=="b");
1759     REGEX_ASSERT(fields[2]=="c");
1760     REGEX_ASSERT(fields[3]=="");
1761 
1762     n = pat1->split("a,,,", fields, 10, status);
1763     REGEX_CHECK_STATUS;
1764     REGEX_ASSERT(n==4);
1765     REGEX_ASSERT(fields[0]=="a");
1766     REGEX_ASSERT(fields[1]=="");
1767     REGEX_ASSERT(fields[2]=="");
1768     REGEX_ASSERT(fields[3]=="");
1769     delete pat1;
1770 
1771     // Split Separator with zero length match.
1772     pat1 = RegexPattern::compile(":?", pe, status);
1773     REGEX_CHECK_STATUS;
1774     n = pat1->split("abc", fields, 10, status);
1775     REGEX_CHECK_STATUS;
1776     REGEX_ASSERT(n==5);
1777     REGEX_ASSERT(fields[0]=="");
1778     REGEX_ASSERT(fields[1]=="a");
1779     REGEX_ASSERT(fields[2]=="b");
1780     REGEX_ASSERT(fields[3]=="c");
1781     REGEX_ASSERT(fields[4]=="");
1782 
1783     delete pat1;
1784 
1785     //
1786     // RegexPattern::pattern()
1787     //
1788     pat1 = new RegexPattern();
1789     REGEX_ASSERT(pat1->pattern() == "");
1790     delete pat1;
1791 
1792     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1793     REGEX_CHECK_STATUS;
1794     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1795     delete pat1;
1796 
1797 
1798     //
1799     // classID functions
1800     //
1801     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1802     REGEX_CHECK_STATUS;
1803     REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1804     REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1805     UnicodeString Hello("Hello, world.");
1806     RegexMatcher *m = pat1->matcher(Hello, status);
1807     REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1808     REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1809     REGEX_ASSERT(m->getDynamicClassID() != NULL);
1810     delete m;
1811     delete pat1;
1812 
1813 }
1814 
1815 //---------------------------------------------------------------------------
1816 //
1817 //      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
1818 //                       is present and working, but excluding functions
1819 //                       implementing replace operations.
1820 //
1821 //---------------------------------------------------------------------------
// API_Match_UTF8
//
//   Exercises the matching (non-replace) RegexPattern / RegexMatcher functions
//   with patterns and input supplied as UText, mostly opened over UTF-8 bytes.
//   The function is a sequence of independent brace-scoped sections:
//     - simple compile, reset(), reset(pos), matches(pos), lookingAt(pos)
//     - capture groups: start(), end(), group()
//     - find() and find(pos)
//     - find() with \G in the pattern
//     - find() with zero-length matches (position must advance)
//     - matchers that were never given an input
//     - regions, anchoring bounds and transparent bounds
//     - hitEnd() and requireEnd()
//   The first section uses the function-level pe/status/flags; most later
//   sections declare their own, shadowing the outer ones.
//
//   REGEX_ASSERT*, REGEX_CHECK_STATUS and REGEX_VERBOSE_TEXT are macros defined
//   outside this part of the file.
//   NOTE(review): if REGEX_CHECK_STATUS returns from the function on failure,
//   the heap-allocated patterns/matchers and open UTexts of the current section
//   are not released on that path - confirm against the macro definition.
API_Match_UTF8()1822 void RegexTest::API_Match_UTF8() {
1823     UParseError         pe;
1824     UErrorCode          status=U_ZERO_ERROR;
1825     int32_t             flags = 0;
1826 
1827     //
1828     // Debug - slide failing test cases early
1829     //
1830 #if 0
1831     {
1832     }
1833     return;
1834 #endif
1835 
1836     //
1837     // Simple pattern compilation
1838     //
1839     {
1840         UText               re = UTEXT_INITIALIZER;
1841         regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1842         REGEX_VERBOSE_TEXT(&re);
1843         RegexPattern        *pat2;
1844         pat2 = RegexPattern::compile(&re, flags, pe, status);
1845         REGEX_CHECK_STATUS;
1846 
1847         UText input1 = UTEXT_INITIALIZER;
1848         UText input2 = UTEXT_INITIALIZER;
1849         UText empty  = UTEXT_INITIALIZER;
1850         regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1851         REGEX_VERBOSE_TEXT(&input1);
1852         regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1853         REGEX_VERBOSE_TEXT(&input2);
             // Zero-length UTF-16 text; serves as the empty-input case below.
1854         utext_openUChars(&empty, NULL, 0, &status);
1855 
             // Lengths are computed with strlen() from the same literals that were
             // passed to regextst_openUTF8FromInvariant() above.
1856         int32_t input1Len = static_cast<int32_t>(strlen("abcdef this is a test")); /* TODO: why not nativelen (input1) ? */
1857         int32_t input2Len = static_cast<int32_t>(strlen("not abc"));
1858 
1859 
1860         //
1861         // Matcher creation and reset.
1862         //
             // matcher(status) creates a matcher without input; reset(&input1) attaches
             // the text and its result is taken by address, so m1 is the pointer that
             // is deleted at the end of this section.
1863         RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1864         REGEX_CHECK_STATUS;
1865         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1866         const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1867         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1868         m1->reset(&input2);
1869         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1870         const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1871         REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1872         m1->reset(&input1);
1873         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1874         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1875         m1->reset(&empty);
1876         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1877         REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1878 
1879         //
1880         //  reset(pos, status)
1881         //
             //  Positions 0 .. input1Len (inclusive) are expected to be accepted;
             //  -1 and input1Len+1 are expected to set U_INDEX_OUTOFBOUNDS_ERROR.
1882         m1->reset(&input1);
1883         m1->reset(4, status);
1884         REGEX_CHECK_STATUS;
1885         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1886         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1887 
1888         m1->reset(-1, status);
1889         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1890         status = U_ZERO_ERROR;
1891 
1892         m1->reset(0, status);
1893         REGEX_CHECK_STATUS;
1894         status = U_ZERO_ERROR;
1895 
1896         m1->reset(input1Len-1, status);
1897         REGEX_CHECK_STATUS;
1898         status = U_ZERO_ERROR;
1899 
1900         m1->reset(input1Len, status);
1901         REGEX_CHECK_STATUS;
1902         status = U_ZERO_ERROR;
1903 
1904         m1->reset(input1Len+1, status);
1905         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1906         status = U_ZERO_ERROR;
1907 
1908         //
1909         // match(pos, status)
1910         //
             // input2 is "not abc": the pattern "abc" is expected to match only when
             // the match is started at index 4.
1911         m1->reset(&input2);
1912         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1913         m1->reset();
1914         REGEX_ASSERT(m1->matches(3, status) == FALSE);
1915         m1->reset();
1916         REGEX_ASSERT(m1->matches(5, status) == FALSE);
1917         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1918         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1919         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1920 
1921         // Match() at end of string should fail, but should not
1922         //  be an error.
1923         status = U_ZERO_ERROR;
1924         REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1925         REGEX_CHECK_STATUS;
1926 
1927         // Match beyond end of string should fail with an error.
1928         status = U_ZERO_ERROR;
1929         REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1930         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1931 
1932         // Successful match at end of string.
1933         {
1934             status = U_ZERO_ERROR;
1935             RegexMatcher m("A?", 0, status);  // will match zero length string.
1936             REGEX_CHECK_STATUS;
1937             m.reset(&input1);
1938             REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1939             REGEX_CHECK_STATUS;
1940             m.reset(&empty);
1941             REGEX_ASSERT(m.matches(0, status) == TRUE);
1942             REGEX_CHECK_STATUS;
1943         }
1944 
1945 
1946         //
1947         // lookingAt(pos, status)
1948         //
             // Same position checks as for matches(pos) above, including the
             // out-of-bounds cases at -1 and input2Len+1.
1949         status = U_ZERO_ERROR;
1950         m1->reset(&input2);  // "not abc"
1951         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1952         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1953         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1954         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1955         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1956         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1957         status = U_ZERO_ERROR;
1958         REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1959         REGEX_CHECK_STATUS;
1960         REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1961         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1962 
             // Release the matcher and pattern before closing the UTexts they were using.
1963         delete m1;
1964         delete pat2;
1965 
1966         utext_close(&re);
1967         utext_close(&input1);
1968         utext_close(&input2);
1969         utext_close(&empty);
1970     }
1971 
1972 
1973     //
1974     // Capture Group.
1975     //     RegexMatcher::start();
1976     //     RegexMatcher::end();
1977     //     RegexMatcher::groupCount();
1978     //
1979     {
             // These locals shadow the function-level flags/pe/status.
1980         int32_t             flags=0;
1981         UParseError         pe;
1982         UErrorCode          status=U_ZERO_ERROR;
1983         UText               re=UTEXT_INITIALIZER;
1984         const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1985         utext_openUTF8(&re, str_01234567_pat, -1, &status);
1986 
1987         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1988         REGEX_CHECK_STATUS;
1989 
1990         UText input = UTEXT_INITIALIZER;
1991         const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1992         utext_openUTF8(&input, str_0123456789, -1, &status);
1993 
1994         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
1995         REGEX_CHECK_STATUS;
1996         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
             // Expected start/end offsets, indexed by group number 0..3
             // (group 0 is the whole match).
1997         static const int32_t matchStarts[] = {0,  2, 4, 8};
1998         static const int32_t matchEnds[]   = {10, 8, 6, 10};
1999         int32_t i;
2000         for (i=0; i<4; i++) {
2001             int32_t actualStart = matcher->start(i, status);
2002             REGEX_CHECK_STATUS;
2003             if (actualStart != matchStarts[i]) {
2004                 errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
2005                       __FILE__, __LINE__, i, matchStarts[i], actualStart);
2006             }
2007             int32_t actualEnd = matcher->end(i, status);
2008             REGEX_CHECK_STATUS;
2009             if (actualEnd != matchEnds[i]) {
2010                 errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
2011                       __FILE__, __LINE__, i, matchEnds[i], actualEnd);
2012             }
2013         }
2014 
             // The no-argument forms must agree with group 0.
2015         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
2016         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
2017 
             // Invalid group numbers, and querying a match after reset(), must fail.
2018         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2019         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2020         matcher->reset();
2021         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
2022 
             // Re-establish a match for the group() checks that follow.
2023         matcher->lookingAt(status);
2024 
2025         UnicodeString dest;
2026         UText destText = UTEXT_INITIALIZER;
2027         utext_openUnicodeString(&destText, &dest, &status);
2028         UText *result;
2029         //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2030         //  Test shallow-clone API
             //  With a NULL destination the returned UText is closed by this test;
             //  with &destText the result is asserted to be &destText itself.
2031         int64_t   group_len;
2032         result = matcher->group((UText *)NULL, group_len, status);
2033         REGEX_CHECK_STATUS;
2034         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2035         utext_close(result);
2036         result = matcher->group(0, &destText, group_len, status);
2037         REGEX_CHECK_STATUS;
2038         REGEX_ASSERT(result == &destText);
2039         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2040         //  destText is now immutable, reopen it
2041         utext_close(&destText);
2042         utext_openUnicodeString(&destText, &dest, &status);
2043 
2044         int64_t length;
2045         result = matcher->group(0, NULL, length, status);
2046         REGEX_CHECK_STATUS;
2047         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2048         utext_close(result);
2049         result = matcher->group(0, &destText, length, status);
2050         REGEX_CHECK_STATUS;
2051         REGEX_ASSERT(result == &destText);
2052         REGEX_ASSERT(utext_getNativeIndex(result) == 0);
2053         REGEX_ASSERT(length == 10);
2054         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2055 
             // For each group below the assertions expect: the native index of the
             // returned UText equals the group's start offset, `length` equals the
             // group's length, and the text is compared against the full input string.
2056         // Capture Group 1 == "234567"
2057         result = matcher->group(1, NULL, length, status);
2058         REGEX_CHECK_STATUS;
2059         REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2060         REGEX_ASSERT(length == 6);
2061         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2062         utext_close(result);
2063 
2064         result = matcher->group(1, &destText, length, status);
2065         REGEX_CHECK_STATUS;
2066         REGEX_ASSERT(result == &destText);
2067         REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2068         REGEX_ASSERT(length == 6);
2069         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
             // result == &destText here, so this closes destText; it is passed to
             // group() again below without being explicitly reopened.
2070         utext_close(result);
2071 
2072         // Capture Group 2 == "45"
2073         result = matcher->group(2, NULL, length, status);
2074         REGEX_CHECK_STATUS;
2075         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2076         REGEX_ASSERT(length == 2);
2077         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2078         utext_close(result);
2079 
2080         result = matcher->group(2, &destText, length, status);
2081         REGEX_CHECK_STATUS;
2082         REGEX_ASSERT(result == &destText);
2083         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2084         REGEX_ASSERT(length == 2);
2085         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2086         utext_close(result);
2087 
2088         // Capture Group 3 == "89"
2089         result = matcher->group(3, NULL, length, status);
2090         REGEX_CHECK_STATUS;
2091         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2092         REGEX_ASSERT(length == 2);
2093         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2094         utext_close(result);
2095 
2096         result = matcher->group(3, &destText, length, status);
2097         REGEX_CHECK_STATUS;
2098         REGEX_ASSERT(result == &destText);
2099         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2100         REGEX_ASSERT(length == 2);
2101         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2102         utext_close(result);
2103 
2104         // Capture Group number out of range.
2105         status = U_ZERO_ERROR;
2106         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2107         status = U_ZERO_ERROR;
2108         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2109         status = U_ZERO_ERROR;
2110         matcher->reset();
2111         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2112 
2113         delete matcher;
2114         delete pat;
2115 
2116         utext_close(&destText);
2117         utext_close(&input);
2118         utext_close(&re);
2119     }
2120 
2121     //
2122     //  find
2123     //
2124     {
2125         int32_t             flags=0;
2126         UParseError         pe;
2127         UErrorCode          status=U_ZERO_ERROR;
2128         UText               re=UTEXT_INITIALIZER;
2129         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2130         utext_openUTF8(&re, str_abc, -1, &status);
2131 
2132         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2133         REGEX_CHECK_STATUS;
2134         UText input = UTEXT_INITIALIZER;
2135         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2136         utext_openUTF8(&input, str_abcabcabc, -1, &status);
2137         //                      012345678901234567
2138 
2139         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2140         REGEX_CHECK_STATUS;
             // Successive find() calls are expected to report "abc" at offsets 1, 6
             // and 12, and then to keep returning FALSE.
2141         REGEX_ASSERT(matcher->find());
2142         REGEX_ASSERT(matcher->start(status) == 1);
2143         REGEX_ASSERT(matcher->find());
2144         REGEX_ASSERT(matcher->start(status) == 6);
2145         REGEX_ASSERT(matcher->find());
2146         REGEX_ASSERT(matcher->start(status) == 12);
2147         REGEX_ASSERT(matcher->find() == FALSE);
2148         REGEX_ASSERT(matcher->find() == FALSE);
2149 
2150         matcher->reset();
2151         REGEX_ASSERT(matcher->find());
2152         REGEX_ASSERT(matcher->start(status) == 1);
2153 
             // find(pos) searches forward from pos; a failed find leaves no match,
             // so the start() query after it is expected to fail with
             // U_REGEX_INVALID_STATE.
2154         REGEX_ASSERT(matcher->find(0, status));
2155         REGEX_ASSERT(matcher->start(status) == 1);
2156         REGEX_ASSERT(matcher->find(1, status));
2157         REGEX_ASSERT(matcher->start(status) == 1);
2158         REGEX_ASSERT(matcher->find(2, status));
2159         REGEX_ASSERT(matcher->start(status) == 6);
2160         REGEX_ASSERT(matcher->find(12, status));
2161         REGEX_ASSERT(matcher->start(status) == 12);
2162         REGEX_ASSERT(matcher->find(13, status) == FALSE);
2163         REGEX_ASSERT(matcher->find(16, status) == FALSE);
2164         REGEX_ASSERT(matcher->find(17, status) == FALSE);
2165         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2166 
             // The input is 17 bytes long: -1 and 18 are out of bounds.
2167         status = U_ZERO_ERROR;
2168         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2169         status = U_ZERO_ERROR;
2170         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2171 
2172         REGEX_ASSERT(matcher->groupCount() == 0);
2173 
2174         delete matcher;
2175         delete pat;
2176 
2177         utext_close(&input);
2178         utext_close(&re);
2179     }
2180 
2181 
2182     //
2183     //  find, with \G in pattern (true if at the end of a previous match).
2184     //
2185     {
2186         int32_t             flags=0;
2187         UParseError         pe;
2188         UErrorCode          status=U_ZERO_ERROR;
2189         UText               re=UTEXT_INITIALIZER;
2190         const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\Gabc)|(abc)) */
2191         utext_openUTF8(&re, str_Gabcabc, -1, &status);
2192 
2193         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2194 
2195         REGEX_CHECK_STATUS;
2196         UText input = UTEXT_INITIALIZER;
2197         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2198         utext_openUTF8(&input, str_abcabcabc, -1, &status);
2199         //                      012345678901234567
2200 
2201         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2202         REGEX_CHECK_STATUS;
             // First find(): expected to match via the plain (abc) alternative, so
             // group 2 starts at 1 and group 1 did not participate (-1).
2203         REGEX_ASSERT(matcher->find());
2204         REGEX_ASSERT(matcher->start(status) == 0);
2205         REGEX_ASSERT(matcher->start(1, status) == -1);
2206         REGEX_ASSERT(matcher->start(2, status) == 1);
2207 
             // Second find(): expected to start at offset 4, the end of the previous
             // match, and to match via the \G alternative (group 1).
2208         REGEX_ASSERT(matcher->find());
2209         REGEX_ASSERT(matcher->start(status) == 4);
2210         REGEX_ASSERT(matcher->start(1, status) == 4);
2211         REGEX_ASSERT(matcher->start(2, status) == -1);
2212         REGEX_CHECK_STATUS;
2213 
2214         delete matcher;
2215         delete pat;
2216 
2217         utext_close(&input);
2218         utext_close(&re);
2219     }
2220 
2221     //
2222     //   find with zero length matches, match position should bump ahead
2223     //     to prevent loops.
2224     //
2225     {
2226         int32_t                 i;
2227         UErrorCode          status=U_ZERO_ERROR;
2228         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will match a zero-length string anywhere,
2229                                                       //   using an always-true look-ahead.
2230         REGEX_CHECK_STATUS;
2231         UText s = UTEXT_INITIALIZER;
2232         utext_openUTF8(&s, "    ", -1, &status);
2233         m.reset(&s);
             // Four characters: empty matches are expected at offsets 0..4, so the
             // loop body runs five times and i is 5 when find() finally fails.
2234         for (i=0; ; i++) {
2235             if (m.find() == FALSE) {
2236                 break;
2237             }
2238             REGEX_ASSERT(m.start(status) == i);
2239             REGEX_ASSERT(m.end(status) == i);
2240         }
2241         REGEX_ASSERT(i==5);
2242 
2243         // Check that the bump goes over characters outside the BMP OK
2244         // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2245         unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
             // s is re-opened on the new bytes without an intervening utext_close().
2246         utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2247         m.reset(&s);
             // Each character above occupies four bytes, so match offsets are
             // expected to advance by 4: 0, 4, 8, 12, 16, leaving i at 20.
2248         for (i=0; ; i+=4) {
2249             if (m.find() == FALSE) {
2250                 break;
2251             }
2252             REGEX_ASSERT(m.start(status) == i);
2253             REGEX_ASSERT(m.end(status) == i);
2254         }
2255         REGEX_ASSERT(i==20);
2256 
2257         utext_close(&s);
2258     }
2259     {
2260         // find() loop breaking test.
2261         //        with pattern of /.?/, should see a series of one char matches, then a single
2262         //        match of zero length at the end of the input string.
2263         int32_t                 i;
2264         UErrorCode          status=U_ZERO_ERROR;
2265         RegexMatcher        m(".?", 0, status);
2266         REGEX_CHECK_STATUS;
2267         UText s = UTEXT_INITIALIZER;
2268         utext_openUTF8(&s, "    ", -1, &status);
2269         m.reset(&s);
2270         for (i=0; ; i++) {
2271             if (m.find() == FALSE) {
2272                 break;
2273             }
2274             REGEX_ASSERT(m.start(status) == i);
                 // Matches 0..3 are one character long; the fifth, at offset 4, is empty.
2275             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2276         }
2277         REGEX_ASSERT(i==5);
2278 
2279         utext_close(&s);
2280     }
2281 
2282 
2283     //
2284     // Matchers with no input string behave as if they had an empty input string.
2285     //
2286 
2287     {
             // Matcher built directly from a pattern string; no input is ever set.
2288         UErrorCode status = U_ZERO_ERROR;
2289         RegexMatcher  m(".?", 0, status);
2290         REGEX_CHECK_STATUS;
2291         REGEX_ASSERT(m.find());
2292         REGEX_ASSERT(m.start(status) == 0);
2293         REGEX_ASSERT(m.input() == "");
2294     }
2295     {
             // Matcher obtained from a compiled pattern; again no input is set, so
             // "." is expected to find nothing and inputText() to have length 0.
2296         UErrorCode status = U_ZERO_ERROR;
2297         RegexPattern  *p = RegexPattern::compile(".", 0, status);
2298         RegexMatcher  *m = p->matcher(status);
2299         REGEX_CHECK_STATUS;
2300 
2301         REGEX_ASSERT(m->find() == FALSE);
2302         REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2303         delete m;
2304         delete p;
2305     }
2306 
2307     //
2308     // Regions
2309     //
2310     {
2311         UErrorCode status = U_ZERO_ERROR;
2312         UText testPattern = UTEXT_INITIALIZER;
2313         UText testText    = UTEXT_INITIALIZER;
2314         regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2315         REGEX_VERBOSE_TEXT(&testPattern);
2316         regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2317         REGEX_VERBOSE_TEXT(&testText);
2318 
2319         RegexMatcher m(&testPattern, &testText, 0, status);
2320         REGEX_CHECK_STATUS;
             // Expected defaults: region spans the whole input, bounds are
             // anchoring and not transparent.
2321         REGEX_ASSERT(m.regionStart() == 0);
2322         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2323         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2324         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2325 
             // With the region set to [2,4), ".*" is expected to match exactly the region.
2326         m.region(2,4, status);
2327         REGEX_CHECK_STATUS;
2328         REGEX_ASSERT(m.matches(status));
2329         REGEX_ASSERT(m.start(status)==2);
2330         REGEX_ASSERT(m.end(status)==4);
2331         REGEX_CHECK_STATUS;
2332 
             // reset() is expected to restore the region to the full input.
2333         m.reset();
2334         REGEX_ASSERT(m.regionStart() == 0);
2335         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2336 
             // testText is re-opened on a shorter string (no utext_close() in between)
             // and handed to the matcher again; the region must follow the new input.
2337         regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2338         REGEX_VERBOSE_TEXT(&testText);
2339         m.reset(&testText);
2340         REGEX_ASSERT(m.regionStart() == 0);
2341         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2342 
             // The setters and reset() must return the matcher itself, and the
             // anchoring / transparent flags must survive a reset().
2343         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2344         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2345         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2346         REGEX_ASSERT(&m == &m.reset());
2347         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2348 
2349         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2350         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2351         REGEX_ASSERT(&m == &m.reset());
2352         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2353 
2354         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2355         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2356         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2357         REGEX_ASSERT(&m == &m.reset());
2358         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2359 
2360         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2361         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2362         REGEX_ASSERT(&m == &m.reset());
2363         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2364 
2365         utext_close(&testText);
2366         utext_close(&testPattern);
2367     }
2368 
2369     //
2370     // hitEnd() and requireEnd()
2371     //
2372     {
2373         UErrorCode status = U_ZERO_ERROR;
2374         UText testPattern = UTEXT_INITIALIZER;
2375         UText testText    = UTEXT_INITIALIZER;
2376         const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2377         const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2378         utext_openUTF8(&testPattern, str_, -1, &status);
2379         utext_openUTF8(&testText, str_aabb, -1, &status);
2380 
             // ".*" against "aabb": expected hitEnd, but not requireEnd.
2381         RegexMatcher m1(&testPattern, &testText,  0, status);
2382         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2383         REGEX_ASSERT(m1.hitEnd() == TRUE);
2384         REGEX_ASSERT(m1.requireEnd() == FALSE);
2385         REGEX_CHECK_STATUS;
2386 
             // "a*" against "aabb": expected neither hitEnd nor requireEnd.
             // testPattern is re-opened on the new pattern bytes without an
             // intervening utext_close(); the same is done for m3 below.
2387         status = U_ZERO_ERROR;
2388         const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2389         utext_openUTF8(&testPattern, str_a, -1, &status);
2390         RegexMatcher m2(&testPattern, &testText, 0, status);
2391         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2392         REGEX_ASSERT(m2.hitEnd() == FALSE);
2393         REGEX_ASSERT(m2.requireEnd() == FALSE);
2394         REGEX_CHECK_STATUS;
2395 
             // ".*$" against "aabb": expected both hitEnd and requireEnd.
2396         status = U_ZERO_ERROR;
2397         const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2398         utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2399         RegexMatcher m3(&testPattern, &testText, 0, status);
2400         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2401         REGEX_ASSERT(m3.hitEnd() == TRUE);
2402         REGEX_ASSERT(m3.requireEnd() == TRUE);
2403         REGEX_CHECK_STATUS;
2404 
2405         utext_close(&testText);
2406         utext_close(&testPattern);
2407     }
2408 }
2409 
2410 
2411 //---------------------------------------------------------------------------
2412 //
2413 //      API_Replace_UTF8   API test for class RegexMatcher, testing the
2414 //                         Replace family of functions.
2415 //
2416 //---------------------------------------------------------------------------
API_Replace_UTF8()2417 void RegexTest::API_Replace_UTF8() {
2418     //
2419     //  Replace
2420     //
2421     int32_t             flags=0;
2422     UParseError         pe;
2423     UErrorCode          status=U_ZERO_ERROR;
2424 
2425     UText               re=UTEXT_INITIALIZER;
2426     regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2427     REGEX_VERBOSE_TEXT(&re);
2428     RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2429     REGEX_CHECK_STATUS;
2430 
2431     char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2432     //             012345678901234567
2433     UText dataText = UTEXT_INITIALIZER;
2434     utext_openUTF8(&dataText, data, -1, &status);
2435     REGEX_CHECK_STATUS;
2436     REGEX_VERBOSE_TEXT(&dataText);
2437     RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2438 
2439     //
2440     //  Plain vanilla matches.
2441     //
2442     UnicodeString  dest;
2443     UText destText = UTEXT_INITIALIZER;
2444     utext_openUnicodeString(&destText, &dest, &status);
2445     UText *result;
2446 
2447     UText replText = UTEXT_INITIALIZER;
2448 
2449     const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2450     utext_openUTF8(&replText, str_yz, -1, &status);
2451     REGEX_VERBOSE_TEXT(&replText);
2452     result = matcher->replaceFirst(&replText, NULL, status);
2453     REGEX_CHECK_STATUS;
2454     const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2455     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2456     utext_close(result);
2457     result = matcher->replaceFirst(&replText, &destText, status);
2458     REGEX_CHECK_STATUS;
2459     REGEX_ASSERT(result == &destText);
2460     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2461 
2462     result = matcher->replaceAll(&replText, NULL, status);
2463     REGEX_CHECK_STATUS;
2464     const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2465     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2466     utext_close(result);
2467 
2468     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2469     result = matcher->replaceAll(&replText, &destText, status);
2470     REGEX_CHECK_STATUS;
2471     REGEX_ASSERT(result == &destText);
2472     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2473 
2474     //
2475     //  Plain vanilla non-matches.
2476     //
2477     const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2478     utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2479     matcher->reset(&dataText);
2480 
2481     result = matcher->replaceFirst(&replText, NULL, status);
2482     REGEX_CHECK_STATUS;
2483     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2484     utext_close(result);
2485     result = matcher->replaceFirst(&replText, &destText, status);
2486     REGEX_CHECK_STATUS;
2487     REGEX_ASSERT(result == &destText);
2488     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2489 
2490     result = matcher->replaceAll(&replText, NULL, status);
2491     REGEX_CHECK_STATUS;
2492     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2493     utext_close(result);
2494     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2495     result = matcher->replaceAll(&replText, &destText, status);
2496     REGEX_CHECK_STATUS;
2497     REGEX_ASSERT(result == &destText);
2498     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2499 
2500     //
2501     // Empty source string
2502     //
2503     utext_openUTF8(&dataText, NULL, 0, &status);
2504     matcher->reset(&dataText);
2505 
2506     result = matcher->replaceFirst(&replText, NULL, status);
2507     REGEX_CHECK_STATUS;
2508     REGEX_ASSERT_UTEXT_UTF8("", result);
2509     utext_close(result);
2510     result = matcher->replaceFirst(&replText, &destText, status);
2511     REGEX_CHECK_STATUS;
2512     REGEX_ASSERT(result == &destText);
2513     REGEX_ASSERT_UTEXT_UTF8("", result);
2514 
2515     result = matcher->replaceAll(&replText, NULL, status);
2516     REGEX_CHECK_STATUS;
2517     REGEX_ASSERT_UTEXT_UTF8("", result);
2518     utext_close(result);
2519     result = matcher->replaceAll(&replText, &destText, status);
2520     REGEX_CHECK_STATUS;
2521     REGEX_ASSERT(result == &destText);
2522     REGEX_ASSERT_UTEXT_UTF8("", result);
2523 
2524     //
2525     // Empty substitution string
2526     //
2527     utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2528     matcher->reset(&dataText);
2529 
2530     utext_openUTF8(&replText, NULL, 0, &status);
2531     result = matcher->replaceFirst(&replText, NULL, status);
2532     REGEX_CHECK_STATUS;
2533     const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2534     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2535     utext_close(result);
2536     result = matcher->replaceFirst(&replText, &destText, status);
2537     REGEX_CHECK_STATUS;
2538     REGEX_ASSERT(result == &destText);
2539     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2540 
2541     result = matcher->replaceAll(&replText, NULL, status);
2542     REGEX_CHECK_STATUS;
2543     const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2544     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2545     utext_close(result);
2546     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2547     result = matcher->replaceAll(&replText, &destText, status);
2548     REGEX_CHECK_STATUS;
2549     REGEX_ASSERT(result == &destText);
2550     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2551 
2552     //
2553     // match whole string
2554     //
2555     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2556     utext_openUTF8(&dataText, str_abc, -1, &status);
2557     matcher->reset(&dataText);
2558 
2559     const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2560     utext_openUTF8(&replText, str_xyz, -1, &status);
2561     result = matcher->replaceFirst(&replText, NULL, status);
2562     REGEX_CHECK_STATUS;
2563     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2564     utext_close(result);
2565     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2566     result = matcher->replaceFirst(&replText, &destText, status);
2567     REGEX_CHECK_STATUS;
2568     REGEX_ASSERT(result == &destText);
2569     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2570 
2571     result = matcher->replaceAll(&replText, NULL, status);
2572     REGEX_CHECK_STATUS;
2573     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2574     utext_close(result);
2575     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2576     result = matcher->replaceAll(&replText, &destText, status);
2577     REGEX_CHECK_STATUS;
2578     REGEX_ASSERT(result == &destText);
2579     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2580 
2581     //
2582     // Capture Group, simple case
2583     //
2584     const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2585     utext_openUTF8(&re, str_add, -1, &status);
2586     RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2587     REGEX_CHECK_STATUS;
2588 
2589     const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2590     utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2591     RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2592     REGEX_CHECK_STATUS;
2593 
2594     const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2595     utext_openUTF8(&replText, str_11, -1, &status);
2596     result = matcher2->replaceFirst(&replText, NULL, status);
2597     REGEX_CHECK_STATUS;
2598     const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2599     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2600     utext_close(result);
2601     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2602     result = matcher2->replaceFirst(&replText, &destText, status);
2603     REGEX_CHECK_STATUS;
2604     REGEX_ASSERT(result == &destText);
2605     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2606 
2607     const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2608     utext_openUTF8(&replText, str_v, -1, &status);
2609     REGEX_VERBOSE_TEXT(&replText);
2610     result = matcher2->replaceFirst(&replText, NULL, status);
2611     REGEX_CHECK_STATUS;
2612     const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2613     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2614     utext_close(result);
2615     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2616     result = matcher2->replaceFirst(&replText, &destText, status);
2617     REGEX_CHECK_STATUS;
2618     REGEX_ASSERT(result == &destText);
2619     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2620 
2621     const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2622                0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2623                0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2624     utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2625     result = matcher2->replaceFirst(&replText, NULL, status);
2626     REGEX_CHECK_STATUS;
2627     const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2628     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2629     utext_close(result);
2630     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2631     result = matcher2->replaceFirst(&replText, &destText, status);
2632     REGEX_CHECK_STATUS;
2633     REGEX_ASSERT(result == &destText);
2634     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2635 
2636     unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2637     //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2638     //                                 012345678901234567890123456
2639     supplDigitChars[22] = 0xF0;
2640     supplDigitChars[23] = 0x9D;
2641     supplDigitChars[24] = 0x9F;
2642     supplDigitChars[25] = 0x8F;
2643     utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2644 
2645     result = matcher2->replaceFirst(&replText, NULL, status);
2646     REGEX_CHECK_STATUS;
2647     const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2648     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2649     utext_close(result);
2650     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2651     result = matcher2->replaceFirst(&replText, &destText, status);
2652     REGEX_CHECK_STATUS;
2653     REGEX_ASSERT(result == &destText);
2654     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2655     const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5..." */
2656     utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2657     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2658 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2659     utext_close(result);
2660     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2661     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2662     REGEX_ASSERT(result == &destText);
2663 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2664 
2665     //
2666     // Replacement String with \u hex escapes
2667     //
2668     {
2669       const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2670       const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2671         utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2672         utext_openUTF8(&replText, str_u0043, -1, &status);
2673         matcher->reset(&dataText);
2674 
2675         result = matcher->replaceAll(&replText, NULL, status);
2676         REGEX_CHECK_STATUS;
2677         const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2678         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2679         utext_close(result);
2680         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2681         result = matcher->replaceAll(&replText, &destText, status);
2682         REGEX_CHECK_STATUS;
2683         REGEX_ASSERT(result == &destText);
2684         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2685     }
2686     {
2687       const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2688         utext_openUTF8(&dataText, str_abc, -1, &status);
2689         const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2690         utext_openUTF8(&replText, str_U00010000, -1, &status);
2691         matcher->reset(&dataText);
2692 
2693         unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2694         //                          0123456789
2695         expected[2] = 0xF0;
2696         expected[3] = 0x90;
2697         expected[4] = 0x80;
2698         expected[5] = 0x80;
2699 
2700         result = matcher->replaceAll(&replText, NULL, status);
2701         REGEX_CHECK_STATUS;
2702         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2703         utext_close(result);
2704         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2705         result = matcher->replaceAll(&replText, &destText, status);
2706         REGEX_CHECK_STATUS;
2707         REGEX_ASSERT(result == &destText);
2708         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2709     }
2710     // TODO:  need more through testing of capture substitutions.
2711 
2712     // Bug 4057
2713     //
2714     {
2715         status = U_ZERO_ERROR;
2716 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2717 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2718 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2719         utext_openUTF8(&re, str_ssee, -1, &status);
2720         utext_openUTF8(&dataText, str_blah, -1, &status);
2721         utext_openUTF8(&replText, str_ooh, -1, &status);
2722 
2723         RegexMatcher m(&re, 0, status);
2724         REGEX_CHECK_STATUS;
2725 
2726         UnicodeString result;
2727         UText resultText = UTEXT_INITIALIZER;
2728         utext_openUnicodeString(&resultText, &result, &status);
2729 
2730         // Multiple finds do NOT bump up the previous appendReplacement postion.
2731         m.reset(&dataText);
2732         m.find();
2733         m.find();
2734         m.appendReplacement(&resultText, &replText, status);
2735         REGEX_CHECK_STATUS;
2736         const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2737         REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2738 
2739         // After a reset into the interior of a string, appendReplacement still starts at beginning.
2740         status = U_ZERO_ERROR;
2741         result.truncate(0);
2742         utext_openUnicodeString(&resultText, &result, &status);
2743         m.reset(10, status);
2744         m.find();
2745         m.find();
2746         m.appendReplacement(&resultText, &replText, status);
2747         REGEX_CHECK_STATUS;
2748         const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2749         REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2750 
2751         // find() at interior of string, appendReplacement still starts at beginning.
2752         status = U_ZERO_ERROR;
2753         result.truncate(0);
2754         utext_openUnicodeString(&resultText, &result, &status);
2755         m.reset();
2756         m.find(10, status);
2757         m.find();
2758         m.appendReplacement(&resultText, &replText, status);
2759         REGEX_CHECK_STATUS;
2760         const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2761         REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2762 
2763         m.appendTail(&resultText, status);
2764         const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2765         REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2766 
2767         utext_close(&resultText);
2768     }
2769 
2770     delete matcher2;
2771     delete pat2;
2772     delete matcher;
2773     delete pat;
2774 
2775     utext_close(&dataText);
2776     utext_close(&replText);
2777     utext_close(&destText);
2778     utext_close(&re);
2779 }
2780 
2781 
2782 //---------------------------------------------------------------------------
2783 //
2784 //      API_Pattern_UTF8  Test that the API for class RegexPattern is
2785 //                        present and nominally working.
2786 //
2787 //---------------------------------------------------------------------------
API_Pattern_UTF8()2788 void RegexTest::API_Pattern_UTF8() {
2789     RegexPattern        pata;    // Test default constructor to not crash.
2790     RegexPattern        patb;
2791 
2792     REGEX_ASSERT(pata == patb);
2793     REGEX_ASSERT(pata == pata);
2794 
2795     UText         re1 = UTEXT_INITIALIZER;
2796     UText         re2 = UTEXT_INITIALIZER;
2797     UErrorCode    status = U_ZERO_ERROR;
2798     UParseError   pe;
2799 
2800     const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2801     const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2802     utext_openUTF8(&re1, str_abcalmz, -1, &status);
2803     utext_openUTF8(&re2, str_def, -1, &status);
2804 
2805     RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2806     RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2807     REGEX_CHECK_STATUS;
2808     REGEX_ASSERT(*pat1 == *pat1);
2809     REGEX_ASSERT(*pat1 != pata);
2810 
2811     // Assign
2812     patb = *pat1;
2813     REGEX_ASSERT(patb == *pat1);
2814 
2815     // Copy Construct
2816     RegexPattern patc(*pat1);
2817     REGEX_ASSERT(patc == *pat1);
2818     REGEX_ASSERT(patb == patc);
2819     REGEX_ASSERT(pat1 != pat2);
2820     patb = *pat2;
2821     REGEX_ASSERT(patb != patc);
2822     REGEX_ASSERT(patb == *pat2);
2823 
2824     // Compile with no flags.
2825     RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
2826     REGEX_ASSERT(*pat1a == *pat1);
2827 
2828     REGEX_ASSERT(pat1a->flags() == 0);
2829 
2830     // Compile with different flags should be not equal
2831     RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2832     REGEX_CHECK_STATUS;
2833 
2834     REGEX_ASSERT(*pat1b != *pat1a);
2835     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2836     REGEX_ASSERT(pat1a->flags() == 0);
2837     delete pat1b;
2838 
2839     // clone
2840     RegexPattern *pat1c = pat1->clone();
2841     REGEX_ASSERT(*pat1c == *pat1);
2842     REGEX_ASSERT(*pat1c != *pat2);
2843 
2844     delete pat1c;
2845     delete pat1a;
2846     delete pat1;
2847     delete pat2;
2848 
2849     utext_close(&re1);
2850     utext_close(&re2);
2851 
2852 
2853     //
2854     //   Verify that a matcher created from a cloned pattern works.
2855     //     (Jitterbug 3423)
2856     //
2857     {
2858         UErrorCode     status     = U_ZERO_ERROR;
2859         UText          pattern    = UTEXT_INITIALIZER;
2860         const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2861         utext_openUTF8(&pattern, str_pL, -1, &status);
2862 
2863         RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
2864         RegexPattern  *pClone     = pSource->clone();
2865         delete         pSource;
2866         RegexMatcher  *mFromClone = pClone->matcher(status);
2867         REGEX_CHECK_STATUS;
2868 
2869         UText          input      = UTEXT_INITIALIZER;
2870         const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2871         utext_openUTF8(&input, str_HelloWorld, -1, &status);
2872         mFromClone->reset(&input);
2873         REGEX_ASSERT(mFromClone->find() == TRUE);
2874         REGEX_ASSERT(mFromClone->group(status) == "Hello");
2875         REGEX_ASSERT(mFromClone->find() == TRUE);
2876         REGEX_ASSERT(mFromClone->group(status) == "World");
2877         REGEX_ASSERT(mFromClone->find() == FALSE);
2878         delete mFromClone;
2879         delete pClone;
2880 
2881         utext_close(&input);
2882         utext_close(&pattern);
2883     }
2884 
2885     //
2886     //   matches convenience API
2887     //
2888     {
2889         UErrorCode status  = U_ZERO_ERROR;
2890         UText      pattern = UTEXT_INITIALIZER;
2891         UText      input   = UTEXT_INITIALIZER;
2892 
2893         const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2894         utext_openUTF8(&input, str_randominput, -1, &status);
2895 
2896         const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2897         utext_openUTF8(&pattern, str_dotstar, -1, &status);
2898         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2899         REGEX_CHECK_STATUS;
2900 
2901         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2902         utext_openUTF8(&pattern, str_abc, -1, &status);
2903         REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2904         REGEX_CHECK_STATUS;
2905 
2906         const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2907         utext_openUTF8(&pattern, str_nput, -1, &status);
2908         REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2909         REGEX_CHECK_STATUS;
2910 
2911         utext_openUTF8(&pattern, str_randominput, -1, &status);
2912         REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2913         REGEX_CHECK_STATUS;
2914 
2915         const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2916         utext_openUTF8(&pattern, str_u, -1, &status);
2917         REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2918         REGEX_CHECK_STATUS;
2919 
2920         utext_openUTF8(&input, str_abc, -1, &status);
2921         utext_openUTF8(&pattern, str_abc, -1, &status);
2922         status = U_INDEX_OUTOFBOUNDS_ERROR;
2923         REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2924         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2925 
2926         utext_close(&input);
2927         utext_close(&pattern);
2928     }
2929 
2930 
2931     //
2932     // Split()
2933     //
2934     status = U_ZERO_ERROR;
2935     const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
2936     utext_openUTF8(&re1, str_spaceplus, -1, &status);
2937     pat1 = RegexPattern::compile(&re1, pe, status);
2938     REGEX_CHECK_STATUS;
2939     UnicodeString  fields[10];
2940 
2941     int32_t n;
2942     n = pat1->split("Now is the time", fields, 10, status);
2943     REGEX_CHECK_STATUS;
2944     REGEX_ASSERT(n==4);
2945     REGEX_ASSERT(fields[0]=="Now");
2946     REGEX_ASSERT(fields[1]=="is");
2947     REGEX_ASSERT(fields[2]=="the");
2948     REGEX_ASSERT(fields[3]=="time");
2949     REGEX_ASSERT(fields[4]=="");
2950 
2951     n = pat1->split("Now is the time", fields, 2, status);
2952     REGEX_CHECK_STATUS;
2953     REGEX_ASSERT(n==2);
2954     REGEX_ASSERT(fields[0]=="Now");
2955     REGEX_ASSERT(fields[1]=="is the time");
2956     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
2957 
2958     fields[1] = "*";
2959     status = U_ZERO_ERROR;
2960     n = pat1->split("Now is the time", fields, 1, status);
2961     REGEX_CHECK_STATUS;
2962     REGEX_ASSERT(n==1);
2963     REGEX_ASSERT(fields[0]=="Now is the time");
2964     REGEX_ASSERT(fields[1]=="*");
2965     status = U_ZERO_ERROR;
2966 
2967     n = pat1->split("    Now       is the time   ", fields, 10, status);
2968     REGEX_CHECK_STATUS;
2969     REGEX_ASSERT(n==6);
2970     REGEX_ASSERT(fields[0]=="");
2971     REGEX_ASSERT(fields[1]=="Now");
2972     REGEX_ASSERT(fields[2]=="is");
2973     REGEX_ASSERT(fields[3]=="the");
2974     REGEX_ASSERT(fields[4]=="time");
2975     REGEX_ASSERT(fields[5]=="");
2976     REGEX_ASSERT(fields[6]=="");
2977 
2978     fields[2] = "*";
2979     n = pat1->split("     ", fields, 10, status);
2980     REGEX_CHECK_STATUS;
2981     REGEX_ASSERT(n==2);
2982     REGEX_ASSERT(fields[0]=="");
2983     REGEX_ASSERT(fields[1]=="");
2984     REGEX_ASSERT(fields[2]=="*");
2985 
2986     fields[0] = "foo";
2987     n = pat1->split("", fields, 10, status);
2988     REGEX_CHECK_STATUS;
2989     REGEX_ASSERT(n==0);
2990     REGEX_ASSERT(fields[0]=="foo");
2991 
2992     delete pat1;
2993 
2994     //  split, with a pattern with (capture)
2995     regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
2996     pat1 = RegexPattern::compile(&re1,  pe, status);
2997     REGEX_CHECK_STATUS;
2998 
2999     status = U_ZERO_ERROR;
3000     fields[6] = fields[7] = "*";
3001     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
3002     REGEX_CHECK_STATUS;
3003     REGEX_ASSERT(n==7);
3004     REGEX_ASSERT(fields[0]=="");
3005     REGEX_ASSERT(fields[1]=="a");
3006     REGEX_ASSERT(fields[2]=="Now is ");
3007     REGEX_ASSERT(fields[3]=="b");
3008     REGEX_ASSERT(fields[4]=="the time");
3009     REGEX_ASSERT(fields[5]=="c");
3010     REGEX_ASSERT(fields[6]=="");
3011     REGEX_ASSERT(fields[7]=="*");
3012     REGEX_ASSERT(status==U_ZERO_ERROR);
3013 
3014     fields[6] = fields[7] = "*";
3015     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
3016     REGEX_CHECK_STATUS;
3017     REGEX_ASSERT(n==7);
3018     REGEX_ASSERT(fields[0]=="  ");
3019     REGEX_ASSERT(fields[1]=="a");
3020     REGEX_ASSERT(fields[2]=="Now is ");
3021     REGEX_ASSERT(fields[3]=="b");
3022     REGEX_ASSERT(fields[4]=="the time");
3023     REGEX_ASSERT(fields[5]=="c");
3024     REGEX_ASSERT(fields[6]=="");
3025     REGEX_ASSERT(fields[7]=="*");
3026 
3027     status = U_ZERO_ERROR;
3028     fields[6] = "foo";
3029     n = pat1->split("  <a>Now is <b>the time<c> ", fields, 6, status);
3030     REGEX_CHECK_STATUS;
3031     REGEX_ASSERT(n==6);
3032     REGEX_ASSERT(fields[0]=="  ");
3033     REGEX_ASSERT(fields[1]=="a");
3034     REGEX_ASSERT(fields[2]=="Now is ");
3035     REGEX_ASSERT(fields[3]=="b");
3036     REGEX_ASSERT(fields[4]=="the time");
3037     REGEX_ASSERT(fields[5]==" ");
3038     REGEX_ASSERT(fields[6]=="foo");
3039 
3040     status = U_ZERO_ERROR;
3041     fields[5] = "foo";
3042     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
3043     REGEX_CHECK_STATUS;
3044     REGEX_ASSERT(n==5);
3045     REGEX_ASSERT(fields[0]=="  ");
3046     REGEX_ASSERT(fields[1]=="a");
3047     REGEX_ASSERT(fields[2]=="Now is ");
3048     REGEX_ASSERT(fields[3]=="b");
3049     REGEX_ASSERT(fields[4]=="the time<c>");
3050     REGEX_ASSERT(fields[5]=="foo");
3051 
3052     status = U_ZERO_ERROR;
3053     fields[5] = "foo";
3054     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
3055     REGEX_CHECK_STATUS;
3056     REGEX_ASSERT(n==5);
3057     REGEX_ASSERT(fields[0]=="  ");
3058     REGEX_ASSERT(fields[1]=="a");
3059     REGEX_ASSERT(fields[2]=="Now is ");
3060     REGEX_ASSERT(fields[3]=="b");
3061     REGEX_ASSERT(fields[4]=="the time");
3062     REGEX_ASSERT(fields[5]=="foo");
3063 
3064     status = U_ZERO_ERROR;
3065     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
3066     REGEX_CHECK_STATUS;
3067     REGEX_ASSERT(n==4);
3068     REGEX_ASSERT(fields[0]=="  ");
3069     REGEX_ASSERT(fields[1]=="a");
3070     REGEX_ASSERT(fields[2]=="Now is ");
3071     REGEX_ASSERT(fields[3]=="the time<c>");
3072     status = U_ZERO_ERROR;
3073     delete pat1;
3074 
3075     regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3076     pat1 = RegexPattern::compile(&re1, pe, status);
3077     REGEX_CHECK_STATUS;
3078     n = pat1->split("1-10,20", fields, 10, status);
3079     REGEX_CHECK_STATUS;
3080     REGEX_ASSERT(n==5);
3081     REGEX_ASSERT(fields[0]=="1");
3082     REGEX_ASSERT(fields[1]=="-");
3083     REGEX_ASSERT(fields[2]=="10");
3084     REGEX_ASSERT(fields[3]==",");
3085     REGEX_ASSERT(fields[4]=="20");
3086     delete pat1;
3087 
3088 
3089     //
3090     // split of a UText based string, with library allocating output UTexts.
3091     //
3092     {
3093         status = U_ZERO_ERROR;
3094         RegexMatcher matcher(UnicodeString("(:)"), 0, status);
3095         UnicodeString stringToSplit("first:second:third");
3096         UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
3097         REGEX_CHECK_STATUS;
3098 
3099         UText *splits[10] = {NULL};
3100         int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
3101         REGEX_CHECK_STATUS;
3102         REGEX_ASSERT(numFields == 5);
3103         REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
3104         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
3105         REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
3106         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
3107         REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
3108         REGEX_ASSERT(splits[5] == NULL);
3109 
3110         for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
3111             if (splits[i]) {
3112                 utext_close(splits[i]);
3113                 splits[i] = NULL;
3114             }
3115         }
3116         utext_close(textToSplit);
3117     }
3118 
3119 
3120     //
3121     // RegexPattern::pattern() and patternText()
3122     //
3123     pat1 = new RegexPattern();
3124     REGEX_ASSERT(pat1->pattern() == "");
3125     REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3126     delete pat1;
3127     const char *helloWorldInvariant = "(Hello, world)*";
3128     regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3129     pat1 = RegexPattern::compile(&re1, pe, status);
3130     REGEX_CHECK_STATUS;
3131     REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
3132     REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3133     delete pat1;
3134 
3135     utext_close(&re1);
3136 }
3137 
3138 
3139 //---------------------------------------------------------------------------
3140 //
3141 //      Extended       A more thorough check for features of regex patterns
3142 //                     The test cases are in a separate data file,
3143 //                       source/test/testdata/regextst.txt
3144 //                     A description of the test data format is included in that file.
3145 //
3146 //---------------------------------------------------------------------------
3147 
3148 const char *
getPath(char buffer[2048],const char * filename)3149 RegexTest::getPath(char buffer[2048], const char *filename) {
3150     UErrorCode status=U_ZERO_ERROR;
3151     const char *testDataDirectory = IntlTest::getSourceTestData(status);
3152     if (U_FAILURE(status)) {
3153         errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3154         return NULL;
3155     }
3156 
3157     strcpy(buffer, testDataDirectory);
3158     strcat(buffer, filename);
3159     return buffer;
3160 }
3161 
Extended()3162 void RegexTest::Extended() {
3163     char tdd[2048];
3164     const char *srcPath;
3165     UErrorCode  status  = U_ZERO_ERROR;
3166     int32_t     lineNum = 0;
3167 
3168     //
3169     //  Open and read the test data file.
3170     //
3171     srcPath=getPath(tdd, "regextst.txt");
3172     if(srcPath==NULL) {
3173         return; /* something went wrong, error already output */
3174     }
3175 
3176     int32_t    len;
3177     UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3178     if (U_FAILURE(status)) {
3179         return; /* something went wrong, error already output */
3180     }
3181 
3182     //
3183     //  Put the test data into a UnicodeString
3184     //
3185     UnicodeString testString(FALSE, testData, len);
3186 
3187     RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3188     RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3189     RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3190 
3191     RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3192     UnicodeString   testPattern;   // The pattern for test from the test file.
3193     UnicodeString   testFlags;     // the flags   for a test.
3194     UnicodeString   matchString;   // The marked up string to be used as input
3195 
3196     if (U_FAILURE(status)){
3197         dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
3198         delete [] testData;
3199         return;
3200     }
3201 
3202     //
3203     //  Loop over the test data file, once per line.
3204     //
3205     while (lineMat.find()) {
3206         lineNum++;
3207         if (U_FAILURE(status)) {
3208           errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3209         }
3210 
3211         status = U_ZERO_ERROR;
3212         UnicodeString testLine = lineMat.group(1, status);
3213         if (testLine.length() == 0) {
3214             continue;
3215         }
3216 
3217         //
3218         // Parse the test line.  Skip blank and comment only lines.
3219         // Separate out the three main fields - pattern, flags, target.
3220         //
3221 
3222         commentMat.reset(testLine);
3223         if (commentMat.lookingAt(status)) {
3224             // This line is a comment, or blank.
3225             continue;
3226         }
3227 
3228         //
3229         //  Pull out the pattern field, remove it from the test file line.
3230         //
3231         quotedStuffMat.reset(testLine);
3232         if (quotedStuffMat.lookingAt(status)) {
3233             testPattern = quotedStuffMat.group(2, status);
3234             testLine.remove(0, quotedStuffMat.end(0, status));
3235         } else {
3236             errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3237             continue;
3238         }
3239 
3240 
3241         //
3242         //  Pull out the flags from the test file line.
3243         //
3244         flagsMat.reset(testLine);
3245         flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
3246         testFlags = flagsMat.group(1, status);
3247         if (flagsMat.group(2, status).length() > 0) {
3248             errln("Bad Match flag at line %d. Scanning %c\n",
3249                 lineNum, flagsMat.group(2, status).charAt(0));
3250             continue;
3251         }
3252         testLine.remove(0, flagsMat.end(0, status));
3253 
3254         //
3255         //  Pull out the match string, as a whole.
3256         //    We'll process the <tags> later.
3257         //
3258         quotedStuffMat.reset(testLine);
3259         if (quotedStuffMat.lookingAt(status)) {
3260             matchString = quotedStuffMat.group(2, status);
3261             testLine.remove(0, quotedStuffMat.end(0, status));
3262         } else {
3263             errln("Bad match string at test file line %d", lineNum);
3264             continue;
3265         }
3266 
3267         //
3268         //  The only thing left from the input line should be an optional trailing comment.
3269         //
3270         commentMat.reset(testLine);
3271         if (commentMat.lookingAt(status) == FALSE) {
3272             errln("Line %d: unexpected characters at end of test line.", lineNum);
3273             continue;
3274         }
3275 
3276         //
3277         //  Run the test
3278         //
3279         regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3280     }
3281 
3282     delete [] testData;
3283 
3284 }
3285 
3286 
3287 
3288 //---------------------------------------------------------------------------
3289 //
3290 //    regex_find(pattern, flags, inputString, srcPath, lineNumber)
3291 //
3292 //         Function to run a single test from the Extended (data driven) tests.
3293 //         See file test/testdata/regextst.txt for a description of the
3294 //         pattern and inputString fields, and the allowed flags.
3295 //         srcPath is the path of the test data file, and lineNumber is the source line in it of the test.
3296 //
3297 //---------------------------------------------------------------------------
3298 
3299 
3300 //  Set a value into a UVector at position specified by a decimal number in
3301 //   a UnicodeString.   This is a utility function needed by the actual test function,
3302 //   which follows.
set(UVector & vec,int32_t val,UnicodeString index)3303 static void set(UVector &vec, int32_t val, UnicodeString index) {
3304     UErrorCode  status=U_ZERO_ERROR;
3305     int32_t  idx = 0;
3306     for (int32_t i=0; i<index.length(); i++) {
3307         int32_t d=u_charDigitValue(index.charAt(i));
3308         if (d<0) {return;}
3309         idx = idx*10 + d;
3310     }
3311     while (vec.size()<idx+1) {vec.addElement(-1, status);}
3312     vec.setElementAt(val, idx);
3313 }
3314 
setInt(UVector & vec,int32_t val,int32_t idx)3315 static void setInt(UVector &vec, int32_t val, int32_t idx) {
3316     UErrorCode  status=U_ZERO_ERROR;
3317     while (vec.size()<idx+1) {vec.addElement(-1, status);}
3318     vec.setElementAt(val, idx);
3319 }
3320 
utextOffsetToNative(UText * utext,int32_t unistrOffset,int32_t & nativeIndex)3321 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3322 {
3323     UBool couldFind = TRUE;
3324     UTEXT_SETNATIVEINDEX(utext, 0);
3325     int32_t i = 0;
3326     while (i < unistrOffset) {
3327         UChar32 c = UTEXT_NEXT32(utext);
3328         if (c != U_SENTINEL) {
3329             i += U16_LENGTH(c);
3330         } else {
3331             couldFind = FALSE;
3332             break;
3333         }
3334     }
3335     nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3336     return couldFind;
3337 }
3338 
3339 
regex_find(const UnicodeString & pattern,const UnicodeString & flags,const UnicodeString & inputString,const char * srcPath,int32_t line)3340 void RegexTest::regex_find(const UnicodeString &pattern,
3341                            const UnicodeString &flags,
3342                            const UnicodeString &inputString,
3343                            const char *srcPath,
3344                            int32_t line) {
3345     UnicodeString       unEscapedInput;
3346     UnicodeString       deTaggedInput;
3347 
3348     int32_t             patternUTF8Length,      inputUTF8Length;
3349     char                *patternChars  = NULL, *inputChars = NULL;
3350     UText               patternText    = UTEXT_INITIALIZER;
3351     UText               inputText      = UTEXT_INITIALIZER;
3352     UConverter          *UTF8Converter = NULL;
3353 
3354     UErrorCode          status         = U_ZERO_ERROR;
3355     UParseError         pe;
3356     RegexPattern        *parsePat      = NULL;
3357     RegexMatcher        *parseMatcher  = NULL;
3358     RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
3359     RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
3360     UVector             groupStarts(status);
3361     UVector             groupEnds(status);
3362     UVector             groupStartsUTF8(status);
3363     UVector             groupEndsUTF8(status);
3364     UBool               isMatch        = FALSE, isUTF8Match = FALSE;
3365     UBool               failed         = FALSE;
3366     int32_t             numFinds;
3367     int32_t             i;
3368     UBool               useMatchesFunc   = FALSE;
3369     UBool               useLookingAtFunc = FALSE;
3370     int32_t             regionStart      = -1;
3371     int32_t             regionEnd        = -1;
3372     int32_t             regionStartUTF8  = -1;
3373     int32_t             regionEndUTF8    = -1;
3374 
3375 
3376     //
3377     //  Compile the caller's pattern
3378     //
3379     uint32_t bflags = 0;
3380     if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
3381         bflags |= UREGEX_CASE_INSENSITIVE;
3382     }
3383     if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
3384         bflags |= UREGEX_COMMENTS;
3385     }
3386     if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
3387         bflags |= UREGEX_DOTALL;
3388     }
3389     if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
3390         bflags |= UREGEX_MULTILINE;
3391     }
3392 
3393     if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3394         bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3395     }
3396     if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3397         bflags |= UREGEX_UNIX_LINES;
3398     }
3399     if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3400         bflags |= UREGEX_LITERAL;
3401     }
3402 
3403 
3404     callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3405     if (status != U_ZERO_ERROR) {
3406         #if UCONFIG_NO_BREAK_ITERATION==1
3407         // 'v' test flag means that the test pattern should not compile if ICU was configured
3408         //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3409         if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3410             goto cleanupAndReturn;
3411         }
3412         #endif
3413         if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3414             // Expected pattern compilation error.
3415             if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3416                 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3417             }
3418             goto cleanupAndReturn;
3419         } else {
3420             // Unexpected pattern compilation error.
3421             dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3422             goto cleanupAndReturn;
3423         }
3424     }
3425 
3426     UTF8Converter = ucnv_open("UTF8", &status);
3427     ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3428 
3429     patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3430     status = U_ZERO_ERROR; // buffer overflow
3431     patternChars = new char[patternUTF8Length+1];
3432     pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3433     utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3434 
3435     if (status == U_ZERO_ERROR) {
3436         UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3437 
3438         if (status != U_ZERO_ERROR) {
3439 #if UCONFIG_NO_BREAK_ITERATION==1
3440             // 'v' test flag means that the test pattern should not compile if ICU was configured
3441             //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3442             if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3443                 goto cleanupAndReturn;
3444             }
3445 #endif
3446             if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3447                 // Expected pattern compilation error.
3448                 if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3449                     logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3450                 }
3451                 goto cleanupAndReturn;
3452             } else {
3453                 // Unexpected pattern compilation error.
3454                 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3455                 goto cleanupAndReturn;
3456             }
3457         }
3458     }
3459 
3460     if (UTF8Pattern == NULL) {
3461         // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3462         logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3463         status = U_ZERO_ERROR;
3464     }
3465 
3466     if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
3467         callerPattern->dumpPattern();
3468     }
3469 
3470     if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
3471         errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3472         goto cleanupAndReturn;
3473     }
3474 
3475 
3476     //
3477     // Number of times find() should be called on the test string, default to 1
3478     //
3479     numFinds = 1;
3480     for (i=2; i<=9; i++) {
3481         if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
3482             if (numFinds != 1) {
3483                 errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
3484                 goto cleanupAndReturn;
3485             }
3486             numFinds = i;
3487         }
3488     }
3489 
3490     // 'M' flag.  Use matches() instead of find()
3491     if (flags.indexOf((UChar)0x4d) >= 0) {
3492         useMatchesFunc = TRUE;
3493     }
3494     if (flags.indexOf((UChar)0x4c) >= 0) {
3495         useLookingAtFunc = TRUE;
3496     }
3497 
3498     //
3499     //  Find the tags in the input data, remove them, and record the group boundary
3500     //    positions.
3501     //
3502     parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3503     REGEX_CHECK_STATUS_L(line);
3504 
3505     unEscapedInput = inputString.unescape();
3506     parseMatcher = parsePat->matcher(unEscapedInput, status);
3507     REGEX_CHECK_STATUS_L(line);
3508     while(parseMatcher->find()) {
3509         parseMatcher->appendReplacement(deTaggedInput, "", status);
3510         REGEX_CHECK_STATUS;
3511         UnicodeString groupNum = parseMatcher->group(2, status);
3512         if (groupNum == "r") {
3513             // <r> or </r>, a region specification within the string
3514             if (parseMatcher->group(1, status) == "/") {
3515                 regionEnd = deTaggedInput.length();
3516             } else {
3517                 regionStart = deTaggedInput.length();
3518             }
3519         } else {
3520             // <digits> or </digits>, a group match boundary tag.
3521             if (parseMatcher->group(1, status) == "/") {
3522                 set(groupEnds, deTaggedInput.length(), groupNum);
3523             } else {
3524                 set(groupStarts, deTaggedInput.length(), groupNum);
3525             }
3526         }
3527     }
3528     parseMatcher->appendTail(deTaggedInput);
3529 
3530     if (groupStarts.size() != groupEnds.size()) {
3531         errln("Error at line %d: mismatched <n> group tags in expected results.", line);
3532         failed = true;
3533         goto cleanupAndReturn;
3534     }
3535     if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3536         errln("mismatched <r> tags");
3537         failed = TRUE;
3538         goto cleanupAndReturn;
3539     }
3540 
3541     //
3542     //  Configure the matcher according to the flags specified with this test.
3543     //
3544     matcher = callerPattern->matcher(deTaggedInput, status);
3545     REGEX_CHECK_STATUS_L(line);
3546     if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3547         matcher->setTrace(TRUE);
3548     }
3549 
3550     if (UTF8Pattern != NULL) {
3551         inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3552         status = U_ZERO_ERROR; // buffer overflow
3553         inputChars = new char[inputUTF8Length+1];
3554         deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3555         utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3556 
3557         if (status == U_ZERO_ERROR) {
3558             UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3559             REGEX_CHECK_STATUS_L(line);
3560         }
3561 
3562         if (UTF8Matcher == NULL) {
3563             // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3564             logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3565             status = U_ZERO_ERROR;
3566         }
3567     }
3568 
3569     //
3570     //  Generate native indices for UTF8 versions of region and capture group info
3571     //
3572     if (UTF8Matcher != NULL) {
3573         if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3574             UTF8Matcher->setTrace(TRUE);
3575         }
3576         if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3577         if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3578 
3579         //  Fill out the native index UVector info.
3580         //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3581         for (i=0; i<groupStarts.size(); i++) {
3582             int32_t  start = groupStarts.elementAti(i);
3583             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3584             if (start >= 0) {
3585                 int32_t  startUTF8;
3586                 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3587                     errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
3588                     failed = TRUE;
3589                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3590                 }
3591                 setInt(groupStartsUTF8, startUTF8, i);
3592             }
3593 
3594             int32_t  end = groupEnds.elementAti(i);
3595             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3596             if (end >= 0) {
3597                 int32_t  endUTF8;
3598                 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3599                     errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
3600                     failed = TRUE;
3601                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3602                 }
3603                 setInt(groupEndsUTF8, endUTF8, i);
3604             }
3605         }
3606     }
3607 
3608     if (regionStart>=0) {
3609        matcher->region(regionStart, regionEnd, status);
3610        REGEX_CHECK_STATUS_L(line);
3611        if (UTF8Matcher != NULL) {
3612            UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3613            REGEX_CHECK_STATUS_L(line);
3614        }
3615     }
3616     if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
3617         matcher->useAnchoringBounds(FALSE);
3618         if (UTF8Matcher != NULL) {
3619             UTF8Matcher->useAnchoringBounds(FALSE);
3620         }
3621     }
3622     if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
3623         matcher->useTransparentBounds(TRUE);
3624         if (UTF8Matcher != NULL) {
3625             UTF8Matcher->useTransparentBounds(TRUE);
3626         }
3627     }
3628 
3629 
3630 
3631     //
3632     // Do a find on the de-tagged input using the caller's pattern
3633     //     TODO: error on count>1 and not find().
3634     //           error on both matches() and lookingAt().
3635     //
3636     for (i=0; i<numFinds; i++) {
3637         if (useMatchesFunc) {
3638             isMatch = matcher->matches(status);
3639             if (UTF8Matcher != NULL) {
3640                isUTF8Match = UTF8Matcher->matches(status);
3641             }
3642         } else  if (useLookingAtFunc) {
3643             isMatch = matcher->lookingAt(status);
3644             if (UTF8Matcher != NULL) {
3645                 isUTF8Match = UTF8Matcher->lookingAt(status);
3646             }
3647         } else {
3648             isMatch = matcher->find();
3649             if (UTF8Matcher != NULL) {
3650                 isUTF8Match = UTF8Matcher->find();
3651             }
3652         }
3653     }
3654     matcher->setTrace(FALSE);
3655     if (UTF8Matcher) {
3656         UTF8Matcher->setTrace(FALSE);
3657     }
3658     if (U_FAILURE(status)) {
3659         errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3660     }
3661 
3662     //
3663     // Match up the groups from the find() with the groups from the tags
3664     //
3665 
3666     // number of tags should match number of groups from find operation.
3667     // matcher->groupCount does not include group 0, the entire match, hence the +1.
3668     //   G option in test means that capture group data is not available in the
3669     //     expected results, so the check needs to be suppressed.
3670     if (isMatch == FALSE && groupStarts.size() != 0) {
3671         dataerrln("Error at line %d:  Match expected, but none found.", line);
3672         failed = TRUE;
3673         goto cleanupAndReturn;
3674     } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3675         errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
3676         failed = TRUE;
3677         goto cleanupAndReturn;
3678     }
3679     if (isMatch && groupStarts.size() == 0) {
3680         errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status));
3681         failed = TRUE;
3682     }
3683     if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) {
3684         errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status));
3685         failed = TRUE;
3686     }
3687 
3688     if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3689         // Only check for match / no match.  Don't check capture groups.
3690         goto cleanupAndReturn;
3691     }
3692 
3693     REGEX_CHECK_STATUS_L(line);
3694     for (i=0; i<=matcher->groupCount(); i++) {
3695         int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3696         int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3697         if (matcher->start(i, status) != expectedStart) {
3698             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
3699                 line, i, expectedStart, matcher->start(i, status));
3700             failed = TRUE;
3701             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3702         } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3703             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
3704                   line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3705             failed = TRUE;
3706             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3707         }
3708 
3709         int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3710         int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3711         if (matcher->end(i, status) != expectedEnd) {
3712             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
3713                 line, i, expectedEnd, matcher->end(i, status));
3714             failed = TRUE;
3715             // Error on end position;  keep going; real error is probably yet to come as group
3716             //   end positions work from end of the input data towards the front.
3717         } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3718             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
3719                   line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3720             failed = TRUE;
3721             // Error on end position;  keep going; real error is probably yet to come as group
3722             //   end positions work from end of the input data towards the front.
3723         }
3724     }
3725     if ( matcher->groupCount()+1 < groupStarts.size()) {
3726         errln("Error at line %d: Expected %d capture groups, found %d.",
3727             line, groupStarts.size()-1, matcher->groupCount());
3728         failed = TRUE;
3729         }
3730     else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3731         errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3732               line, groupStarts.size()-1, UTF8Matcher->groupCount());
3733         failed = TRUE;
3734     }
3735 
3736     if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3737         matcher->requireEnd() == TRUE) {
3738         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
3739         failed = TRUE;
3740     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3741         UTF8Matcher->requireEnd() == TRUE) {
3742         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3743         failed = TRUE;
3744     }
3745 
3746     if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
3747         matcher->requireEnd() == FALSE) {
3748         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
3749         failed = TRUE;
3750     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3751         UTF8Matcher->requireEnd() == FALSE) {
3752         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3753         failed = TRUE;
3754     }
3755 
3756     if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3757         matcher->hitEnd() == TRUE) {
3758         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
3759         failed = TRUE;
3760     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3761                UTF8Matcher->hitEnd() == TRUE) {
3762         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3763         failed = TRUE;
3764     }
3765 
3766     if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3767         matcher->hitEnd() == FALSE) {
3768         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
3769         failed = TRUE;
3770     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3771                UTF8Matcher->hitEnd() == FALSE) {
3772         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3773         failed = TRUE;
3774     }
3775 
3776 
3777 cleanupAndReturn:
3778     if (failed) {
3779         infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
3780             +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
3781         // callerPattern->dump();
3782     }
3783     delete parseMatcher;
3784     delete parsePat;
3785     delete UTF8Matcher;
3786     delete UTF8Pattern;
3787     delete matcher;
3788     delete callerPattern;
3789 
3790     utext_close(&inputText);
3791     delete[] inputChars;
3792     utext_close(&patternText);
3793     delete[] patternChars;
3794     ucnv_close(UTF8Converter);
3795 }
3796 
3797 
3798 
3799 
3800 //---------------------------------------------------------------------------
3801 //
3802 //      Errors     Check for error handling in patterns.
3803 //
3804 //---------------------------------------------------------------------------
Errors()3805 void RegexTest::Errors() {
3806     // \escape sequences that aren't implemented yet.
3807     //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3808 
3809     // Missing close parentheses
3810     REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3811     REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3812     REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3813 
3814     // Extra close paren
3815     REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3816     REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3817     REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3818 
3819     // Look-ahead, Look-behind
3820     //  TODO:  add tests for unbounded length look-behinds.
3821     REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
3822 
3823     // Attempt to use non-default flags
3824     {
3825         UParseError   pe;
3826         UErrorCode    status = U_ZERO_ERROR;
3827         int32_t       flags  = UREGEX_CANON_EQ |
3828                                UREGEX_COMMENTS         | UREGEX_DOTALL   |
3829                                UREGEX_MULTILINE;
3830         RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3831         REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3832         delete pat1;
3833     }
3834 
3835 
3836     // Quantifiers are allowed only after something that can be quantified.
3837     REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3838     REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3839     REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3840 
3841     // Mal-formed {min,max} quantifiers
3842     REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3843     REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3844     REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3845     REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3846     REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3847     REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3848     REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
3849     REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
3850     REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3851 
3852     // Ticket 5389
3853     REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3854 
3855     // Invalid Back Reference \0
3856     //    For ICU 3.8 and earlier
3857     //    For ICU versions newer than 3.8, \0 introduces an octal escape.
3858     //
3859     REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3860 
3861 }
3862 
3863 
3864 //-------------------------------------------------------------------------------
3865 //
3866 //  Read a text data file, convert it to UChars, and return the data
3867 //    in one big UChar * buffer, which the caller must delete.
3868 //
3869 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int32_t & ulen,const char * defEncoding,UErrorCode & status)3870 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3871                                      const char *defEncoding, UErrorCode &status) {
3872     UChar       *retPtr  = NULL;
3873     char        *fileBuf = NULL;
3874     UConverter* conv     = NULL;
3875     FILE        *f       = NULL;
3876 
3877     ulen = 0;
3878     if (U_FAILURE(status)) {
3879         return retPtr;
3880     }
3881 
3882     //
3883     //  Open the file.
3884     //
3885     f = fopen(fileName, "rb");
3886     if (f == 0) {
3887         dataerrln("Error opening test data file %s\n", fileName);
3888         status = U_FILE_ACCESS_ERROR;
3889         return NULL;
3890     }
3891     //
3892     //  Read it in
3893     //
3894     int32_t            fileSize;
3895     int32_t            amt_read;
3896 
3897     fseek( f, 0, SEEK_END);
3898     fileSize = ftell(f);
3899     fileBuf = new char[fileSize];
3900     fseek(f, 0, SEEK_SET);
3901     amt_read = static_cast<int32_t>(fread(fileBuf, 1, fileSize, f));
3902     if (amt_read != fileSize || fileSize <= 0) {
3903         errln("Error reading test data file.");
3904         goto cleanUpAndReturn;
3905     }
3906 
3907     //
3908     // Look for a Unicode Signature (BOM) on the data just read
3909     //
3910     int32_t        signatureLength;
3911     const char *   fileBufC;
3912     const char*    encoding;
3913 
3914     fileBufC = fileBuf;
3915     encoding = ucnv_detectUnicodeSignature(
3916         fileBuf, fileSize, &signatureLength, &status);
3917     if(encoding!=NULL ){
3918         fileBufC  += signatureLength;
3919         fileSize  -= signatureLength;
3920     } else {
3921         encoding = defEncoding;
3922         if (strcmp(encoding, "utf-8") == 0) {
3923             errln("file %s is missing its BOM", fileName);
3924         }
3925     }
3926 
3927     //
3928     // Open a converter to take the rule file to UTF-16
3929     //
3930     conv = ucnv_open(encoding, &status);
3931     if (U_FAILURE(status)) {
3932         goto cleanUpAndReturn;
3933     }
3934 
3935     //
3936     // Convert the rules to UChar.
3937     //  Preflight first to determine required buffer size.
3938     //
3939     ulen = ucnv_toUChars(conv,
3940         NULL,           //  dest,
3941         0,              //  destCapacity,
3942         fileBufC,
3943         fileSize,
3944         &status);
3945     if (status == U_BUFFER_OVERFLOW_ERROR) {
3946         // Buffer Overflow is expected from the preflight operation.
3947         status = U_ZERO_ERROR;
3948 
3949         retPtr = new UChar[ulen+1];
3950         ucnv_toUChars(conv,
3951             retPtr,       //  dest,
3952             ulen+1,
3953             fileBufC,
3954             fileSize,
3955             &status);
3956     }
3957 
3958 cleanUpAndReturn:
3959     fclose(f);
3960     delete[] fileBuf;
3961     ucnv_close(conv);
3962     if (U_FAILURE(status)) {
3963         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3964         delete []retPtr;
3965         retPtr = 0;
3966         ulen   = 0;
3967     }
3968     return retPtr;
3969 }
3970 
3971 
3972 //-------------------------------------------------------------------------------
3973 //
3974 //   PerlTests  - Run Perl's regular expression tests
3975 //                The input file for this test is re_tests, the standard regular
3976 //                expression test data distributed with the Perl source code.
3977 //
3978 //                Here is Perl's description of the test data file:
3979 //
3980 //        # The tests are in a separate file 't/op/re_tests'.
3981 //        # Each line in that file is a separate test.
3982 //        # There are five columns, separated by tabs.
3983 //        #
3984 //        # Column 1 contains the pattern, optionally enclosed in C<''>.
3985 //        # Modifiers can be put after the closing C<'>.
3986 //        #
3987 //        # Column 2 contains the string to be matched.
3988 //        #
3989 //        # Column 3 contains the expected result:
3990 //        #     y   expect a match
3991 //        #     n   expect no match
3992 //        #     c   expect an error
3993 //        # B   test exposes a known bug in Perl, should be skipped
3994 //        # b   test exposes a known bug in Perl, should be skipped if noamp
3995 //        #
3996 //        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3997 //        #
3998 //        # Column 4 contains a string, usually C<$&>.
3999 //        #
4000 //        # Column 5 contains the expected result of double-quote
4001 //        # interpolating that string after the match, or start of error message.
4002 //        #
4003 //        # Column 6, if present, contains a reason why the test is skipped.
4004 //        # This is printed with "skipped", for harness to pick up.
4005 //        #
4006 //        # \n in the tests are interpolated, as are variables of the form ${\w+}.
4007 //        #
4008 //        # If you want to add a regular expression test that can't be expressed
4009 //        # in this format, don't add it here: put it in op/pat.t instead.
4010 //
4011 //        For ICU, if field 3 contains an 'i', the test will be skipped.
4012 //        The test exposes some known incompatibility between ICU and Perl regexps.
4013 //        (The i is in addition to whatever was there before.)
4014 //
4015 //-------------------------------------------------------------------------------
PerlTests()4016 void RegexTest::PerlTests() {
4017     char tdd[2048];
4018     const char *srcPath;
4019     UErrorCode  status = U_ZERO_ERROR;
4020     UParseError pe;
4021 
4022     //
4023     //  Open and read the test data file.
4024     //
4025     srcPath=getPath(tdd, "re_tests.txt");
4026     if(srcPath==NULL) {
4027         return; /* something went wrong, error already output */
4028     }
4029 
4030     int32_t    len;
4031     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4032     if (U_FAILURE(status)) {
4033         return; /* something went wrong, error already output */
4034     }
4035 
4036     //
4037     //  Put the test data into a UnicodeString
4038     //
4039     UnicodeString testDataString(FALSE, testData, len);
4040 
4041     //
4042     //  Regex to break the input file into lines, and strip the new lines.
4043     //     One line per match, capture group one is the desired data.
4044     //
4045     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4046     if (U_FAILURE(status)) {
4047         dataerrln("RegexPattern::compile() error");
4048         return;
4049     }
4050     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4051 
4052     //
4053     //  Regex to split a test file line into fields.
4054     //    There are six fields, separated by tabs.
4055     //
4056     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4057 
4058     //
4059     //  Regex to identify test patterns with flag settings, and to separate them.
4060     //    Test patterns with flags look like 'pattern'i
4061     //    Test patterns without flags are not quoted:   pattern
4062     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4063     //
4064     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4065     RegexMatcher* flagMat = flagPat->matcher(status);
4066 
4067     //
4068     // The Perl tests reference several perl-isms, which are evaluated/substituted
4069     //   in the test data.  Not being perl, this must be done explicitly.  Here
4070     //   are string constants and REs for these constructs.
4071     //
4072     UnicodeString nulnulSrc("${nulnul}");
4073     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4074     nulnul = nulnul.unescape();
4075 
4076     UnicodeString ffffSrc("${ffff}");
4077     UnicodeString ffff("\\uffff", -1, US_INV);
4078     ffff = ffff.unescape();
4079 
4080     //  regexp for $-[0], $+[2], etc.
4081     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4082     RegexMatcher *groupsMat = groupsPat->matcher(status);
4083 
4084     //  regexp for $0, $1, $2, etc.
4085     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4086     RegexMatcher *cgMat = cgPat->matcher(status);
4087 
4088 
4089     //
4090     // Main Loop for the Perl Tests, runs once per line from the
4091     //   test data file.
4092     //
4093     int32_t  lineNum = 0;
4094     int32_t  skippedUnimplementedCount = 0;
4095     while (lineMat->find()) {
4096         lineNum++;
4097 
4098         //
4099         //  Get a line, break it into its fields, do the Perl
4100         //    variable substitutions.
4101         //
4102         UnicodeString line = lineMat->group(1, status);
4103         UnicodeString fields[7];
4104         fieldPat->split(line, fields, 7, status);
4105 
4106         flagMat->reset(fields[0]);
4107         flagMat->matches(status);
4108         UnicodeString pattern  = flagMat->group(2, status);
4109         pattern.findAndReplace("${bang}", "!");
4110         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4111         pattern.findAndReplace(ffffSrc, ffff);
4112 
4113         //
4114         //  Identify patterns that include match flag settings,
4115         //    split off the flags, remove the extra quotes.
4116         //
4117         UnicodeString flagStr = flagMat->group(3, status);
4118         if (U_FAILURE(status)) {
4119             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4120             return;
4121         }
4122         int32_t flags = 0;
4123         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4124         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4125         const UChar UChar_m = 0x6d;
4126         const UChar UChar_x = 0x78;
4127         const UChar UChar_y = 0x79;
4128         if (flagStr.indexOf(UChar_i) != -1) {
4129             flags |= UREGEX_CASE_INSENSITIVE;
4130         }
4131         if (flagStr.indexOf(UChar_m) != -1) {
4132             flags |= UREGEX_MULTILINE;
4133         }
4134         if (flagStr.indexOf(UChar_x) != -1) {
4135             flags |= UREGEX_COMMENTS;
4136         }
4137 
4138         //
4139         // Compile the test pattern.
4140         //
4141         status = U_ZERO_ERROR;
4142         RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4143         if (status == U_REGEX_UNIMPLEMENTED) {
4144             //
4145             // Test of a feature that is planned for ICU, but not yet implemented.
4146             //   skip the test.
4147             skippedUnimplementedCount++;
4148             delete testPat;
4149             status = U_ZERO_ERROR;
4150             continue;
4151         }
4152 
4153         if (U_FAILURE(status)) {
4154             // Some tests are supposed to generate errors.
4155             //   Only report an error for tests that are supposed to succeed.
4156             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4157                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4158             {
4159                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4160             }
4161             status = U_ZERO_ERROR;
4162             delete testPat;
4163             continue;
4164         }
4165 
4166         if (fields[2].indexOf(UChar_i) >= 0) {
4167             // ICU should skip this test.
4168             delete testPat;
4169             continue;
4170         }
4171 
4172         if (fields[2].indexOf(UChar_c) >= 0) {
4173             // This pattern should have caused a compilation error, but didn't/
4174             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4175             delete testPat;
4176             continue;
4177         }
4178 
4179         //
4180         // replace the Perl variables that appear in some of the
4181         //   match data strings.
4182         //
4183         UnicodeString matchString = fields[1];
4184         matchString.findAndReplace(nulnulSrc, nulnul);
4185         matchString.findAndReplace(ffffSrc,   ffff);
4186 
4187         // Replace any \n in the match string with an actual new-line char.
4188         //  Don't do full unescape, as this unescapes more than Perl does, which
4189         //  causes other spurious failures in the tests.
4190         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4191 
4192 
4193 
4194         //
4195         // Run the test, check for expected match/don't match result.
4196         //
4197         RegexMatcher *testMat = testPat->matcher(matchString, status);
4198         UBool found = testMat->find();
4199         UBool expected = FALSE;
4200         if (fields[2].indexOf(UChar_y) >=0) {
4201             expected = TRUE;
4202         }
4203         if (expected != found) {
4204             errln("line %d: Expected %smatch, got %smatch",
4205                 lineNum, expected?"":"no ", found?"":"no " );
4206             continue;
4207         }
4208 
4209         // Don't try to check expected results if there is no match.
4210         //   (Some have stuff in the expected fields)
4211         if (!found) {
4212             delete testMat;
4213             delete testPat;
4214             continue;
4215         }
4216 
4217         //
4218         // Interpret the Perl expression from the fourth field of the data file,
4219         // building up an ICU string from the results of the ICU match.
4220         //   The Perl expression will contain references to the results of
4221         //     a regex match, including the matched string, capture group strings,
4222         //     group starting and ending indicies, etc.
4223         //
4224         UnicodeString resultString;
4225         UnicodeString perlExpr = fields[3];
4226 #if SUPPORT_MUTATING_INPUT_STRING
4227         groupsMat->reset(perlExpr);
4228         cgMat->reset(perlExpr);
4229 #endif
4230 
4231         while (perlExpr.length() > 0) {
4232 #if !SUPPORT_MUTATING_INPUT_STRING
4233             //  Perferred usage.  Reset after any modification to input string.
4234             groupsMat->reset(perlExpr);
4235             cgMat->reset(perlExpr);
4236 #endif
4237 
4238             if (perlExpr.startsWith("$&")) {
4239                 resultString.append(testMat->group(status));
4240                 perlExpr.remove(0, 2);
4241             }
4242 
4243             else if (groupsMat->lookingAt(status)) {
4244                 // $-[0]   $+[2]  etc.
4245                 UnicodeString digitString = groupsMat->group(2, status);
4246                 int32_t t = 0;
4247                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4248                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4249                 int32_t matchPosition;
4250                 if (plusOrMinus.compare("+") == 0) {
4251                     matchPosition = testMat->end(groupNum, status);
4252                 } else {
4253                     matchPosition = testMat->start(groupNum, status);
4254                 }
4255                 if (matchPosition != -1) {
4256                     ICU_Utility::appendNumber(resultString, matchPosition);
4257                 }
4258                 perlExpr.remove(0, groupsMat->end(status));
4259             }
4260 
4261             else if (cgMat->lookingAt(status)) {
4262                 // $1, $2, $3, etc.
4263                 UnicodeString digitString = cgMat->group(1, status);
4264                 int32_t t = 0;
4265                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4266                 if (U_SUCCESS(status)) {
4267                     resultString.append(testMat->group(groupNum, status));
4268                     status = U_ZERO_ERROR;
4269                 }
4270                 perlExpr.remove(0, cgMat->end(status));
4271             }
4272 
4273             else if (perlExpr.startsWith("@-")) {
4274                 int32_t i;
4275                 for (i=0; i<=testMat->groupCount(); i++) {
4276                     if (i>0) {
4277                         resultString.append(" ");
4278                     }
4279                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4280                 }
4281                 perlExpr.remove(0, 2);
4282             }
4283 
4284             else if (perlExpr.startsWith("@+")) {
4285                 int32_t i;
4286                 for (i=0; i<=testMat->groupCount(); i++) {
4287                     if (i>0) {
4288                         resultString.append(" ");
4289                     }
4290                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4291                 }
4292                 perlExpr.remove(0, 2);
4293             }
4294 
4295             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4296                                                      //           or as an escaped sequence (e.g. \n)
4297                 if (perlExpr.length() > 1) {
4298                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4299                 }
4300                 UChar c = perlExpr.charAt(0);
4301                 switch (c) {
4302                 case 'n':   c = '\n'; break;
4303                 // add any other escape sequences that show up in the test expected results.
4304                 }
4305                 resultString.append(c);
4306                 perlExpr.remove(0, 1);
4307             }
4308 
4309             else  {
4310                 // Any characters from the perl expression that we don't explicitly
4311                 //  recognize before here are assumed to be literals and copied
4312                 //  as-is to the expected results.
4313                 resultString.append(perlExpr.charAt(0));
4314                 perlExpr.remove(0, 1);
4315             }
4316 
4317             if (U_FAILURE(status)) {
4318                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4319                 break;
4320             }
4321         }
4322 
4323         //
4324         // Expected Results Compare
4325         //
4326         UnicodeString expectedS(fields[4]);
4327         expectedS.findAndReplace(nulnulSrc, nulnul);
4328         expectedS.findAndReplace(ffffSrc,   ffff);
4329         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4330 
4331 
4332         if (expectedS.compare(resultString) != 0) {
4333             err("Line %d: Incorrect perl expression results.", lineNum);
4334             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4335         }
4336 
4337         delete testMat;
4338         delete testPat;
4339     }
4340 
4341     //
4342     // All done.  Clean up allocated stuff.
4343     //
4344     delete cgMat;
4345     delete cgPat;
4346 
4347     delete groupsMat;
4348     delete groupsPat;
4349 
4350     delete flagMat;
4351     delete flagPat;
4352 
4353     delete lineMat;
4354     delete linePat;
4355 
4356     delete fieldPat;
4357     delete [] testData;
4358 
4359 
4360     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4361 
4362 }
4363 
4364 
4365 //-------------------------------------------------------------------------------
4366 //
4367 //   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
4368 //                  (instead of using UnicodeStrings) to test the alternate engine.
4369 //                  The input file for this test is re_tests, the standard regular
4370 //                  expression test data distributed with the Perl source code.
4371 //                  See PerlTests() for more information.
4372 //
4373 //-------------------------------------------------------------------------------
PerlTestsUTF8()4374 void RegexTest::PerlTestsUTF8() {
4375     char tdd[2048];
4376     const char *srcPath;
4377     UErrorCode  status = U_ZERO_ERROR;
4378     UParseError pe;
4379     LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4380     UText       patternText = UTEXT_INITIALIZER;
4381     char       *patternChars = NULL;
4382     int32_t     patternLength;
4383     int32_t     patternCapacity = 0;
4384     UText       inputText = UTEXT_INITIALIZER;
4385     char       *inputChars = NULL;
4386     int32_t     inputLength;
4387     int32_t     inputCapacity = 0;
4388 
4389     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4390 
4391     //
4392     //  Open and read the test data file.
4393     //
4394     srcPath=getPath(tdd, "re_tests.txt");
4395     if(srcPath==NULL) {
4396         return; /* something went wrong, error already output */
4397     }
4398 
4399     int32_t    len;
4400     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4401     if (U_FAILURE(status)) {
4402         return; /* something went wrong, error already output */
4403     }
4404 
4405     //
4406     //  Put the test data into a UnicodeString
4407     //
4408     UnicodeString testDataString(FALSE, testData, len);
4409 
4410     //
4411     //  Regex to break the input file into lines, and strip the new lines.
4412     //     One line per match, capture group one is the desired data.
4413     //
4414     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4415     if (U_FAILURE(status)) {
4416         dataerrln("RegexPattern::compile() error");
4417         return;
4418     }
4419     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4420 
4421     //
4422     //  Regex to split a test file line into fields.
4423     //    There are six fields, separated by tabs.
4424     //
4425     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4426 
4427     //
4428     //  Regex to identify test patterns with flag settings, and to separate them.
4429     //    Test patterns with flags look like 'pattern'i
4430     //    Test patterns without flags are not quoted:   pattern
4431     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4432     //
4433     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4434     RegexMatcher* flagMat = flagPat->matcher(status);
4435 
4436     //
4437     // The Perl tests reference several perl-isms, which are evaluated/substituted
4438     //   in the test data.  Not being perl, this must be done explicitly.  Here
4439     //   are string constants and REs for these constructs.
4440     //
4441     UnicodeString nulnulSrc("${nulnul}");
4442     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4443     nulnul = nulnul.unescape();
4444 
4445     UnicodeString ffffSrc("${ffff}");
4446     UnicodeString ffff("\\uffff", -1, US_INV);
4447     ffff = ffff.unescape();
4448 
4449     //  regexp for $-[0], $+[2], etc.
4450     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4451     RegexMatcher *groupsMat = groupsPat->matcher(status);
4452 
4453     //  regexp for $0, $1, $2, etc.
4454     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4455     RegexMatcher *cgMat = cgPat->matcher(status);
4456 
4457 
4458     //
4459     // Main Loop for the Perl Tests, runs once per line from the
4460     //   test data file.
4461     //
4462     int32_t  lineNum = 0;
4463     int32_t  skippedUnimplementedCount = 0;
4464     while (lineMat->find()) {
4465         lineNum++;
4466 
4467         //
4468         //  Get a line, break it into its fields, do the Perl
4469         //    variable substitutions.
4470         //
4471         UnicodeString line = lineMat->group(1, status);
4472         UnicodeString fields[7];
4473         fieldPat->split(line, fields, 7, status);
4474 
4475         flagMat->reset(fields[0]);
4476         flagMat->matches(status);
4477         UnicodeString pattern  = flagMat->group(2, status);
4478         pattern.findAndReplace("${bang}", "!");
4479         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4480         pattern.findAndReplace(ffffSrc, ffff);
4481 
4482         //
4483         //  Identify patterns that include match flag settings,
4484         //    split off the flags, remove the extra quotes.
4485         //
4486         UnicodeString flagStr = flagMat->group(3, status);
4487         if (U_FAILURE(status)) {
4488             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4489             return;
4490         }
4491         int32_t flags = 0;
4492         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4493         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4494         const UChar UChar_m = 0x6d;
4495         const UChar UChar_x = 0x78;
4496         const UChar UChar_y = 0x79;
4497         if (flagStr.indexOf(UChar_i) != -1) {
4498             flags |= UREGEX_CASE_INSENSITIVE;
4499         }
4500         if (flagStr.indexOf(UChar_m) != -1) {
4501             flags |= UREGEX_MULTILINE;
4502         }
4503         if (flagStr.indexOf(UChar_x) != -1) {
4504             flags |= UREGEX_COMMENTS;
4505         }
4506 
4507         //
4508         // Put the pattern in a UTF-8 UText
4509         //
4510         status = U_ZERO_ERROR;
4511         patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4512         if (status == U_BUFFER_OVERFLOW_ERROR) {
4513             status = U_ZERO_ERROR;
4514             delete[] patternChars;
4515             patternCapacity = patternLength + 1;
4516             patternChars = new char[patternCapacity];
4517             pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4518         }
4519         utext_openUTF8(&patternText, patternChars, patternLength, &status);
4520 
4521         //
4522         // Compile the test pattern.
4523         //
4524         RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4525         if (status == U_REGEX_UNIMPLEMENTED) {
4526             //
4527             // Test of a feature that is planned for ICU, but not yet implemented.
4528             //   skip the test.
4529             skippedUnimplementedCount++;
4530             delete testPat;
4531             status = U_ZERO_ERROR;
4532             continue;
4533         }
4534 
4535         if (U_FAILURE(status)) {
4536             // Some tests are supposed to generate errors.
4537             //   Only report an error for tests that are supposed to succeed.
4538             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4539                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4540             {
4541                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4542             }
4543             status = U_ZERO_ERROR;
4544             delete testPat;
4545             continue;
4546         }
4547 
4548         if (fields[2].indexOf(UChar_i) >= 0) {
4549             // ICU should skip this test.
4550             delete testPat;
4551             continue;
4552         }
4553 
4554         if (fields[2].indexOf(UChar_c) >= 0) {
4555             // This pattern should have caused a compilation error, but didn't/
4556             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4557             delete testPat;
4558             continue;
4559         }
4560 
4561 
4562         //
4563         // replace the Perl variables that appear in some of the
4564         //   match data strings.
4565         //
4566         UnicodeString matchString = fields[1];
4567         matchString.findAndReplace(nulnulSrc, nulnul);
4568         matchString.findAndReplace(ffffSrc,   ffff);
4569 
4570         // Replace any \n in the match string with an actual new-line char.
4571         //  Don't do full unescape, as this unescapes more than Perl does, which
4572         //  causes other spurious failures in the tests.
4573         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4574 
4575         //
4576         // Put the input in a UTF-8 UText
4577         //
4578         status = U_ZERO_ERROR;
4579         inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4580         if (status == U_BUFFER_OVERFLOW_ERROR) {
4581             status = U_ZERO_ERROR;
4582             delete[] inputChars;
4583             inputCapacity = inputLength + 1;
4584             inputChars = new char[inputCapacity];
4585             matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4586         }
4587         utext_openUTF8(&inputText, inputChars, inputLength, &status);
4588 
4589         //
4590         // Run the test, check for expected match/don't match result.
4591         //
4592         RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4593         UBool found = testMat->find();
4594         UBool expected = FALSE;
4595         if (fields[2].indexOf(UChar_y) >=0) {
4596             expected = TRUE;
4597         }
4598         if (expected != found) {
4599             errln("line %d: Expected %smatch, got %smatch",
4600                 lineNum, expected?"":"no ", found?"":"no " );
4601             continue;
4602         }
4603 
4604         // Don't try to check expected results if there is no match.
4605         //   (Some have stuff in the expected fields)
4606         if (!found) {
4607             delete testMat;
4608             delete testPat;
4609             continue;
4610         }
4611 
4612         //
4613         // Interpret the Perl expression from the fourth field of the data file,
4614         // building up an ICU string from the results of the ICU match.
4615         //   The Perl expression will contain references to the results of
4616         //     a regex match, including the matched string, capture group strings,
4617         //     group starting and ending indicies, etc.
4618         //
4619         UnicodeString resultString;
4620         UnicodeString perlExpr = fields[3];
4621 
4622         while (perlExpr.length() > 0) {
4623             groupsMat->reset(perlExpr);
4624             cgMat->reset(perlExpr);
4625 
4626             if (perlExpr.startsWith("$&")) {
4627                 resultString.append(testMat->group(status));
4628                 perlExpr.remove(0, 2);
4629             }
4630 
4631             else if (groupsMat->lookingAt(status)) {
4632                 // $-[0]   $+[2]  etc.
4633                 UnicodeString digitString = groupsMat->group(2, status);
4634                 int32_t t = 0;
4635                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4636                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4637                 int32_t matchPosition;
4638                 if (plusOrMinus.compare("+") == 0) {
4639                     matchPosition = testMat->end(groupNum, status);
4640                 } else {
4641                     matchPosition = testMat->start(groupNum, status);
4642                 }
4643                 if (matchPosition != -1) {
4644                     ICU_Utility::appendNumber(resultString, matchPosition);
4645                 }
4646                 perlExpr.remove(0, groupsMat->end(status));
4647             }
4648 
4649             else if (cgMat->lookingAt(status)) {
4650                 // $1, $2, $3, etc.
4651                 UnicodeString digitString = cgMat->group(1, status);
4652                 int32_t t = 0;
4653                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4654                 if (U_SUCCESS(status)) {
4655                     resultString.append(testMat->group(groupNum, status));
4656                     status = U_ZERO_ERROR;
4657                 }
4658                 perlExpr.remove(0, cgMat->end(status));
4659             }
4660 
4661             else if (perlExpr.startsWith("@-")) {
4662                 int32_t i;
4663                 for (i=0; i<=testMat->groupCount(); i++) {
4664                     if (i>0) {
4665                         resultString.append(" ");
4666                     }
4667                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4668                 }
4669                 perlExpr.remove(0, 2);
4670             }
4671 
4672             else if (perlExpr.startsWith("@+")) {
4673                 int32_t i;
4674                 for (i=0; i<=testMat->groupCount(); i++) {
4675                     if (i>0) {
4676                         resultString.append(" ");
4677                     }
4678                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4679                 }
4680                 perlExpr.remove(0, 2);
4681             }
4682 
4683             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4684                                                      //           or as an escaped sequence (e.g. \n)
4685                 if (perlExpr.length() > 1) {
4686                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4687                 }
4688                 UChar c = perlExpr.charAt(0);
4689                 switch (c) {
4690                 case 'n':   c = '\n'; break;
4691                 // add any other escape sequences that show up in the test expected results.
4692                 }
4693                 resultString.append(c);
4694                 perlExpr.remove(0, 1);
4695             }
4696 
4697             else  {
4698                 // Any characters from the perl expression that we don't explicitly
4699                 //  recognize before here are assumed to be literals and copied
4700                 //  as-is to the expected results.
4701                 resultString.append(perlExpr.charAt(0));
4702                 perlExpr.remove(0, 1);
4703             }
4704 
4705             if (U_FAILURE(status)) {
4706                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4707                 break;
4708             }
4709         }
4710 
4711         //
4712         // Expected Results Compare
4713         //
4714         UnicodeString expectedS(fields[4]);
4715         expectedS.findAndReplace(nulnulSrc, nulnul);
4716         expectedS.findAndReplace(ffffSrc,   ffff);
4717         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4718 
4719 
4720         if (expectedS.compare(resultString) != 0) {
4721             err("Line %d: Incorrect perl expression results.", lineNum);
4722             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4723         }
4724 
4725         delete testMat;
4726         delete testPat;
4727     }
4728 
4729     //
4730     // All done.  Clean up allocated stuff.
4731     //
4732     delete cgMat;
4733     delete cgPat;
4734 
4735     delete groupsMat;
4736     delete groupsPat;
4737 
4738     delete flagMat;
4739     delete flagPat;
4740 
4741     delete lineMat;
4742     delete linePat;
4743 
4744     delete fieldPat;
4745     delete [] testData;
4746 
4747     utext_close(&patternText);
4748     utext_close(&inputText);
4749 
4750     delete [] patternChars;
4751     delete [] inputChars;
4752 
4753 
4754     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4755 
4756 }
4757 
4758 
4759 //--------------------------------------------------------------
4760 //
4761 //  Bug6149   Verify limits to heap expansion for backtrack stack.
4762 //             Use this pattern,
4763 //                 "(a?){1,8000000}"
4764 //             Note: was an unbounded upperbounds, but that now has loop-breaking enabled.
4765 //                   This test is likely to be fragile, as further optimizations stop
4766 //                   more cases of pointless looping in the match engine.
4767 //
4768 //---------------------------------------------------------------
Bug6149()4769 void RegexTest::Bug6149() {
4770     UnicodeString pattern("(a?){1,8000000}");
4771     UnicodeString s("xyz");
4772     uint32_t flags = 0;
4773     UErrorCode status = U_ZERO_ERROR;
4774 
4775     RegexMatcher  matcher(pattern, s, flags, status);
4776     UBool result = false;
4777     REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4778     REGEX_ASSERT(result == FALSE);
4779  }
4780 
4781 
4782 //
4783 //   Callbacks()    Test the callback function.
4784 //                  When set, callbacks occur periodically during matching operations,
4785 //                  giving the application code the ability to abort the operation
4786 //                  before its normal completion.
4787 //
4788 
4789 struct callBackContext {
4790     RegexTest        *test;
4791     int32_t          maxCalls;
4792     int32_t          numCalls;
4793     int32_t          lastSteps;
resetcallBackContext4794     void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;}
4795 };
4796 
4797 U_CDECL_BEGIN
4798 static UBool U_CALLCONV
testCallBackFn(const void * context,int32_t steps)4799 testCallBackFn(const void *context, int32_t steps) {
4800     callBackContext  *info = (callBackContext *)context;
4801     if (info->lastSteps+1 != steps) {
4802         info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
4803     }
4804     info->lastSteps = steps;
4805     info->numCalls++;
4806     return (info->numCalls < info->maxCalls);
4807 }
4808 U_CDECL_END
4809 
Callbacks()4810 void RegexTest::Callbacks() {
4811    {
4812         // Getter returns NULLs if no callback has been set
4813 
4814         //   The variables that the getter will fill in.
4815         //   Init to non-null values so that the action of the getter can be seen.
4816         const void          *returnedContext = &returnedContext;
4817         URegexMatchCallback *returnedFn = &testCallBackFn;
4818 
4819         UErrorCode status = U_ZERO_ERROR;
4820         RegexMatcher matcher("x", 0, status);
4821         REGEX_CHECK_STATUS;
4822         matcher.getMatchCallback(returnedFn, returnedContext, status);
4823         REGEX_CHECK_STATUS;
4824         REGEX_ASSERT(returnedFn == NULL);
4825         REGEX_ASSERT(returnedContext == NULL);
4826     }
4827 
4828    {
4829         // Set and Get work
4830         callBackContext cbInfo = {this, 0, 0, 0};
4831         const void          *returnedContext;
4832         URegexMatchCallback *returnedFn;
4833         UErrorCode status = U_ZERO_ERROR;
4834         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
4835         REGEX_CHECK_STATUS;
4836         matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4837         REGEX_CHECK_STATUS;
4838         matcher.getMatchCallback(returnedFn, returnedContext, status);
4839         REGEX_CHECK_STATUS;
4840         REGEX_ASSERT(returnedFn == testCallBackFn);
4841         REGEX_ASSERT(returnedContext == &cbInfo);
4842 
4843         // A short-running match shouldn't invoke the callback
4844         status = U_ZERO_ERROR;
4845         cbInfo.reset(1);
4846         UnicodeString s = "xxx";
4847         matcher.reset(s);
4848         REGEX_ASSERT(matcher.matches(status));
4849         REGEX_CHECK_STATUS;
4850         REGEX_ASSERT(cbInfo.numCalls == 0);
4851 
4852         // A medium-length match that runs long enough to invoke the
4853         //   callback, but not so long that the callback aborts it.
4854         status = U_ZERO_ERROR;
4855         cbInfo.reset(4);
4856         s = "aaaaaaaaaaaaaaaaaaab";
4857         matcher.reset(s);
4858         REGEX_ASSERT(matcher.matches(status)==FALSE);
4859         REGEX_CHECK_STATUS;
4860         REGEX_ASSERT(cbInfo.numCalls > 0);
4861 
4862         // A longer running match that the callback function will abort.
4863         status = U_ZERO_ERROR;
4864         cbInfo.reset(4);
4865         s = "aaaaaaaaaaaaaaaaaaaaaaab";
4866         matcher.reset(s);
4867         REGEX_ASSERT(matcher.matches(status)==FALSE);
4868         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4869         REGEX_ASSERT(cbInfo.numCalls == 4);
4870 
4871         // A longer running find that the callback function will abort.
4872         status = U_ZERO_ERROR;
4873         cbInfo.reset(4);
4874         s = "aaaaaaaaaaaaaaaaaaaaaaab";
4875         matcher.reset(s);
4876         REGEX_ASSERT(matcher.find(status)==FALSE);
4877         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4878         REGEX_ASSERT(cbInfo.numCalls == 4);
4879     }
4880 
4881 
4882 }
4883 
4884 
4885 //
4886 //   FindProgressCallbacks()    Test the find "progress" callback function.
4887 //                  When set, the find progress callback will be invoked during a find operation
4888 //                  after each return from a match attempt, giving the application the opportunity
4889 //                  to terminate a long-running find operation before its normal completion.
4890 //
4891 
4892 struct progressCallBackContext {
4893     RegexTest        *test;
4894     int64_t          lastIndex;
4895     int32_t          maxCalls;
4896     int32_t          numCalls;
resetprogressCallBackContext4897     void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;}
4898 };
4899 
4900 // call-back function for find().
4901 // Return TRUE to continue the find().
4902 // Return FALSE to stop the find().
4903 U_CDECL_BEGIN
4904 static UBool U_CALLCONV
testProgressCallBackFn(const void * context,int64_t matchIndex)4905 testProgressCallBackFn(const void *context, int64_t matchIndex) {
4906     progressCallBackContext  *info = (progressCallBackContext *)context;
4907     info->numCalls++;
4908     info->lastIndex = matchIndex;
4909 //    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4910     return (info->numCalls < info->maxCalls);
4911 }
4912 U_CDECL_END
4913 
FindProgressCallbacks()4914 void RegexTest::FindProgressCallbacks() {
4915    {
4916         // Getter returns NULLs if no callback has been set
4917 
4918         //   The variables that the getter will fill in.
4919         //   Init to non-null values so that the action of the getter can be seen.
4920         const void                  *returnedContext = &returnedContext;
4921         URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;
4922 
4923         UErrorCode status = U_ZERO_ERROR;
4924         RegexMatcher matcher("x", 0, status);
4925         REGEX_CHECK_STATUS;
4926         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4927         REGEX_CHECK_STATUS;
4928         REGEX_ASSERT(returnedFn == NULL);
4929         REGEX_ASSERT(returnedContext == NULL);
4930     }
4931 
4932    {
4933         // Set and Get work
4934         progressCallBackContext cbInfo = {this, 0, 0, 0};
4935         const void                  *returnedContext;
4936         URegexFindProgressCallback  *returnedFn;
4937         UErrorCode status = U_ZERO_ERROR;
4938         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
4939         REGEX_CHECK_STATUS;
4940         matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4941         REGEX_CHECK_STATUS;
4942         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4943         REGEX_CHECK_STATUS;
4944         REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4945         REGEX_ASSERT(returnedContext == &cbInfo);
4946 
4947         // A find that matches on the initial position does NOT invoke the callback.
4948         status = U_ZERO_ERROR;
4949         cbInfo.reset(100);
4950         UnicodeString s = "aaxxx";
4951         matcher.reset(s);
4952 #if 0
4953         matcher.setTrace(TRUE);
4954 #endif
4955         REGEX_ASSERT(matcher.find(0, status));
4956         REGEX_CHECK_STATUS;
4957         REGEX_ASSERT(cbInfo.numCalls == 0);
4958 
4959         // A medium running find() that causes matcher.find() to invoke our callback for each index,
4960         //   but not so many times that we interrupt the operation.
4961         status = U_ZERO_ERROR;
4962         s = "aaaaaaaaaaaaaaaaaaab";
4963         cbInfo.reset(s.length()); //  Some upper limit for number of calls that is greater than size of our input string
4964         matcher.reset(s);
4965         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4966         REGEX_CHECK_STATUS;
4967         REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4968 
4969         // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4970         status = U_ZERO_ERROR;
4971         UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4972         cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
4973         matcher.reset(s1);
4974         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4975         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4976         REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4977 
4978         // Now a match that will succeed, but after an interruption
4979         status = U_ZERO_ERROR;
4980         UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4981         cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
4982         matcher.reset(s2);
4983         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4984         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4985         // Now retry the match from where left off
4986         cbInfo.maxCalls = 100; //  No callback limit
4987         status = U_ZERO_ERROR;
4988         REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
4989         REGEX_CHECK_STATUS;
4990     }
4991 
4992 
4993 }
4994 
4995 
4996 //---------------------------------------------------------------------------
4997 //
4998 //    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
4999 //                             UTexts. The pure-C implementation of UText
5000 //                             has no mutable backing stores, but we can
5001 //                             use UnicodeString here to test the functionality.
5002 //
5003 //---------------------------------------------------------------------------
PreAllocatedUTextCAPI()5004 void RegexTest::PreAllocatedUTextCAPI () {
5005     UErrorCode           status = U_ZERO_ERROR;
5006     URegularExpression  *re;
5007     UText                patternText = UTEXT_INITIALIZER;
5008     UnicodeString        buffer;
5009     UText                bufferText = UTEXT_INITIALIZER;
5010 
5011     utext_openUnicodeString(&bufferText, &buffer, &status);
5012 
5013     /*
5014      *  getText() and getUText()
5015      */
5016     {
5017         UText  text1 = UTEXT_INITIALIZER;
5018         UText  text2 = UTEXT_INITIALIZER;
5019         UChar  text2Chars[20];
5020         UText  *resultText;
5021 
5022         status = U_ZERO_ERROR;
5023         regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
5024         regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
5025         u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
5026         utext_openUChars(&text2, text2Chars, -1, &status);
5027 
5028         regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
5029         re = uregex_openUText(&patternText, 0, NULL, &status);
5030 
5031         /* First set a UText */
5032         uregex_setUText(re, &text1, &status);
5033         resultText = uregex_getUText(re, &bufferText, &status);
5034         REGEX_CHECK_STATUS;
5035         REGEX_ASSERT(resultText == &bufferText);
5036         utext_setNativeIndex(resultText, 0);
5037         utext_setNativeIndex(&text1, 0);
5038         REGEX_ASSERT(testUTextEqual(resultText, &text1));
5039 
5040         resultText = uregex_getUText(re, &bufferText, &status);
5041         REGEX_CHECK_STATUS;
5042         REGEX_ASSERT(resultText == &bufferText);
5043         utext_setNativeIndex(resultText, 0);
5044         utext_setNativeIndex(&text1, 0);
5045         REGEX_ASSERT(testUTextEqual(resultText, &text1));
5046 
5047         /* Then set a UChar * */
5048         uregex_setText(re, text2Chars, 7, &status);
5049         resultText = uregex_getUText(re, &bufferText, &status);
5050         REGEX_CHECK_STATUS;
5051         REGEX_ASSERT(resultText == &bufferText);
5052         utext_setNativeIndex(resultText, 0);
5053         utext_setNativeIndex(&text2, 0);
5054         REGEX_ASSERT(testUTextEqual(resultText, &text2));
5055 
5056         uregex_close(re);
5057         utext_close(&text1);
5058         utext_close(&text2);
5059     }
5060 
5061     /*
5062      *  group()
5063      */
5064     {
5065         UChar    text1[80];
5066         UText   *actual;
5067         UBool    result;
5068         int64_t  length = 0;
5069 
5070         u_uastrncpy(text1, "noise abc interior def, and this is off the end",  UPRV_LENGTHOF(text1));
5071         //                  012345678901234567890123456789012345678901234567
5072         //                  0         1         2         3         4
5073 
5074         status = U_ZERO_ERROR;
5075         re = uregex_openC("abc(.*?)def", 0, NULL, &status);
5076         REGEX_CHECK_STATUS;
5077 
5078         uregex_setText(re, text1, -1, &status);
5079         result = uregex_find(re, 0, &status);
5080         REGEX_ASSERT(result==TRUE);
5081 
5082         /*  Capture Group 0, the full match.  Should succeed. "abc interior def" */
5083         status = U_ZERO_ERROR;
5084         actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
5085         REGEX_CHECK_STATUS;
5086         REGEX_ASSERT(actual == &bufferText);
5087         REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
5088         REGEX_ASSERT(length == 16);
5089         REGEX_ASSERT(utext_nativeLength(actual) == 47);
5090 
5091         /*  Capture group #1.  Should succeed, matching " interior ". */
5092         status = U_ZERO_ERROR;
5093         actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
5094         REGEX_CHECK_STATUS;
5095         REGEX_ASSERT(actual == &bufferText);
5096         REGEX_ASSERT(utext_getNativeIndex(actual) == 9);   // position of " interior "
5097         REGEX_ASSERT(length == 10);
5098         REGEX_ASSERT(utext_nativeLength(actual) == 47);
5099 
5100         /*  Capture group out of range.  Error. */
5101         status = U_ZERO_ERROR;
5102         actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
5103         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5104         REGEX_ASSERT(actual == &bufferText);
5105         uregex_close(re);
5106 
5107     }
5108 
5109     /*
5110      *  replaceFirst()
5111      */
5112     {
5113         UChar    text1[80];
5114         UChar    text2[80];
5115         UText    replText = UTEXT_INITIALIZER;
5116         UText   *result;
5117         status = U_ZERO_ERROR;
5118         utext_openUnicodeString(&bufferText, &buffer, &status);
5119 
5120         status = U_ZERO_ERROR;
5121         u_uastrncpy(text1, "Replace xaax x1x x...x.",  UPRV_LENGTHOF(text1));
5122         u_uastrncpy(text2, "No match here.",  UPRV_LENGTHOF(text2)/2);
5123         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5124 
5125         re = uregex_openC("x(.*?)x", 0, NULL, &status);
5126         REGEX_CHECK_STATUS;
5127 
5128         /*  Normal case, with match */
5129         uregex_setText(re, text1, -1, &status);
5130         REGEX_CHECK_STATUS;
5131         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5132         REGEX_CHECK_STATUS;
5133         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5134         REGEX_CHECK_STATUS;
5135         REGEX_ASSERT(result == &bufferText);
5136         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5137 
5138         /* No match.  Text should copy to output with no changes.  */
5139         uregex_setText(re, text2, -1, &status);
5140         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5141         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5142         REGEX_CHECK_STATUS;
5143         REGEX_ASSERT(result == &bufferText);
5144         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5145 
5146         /* Unicode escapes */
5147         uregex_setText(re, text1, -1, &status);
5148         regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
5149         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5150         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5151         REGEX_CHECK_STATUS;
5152         REGEX_ASSERT(result == &bufferText);
5153         REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5154 
5155         uregex_close(re);
5156         utext_close(&replText);
5157     }
5158 
5159 
5160     /*
5161      *  replaceAll()
5162      */
5163     {
5164         UChar    text1[80];
5165         UChar    text2[80];
5166         UText    replText = UTEXT_INITIALIZER;
5167         UText   *result;
5168 
5169         status = U_ZERO_ERROR;
5170         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
5171         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
5172         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5173 
5174         re = uregex_openC("x(.*?)x", 0, NULL, &status);
5175         REGEX_CHECK_STATUS;
5176 
5177         /*  Normal case, with match */
5178         uregex_setText(re, text1, -1, &status);
5179         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5180         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5181         REGEX_CHECK_STATUS;
5182         REGEX_ASSERT(result == &bufferText);
5183         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5184 
5185         /* No match.  Text should copy to output with no changes.  */
5186         uregex_setText(re, text2, -1, &status);
5187         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5188         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5189         REGEX_CHECK_STATUS;
5190         REGEX_ASSERT(result == &bufferText);
5191         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5192 
5193         uregex_close(re);
5194         utext_close(&replText);
5195     }
5196 
5197 
5198     /*
5199      *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5200      *   so we don't need to test it here.
5201      */
5202 
5203     utext_close(&bufferText);
5204     utext_close(&patternText);
5205 }
5206 
5207 
5208 //--------------------------------------------------------------
5209 //
5210 //  NamedCapture   Check basic named capture group functionality
5211 //
5212 //--------------------------------------------------------------
NamedCapture()5213 void RegexTest::NamedCapture() {
5214     UErrorCode status = U_ZERO_ERROR;
5215     RegexPattern *pat = RegexPattern::compile(UnicodeString(
5216             "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
5217     REGEX_CHECK_STATUS;
5218     int32_t group = pat->groupNumberFromName("five", -1, status);
5219     REGEX_CHECK_STATUS;
5220     REGEX_ASSERT(5 == group);
5221     group = pat->groupNumberFromName("three", -1, status);
5222     REGEX_CHECK_STATUS;
5223     REGEX_ASSERT(3 == group);
5224 
5225     status = U_ZERO_ERROR;
5226     group = pat->groupNumberFromName(UnicodeString("six"), status);
5227     REGEX_CHECK_STATUS;
5228     REGEX_ASSERT(6 == group);
5229 
5230     status = U_ZERO_ERROR;
5231     group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
5232     U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5233 
5234     status = U_ZERO_ERROR;
5235 
5236     // After copying a pattern, named capture should still work in the copy.
5237     RegexPattern *copiedPat = new RegexPattern(*pat);
5238     REGEX_ASSERT(*copiedPat == *pat);
5239     delete pat; pat = NULL;  // Delete original, copy should have no references back to it.
5240 
5241     group = copiedPat->groupNumberFromName("five", -1, status);
5242     REGEX_CHECK_STATUS;
5243     REGEX_ASSERT(5 == group);
5244     group = copiedPat->groupNumberFromName("three", -1, status);
5245     REGEX_CHECK_STATUS;
5246     REGEX_ASSERT(3 == group);
5247     delete copiedPat;
5248 
5249     // ReplaceAll with named capture group.
5250     status = U_ZERO_ERROR;
5251     UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
5252     RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
5253     REGEX_CHECK_STATUS;
5254     // m.pattern().dumpPattern();
5255     UnicodeString replacedText = m->replaceAll("'${mid}'", status);
5256     REGEX_CHECK_STATUS;
5257     REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
5258     delete m;
5259 
5260     // ReplaceAll, allowed capture group numbers.
5261     text = UnicodeString("abcmxyz");
5262     m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
5263     REGEX_CHECK_STATUS;
5264 
5265     status = U_ZERO_ERROR;
5266     replacedText  = m->replaceAll(UnicodeString("<$0>"), status);   // group 0, full match, is allowed.
5267     REGEX_CHECK_STATUS;
5268     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
5269 
5270     status = U_ZERO_ERROR;
5271     replacedText  = m->replaceAll(UnicodeString("<$1>"), status);      // group 1 by number.
5272     REGEX_CHECK_STATUS;
5273     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5274 
5275     status = U_ZERO_ERROR;
5276     replacedText  = m->replaceAll(UnicodeString("<${one}>"), status);   // group 1 by name.
5277     REGEX_CHECK_STATUS;
5278     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5279 
5280     status = U_ZERO_ERROR;
5281     replacedText  = m->replaceAll(UnicodeString("<$2>"), status);   // group 2.
5282     REGEX_CHECK_STATUS;
5283     REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
5284 
5285     status = U_ZERO_ERROR;
5286     replacedText  = m->replaceAll(UnicodeString("<$3>"), status);
5287     REGEX_CHECK_STATUS;
5288     REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
5289 
5290     status = U_ZERO_ERROR;
5291     replacedText  = m->replaceAll(UnicodeString("<$4>"), status);
5292     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5293 
5294     status = U_ZERO_ERROR;
5295     replacedText  = m->replaceAll(UnicodeString("<$04>"), status);      // group 0, leading 0,
5296     REGEX_CHECK_STATUS;                                                 //    trailing out-of-range 4 passes through.
5297     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
5298 
5299     status = U_ZERO_ERROR;
5300     replacedText  = m->replaceAll(UnicodeString("<$000016>"), status);  // Consume leading zeroes. Don't consume digits
5301     REGEX_CHECK_STATUS;                                                 //   that push group num out of range.
5302     REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText);              //   This is group 1.
5303 
5304     status = U_ZERO_ERROR;
5305     replacedText  = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
5306     REGEX_CHECK_STATUS;
5307     REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
5308 
5309     status = U_ZERO_ERROR;
5310     replacedText  = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
5311     REGEX_CHECK_STATUS;
5312     REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
5313 
5314     status = U_ZERO_ERROR;
5315     replacedText  = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
5316     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5317 
5318     status = U_ZERO_ERROR;
5319     replacedText  = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
5320     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5321 
5322     status = U_ZERO_ERROR;
5323     replacedText  = m->replaceAll(UnicodeString("<${one"), status);
5324     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5325 
5326     status = U_ZERO_ERROR;
5327     replacedText  = m->replaceAll(UnicodeString("$not a capture group"), status);
5328     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5329 
5330     delete m;
5331 
5332     // Repeat the above replaceAll() tests using the plain C API, which
5333     //  has a separate implementation internally.
5334     //  TODO: factor out the test data.
5335 
5336     status = U_ZERO_ERROR;
5337     URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
5338     REGEX_CHECK_STATUS;
5339     text = UnicodeString("abcmxyz");
5340     uregex_setText(re, text.getBuffer(), text.length(), &status);
5341     REGEX_CHECK_STATUS;
5342 
5343     UChar resultBuf[100];
5344     int32_t resultLength;
5345     UnicodeString repl;
5346 
5347     status = U_ZERO_ERROR;
5348     repl = UnicodeString("<$0>");
5349     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5350     REGEX_CHECK_STATUS;
5351     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
5352 
5353     status = U_ZERO_ERROR;
5354     repl = UnicodeString("<$1>");
5355     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5356     REGEX_CHECK_STATUS;
5357     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5358 
5359     status = U_ZERO_ERROR;
5360     repl = UnicodeString("<${one}>");
5361     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5362     REGEX_CHECK_STATUS;
5363     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5364 
5365     status = U_ZERO_ERROR;
5366     repl = UnicodeString("<$2>");
5367     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5368     REGEX_CHECK_STATUS;
5369     REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
5370 
5371     status = U_ZERO_ERROR;
5372     repl = UnicodeString("<$3>");
5373     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5374     REGEX_CHECK_STATUS;
5375     REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
5376 
5377     status = U_ZERO_ERROR;
5378     repl = UnicodeString("<$4>");
5379     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5380     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5381 
5382     status = U_ZERO_ERROR;
5383     repl = UnicodeString("<$04>");
5384     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5385     REGEX_CHECK_STATUS;
5386     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
5387 
5388     status = U_ZERO_ERROR;
5389     repl = UnicodeString("<$000016>");
5390     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5391     REGEX_CHECK_STATUS;
5392     REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
5393 
5394     status = U_ZERO_ERROR;
5395     repl = UnicodeString("<$3$2$1${one}>");
5396     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5397     REGEX_CHECK_STATUS;
5398     REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
5399 
5400     status = U_ZERO_ERROR;
5401     repl = UnicodeString("$3$2$1${one}");
5402     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5403     REGEX_CHECK_STATUS;
5404     REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
5405 
5406     status = U_ZERO_ERROR;
5407     repl = UnicodeString("<${noSuchName}>");
5408     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5409     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5410 
5411     status = U_ZERO_ERROR;
5412     repl = UnicodeString("<${invalid-name}>");
5413     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5414     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5415 
5416     status = U_ZERO_ERROR;
5417     repl = UnicodeString("<${one");
5418     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5419     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5420 
5421     status = U_ZERO_ERROR;
5422     repl = UnicodeString("$not a capture group");
5423     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5424     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5425 
5426     uregex_close(re);
5427 }
5428 
5429 //--------------------------------------------------------------
5430 //
5431 //  NamedCaptureLimits   Patterns with huge numbers of named capture groups.
5432 //                       The point is not so much what the exact limit is,
5433 //                       but that a largish number doesn't hit bad non-linear performance,
5434 //                       and that exceeding the limit fails cleanly.
5435 //
5436 //--------------------------------------------------------------
NamedCaptureLimits()5437 void RegexTest::NamedCaptureLimits() {
5438     if (quick) {
5439         logln("Skipping test. Runs in exhuastive mode only.");
5440         return;
5441     }
5442     const int32_t goodLimit = 1000000;     // Pattern w this many groups builds successfully.
5443     const int32_t failLimit = 10000000;    // Pattern exceeds internal limits, fails to compile.
5444     char nnbuf[100];
5445     UnicodeString pattern;
5446     int32_t nn;
5447 
5448     for (nn=1; nn<goodLimit; nn++) {
5449         sprintf(nnbuf, "(?<nn%d>)", nn);
5450         pattern.append(UnicodeString(nnbuf, -1, US_INV));
5451     }
5452     UErrorCode status = U_ZERO_ERROR;
5453     RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
5454     REGEX_CHECK_STATUS;
5455     for (nn=1; nn<goodLimit; nn++) {
5456         sprintf(nnbuf, "nn%d", nn);
5457         int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
5458         REGEX_ASSERT(nn == groupNum);
5459         if (nn != groupNum) {
5460             break;
5461         }
5462     }
5463     delete pat;
5464 
5465     pattern.remove();
5466     for (nn=1; nn<failLimit; nn++) {
5467         sprintf(nnbuf, "(?<nn%d>)", nn);
5468         pattern.append(UnicodeString(nnbuf, -1, US_INV));
5469     }
5470     status = U_ZERO_ERROR;
5471     pat = RegexPattern::compile(pattern, 0, status);
5472     REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
5473     delete pat;
5474 }
5475 
5476 
5477 //--------------------------------------------------------------
5478 //
5479 //  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
5480 //
5481 //---------------------------------------------------------------
Bug7651()5482 void RegexTest::Bug7651() {
5483     UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5484     //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5485     //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5486     UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5487     UnicodeString s("#ff @abcd This is test");
5488     RegexPattern  *REPattern = NULL;
5489     RegexMatcher  *REMatcher = NULL;
5490     UErrorCode status = U_ZERO_ERROR;
5491     UParseError pe;
5492 
5493     REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5494     REGEX_CHECK_STATUS;
5495     REMatcher = REPattern->matcher(s, status);
5496     REGEX_CHECK_STATUS;
5497     REGEX_ASSERT(REMatcher->find());
5498     REGEX_ASSERT(REMatcher->start(status) == 0);
5499     delete REPattern;
5500     delete REMatcher;
5501     status = U_ZERO_ERROR;
5502 
5503     REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5504     REGEX_CHECK_STATUS;
5505     REMatcher = REPattern->matcher(s, status);
5506     REGEX_CHECK_STATUS;
5507     REGEX_ASSERT(REMatcher->find());
5508     REGEX_ASSERT(REMatcher->start(status) == 0);
5509     delete REPattern;
5510     delete REMatcher;
5511     status = U_ZERO_ERROR;
5512  }
5513 
Bug7740()5514 void RegexTest::Bug7740() {
5515     UErrorCode status = U_ZERO_ERROR;
5516     UnicodeString pattern = "(a)";
5517     UnicodeString text = "abcdef";
5518     RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5519     REGEX_CHECK_STATUS;
5520     REGEX_ASSERT(m->lookingAt(status));
5521     REGEX_CHECK_STATUS;
5522     status = U_ILLEGAL_ARGUMENT_ERROR;
5523     UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
5524     REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5525     REGEX_ASSERT(s == "");
5526     delete m;
5527 }
5528 
5529 // Bug 8479:  was crashing whith a Bogus UnicodeString as input.
5530 
Bug8479()5531 void RegexTest::Bug8479() {
5532     UErrorCode status = U_ZERO_ERROR;
5533 
5534     RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5535     REGEX_CHECK_STATUS;
5536     if (U_SUCCESS(status))
5537     {
5538         UnicodeString str;
5539         str.setToBogus();
5540         pMatcher->reset(str);
5541         status = U_ZERO_ERROR;
5542         pMatcher->matches(status);
5543         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5544         delete pMatcher;
5545     }
5546 }
5547 
5548 
5549 // Bug 7029
Bug7029()5550 void RegexTest::Bug7029() {
5551     UErrorCode status = U_ZERO_ERROR;
5552 
5553     RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5554     UnicodeString text = "abc.def";
5555     UnicodeString splits[10];
5556     REGEX_CHECK_STATUS;
5557     int32_t numFields = pMatcher->split(text, splits, 10, status);
5558     REGEX_CHECK_STATUS;
5559     REGEX_ASSERT(numFields == 8);
5560     delete pMatcher;
5561 }
5562 
5563 // Bug 9283
5564 //   This test is checking for the existance of any supplemental characters that case-fold
5565 //   to a bmp character.
5566 //
5567 //   At the time of this writing there are none. If any should appear in a subsequent release
5568 //   of Unicode, the code in regular expressions compilation that determines the longest
5569 //   posssible match for a literal string  will need to be enhanced.
5570 //
5571 //   See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5572 //   for details on what to do in case of a failure of this test.
5573 //
Bug9283()5574 void RegexTest::Bug9283() {
5575 #if !UCONFIG_NO_NORMALIZATION
5576     UErrorCode status = U_ZERO_ERROR;
5577     UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5578     REGEX_CHECK_STATUS;
5579     int32_t index;
5580     UChar32 c;
5581     for (index=0; ; index++) {
5582         c = supplementalsWithCaseFolding.charAt(index);
5583         if (c == -1) {
5584             break;
5585         }
5586         UnicodeString cf = UnicodeString(c).foldCase();
5587         REGEX_ASSERT(cf.length() >= 2);
5588     }
5589 #endif /* #if !UCONFIG_NO_NORMALIZATION */
5590 }
5591 
5592 
CheckInvBufSize()5593 void RegexTest::CheckInvBufSize() {
5594   if(inv_next>=INV_BUFSIZ) {
5595     errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5596           __FILE__, INV_BUFSIZ, inv_next);
5597   } else {
5598     logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5599   }
5600 }
5601 
5602 
Bug10459()5603 void RegexTest::Bug10459() {
5604     UErrorCode status = U_ZERO_ERROR;
5605     UnicodeString patternString("(txt)");
5606     UnicodeString txtString("txt");
5607 
5608     UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
5609     REGEX_CHECK_STATUS;
5610     UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
5611     REGEX_CHECK_STATUS;
5612 
5613     URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
5614     REGEX_CHECK_STATUS;
5615 
5616     uregex_setUText(icu_re, utext_txt, &status);
5617     REGEX_CHECK_STATUS;
5618 
5619     // The bug was that calling uregex_group() before doing a matching operation
5620     //   was causing a segfault. Only for Regular Expressions created from UText.
5621     //   It should set an U_REGEX_INVALID_STATE.
5622 
5623     UChar buf[100];
5624     int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
5625     REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
5626     REGEX_ASSERT(len == 0);
5627 
5628     uregex_close(icu_re);
5629     utext_close(utext_pat);
5630     utext_close(utext_txt);
5631 }
5632 
TestCaseInsensitiveStarters()5633 void RegexTest::TestCaseInsensitiveStarters() {
5634     // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5635     //  become stale because of new Unicode characters.
5636     // If it is stale, rerun the generation tool
5637     //    svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5638     // and replace the embedded data in i18n/regexcmp.cpp
5639 
5640     for (UChar32 cp=0; cp<=0x10ffff; cp++) {
5641         if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
5642             continue;
5643         }
5644         UnicodeSet s(cp, cp);
5645         s.closeOver(USET_CASE_INSENSITIVE);
5646         UnicodeSetIterator setIter(s);
5647         while (setIter.next()) {
5648             if (!setIter.isString()) {
5649                 continue;
5650             }
5651             const UnicodeString &str = setIter.getString();
5652             UChar32 firstChar = str.char32At(0);
5653             UnicodeSet starters;
5654             RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
5655             if (!starters.contains(cp)) {
5656                 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
5657                 return;
5658             }
5659         }
5660     }
5661 }
5662 
5663 
TestBug11049()5664 void RegexTest::TestBug11049() {
5665     // Original bug report: pattern with match start consisting of one of several individual characters,
5666     //  and the text being matched ending with a supplementary character. find() would read past the
5667     //  end of the input text when searching for potential match starting points.
5668 
5669     // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5670     // detect the bad read.
5671 
5672     TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
5673     TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
5674 
5675     // Test again with a pattern starting with a single character,
5676     // which takes a different code path than starting with an OR expression,
5677     // but with similar logic.
5678     TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
5679     TestCase11049("C", "string matches at end C", TRUE, __LINE__);
5680 }
5681 
5682 // Run a single test case from TestBug11049(). Internal function.
TestCase11049(const char * pattern,const char * data,UBool expectMatch,int32_t lineNumber)5683 void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
5684     UErrorCode status = U_ZERO_ERROR;
5685     UnicodeString patternString = UnicodeString(pattern).unescape();
5686     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5687 
5688     UnicodeString dataString = UnicodeString(data).unescape();
5689     UChar *exactBuffer = new UChar[dataString.length()];
5690     dataString.extract(exactBuffer, dataString.length(), status);
5691     UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
5692 
5693     LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
5694     REGEX_CHECK_STATUS;
5695     matcher->reset(ut);
5696     UBool result = matcher->find();
5697     if (result != expectMatch) {
5698         errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5699               __FILE__, lineNumber, expectMatch, result, pattern, data);
5700     }
5701 
5702     // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5703     //   off-by-one on find() with match at the last code point.
5704     //   Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5705     //   because string.unescape() will only shrink it.
5706     char * utf8Buffer = new char[uprv_strlen(data)+1];
5707     u_strToUTF8(utf8Buffer, static_cast<int32_t>(uprv_strlen(data)+1), NULL, dataString.getBuffer(), dataString.length(), &status);
5708     REGEX_CHECK_STATUS;
5709     ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
5710     REGEX_CHECK_STATUS;
5711     matcher->reset(ut);
5712     result = matcher->find();
5713     if (result != expectMatch) {
5714         errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5715               __FILE__, lineNumber, expectMatch, result, pattern, data);
5716     }
5717     delete [] utf8Buffer;
5718 
5719     utext_close(ut);
5720     delete [] exactBuffer;
5721 }
5722 
5723 
TestBug11371()5724 void RegexTest::TestBug11371() {
5725     if (quick) {
5726         logln("Skipping test. Runs in exhuastive mode only.");
5727         return;
5728     }
5729     UErrorCode status = U_ZERO_ERROR;
5730     UnicodeString patternString;
5731 
5732     for (int i=0; i<8000000; i++) {
5733         patternString.append(UnicodeString("()"));
5734     }
5735     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5736     if (status != U_REGEX_PATTERN_TOO_BIG) {
5737         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5738               __FILE__, __LINE__, u_errorName(status));
5739     }
5740 
5741     status = U_ZERO_ERROR;
5742     patternString = "(";
5743     for (int i=0; i<20000000; i++) {
5744         patternString.append(UnicodeString("A++"));
5745     }
5746     patternString.append(UnicodeString("){0}B++"));
5747     LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
5748     if (status != U_REGEX_PATTERN_TOO_BIG) {
5749         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5750               __FILE__, __LINE__, u_errorName(status));
5751     }
5752 
5753     // Pattern with too much string data, such that string indexes overflow operand data field size
5754     // in compiled instruction.
5755     status = U_ZERO_ERROR;
5756     patternString = "";
5757     while (patternString.length() < 0x00ffffff) {
5758         patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5759     }
5760     patternString.append(UnicodeString("X? trailing string"));
5761     LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
5762     if (status != U_REGEX_PATTERN_TOO_BIG) {
5763         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5764               __FILE__, __LINE__, u_errorName(status));
5765     }
5766 }
5767 
TestBug11480()5768 void RegexTest::TestBug11480() {
5769     // C API, get capture group of a group that does not participate in the match.
5770     //        (Returns a zero length string, with nul termination,
5771     //         indistinguishable from a group with a zero length match.)
5772 
5773     UErrorCode status = U_ZERO_ERROR;
5774     URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
5775     REGEX_CHECK_STATUS;
5776     UnicodeString text = UNICODE_STRING_SIMPLE("A");
5777     uregex_setText(re, text.getBuffer(), text.length(), &status);
5778     REGEX_CHECK_STATUS;
5779     REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
5780     UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
5781     int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
5782     REGEX_ASSERT(length == 0);
5783     REGEX_ASSERT(buf[0] == 13);
5784     REGEX_ASSERT(buf[1] == 0);
5785     REGEX_ASSERT(buf[2] == 13);
5786     uregex_close(re);
5787 
5788     // UText C++ API, length of match is 0 for non-participating matches.
5789     UText ut = UTEXT_INITIALIZER;
5790     utext_openUnicodeString(&ut, &text, &status);
5791     RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status);
5792     REGEX_CHECK_STATUS;
5793     matcher.reset(&ut);
5794     REGEX_ASSERT(matcher.lookingAt(0, status));
5795 
5796     // UText C++ API, Capture group 1 matches "A", position 0, length 1.
5797     int64_t groupLen = -666;
5798     UText group = UTEXT_INITIALIZER;
5799     matcher.group(1, &group, groupLen, status);
5800     REGEX_CHECK_STATUS;
5801     REGEX_ASSERT(groupLen == 1);
5802     REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
5803 
5804     // Capture group 2, the (B), does not participate in the match.
5805     matcher.group(2, &group, groupLen, status);
5806     REGEX_CHECK_STATUS;
5807     REGEX_ASSERT(groupLen == 0);
5808     REGEX_ASSERT(matcher.start(2, status) == -1);
5809     REGEX_CHECK_STATUS;
5810 }
5811 
TestBug12884()5812 void RegexTest::TestBug12884() {
5813     // setTimeLimit() was not effective for empty sub-patterns with large {minimum counts}
5814     UnicodeString pattern(u"(((((((){120}){11}){11}){11}){80}){11}){4}");
5815     UnicodeString text(u"hello");
5816     UErrorCode status = U_ZERO_ERROR;
5817     RegexMatcher m(pattern, text, 0, status);
5818     REGEX_CHECK_STATUS;
5819     m.setTimeLimit(5, status);
5820     m.find(status);
5821     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5822 
5823     // Non-greedy loops. They take a different code path during matching.
5824     UnicodeString ngPattern(u"(((((((){120}?){11}?){11}?){11}?){80}?){11}?){4}?");
5825     status = U_ZERO_ERROR;
5826     RegexMatcher ngM(ngPattern, text, 0, status);
5827     REGEX_CHECK_STATUS;
5828     ngM.setTimeLimit(5, status);
5829     ngM.find(status);
5830     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5831 
5832     // UText, wrapping non-UTF-16 text, also takes a different execution path.
5833     const char *text8 = reinterpret_cast<const char*>(u8"¿Qué es Unicode?  Unicode proporciona un número único para cada"
5834                           "carácter, sin importar la plataforma, sin importar el programa,"
5835                           "sin importar el idioma.");
5836     status = U_ZERO_ERROR;
5837     LocalUTextPointer ut(utext_openUTF8(NULL, text8, -1, &status));
5838     REGEX_CHECK_STATUS;
5839     m.reset(ut.getAlias());
5840     m.find(status);
5841     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5842 
5843     status = U_ZERO_ERROR;
5844     ngM.reset(ut.getAlias());
5845     ngM.find(status);
5846     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5847 }
5848 
5849 // Bug 13631. A find() of a pattern with a zero length look-behind assertions
5850 //            can cause a read past the end of the input text.
5851 //            The failure is seen when running this test with Clang's Addresss Sanitizer.
5852 
TestBug13631()5853 void RegexTest::TestBug13631() {
5854     const UChar *pats[] = { u"(?<!^)",
5855                             u"(?<=^)",
5856                             nullptr
5857                           };
5858     for (const UChar **pat=pats; *pat; ++pat) {
5859         UErrorCode status = U_ZERO_ERROR;
5860         UnicodeString upat(*pat);
5861         RegexMatcher matcher(upat, 0, status);
5862         const UChar s =u'a';
5863         UText *ut = utext_openUChars(nullptr, &s, 1, &status);
5864         REGEX_CHECK_STATUS;
5865         matcher.reset(ut);
5866         while (matcher.find()) {
5867         }
5868         utext_close(ut);
5869     }
5870 }
5871 
5872 // Bug 13632 Out of bounds memory reference if a replacement string ends with a '$',
5873 //           where a following group specification would be expected.
5874 //           Failure shows when running the test under Clang's Address Sanitizer.
5875 
TestBug13632()5876 void RegexTest::TestBug13632() {
5877     UErrorCode status = U_ZERO_ERROR;
5878     URegularExpression *re = uregex_openC(" ", 0, nullptr, &status);
5879     const char16_t *sourceString = u"Hello, world.";
5880     uregex_setText(re, sourceString, u_strlen(sourceString), &status);
5881 
5882     const int32_t destCap = 20;
5883     char16_t dest[destCap] = {};
5884     const char16_t replacement[] = {u'x', u'$'};    // Not nul terminated string.
5885     uregex_replaceAll(re, replacement, 2, dest, destCap, &status);
5886 
5887     assertEquals("", U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
5888     uregex_close(re);
5889 }
5890 
TestBug20359()5891 void RegexTest::TestBug20359() {
5892     // The bug was stack overflow while parsing a pattern with a huge number of adjacent \Q\E
5893     // pairs. (Enter and exit pattern literal quote mode). Logic was correct.
5894     // Changed implementation to loop instead of recursing.
5895 
5896     UnicodeString pattern;
5897     for (int i=0; i<50000; ++i) {
5898         pattern += u"\\Q\\E";
5899     }
5900     pattern += u"x";
5901 
5902     UErrorCode status = U_ZERO_ERROR;
5903     LocalURegularExpressionPointer re(uregex_open(pattern.getBuffer(), pattern.length(),
5904                                        0, nullptr, &status));
5905     assertSuccess(WHERE, status);
5906 
5907     // We have passed the point where the bug crashed. The following is a small sanity
5908     // check that the pattern works, that all the \Q\E\Q\E... didn't cause other problems.
5909 
5910     uregex_setText(re.getAlias(), u"abcxyz", -1, &status);
5911     assertSuccess(WHERE, status);
5912     assertTrue(WHERE, uregex_find(re.getAlias(), 0, &status));
5913     assertEquals(WHERE, 3, uregex_start(re.getAlias(), 0, &status));
5914     assertSuccess(WHERE, status);
5915 }
5916 
5917 
TestBug20863()5918 void RegexTest::TestBug20863() {
5919     // Test that patterns with a large number of named capture groups work correctly.
5920     //
5921     // The ticket was not for a bug per se, but to reduce memory usage by using lazy
5922     // construction of the map from capture names to numbers, and decreasing the
5923     // default size of the map.
5924 
5925     constexpr int GROUP_COUNT = 2000;
5926     std::vector<UnicodeString> groupNames;
5927     for (int32_t i=0; i<GROUP_COUNT; ++i) {
5928         UnicodeString name;
5929         name.append(u"name");
5930         name.append(Int64ToUnicodeString(i));
5931         groupNames.push_back(name);
5932     }
5933 
5934     UnicodeString patternString;
5935     for (UnicodeString name: groupNames) {
5936         patternString.append(u"(?<");
5937         patternString.append(name);
5938         patternString.append(u">.)");
5939     }
5940 
5941     UErrorCode status = U_ZERO_ERROR;
5942     UParseError pe;
5943     LocalPointer<RegexPattern> pattern(RegexPattern::compile(patternString, pe, status), status);
5944     if (!assertSuccess(WHERE, status)) {
5945         return;
5946     }
5947 
5948     for (int32_t i=0; i<GROUP_COUNT; ++i) {
5949         int32_t group = pattern->groupNumberFromName(groupNames[i], status);
5950         if (!assertSuccess(WHERE, status)) {
5951             return;
5952         }
5953         assertEquals(WHERE, i+1, group);
5954         // Note: group 0 is the overall match; group 1 is the first separate capture group.
5955     }
5956 
5957     // Verify that assignment of patterns with various combinations of named capture work.
5958     // Lazy creation of the internal named capture map changed the implementation logic here.
5959     {
5960         LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"abc", pe, status), status);
5961         LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name>b)c", pe, status), status);
5962         assertSuccess(WHERE, status);
5963         assertFalse(WHERE, *pat1 == *pat2);
5964         *pat1 = *pat2;
5965         assertTrue(WHERE, *pat1 == *pat2);
5966         assertEquals(WHERE, 1, pat1->groupNumberFromName(u"name", status));
5967         assertEquals(WHERE, 1, pat2->groupNumberFromName(u"name", status));
5968         assertSuccess(WHERE, status);
5969     }
5970 
5971     {
5972         LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"abc", pe, status), status);
5973         LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name>b)c", pe, status), status);
5974         assertSuccess(WHERE, status);
5975         assertFalse(WHERE, *pat1 == *pat2);
5976         *pat2 = *pat1;
5977         assertTrue(WHERE, *pat1 == *pat2);
5978         assertEquals(WHERE, 0, pat1->groupNumberFromName(u"name", status));
5979         assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
5980         status = U_ZERO_ERROR;
5981         assertEquals(WHERE, 0, pat2->groupNumberFromName(u"name", status));
5982         assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
5983         status = U_ZERO_ERROR;
5984     }
5985 
5986     {
5987         LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"a(?<name1>b)c", pe, status), status);
5988         LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name2>b)c", pe, status), status);
5989         assertSuccess(WHERE, status);
5990         assertFalse(WHERE, *pat1 == *pat2);
5991         *pat2 = *pat1;
5992         assertTrue(WHERE, *pat1 == *pat2);
5993         assertEquals(WHERE, 1, pat1->groupNumberFromName(u"name1", status));
5994         assertSuccess(WHERE, status);
5995         assertEquals(WHERE, 1, pat2->groupNumberFromName(u"name1", status));
5996         assertSuccess(WHERE, status);
5997         assertEquals(WHERE, 0, pat1->groupNumberFromName(u"name2", status));
5998         assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
5999         status = U_ZERO_ERROR;
6000         assertEquals(WHERE, 0, pat2->groupNumberFromName(u"name2", status));
6001         assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
6002         status = U_ZERO_ERROR;
6003     }
6004 
6005 }
6006 
6007 
6008 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
6009