1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 2002-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8
9 //
10 // regextst.cpp
11 //
12 // ICU Regular Expressions test, part of intltest.
13 //
14
15 /*
16 NOTE!!
17
18 PLEASE be careful about ASCII assumptions in this test.
19 This test is one of the worst repeat offenders.
20 If you have questions, contact someone on the ICU PMC
21 who has access to an EBCDIC system.
22
23 */
24
25 #include "intltest.h"
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
27
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <string.h>
31
32 #include "unicode/localpointer.h"
33 #include "unicode/regex.h"
34 #include "unicode/uchar.h"
35 #include "unicode/ucnv.h"
36 #include "unicode/uniset.h"
37 #include "unicode/uregex.h"
38 #include "unicode/usetiter.h"
39 #include "unicode/ustring.h"
40 #include "unicode/utext.h"
41 #include "unicode/utf16.h"
42 #include "cstr.h"
43 #include "regextst.h"
44 #include "regexcmp.h"
45 #include "uvector.h"
46 #include "util.h"
47 #include "cmemory.h"
48 #include "cstring.h"
49 #include "uinvchar.h"
50
51 #define SUPPORT_MUTATING_INPUT_STRING 0
52
53 //---------------------------------------------------------------------------
54 //
55 // Test class boilerplate
56 //
57 //---------------------------------------------------------------------------
RegexTest()58 RegexTest::RegexTest()
59 {
60 }
61
62
~RegexTest()63 RegexTest::~RegexTest()
64 {
65 }
66
67
68
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)69 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
70 {
71 if (exec) logln("TestSuite RegexTest: ");
72 TESTCASE_AUTO_BEGIN;
73 TESTCASE_AUTO(Basic);
74 TESTCASE_AUTO(API_Match);
75 TESTCASE_AUTO(API_Replace);
76 TESTCASE_AUTO(API_Pattern);
77 #if !UCONFIG_NO_FILE_IO
78 TESTCASE_AUTO(Extended);
79 #endif
80 TESTCASE_AUTO(Errors);
81 TESTCASE_AUTO(PerlTests);
82 TESTCASE_AUTO(Callbacks);
83 TESTCASE_AUTO(FindProgressCallbacks);
84 TESTCASE_AUTO(Bug6149);
85 TESTCASE_AUTO(UTextBasic);
86 TESTCASE_AUTO(API_Match_UTF8);
87 TESTCASE_AUTO(API_Replace_UTF8);
88 TESTCASE_AUTO(API_Pattern_UTF8);
89 TESTCASE_AUTO(PerlTestsUTF8);
90 TESTCASE_AUTO(PreAllocatedUTextCAPI);
91 TESTCASE_AUTO(Bug7651);
92 TESTCASE_AUTO(Bug7740);
93 TESTCASE_AUTO(Bug8479);
94 TESTCASE_AUTO(Bug7029);
95 TESTCASE_AUTO(CheckInvBufSize);
96 TESTCASE_AUTO(Bug9283);
97 TESTCASE_AUTO(Bug10459);
98 TESTCASE_AUTO(TestCaseInsensitiveStarters);
99 TESTCASE_AUTO(TestBug11049);
100 TESTCASE_AUTO(TestBug11371);
101 TESTCASE_AUTO(TestBug11480);
102 TESTCASE_AUTO(NamedCapture);
103 TESTCASE_AUTO(NamedCaptureLimits);
104 TESTCASE_AUTO(TestBug12884);
105 TESTCASE_AUTO(TestBug13631);
106 TESTCASE_AUTO_END;
107 }
108
109
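/**
 * Opens a UTF-8 UText over text made up of invariant characters.
 * On ASCII platforms the text is handed to utext_openUTF8 unchanged; on other
 * charset families (e.g. EBCDIC) it is first converted from the compilation
 * codepage into ASCII.
 * @see utext_openUTF8
 */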
115 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
116
117 //---------------------------------------------------------------------------
118 //
119 // Error Checking / Reporting macros used in all of the tests.
120 //
121 //---------------------------------------------------------------------------
122
utextToPrintable(char * buf,int32_t bufLen,UText * text)123 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
124 int64_t oldIndex = utext_getNativeIndex(text);
125 utext_setNativeIndex(text, 0);
126 char *bufPtr = buf;
127 UChar32 c = utext_next32From(text, 0);
128 while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
129 if (0x000020<=c && c<0x00007e) {
130 *bufPtr = c;
131 } else {
132 #if 0
133 sprintf(bufPtr,"U+%04X", c);
134 bufPtr+= strlen(bufPtr)-1;
135 #else
136 *bufPtr = '%';
137 #endif
138 }
139 bufPtr++;
140 c = UTEXT_NEXT32(text);
141 }
142 *bufPtr = 0;
143 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
144 char *ebuf = (char*)malloc(bufLen);
145 uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
146 uprv_strncpy(buf, ebuf, bufLen);
147 free((void*)ebuf);
148 #endif
149 utext_setNativeIndex(text, oldIndex);
150 }
151
152
153 static char ASSERT_BUF[1024];
154
extractToAssertBuf(const UnicodeString & message)155 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
156 if(message.length()==0) {
157 strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
158 } else {
159 UnicodeString buf;
160 IntlTest::prettify(message,buf);
161 if(buf.length()==0) {
162 strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
163 } else {
164 buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
165 if(ASSERT_BUF[0]==0) {
166 ASSERT_BUF[0]=0;
167 for(int32_t i=0;i<buf.length();i++) {
168 UChar ch = buf[i];
169 sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
170 }
171 }
172 }
173 }
174 ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
175 return ASSERT_BUF;
176 }
177
// Logs the printable form of a UText (via utextToPrintable, truncated to a
// 200-byte local buffer) together with the source location and the spelling of
// the argument expression.
#define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,UPRV_LENGTHOF(buf),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}

// If the in-scope variable `status` holds a failure code, reports it with
// dataerrln() and RETURNS from the enclosing function. Only usable in
// functions returning void.
#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure. status=%s", \
 __FILE__, __LINE__, u_errorName(status)); return;}}

// Reports an error if expr evaluates to FALSE. Does not return; the test
// continues after a failed assertion.
#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}

// Evaluates expr and checks that it leaves `status` equal to errcode.
// The macro declares its own local `status`, shadowing any variable of that
// name in the enclosing scope, so expr must refer to `status` by name and the
// caller's own status variable is left untouched.
#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \
 __LINE__, u_errorName(errcode), u_errorName(status));};}

// Variant of REGEX_CHECK_STATUS for helpers invoked from many call sites:
// `line` is the caller's line number. Unlike REGEX_CHECK_STATUS this does NOT
// return, and it prints the numeric status value.
#define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
 "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }}

// Variant of REGEX_ASSERT that also reports the caller's line number.
// Unlike REGEX_ASSERT this RETURNS from the enclosing (void) function on failure.
#define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
 errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}

// Compares an invariant-character C string against a UnicodeString and reports
// an error when they differ.
// expected: const char * , restricted to invariant characters.
// actual: const UnicodeString &
#define REGEX_ASSERT_UNISTR(expected, actual) { \
 if (UnicodeString(expected, -1, US_INV) != (actual)) { \
 errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \
 __FILE__, __LINE__, expected, extractToAssertBuf(actual));};}
201
202
testUTextEqual(UText * uta,UText * utb)203 static UBool testUTextEqual(UText *uta, UText *utb) {
204 UChar32 ca = 0;
205 UChar32 cb = 0;
206 utext_setNativeIndex(uta, 0);
207 utext_setNativeIndex(utb, 0);
208 do {
209 ca = utext_next32(uta);
210 cb = utext_next32(utb);
211 if (ca != cb) {
212 break;
213 }
214 } while (ca != U_SENTINEL);
215 return ca == cb;
216 }
217
218
219 /**
220 * @param expected expected text in UTF-8 (not platform) codepage
221 */
assertUText(const char * expected,UText * actual,const char * file,int line)222 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
223 UErrorCode status = U_ZERO_ERROR;
224 UText expectedText = UTEXT_INITIALIZER;
225 utext_openUTF8(&expectedText, expected, -1, &status);
226 if(U_FAILURE(status)) {
227 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
228 return;
229 }
230 if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
231 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
232 return;
233 }
234 utext_setNativeIndex(actual, 0);
235 if (!testUTextEqual(&expectedText, actual)) {
236 char buf[201 /*21*/];
237 char expectedBuf[201];
238 utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
239 utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
240 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
241 }
242 utext_close(&expectedText);
243 }
244 /**
245 * @param expected invariant (platform local text) input
246 */
247
assertUTextInvariant(const char * expected,UText * actual,const char * file,int line)248 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
249 UErrorCode status = U_ZERO_ERROR;
250 UText expectedText = UTEXT_INITIALIZER;
251 regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
252 if(U_FAILURE(status)) {
253 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
254 return;
255 }
256 utext_setNativeIndex(actual, 0);
257 if (!testUTextEqual(&expectedText, actual)) {
258 char buf[201 /*21*/];
259 char expectedBuf[201];
260 utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
261 utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
262 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
263 }
264 utext_close(&expectedText);
265 }
266
/**
 * Asserts that the UText `actual` holds the text `expected`.
 * Assumes utf-8 input: `expected` is a UTF-8 encoded C string.
 * Forwards to assertUText() with the caller's file and line.
 */
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
/**
 * Asserts that the UText `actual` holds the text `expected`.
 * Assumes Invariant input: `expected` is a platform-codepage C string
 * restricted to invariant characters.
 * Forwards to assertUTextInvariant() with the caller's file and line.
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
275
276 /**
277 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
278 * passed into utext_openUTF8. An error will be given if
279 * INV_BUFSIZ is too small. It's only used on EBCDIC systems.
280 */
281
282 #define INV_BUFSIZ 2048 /* increase this if too small */
283
284 static int64_t inv_next=0;
285
286 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
287 static char inv_buf[INV_BUFSIZ];
288 #endif
289
regextst_openUTF8FromInvariant(UText * ut,const char * inv,int64_t length,UErrorCode * status)290 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
291 if(length==-1) length=strlen(inv);
292 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
293 inv_next+=length;
294 return utext_openUTF8(ut, inv, length, status);
295 #else
296 if(inv_next+length+1>INV_BUFSIZ) {
297 fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
298 __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
299 *status = U_MEMORY_ALLOCATION_ERROR;
300 return NULL;
301 }
302
303 unsigned char *buf = (unsigned char*)inv_buf+inv_next;
304 uprv_aestrncpy(buf, (const uint8_t*)inv, length);
305 inv_next+=length;
306
307 #if 0
308 fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
309 #endif
310
311 return utext_openUTF8(ut, (const char*)buf, length, status);
312 #endif
313 }
314
315
//---------------------------------------------------------------------------
//
// REGEX_TESTLM Macro + invocation function to simplify writing quick tests
// for the LookingAt() and Match() functions.
//
// usage:
// REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
//
// The expected results are UBool - TRUE or FALSE.
// The input text is unescaped. The pattern is not.
//
// Each invocation runs the check twice: once through doRegexLMTest() and
// once through doRegexLMTestUTF8(). Both receive the line number of the
// invocation for use in failure messages.
//
//---------------------------------------------------------------------------

#define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
331
doRegexLMTest(const char * pat,const char * text,UBool looking,UBool match,int32_t line)332 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
333 const UnicodeString pattern(pat, -1, US_INV);
334 const UnicodeString inputText(text, -1, US_INV);
335 UErrorCode status = U_ZERO_ERROR;
336 UParseError pe;
337 RegexPattern *REPattern = NULL;
338 RegexMatcher *REMatcher = NULL;
339 UBool retVal = TRUE;
340
341 UnicodeString patString(pat, -1, US_INV);
342 REPattern = RegexPattern::compile(patString, 0, pe, status);
343 if (U_FAILURE(status)) {
344 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
345 line, u_errorName(status));
346 return FALSE;
347 }
348 if (line==376) { REPattern->dumpPattern();}
349
350 UnicodeString inputString(inputText);
351 UnicodeString unEscapedInput = inputString.unescape();
352 REMatcher = REPattern->matcher(unEscapedInput, status);
353 if (U_FAILURE(status)) {
354 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
355 line, u_errorName(status));
356 return FALSE;
357 }
358
359 UBool actualmatch;
360 actualmatch = REMatcher->lookingAt(status);
361 if (U_FAILURE(status)) {
362 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
363 line, u_errorName(status));
364 retVal = FALSE;
365 }
366 if (actualmatch != looking) {
367 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
368 retVal = FALSE;
369 }
370
371 status = U_ZERO_ERROR;
372 actualmatch = REMatcher->matches(status);
373 if (U_FAILURE(status)) {
374 errln("RegexTest failure in matches() at line %d. Status = %s\n",
375 line, u_errorName(status));
376 retVal = FALSE;
377 }
378 if (actualmatch != match) {
379 errln("RegexTest: wrong return from matches() at line %d.\n", line);
380 retVal = FALSE;
381 }
382
383 if (retVal == FALSE) {
384 REPattern->dumpPattern();
385 }
386
387 delete REPattern;
388 delete REMatcher;
389 return retVal;
390 }
391
392
doRegexLMTestUTF8(const char * pat,const char * text,UBool looking,UBool match,int32_t line)393 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
394 UText pattern = UTEXT_INITIALIZER;
395 int32_t inputUTF8Length;
396 char *textChars = NULL;
397 UText inputText = UTEXT_INITIALIZER;
398 UErrorCode status = U_ZERO_ERROR;
399 UParseError pe;
400 RegexPattern *REPattern = NULL;
401 RegexMatcher *REMatcher = NULL;
402 UBool retVal = TRUE;
403
404 regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
405 REPattern = RegexPattern::compile(&pattern, 0, pe, status);
406 if (U_FAILURE(status)) {
407 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
408 line, u_errorName(status));
409 return FALSE;
410 }
411
412 UnicodeString inputString(text, -1, US_INV);
413 UnicodeString unEscapedInput = inputString.unescape();
414 LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
415 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
416
417 inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
418 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
419 // UTF-8 does not allow unpaired surrogates, so this could actually happen
420 logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line, u_errorName(status));
421 return TRUE; // not a failure of the Regex engine
422 }
423 status = U_ZERO_ERROR; // buffer overflow
424 textChars = new char[inputUTF8Length+1];
425 unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
426 utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
427
428 REMatcher = &REPattern->matcher(status)->reset(&inputText);
429 if (U_FAILURE(status)) {
430 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
431 line, u_errorName(status));
432 return FALSE;
433 }
434
435 UBool actualmatch;
436 actualmatch = REMatcher->lookingAt(status);
437 if (U_FAILURE(status)) {
438 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
439 line, u_errorName(status));
440 retVal = FALSE;
441 }
442 if (actualmatch != looking) {
443 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
444 retVal = FALSE;
445 }
446
447 status = U_ZERO_ERROR;
448 actualmatch = REMatcher->matches(status);
449 if (U_FAILURE(status)) {
450 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
451 line, u_errorName(status));
452 retVal = FALSE;
453 }
454 if (actualmatch != match) {
455 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
456 retVal = FALSE;
457 }
458
459 if (retVal == FALSE) {
460 REPattern->dumpPattern();
461 }
462
463 delete REPattern;
464 delete REMatcher;
465 utext_close(&inputText);
466 utext_close(&pattern);
467 delete[] textChars;
468 return retVal;
469 }
470
471
472
//---------------------------------------------------------------------------
//
// REGEX_ERR Macro + invocation function to simplify writing tests
// regex tests for incorrect patterns
//
// usage:
// REGEX_ERR("pattern", expected error line, column, expected status);
//
// The macro forwards to regex_err(), adding the line number of the
// invocation. Note that the expansion already ends in a semicolon.
//
//---------------------------------------------------------------------------
#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
483
regex_err(const char * pat,int32_t errLine,int32_t errCol,UErrorCode expectedStatus,int32_t line)484 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
485 UErrorCode expectedStatus, int32_t line) {
486 UnicodeString pattern(pat);
487
488 UErrorCode status = U_ZERO_ERROR;
489 UParseError pe;
490 RegexPattern *callerPattern = NULL;
491
492 //
493 // Compile the caller's pattern
494 //
495 UnicodeString patString(pat);
496 callerPattern = RegexPattern::compile(patString, 0, pe, status);
497 if (status != expectedStatus) {
498 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
499 } else {
500 if (status != U_ZERO_ERROR) {
501 if (pe.line != errLine || pe.offset != errCol) {
502 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
503 line, errLine, errCol, pe.line, pe.offset);
504 }
505 }
506 }
507
508 delete callerPattern;
509
510 //
511 // Compile again, using a UTF-8-based UText
512 //
513 UText patternText = UTEXT_INITIALIZER;
514 regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
515 callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
516 if (status != expectedStatus) {
517 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
518 } else {
519 if (status != U_ZERO_ERROR) {
520 if (pe.line != errLine || pe.offset != errCol) {
521 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
522 line, errLine, errCol, pe.line, pe.offset);
523 }
524 }
525 }
526
527 delete callerPattern;
528 utext_close(&patternText);
529 }
530
531
532
//---------------------------------------------------------------------------
//
// Basic Check for basic functionality of regex pattern matching.
// Avoid the use of REGEX_FIND test macro, which has
// substantial dependencies on basic Regex functionality.
//
// Every case below is a REGEX_TESTLM(pattern, input, lookingAt, matches)
// invocation: the third argument is the expected result of lookingAt()
// and the fourth the expected result of matches(). The input text is
// unescaped before matching; the pattern is passed through unchanged.
//
//---------------------------------------------------------------------------
void RegexTest::Basic() {


    //
    // Debug - slide failing test cases early
    //
    // This section is compiled out. It is a scratch area for running a single
    // case in isolation; note that it ends with exit(1) when enabled.
    //
#if 0
    {
        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
        UParseError pe;
        UErrorCode status = U_ZERO_ERROR;
        RegexPattern *pattern;
        pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
        pattern->dumpPattern();
        RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
        UBool result = m->find();
        printf("result = %d\n", result);
        // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    }
    exit(1);
#endif


    //
    // Pattern with parentheses
    //
    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
    REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);

    //
    // Patterns with *
    //
    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);

    REGEX_TESTLM("a*", "", TRUE, TRUE);
    REGEX_TESTLM("a*", "b", TRUE, FALSE);


    //
    // Patterns with "."
    //
    REGEX_TESTLM(".", "abc", TRUE, FALSE);
    REGEX_TESTLM("...", "abc", TRUE, TRUE);
    REGEX_TESTLM("....", "abc", FALSE, FALSE);
    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);

    //
    // Patterns with * applied to chars at end of literal string
    //
    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);

    //
    // Supplemental chars match as single chars, not a pair of surrogates.
    //
    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);


    //
    // UnicodeSets in the pattern
    //
    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);

    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurrences.
    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);

    //
    // OR operator in patterns
    //
    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
    REGEX_TESTLM("a|b", "b", TRUE, TRUE);

    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);

    //
    // +
    //
    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
    REGEX_TESTLM("b+", "", FALSE, FALSE);
    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);

    //
    // ?
    //
    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);

    //
    // Escape sequences that become single literal chars, handled internally
    // by ICU's Unescape.
    //

    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);

    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input

    // Escape of special chars in patterns
    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
}
686
687
688 //---------------------------------------------------------------------------
689 //
690 // UTextBasic Check for quirks that are specific to the UText
691 // implementation.
692 //
693 //---------------------------------------------------------------------------
UTextBasic()694 void RegexTest::UTextBasic() {
695 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
696 UErrorCode status = U_ZERO_ERROR;
697 UText pattern = UTEXT_INITIALIZER;
698 utext_openUTF8(&pattern, str_abc, -1, &status);
699 RegexMatcher matcher(&pattern, 0, status);
700 REGEX_CHECK_STATUS;
701
702 UText input = UTEXT_INITIALIZER;
703 utext_openUTF8(&input, str_abc, -1, &status);
704 REGEX_CHECK_STATUS;
705 matcher.reset(&input);
706 REGEX_CHECK_STATUS;
707 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
708
709 matcher.reset(matcher.inputText());
710 REGEX_CHECK_STATUS;
711 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
712
713 utext_close(&pattern);
714 utext_close(&input);
715 }
716
717
718 //---------------------------------------------------------------------------
719 //
720 // API_Match Test that the API for class RegexMatcher
721 // is present and nominally working, but excluding functions
722 // implementing replace operations.
723 //
724 //---------------------------------------------------------------------------
API_Match()725 void RegexTest::API_Match() {
726 UParseError pe;
727 UErrorCode status=U_ZERO_ERROR;
728 int32_t flags = 0;
729
730 //
731 // Debug - slide failing test cases early
732 //
733 #if 0
734 {
735 }
736 return;
737 #endif
738
739 //
740 // Simple pattern compilation
741 //
742 {
743 UnicodeString re("abc");
744 RegexPattern *pat2;
745 pat2 = RegexPattern::compile(re, flags, pe, status);
746 REGEX_CHECK_STATUS;
747
748 UnicodeString inStr1 = "abcdef this is a test";
749 UnicodeString instr2 = "not abc";
750 UnicodeString empty = "";
751
752
753 //
754 // Matcher creation and reset.
755 //
756 RegexMatcher *m1 = pat2->matcher(inStr1, status);
757 REGEX_CHECK_STATUS;
758 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
759 REGEX_ASSERT(m1->input() == inStr1);
760 m1->reset(instr2);
761 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
762 REGEX_ASSERT(m1->input() == instr2);
763 m1->reset(inStr1);
764 REGEX_ASSERT(m1->input() == inStr1);
765 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
766 m1->reset(empty);
767 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
768 REGEX_ASSERT(m1->input() == empty);
769 REGEX_ASSERT(&m1->pattern() == pat2);
770
771 //
772 // reset(pos, status)
773 //
774 m1->reset(inStr1);
775 m1->reset(4, status);
776 REGEX_CHECK_STATUS;
777 REGEX_ASSERT(m1->input() == inStr1);
778 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
779
780 m1->reset(-1, status);
781 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
782 status = U_ZERO_ERROR;
783
784 m1->reset(0, status);
785 REGEX_CHECK_STATUS;
786 status = U_ZERO_ERROR;
787
788 int32_t len = m1->input().length();
789 m1->reset(len-1, status);
790 REGEX_CHECK_STATUS;
791 status = U_ZERO_ERROR;
792
793 m1->reset(len, status);
794 REGEX_CHECK_STATUS;
795 status = U_ZERO_ERROR;
796
797 m1->reset(len+1, status);
798 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
799 status = U_ZERO_ERROR;
800
801 //
802 // match(pos, status)
803 //
804 m1->reset(instr2);
805 REGEX_ASSERT(m1->matches(4, status) == TRUE);
806 m1->reset();
807 REGEX_ASSERT(m1->matches(3, status) == FALSE);
808 m1->reset();
809 REGEX_ASSERT(m1->matches(5, status) == FALSE);
810 REGEX_ASSERT(m1->matches(4, status) == TRUE);
811 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
812 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
813
814 // Match() at end of string should fail, but should not
815 // be an error.
816 status = U_ZERO_ERROR;
817 len = m1->input().length();
818 REGEX_ASSERT(m1->matches(len, status) == FALSE);
819 REGEX_CHECK_STATUS;
820
821 // Match beyond end of string should fail with an error.
822 status = U_ZERO_ERROR;
823 REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
824 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
825
826 // Successful match at end of string.
827 {
828 status = U_ZERO_ERROR;
829 RegexMatcher m("A?", 0, status); // will match zero length string.
830 REGEX_CHECK_STATUS;
831 m.reset(inStr1);
832 len = inStr1.length();
833 REGEX_ASSERT(m.matches(len, status) == TRUE);
834 REGEX_CHECK_STATUS;
835 m.reset(empty);
836 REGEX_ASSERT(m.matches(0, status) == TRUE);
837 REGEX_CHECK_STATUS;
838 }
839
840
841 //
842 // lookingAt(pos, status)
843 //
844 status = U_ZERO_ERROR;
845 m1->reset(instr2); // "not abc"
846 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
847 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
848 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
849 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
850 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
851 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
852 status = U_ZERO_ERROR;
853 len = m1->input().length();
854 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
855 REGEX_CHECK_STATUS;
856 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
857 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
858
859 delete m1;
860 delete pat2;
861 }
862
863
864 //
865 // Capture Group.
866 // RegexMatcher::start();
867 // RegexMatcher::end();
868 // RegexMatcher::groupCount();
869 //
870 {
871 int32_t flags=0;
872 UParseError pe;
873 UErrorCode status=U_ZERO_ERROR;
874
875 UnicodeString re("01(23(45)67)(.*)");
876 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
877 REGEX_CHECK_STATUS;
878 UnicodeString data = "0123456789";
879
880 RegexMatcher *matcher = pat->matcher(data, status);
881 REGEX_CHECK_STATUS;
882 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
883 static const int32_t matchStarts[] = {0, 2, 4, 8};
884 static const int32_t matchEnds[] = {10, 8, 6, 10};
885 int32_t i;
886 for (i=0; i<4; i++) {
887 int32_t actualStart = matcher->start(i, status);
888 REGEX_CHECK_STATUS;
889 if (actualStart != matchStarts[i]) {
890 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
891 __LINE__, i, matchStarts[i], actualStart);
892 }
893 int32_t actualEnd = matcher->end(i, status);
894 REGEX_CHECK_STATUS;
895 if (actualEnd != matchEnds[i]) {
896 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
897 __LINE__, i, matchEnds[i], actualEnd);
898 }
899 }
900
901 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
902 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
903
904 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
905 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
906 matcher->reset();
907 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
908
909 matcher->lookingAt(status);
910 REGEX_ASSERT(matcher->group(status) == "0123456789");
911 REGEX_ASSERT(matcher->group(0, status) == "0123456789");
912 REGEX_ASSERT(matcher->group(1, status) == "234567" );
913 REGEX_ASSERT(matcher->group(2, status) == "45" );
914 REGEX_ASSERT(matcher->group(3, status) == "89" );
915 REGEX_CHECK_STATUS;
916 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
917 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
918 matcher->reset();
919 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
920
921 delete matcher;
922 delete pat;
923
924 }
925
926 //
927 // find
928 //
929 {
930 int32_t flags=0;
931 UParseError pe;
932 UErrorCode status=U_ZERO_ERROR;
933
934 UnicodeString re("abc");
935 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
936 REGEX_CHECK_STATUS;
937 UnicodeString data = ".abc..abc...abc..";
938 // 012345678901234567
939
940 RegexMatcher *matcher = pat->matcher(data, status);
941 REGEX_CHECK_STATUS;
942 REGEX_ASSERT(matcher->find());
943 REGEX_ASSERT(matcher->start(status) == 1);
944 REGEX_ASSERT(matcher->find());
945 REGEX_ASSERT(matcher->start(status) == 6);
946 REGEX_ASSERT(matcher->find());
947 REGEX_ASSERT(matcher->start(status) == 12);
948 REGEX_ASSERT(matcher->find() == FALSE);
949 REGEX_ASSERT(matcher->find() == FALSE);
950
951 matcher->reset();
952 REGEX_ASSERT(matcher->find());
953 REGEX_ASSERT(matcher->start(status) == 1);
954
955 REGEX_ASSERT(matcher->find(0, status));
956 REGEX_ASSERT(matcher->start(status) == 1);
957 REGEX_ASSERT(matcher->find(1, status));
958 REGEX_ASSERT(matcher->start(status) == 1);
959 REGEX_ASSERT(matcher->find(2, status));
960 REGEX_ASSERT(matcher->start(status) == 6);
961 REGEX_ASSERT(matcher->find(12, status));
962 REGEX_ASSERT(matcher->start(status) == 12);
963 REGEX_ASSERT(matcher->find(13, status) == FALSE);
964 REGEX_ASSERT(matcher->find(16, status) == FALSE);
965 REGEX_ASSERT(matcher->find(17, status) == FALSE);
966 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
967
968 status = U_ZERO_ERROR;
969 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
970 status = U_ZERO_ERROR;
971 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
972
973 REGEX_ASSERT(matcher->groupCount() == 0);
974
975 delete matcher;
976 delete pat;
977 }
978
979
980 //
981 // find, with \G in pattern (true if at the end of a previous match).
982 //
983 {
984 int32_t flags=0;
985 UParseError pe;
986 UErrorCode status=U_ZERO_ERROR;
987
988 UnicodeString re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
989 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
990 REGEX_CHECK_STATUS;
991 UnicodeString data = ".abcabc.abc..";
992 // 012345678901234567
993
994 RegexMatcher *matcher = pat->matcher(data, status);
995 REGEX_CHECK_STATUS;
996 REGEX_ASSERT(matcher->find());
997 REGEX_ASSERT(matcher->start(status) == 0);
998 REGEX_ASSERT(matcher->start(1, status) == -1);
999 REGEX_ASSERT(matcher->start(2, status) == 1);
1000
1001 REGEX_ASSERT(matcher->find());
1002 REGEX_ASSERT(matcher->start(status) == 4);
1003 REGEX_ASSERT(matcher->start(1, status) == 4);
1004 REGEX_ASSERT(matcher->start(2, status) == -1);
1005 REGEX_CHECK_STATUS;
1006
1007 delete matcher;
1008 delete pat;
1009 }
1010
1011 //
1012 // find with zero length matches, match position should bump ahead
1013 // to prevent loops.
1014 //
1015 {
1016 int32_t i;
1017 UErrorCode status=U_ZERO_ERROR;
1018 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
1019 // using an always-true look-ahead.
1020 REGEX_CHECK_STATUS;
1021 UnicodeString s(" ");
1022 m.reset(s);
1023 for (i=0; ; i++) {
1024 if (m.find() == FALSE) {
1025 break;
1026 }
1027 REGEX_ASSERT(m.start(status) == i);
1028 REGEX_ASSERT(m.end(status) == i);
1029 }
1030 REGEX_ASSERT(i==5);
1031
1032 // Check that the bump goes over surrogate pairs OK
1033 s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1034 s = s.unescape();
1035 m.reset(s);
1036 for (i=0; ; i+=2) {
1037 if (m.find() == FALSE) {
1038 break;
1039 }
1040 REGEX_ASSERT(m.start(status) == i);
1041 REGEX_ASSERT(m.end(status) == i);
1042 }
1043 REGEX_ASSERT(i==10);
1044 }
1045 {
1046 // find() loop breaking test.
1047 // with pattern of /.?/, should see a series of one char matches, then a single
1048 // match of zero length at the end of the input string.
1049 int32_t i;
1050 UErrorCode status=U_ZERO_ERROR;
1051 RegexMatcher m(".?", 0, status);
1052 REGEX_CHECK_STATUS;
1053 UnicodeString s(" ");
1054 m.reset(s);
1055 for (i=0; ; i++) {
1056 if (m.find() == FALSE) {
1057 break;
1058 }
1059 REGEX_ASSERT(m.start(status) == i);
1060 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1061 }
1062 REGEX_ASSERT(i==5);
1063 }
1064
1065
1066 //
1067 // Matchers with no input string behave as if they had an empty input string.
1068 //
1069
1070 {
1071 UErrorCode status = U_ZERO_ERROR;
1072 RegexMatcher m(".?", 0, status);
1073 REGEX_CHECK_STATUS;
1074 REGEX_ASSERT(m.find());
1075 REGEX_ASSERT(m.start(status) == 0);
1076 REGEX_ASSERT(m.input() == "");
1077 }
1078 {
1079 UErrorCode status = U_ZERO_ERROR;
1080 RegexPattern *p = RegexPattern::compile(".", 0, status);
1081 RegexMatcher *m = p->matcher(status);
1082 REGEX_CHECK_STATUS;
1083
1084 REGEX_ASSERT(m->find() == FALSE);
1085 REGEX_ASSERT(m->input() == "");
1086 delete m;
1087 delete p;
1088 }
1089
1090 //
1091 // Regions
1092 //
1093 {
1094 UErrorCode status = U_ZERO_ERROR;
1095 UnicodeString testString("This is test data");
1096 RegexMatcher m(".*", testString, 0, status);
1097 REGEX_CHECK_STATUS;
1098 REGEX_ASSERT(m.regionStart() == 0);
1099 REGEX_ASSERT(m.regionEnd() == testString.length());
1100 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1101 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1102
1103 m.region(2,4, status);
1104 REGEX_CHECK_STATUS;
1105 REGEX_ASSERT(m.matches(status));
1106 REGEX_ASSERT(m.start(status)==2);
1107 REGEX_ASSERT(m.end(status)==4);
1108 REGEX_CHECK_STATUS;
1109
1110 m.reset();
1111 REGEX_ASSERT(m.regionStart() == 0);
1112 REGEX_ASSERT(m.regionEnd() == testString.length());
1113
1114 UnicodeString shorterString("short");
1115 m.reset(shorterString);
1116 REGEX_ASSERT(m.regionStart() == 0);
1117 REGEX_ASSERT(m.regionEnd() == shorterString.length());
1118
1119 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1120 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1121 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1122 REGEX_ASSERT(&m == &m.reset());
1123 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1124
1125 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1126 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1127 REGEX_ASSERT(&m == &m.reset());
1128 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1129
1130 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1131 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1132 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1133 REGEX_ASSERT(&m == &m.reset());
1134 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1135
1136 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1137 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1138 REGEX_ASSERT(&m == &m.reset());
1139 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1140
1141 }
1142
1143 //
1144 // hitEnd() and requireEnd()
1145 //
1146 {
1147 UErrorCode status = U_ZERO_ERROR;
1148 UnicodeString testString("aabb");
1149 RegexMatcher m1(".*", testString, 0, status);
1150 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1151 REGEX_ASSERT(m1.hitEnd() == TRUE);
1152 REGEX_ASSERT(m1.requireEnd() == FALSE);
1153 REGEX_CHECK_STATUS;
1154
1155 status = U_ZERO_ERROR;
1156 RegexMatcher m2("a*", testString, 0, status);
1157 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1158 REGEX_ASSERT(m2.hitEnd() == FALSE);
1159 REGEX_ASSERT(m2.requireEnd() == FALSE);
1160 REGEX_CHECK_STATUS;
1161
1162 status = U_ZERO_ERROR;
1163 RegexMatcher m3(".*$", testString, 0, status);
1164 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1165 REGEX_ASSERT(m3.hitEnd() == TRUE);
1166 REGEX_ASSERT(m3.requireEnd() == TRUE);
1167 REGEX_CHECK_STATUS;
1168 }
1169
1170
1171 //
1172 // Compilation error on reset with UChar *
1173 // These were a hazard that people were stumbling over with runtime errors.
1174 // Changed them to compiler errors by adding private methods that more closely
1175 // matched the incorrect use of the functions.
1176 //
1177 #if 0
1178 {
1179 UErrorCode status = U_ZERO_ERROR;
1180 UChar ucharString[20];
1181 RegexMatcher m(".", 0, status);
1182 m.reset(ucharString); // should not compile.
1183
1184 RegexPattern *p = RegexPattern::compile(".", 0, status);
1185 RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile.
1186
1187 RegexMatcher m3(".", ucharString, 0, status); // Should not compile
1188 }
1189 #endif
1190
1191 //
1192 // Time Outs.
1193 // Note: These tests will need to be changed when the regexp engine is
1194 // able to detect and cut short the exponential time behavior on
1195 // this type of match.
1196 //
1197 {
1198 UErrorCode status = U_ZERO_ERROR;
1199 // Enough 'a's in the string to cause the match to time out.
1200 // (Each on additonal 'a' doubles the time)
1201 UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1202 RegexMatcher matcher("(a+)+b", testString, 0, status);
1203 REGEX_CHECK_STATUS;
1204 REGEX_ASSERT(matcher.getTimeLimit() == 0);
1205 matcher.setTimeLimit(100, status);
1206 REGEX_ASSERT(matcher.getTimeLimit() == 100);
1207 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1208 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1209 }
1210 {
1211 UErrorCode status = U_ZERO_ERROR;
1212 // Few enough 'a's to slip in under the time limit.
1213 UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1214 RegexMatcher matcher("(a+)+b", testString, 0, status);
1215 REGEX_CHECK_STATUS;
1216 matcher.setTimeLimit(100, status);
1217 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1218 REGEX_CHECK_STATUS;
1219 }
1220
1221 //
1222 // Stack Limits
1223 //
1224 {
1225 UErrorCode status = U_ZERO_ERROR;
1226 UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
1227
1228 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1229 // of the '+', and makes the stack frames larger.
1230 RegexMatcher matcher("(A)+A$", testString, 0, status);
1231
1232 // With the default stack, this match should fail to run
1233 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1234 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1235
1236 // With unlimited stack, it should run
1237 status = U_ZERO_ERROR;
1238 matcher.setStackLimit(0, status);
1239 REGEX_CHECK_STATUS;
1240 REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1241 REGEX_CHECK_STATUS;
1242 REGEX_ASSERT(matcher.getStackLimit() == 0);
1243
1244 // With a limited stack, it the match should fail
1245 status = U_ZERO_ERROR;
1246 matcher.setStackLimit(10000, status);
1247 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1248 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1249 REGEX_ASSERT(matcher.getStackLimit() == 10000);
1250 }
1251
1252 // A pattern that doesn't save state should work with
1253 // a minimal sized stack
1254 {
1255 UErrorCode status = U_ZERO_ERROR;
1256 UnicodeString testString = "abc";
1257 RegexMatcher matcher("abc", testString, 0, status);
1258 REGEX_CHECK_STATUS;
1259 matcher.setStackLimit(30, status);
1260 REGEX_CHECK_STATUS;
1261 REGEX_ASSERT(matcher.matches(status) == TRUE);
1262 REGEX_CHECK_STATUS;
1263 REGEX_ASSERT(matcher.getStackLimit() == 30);
1264
1265 // Negative stack sizes should fail
1266 status = U_ZERO_ERROR;
1267 matcher.setStackLimit(1000, status);
1268 REGEX_CHECK_STATUS;
1269 matcher.setStackLimit(-1, status);
1270 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1271 REGEX_ASSERT(matcher.getStackLimit() == 1000);
1272 }
1273
1274
1275 }
1276
1277
1278
1279
1280
1281
1282 //---------------------------------------------------------------------------
1283 //
1284 // API_Replace API test for class RegexMatcher, testing the
1285 // Replace family of functions.
1286 //
1287 //---------------------------------------------------------------------------
API_Replace()1288 void RegexTest::API_Replace() {
1289 //
1290 // Replace
1291 //
1292 int32_t flags=0;
1293 UParseError pe;
1294 UErrorCode status=U_ZERO_ERROR;
1295
1296 UnicodeString re("abc");
1297 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1298 REGEX_CHECK_STATUS;
1299 UnicodeString data = ".abc..abc...abc..";
1300 // 012345678901234567
1301 RegexMatcher *matcher = pat->matcher(data, status);
1302
1303 //
1304 // Plain vanilla matches.
1305 //
1306 UnicodeString dest;
1307 dest = matcher->replaceFirst("yz", status);
1308 REGEX_CHECK_STATUS;
1309 REGEX_ASSERT(dest == ".yz..abc...abc..");
1310
1311 dest = matcher->replaceAll("yz", status);
1312 REGEX_CHECK_STATUS;
1313 REGEX_ASSERT(dest == ".yz..yz...yz..");
1314
1315 //
1316 // Plain vanilla non-matches.
1317 //
1318 UnicodeString d2 = ".abx..abx...abx..";
1319 matcher->reset(d2);
1320 dest = matcher->replaceFirst("yz", status);
1321 REGEX_CHECK_STATUS;
1322 REGEX_ASSERT(dest == ".abx..abx...abx..");
1323
1324 dest = matcher->replaceAll("yz", status);
1325 REGEX_CHECK_STATUS;
1326 REGEX_ASSERT(dest == ".abx..abx...abx..");
1327
1328 //
1329 // Empty source string
1330 //
1331 UnicodeString d3 = "";
1332 matcher->reset(d3);
1333 dest = matcher->replaceFirst("yz", status);
1334 REGEX_CHECK_STATUS;
1335 REGEX_ASSERT(dest == "");
1336
1337 dest = matcher->replaceAll("yz", status);
1338 REGEX_CHECK_STATUS;
1339 REGEX_ASSERT(dest == "");
1340
1341 //
1342 // Empty substitution string
1343 //
1344 matcher->reset(data); // ".abc..abc...abc.."
1345 dest = matcher->replaceFirst("", status);
1346 REGEX_CHECK_STATUS;
1347 REGEX_ASSERT(dest == "...abc...abc..");
1348
1349 dest = matcher->replaceAll("", status);
1350 REGEX_CHECK_STATUS;
1351 REGEX_ASSERT(dest == "........");
1352
1353 //
1354 // match whole string
1355 //
1356 UnicodeString d4 = "abc";
1357 matcher->reset(d4);
1358 dest = matcher->replaceFirst("xyz", status);
1359 REGEX_CHECK_STATUS;
1360 REGEX_ASSERT(dest == "xyz");
1361
1362 dest = matcher->replaceAll("xyz", status);
1363 REGEX_CHECK_STATUS;
1364 REGEX_ASSERT(dest == "xyz");
1365
1366 //
1367 // Capture Group, simple case
1368 //
1369 UnicodeString re2("a(..)");
1370 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1371 REGEX_CHECK_STATUS;
1372 UnicodeString d5 = "abcdefg";
1373 RegexMatcher *matcher2 = pat2->matcher(d5, status);
1374 REGEX_CHECK_STATUS;
1375 dest = matcher2->replaceFirst("$1$1", status);
1376 REGEX_CHECK_STATUS;
1377 REGEX_ASSERT(dest == "bcbcdefg");
1378
1379 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1380 REGEX_CHECK_STATUS;
1381 REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1382
1383 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1384 REGEX_ASSERT(U_FAILURE(status));
1385 status = U_ZERO_ERROR;
1386
1387 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1388 replacement = replacement.unescape();
1389 dest = matcher2->replaceFirst(replacement, status);
1390 REGEX_CHECK_STATUS;
1391 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1392
1393 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1394
1395
1396 //
1397 // Replacement String with \u hex escapes
1398 //
1399 {
1400 UnicodeString src = "abc 1 abc 2 abc 3";
1401 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1402 matcher->reset(src);
1403 UnicodeString result = matcher->replaceAll(substitute, status);
1404 REGEX_CHECK_STATUS;
1405 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1406 }
1407 {
1408 UnicodeString src = "abc !";
1409 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1410 matcher->reset(src);
1411 UnicodeString result = matcher->replaceAll(substitute, status);
1412 REGEX_CHECK_STATUS;
1413 UnicodeString expected = UnicodeString("--");
1414 expected.append((UChar32)0x10000);
1415 expected.append("-- !");
1416 REGEX_ASSERT(result == expected);
1417 }
1418 // TODO: need more through testing of capture substitutions.
1419
1420 // Bug 4057
1421 //
1422 {
1423 status = U_ZERO_ERROR;
1424 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1425 RegexMatcher m("ss(.*?)ee", 0, status);
1426 REGEX_CHECK_STATUS;
1427 UnicodeString result;
1428
1429 // Multiple finds do NOT bump up the previous appendReplacement postion.
1430 m.reset(s);
1431 m.find();
1432 m.find();
1433 m.appendReplacement(result, "ooh", status);
1434 REGEX_CHECK_STATUS;
1435 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1436
1437 // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1438 status = U_ZERO_ERROR;
1439 result.truncate(0);
1440 m.reset(10, status);
1441 m.find();
1442 m.find();
1443 m.appendReplacement(result, "ooh", status);
1444 REGEX_CHECK_STATUS;
1445 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1446
1447 // find() at interior of string, appendReplacemnt still starts at beginning.
1448 status = U_ZERO_ERROR;
1449 result.truncate(0);
1450 m.reset();
1451 m.find(10, status);
1452 m.find();
1453 m.appendReplacement(result, "ooh", status);
1454 REGEX_CHECK_STATUS;
1455 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1456
1457 m.appendTail(result);
1458 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1459
1460 }
1461
1462 delete matcher2;
1463 delete pat2;
1464 delete matcher;
1465 delete pat;
1466 }
1467
1468
1469 //---------------------------------------------------------------------------
1470 //
1471 // API_Pattern Test that the API for class RegexPattern is
1472 // present and nominally working.
1473 //
1474 //---------------------------------------------------------------------------
API_Pattern()1475 void RegexTest::API_Pattern() {
1476 RegexPattern pata; // Test default constructor to not crash.
1477 RegexPattern patb;
1478
1479 REGEX_ASSERT(pata == patb);
1480 REGEX_ASSERT(pata == pata);
1481
1482 UnicodeString re1("abc[a-l][m-z]");
1483 UnicodeString re2("def");
1484 UErrorCode status = U_ZERO_ERROR;
1485 UParseError pe;
1486
1487 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status);
1488 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status);
1489 REGEX_CHECK_STATUS;
1490 REGEX_ASSERT(*pat1 == *pat1);
1491 REGEX_ASSERT(*pat1 != pata);
1492
1493 // Assign
1494 patb = *pat1;
1495 REGEX_ASSERT(patb == *pat1);
1496
1497 // Copy Construct
1498 RegexPattern patc(*pat1);
1499 REGEX_ASSERT(patc == *pat1);
1500 REGEX_ASSERT(patb == patc);
1501 REGEX_ASSERT(pat1 != pat2);
1502 patb = *pat2;
1503 REGEX_ASSERT(patb != patc);
1504 REGEX_ASSERT(patb == *pat2);
1505
1506 // Compile with no flags.
1507 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status);
1508 REGEX_ASSERT(*pat1a == *pat1);
1509
1510 REGEX_ASSERT(pat1a->flags() == 0);
1511
1512 // Compile with different flags should be not equal
1513 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1514 REGEX_CHECK_STATUS;
1515
1516 REGEX_ASSERT(*pat1b != *pat1a);
1517 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1518 REGEX_ASSERT(pat1a->flags() == 0);
1519 delete pat1b;
1520
1521 // clone
1522 RegexPattern *pat1c = pat1->clone();
1523 REGEX_ASSERT(*pat1c == *pat1);
1524 REGEX_ASSERT(*pat1c != *pat2);
1525
1526 delete pat1c;
1527 delete pat1a;
1528 delete pat1;
1529 delete pat2;
1530
1531
1532 //
1533 // Verify that a matcher created from a cloned pattern works.
1534 // (Jitterbug 3423)
1535 //
1536 {
1537 UErrorCode status = U_ZERO_ERROR;
1538 RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1539 RegexPattern *pClone = pSource->clone();
1540 delete pSource;
1541 RegexMatcher *mFromClone = pClone->matcher(status);
1542 REGEX_CHECK_STATUS;
1543 UnicodeString s = "Hello World";
1544 mFromClone->reset(s);
1545 REGEX_ASSERT(mFromClone->find() == TRUE);
1546 REGEX_ASSERT(mFromClone->group(status) == "Hello");
1547 REGEX_ASSERT(mFromClone->find() == TRUE);
1548 REGEX_ASSERT(mFromClone->group(status) == "World");
1549 REGEX_ASSERT(mFromClone->find() == FALSE);
1550 delete mFromClone;
1551 delete pClone;
1552 }
1553
1554 //
1555 // matches convenience API
1556 //
1557 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1558 REGEX_CHECK_STATUS;
1559 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1560 REGEX_CHECK_STATUS;
1561 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1562 REGEX_CHECK_STATUS;
1563 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1564 REGEX_CHECK_STATUS;
1565 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1566 REGEX_CHECK_STATUS;
1567 status = U_INDEX_OUTOFBOUNDS_ERROR;
1568 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1569 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1570
1571
1572 //
1573 // Split()
1574 //
1575 status = U_ZERO_ERROR;
1576 pat1 = RegexPattern::compile(" +", pe, status);
1577 REGEX_CHECK_STATUS;
1578 UnicodeString fields[10];
1579
1580 int32_t n;
1581 n = pat1->split("Now is the time", fields, 10, status);
1582 REGEX_CHECK_STATUS;
1583 REGEX_ASSERT(n==4);
1584 REGEX_ASSERT(fields[0]=="Now");
1585 REGEX_ASSERT(fields[1]=="is");
1586 REGEX_ASSERT(fields[2]=="the");
1587 REGEX_ASSERT(fields[3]=="time");
1588 REGEX_ASSERT(fields[4]=="");
1589
1590 n = pat1->split("Now is the time", fields, 2, status);
1591 REGEX_CHECK_STATUS;
1592 REGEX_ASSERT(n==2);
1593 REGEX_ASSERT(fields[0]=="Now");
1594 REGEX_ASSERT(fields[1]=="is the time");
1595 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
1596
1597 fields[1] = "*";
1598 status = U_ZERO_ERROR;
1599 n = pat1->split("Now is the time", fields, 1, status);
1600 REGEX_CHECK_STATUS;
1601 REGEX_ASSERT(n==1);
1602 REGEX_ASSERT(fields[0]=="Now is the time");
1603 REGEX_ASSERT(fields[1]=="*");
1604 status = U_ZERO_ERROR;
1605
1606 n = pat1->split(" Now is the time ", fields, 10, status);
1607 REGEX_CHECK_STATUS;
1608 REGEX_ASSERT(n==6);
1609 REGEX_ASSERT(fields[0]=="");
1610 REGEX_ASSERT(fields[1]=="Now");
1611 REGEX_ASSERT(fields[2]=="is");
1612 REGEX_ASSERT(fields[3]=="the");
1613 REGEX_ASSERT(fields[4]=="time");
1614 REGEX_ASSERT(fields[5]=="");
1615
1616 n = pat1->split(" ", fields, 10, status);
1617 REGEX_CHECK_STATUS;
1618 REGEX_ASSERT(n==2);
1619 REGEX_ASSERT(fields[0]=="");
1620 REGEX_ASSERT(fields[1]=="");
1621
1622 fields[0] = "foo";
1623 n = pat1->split("", fields, 10, status);
1624 REGEX_CHECK_STATUS;
1625 REGEX_ASSERT(n==0);
1626 REGEX_ASSERT(fields[0]=="foo");
1627
1628 delete pat1;
1629
1630 // split, with a pattern with (capture)
1631 pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status);
1632 REGEX_CHECK_STATUS;
1633
1634 status = U_ZERO_ERROR;
1635 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1636 REGEX_CHECK_STATUS;
1637 REGEX_ASSERT(n==7);
1638 REGEX_ASSERT(fields[0]=="");
1639 REGEX_ASSERT(fields[1]=="a");
1640 REGEX_ASSERT(fields[2]=="Now is ");
1641 REGEX_ASSERT(fields[3]=="b");
1642 REGEX_ASSERT(fields[4]=="the time");
1643 REGEX_ASSERT(fields[5]=="c");
1644 REGEX_ASSERT(fields[6]=="");
1645 REGEX_ASSERT(status==U_ZERO_ERROR);
1646
1647 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
1648 REGEX_CHECK_STATUS;
1649 REGEX_ASSERT(n==7);
1650 REGEX_ASSERT(fields[0]==" ");
1651 REGEX_ASSERT(fields[1]=="a");
1652 REGEX_ASSERT(fields[2]=="Now is ");
1653 REGEX_ASSERT(fields[3]=="b");
1654 REGEX_ASSERT(fields[4]=="the time");
1655 REGEX_ASSERT(fields[5]=="c");
1656 REGEX_ASSERT(fields[6]=="");
1657
1658 status = U_ZERO_ERROR;
1659 fields[6] = "foo";
1660 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
1661 REGEX_CHECK_STATUS;
1662 REGEX_ASSERT(n==6);
1663 REGEX_ASSERT(fields[0]==" ");
1664 REGEX_ASSERT(fields[1]=="a");
1665 REGEX_ASSERT(fields[2]=="Now is ");
1666 REGEX_ASSERT(fields[3]=="b");
1667 REGEX_ASSERT(fields[4]=="the time");
1668 REGEX_ASSERT(fields[5]==""); // All text following "<c>" field delimiter.
1669 REGEX_ASSERT(fields[6]=="foo");
1670
1671 status = U_ZERO_ERROR;
1672 fields[5] = "foo";
1673 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
1674 REGEX_CHECK_STATUS;
1675 REGEX_ASSERT(n==5);
1676 REGEX_ASSERT(fields[0]==" ");
1677 REGEX_ASSERT(fields[1]=="a");
1678 REGEX_ASSERT(fields[2]=="Now is ");
1679 REGEX_ASSERT(fields[3]=="b");
1680 REGEX_ASSERT(fields[4]=="the time<c>");
1681 REGEX_ASSERT(fields[5]=="foo");
1682
1683 status = U_ZERO_ERROR;
1684 fields[5] = "foo";
1685 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
1686 REGEX_CHECK_STATUS;
1687 REGEX_ASSERT(n==5);
1688 REGEX_ASSERT(fields[0]==" ");
1689 REGEX_ASSERT(fields[1]=="a");
1690 REGEX_ASSERT(fields[2]=="Now is ");
1691 REGEX_ASSERT(fields[3]=="b");
1692 REGEX_ASSERT(fields[4]=="the time");
1693 REGEX_ASSERT(fields[5]=="foo");
1694
1695 status = U_ZERO_ERROR;
1696 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
1697 REGEX_CHECK_STATUS;
1698 REGEX_ASSERT(n==4);
1699 REGEX_ASSERT(fields[0]==" ");
1700 REGEX_ASSERT(fields[1]=="a");
1701 REGEX_ASSERT(fields[2]=="Now is ");
1702 REGEX_ASSERT(fields[3]=="the time<c>");
1703 status = U_ZERO_ERROR;
1704 delete pat1;
1705
1706 pat1 = RegexPattern::compile("([-,])", pe, status);
1707 REGEX_CHECK_STATUS;
1708 n = pat1->split("1-10,20", fields, 10, status);
1709 REGEX_CHECK_STATUS;
1710 REGEX_ASSERT(n==5);
1711 REGEX_ASSERT(fields[0]=="1");
1712 REGEX_ASSERT(fields[1]=="-");
1713 REGEX_ASSERT(fields[2]=="10");
1714 REGEX_ASSERT(fields[3]==",");
1715 REGEX_ASSERT(fields[4]=="20");
1716 delete pat1;
1717
1718 // Test split of string with empty trailing fields
1719 pat1 = RegexPattern::compile(",", pe, status);
1720 REGEX_CHECK_STATUS;
1721 n = pat1->split("a,b,c,", fields, 10, status);
1722 REGEX_CHECK_STATUS;
1723 REGEX_ASSERT(n==4);
1724 REGEX_ASSERT(fields[0]=="a");
1725 REGEX_ASSERT(fields[1]=="b");
1726 REGEX_ASSERT(fields[2]=="c");
1727 REGEX_ASSERT(fields[3]=="");
1728
1729 n = pat1->split("a,,,", fields, 10, status);
1730 REGEX_CHECK_STATUS;
1731 REGEX_ASSERT(n==4);
1732 REGEX_ASSERT(fields[0]=="a");
1733 REGEX_ASSERT(fields[1]=="");
1734 REGEX_ASSERT(fields[2]=="");
1735 REGEX_ASSERT(fields[3]=="");
1736 delete pat1;
1737
1738 // Split Separator with zero length match.
1739 pat1 = RegexPattern::compile(":?", pe, status);
1740 REGEX_CHECK_STATUS;
1741 n = pat1->split("abc", fields, 10, status);
1742 REGEX_CHECK_STATUS;
1743 REGEX_ASSERT(n==5);
1744 REGEX_ASSERT(fields[0]=="");
1745 REGEX_ASSERT(fields[1]=="a");
1746 REGEX_ASSERT(fields[2]=="b");
1747 REGEX_ASSERT(fields[3]=="c");
1748 REGEX_ASSERT(fields[4]=="");
1749
1750 delete pat1;
1751
1752 //
1753 // RegexPattern::pattern()
1754 //
1755 pat1 = new RegexPattern();
1756 REGEX_ASSERT(pat1->pattern() == "");
1757 delete pat1;
1758
1759 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1760 REGEX_CHECK_STATUS;
1761 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1762 delete pat1;
1763
1764
1765 //
1766 // classID functions
1767 //
1768 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1769 REGEX_CHECK_STATUS;
1770 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1771 REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1772 UnicodeString Hello("Hello, world.");
1773 RegexMatcher *m = pat1->matcher(Hello, status);
1774 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1775 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1776 REGEX_ASSERT(m->getDynamicClassID() != NULL);
1777 delete m;
1778 delete pat1;
1779
1780 }
1781
1782 //---------------------------------------------------------------------------
1783 //
1784 // API_Match_UTF8 Test that the alternate engine for class RegexMatcher
1785 // is present and working, but excluding functions
1786 // implementing replace operations.
1787 //
1788 //---------------------------------------------------------------------------
API_Match_UTF8()1789 void RegexTest::API_Match_UTF8() {
1790 UParseError pe;
1791 UErrorCode status=U_ZERO_ERROR;
1792 int32_t flags = 0;
1793
1794 //
1795 // Debug - slide failing test cases early
1796 //
1797 #if 0
1798 {
1799 }
1800 return;
1801 #endif
1802
1803 //
1804 // Simple pattern compilation
1805 //
1806 {
1807 UText re = UTEXT_INITIALIZER;
1808 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1809 REGEX_VERBOSE_TEXT(&re);
1810 RegexPattern *pat2;
1811 pat2 = RegexPattern::compile(&re, flags, pe, status);
1812 REGEX_CHECK_STATUS;
1813
1814 UText input1 = UTEXT_INITIALIZER;
1815 UText input2 = UTEXT_INITIALIZER;
1816 UText empty = UTEXT_INITIALIZER;
1817 regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1818 REGEX_VERBOSE_TEXT(&input1);
1819 regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1820 REGEX_VERBOSE_TEXT(&input2);
1821 utext_openUChars(&empty, NULL, 0, &status);
1822
1823 int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1824 int32_t input2Len = strlen("not abc");
1825
1826
1827 //
1828 // Matcher creation and reset.
1829 //
1830 RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1831 REGEX_CHECK_STATUS;
1832 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1833 const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1834 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1835 m1->reset(&input2);
1836 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1837 const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1838 REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1839 m1->reset(&input1);
1840 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1841 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1842 m1->reset(&empty);
1843 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1844 REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1845
1846 //
1847 // reset(pos, status)
1848 //
1849 m1->reset(&input1);
1850 m1->reset(4, status);
1851 REGEX_CHECK_STATUS;
1852 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1853 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1854
1855 m1->reset(-1, status);
1856 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1857 status = U_ZERO_ERROR;
1858
1859 m1->reset(0, status);
1860 REGEX_CHECK_STATUS;
1861 status = U_ZERO_ERROR;
1862
1863 m1->reset(input1Len-1, status);
1864 REGEX_CHECK_STATUS;
1865 status = U_ZERO_ERROR;
1866
1867 m1->reset(input1Len, status);
1868 REGEX_CHECK_STATUS;
1869 status = U_ZERO_ERROR;
1870
1871 m1->reset(input1Len+1, status);
1872 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1873 status = U_ZERO_ERROR;
1874
1875 //
1876 // match(pos, status)
1877 //
1878 m1->reset(&input2);
1879 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1880 m1->reset();
1881 REGEX_ASSERT(m1->matches(3, status) == FALSE);
1882 m1->reset();
1883 REGEX_ASSERT(m1->matches(5, status) == FALSE);
1884 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1885 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1886 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1887
1888 // Match() at end of string should fail, but should not
1889 // be an error.
1890 status = U_ZERO_ERROR;
1891 REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1892 REGEX_CHECK_STATUS;
1893
1894 // Match beyond end of string should fail with an error.
1895 status = U_ZERO_ERROR;
1896 REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1897 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1898
1899 // Successful match at end of string.
1900 {
1901 status = U_ZERO_ERROR;
1902 RegexMatcher m("A?", 0, status); // will match zero length string.
1903 REGEX_CHECK_STATUS;
1904 m.reset(&input1);
1905 REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1906 REGEX_CHECK_STATUS;
1907 m.reset(&empty);
1908 REGEX_ASSERT(m.matches(0, status) == TRUE);
1909 REGEX_CHECK_STATUS;
1910 }
1911
1912
1913 //
1914 // lookingAt(pos, status)
1915 //
1916 status = U_ZERO_ERROR;
1917 m1->reset(&input2); // "not abc"
1918 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1919 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1920 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1921 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1922 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1923 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1924 status = U_ZERO_ERROR;
1925 REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1926 REGEX_CHECK_STATUS;
1927 REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1928 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1929
1930 delete m1;
1931 delete pat2;
1932
1933 utext_close(&re);
1934 utext_close(&input1);
1935 utext_close(&input2);
1936 utext_close(&empty);
1937 }
1938
1939
1940 //
1941 // Capture Group.
1942 // RegexMatcher::start();
1943 // RegexMatcher::end();
1944 // RegexMatcher::groupCount();
1945 //
1946 {
1947 int32_t flags=0;
1948 UParseError pe;
1949 UErrorCode status=U_ZERO_ERROR;
1950 UText re=UTEXT_INITIALIZER;
1951 const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1952 utext_openUTF8(&re, str_01234567_pat, -1, &status);
1953
1954 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1955 REGEX_CHECK_STATUS;
1956
1957 UText input = UTEXT_INITIALIZER;
1958 const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1959 utext_openUTF8(&input, str_0123456789, -1, &status);
1960
1961 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
1962 REGEX_CHECK_STATUS;
1963 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
1964 static const int32_t matchStarts[] = {0, 2, 4, 8};
1965 static const int32_t matchEnds[] = {10, 8, 6, 10};
1966 int32_t i;
1967 for (i=0; i<4; i++) {
1968 int32_t actualStart = matcher->start(i, status);
1969 REGEX_CHECK_STATUS;
1970 if (actualStart != matchStarts[i]) {
1971 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
1972 __FILE__, __LINE__, i, matchStarts[i], actualStart);
1973 }
1974 int32_t actualEnd = matcher->end(i, status);
1975 REGEX_CHECK_STATUS;
1976 if (actualEnd != matchEnds[i]) {
1977 errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
1978 __FILE__, __LINE__, i, matchEnds[i], actualEnd);
1979 }
1980 }
1981
1982 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
1983 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
1984
1985 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1986 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
1987 matcher->reset();
1988 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
1989
1990 matcher->lookingAt(status);
1991
1992 UnicodeString dest;
1993 UText destText = UTEXT_INITIALIZER;
1994 utext_openUnicodeString(&destText, &dest, &status);
1995 UText *result;
1996 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1997 // Test shallow-clone API
1998 int64_t group_len;
1999 result = matcher->group((UText *)NULL, group_len, status);
2000 REGEX_CHECK_STATUS;
2001 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2002 utext_close(result);
2003 result = matcher->group(0, &destText, group_len, status);
2004 REGEX_CHECK_STATUS;
2005 REGEX_ASSERT(result == &destText);
2006 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2007 // destText is now immutable, reopen it
2008 utext_close(&destText);
2009 utext_openUnicodeString(&destText, &dest, &status);
2010
2011 int64_t length;
2012 result = matcher->group(0, NULL, length, status);
2013 REGEX_CHECK_STATUS;
2014 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2015 utext_close(result);
2016 result = matcher->group(0, &destText, length, status);
2017 REGEX_CHECK_STATUS;
2018 REGEX_ASSERT(result == &destText);
2019 REGEX_ASSERT(utext_getNativeIndex(result) == 0);
2020 REGEX_ASSERT(length == 10);
2021 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2022
2023 // Capture Group 1 == "234567"
2024 result = matcher->group(1, NULL, length, status);
2025 REGEX_CHECK_STATUS;
2026 REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2027 REGEX_ASSERT(length == 6);
2028 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2029 utext_close(result);
2030
2031 result = matcher->group(1, &destText, length, status);
2032 REGEX_CHECK_STATUS;
2033 REGEX_ASSERT(result == &destText);
2034 REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2035 REGEX_ASSERT(length == 6);
2036 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2037 utext_close(result);
2038
2039 // Capture Group 2 == "45"
2040 result = matcher->group(2, NULL, length, status);
2041 REGEX_CHECK_STATUS;
2042 REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2043 REGEX_ASSERT(length == 2);
2044 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2045 utext_close(result);
2046
2047 result = matcher->group(2, &destText, length, status);
2048 REGEX_CHECK_STATUS;
2049 REGEX_ASSERT(result == &destText);
2050 REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2051 REGEX_ASSERT(length == 2);
2052 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2053 utext_close(result);
2054
2055 // Capture Group 3 == "89"
2056 result = matcher->group(3, NULL, length, status);
2057 REGEX_CHECK_STATUS;
2058 REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2059 REGEX_ASSERT(length == 2);
2060 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2061 utext_close(result);
2062
2063 result = matcher->group(3, &destText, length, status);
2064 REGEX_CHECK_STATUS;
2065 REGEX_ASSERT(result == &destText);
2066 REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2067 REGEX_ASSERT(length == 2);
2068 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2069 utext_close(result);
2070
2071 // Capture Group number out of range.
2072 status = U_ZERO_ERROR;
2073 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2074 status = U_ZERO_ERROR;
2075 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2076 status = U_ZERO_ERROR;
2077 matcher->reset();
2078 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2079
2080 delete matcher;
2081 delete pat;
2082
2083 utext_close(&destText);
2084 utext_close(&input);
2085 utext_close(&re);
2086 }
2087
2088 //
2089 // find
2090 //
2091 {
2092 int32_t flags=0;
2093 UParseError pe;
2094 UErrorCode status=U_ZERO_ERROR;
2095 UText re=UTEXT_INITIALIZER;
2096 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2097 utext_openUTF8(&re, str_abc, -1, &status);
2098
2099 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2100 REGEX_CHECK_STATUS;
2101 UText input = UTEXT_INITIALIZER;
2102 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2103 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2104 // 012345678901234567
2105
2106 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2107 REGEX_CHECK_STATUS;
2108 REGEX_ASSERT(matcher->find());
2109 REGEX_ASSERT(matcher->start(status) == 1);
2110 REGEX_ASSERT(matcher->find());
2111 REGEX_ASSERT(matcher->start(status) == 6);
2112 REGEX_ASSERT(matcher->find());
2113 REGEX_ASSERT(matcher->start(status) == 12);
2114 REGEX_ASSERT(matcher->find() == FALSE);
2115 REGEX_ASSERT(matcher->find() == FALSE);
2116
2117 matcher->reset();
2118 REGEX_ASSERT(matcher->find());
2119 REGEX_ASSERT(matcher->start(status) == 1);
2120
2121 REGEX_ASSERT(matcher->find(0, status));
2122 REGEX_ASSERT(matcher->start(status) == 1);
2123 REGEX_ASSERT(matcher->find(1, status));
2124 REGEX_ASSERT(matcher->start(status) == 1);
2125 REGEX_ASSERT(matcher->find(2, status));
2126 REGEX_ASSERT(matcher->start(status) == 6);
2127 REGEX_ASSERT(matcher->find(12, status));
2128 REGEX_ASSERT(matcher->start(status) == 12);
2129 REGEX_ASSERT(matcher->find(13, status) == FALSE);
2130 REGEX_ASSERT(matcher->find(16, status) == FALSE);
2131 REGEX_ASSERT(matcher->find(17, status) == FALSE);
2132 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2133
2134 status = U_ZERO_ERROR;
2135 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2136 status = U_ZERO_ERROR;
2137 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2138
2139 REGEX_ASSERT(matcher->groupCount() == 0);
2140
2141 delete matcher;
2142 delete pat;
2143
2144 utext_close(&input);
2145 utext_close(&re);
2146 }
2147
2148
2149 //
2150 // find, with \G in pattern (true if at the end of a previous match).
2151 //
2152 {
2153 int32_t flags=0;
2154 UParseError pe;
2155 UErrorCode status=U_ZERO_ERROR;
2156 UText re=UTEXT_INITIALIZER;
2157 const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2158 utext_openUTF8(&re, str_Gabcabc, -1, &status);
2159
2160 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2161
2162 REGEX_CHECK_STATUS;
2163 UText input = UTEXT_INITIALIZER;
2164 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2165 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2166 // 012345678901234567
2167
2168 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2169 REGEX_CHECK_STATUS;
2170 REGEX_ASSERT(matcher->find());
2171 REGEX_ASSERT(matcher->start(status) == 0);
2172 REGEX_ASSERT(matcher->start(1, status) == -1);
2173 REGEX_ASSERT(matcher->start(2, status) == 1);
2174
2175 REGEX_ASSERT(matcher->find());
2176 REGEX_ASSERT(matcher->start(status) == 4);
2177 REGEX_ASSERT(matcher->start(1, status) == 4);
2178 REGEX_ASSERT(matcher->start(2, status) == -1);
2179 REGEX_CHECK_STATUS;
2180
2181 delete matcher;
2182 delete pat;
2183
2184 utext_close(&input);
2185 utext_close(&re);
2186 }
2187
2188 //
2189 // find with zero length matches, match position should bump ahead
2190 // to prevent loops.
2191 //
2192 {
2193 int32_t i;
2194 UErrorCode status=U_ZERO_ERROR;
2195 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
2196 // using an always-true look-ahead.
2197 REGEX_CHECK_STATUS;
2198 UText s = UTEXT_INITIALIZER;
2199 utext_openUTF8(&s, " ", -1, &status);
2200 m.reset(&s);
2201 for (i=0; ; i++) {
2202 if (m.find() == FALSE) {
2203 break;
2204 }
2205 REGEX_ASSERT(m.start(status) == i);
2206 REGEX_ASSERT(m.end(status) == i);
2207 }
2208 REGEX_ASSERT(i==5);
2209
2210 // Check that the bump goes over characters outside the BMP OK
2211 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2212 unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2213 utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2214 m.reset(&s);
2215 for (i=0; ; i+=4) {
2216 if (m.find() == FALSE) {
2217 break;
2218 }
2219 REGEX_ASSERT(m.start(status) == i);
2220 REGEX_ASSERT(m.end(status) == i);
2221 }
2222 REGEX_ASSERT(i==20);
2223
2224 utext_close(&s);
2225 }
2226 {
2227 // find() loop breaking test.
2228 // with pattern of /.?/, should see a series of one char matches, then a single
2229 // match of zero length at the end of the input string.
2230 int32_t i;
2231 UErrorCode status=U_ZERO_ERROR;
2232 RegexMatcher m(".?", 0, status);
2233 REGEX_CHECK_STATUS;
2234 UText s = UTEXT_INITIALIZER;
2235 utext_openUTF8(&s, " ", -1, &status);
2236 m.reset(&s);
2237 for (i=0; ; i++) {
2238 if (m.find() == FALSE) {
2239 break;
2240 }
2241 REGEX_ASSERT(m.start(status) == i);
2242 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2243 }
2244 REGEX_ASSERT(i==5);
2245
2246 utext_close(&s);
2247 }
2248
2249
2250 //
2251 // Matchers with no input string behave as if they had an empty input string.
2252 //
2253
2254 {
2255 UErrorCode status = U_ZERO_ERROR;
2256 RegexMatcher m(".?", 0, status);
2257 REGEX_CHECK_STATUS;
2258 REGEX_ASSERT(m.find());
2259 REGEX_ASSERT(m.start(status) == 0);
2260 REGEX_ASSERT(m.input() == "");
2261 }
2262 {
2263 UErrorCode status = U_ZERO_ERROR;
2264 RegexPattern *p = RegexPattern::compile(".", 0, status);
2265 RegexMatcher *m = p->matcher(status);
2266 REGEX_CHECK_STATUS;
2267
2268 REGEX_ASSERT(m->find() == FALSE);
2269 REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2270 delete m;
2271 delete p;
2272 }
2273
2274 //
2275 // Regions
2276 //
2277 {
2278 UErrorCode status = U_ZERO_ERROR;
2279 UText testPattern = UTEXT_INITIALIZER;
2280 UText testText = UTEXT_INITIALIZER;
2281 regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2282 REGEX_VERBOSE_TEXT(&testPattern);
2283 regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2284 REGEX_VERBOSE_TEXT(&testText);
2285
2286 RegexMatcher m(&testPattern, &testText, 0, status);
2287 REGEX_CHECK_STATUS;
2288 REGEX_ASSERT(m.regionStart() == 0);
2289 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2290 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2291 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2292
2293 m.region(2,4, status);
2294 REGEX_CHECK_STATUS;
2295 REGEX_ASSERT(m.matches(status));
2296 REGEX_ASSERT(m.start(status)==2);
2297 REGEX_ASSERT(m.end(status)==4);
2298 REGEX_CHECK_STATUS;
2299
2300 m.reset();
2301 REGEX_ASSERT(m.regionStart() == 0);
2302 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2303
2304 regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2305 REGEX_VERBOSE_TEXT(&testText);
2306 m.reset(&testText);
2307 REGEX_ASSERT(m.regionStart() == 0);
2308 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2309
2310 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2311 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2312 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2313 REGEX_ASSERT(&m == &m.reset());
2314 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2315
2316 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2317 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2318 REGEX_ASSERT(&m == &m.reset());
2319 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2320
2321 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2322 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2323 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2324 REGEX_ASSERT(&m == &m.reset());
2325 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2326
2327 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2328 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2329 REGEX_ASSERT(&m == &m.reset());
2330 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2331
2332 utext_close(&testText);
2333 utext_close(&testPattern);
2334 }
2335
2336 //
2337 // hitEnd() and requireEnd()
2338 //
2339 {
2340 UErrorCode status = U_ZERO_ERROR;
2341 UText testPattern = UTEXT_INITIALIZER;
2342 UText testText = UTEXT_INITIALIZER;
2343 const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2344 const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2345 utext_openUTF8(&testPattern, str_, -1, &status);
2346 utext_openUTF8(&testText, str_aabb, -1, &status);
2347
2348 RegexMatcher m1(&testPattern, &testText, 0, status);
2349 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2350 REGEX_ASSERT(m1.hitEnd() == TRUE);
2351 REGEX_ASSERT(m1.requireEnd() == FALSE);
2352 REGEX_CHECK_STATUS;
2353
2354 status = U_ZERO_ERROR;
2355 const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2356 utext_openUTF8(&testPattern, str_a, -1, &status);
2357 RegexMatcher m2(&testPattern, &testText, 0, status);
2358 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2359 REGEX_ASSERT(m2.hitEnd() == FALSE);
2360 REGEX_ASSERT(m2.requireEnd() == FALSE);
2361 REGEX_CHECK_STATUS;
2362
2363 status = U_ZERO_ERROR;
2364 const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2365 utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2366 RegexMatcher m3(&testPattern, &testText, 0, status);
2367 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2368 REGEX_ASSERT(m3.hitEnd() == TRUE);
2369 REGEX_ASSERT(m3.requireEnd() == TRUE);
2370 REGEX_CHECK_STATUS;
2371
2372 utext_close(&testText);
2373 utext_close(&testPattern);
2374 }
2375 }
2376
2377
2378 //---------------------------------------------------------------------------
2379 //
2380 // API_Replace_UTF8 API test for class RegexMatcher, testing the
2381 // Replace family of functions.
2382 //
2383 //---------------------------------------------------------------------------
API_Replace_UTF8()2384 void RegexTest::API_Replace_UTF8() {
2385 //
2386 // Replace
2387 //
2388 int32_t flags=0;
2389 UParseError pe;
2390 UErrorCode status=U_ZERO_ERROR;
2391
2392 UText re=UTEXT_INITIALIZER;
2393 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2394 REGEX_VERBOSE_TEXT(&re);
2395 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2396 REGEX_CHECK_STATUS;
2397
2398 char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2399 // 012345678901234567
2400 UText dataText = UTEXT_INITIALIZER;
2401 utext_openUTF8(&dataText, data, -1, &status);
2402 REGEX_CHECK_STATUS;
2403 REGEX_VERBOSE_TEXT(&dataText);
2404 RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2405
2406 //
2407 // Plain vanilla matches.
2408 //
2409 UnicodeString dest;
2410 UText destText = UTEXT_INITIALIZER;
2411 utext_openUnicodeString(&destText, &dest, &status);
2412 UText *result;
2413
2414 UText replText = UTEXT_INITIALIZER;
2415
2416 const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2417 utext_openUTF8(&replText, str_yz, -1, &status);
2418 REGEX_VERBOSE_TEXT(&replText);
2419 result = matcher->replaceFirst(&replText, NULL, status);
2420 REGEX_CHECK_STATUS;
2421 const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2422 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2423 utext_close(result);
2424 result = matcher->replaceFirst(&replText, &destText, status);
2425 REGEX_CHECK_STATUS;
2426 REGEX_ASSERT(result == &destText);
2427 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2428
2429 result = matcher->replaceAll(&replText, NULL, status);
2430 REGEX_CHECK_STATUS;
2431 const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2432 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2433 utext_close(result);
2434
2435 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2436 result = matcher->replaceAll(&replText, &destText, status);
2437 REGEX_CHECK_STATUS;
2438 REGEX_ASSERT(result == &destText);
2439 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2440
2441 //
2442 // Plain vanilla non-matches.
2443 //
2444 const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2445 utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2446 matcher->reset(&dataText);
2447
2448 result = matcher->replaceFirst(&replText, NULL, status);
2449 REGEX_CHECK_STATUS;
2450 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2451 utext_close(result);
2452 result = matcher->replaceFirst(&replText, &destText, status);
2453 REGEX_CHECK_STATUS;
2454 REGEX_ASSERT(result == &destText);
2455 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2456
2457 result = matcher->replaceAll(&replText, NULL, status);
2458 REGEX_CHECK_STATUS;
2459 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2460 utext_close(result);
2461 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2462 result = matcher->replaceAll(&replText, &destText, status);
2463 REGEX_CHECK_STATUS;
2464 REGEX_ASSERT(result == &destText);
2465 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2466
2467 //
2468 // Empty source string
2469 //
2470 utext_openUTF8(&dataText, NULL, 0, &status);
2471 matcher->reset(&dataText);
2472
2473 result = matcher->replaceFirst(&replText, NULL, status);
2474 REGEX_CHECK_STATUS;
2475 REGEX_ASSERT_UTEXT_UTF8("", result);
2476 utext_close(result);
2477 result = matcher->replaceFirst(&replText, &destText, status);
2478 REGEX_CHECK_STATUS;
2479 REGEX_ASSERT(result == &destText);
2480 REGEX_ASSERT_UTEXT_UTF8("", result);
2481
2482 result = matcher->replaceAll(&replText, NULL, status);
2483 REGEX_CHECK_STATUS;
2484 REGEX_ASSERT_UTEXT_UTF8("", result);
2485 utext_close(result);
2486 result = matcher->replaceAll(&replText, &destText, status);
2487 REGEX_CHECK_STATUS;
2488 REGEX_ASSERT(result == &destText);
2489 REGEX_ASSERT_UTEXT_UTF8("", result);
2490
2491 //
2492 // Empty substitution string
2493 //
2494 utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2495 matcher->reset(&dataText);
2496
2497 utext_openUTF8(&replText, NULL, 0, &status);
2498 result = matcher->replaceFirst(&replText, NULL, status);
2499 REGEX_CHECK_STATUS;
2500 const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2501 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2502 utext_close(result);
2503 result = matcher->replaceFirst(&replText, &destText, status);
2504 REGEX_CHECK_STATUS;
2505 REGEX_ASSERT(result == &destText);
2506 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2507
2508 result = matcher->replaceAll(&replText, NULL, status);
2509 REGEX_CHECK_STATUS;
2510 const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2511 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2512 utext_close(result);
2513 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2514 result = matcher->replaceAll(&replText, &destText, status);
2515 REGEX_CHECK_STATUS;
2516 REGEX_ASSERT(result == &destText);
2517 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2518
2519 //
2520 // match whole string
2521 //
2522 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2523 utext_openUTF8(&dataText, str_abc, -1, &status);
2524 matcher->reset(&dataText);
2525
2526 const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2527 utext_openUTF8(&replText, str_xyz, -1, &status);
2528 result = matcher->replaceFirst(&replText, NULL, status);
2529 REGEX_CHECK_STATUS;
2530 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2531 utext_close(result);
2532 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2533 result = matcher->replaceFirst(&replText, &destText, status);
2534 REGEX_CHECK_STATUS;
2535 REGEX_ASSERT(result == &destText);
2536 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2537
2538 result = matcher->replaceAll(&replText, NULL, status);
2539 REGEX_CHECK_STATUS;
2540 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2541 utext_close(result);
2542 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2543 result = matcher->replaceAll(&replText, &destText, status);
2544 REGEX_CHECK_STATUS;
2545 REGEX_ASSERT(result == &destText);
2546 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2547
2548 //
2549 // Capture Group, simple case
2550 //
2551 const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2552 utext_openUTF8(&re, str_add, -1, &status);
2553 RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2554 REGEX_CHECK_STATUS;
2555
2556 const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2557 utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2558 RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2559 REGEX_CHECK_STATUS;
2560
2561 const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2562 utext_openUTF8(&replText, str_11, -1, &status);
2563 result = matcher2->replaceFirst(&replText, NULL, status);
2564 REGEX_CHECK_STATUS;
2565 const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2566 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2567 utext_close(result);
2568 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2569 result = matcher2->replaceFirst(&replText, &destText, status);
2570 REGEX_CHECK_STATUS;
2571 REGEX_ASSERT(result == &destText);
2572 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2573
2574 const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2575 utext_openUTF8(&replText, str_v, -1, &status);
2576 REGEX_VERBOSE_TEXT(&replText);
2577 result = matcher2->replaceFirst(&replText, NULL, status);
2578 REGEX_CHECK_STATUS;
2579 const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2580 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2581 utext_close(result);
2582 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2583 result = matcher2->replaceFirst(&replText, &destText, status);
2584 REGEX_CHECK_STATUS;
2585 REGEX_ASSERT(result == &destText);
2586 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2587
2588 const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2589 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2590 0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2591 utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2592 result = matcher2->replaceFirst(&replText, NULL, status);
2593 REGEX_CHECK_STATUS;
2594 const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2595 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2596 utext_close(result);
2597 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2598 result = matcher2->replaceFirst(&replText, &destText, status);
2599 REGEX_CHECK_STATUS;
2600 REGEX_ASSERT(result == &destText);
2601 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2602
2603 unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2604 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2605 // 012345678901234567890123456
2606 supplDigitChars[22] = 0xF0;
2607 supplDigitChars[23] = 0x9D;
2608 supplDigitChars[24] = 0x9F;
2609 supplDigitChars[25] = 0x8F;
2610 utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2611
2612 result = matcher2->replaceFirst(&replText, NULL, status);
2613 REGEX_CHECK_STATUS;
2614 const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2615 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2616 utext_close(result);
2617 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2618 result = matcher2->replaceFirst(&replText, &destText, status);
2619 REGEX_CHECK_STATUS;
2620 REGEX_ASSERT(result == &destText);
2621 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2622 const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5..." */
2623 utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2624 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2625 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2626 utext_close(result);
2627 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2628 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2629 REGEX_ASSERT(result == &destText);
2630 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2631
2632 //
2633 // Replacement String with \u hex escapes
2634 //
2635 {
2636 const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2637 const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2638 utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2639 utext_openUTF8(&replText, str_u0043, -1, &status);
2640 matcher->reset(&dataText);
2641
2642 result = matcher->replaceAll(&replText, NULL, status);
2643 REGEX_CHECK_STATUS;
2644 const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2645 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2646 utext_close(result);
2647 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2648 result = matcher->replaceAll(&replText, &destText, status);
2649 REGEX_CHECK_STATUS;
2650 REGEX_ASSERT(result == &destText);
2651 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2652 }
2653 {
2654 const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2655 utext_openUTF8(&dataText, str_abc, -1, &status);
2656 const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2657 utext_openUTF8(&replText, str_U00010000, -1, &status);
2658 matcher->reset(&dataText);
2659
2660 unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2661 // 0123456789
2662 expected[2] = 0xF0;
2663 expected[3] = 0x90;
2664 expected[4] = 0x80;
2665 expected[5] = 0x80;
2666
2667 result = matcher->replaceAll(&replText, NULL, status);
2668 REGEX_CHECK_STATUS;
2669 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2670 utext_close(result);
2671 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2672 result = matcher->replaceAll(&replText, &destText, status);
2673 REGEX_CHECK_STATUS;
2674 REGEX_ASSERT(result == &destText);
2675 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2676 }
2677 // TODO: need more thorough testing of capture substitutions.
2678
2679 // Bug 4057
2680 //
2681 {
2682 status = U_ZERO_ERROR;
2683 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2684 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2685 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2686 utext_openUTF8(&re, str_ssee, -1, &status);
2687 utext_openUTF8(&dataText, str_blah, -1, &status);
2688 utext_openUTF8(&replText, str_ooh, -1, &status);
2689
2690 RegexMatcher m(&re, 0, status);
2691 REGEX_CHECK_STATUS;
2692
2693 UnicodeString result;
2694 UText resultText = UTEXT_INITIALIZER;
2695 utext_openUnicodeString(&resultText, &result, &status);
2696
2697 // Multiple finds do NOT bump up the previous appendReplacement position.
2698 m.reset(&dataText);
2699 m.find();
2700 m.find();
2701 m.appendReplacement(&resultText, &replText, status);
2702 REGEX_CHECK_STATUS;
2703 const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2704 REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2705
2706 // After a reset into the interior of a string, appendReplacement still starts at beginning.
2707 status = U_ZERO_ERROR;
2708 result.truncate(0);
2709 utext_openUnicodeString(&resultText, &result, &status);
2710 m.reset(10, status);
2711 m.find();
2712 m.find();
2713 m.appendReplacement(&resultText, &replText, status);
2714 REGEX_CHECK_STATUS;
2715 const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2716 REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2717
2718 // find() at interior of string, appendReplacement still starts at beginning.
2719 status = U_ZERO_ERROR;
2720 result.truncate(0);
2721 utext_openUnicodeString(&resultText, &result, &status);
2722 m.reset();
2723 m.find(10, status);
2724 m.find();
2725 m.appendReplacement(&resultText, &replText, status);
2726 REGEX_CHECK_STATUS;
2727 const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2728 REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2729
2730 m.appendTail(&resultText, status);
2731 const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2732 REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2733
2734 utext_close(&resultText);
2735 }
2736
2737 delete matcher2;
2738 delete pat2;
2739 delete matcher;
2740 delete pat;
2741
2742 utext_close(&dataText);
2743 utext_close(&replText);
2744 utext_close(&destText);
2745 utext_close(&re);
2746 }
2747
2748
2749 //---------------------------------------------------------------------------
2750 //
2751 // API_Pattern_UTF8 Test that the API for class RegexPattern is
2752 // present and nominally working.
2753 //
2754 //---------------------------------------------------------------------------
API_Pattern_UTF8()2755 void RegexTest::API_Pattern_UTF8() {
2756 RegexPattern pata; // Test default constructor to not crash.
2757 RegexPattern patb;
2758
2759 REGEX_ASSERT(pata == patb);
2760 REGEX_ASSERT(pata == pata);
2761
2762 UText re1 = UTEXT_INITIALIZER;
2763 UText re2 = UTEXT_INITIALIZER;
2764 UErrorCode status = U_ZERO_ERROR;
2765 UParseError pe;
2766
2767 const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2768 const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2769 utext_openUTF8(&re1, str_abcalmz, -1, &status);
2770 utext_openUTF8(&re2, str_def, -1, &status);
2771
2772 RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2773 RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2774 REGEX_CHECK_STATUS;
2775 REGEX_ASSERT(*pat1 == *pat1);
2776 REGEX_ASSERT(*pat1 != pata);
2777
2778 // Assign
2779 patb = *pat1;
2780 REGEX_ASSERT(patb == *pat1);
2781
2782 // Copy Construct
2783 RegexPattern patc(*pat1);
2784 REGEX_ASSERT(patc == *pat1);
2785 REGEX_ASSERT(patb == patc);
2786 REGEX_ASSERT(pat1 != pat2);
2787 patb = *pat2;
2788 REGEX_ASSERT(patb != patc);
2789 REGEX_ASSERT(patb == *pat2);
2790
2791 // Compile with no flags.
2792 RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status);
2793 REGEX_ASSERT(*pat1a == *pat1);
2794
2795 REGEX_ASSERT(pat1a->flags() == 0);
2796
2797 // Compile with different flags should be not equal
2798 RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2799 REGEX_CHECK_STATUS;
2800
2801 REGEX_ASSERT(*pat1b != *pat1a);
2802 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2803 REGEX_ASSERT(pat1a->flags() == 0);
2804 delete pat1b;
2805
2806 // clone
2807 RegexPattern *pat1c = pat1->clone();
2808 REGEX_ASSERT(*pat1c == *pat1);
2809 REGEX_ASSERT(*pat1c != *pat2);
2810
2811 delete pat1c;
2812 delete pat1a;
2813 delete pat1;
2814 delete pat2;
2815
2816 utext_close(&re1);
2817 utext_close(&re2);
2818
2819
2820 //
2821 // Verify that a matcher created from a cloned pattern works.
2822 // (Jitterbug 3423)
2823 //
2824 {
2825 UErrorCode status = U_ZERO_ERROR;
2826 UText pattern = UTEXT_INITIALIZER;
2827 const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2828 utext_openUTF8(&pattern, str_pL, -1, &status);
2829
2830 RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status);
2831 RegexPattern *pClone = pSource->clone();
2832 delete pSource;
2833 RegexMatcher *mFromClone = pClone->matcher(status);
2834 REGEX_CHECK_STATUS;
2835
2836 UText input = UTEXT_INITIALIZER;
2837 const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2838 utext_openUTF8(&input, str_HelloWorld, -1, &status);
2839 mFromClone->reset(&input);
2840 REGEX_ASSERT(mFromClone->find() == TRUE);
2841 REGEX_ASSERT(mFromClone->group(status) == "Hello");
2842 REGEX_ASSERT(mFromClone->find() == TRUE);
2843 REGEX_ASSERT(mFromClone->group(status) == "World");
2844 REGEX_ASSERT(mFromClone->find() == FALSE);
2845 delete mFromClone;
2846 delete pClone;
2847
2848 utext_close(&input);
2849 utext_close(&pattern);
2850 }
2851
2852 //
2853 // matches convenience API
2854 //
2855 {
2856 UErrorCode status = U_ZERO_ERROR;
2857 UText pattern = UTEXT_INITIALIZER;
2858 UText input = UTEXT_INITIALIZER;
2859
2860 const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2861 utext_openUTF8(&input, str_randominput, -1, &status);
2862
2863 const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2864 utext_openUTF8(&pattern, str_dotstar, -1, &status);
2865 REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2866 REGEX_CHECK_STATUS;
2867
2868 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2869 utext_openUTF8(&pattern, str_abc, -1, &status);
2870 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2871 REGEX_CHECK_STATUS;
2872
2873 const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2874 utext_openUTF8(&pattern, str_nput, -1, &status);
2875 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2876 REGEX_CHECK_STATUS;
2877
2878 utext_openUTF8(&pattern, str_randominput, -1, &status);
2879 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2880 REGEX_CHECK_STATUS;
2881
2882 const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2883 utext_openUTF8(&pattern, str_u, -1, &status);
2884 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2885 REGEX_CHECK_STATUS;
2886
2887 utext_openUTF8(&input, str_abc, -1, &status);
2888 utext_openUTF8(&pattern, str_abc, -1, &status);
2889 status = U_INDEX_OUTOFBOUNDS_ERROR;
2890 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2891 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2892
2893 utext_close(&input);
2894 utext_close(&pattern);
2895 }
2896
2897
2898 //
2899 // Split()
2900 //
2901 status = U_ZERO_ERROR;
2902 const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /* + */
2903 utext_openUTF8(&re1, str_spaceplus, -1, &status);
2904 pat1 = RegexPattern::compile(&re1, pe, status);
2905 REGEX_CHECK_STATUS;
2906 UnicodeString fields[10];
2907
2908 int32_t n;
2909 n = pat1->split("Now is the time", fields, 10, status);
2910 REGEX_CHECK_STATUS;
2911 REGEX_ASSERT(n==4);
2912 REGEX_ASSERT(fields[0]=="Now");
2913 REGEX_ASSERT(fields[1]=="is");
2914 REGEX_ASSERT(fields[2]=="the");
2915 REGEX_ASSERT(fields[3]=="time");
2916 REGEX_ASSERT(fields[4]=="");
2917
2918 n = pat1->split("Now is the time", fields, 2, status);
2919 REGEX_CHECK_STATUS;
2920 REGEX_ASSERT(n==2);
2921 REGEX_ASSERT(fields[0]=="Now");
2922 REGEX_ASSERT(fields[1]=="is the time");
2923 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
2924
2925 fields[1] = "*";
2926 status = U_ZERO_ERROR;
2927 n = pat1->split("Now is the time", fields, 1, status);
2928 REGEX_CHECK_STATUS;
2929 REGEX_ASSERT(n==1);
2930 REGEX_ASSERT(fields[0]=="Now is the time");
2931 REGEX_ASSERT(fields[1]=="*");
2932 status = U_ZERO_ERROR;
2933
2934 n = pat1->split(" Now is the time ", fields, 10, status);
2935 REGEX_CHECK_STATUS;
2936 REGEX_ASSERT(n==6);
2937 REGEX_ASSERT(fields[0]=="");
2938 REGEX_ASSERT(fields[1]=="Now");
2939 REGEX_ASSERT(fields[2]=="is");
2940 REGEX_ASSERT(fields[3]=="the");
2941 REGEX_ASSERT(fields[4]=="time");
2942 REGEX_ASSERT(fields[5]=="");
2943 REGEX_ASSERT(fields[6]=="");
2944
2945 fields[2] = "*";
2946 n = pat1->split(" ", fields, 10, status);
2947 REGEX_CHECK_STATUS;
2948 REGEX_ASSERT(n==2);
2949 REGEX_ASSERT(fields[0]=="");
2950 REGEX_ASSERT(fields[1]=="");
2951 REGEX_ASSERT(fields[2]=="*");
2952
2953 fields[0] = "foo";
2954 n = pat1->split("", fields, 10, status);
2955 REGEX_CHECK_STATUS;
2956 REGEX_ASSERT(n==0);
2957 REGEX_ASSERT(fields[0]=="foo");
2958
2959 delete pat1;
2960
2961 // split, with a pattern with (capture)
2962 regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
2963 pat1 = RegexPattern::compile(&re1, pe, status);
2964 REGEX_CHECK_STATUS;
2965
2966 status = U_ZERO_ERROR;
2967 fields[6] = fields[7] = "*";
2968 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
2969 REGEX_CHECK_STATUS;
2970 REGEX_ASSERT(n==7);
2971 REGEX_ASSERT(fields[0]=="");
2972 REGEX_ASSERT(fields[1]=="a");
2973 REGEX_ASSERT(fields[2]=="Now is ");
2974 REGEX_ASSERT(fields[3]=="b");
2975 REGEX_ASSERT(fields[4]=="the time");
2976 REGEX_ASSERT(fields[5]=="c");
2977 REGEX_ASSERT(fields[6]=="");
2978 REGEX_ASSERT(fields[7]=="*");
2979 REGEX_ASSERT(status==U_ZERO_ERROR);
2980
2981 fields[6] = fields[7] = "*";
2982 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
2983 REGEX_CHECK_STATUS;
2984 REGEX_ASSERT(n==7);
2985 REGEX_ASSERT(fields[0]==" ");
2986 REGEX_ASSERT(fields[1]=="a");
2987 REGEX_ASSERT(fields[2]=="Now is ");
2988 REGEX_ASSERT(fields[3]=="b");
2989 REGEX_ASSERT(fields[4]=="the time");
2990 REGEX_ASSERT(fields[5]=="c");
2991 REGEX_ASSERT(fields[6]=="");
2992 REGEX_ASSERT(fields[7]=="*");
2993
2994 status = U_ZERO_ERROR;
2995 fields[6] = "foo";
2996 n = pat1->split(" <a>Now is <b>the time<c> ", fields, 6, status);
2997 REGEX_CHECK_STATUS;
2998 REGEX_ASSERT(n==6);
2999 REGEX_ASSERT(fields[0]==" ");
3000 REGEX_ASSERT(fields[1]=="a");
3001 REGEX_ASSERT(fields[2]=="Now is ");
3002 REGEX_ASSERT(fields[3]=="b");
3003 REGEX_ASSERT(fields[4]=="the time");
3004 REGEX_ASSERT(fields[5]==" ");
3005 REGEX_ASSERT(fields[6]=="foo");
3006
3007 status = U_ZERO_ERROR;
3008 fields[5] = "foo";
3009 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
3010 REGEX_CHECK_STATUS;
3011 REGEX_ASSERT(n==5);
3012 REGEX_ASSERT(fields[0]==" ");
3013 REGEX_ASSERT(fields[1]=="a");
3014 REGEX_ASSERT(fields[2]=="Now is ");
3015 REGEX_ASSERT(fields[3]=="b");
3016 REGEX_ASSERT(fields[4]=="the time<c>");
3017 REGEX_ASSERT(fields[5]=="foo");
3018
3019 status = U_ZERO_ERROR;
3020 fields[5] = "foo";
3021 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
3022 REGEX_CHECK_STATUS;
3023 REGEX_ASSERT(n==5);
3024 REGEX_ASSERT(fields[0]==" ");
3025 REGEX_ASSERT(fields[1]=="a");
3026 REGEX_ASSERT(fields[2]=="Now is ");
3027 REGEX_ASSERT(fields[3]=="b");
3028 REGEX_ASSERT(fields[4]=="the time");
3029 REGEX_ASSERT(fields[5]=="foo");
3030
3031 status = U_ZERO_ERROR;
3032 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
3033 REGEX_CHECK_STATUS;
3034 REGEX_ASSERT(n==4);
3035 REGEX_ASSERT(fields[0]==" ");
3036 REGEX_ASSERT(fields[1]=="a");
3037 REGEX_ASSERT(fields[2]=="Now is ");
3038 REGEX_ASSERT(fields[3]=="the time<c>");
3039 status = U_ZERO_ERROR;
3040 delete pat1;
3041
3042 regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3043 pat1 = RegexPattern::compile(&re1, pe, status);
3044 REGEX_CHECK_STATUS;
3045 n = pat1->split("1-10,20", fields, 10, status);
3046 REGEX_CHECK_STATUS;
3047 REGEX_ASSERT(n==5);
3048 REGEX_ASSERT(fields[0]=="1");
3049 REGEX_ASSERT(fields[1]=="-");
3050 REGEX_ASSERT(fields[2]=="10");
3051 REGEX_ASSERT(fields[3]==",");
3052 REGEX_ASSERT(fields[4]=="20");
3053 delete pat1;
3054
3055
3056 //
3057 // split of a UText based string, with library allocating output UTexts.
3058 //
3059 {
3060 status = U_ZERO_ERROR;
3061 RegexMatcher matcher(UnicodeString("(:)"), 0, status);
3062 UnicodeString stringToSplit("first:second:third");
3063 UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
3064 REGEX_CHECK_STATUS;
3065
3066 UText *splits[10] = {NULL};
3067 int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
3068 REGEX_CHECK_STATUS;
3069 REGEX_ASSERT(numFields == 5);
3070 REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
3071 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
3072 REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
3073 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
3074 REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
3075 REGEX_ASSERT(splits[5] == NULL);
3076
3077 for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
3078 if (splits[i]) {
3079 utext_close(splits[i]);
3080 splits[i] = NULL;
3081 }
3082 }
3083 utext_close(textToSplit);
3084 }
3085
3086
3087 //
3088 // RegexPattern::pattern() and patternText()
3089 //
3090 pat1 = new RegexPattern();
3091 REGEX_ASSERT(pat1->pattern() == "");
3092 REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3093 delete pat1;
3094 const char *helloWorldInvariant = "(Hello, world)*";
3095 regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3096 pat1 = RegexPattern::compile(&re1, pe, status);
3097 REGEX_CHECK_STATUS;
3098 REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
3099 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3100 delete pat1;
3101
3102 utext_close(&re1);
3103 }
3104
3105
3106 //---------------------------------------------------------------------------
3107 //
3108 // Extended A more thorough check for features of regex patterns
3109 // The test cases are in a separate data file,
3110 // source/test/testdata/regextst.txt
3111 // A description of the test data format is included in that file.
3112 //
3113 //---------------------------------------------------------------------------
3114
3115 const char *
getPath(char buffer[2048],const char * filename)3116 RegexTest::getPath(char buffer[2048], const char *filename) {
3117 UErrorCode status=U_ZERO_ERROR;
3118 const char *testDataDirectory = IntlTest::getSourceTestData(status);
3119 if (U_FAILURE(status)) {
3120 errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3121 return NULL;
3122 }
3123
3124 strcpy(buffer, testDataDirectory);
3125 strcat(buffer, filename);
3126 return buffer;
3127 }
3128
Extended()3129 void RegexTest::Extended() {
3130 char tdd[2048];
3131 const char *srcPath;
3132 UErrorCode status = U_ZERO_ERROR;
3133 int32_t lineNum = 0;
3134
3135 //
3136 // Open and read the test data file.
3137 //
3138 srcPath=getPath(tdd, "regextst.txt");
3139 if(srcPath==NULL) {
3140 return; /* something went wrong, error already output */
3141 }
3142
3143 int32_t len;
3144 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3145 if (U_FAILURE(status)) {
3146 return; /* something went wrong, error already output */
3147 }
3148
3149 //
3150 // Put the test data into a UnicodeString
3151 //
3152 UnicodeString testString(FALSE, testData, len);
3153
3154 RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3155 RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3156 RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3157
3158 RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3159 UnicodeString testPattern; // The pattern for test from the test file.
3160 UnicodeString testFlags; // the flags for a test.
3161 UnicodeString matchString; // The marked up string to be used as input
3162
3163 if (U_FAILURE(status)){
3164 dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
3165 delete [] testData;
3166 return;
3167 }
3168
3169 //
3170 // Loop over the test data file, once per line.
3171 //
3172 while (lineMat.find()) {
3173 lineNum++;
3174 if (U_FAILURE(status)) {
3175 errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3176 }
3177
3178 status = U_ZERO_ERROR;
3179 UnicodeString testLine = lineMat.group(1, status);
3180 if (testLine.length() == 0) {
3181 continue;
3182 }
3183
3184 //
3185 // Parse the test line. Skip blank and comment only lines.
3186 // Separate out the three main fields - pattern, flags, target.
3187 //
3188
3189 commentMat.reset(testLine);
3190 if (commentMat.lookingAt(status)) {
3191 // This line is a comment, or blank.
3192 continue;
3193 }
3194
3195 //
3196 // Pull out the pattern field, remove it from the test file line.
3197 //
3198 quotedStuffMat.reset(testLine);
3199 if (quotedStuffMat.lookingAt(status)) {
3200 testPattern = quotedStuffMat.group(2, status);
3201 testLine.remove(0, quotedStuffMat.end(0, status));
3202 } else {
3203 errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3204 continue;
3205 }
3206
3207
3208 //
3209 // Pull out the flags from the test file line.
3210 //
3211 flagsMat.reset(testLine);
3212 flagsMat.lookingAt(status); // Will always match, possibly an empty string.
3213 testFlags = flagsMat.group(1, status);
3214 if (flagsMat.group(2, status).length() > 0) {
3215 errln("Bad Match flag at line %d. Scanning %c\n",
3216 lineNum, flagsMat.group(2, status).charAt(0));
3217 continue;
3218 }
3219 testLine.remove(0, flagsMat.end(0, status));
3220
3221 //
3222 // Pull out the match string, as a whole.
3223 // We'll process the <tags> later.
3224 //
3225 quotedStuffMat.reset(testLine);
3226 if (quotedStuffMat.lookingAt(status)) {
3227 matchString = quotedStuffMat.group(2, status);
3228 testLine.remove(0, quotedStuffMat.end(0, status));
3229 } else {
3230 errln("Bad match string at test file line %d", lineNum);
3231 continue;
3232 }
3233
3234 //
3235 // The only thing left from the input line should be an optional trailing comment.
3236 //
3237 commentMat.reset(testLine);
3238 if (commentMat.lookingAt(status) == FALSE) {
3239 errln("Line %d: unexpected characters at end of test line.", lineNum);
3240 continue;
3241 }
3242
3243 //
3244 // Run the test
3245 //
3246 regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3247 }
3248
3249 delete [] testData;
3250
3251 }
3252
3253
3254
3255 //---------------------------------------------------------------------------
3256 //
3257 // regex_find(pattern, flags, inputString, lineNumber)
3258 //
3259 // Function to run a single test from the Extended (data driven) tests.
3260 // See file test/testdata/regextst.txt for a description of the
3261 // pattern and inputString fields, and the allowed flags.
3262 // lineNumber is the source line in regextst.txt of the test.
3263 //
3264 //---------------------------------------------------------------------------
3265
3266
3267 // Set a value into a UVector at position specified by a decimal number in
3268 // a UnicodeString. This is a utility function needed by the actual test function,
3269 // which follows.
set(UVector & vec,int32_t val,UnicodeString index)3270 static void set(UVector &vec, int32_t val, UnicodeString index) {
3271 UErrorCode status=U_ZERO_ERROR;
3272 int32_t idx = 0;
3273 for (int32_t i=0; i<index.length(); i++) {
3274 int32_t d=u_charDigitValue(index.charAt(i));
3275 if (d<0) {return;}
3276 idx = idx*10 + d;
3277 }
3278 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3279 vec.setElementAt(val, idx);
3280 }
3281
setInt(UVector & vec,int32_t val,int32_t idx)3282 static void setInt(UVector &vec, int32_t val, int32_t idx) {
3283 UErrorCode status=U_ZERO_ERROR;
3284 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3285 vec.setElementAt(val, idx);
3286 }
3287
utextOffsetToNative(UText * utext,int32_t unistrOffset,int32_t & nativeIndex)3288 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3289 {
3290 UBool couldFind = TRUE;
3291 UTEXT_SETNATIVEINDEX(utext, 0);
3292 int32_t i = 0;
3293 while (i < unistrOffset) {
3294 UChar32 c = UTEXT_NEXT32(utext);
3295 if (c != U_SENTINEL) {
3296 i += U16_LENGTH(c);
3297 } else {
3298 couldFind = FALSE;
3299 break;
3300 }
3301 }
3302 nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3303 return couldFind;
3304 }
3305
3306
regex_find(const UnicodeString & pattern,const UnicodeString & flags,const UnicodeString & inputString,const char * srcPath,int32_t line)3307 void RegexTest::regex_find(const UnicodeString &pattern,
3308 const UnicodeString &flags,
3309 const UnicodeString &inputString,
3310 const char *srcPath,
3311 int32_t line) {
3312 UnicodeString unEscapedInput;
3313 UnicodeString deTaggedInput;
3314
3315 int32_t patternUTF8Length, inputUTF8Length;
3316 char *patternChars = NULL, *inputChars = NULL;
3317 UText patternText = UTEXT_INITIALIZER;
3318 UText inputText = UTEXT_INITIALIZER;
3319 UConverter *UTF8Converter = NULL;
3320
3321 UErrorCode status = U_ZERO_ERROR;
3322 UParseError pe;
3323 RegexPattern *parsePat = NULL;
3324 RegexMatcher *parseMatcher = NULL;
3325 RegexPattern *callerPattern = NULL, *UTF8Pattern = NULL;
3326 RegexMatcher *matcher = NULL, *UTF8Matcher = NULL;
3327 UVector groupStarts(status);
3328 UVector groupEnds(status);
3329 UVector groupStartsUTF8(status);
3330 UVector groupEndsUTF8(status);
3331 UBool isMatch = FALSE, isUTF8Match = FALSE;
3332 UBool failed = FALSE;
3333 int32_t numFinds;
3334 int32_t i;
3335 UBool useMatchesFunc = FALSE;
3336 UBool useLookingAtFunc = FALSE;
3337 int32_t regionStart = -1;
3338 int32_t regionEnd = -1;
3339 int32_t regionStartUTF8 = -1;
3340 int32_t regionEndUTF8 = -1;
3341
3342
3343 //
3344 // Compile the caller's pattern
3345 //
3346 uint32_t bflags = 0;
3347 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag
3348 bflags |= UREGEX_CASE_INSENSITIVE;
3349 }
3350 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag
3351 bflags |= UREGEX_COMMENTS;
3352 }
3353 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag
3354 bflags |= UREGEX_DOTALL;
3355 }
3356 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag
3357 bflags |= UREGEX_MULTILINE;
3358 }
3359
3360 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3361 bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3362 }
3363 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3364 bflags |= UREGEX_UNIX_LINES;
3365 }
3366 if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3367 bflags |= UREGEX_LITERAL;
3368 }
3369
3370
3371 callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3372 if (status != U_ZERO_ERROR) {
3373 #if UCONFIG_NO_BREAK_ITERATION==1
3374 // 'v' test flag means that the test pattern should not compile if ICU was configured
3375 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3376 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3377 goto cleanupAndReturn;
3378 }
3379 #endif
3380 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3381 // Expected pattern compilation error.
3382 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3383 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3384 }
3385 goto cleanupAndReturn;
3386 } else {
3387 // Unexpected pattern compilation error.
3388 dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3389 goto cleanupAndReturn;
3390 }
3391 }
3392
3393 UTF8Converter = ucnv_open("UTF8", &status);
3394 ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3395
3396 patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3397 status = U_ZERO_ERROR; // buffer overflow
3398 patternChars = new char[patternUTF8Length+1];
3399 pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3400 utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3401
3402 if (status == U_ZERO_ERROR) {
3403 UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3404
3405 if (status != U_ZERO_ERROR) {
3406 #if UCONFIG_NO_BREAK_ITERATION==1
3407 // 'v' test flag means that the test pattern should not compile if ICU was configured
3408 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3409 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3410 goto cleanupAndReturn;
3411 }
3412 #endif
3413 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3414 // Expected pattern compilation error.
3415 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3416 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3417 }
3418 goto cleanupAndReturn;
3419 } else {
3420 // Unexpected pattern compilation error.
3421 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3422 goto cleanupAndReturn;
3423 }
3424 }
3425 }
3426
3427 if (UTF8Pattern == NULL) {
3428 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3429 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3430 status = U_ZERO_ERROR;
3431 }
3432
3433 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag
3434 callerPattern->dumpPattern();
3435 }
3436
3437 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag
3438 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3439 goto cleanupAndReturn;
3440 }
3441
3442
3443 //
3444 // Number of times find() should be called on the test string, default to 1
3445 //
3446 numFinds = 1;
3447 for (i=2; i<=9; i++) {
3448 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
3449 if (numFinds != 1) {
3450 errln("Line %d: more than one digit flag. Scanning %d.", line, i);
3451 goto cleanupAndReturn;
3452 }
3453 numFinds = i;
3454 }
3455 }
3456
3457 // 'M' flag. Use matches() instead of find()
3458 if (flags.indexOf((UChar)0x4d) >= 0) {
3459 useMatchesFunc = TRUE;
3460 }
3461 if (flags.indexOf((UChar)0x4c) >= 0) {
3462 useLookingAtFunc = TRUE;
3463 }
3464
3465 //
3466 // Find the tags in the input data, remove them, and record the group boundary
3467 // positions.
3468 //
3469 parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3470 REGEX_CHECK_STATUS_L(line);
3471
3472 unEscapedInput = inputString.unescape();
3473 parseMatcher = parsePat->matcher(unEscapedInput, status);
3474 REGEX_CHECK_STATUS_L(line);
3475 while(parseMatcher->find()) {
3476 parseMatcher->appendReplacement(deTaggedInput, "", status);
3477 REGEX_CHECK_STATUS;
3478 UnicodeString groupNum = parseMatcher->group(2, status);
3479 if (groupNum == "r") {
3480 // <r> or </r>, a region specification within the string
3481 if (parseMatcher->group(1, status) == "/") {
3482 regionEnd = deTaggedInput.length();
3483 } else {
3484 regionStart = deTaggedInput.length();
3485 }
3486 } else {
3487 // <digits> or </digits>, a group match boundary tag.
3488 if (parseMatcher->group(1, status) == "/") {
3489 set(groupEnds, deTaggedInput.length(), groupNum);
3490 } else {
3491 set(groupStarts, deTaggedInput.length(), groupNum);
3492 }
3493 }
3494 }
3495 parseMatcher->appendTail(deTaggedInput);
3496 REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3497 if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3498 errln("mismatched <r> tags");
3499 failed = TRUE;
3500 goto cleanupAndReturn;
3501 }
3502
3503 //
3504 // Configure the matcher according to the flags specified with this test.
3505 //
3506 matcher = callerPattern->matcher(deTaggedInput, status);
3507 REGEX_CHECK_STATUS_L(line);
3508 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
3509 matcher->setTrace(TRUE);
3510 }
3511
3512 if (UTF8Pattern != NULL) {
3513 inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3514 status = U_ZERO_ERROR; // buffer overflow
3515 inputChars = new char[inputUTF8Length+1];
3516 deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3517 utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3518
3519 if (status == U_ZERO_ERROR) {
3520 UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3521 REGEX_CHECK_STATUS_L(line);
3522 }
3523
3524 if (UTF8Matcher == NULL) {
3525 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3526 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3527 status = U_ZERO_ERROR;
3528 }
3529 }
3530
3531 //
3532 // Generate native indices for UTF8 versions of region and capture group info
3533 //
3534 if (UTF8Matcher != NULL) {
3535 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
3536 UTF8Matcher->setTrace(TRUE);
3537 }
3538 if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3539 if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3540
3541 // Fill out the native index UVector info.
3542 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3543 for (i=0; i<groupStarts.size(); i++) {
3544 int32_t start = groupStarts.elementAti(i);
3545 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3546 if (start >= 0) {
3547 int32_t startUTF8;
3548 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3549 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line, i, start);
3550 failed = TRUE;
3551 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3552 }
3553 setInt(groupStartsUTF8, startUTF8, i);
3554 }
3555
3556 int32_t end = groupEnds.elementAti(i);
3557 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3558 if (end >= 0) {
3559 int32_t endUTF8;
3560 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3561 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line, i, end);
3562 failed = TRUE;
3563 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3564 }
3565 setInt(groupEndsUTF8, endUTF8, i);
3566 }
3567 }
3568 }
3569
3570 if (regionStart>=0) {
3571 matcher->region(regionStart, regionEnd, status);
3572 REGEX_CHECK_STATUS_L(line);
3573 if (UTF8Matcher != NULL) {
3574 UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3575 REGEX_CHECK_STATUS_L(line);
3576 }
3577 }
3578 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag
3579 matcher->useAnchoringBounds(FALSE);
3580 if (UTF8Matcher != NULL) {
3581 UTF8Matcher->useAnchoringBounds(FALSE);
3582 }
3583 }
3584 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag
3585 matcher->useTransparentBounds(TRUE);
3586 if (UTF8Matcher != NULL) {
3587 UTF8Matcher->useTransparentBounds(TRUE);
3588 }
3589 }
3590
3591
3592
3593 //
3594 // Do a find on the de-tagged input using the caller's pattern
3595 // TODO: error on count>1 and not find().
3596 // error on both matches() and lookingAt().
3597 //
3598 for (i=0; i<numFinds; i++) {
3599 if (useMatchesFunc) {
3600 isMatch = matcher->matches(status);
3601 if (UTF8Matcher != NULL) {
3602 isUTF8Match = UTF8Matcher->matches(status);
3603 }
3604 } else if (useLookingAtFunc) {
3605 isMatch = matcher->lookingAt(status);
3606 if (UTF8Matcher != NULL) {
3607 isUTF8Match = UTF8Matcher->lookingAt(status);
3608 }
3609 } else {
3610 isMatch = matcher->find();
3611 if (UTF8Matcher != NULL) {
3612 isUTF8Match = UTF8Matcher->find();
3613 }
3614 }
3615 }
3616 matcher->setTrace(FALSE);
3617 if (UTF8Matcher) {
3618 UTF8Matcher->setTrace(FALSE);
3619 }
3620 if (U_FAILURE(status)) {
3621 errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3622 }
3623
3624 //
3625 // Match up the groups from the find() with the groups from the tags
3626 //
3627
3628 // number of tags should match number of groups from find operation.
3629 // matcher->groupCount does not include group 0, the entire match, hence the +1.
3630 // G option in test means that capture group data is not available in the
3631 // expected results, so the check needs to be suppressed.
3632 if (isMatch == FALSE && groupStarts.size() != 0) {
3633 dataerrln("Error at line %d: Match expected, but none found.", line);
3634 failed = TRUE;
3635 goto cleanupAndReturn;
3636 } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3637 errln("Error at line %d: Match expected, but none found. (UTF8)", line);
3638 failed = TRUE;
3639 goto cleanupAndReturn;
3640 }
3641 if (isMatch && groupStarts.size() == 0) {
3642 errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status));
3643 failed = TRUE;
3644 }
3645 if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) {
3646 errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status));
3647 failed = TRUE;
3648 }
3649
3650 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3651 // Only check for match / no match. Don't check capture groups.
3652 goto cleanupAndReturn;
3653 }
3654
3655 REGEX_CHECK_STATUS_L(line);
3656 for (i=0; i<=matcher->groupCount(); i++) {
3657 int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3658 int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3659 if (matcher->start(i, status) != expectedStart) {
3660 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
3661 line, i, expectedStart, matcher->start(i, status));
3662 failed = TRUE;
3663 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3664 } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3665 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
3666 line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3667 failed = TRUE;
3668 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3669 }
3670
3671 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3672 int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3673 if (matcher->end(i, status) != expectedEnd) {
3674 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
3675 line, i, expectedEnd, matcher->end(i, status));
3676 failed = TRUE;
3677 // Error on end position; keep going; real error is probably yet to come as group
3678 // end positions work from end of the input data towards the front.
3679 } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3680 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
3681 line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3682 failed = TRUE;
3683 // Error on end position; keep going; real error is probably yet to come as group
3684 // end positions work from end of the input data towards the front.
3685 }
3686 }
3687 if ( matcher->groupCount()+1 < groupStarts.size()) {
3688 errln("Error at line %d: Expected %d capture groups, found %d.",
3689 line, groupStarts.size()-1, matcher->groupCount());
3690 failed = TRUE;
3691 }
3692 else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3693 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3694 line, groupStarts.size()-1, UTF8Matcher->groupCount());
3695 failed = TRUE;
3696 }
3697
3698 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3699 matcher->requireEnd() == TRUE) {
3700 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line);
3701 failed = TRUE;
3702 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3703 UTF8Matcher->requireEnd() == TRUE) {
3704 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line);
3705 failed = TRUE;
3706 }
3707
3708 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3709 matcher->requireEnd() == FALSE) {
3710 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line);
3711 failed = TRUE;
3712 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false
3713 UTF8Matcher->requireEnd() == FALSE) {
3714 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line);
3715 failed = TRUE;
3716 }
3717
3718 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3719 matcher->hitEnd() == TRUE) {
3720 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line);
3721 failed = TRUE;
3722 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3723 UTF8Matcher->hitEnd() == TRUE) {
3724 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line);
3725 failed = TRUE;
3726 }
3727
3728 if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3729 matcher->hitEnd() == FALSE) {
3730 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line);
3731 failed = TRUE;
3732 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3733 UTF8Matcher->hitEnd() == FALSE) {
3734 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line);
3735 failed = TRUE;
3736 }
3737
3738
3739 cleanupAndReturn:
3740 if (failed) {
3741 infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" "
3742 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");
3743 // callerPattern->dump();
3744 }
3745 delete parseMatcher;
3746 delete parsePat;
3747 delete UTF8Matcher;
3748 delete UTF8Pattern;
3749 delete matcher;
3750 delete callerPattern;
3751
3752 utext_close(&inputText);
3753 delete[] inputChars;
3754 utext_close(&patternText);
3755 delete[] patternChars;
3756 ucnv_close(UTF8Converter);
3757 }
3758
3759
3760
3761
3762 //---------------------------------------------------------------------------
3763 //
3764 // Errors Check for error handling in patterns.
3765 //
3766 //---------------------------------------------------------------------------
Errors()3767 void RegexTest::Errors() {
3768 // \escape sequences that aren't implemented yet.
3769 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3770
3771 // Missing close parentheses
3772 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3773 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3774 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3775
3776 // Extra close paren
3777 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3778 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3779 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3780
3781 // Look-ahead, Look-behind
3782 // TODO: add tests for unbounded length look-behinds.
3783 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
3784
3785 // Attempt to use non-default flags
3786 {
3787 UParseError pe;
3788 UErrorCode status = U_ZERO_ERROR;
3789 int32_t flags = UREGEX_CANON_EQ |
3790 UREGEX_COMMENTS | UREGEX_DOTALL |
3791 UREGEX_MULTILINE;
3792 RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3793 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3794 delete pat1;
3795 }
3796
3797
3798 // Quantifiers are allowed only after something that can be quantified.
3799 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3800 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3801 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3802
3803 // Mal-formed {min,max} quantifiers
3804 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3805 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3806 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3807 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3808 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3809 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3810 REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan
3811 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format
3812 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3813
3814 // Ticket 5389
3815 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3816
3817 // Invalid Back Reference \0
3818 // For ICU 3.8 and earlier
3819 // For ICU versions newer than 3.8, \0 introduces an octal escape.
3820 //
3821 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3822
3823 }
3824
3825
3826 //-------------------------------------------------------------------------------
3827 //
3828 // Read a text data file, convert it to UChars, and return the data
3829 // in one big UChar * buffer, which the caller must delete.
3830 //
3831 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int32_t & ulen,const char * defEncoding,UErrorCode & status)3832 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3833 const char *defEncoding, UErrorCode &status) {
3834 UChar *retPtr = NULL;
3835 char *fileBuf = NULL;
3836 UConverter* conv = NULL;
3837 FILE *f = NULL;
3838
3839 ulen = 0;
3840 if (U_FAILURE(status)) {
3841 return retPtr;
3842 }
3843
3844 //
3845 // Open the file.
3846 //
3847 f = fopen(fileName, "rb");
3848 if (f == 0) {
3849 dataerrln("Error opening test data file %s\n", fileName);
3850 status = U_FILE_ACCESS_ERROR;
3851 return NULL;
3852 }
3853 //
3854 // Read it in
3855 //
3856 int32_t fileSize;
3857 int32_t amt_read;
3858
3859 fseek( f, 0, SEEK_END);
3860 fileSize = ftell(f);
3861 fileBuf = new char[fileSize];
3862 fseek(f, 0, SEEK_SET);
3863 amt_read = fread(fileBuf, 1, fileSize, f);
3864 if (amt_read != fileSize || fileSize <= 0) {
3865 errln("Error reading test data file.");
3866 goto cleanUpAndReturn;
3867 }
3868
3869 //
3870 // Look for a Unicode Signature (BOM) on the data just read
3871 //
3872 int32_t signatureLength;
3873 const char * fileBufC;
3874 const char* encoding;
3875
3876 fileBufC = fileBuf;
3877 encoding = ucnv_detectUnicodeSignature(
3878 fileBuf, fileSize, &signatureLength, &status);
3879 if(encoding!=NULL ){
3880 fileBufC += signatureLength;
3881 fileSize -= signatureLength;
3882 } else {
3883 encoding = defEncoding;
3884 if (strcmp(encoding, "utf-8") == 0) {
3885 errln("file %s is missing its BOM", fileName);
3886 }
3887 }
3888
3889 //
3890 // Open a converter to take the rule file to UTF-16
3891 //
3892 conv = ucnv_open(encoding, &status);
3893 if (U_FAILURE(status)) {
3894 goto cleanUpAndReturn;
3895 }
3896
3897 //
3898 // Convert the rules to UChar.
3899 // Preflight first to determine required buffer size.
3900 //
3901 ulen = ucnv_toUChars(conv,
3902 NULL, // dest,
3903 0, // destCapacity,
3904 fileBufC,
3905 fileSize,
3906 &status);
3907 if (status == U_BUFFER_OVERFLOW_ERROR) {
3908 // Buffer Overflow is expected from the preflight operation.
3909 status = U_ZERO_ERROR;
3910
3911 retPtr = new UChar[ulen+1];
3912 ucnv_toUChars(conv,
3913 retPtr, // dest,
3914 ulen+1,
3915 fileBufC,
3916 fileSize,
3917 &status);
3918 }
3919
3920 cleanUpAndReturn:
3921 fclose(f);
3922 delete[] fileBuf;
3923 ucnv_close(conv);
3924 if (U_FAILURE(status)) {
3925 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3926 delete []retPtr;
3927 retPtr = 0;
3928 ulen = 0;
3929 };
3930 return retPtr;
3931 }
3932
3933
3934 //-------------------------------------------------------------------------------
3935 //
3936 // PerlTests - Run Perl's regular expression tests
3937 // The input file for this test is re_tests, the standard regular
3938 // expression test data distributed with the Perl source code.
3939 //
3940 // Here is Perl's description of the test data file:
3941 //
3942 // # The tests are in a separate file 't/op/re_tests'.
3943 // # Each line in that file is a separate test.
3944 // # There are five columns, separated by tabs.
3945 // #
3946 // # Column 1 contains the pattern, optionally enclosed in C<''>.
3947 // # Modifiers can be put after the closing C<'>.
3948 // #
3949 // # Column 2 contains the string to be matched.
3950 // #
3951 // # Column 3 contains the expected result:
3952 // # y expect a match
3953 // # n expect no match
3954 // # c expect an error
3955 // # B test exposes a known bug in Perl, should be skipped
3956 // # b test exposes a known bug in Perl, should be skipped if noamp
3957 // #
3958 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3959 // #
3960 // # Column 4 contains a string, usually C<$&>.
3961 // #
3962 // # Column 5 contains the expected result of double-quote
3963 // # interpolating that string after the match, or start of error message.
3964 // #
3965 // # Column 6, if present, contains a reason why the test is skipped.
3966 // # This is printed with "skipped", for harness to pick up.
3967 // #
3968 // # \n in the tests are interpolated, as are variables of the form ${\w+}.
3969 // #
3970 // # If you want to add a regular expression test that can't be expressed
3971 // # in this format, don't add it here: put it in op/pat.t instead.
3972 //
3973 // For ICU, if field 3 contains an 'i', the test will be skipped.
//       The test exposes some known incompatibility between ICU and Perl regexps.
3975 // (The i is in addition to whatever was there before.)
3976 //
3977 //-------------------------------------------------------------------------------
PerlTests()3978 void RegexTest::PerlTests() {
3979 char tdd[2048];
3980 const char *srcPath;
3981 UErrorCode status = U_ZERO_ERROR;
3982 UParseError pe;
3983
3984 //
3985 // Open and read the test data file.
3986 //
3987 srcPath=getPath(tdd, "re_tests.txt");
3988 if(srcPath==NULL) {
3989 return; /* something went wrong, error already output */
3990 }
3991
3992 int32_t len;
3993 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3994 if (U_FAILURE(status)) {
3995 return; /* something went wrong, error already output */
3996 }
3997
3998 //
3999 // Put the test data into a UnicodeString
4000 //
4001 UnicodeString testDataString(FALSE, testData, len);
4002
4003 //
4004 // Regex to break the input file into lines, and strip the new lines.
4005 // One line per match, capture group one is the desired data.
4006 //
4007 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4008 if (U_FAILURE(status)) {
4009 dataerrln("RegexPattern::compile() error");
4010 return;
4011 }
4012 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4013
4014 //
4015 // Regex to split a test file line into fields.
4016 // There are six fields, separated by tabs.
4017 //
4018 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4019
4020 //
4021 // Regex to identify test patterns with flag settings, and to separate them.
4022 // Test patterns with flags look like 'pattern'i
4023 // Test patterns without flags are not quoted: pattern
4024 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4025 //
4026 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4027 RegexMatcher* flagMat = flagPat->matcher(status);
4028
4029 //
4030 // The Perl tests reference several perl-isms, which are evaluated/substituted
4031 // in the test data. Not being perl, this must be done explicitly. Here
4032 // are string constants and REs for these constructs.
4033 //
4034 UnicodeString nulnulSrc("${nulnul}");
4035 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4036 nulnul = nulnul.unescape();
4037
4038 UnicodeString ffffSrc("${ffff}");
4039 UnicodeString ffff("\\uffff", -1, US_INV);
4040 ffff = ffff.unescape();
4041
4042 // regexp for $-[0], $+[2], etc.
4043 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4044 RegexMatcher *groupsMat = groupsPat->matcher(status);
4045
4046 // regexp for $0, $1, $2, etc.
4047 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4048 RegexMatcher *cgMat = cgPat->matcher(status);
4049
4050
4051 //
4052 // Main Loop for the Perl Tests, runs once per line from the
4053 // test data file.
4054 //
4055 int32_t lineNum = 0;
4056 int32_t skippedUnimplementedCount = 0;
4057 while (lineMat->find()) {
4058 lineNum++;
4059
4060 //
4061 // Get a line, break it into its fields, do the Perl
4062 // variable substitutions.
4063 //
4064 UnicodeString line = lineMat->group(1, status);
4065 UnicodeString fields[7];
4066 fieldPat->split(line, fields, 7, status);
4067
4068 flagMat->reset(fields[0]);
4069 flagMat->matches(status);
4070 UnicodeString pattern = flagMat->group(2, status);
4071 pattern.findAndReplace("${bang}", "!");
4072 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4073 pattern.findAndReplace(ffffSrc, ffff);
4074
4075 //
4076 // Identify patterns that include match flag settings,
4077 // split off the flags, remove the extra quotes.
4078 //
4079 UnicodeString flagStr = flagMat->group(3, status);
4080 if (U_FAILURE(status)) {
4081 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4082 return;
4083 }
4084 int32_t flags = 0;
4085 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4086 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4087 const UChar UChar_m = 0x6d;
4088 const UChar UChar_x = 0x78;
4089 const UChar UChar_y = 0x79;
4090 if (flagStr.indexOf(UChar_i) != -1) {
4091 flags |= UREGEX_CASE_INSENSITIVE;
4092 }
4093 if (flagStr.indexOf(UChar_m) != -1) {
4094 flags |= UREGEX_MULTILINE;
4095 }
4096 if (flagStr.indexOf(UChar_x) != -1) {
4097 flags |= UREGEX_COMMENTS;
4098 }
4099
4100 //
4101 // Compile the test pattern.
4102 //
4103 status = U_ZERO_ERROR;
4104 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4105 if (status == U_REGEX_UNIMPLEMENTED) {
4106 //
4107 // Test of a feature that is planned for ICU, but not yet implemented.
4108 // skip the test.
4109 skippedUnimplementedCount++;
4110 delete testPat;
4111 status = U_ZERO_ERROR;
4112 continue;
4113 }
4114
4115 if (U_FAILURE(status)) {
4116 // Some tests are supposed to generate errors.
4117 // Only report an error for tests that are supposed to succeed.
4118 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4119 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4120 {
4121 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4122 }
4123 status = U_ZERO_ERROR;
4124 delete testPat;
4125 continue;
4126 }
4127
4128 if (fields[2].indexOf(UChar_i) >= 0) {
4129 // ICU should skip this test.
4130 delete testPat;
4131 continue;
4132 }
4133
4134 if (fields[2].indexOf(UChar_c) >= 0) {
4135 // This pattern should have caused a compilation error, but didn't/
4136 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4137 delete testPat;
4138 continue;
4139 }
4140
4141 //
4142 // replace the Perl variables that appear in some of the
4143 // match data strings.
4144 //
4145 UnicodeString matchString = fields[1];
4146 matchString.findAndReplace(nulnulSrc, nulnul);
4147 matchString.findAndReplace(ffffSrc, ffff);
4148
4149 // Replace any \n in the match string with an actual new-line char.
4150 // Don't do full unescape, as this unescapes more than Perl does, which
4151 // causes other spurious failures in the tests.
4152 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4153
4154
4155
4156 //
4157 // Run the test, check for expected match/don't match result.
4158 //
4159 RegexMatcher *testMat = testPat->matcher(matchString, status);
4160 UBool found = testMat->find();
4161 UBool expected = FALSE;
4162 if (fields[2].indexOf(UChar_y) >=0) {
4163 expected = TRUE;
4164 }
4165 if (expected != found) {
4166 errln("line %d: Expected %smatch, got %smatch",
4167 lineNum, expected?"":"no ", found?"":"no " );
4168 continue;
4169 }
4170
4171 // Don't try to check expected results if there is no match.
4172 // (Some have stuff in the expected fields)
4173 if (!found) {
4174 delete testMat;
4175 delete testPat;
4176 continue;
4177 }
4178
4179 //
4180 // Interpret the Perl expression from the fourth field of the data file,
4181 // building up an ICU string from the results of the ICU match.
4182 // The Perl expression will contain references to the results of
4183 // a regex match, including the matched string, capture group strings,
4184 // group starting and ending indicies, etc.
4185 //
4186 UnicodeString resultString;
4187 UnicodeString perlExpr = fields[3];
4188 #if SUPPORT_MUTATING_INPUT_STRING
4189 groupsMat->reset(perlExpr);
4190 cgMat->reset(perlExpr);
4191 #endif
4192
4193 while (perlExpr.length() > 0) {
4194 #if !SUPPORT_MUTATING_INPUT_STRING
4195 // Perferred usage. Reset after any modification to input string.
4196 groupsMat->reset(perlExpr);
4197 cgMat->reset(perlExpr);
4198 #endif
4199
4200 if (perlExpr.startsWith("$&")) {
4201 resultString.append(testMat->group(status));
4202 perlExpr.remove(0, 2);
4203 }
4204
4205 else if (groupsMat->lookingAt(status)) {
4206 // $-[0] $+[2] etc.
4207 UnicodeString digitString = groupsMat->group(2, status);
4208 int32_t t = 0;
4209 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4210 UnicodeString plusOrMinus = groupsMat->group(1, status);
4211 int32_t matchPosition;
4212 if (plusOrMinus.compare("+") == 0) {
4213 matchPosition = testMat->end(groupNum, status);
4214 } else {
4215 matchPosition = testMat->start(groupNum, status);
4216 }
4217 if (matchPosition != -1) {
4218 ICU_Utility::appendNumber(resultString, matchPosition);
4219 }
4220 perlExpr.remove(0, groupsMat->end(status));
4221 }
4222
4223 else if (cgMat->lookingAt(status)) {
4224 // $1, $2, $3, etc.
4225 UnicodeString digitString = cgMat->group(1, status);
4226 int32_t t = 0;
4227 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4228 if (U_SUCCESS(status)) {
4229 resultString.append(testMat->group(groupNum, status));
4230 status = U_ZERO_ERROR;
4231 }
4232 perlExpr.remove(0, cgMat->end(status));
4233 }
4234
4235 else if (perlExpr.startsWith("@-")) {
4236 int32_t i;
4237 for (i=0; i<=testMat->groupCount(); i++) {
4238 if (i>0) {
4239 resultString.append(" ");
4240 }
4241 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4242 }
4243 perlExpr.remove(0, 2);
4244 }
4245
4246 else if (perlExpr.startsWith("@+")) {
4247 int32_t i;
4248 for (i=0; i<=testMat->groupCount(); i++) {
4249 if (i>0) {
4250 resultString.append(" ");
4251 }
4252 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4253 }
4254 perlExpr.remove(0, 2);
4255 }
4256
4257 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4258 // or as an escaped sequence (e.g. \n)
4259 if (perlExpr.length() > 1) {
4260 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4261 }
4262 UChar c = perlExpr.charAt(0);
4263 switch (c) {
4264 case 'n': c = '\n'; break;
4265 // add any other escape sequences that show up in the test expected results.
4266 }
4267 resultString.append(c);
4268 perlExpr.remove(0, 1);
4269 }
4270
4271 else {
4272 // Any characters from the perl expression that we don't explicitly
4273 // recognize before here are assumed to be literals and copied
4274 // as-is to the expected results.
4275 resultString.append(perlExpr.charAt(0));
4276 perlExpr.remove(0, 1);
4277 }
4278
4279 if (U_FAILURE(status)) {
4280 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4281 break;
4282 }
4283 }
4284
4285 //
4286 // Expected Results Compare
4287 //
4288 UnicodeString expectedS(fields[4]);
4289 expectedS.findAndReplace(nulnulSrc, nulnul);
4290 expectedS.findAndReplace(ffffSrc, ffff);
4291 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4292
4293
4294 if (expectedS.compare(resultString) != 0) {
4295 err("Line %d: Incorrect perl expression results.", lineNum);
4296 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4297 }
4298
4299 delete testMat;
4300 delete testPat;
4301 }
4302
4303 //
4304 // All done. Clean up allocated stuff.
4305 //
4306 delete cgMat;
4307 delete cgPat;
4308
4309 delete groupsMat;
4310 delete groupsPat;
4311
4312 delete flagMat;
4313 delete flagPat;
4314
4315 delete lineMat;
4316 delete linePat;
4317
4318 delete fieldPat;
4319 delete [] testData;
4320
4321
4322 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4323
4324 }
4325
4326
4327 //-------------------------------------------------------------------------------
4328 //
4329 // PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
4330 // (instead of using UnicodeStrings) to test the alternate engine.
4331 // The input file for this test is re_tests, the standard regular
4332 // expression test data distributed with the Perl source code.
4333 // See PerlTests() for more information.
4334 //
4335 //-------------------------------------------------------------------------------
PerlTestsUTF8()4336 void RegexTest::PerlTestsUTF8() {
4337 char tdd[2048];
4338 const char *srcPath;
4339 UErrorCode status = U_ZERO_ERROR;
4340 UParseError pe;
4341 LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4342 UText patternText = UTEXT_INITIALIZER;
4343 char *patternChars = NULL;
4344 int32_t patternLength;
4345 int32_t patternCapacity = 0;
4346 UText inputText = UTEXT_INITIALIZER;
4347 char *inputChars = NULL;
4348 int32_t inputLength;
4349 int32_t inputCapacity = 0;
4350
4351 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4352
4353 //
4354 // Open and read the test data file.
4355 //
4356 srcPath=getPath(tdd, "re_tests.txt");
4357 if(srcPath==NULL) {
4358 return; /* something went wrong, error already output */
4359 }
4360
4361 int32_t len;
4362 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4363 if (U_FAILURE(status)) {
4364 return; /* something went wrong, error already output */
4365 }
4366
4367 //
4368 // Put the test data into a UnicodeString
4369 //
4370 UnicodeString testDataString(FALSE, testData, len);
4371
4372 //
4373 // Regex to break the input file into lines, and strip the new lines.
4374 // One line per match, capture group one is the desired data.
4375 //
4376 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4377 if (U_FAILURE(status)) {
4378 dataerrln("RegexPattern::compile() error");
4379 return;
4380 }
4381 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4382
4383 //
4384 // Regex to split a test file line into fields.
4385 // There are six fields, separated by tabs.
4386 //
4387 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4388
4389 //
4390 // Regex to identify test patterns with flag settings, and to separate them.
4391 // Test patterns with flags look like 'pattern'i
4392 // Test patterns without flags are not quoted: pattern
4393 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4394 //
4395 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4396 RegexMatcher* flagMat = flagPat->matcher(status);
4397
4398 //
4399 // The Perl tests reference several perl-isms, which are evaluated/substituted
4400 // in the test data. Not being perl, this must be done explicitly. Here
4401 // are string constants and REs for these constructs.
4402 //
4403 UnicodeString nulnulSrc("${nulnul}");
4404 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4405 nulnul = nulnul.unescape();
4406
4407 UnicodeString ffffSrc("${ffff}");
4408 UnicodeString ffff("\\uffff", -1, US_INV);
4409 ffff = ffff.unescape();
4410
4411 // regexp for $-[0], $+[2], etc.
4412 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4413 RegexMatcher *groupsMat = groupsPat->matcher(status);
4414
4415 // regexp for $0, $1, $2, etc.
4416 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4417 RegexMatcher *cgMat = cgPat->matcher(status);
4418
4419
4420 //
4421 // Main Loop for the Perl Tests, runs once per line from the
4422 // test data file.
4423 //
4424 int32_t lineNum = 0;
4425 int32_t skippedUnimplementedCount = 0;
4426 while (lineMat->find()) {
4427 lineNum++;
4428
4429 //
4430 // Get a line, break it into its fields, do the Perl
4431 // variable substitutions.
4432 //
4433 UnicodeString line = lineMat->group(1, status);
4434 UnicodeString fields[7];
4435 fieldPat->split(line, fields, 7, status);
4436
4437 flagMat->reset(fields[0]);
4438 flagMat->matches(status);
4439 UnicodeString pattern = flagMat->group(2, status);
4440 pattern.findAndReplace("${bang}", "!");
4441 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4442 pattern.findAndReplace(ffffSrc, ffff);
4443
4444 //
4445 // Identify patterns that include match flag settings,
4446 // split off the flags, remove the extra quotes.
4447 //
4448 UnicodeString flagStr = flagMat->group(3, status);
4449 if (U_FAILURE(status)) {
4450 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4451 return;
4452 }
4453 int32_t flags = 0;
4454 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4455 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4456 const UChar UChar_m = 0x6d;
4457 const UChar UChar_x = 0x78;
4458 const UChar UChar_y = 0x79;
4459 if (flagStr.indexOf(UChar_i) != -1) {
4460 flags |= UREGEX_CASE_INSENSITIVE;
4461 }
4462 if (flagStr.indexOf(UChar_m) != -1) {
4463 flags |= UREGEX_MULTILINE;
4464 }
4465 if (flagStr.indexOf(UChar_x) != -1) {
4466 flags |= UREGEX_COMMENTS;
4467 }
4468
4469 //
4470 // Put the pattern in a UTF-8 UText
4471 //
4472 status = U_ZERO_ERROR;
4473 patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4474 if (status == U_BUFFER_OVERFLOW_ERROR) {
4475 status = U_ZERO_ERROR;
4476 delete[] patternChars;
4477 patternCapacity = patternLength + 1;
4478 patternChars = new char[patternCapacity];
4479 pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4480 }
4481 utext_openUTF8(&patternText, patternChars, patternLength, &status);
4482
4483 //
4484 // Compile the test pattern.
4485 //
4486 RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4487 if (status == U_REGEX_UNIMPLEMENTED) {
4488 //
4489 // Test of a feature that is planned for ICU, but not yet implemented.
4490 // skip the test.
4491 skippedUnimplementedCount++;
4492 delete testPat;
4493 status = U_ZERO_ERROR;
4494 continue;
4495 }
4496
4497 if (U_FAILURE(status)) {
4498 // Some tests are supposed to generate errors.
4499 // Only report an error for tests that are supposed to succeed.
4500 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4501 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4502 {
4503 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4504 }
4505 status = U_ZERO_ERROR;
4506 delete testPat;
4507 continue;
4508 }
4509
4510 if (fields[2].indexOf(UChar_i) >= 0) {
4511 // ICU should skip this test.
4512 delete testPat;
4513 continue;
4514 }
4515
4516 if (fields[2].indexOf(UChar_c) >= 0) {
4517 // This pattern should have caused a compilation error, but didn't/
4518 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4519 delete testPat;
4520 continue;
4521 }
4522
4523
4524 //
4525 // replace the Perl variables that appear in some of the
4526 // match data strings.
4527 //
4528 UnicodeString matchString = fields[1];
4529 matchString.findAndReplace(nulnulSrc, nulnul);
4530 matchString.findAndReplace(ffffSrc, ffff);
4531
4532 // Replace any \n in the match string with an actual new-line char.
4533 // Don't do full unescape, as this unescapes more than Perl does, which
4534 // causes other spurious failures in the tests.
4535 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4536
4537 //
4538 // Put the input in a UTF-8 UText
4539 //
4540 status = U_ZERO_ERROR;
4541 inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4542 if (status == U_BUFFER_OVERFLOW_ERROR) {
4543 status = U_ZERO_ERROR;
4544 delete[] inputChars;
4545 inputCapacity = inputLength + 1;
4546 inputChars = new char[inputCapacity];
4547 matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4548 }
4549 utext_openUTF8(&inputText, inputChars, inputLength, &status);
4550
4551 //
4552 // Run the test, check for expected match/don't match result.
4553 //
4554 RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4555 UBool found = testMat->find();
4556 UBool expected = FALSE;
4557 if (fields[2].indexOf(UChar_y) >=0) {
4558 expected = TRUE;
4559 }
4560 if (expected != found) {
4561 errln("line %d: Expected %smatch, got %smatch",
4562 lineNum, expected?"":"no ", found?"":"no " );
4563 continue;
4564 }
4565
4566 // Don't try to check expected results if there is no match.
4567 // (Some have stuff in the expected fields)
4568 if (!found) {
4569 delete testMat;
4570 delete testPat;
4571 continue;
4572 }
4573
4574 //
4575 // Interpret the Perl expression from the fourth field of the data file,
4576 // building up an ICU string from the results of the ICU match.
4577 // The Perl expression will contain references to the results of
4578 // a regex match, including the matched string, capture group strings,
4579 // group starting and ending indicies, etc.
4580 //
4581 UnicodeString resultString;
4582 UnicodeString perlExpr = fields[3];
4583
4584 while (perlExpr.length() > 0) {
4585 groupsMat->reset(perlExpr);
4586 cgMat->reset(perlExpr);
4587
4588 if (perlExpr.startsWith("$&")) {
4589 resultString.append(testMat->group(status));
4590 perlExpr.remove(0, 2);
4591 }
4592
4593 else if (groupsMat->lookingAt(status)) {
4594 // $-[0] $+[2] etc.
4595 UnicodeString digitString = groupsMat->group(2, status);
4596 int32_t t = 0;
4597 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4598 UnicodeString plusOrMinus = groupsMat->group(1, status);
4599 int32_t matchPosition;
4600 if (plusOrMinus.compare("+") == 0) {
4601 matchPosition = testMat->end(groupNum, status);
4602 } else {
4603 matchPosition = testMat->start(groupNum, status);
4604 }
4605 if (matchPosition != -1) {
4606 ICU_Utility::appendNumber(resultString, matchPosition);
4607 }
4608 perlExpr.remove(0, groupsMat->end(status));
4609 }
4610
4611 else if (cgMat->lookingAt(status)) {
4612 // $1, $2, $3, etc.
4613 UnicodeString digitString = cgMat->group(1, status);
4614 int32_t t = 0;
4615 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4616 if (U_SUCCESS(status)) {
4617 resultString.append(testMat->group(groupNum, status));
4618 status = U_ZERO_ERROR;
4619 }
4620 perlExpr.remove(0, cgMat->end(status));
4621 }
4622
4623 else if (perlExpr.startsWith("@-")) {
4624 int32_t i;
4625 for (i=0; i<=testMat->groupCount(); i++) {
4626 if (i>0) {
4627 resultString.append(" ");
4628 }
4629 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4630 }
4631 perlExpr.remove(0, 2);
4632 }
4633
4634 else if (perlExpr.startsWith("@+")) {
4635 int32_t i;
4636 for (i=0; i<=testMat->groupCount(); i++) {
4637 if (i>0) {
4638 resultString.append(" ");
4639 }
4640 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4641 }
4642 perlExpr.remove(0, 2);
4643 }
4644
4645 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4646 // or as an escaped sequence (e.g. \n)
4647 if (perlExpr.length() > 1) {
4648 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4649 }
4650 UChar c = perlExpr.charAt(0);
4651 switch (c) {
4652 case 'n': c = '\n'; break;
4653 // add any other escape sequences that show up in the test expected results.
4654 }
4655 resultString.append(c);
4656 perlExpr.remove(0, 1);
4657 }
4658
4659 else {
4660 // Any characters from the perl expression that we don't explicitly
4661 // recognize before here are assumed to be literals and copied
4662 // as-is to the expected results.
4663 resultString.append(perlExpr.charAt(0));
4664 perlExpr.remove(0, 1);
4665 }
4666
4667 if (U_FAILURE(status)) {
4668 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4669 break;
4670 }
4671 }
4672
4673 //
4674 // Expected Results Compare
4675 //
4676 UnicodeString expectedS(fields[4]);
4677 expectedS.findAndReplace(nulnulSrc, nulnul);
4678 expectedS.findAndReplace(ffffSrc, ffff);
4679 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4680
4681
4682 if (expectedS.compare(resultString) != 0) {
4683 err("Line %d: Incorrect perl expression results.", lineNum);
4684 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4685 }
4686
4687 delete testMat;
4688 delete testPat;
4689 }
4690
4691 //
4692 // All done. Clean up allocated stuff.
4693 //
4694 delete cgMat;
4695 delete cgPat;
4696
4697 delete groupsMat;
4698 delete groupsPat;
4699
4700 delete flagMat;
4701 delete flagPat;
4702
4703 delete lineMat;
4704 delete linePat;
4705
4706 delete fieldPat;
4707 delete [] testData;
4708
4709 utext_close(&patternText);
4710 utext_close(&inputText);
4711
4712 delete [] patternChars;
4713 delete [] inputChars;
4714
4715
4716 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4717
4718 }
4719
4720
4721 //--------------------------------------------------------------
4722 //
4723 // Bug6149 Verify limits to heap expansion for backtrack stack.
4724 // Use this pattern,
4725 // "(a?){1,8000000}"
4726 // Note: was an unbounded upperbounds, but that now has loop-breaking enabled.
4727 // This test is likely to be fragile, as further optimizations stop
4728 // more cases of pointless looping in the match engine.
4729 //
4730 //---------------------------------------------------------------
Bug6149()4731 void RegexTest::Bug6149() {
4732 UnicodeString pattern("(a?){1,8000000}");
4733 UnicodeString s("xyz");
4734 uint32_t flags = 0;
4735 UErrorCode status = U_ZERO_ERROR;
4736
4737 RegexMatcher matcher(pattern, s, flags, status);
4738 UBool result = false;
4739 REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4740 REGEX_ASSERT(result == FALSE);
4741 }
4742
4743
//
//   Callbacks()    Test the callback function.
//                  When set, callbacks occur periodically during matching operations,
//                  giving the application code the ability to abort the operation
//                  before its normal completion.
//
4750
4751 struct callBackContext {
4752 RegexTest *test;
4753 int32_t maxCalls;
4754 int32_t numCalls;
4755 int32_t lastSteps;
resetcallBackContext4756 void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4757 };
4758
4759 U_CDECL_BEGIN
4760 static UBool U_CALLCONV
testCallBackFn(const void * context,int32_t steps)4761 testCallBackFn(const void *context, int32_t steps) {
4762 callBackContext *info = (callBackContext *)context;
4763 if (info->lastSteps+1 != steps) {
4764 info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps);
4765 }
4766 info->lastSteps = steps;
4767 info->numCalls++;
4768 return (info->numCalls < info->maxCalls);
4769 }
4770 U_CDECL_END
4771
Callbacks()4772 void RegexTest::Callbacks() {
4773 {
4774 // Getter returns NULLs if no callback has been set
4775
4776 // The variables that the getter will fill in.
4777 // Init to non-null values so that the action of the getter can be seen.
4778 const void *returnedContext = &returnedContext;
4779 URegexMatchCallback *returnedFn = &testCallBackFn;
4780
4781 UErrorCode status = U_ZERO_ERROR;
4782 RegexMatcher matcher("x", 0, status);
4783 REGEX_CHECK_STATUS;
4784 matcher.getMatchCallback(returnedFn, returnedContext, status);
4785 REGEX_CHECK_STATUS;
4786 REGEX_ASSERT(returnedFn == NULL);
4787 REGEX_ASSERT(returnedContext == NULL);
4788 }
4789
4790 {
4791 // Set and Get work
4792 callBackContext cbInfo = {this, 0, 0, 0};
4793 const void *returnedContext;
4794 URegexMatchCallback *returnedFn;
4795 UErrorCode status = U_ZERO_ERROR;
4796 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.
4797 REGEX_CHECK_STATUS;
4798 matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4799 REGEX_CHECK_STATUS;
4800 matcher.getMatchCallback(returnedFn, returnedContext, status);
4801 REGEX_CHECK_STATUS;
4802 REGEX_ASSERT(returnedFn == testCallBackFn);
4803 REGEX_ASSERT(returnedContext == &cbInfo);
4804
4805 // A short-running match shouldn't invoke the callback
4806 status = U_ZERO_ERROR;
4807 cbInfo.reset(1);
4808 UnicodeString s = "xxx";
4809 matcher.reset(s);
4810 REGEX_ASSERT(matcher.matches(status));
4811 REGEX_CHECK_STATUS;
4812 REGEX_ASSERT(cbInfo.numCalls == 0);
4813
4814 // A medium-length match that runs long enough to invoke the
4815 // callback, but not so long that the callback aborts it.
4816 status = U_ZERO_ERROR;
4817 cbInfo.reset(4);
4818 s = "aaaaaaaaaaaaaaaaaaab";
4819 matcher.reset(s);
4820 REGEX_ASSERT(matcher.matches(status)==FALSE);
4821 REGEX_CHECK_STATUS;
4822 REGEX_ASSERT(cbInfo.numCalls > 0);
4823
4824 // A longer running match that the callback function will abort.
4825 status = U_ZERO_ERROR;
4826 cbInfo.reset(4);
4827 s = "aaaaaaaaaaaaaaaaaaaaaaab";
4828 matcher.reset(s);
4829 REGEX_ASSERT(matcher.matches(status)==FALSE);
4830 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4831 REGEX_ASSERT(cbInfo.numCalls == 4);
4832
4833 // A longer running find that the callback function will abort.
4834 status = U_ZERO_ERROR;
4835 cbInfo.reset(4);
4836 s = "aaaaaaaaaaaaaaaaaaaaaaab";
4837 matcher.reset(s);
4838 REGEX_ASSERT(matcher.find(status)==FALSE);
4839 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4840 REGEX_ASSERT(cbInfo.numCalls == 4);
4841 }
4842
4843
4844 }
4845
4846
//
//   FindProgressCallbacks()    Test the find "progress" callback function.
//                  When set, the find progress callback will be invoked during find operations
//                  after each return from a match attempt, giving the application the opportunity
//                  to terminate a long-running find operation before its normal completion.
//
4853
4854 struct progressCallBackContext {
4855 RegexTest *test;
4856 int64_t lastIndex;
4857 int32_t maxCalls;
4858 int32_t numCalls;
resetprogressCallBackContext4859 void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4860 };
4861
4862 // call-back function for find().
4863 // Return TRUE to continue the find().
4864 // Return FALSE to stop the find().
4865 U_CDECL_BEGIN
4866 static UBool U_CALLCONV
testProgressCallBackFn(const void * context,int64_t matchIndex)4867 testProgressCallBackFn(const void *context, int64_t matchIndex) {
4868 progressCallBackContext *info = (progressCallBackContext *)context;
4869 info->numCalls++;
4870 info->lastIndex = matchIndex;
4871 // info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4872 return (info->numCalls < info->maxCalls);
4873 }
4874 U_CDECL_END
4875
FindProgressCallbacks()4876 void RegexTest::FindProgressCallbacks() {
4877 {
4878 // Getter returns NULLs if no callback has been set
4879
4880 // The variables that the getter will fill in.
4881 // Init to non-null values so that the action of the getter can be seen.
4882 const void *returnedContext = &returnedContext;
4883 URegexFindProgressCallback *returnedFn = &testProgressCallBackFn;
4884
4885 UErrorCode status = U_ZERO_ERROR;
4886 RegexMatcher matcher("x", 0, status);
4887 REGEX_CHECK_STATUS;
4888 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4889 REGEX_CHECK_STATUS;
4890 REGEX_ASSERT(returnedFn == NULL);
4891 REGEX_ASSERT(returnedContext == NULL);
4892 }
4893
4894 {
4895 // Set and Get work
4896 progressCallBackContext cbInfo = {this, 0, 0, 0};
4897 const void *returnedContext;
4898 URegexFindProgressCallback *returnedFn;
4899 UErrorCode status = U_ZERO_ERROR;
4900 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
4901 REGEX_CHECK_STATUS;
4902 matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4903 REGEX_CHECK_STATUS;
4904 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4905 REGEX_CHECK_STATUS;
4906 REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4907 REGEX_ASSERT(returnedContext == &cbInfo);
4908
4909 // A find that matches on the initial position does NOT invoke the callback.
4910 status = U_ZERO_ERROR;
4911 cbInfo.reset(100);
4912 UnicodeString s = "aaxxx";
4913 matcher.reset(s);
4914 #if 0
4915 matcher.setTrace(TRUE);
4916 #endif
4917 REGEX_ASSERT(matcher.find(0, status));
4918 REGEX_CHECK_STATUS;
4919 REGEX_ASSERT(cbInfo.numCalls == 0);
4920
4921 // A medium running find() that causes matcher.find() to invoke our callback for each index,
4922 // but not so many times that we interrupt the operation.
4923 status = U_ZERO_ERROR;
4924 s = "aaaaaaaaaaaaaaaaaaab";
4925 cbInfo.reset(s.length()); // Some upper limit for number of calls that is greater than size of our input string
4926 matcher.reset(s);
4927 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4928 REGEX_CHECK_STATUS;
4929 REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4930
4931 // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4932 status = U_ZERO_ERROR;
4933 UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4934 cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string
4935 matcher.reset(s1);
4936 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4937 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4938 REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4939
4940 // Now a match that will succeed, but after an interruption
4941 status = U_ZERO_ERROR;
4942 UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4943 cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string
4944 matcher.reset(s2);
4945 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4946 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4947 // Now retry the match from where left off
4948 cbInfo.maxCalls = 100; // No callback limit
4949 status = U_ZERO_ERROR;
4950 REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
4951 REGEX_CHECK_STATUS;
4952 }
4953
4954
4955 }
4956
4957
4958 //---------------------------------------------------------------------------
4959 //
4960 // PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
4961 // UTexts. The pure-C implementation of UText
4962 // has no mutable backing stores, but we can
4963 // use UnicodeString here to test the functionality.
4964 //
4965 //---------------------------------------------------------------------------
PreAllocatedUTextCAPI()4966 void RegexTest::PreAllocatedUTextCAPI () {
4967 UErrorCode status = U_ZERO_ERROR;
4968 URegularExpression *re;
4969 UText patternText = UTEXT_INITIALIZER;
4970 UnicodeString buffer;
4971 UText bufferText = UTEXT_INITIALIZER;
4972
4973 utext_openUnicodeString(&bufferText, &buffer, &status);
4974
4975 /*
4976 * getText() and getUText()
4977 */
4978 {
4979 UText text1 = UTEXT_INITIALIZER;
4980 UText text2 = UTEXT_INITIALIZER;
4981 UChar text2Chars[20];
4982 UText *resultText;
4983
4984 status = U_ZERO_ERROR;
4985 regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
4986 regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
4987 u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
4988 utext_openUChars(&text2, text2Chars, -1, &status);
4989
4990 regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
4991 re = uregex_openUText(&patternText, 0, NULL, &status);
4992
4993 /* First set a UText */
4994 uregex_setUText(re, &text1, &status);
4995 resultText = uregex_getUText(re, &bufferText, &status);
4996 REGEX_CHECK_STATUS;
4997 REGEX_ASSERT(resultText == &bufferText);
4998 utext_setNativeIndex(resultText, 0);
4999 utext_setNativeIndex(&text1, 0);
5000 REGEX_ASSERT(testUTextEqual(resultText, &text1));
5001
5002 resultText = uregex_getUText(re, &bufferText, &status);
5003 REGEX_CHECK_STATUS;
5004 REGEX_ASSERT(resultText == &bufferText);
5005 utext_setNativeIndex(resultText, 0);
5006 utext_setNativeIndex(&text1, 0);
5007 REGEX_ASSERT(testUTextEqual(resultText, &text1));
5008
5009 /* Then set a UChar * */
5010 uregex_setText(re, text2Chars, 7, &status);
5011 resultText = uregex_getUText(re, &bufferText, &status);
5012 REGEX_CHECK_STATUS;
5013 REGEX_ASSERT(resultText == &bufferText);
5014 utext_setNativeIndex(resultText, 0);
5015 utext_setNativeIndex(&text2, 0);
5016 REGEX_ASSERT(testUTextEqual(resultText, &text2));
5017
5018 uregex_close(re);
5019 utext_close(&text1);
5020 utext_close(&text2);
5021 }
5022
5023 /*
5024 * group()
5025 */
5026 {
5027 UChar text1[80];
5028 UText *actual;
5029 UBool result;
5030 int64_t length = 0;
5031
5032 u_uastrncpy(text1, "noise abc interior def, and this is off the end", UPRV_LENGTHOF(text1));
5033 // 012345678901234567890123456789012345678901234567
5034 // 0 1 2 3 4
5035
5036 status = U_ZERO_ERROR;
5037 re = uregex_openC("abc(.*?)def", 0, NULL, &status);
5038 REGEX_CHECK_STATUS;
5039
5040 uregex_setText(re, text1, -1, &status);
5041 result = uregex_find(re, 0, &status);
5042 REGEX_ASSERT(result==TRUE);
5043
5044 /* Capture Group 0, the full match. Should succeed. "abc interior def" */
5045 status = U_ZERO_ERROR;
5046 actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
5047 REGEX_CHECK_STATUS;
5048 REGEX_ASSERT(actual == &bufferText);
5049 REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
5050 REGEX_ASSERT(length == 16);
5051 REGEX_ASSERT(utext_nativeLength(actual) == 47);
5052
5053 /* Capture group #1. Should succeed, matching " interior ". */
5054 status = U_ZERO_ERROR;
5055 actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
5056 REGEX_CHECK_STATUS;
5057 REGEX_ASSERT(actual == &bufferText);
5058 REGEX_ASSERT(utext_getNativeIndex(actual) == 9); // position of " interior "
5059 REGEX_ASSERT(length == 10);
5060 REGEX_ASSERT(utext_nativeLength(actual) == 47);
5061
5062 /* Capture group out of range. Error. */
5063 status = U_ZERO_ERROR;
5064 actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
5065 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5066 REGEX_ASSERT(actual == &bufferText);
5067 uregex_close(re);
5068
5069 }
5070
5071 /*
5072 * replaceFirst()
5073 */
5074 {
5075 UChar text1[80];
5076 UChar text2[80];
5077 UText replText = UTEXT_INITIALIZER;
5078 UText *result;
5079 status = U_ZERO_ERROR;
5080 utext_openUnicodeString(&bufferText, &buffer, &status);
5081
5082 status = U_ZERO_ERROR;
5083 u_uastrncpy(text1, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1));
5084 u_uastrncpy(text2, "No match here.", UPRV_LENGTHOF(text2)/2);
5085 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5086
5087 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5088 REGEX_CHECK_STATUS;
5089
5090 /* Normal case, with match */
5091 uregex_setText(re, text1, -1, &status);
5092 REGEX_CHECK_STATUS;
5093 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5094 REGEX_CHECK_STATUS;
5095 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5096 REGEX_CHECK_STATUS;
5097 REGEX_ASSERT(result == &bufferText);
5098 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5099
5100 /* No match. Text should copy to output with no changes. */
5101 uregex_setText(re, text2, -1, &status);
5102 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5103 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5104 REGEX_CHECK_STATUS;
5105 REGEX_ASSERT(result == &bufferText);
5106 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5107
5108 /* Unicode escapes */
5109 uregex_setText(re, text1, -1, &status);
5110 regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
5111 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5112 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5113 REGEX_CHECK_STATUS;
5114 REGEX_ASSERT(result == &bufferText);
5115 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5116
5117 uregex_close(re);
5118 utext_close(&replText);
5119 }
5120
5121
5122 /*
5123 * replaceAll()
5124 */
5125 {
5126 UChar text1[80];
5127 UChar text2[80];
5128 UText replText = UTEXT_INITIALIZER;
5129 UText *result;
5130
5131 status = U_ZERO_ERROR;
5132 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
5133 u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
5134 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5135
5136 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5137 REGEX_CHECK_STATUS;
5138
5139 /* Normal case, with match */
5140 uregex_setText(re, text1, -1, &status);
5141 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5142 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5143 REGEX_CHECK_STATUS;
5144 REGEX_ASSERT(result == &bufferText);
5145 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5146
5147 /* No match. Text should copy to output with no changes. */
5148 uregex_setText(re, text2, -1, &status);
5149 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5150 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5151 REGEX_CHECK_STATUS;
5152 REGEX_ASSERT(result == &bufferText);
5153 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5154
5155 uregex_close(re);
5156 utext_close(&replText);
5157 }
5158
5159
5160 /*
5161 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5162 * so we don't need to test it here.
5163 */
5164
5165 utext_close(&bufferText);
5166 utext_close(&patternText);
5167 }
5168
5169
5170 //--------------------------------------------------------------
5171 //
5172 // NamedCapture Check basic named capture group functionality
5173 //
5174 //--------------------------------------------------------------
NamedCapture()5175 void RegexTest::NamedCapture() {
5176 UErrorCode status = U_ZERO_ERROR;
5177 RegexPattern *pat = RegexPattern::compile(UnicodeString(
5178 "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
5179 REGEX_CHECK_STATUS;
5180 int32_t group = pat->groupNumberFromName("five", -1, status);
5181 REGEX_CHECK_STATUS;
5182 REGEX_ASSERT(5 == group);
5183 group = pat->groupNumberFromName("three", -1, status);
5184 REGEX_CHECK_STATUS;
5185 REGEX_ASSERT(3 == group);
5186
5187 status = U_ZERO_ERROR;
5188 group = pat->groupNumberFromName(UnicodeString("six"), status);
5189 REGEX_CHECK_STATUS;
5190 REGEX_ASSERT(6 == group);
5191
5192 status = U_ZERO_ERROR;
5193 group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
5194 U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5195
5196 status = U_ZERO_ERROR;
5197
5198 // After copying a pattern, named capture should still work in the copy.
5199 RegexPattern *copiedPat = new RegexPattern(*pat);
5200 REGEX_ASSERT(*copiedPat == *pat);
5201 delete pat; pat = NULL; // Delete original, copy should have no references back to it.
5202
5203 group = copiedPat->groupNumberFromName("five", -1, status);
5204 REGEX_CHECK_STATUS;
5205 REGEX_ASSERT(5 == group);
5206 group = copiedPat->groupNumberFromName("three", -1, status);
5207 REGEX_CHECK_STATUS;
5208 REGEX_ASSERT(3 == group);
5209 delete copiedPat;
5210
5211 // ReplaceAll with named capture group.
5212 status = U_ZERO_ERROR;
5213 UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
5214 RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
5215 REGEX_CHECK_STATUS;
5216 // m.pattern().dumpPattern();
5217 UnicodeString replacedText = m->replaceAll("'${mid}'", status);
5218 REGEX_CHECK_STATUS;
5219 REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
5220 delete m;
5221
5222 // ReplaceAll, allowed capture group numbers.
5223 text = UnicodeString("abcmxyz");
5224 m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
5225 REGEX_CHECK_STATUS;
5226
5227 status = U_ZERO_ERROR;
5228 replacedText = m->replaceAll(UnicodeString("<$0>"), status); // group 0, full match, is allowed.
5229 REGEX_CHECK_STATUS;
5230 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
5231
5232 status = U_ZERO_ERROR;
5233 replacedText = m->replaceAll(UnicodeString("<$1>"), status); // group 1 by number.
5234 REGEX_CHECK_STATUS;
5235 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5236
5237 status = U_ZERO_ERROR;
5238 replacedText = m->replaceAll(UnicodeString("<${one}>"), status); // group 1 by name.
5239 REGEX_CHECK_STATUS;
5240 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5241
5242 status = U_ZERO_ERROR;
5243 replacedText = m->replaceAll(UnicodeString("<$2>"), status); // group 2.
5244 REGEX_CHECK_STATUS;
5245 REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
5246
5247 status = U_ZERO_ERROR;
5248 replacedText = m->replaceAll(UnicodeString("<$3>"), status);
5249 REGEX_CHECK_STATUS;
5250 REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
5251
5252 status = U_ZERO_ERROR;
5253 replacedText = m->replaceAll(UnicodeString("<$4>"), status);
5254 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5255
5256 status = U_ZERO_ERROR;
5257 replacedText = m->replaceAll(UnicodeString("<$04>"), status); // group 0, leading 0,
5258 REGEX_CHECK_STATUS; // trailing out-of-range 4 passes through.
5259 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
5260
5261 status = U_ZERO_ERROR;
5262 replacedText = m->replaceAll(UnicodeString("<$000016>"), status); // Consume leading zeroes. Don't consume digits
5263 REGEX_CHECK_STATUS; // that push group num out of range.
5264 REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText); // This is group 1.
5265
5266 status = U_ZERO_ERROR;
5267 replacedText = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
5268 REGEX_CHECK_STATUS;
5269 REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
5270
5271 status = U_ZERO_ERROR;
5272 replacedText = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
5273 REGEX_CHECK_STATUS;
5274 REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
5275
5276 status = U_ZERO_ERROR;
5277 replacedText = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
5278 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5279
5280 status = U_ZERO_ERROR;
5281 replacedText = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
5282 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5283
5284 status = U_ZERO_ERROR;
5285 replacedText = m->replaceAll(UnicodeString("<${one"), status);
5286 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5287
5288 status = U_ZERO_ERROR;
5289 replacedText = m->replaceAll(UnicodeString("$not a capture group"), status);
5290 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5291
5292 delete m;
5293
5294 // Repeat the above replaceAll() tests using the plain C API, which
5295 // has a separate implementation internally.
5296 // TODO: factor out the test data.
5297
5298 status = U_ZERO_ERROR;
5299 URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
5300 REGEX_CHECK_STATUS;
5301 text = UnicodeString("abcmxyz");
5302 uregex_setText(re, text.getBuffer(), text.length(), &status);
5303 REGEX_CHECK_STATUS;
5304
5305 UChar resultBuf[100];
5306 int32_t resultLength;
5307 UnicodeString repl;
5308
5309 status = U_ZERO_ERROR;
5310 repl = UnicodeString("<$0>");
5311 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5312 REGEX_CHECK_STATUS;
5313 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
5314
5315 status = U_ZERO_ERROR;
5316 repl = UnicodeString("<$1>");
5317 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5318 REGEX_CHECK_STATUS;
5319 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5320
5321 status = U_ZERO_ERROR;
5322 repl = UnicodeString("<${one}>");
5323 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5324 REGEX_CHECK_STATUS;
5325 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5326
5327 status = U_ZERO_ERROR;
5328 repl = UnicodeString("<$2>");
5329 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5330 REGEX_CHECK_STATUS;
5331 REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
5332
5333 status = U_ZERO_ERROR;
5334 repl = UnicodeString("<$3>");
5335 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5336 REGEX_CHECK_STATUS;
5337 REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
5338
5339 status = U_ZERO_ERROR;
5340 repl = UnicodeString("<$4>");
5341 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5342 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5343
5344 status = U_ZERO_ERROR;
5345 repl = UnicodeString("<$04>");
5346 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5347 REGEX_CHECK_STATUS;
5348 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
5349
5350 status = U_ZERO_ERROR;
5351 repl = UnicodeString("<$000016>");
5352 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5353 REGEX_CHECK_STATUS;
5354 REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
5355
5356 status = U_ZERO_ERROR;
5357 repl = UnicodeString("<$3$2$1${one}>");
5358 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5359 REGEX_CHECK_STATUS;
5360 REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
5361
5362 status = U_ZERO_ERROR;
5363 repl = UnicodeString("$3$2$1${one}");
5364 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5365 REGEX_CHECK_STATUS;
5366 REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
5367
5368 status = U_ZERO_ERROR;
5369 repl = UnicodeString("<${noSuchName}>");
5370 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5371 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5372
5373 status = U_ZERO_ERROR;
5374 repl = UnicodeString("<${invalid-name}>");
5375 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5376 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5377
5378 status = U_ZERO_ERROR;
5379 repl = UnicodeString("<${one");
5380 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5381 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5382
5383 status = U_ZERO_ERROR;
5384 repl = UnicodeString("$not a capture group");
5385 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5386 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5387
5388 uregex_close(re);
5389 }
5390
5391 //--------------------------------------------------------------
5392 //
5393 // NamedCaptureLimits Patterns with huge numbers of named capture groups.
5394 // The point is not so much what the exact limit is,
5395 // but that a largish number doesn't hit bad non-linear performance,
5396 // and that exceeding the limit fails cleanly.
5397 //
5398 //--------------------------------------------------------------
NamedCaptureLimits()5399 void RegexTest::NamedCaptureLimits() {
5400 if (quick) {
5401 logln("Skipping test. Runs in exhuastive mode only.");
5402 return;
5403 }
5404 const int32_t goodLimit = 1000000; // Pattern w this many groups builds successfully.
5405 const int32_t failLimit = 10000000; // Pattern exceeds internal limits, fails to compile.
5406 char nnbuf[100];
5407 UnicodeString pattern;
5408 int32_t nn;
5409
5410 for (nn=1; nn<goodLimit; nn++) {
5411 sprintf(nnbuf, "(?<nn%d>)", nn);
5412 pattern.append(UnicodeString(nnbuf, -1, US_INV));
5413 }
5414 UErrorCode status = U_ZERO_ERROR;
5415 RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
5416 REGEX_CHECK_STATUS;
5417 for (nn=1; nn<goodLimit; nn++) {
5418 sprintf(nnbuf, "nn%d", nn);
5419 int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
5420 REGEX_ASSERT(nn == groupNum);
5421 if (nn != groupNum) {
5422 break;
5423 }
5424 }
5425 delete pat;
5426
5427 pattern.remove();
5428 for (nn=1; nn<failLimit; nn++) {
5429 sprintf(nnbuf, "(?<nn%d>)", nn);
5430 pattern.append(UnicodeString(nnbuf, -1, US_INV));
5431 }
5432 status = U_ZERO_ERROR;
5433 pat = RegexPattern::compile(pattern, 0, status);
5434 REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
5435 delete pat;
5436 }
5437
5438
5439 //--------------------------------------------------------------
5440 //
5441 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
5442 //
5443 //---------------------------------------------------------------
Bug7651()5444 void RegexTest::Bug7651() {
5445 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5446 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5447 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5448 UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5449 UnicodeString s("#ff @abcd This is test");
5450 RegexPattern *REPattern = NULL;
5451 RegexMatcher *REMatcher = NULL;
5452 UErrorCode status = U_ZERO_ERROR;
5453 UParseError pe;
5454
5455 REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5456 REGEX_CHECK_STATUS;
5457 REMatcher = REPattern->matcher(s, status);
5458 REGEX_CHECK_STATUS;
5459 REGEX_ASSERT(REMatcher->find());
5460 REGEX_ASSERT(REMatcher->start(status) == 0);
5461 delete REPattern;
5462 delete REMatcher;
5463 status = U_ZERO_ERROR;
5464
5465 REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5466 REGEX_CHECK_STATUS;
5467 REMatcher = REPattern->matcher(s, status);
5468 REGEX_CHECK_STATUS;
5469 REGEX_ASSERT(REMatcher->find());
5470 REGEX_ASSERT(REMatcher->start(status) == 0);
5471 delete REPattern;
5472 delete REMatcher;
5473 status = U_ZERO_ERROR;
5474 }
5475
Bug7740()5476 void RegexTest::Bug7740() {
5477 UErrorCode status = U_ZERO_ERROR;
5478 UnicodeString pattern = "(a)";
5479 UnicodeString text = "abcdef";
5480 RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5481 REGEX_CHECK_STATUS;
5482 REGEX_ASSERT(m->lookingAt(status));
5483 REGEX_CHECK_STATUS;
5484 status = U_ILLEGAL_ARGUMENT_ERROR;
5485 UnicodeString s = m->group(1, status); // Bug 7740: segfault here.
5486 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5487 REGEX_ASSERT(s == "");
5488 delete m;
5489 }
5490
// Bug 8479: was crashing with a bogus UnicodeString as input.
5492
Bug8479()5493 void RegexTest::Bug8479() {
5494 UErrorCode status = U_ZERO_ERROR;
5495
5496 RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5497 REGEX_CHECK_STATUS;
5498 if (U_SUCCESS(status))
5499 {
5500 UnicodeString str;
5501 str.setToBogus();
5502 pMatcher->reset(str);
5503 status = U_ZERO_ERROR;
5504 pMatcher->matches(status);
5505 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5506 delete pMatcher;
5507 }
5508 }
5509
5510
5511 // Bug 7029
Bug7029()5512 void RegexTest::Bug7029() {
5513 UErrorCode status = U_ZERO_ERROR;
5514
5515 RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5516 UnicodeString text = "abc.def";
5517 UnicodeString splits[10];
5518 REGEX_CHECK_STATUS;
5519 int32_t numFields = pMatcher->split(text, splits, 10, status);
5520 REGEX_CHECK_STATUS;
5521 REGEX_ASSERT(numFields == 8);
5522 delete pMatcher;
5523 }
5524
5525 // Bug 9283
//   This test is checking for the existence of any supplemental characters that case-fold
//   to a BMP character.
//
//   At the time of this writing there are none. If any should appear in a subsequent release
//   of Unicode, the code in regular expressions compilation that determines the longest
//   possible match for a literal string will need to be enhanced.
5532 //
5533 // See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5534 // for details on what to do in case of a failure of this test.
5535 //
Bug9283()5536 void RegexTest::Bug9283() {
5537 #if !UCONFIG_NO_NORMALIZATION
5538 UErrorCode status = U_ZERO_ERROR;
5539 UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5540 REGEX_CHECK_STATUS;
5541 int32_t index;
5542 UChar32 c;
5543 for (index=0; ; index++) {
5544 c = supplementalsWithCaseFolding.charAt(index);
5545 if (c == -1) {
5546 break;
5547 }
5548 UnicodeString cf = UnicodeString(c).foldCase();
5549 REGEX_ASSERT(cf.length() >= 2);
5550 }
5551 #endif /* #if !UCONFIG_NO_NORMALIZATION */
5552 }
5553
5554
CheckInvBufSize()5555 void RegexTest::CheckInvBufSize() {
5556 if(inv_next>=INV_BUFSIZ) {
5557 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5558 __FILE__, INV_BUFSIZ, inv_next);
5559 } else {
5560 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5561 }
5562 }
5563
5564
Bug10459()5565 void RegexTest::Bug10459() {
5566 UErrorCode status = U_ZERO_ERROR;
5567 UnicodeString patternString("(txt)");
5568 UnicodeString txtString("txt");
5569
5570 UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
5571 REGEX_CHECK_STATUS;
5572 UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
5573 REGEX_CHECK_STATUS;
5574
5575 URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
5576 REGEX_CHECK_STATUS;
5577
5578 uregex_setUText(icu_re, utext_txt, &status);
5579 REGEX_CHECK_STATUS;
5580
5581 // The bug was that calling uregex_group() before doing a matching operation
5582 // was causing a segfault. Only for Regular Expressions created from UText.
5583 // It should set an U_REGEX_INVALID_STATE.
5584
5585 UChar buf[100];
5586 int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
5587 REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
5588 REGEX_ASSERT(len == 0);
5589
5590 uregex_close(icu_re);
5591 utext_close(utext_pat);
5592 utext_close(utext_txt);
5593 }
5594
TestCaseInsensitiveStarters()5595 void RegexTest::TestCaseInsensitiveStarters() {
5596 // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5597 // become stale because of new Unicode characters.
5598 // If it is stale, rerun the generation tool
5599 // svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5600 // and replace the embedded data in i18n/regexcmp.cpp
5601
5602 for (UChar32 cp=0; cp<=0x10ffff; cp++) {
5603 if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
5604 continue;
5605 }
5606 UnicodeSet s(cp, cp);
5607 s.closeOver(USET_CASE_INSENSITIVE);
5608 UnicodeSetIterator setIter(s);
5609 while (setIter.next()) {
5610 if (!setIter.isString()) {
5611 continue;
5612 }
5613 const UnicodeString &str = setIter.getString();
5614 UChar32 firstChar = str.char32At(0);
5615 UnicodeSet starters;
5616 RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
5617 if (!starters.contains(cp)) {
5618 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
5619 return;
5620 }
5621 }
5622 }
5623 }
5624
5625
TestBug11049()5626 void RegexTest::TestBug11049() {
5627 // Original bug report: pattern with match start consisting of one of several individual characters,
5628 // and the text being matched ending with a supplementary character. find() would read past the
5629 // end of the input text when searching for potential match starting points.
5630
5631 // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5632 // detect the bad read.
5633
5634 TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
5635 TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
5636
5637 // Test again with a pattern starting with a single character,
5638 // which takes a different code path than starting with an OR expression,
5639 // but with similar logic.
5640 TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
5641 TestCase11049("C", "string matches at end C", TRUE, __LINE__);
5642 }
5643
5644 // Run a single test case from TestBug11049(). Internal function.
TestCase11049(const char * pattern,const char * data,UBool expectMatch,int32_t lineNumber)5645 void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
5646 UErrorCode status = U_ZERO_ERROR;
5647 UnicodeString patternString = UnicodeString(pattern).unescape();
5648 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5649
5650 UnicodeString dataString = UnicodeString(data).unescape();
5651 UChar *exactBuffer = new UChar[dataString.length()];
5652 dataString.extract(exactBuffer, dataString.length(), status);
5653 UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
5654
5655 LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
5656 REGEX_CHECK_STATUS;
5657 matcher->reset(ut);
5658 UBool result = matcher->find();
5659 if (result != expectMatch) {
5660 errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5661 __FILE__, lineNumber, expectMatch, result, pattern, data);
5662 }
5663
5664 // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5665 // off-by-one on find() with match at the last code point.
5666 // Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5667 // because string.unescape() will only shrink it.
5668 char * utf8Buffer = new char[uprv_strlen(data)+1];
5669 u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status);
5670 REGEX_CHECK_STATUS;
5671 ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
5672 REGEX_CHECK_STATUS;
5673 matcher->reset(ut);
5674 result = matcher->find();
5675 if (result != expectMatch) {
5676 errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5677 __FILE__, lineNumber, expectMatch, result, pattern, data);
5678 }
5679 delete [] utf8Buffer;
5680
5681 utext_close(ut);
5682 delete [] exactBuffer;
5683 }
5684
5685
TestBug11371()5686 void RegexTest::TestBug11371() {
5687 if (quick) {
5688 logln("Skipping test. Runs in exhuastive mode only.");
5689 return;
5690 }
5691 UErrorCode status = U_ZERO_ERROR;
5692 UnicodeString patternString;
5693
5694 for (int i=0; i<8000000; i++) {
5695 patternString.append(UnicodeString("()"));
5696 }
5697 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5698 if (status != U_REGEX_PATTERN_TOO_BIG) {
5699 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5700 __FILE__, __LINE__, u_errorName(status));
5701 }
5702
5703 status = U_ZERO_ERROR;
5704 patternString = "(";
5705 for (int i=0; i<20000000; i++) {
5706 patternString.append(UnicodeString("A++"));
5707 }
5708 patternString.append(UnicodeString("){0}B++"));
5709 LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
5710 if (status != U_REGEX_PATTERN_TOO_BIG) {
5711 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5712 __FILE__, __LINE__, u_errorName(status));
5713 }
5714
5715 // Pattern with too much string data, such that string indexes overflow operand data field size
5716 // in compiled instruction.
5717 status = U_ZERO_ERROR;
5718 patternString = "";
5719 while (patternString.length() < 0x00ffffff) {
5720 patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5721 }
5722 patternString.append(UnicodeString("X? trailing string"));
5723 LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
5724 if (status != U_REGEX_PATTERN_TOO_BIG) {
5725 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5726 __FILE__, __LINE__, u_errorName(status));
5727 }
5728 }
5729
TestBug11480()5730 void RegexTest::TestBug11480() {
5731 // C API, get capture group of a group that does not participate in the match.
5732 // (Returns a zero length string, with nul termination,
5733 // indistinguishable from a group with a zero length match.)
5734
5735 UErrorCode status = U_ZERO_ERROR;
5736 URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
5737 REGEX_CHECK_STATUS;
5738 UnicodeString text = UNICODE_STRING_SIMPLE("A");
5739 uregex_setText(re, text.getBuffer(), text.length(), &status);
5740 REGEX_CHECK_STATUS;
5741 REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
5742 UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
5743 int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
5744 REGEX_ASSERT(length == 0);
5745 REGEX_ASSERT(buf[0] == 13);
5746 REGEX_ASSERT(buf[1] == 0);
5747 REGEX_ASSERT(buf[2] == 13);
5748 uregex_close(re);
5749
5750 // UText C++ API, length of match is 0 for non-participating matches.
5751 UText ut = UTEXT_INITIALIZER;
5752 utext_openUnicodeString(&ut, &text, &status);
5753 RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status);
5754 REGEX_CHECK_STATUS;
5755 matcher.reset(&ut);
5756 REGEX_ASSERT(matcher.lookingAt(0, status));
5757
5758 // UText C++ API, Capture group 1 matches "A", position 0, length 1.
5759 int64_t groupLen = -666;
5760 UText group = UTEXT_INITIALIZER;
5761 matcher.group(1, &group, groupLen, status);
5762 REGEX_CHECK_STATUS;
5763 REGEX_ASSERT(groupLen == 1);
5764 REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
5765
5766 // Capture group 2, the (B), does not participate in the match.
5767 matcher.group(2, &group, groupLen, status);
5768 REGEX_CHECK_STATUS;
5769 REGEX_ASSERT(groupLen == 0);
5770 REGEX_ASSERT(matcher.start(2, status) == -1);
5771 REGEX_CHECK_STATUS;
5772 }
5773
TestBug12884()5774 void RegexTest::TestBug12884() {
5775 // setTimeLimit() was not effective for empty sub-patterns with large {minimum counts}
5776 UnicodeString pattern(u"(((((((){120}){11}){11}){11}){80}){11}){4}");
5777 UnicodeString text(u"hello");
5778 UErrorCode status = U_ZERO_ERROR;
5779 RegexMatcher m(pattern, text, 0, status);
5780 REGEX_CHECK_STATUS;
5781 m.setTimeLimit(5, status);
5782 m.find(status);
5783 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5784
5785 // Non-greedy loops. They take a different code path during matching.
5786 UnicodeString ngPattern(u"(((((((){120}?){11}?){11}?){11}?){80}?){11}?){4}?");
5787 status = U_ZERO_ERROR;
5788 RegexMatcher ngM(ngPattern, text, 0, status);
5789 REGEX_CHECK_STATUS;
5790 ngM.setTimeLimit(5, status);
5791 ngM.find(status);
5792 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5793
5794 // UText, wrapping non-UTF-16 text, also takes a different execution path.
5795 const char *text8 = u8"¿Qué es Unicode? Unicode proporciona un número único para cada"
5796 "carácter, sin importar la plataforma, sin importar el programa,"
5797 "sin importar el idioma.";
5798 status = U_ZERO_ERROR;
5799 LocalUTextPointer ut(utext_openUTF8(NULL, text8, -1, &status));
5800 REGEX_CHECK_STATUS;
5801 m.reset(ut.getAlias());
5802 m.find(status);
5803 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5804
5805 status = U_ZERO_ERROR;
5806 ngM.reset(ut.getAlias());
5807 ngM.find(status);
5808 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5809 }
5810
// Bug 13631. A find() of a pattern with a zero-length look-behind assertion
//            can cause a read past the end of the input text.
//            The failure is seen when running this test with Clang's Address Sanitizer.
5814
TestBug13631()5815 void RegexTest::TestBug13631() {
5816 const UChar *pats[] = { u"(?<!^)",
5817 u"(?<=^)",
5818 nullptr
5819 };
5820 for (const UChar **pat=pats; *pat; ++pat) {
5821 UErrorCode status = U_ZERO_ERROR;
5822 UnicodeString upat(*pat);
5823 RegexMatcher matcher(upat, 0, status);
5824 const UChar s =u'a';
5825 UText *ut = utext_openUChars(nullptr, &s, 1, &status);
5826 REGEX_CHECK_STATUS;
5827 matcher.reset(ut);
5828 while (matcher.find()) {
5829 }
5830 utext_close(ut);
5831 }
5832 }
5833
5834
5835 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
5836