1 /********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 2002-2015, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6
7 //
8 // regextst.cpp
9 //
10 // ICU Regular Expressions test, part of intltest.
11 //
12
13 /*
14 NOTE!!
15
16 PLEASE be careful about ASCII assumptions in this test.
17 This test is one of the worst repeat offenders.
18 If you have questions, contact someone on the ICU PMC
19 who has access to an EBCDIC system.
20
21 */
22
23 #include "intltest.h"
24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
25
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <string.h>
29
30 #include "unicode/localpointer.h"
31 #include "unicode/regex.h"
32 #include "unicode/uchar.h"
33 #include "unicode/ucnv.h"
34 #include "unicode/uniset.h"
35 #include "unicode/uregex.h"
36 #include "unicode/usetiter.h"
37 #include "unicode/ustring.h"
38 #include "unicode/utext.h"
39
40 #include "regextst.h"
41 #include "regexcmp.h"
42 #include "uvector.h"
43 #include "util.h"
44 #include "cmemory.h"
45 #include "cstring.h"
46 #include "uinvchar.h"
47
// Compile-time switch, defined as 0 (off) here. Its consumers are outside this
// part of the file.
// NOTE(review): presumably gates tests that modify the input string while a
// matcher is attached to it -- confirm at the point of use.
#define SUPPORT_MUTATING_INPUT_STRING 0
49
50 //---------------------------------------------------------------------------
51 //
52 // Test class boilerplate
53 //
54 //---------------------------------------------------------------------------
RegexTest()55 RegexTest::RegexTest()
56 {
57 }
58
59
~RegexTest()60 RegexTest::~RegexTest()
61 {
62 }
63
64
65
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)66 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
67 {
68 if (exec) logln("TestSuite RegexTest: ");
69 switch (index) {
70
71 case 0: name = "Basic";
72 if (exec) Basic();
73 break;
74 case 1: name = "API_Match";
75 if (exec) API_Match();
76 break;
77 case 2: name = "API_Replace";
78 if (exec) API_Replace();
79 break;
80 case 3: name = "API_Pattern";
81 if (exec) API_Pattern();
82 break;
83 case 4:
84 #if !UCONFIG_NO_FILE_IO
85 name = "Extended";
86 if (exec) Extended();
87 #else
88 name = "skip";
89 #endif
90 break;
91 case 5: name = "Errors";
92 if (exec) Errors();
93 break;
94 case 6: name = "PerlTests";
95 if (exec) PerlTests();
96 break;
97 case 7: name = "Callbacks";
98 if (exec) Callbacks();
99 break;
100 case 8: name = "FindProgressCallbacks";
101 if (exec) FindProgressCallbacks();
102 break;
103 case 9: name = "Bug 6149";
104 if (exec) Bug6149();
105 break;
106 case 10: name = "UTextBasic";
107 if (exec) UTextBasic();
108 break;
109 case 11: name = "API_Match_UTF8";
110 if (exec) API_Match_UTF8();
111 break;
112 case 12: name = "API_Replace_UTF8";
113 if (exec) API_Replace_UTF8();
114 break;
115 case 13: name = "API_Pattern_UTF8";
116 if (exec) API_Pattern_UTF8();
117 break;
118 case 14: name = "PerlTestsUTF8";
119 if (exec) PerlTestsUTF8();
120 break;
121 case 15: name = "PreAllocatedUTextCAPI";
122 if (exec) PreAllocatedUTextCAPI();
123 break;
124 case 16: name = "Bug 7651";
125 if (exec) Bug7651();
126 break;
127 case 17: name = "Bug 7740";
128 if (exec) Bug7740();
129 break;
130 case 18: name = "Bug 8479";
131 if (exec) Bug8479();
132 break;
133 case 19: name = "Bug 7029";
134 if (exec) Bug7029();
135 break;
136 case 20: name = "CheckInvBufSize";
137 if (exec) CheckInvBufSize();
138 break;
139 case 21: name = "Bug 9283";
140 if (exec) Bug9283();
141 break;
142 case 22: name = "Bug10459";
143 if (exec) Bug10459();
144 break;
145 case 23: name = "TestCaseInsensitiveStarters";
146 if (exec) TestCaseInsensitiveStarters();
147 break;
148 case 24: name = "TestBug11049";
149 if (exec) TestBug11049();
150 break;
151 case 25: name = "TestBug11371";
152 if (exec) TestBug11371();
153 break;
154 case 26: name = "TestBug11480";
155 if (exec) TestBug11480();
156 break;
157 case 27: name = "NamedCapture";
158 if (exec) NamedCapture();
159 break;
160 case 28: name = "NamedCaptureLimits";
161 if (exec) NamedCaptureLimits();
162 break;
163 default: name = "";
164 break; //needed to end loop
165 }
166 }
167
168
169
/**
 * Calls utext_openUTF8 after, potentially, converting invariant text from the
 * compilation codepage into ASCII. (Forward declaration; the definition
 * appears further down in this file.)
 *
 * @param ut      UText to open, handed on to utext_openUTF8.
 * @param inv     text restricted to invariant characters, in the platform codepage.
 * @param length  number of bytes in inv, or -1 to have strlen(inv) used.
 * @param status  ICU error code, handed on to utext_openUTF8. On platforms
 *                whose charset family is not ASCII it is set to
 *                U_MEMORY_ALLOCATION_ERROR when the conversion buffer is full.
 * @return        the result of utext_openUTF8, or NULL when the conversion
 *                buffer is full.
 * @see utext_openUTF8
 */
static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
176
177 //---------------------------------------------------------------------------
178 //
179 // Error Checking / Reporting macros used in all of the tests.
180 //
181 //---------------------------------------------------------------------------
182
utextToPrintable(char * buf,int32_t bufLen,UText * text)183 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
184 int64_t oldIndex = utext_getNativeIndex(text);
185 utext_setNativeIndex(text, 0);
186 char *bufPtr = buf;
187 UChar32 c = utext_next32From(text, 0);
188 while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
189 if (0x000020<=c && c<0x00007e) {
190 *bufPtr = c;
191 } else {
192 #if 0
193 sprintf(bufPtr,"U+%04X", c);
194 bufPtr+= strlen(bufPtr)-1;
195 #else
196 *bufPtr = '%';
197 #endif
198 }
199 bufPtr++;
200 c = UTEXT_NEXT32(text);
201 }
202 *bufPtr = 0;
203 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
204 char *ebuf = (char*)malloc(bufLen);
205 uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
206 uprv_strncpy(buf, ebuf, bufLen);
207 free((void*)ebuf);
208 #endif
209 utext_setNativeIndex(text, oldIndex);
210 }
211
212
213 static char ASSERT_BUF[1024];
214
extractToAssertBuf(const UnicodeString & message)215 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
216 if(message.length()==0) {
217 strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
218 } else {
219 UnicodeString buf;
220 IntlTest::prettify(message,buf);
221 if(buf.length()==0) {
222 strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
223 } else {
224 buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
225 if(ASSERT_BUF[0]==0) {
226 ASSERT_BUF[0]=0;
227 for(int32_t i=0;i<buf.length();i++) {
228 UChar ch = buf[i];
229 sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
230 }
231 }
232 }
233 }
234 ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
235 return ASSERT_BUF;
236 }
237
// Logs a printable rendering (at most what fits in a 200 byte buffer) of the
// UText `text`, labelled with the invoking file, line and the argument's spelling.
#define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}

// If the local variable `status` holds a failure, reports it with dataerrln()
// and RETURNS from the enclosing function (which must therefore return void).
#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure. status=%s", \
__FILE__, __LINE__, u_errorName(status)); return;}}

// Reports a failure with errln() if `expr` compares equal to FALSE.
// Does not return; the test carries on after a failed assertion.
#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}

// Evaluates `expr` with a fresh, block-local `status` (shadowing any outer
// variable of that name) and reports a failure unless that status ends up
// equal to `errcode`. `expr` must refer to `status` for this to be meaningful.
#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \
__LINE__, u_errorName(errcode), u_errorName(status));};}

// Variant of REGEX_CHECK_STATUS for helper functions: also reports the caller
// supplied `line`, prints the numeric status, and does NOT return.
#define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
"RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }}

// Variant of REGEX_ASSERT for helper functions: also reports the caller
// supplied `line`, and RETURNS from the enclosing (void) function on failure.
#define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}

// Compares a UnicodeString with an expected invariant-character C string and
// reports a failure if they differ.
// expected: const char * , restricted to invariant characters.
// actual: const UnicodeString &
// Note: `expected` is evaluated twice, and the message is built with
// extractToAssertBuf(), which must therefore be accessible at the point of use.
#define REGEX_ASSERT_UNISTR(expected, actual) { \
if (UnicodeString(expected, -1, US_INV) != (actual)) { \
errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \
__FILE__, __LINE__, expected, extractToAssertBuf(actual));};}
261
262
testUTextEqual(UText * uta,UText * utb)263 static UBool testUTextEqual(UText *uta, UText *utb) {
264 UChar32 ca = 0;
265 UChar32 cb = 0;
266 utext_setNativeIndex(uta, 0);
267 utext_setNativeIndex(utb, 0);
268 do {
269 ca = utext_next32(uta);
270 cb = utext_next32(utb);
271 if (ca != cb) {
272 break;
273 }
274 } while (ca != U_SENTINEL);
275 return ca == cb;
276 }
277
278
279 /**
280 * @param expected expected text in UTF-8 (not platform) codepage
281 */
assertUText(const char * expected,UText * actual,const char * file,int line)282 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
283 UErrorCode status = U_ZERO_ERROR;
284 UText expectedText = UTEXT_INITIALIZER;
285 utext_openUTF8(&expectedText, expected, -1, &status);
286 if(U_FAILURE(status)) {
287 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
288 return;
289 }
290 if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
291 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
292 return;
293 }
294 utext_setNativeIndex(actual, 0);
295 if (!testUTextEqual(&expectedText, actual)) {
296 char buf[201 /*21*/];
297 char expectedBuf[201];
298 utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
299 utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
300 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
301 }
302 utext_close(&expectedText);
303 }
304 /**
305 * @param expected invariant (platform local text) input
306 */
307
assertUTextInvariant(const char * expected,UText * actual,const char * file,int line)308 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
309 UErrorCode status = U_ZERO_ERROR;
310 UText expectedText = UTEXT_INITIALIZER;
311 regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
312 if(U_FAILURE(status)) {
313 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
314 return;
315 }
316 utext_setNativeIndex(actual, 0);
317 if (!testUTextEqual(&expectedText, actual)) {
318 char buf[201 /*21*/];
319 char expectedBuf[201];
320 utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
321 utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
322 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
323 }
324 utext_close(&expectedText);
325 }
326
/**
 * Asserts that the UText `actual` has the content of `expected`.
 * Assumes utf-8 input for `expected`. Forwards to assertUText() together with
 * the invoking file and line.
 */
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
/**
 * Asserts that the UText `actual` has the content of `expected`.
 * Assumes Invariant input (platform codepage) for `expected`. Forwards to
 * assertUTextInvariant() together with the invoking file and line.
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
335
336 /**
337 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
338 * passed into utext_openUTF8. An error will be given if
339 * INV_BUFSIZ is too small. It's only used on EBCDIC systems.
340 */
341
342 #define INV_BUFSIZ 2048 /* increase this if too small */
343
344 static int64_t inv_next=0;
345
346 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
347 static char inv_buf[INV_BUFSIZ];
348 #endif
349
regextst_openUTF8FromInvariant(UText * ut,const char * inv,int64_t length,UErrorCode * status)350 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
351 if(length==-1) length=strlen(inv);
352 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
353 inv_next+=length;
354 return utext_openUTF8(ut, inv, length, status);
355 #else
356 if(inv_next+length+1>INV_BUFSIZ) {
357 fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
358 __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
359 *status = U_MEMORY_ALLOCATION_ERROR;
360 return NULL;
361 }
362
363 unsigned char *buf = (unsigned char*)inv_buf+inv_next;
364 uprv_aestrncpy(buf, (const uint8_t*)inv, length);
365 inv_next+=length;
366
367 #if 0
368 fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
369 #endif
370
371 return utext_openUTF8(ut, (const char*)buf, length, status);
372 #endif
373 }
374
375
//---------------------------------------------------------------------------
//
// REGEX_TESTLM Macro + invocation function to simplify writing quick tests
// for the LookingAt() and Match() functions.
//
// usage:
// REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
//
// The expected results are UBool - TRUE or FALSE.
// The input text is unescaped. The pattern is not.
//
// Each invocation runs the case twice: once through doRegexLMTest()
// and once through doRegexLMTestUTF8(). Both are passed the invoking
// source line, which they include in any failure message.
//
//---------------------------------------------------------------------------

#define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
391
doRegexLMTest(const char * pat,const char * text,UBool looking,UBool match,int32_t line)392 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
393 const UnicodeString pattern(pat, -1, US_INV);
394 const UnicodeString inputText(text, -1, US_INV);
395 UErrorCode status = U_ZERO_ERROR;
396 UParseError pe;
397 RegexPattern *REPattern = NULL;
398 RegexMatcher *REMatcher = NULL;
399 UBool retVal = TRUE;
400
401 UnicodeString patString(pat, -1, US_INV);
402 REPattern = RegexPattern::compile(patString, 0, pe, status);
403 if (U_FAILURE(status)) {
404 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
405 line, u_errorName(status));
406 return FALSE;
407 }
408 if (line==376) { REPattern->dumpPattern();}
409
410 UnicodeString inputString(inputText);
411 UnicodeString unEscapedInput = inputString.unescape();
412 REMatcher = REPattern->matcher(unEscapedInput, status);
413 if (U_FAILURE(status)) {
414 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
415 line, u_errorName(status));
416 return FALSE;
417 }
418
419 UBool actualmatch;
420 actualmatch = REMatcher->lookingAt(status);
421 if (U_FAILURE(status)) {
422 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
423 line, u_errorName(status));
424 retVal = FALSE;
425 }
426 if (actualmatch != looking) {
427 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
428 retVal = FALSE;
429 }
430
431 status = U_ZERO_ERROR;
432 actualmatch = REMatcher->matches(status);
433 if (U_FAILURE(status)) {
434 errln("RegexTest failure in matches() at line %d. Status = %s\n",
435 line, u_errorName(status));
436 retVal = FALSE;
437 }
438 if (actualmatch != match) {
439 errln("RegexTest: wrong return from matches() at line %d.\n", line);
440 retVal = FALSE;
441 }
442
443 if (retVal == FALSE) {
444 REPattern->dumpPattern();
445 }
446
447 delete REPattern;
448 delete REMatcher;
449 return retVal;
450 }
451
452
doRegexLMTestUTF8(const char * pat,const char * text,UBool looking,UBool match,int32_t line)453 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
454 UText pattern = UTEXT_INITIALIZER;
455 int32_t inputUTF8Length;
456 char *textChars = NULL;
457 UText inputText = UTEXT_INITIALIZER;
458 UErrorCode status = U_ZERO_ERROR;
459 UParseError pe;
460 RegexPattern *REPattern = NULL;
461 RegexMatcher *REMatcher = NULL;
462 UBool retVal = TRUE;
463
464 regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
465 REPattern = RegexPattern::compile(&pattern, 0, pe, status);
466 if (U_FAILURE(status)) {
467 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
468 line, u_errorName(status));
469 return FALSE;
470 }
471
472 UnicodeString inputString(text, -1, US_INV);
473 UnicodeString unEscapedInput = inputString.unescape();
474 LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
475 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
476
477 inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
478 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
479 // UTF-8 does not allow unpaired surrogates, so this could actually happen
480 logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line, u_errorName(status));
481 return TRUE; // not a failure of the Regex engine
482 }
483 status = U_ZERO_ERROR; // buffer overflow
484 textChars = new char[inputUTF8Length+1];
485 unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
486 utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
487
488 REMatcher = &REPattern->matcher(status)->reset(&inputText);
489 if (U_FAILURE(status)) {
490 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
491 line, u_errorName(status));
492 return FALSE;
493 }
494
495 UBool actualmatch;
496 actualmatch = REMatcher->lookingAt(status);
497 if (U_FAILURE(status)) {
498 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
499 line, u_errorName(status));
500 retVal = FALSE;
501 }
502 if (actualmatch != looking) {
503 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
504 retVal = FALSE;
505 }
506
507 status = U_ZERO_ERROR;
508 actualmatch = REMatcher->matches(status);
509 if (U_FAILURE(status)) {
510 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
511 line, u_errorName(status));
512 retVal = FALSE;
513 }
514 if (actualmatch != match) {
515 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
516 retVal = FALSE;
517 }
518
519 if (retVal == FALSE) {
520 REPattern->dumpPattern();
521 }
522
523 delete REPattern;
524 delete REMatcher;
525 utext_close(&inputText);
526 utext_close(&pattern);
527 delete[] textChars;
528 return retVal;
529 }
530
531
532
//---------------------------------------------------------------------------
//
// REGEX_ERR Macro + invocation function to simplify writing tests
// regex tests for incorrect patterns
//
// usage:
// REGEX_ERR("pattern", expected error line, column, expected status);
//
// Forwards to regex_err(), adding the invoking source line.
// Note that the macro body already ends in a semicolon.
//
//---------------------------------------------------------------------------
#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
543
regex_err(const char * pat,int32_t errLine,int32_t errCol,UErrorCode expectedStatus,int32_t line)544 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
545 UErrorCode expectedStatus, int32_t line) {
546 UnicodeString pattern(pat);
547
548 UErrorCode status = U_ZERO_ERROR;
549 UParseError pe;
550 RegexPattern *callerPattern = NULL;
551
552 //
553 // Compile the caller's pattern
554 //
555 UnicodeString patString(pat);
556 callerPattern = RegexPattern::compile(patString, 0, pe, status);
557 if (status != expectedStatus) {
558 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
559 } else {
560 if (status != U_ZERO_ERROR) {
561 if (pe.line != errLine || pe.offset != errCol) {
562 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
563 line, errLine, errCol, pe.line, pe.offset);
564 }
565 }
566 }
567
568 delete callerPattern;
569
570 //
571 // Compile again, using a UTF-8-based UText
572 //
573 UText patternText = UTEXT_INITIALIZER;
574 regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
575 callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
576 if (status != expectedStatus) {
577 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
578 } else {
579 if (status != U_ZERO_ERROR) {
580 if (pe.line != errLine || pe.offset != errCol) {
581 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
582 line, errLine, errCol, pe.line, pe.offset);
583 }
584 }
585 }
586
587 delete callerPattern;
588 utext_close(&patternText);
589 }
590
591
592
593 //---------------------------------------------------------------------------
594 //
595 // Basic Check for basic functionality of regex pattern matching.
596 // Avoid the use of REGEX_FIND test macro, which has
597 // substantial dependencies on basic Regex functionality.
598 //
599 //---------------------------------------------------------------------------
void RegexTest::Basic() {
    // Every REGEX_TESTLM(pattern, input, lookingAt, matches) line below is one
    // test case: the last two arguments are the expected results of lookingAt()
    // and matches(). The input goes through unescape(); the pattern does not.
    // Failures are reported by source line, so keep one case per line.


    //
    // Debug - slide failing test cases early
    //
    // (Disabled scratch area: enable it to run a single case in isolation.
    //  Note that it ends with exit(1), which terminates the whole test run.)
#if 0
    {
        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
        UParseError pe;
        UErrorCode status = U_ZERO_ERROR;
        RegexPattern *pattern;
        pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
        pattern->dumpPattern();
        RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
        UBool result = m->find();
        printf("result = %d\n", result);
        // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    }
    exit(1);
#endif


    //
    // Pattern with parentheses
    //
    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
    REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);

    //
    // Patterns with *
    //
    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);

    REGEX_TESTLM("a*", "", TRUE, TRUE);
    REGEX_TESTLM("a*", "b", TRUE, FALSE);


    //
    // Patterns with "."
    //
    REGEX_TESTLM(".", "abc", TRUE, FALSE);
    REGEX_TESTLM("...", "abc", TRUE, TRUE);
    REGEX_TESTLM("....", "abc", FALSE, FALSE);
    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);

    //
    // Patterns with * applied to chars at end of literal string
    //
    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);

    //
    // Supplemental chars match as single chars, not a pair of surrogates.
    //
    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);


    //
    // UnicodeSets in the pattern
    //
    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);       // NOTE(review): same case as the line above
    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);

    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurrences.
    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);

    //
    // OR operator in patterns
    //
    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
    REGEX_TESTLM("a|b", "b", TRUE, TRUE);

    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);

    //
    // +
    //
    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
    REGEX_TESTLM("b+", "", FALSE, FALSE);
    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);

    //
    // ?
    //
    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);

    //
    // Escape sequences that become single literal chars, handled internally
    // by ICU's Unescape.
    //

    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);

    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input

    // Escape of special chars in patterns
    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
}
746
747
748 //---------------------------------------------------------------------------
749 //
750 // UTextBasic Check for quirks that are specific to the UText
751 // implementation.
752 //
753 //---------------------------------------------------------------------------
UTextBasic()754 void RegexTest::UTextBasic() {
755 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
756 UErrorCode status = U_ZERO_ERROR;
757 UText pattern = UTEXT_INITIALIZER;
758 utext_openUTF8(&pattern, str_abc, -1, &status);
759 RegexMatcher matcher(&pattern, 0, status);
760 REGEX_CHECK_STATUS;
761
762 UText input = UTEXT_INITIALIZER;
763 utext_openUTF8(&input, str_abc, -1, &status);
764 REGEX_CHECK_STATUS;
765 matcher.reset(&input);
766 REGEX_CHECK_STATUS;
767 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
768
769 matcher.reset(matcher.inputText());
770 REGEX_CHECK_STATUS;
771 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
772
773 utext_close(&pattern);
774 utext_close(&input);
775 }
776
777
778 //---------------------------------------------------------------------------
779 //
780 // API_Match Test that the API for class RegexMatcher
781 // is present and nominally working, but excluding functions
782 // implementing replace operations.
783 //
784 //---------------------------------------------------------------------------
API_Match()785 void RegexTest::API_Match() {
786 UParseError pe;
787 UErrorCode status=U_ZERO_ERROR;
788 int32_t flags = 0;
789
790 //
791 // Debug - slide failing test cases early
792 //
793 #if 0
794 {
795 }
796 return;
797 #endif
798
799 //
800 // Simple pattern compilation
801 //
802 {
803 UnicodeString re("abc");
804 RegexPattern *pat2;
805 pat2 = RegexPattern::compile(re, flags, pe, status);
806 REGEX_CHECK_STATUS;
807
808 UnicodeString inStr1 = "abcdef this is a test";
809 UnicodeString instr2 = "not abc";
810 UnicodeString empty = "";
811
812
813 //
814 // Matcher creation and reset.
815 //
816 RegexMatcher *m1 = pat2->matcher(inStr1, status);
817 REGEX_CHECK_STATUS;
818 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
819 REGEX_ASSERT(m1->input() == inStr1);
820 m1->reset(instr2);
821 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
822 REGEX_ASSERT(m1->input() == instr2);
823 m1->reset(inStr1);
824 REGEX_ASSERT(m1->input() == inStr1);
825 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
826 m1->reset(empty);
827 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
828 REGEX_ASSERT(m1->input() == empty);
829 REGEX_ASSERT(&m1->pattern() == pat2);
830
831 //
832 // reset(pos, status)
833 //
834 m1->reset(inStr1);
835 m1->reset(4, status);
836 REGEX_CHECK_STATUS;
837 REGEX_ASSERT(m1->input() == inStr1);
838 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
839
840 m1->reset(-1, status);
841 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
842 status = U_ZERO_ERROR;
843
844 m1->reset(0, status);
845 REGEX_CHECK_STATUS;
846 status = U_ZERO_ERROR;
847
848 int32_t len = m1->input().length();
849 m1->reset(len-1, status);
850 REGEX_CHECK_STATUS;
851 status = U_ZERO_ERROR;
852
853 m1->reset(len, status);
854 REGEX_CHECK_STATUS;
855 status = U_ZERO_ERROR;
856
857 m1->reset(len+1, status);
858 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
859 status = U_ZERO_ERROR;
860
861 //
862 // match(pos, status)
863 //
864 m1->reset(instr2);
865 REGEX_ASSERT(m1->matches(4, status) == TRUE);
866 m1->reset();
867 REGEX_ASSERT(m1->matches(3, status) == FALSE);
868 m1->reset();
869 REGEX_ASSERT(m1->matches(5, status) == FALSE);
870 REGEX_ASSERT(m1->matches(4, status) == TRUE);
871 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
872 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
873
874 // Match() at end of string should fail, but should not
875 // be an error.
876 status = U_ZERO_ERROR;
877 len = m1->input().length();
878 REGEX_ASSERT(m1->matches(len, status) == FALSE);
879 REGEX_CHECK_STATUS;
880
881 // Match beyond end of string should fail with an error.
882 status = U_ZERO_ERROR;
883 REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
884 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
885
886 // Successful match at end of string.
887 {
888 status = U_ZERO_ERROR;
889 RegexMatcher m("A?", 0, status); // will match zero length string.
890 REGEX_CHECK_STATUS;
891 m.reset(inStr1);
892 len = inStr1.length();
893 REGEX_ASSERT(m.matches(len, status) == TRUE);
894 REGEX_CHECK_STATUS;
895 m.reset(empty);
896 REGEX_ASSERT(m.matches(0, status) == TRUE);
897 REGEX_CHECK_STATUS;
898 }
899
900
901 //
902 // lookingAt(pos, status)
903 //
904 status = U_ZERO_ERROR;
905 m1->reset(instr2); // "not abc"
906 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
907 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
908 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
909 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
910 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
911 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
912 status = U_ZERO_ERROR;
913 len = m1->input().length();
914 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
915 REGEX_CHECK_STATUS;
916 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
917 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
918
919 delete m1;
920 delete pat2;
921 }
922
923
924 //
925 // Capture Group.
926 // RegexMatcher::start();
927 // RegexMatcher::end();
928 // RegexMatcher::groupCount();
929 //
930 {
931 int32_t flags=0;
932 UParseError pe;
933 UErrorCode status=U_ZERO_ERROR;
934
935 UnicodeString re("01(23(45)67)(.*)");
936 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
937 REGEX_CHECK_STATUS;
938 UnicodeString data = "0123456789";
939
940 RegexMatcher *matcher = pat->matcher(data, status);
941 REGEX_CHECK_STATUS;
942 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
943 static const int32_t matchStarts[] = {0, 2, 4, 8};
944 static const int32_t matchEnds[] = {10, 8, 6, 10};
945 int32_t i;
946 for (i=0; i<4; i++) {
947 int32_t actualStart = matcher->start(i, status);
948 REGEX_CHECK_STATUS;
949 if (actualStart != matchStarts[i]) {
950 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
951 __LINE__, i, matchStarts[i], actualStart);
952 }
953 int32_t actualEnd = matcher->end(i, status);
954 REGEX_CHECK_STATUS;
955 if (actualEnd != matchEnds[i]) {
956 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
957 __LINE__, i, matchEnds[i], actualEnd);
958 }
959 }
960
961 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
962 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
963
964 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
965 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
966 matcher->reset();
967 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
968
969 matcher->lookingAt(status);
970 REGEX_ASSERT(matcher->group(status) == "0123456789");
971 REGEX_ASSERT(matcher->group(0, status) == "0123456789");
972 REGEX_ASSERT(matcher->group(1, status) == "234567" );
973 REGEX_ASSERT(matcher->group(2, status) == "45" );
974 REGEX_ASSERT(matcher->group(3, status) == "89" );
975 REGEX_CHECK_STATUS;
976 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
977 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
978 matcher->reset();
979 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
980
981 delete matcher;
982 delete pat;
983
984 }
985
986 //
987 // find
988 //
989 {
990 int32_t flags=0;
991 UParseError pe;
992 UErrorCode status=U_ZERO_ERROR;
993
994 UnicodeString re("abc");
995 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
996 REGEX_CHECK_STATUS;
997 UnicodeString data = ".abc..abc...abc..";
998 // 012345678901234567
999
1000 RegexMatcher *matcher = pat->matcher(data, status);
1001 REGEX_CHECK_STATUS;
1002 REGEX_ASSERT(matcher->find());
1003 REGEX_ASSERT(matcher->start(status) == 1);
1004 REGEX_ASSERT(matcher->find());
1005 REGEX_ASSERT(matcher->start(status) == 6);
1006 REGEX_ASSERT(matcher->find());
1007 REGEX_ASSERT(matcher->start(status) == 12);
1008 REGEX_ASSERT(matcher->find() == FALSE);
1009 REGEX_ASSERT(matcher->find() == FALSE);
1010
1011 matcher->reset();
1012 REGEX_ASSERT(matcher->find());
1013 REGEX_ASSERT(matcher->start(status) == 1);
1014
1015 REGEX_ASSERT(matcher->find(0, status));
1016 REGEX_ASSERT(matcher->start(status) == 1);
1017 REGEX_ASSERT(matcher->find(1, status));
1018 REGEX_ASSERT(matcher->start(status) == 1);
1019 REGEX_ASSERT(matcher->find(2, status));
1020 REGEX_ASSERT(matcher->start(status) == 6);
1021 REGEX_ASSERT(matcher->find(12, status));
1022 REGEX_ASSERT(matcher->start(status) == 12);
1023 REGEX_ASSERT(matcher->find(13, status) == FALSE);
1024 REGEX_ASSERT(matcher->find(16, status) == FALSE);
1025 REGEX_ASSERT(matcher->find(17, status) == FALSE);
1026 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
1027
1028 status = U_ZERO_ERROR;
1029 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1030 status = U_ZERO_ERROR;
1031 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
1032
1033 REGEX_ASSERT(matcher->groupCount() == 0);
1034
1035 delete matcher;
1036 delete pat;
1037 }
1038
1039
1040 //
1041 // find, with \G in pattern (true if at the end of a previous match).
1042 //
1043 {
1044 int32_t flags=0;
1045 UParseError pe;
1046 UErrorCode status=U_ZERO_ERROR;
1047
1048 UnicodeString re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
1049 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1050 REGEX_CHECK_STATUS;
1051 UnicodeString data = ".abcabc.abc..";
1052 // 012345678901234567
1053
1054 RegexMatcher *matcher = pat->matcher(data, status);
1055 REGEX_CHECK_STATUS;
1056 REGEX_ASSERT(matcher->find());
1057 REGEX_ASSERT(matcher->start(status) == 0);
1058 REGEX_ASSERT(matcher->start(1, status) == -1);
1059 REGEX_ASSERT(matcher->start(2, status) == 1);
1060
1061 REGEX_ASSERT(matcher->find());
1062 REGEX_ASSERT(matcher->start(status) == 4);
1063 REGEX_ASSERT(matcher->start(1, status) == 4);
1064 REGEX_ASSERT(matcher->start(2, status) == -1);
1065 REGEX_CHECK_STATUS;
1066
1067 delete matcher;
1068 delete pat;
1069 }
1070
1071 //
1072 // find with zero length matches, match position should bump ahead
1073 // to prevent loops.
1074 //
1075 {
1076 int32_t i;
1077 UErrorCode status=U_ZERO_ERROR;
1078 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
1079 // using an always-true look-ahead.
1080 REGEX_CHECK_STATUS;
1081 UnicodeString s(" ");
1082 m.reset(s);
1083 for (i=0; ; i++) {
1084 if (m.find() == FALSE) {
1085 break;
1086 }
1087 REGEX_ASSERT(m.start(status) == i);
1088 REGEX_ASSERT(m.end(status) == i);
1089 }
1090 REGEX_ASSERT(i==5);
1091
1092 // Check that the bump goes over surrogate pairs OK
1093 s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1094 s = s.unescape();
1095 m.reset(s);
1096 for (i=0; ; i+=2) {
1097 if (m.find() == FALSE) {
1098 break;
1099 }
1100 REGEX_ASSERT(m.start(status) == i);
1101 REGEX_ASSERT(m.end(status) == i);
1102 }
1103 REGEX_ASSERT(i==10);
1104 }
1105 {
1106 // find() loop breaking test.
1107 // with pattern of /.?/, should see a series of one char matches, then a single
1108 // match of zero length at the end of the input string.
1109 int32_t i;
1110 UErrorCode status=U_ZERO_ERROR;
1111 RegexMatcher m(".?", 0, status);
1112 REGEX_CHECK_STATUS;
1113 UnicodeString s(" ");
1114 m.reset(s);
1115 for (i=0; ; i++) {
1116 if (m.find() == FALSE) {
1117 break;
1118 }
1119 REGEX_ASSERT(m.start(status) == i);
1120 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1121 }
1122 REGEX_ASSERT(i==5);
1123 }
1124
1125
1126 //
1127 // Matchers with no input string behave as if they had an empty input string.
1128 //
1129
1130 {
1131 UErrorCode status = U_ZERO_ERROR;
1132 RegexMatcher m(".?", 0, status);
1133 REGEX_CHECK_STATUS;
1134 REGEX_ASSERT(m.find());
1135 REGEX_ASSERT(m.start(status) == 0);
1136 REGEX_ASSERT(m.input() == "");
1137 }
1138 {
1139 UErrorCode status = U_ZERO_ERROR;
1140 RegexPattern *p = RegexPattern::compile(".", 0, status);
1141 RegexMatcher *m = p->matcher(status);
1142 REGEX_CHECK_STATUS;
1143
1144 REGEX_ASSERT(m->find() == FALSE);
1145 REGEX_ASSERT(m->input() == "");
1146 delete m;
1147 delete p;
1148 }
1149
1150 //
1151 // Regions
1152 //
1153 {
1154 UErrorCode status = U_ZERO_ERROR;
1155 UnicodeString testString("This is test data");
1156 RegexMatcher m(".*", testString, 0, status);
1157 REGEX_CHECK_STATUS;
1158 REGEX_ASSERT(m.regionStart() == 0);
1159 REGEX_ASSERT(m.regionEnd() == testString.length());
1160 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1161 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1162
1163 m.region(2,4, status);
1164 REGEX_CHECK_STATUS;
1165 REGEX_ASSERT(m.matches(status));
1166 REGEX_ASSERT(m.start(status)==2);
1167 REGEX_ASSERT(m.end(status)==4);
1168 REGEX_CHECK_STATUS;
1169
1170 m.reset();
1171 REGEX_ASSERT(m.regionStart() == 0);
1172 REGEX_ASSERT(m.regionEnd() == testString.length());
1173
1174 UnicodeString shorterString("short");
1175 m.reset(shorterString);
1176 REGEX_ASSERT(m.regionStart() == 0);
1177 REGEX_ASSERT(m.regionEnd() == shorterString.length());
1178
1179 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1180 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1181 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1182 REGEX_ASSERT(&m == &m.reset());
1183 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1184
1185 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1186 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1187 REGEX_ASSERT(&m == &m.reset());
1188 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1189
1190 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1191 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1192 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1193 REGEX_ASSERT(&m == &m.reset());
1194 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1195
1196 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1197 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1198 REGEX_ASSERT(&m == &m.reset());
1199 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1200
1201 }
1202
1203 //
1204 // hitEnd() and requireEnd()
1205 //
1206 {
1207 UErrorCode status = U_ZERO_ERROR;
1208 UnicodeString testString("aabb");
1209 RegexMatcher m1(".*", testString, 0, status);
1210 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1211 REGEX_ASSERT(m1.hitEnd() == TRUE);
1212 REGEX_ASSERT(m1.requireEnd() == FALSE);
1213 REGEX_CHECK_STATUS;
1214
1215 status = U_ZERO_ERROR;
1216 RegexMatcher m2("a*", testString, 0, status);
1217 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1218 REGEX_ASSERT(m2.hitEnd() == FALSE);
1219 REGEX_ASSERT(m2.requireEnd() == FALSE);
1220 REGEX_CHECK_STATUS;
1221
1222 status = U_ZERO_ERROR;
1223 RegexMatcher m3(".*$", testString, 0, status);
1224 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1225 REGEX_ASSERT(m3.hitEnd() == TRUE);
1226 REGEX_ASSERT(m3.requireEnd() == TRUE);
1227 REGEX_CHECK_STATUS;
1228 }
1229
1230
1231 //
1232 // Compilation error on reset with UChar *
1233 // These were a hazard that people were stumbling over with runtime errors.
1234 // Changed them to compiler errors by adding private methods that more closely
1235 // matched the incorrect use of the functions.
1236 //
1237 #if 0
1238 {
1239 UErrorCode status = U_ZERO_ERROR;
1240 UChar ucharString[20];
1241 RegexMatcher m(".", 0, status);
1242 m.reset(ucharString); // should not compile.
1243
1244 RegexPattern *p = RegexPattern::compile(".", 0, status);
1245 RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile.
1246
1247 RegexMatcher m3(".", ucharString, 0, status); // Should not compile
1248 }
1249 #endif
1250
1251 //
1252 // Time Outs.
1253 // Note: These tests will need to be changed when the regexp engine is
1254 // able to detect and cut short the exponential time behavior on
1255 // this type of match.
1256 //
1257 {
1258 UErrorCode status = U_ZERO_ERROR;
1259 // Enough 'a's in the string to cause the match to time out.
1260 // (Each additional 'a' doubles the time)
1261 UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1262 RegexMatcher matcher("(a+)+b", testString, 0, status);
1263 REGEX_CHECK_STATUS;
1264 REGEX_ASSERT(matcher.getTimeLimit() == 0);
1265 matcher.setTimeLimit(100, status);
1266 REGEX_ASSERT(matcher.getTimeLimit() == 100);
1267 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1268 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1269 }
1270 {
1271 UErrorCode status = U_ZERO_ERROR;
1272 // Few enough 'a's to slip in under the time limit.
1273 UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1274 RegexMatcher matcher("(a+)+b", testString, 0, status);
1275 REGEX_CHECK_STATUS;
1276 matcher.setTimeLimit(100, status);
1277 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1278 REGEX_CHECK_STATUS;
1279 }
1280
1281 //
1282 // Stack Limits
1283 //
1284 {
1285 UErrorCode status = U_ZERO_ERROR;
1286 UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
1287
1288 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1289 // of the '+', and makes the stack frames larger.
1290 RegexMatcher matcher("(A)+A$", testString, 0, status);
1291
1292 // With the default stack, this match should fail to run
1293 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1294 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1295
1296 // With unlimited stack, it should run
1297 status = U_ZERO_ERROR;
1298 matcher.setStackLimit(0, status);
1299 REGEX_CHECK_STATUS;
1300 REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1301 REGEX_CHECK_STATUS;
1302 REGEX_ASSERT(matcher.getStackLimit() == 0);
1303
1304 // With a limited stack, the match should fail
1305 status = U_ZERO_ERROR;
1306 matcher.setStackLimit(10000, status);
1307 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1308 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1309 REGEX_ASSERT(matcher.getStackLimit() == 10000);
1310 }
1311
1312 // A pattern that doesn't save state should work with
1313 // a minimal sized stack
1314 {
1315 UErrorCode status = U_ZERO_ERROR;
1316 UnicodeString testString = "abc";
1317 RegexMatcher matcher("abc", testString, 0, status);
1318 REGEX_CHECK_STATUS;
1319 matcher.setStackLimit(30, status);
1320 REGEX_CHECK_STATUS;
1321 REGEX_ASSERT(matcher.matches(status) == TRUE);
1322 REGEX_CHECK_STATUS;
1323 REGEX_ASSERT(matcher.getStackLimit() == 30);
1324
1325 // Negative stack sizes should fail
1326 status = U_ZERO_ERROR;
1327 matcher.setStackLimit(1000, status);
1328 REGEX_CHECK_STATUS;
1329 matcher.setStackLimit(-1, status);
1330 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1331 REGEX_ASSERT(matcher.getStackLimit() == 1000);
1332 }
1333
1334
1335 }
1336
1337
1338
1339
1340
1341
1342 //---------------------------------------------------------------------------
1343 //
1344 // API_Replace API test for class RegexMatcher, testing the
1345 // Replace family of functions.
1346 //
1347 //---------------------------------------------------------------------------
API_Replace()1348 void RegexTest::API_Replace() {
1349 //
1350 // Replace
1351 //
1352 int32_t flags=0;
1353 UParseError pe;
1354 UErrorCode status=U_ZERO_ERROR;
1355
1356 UnicodeString re("abc");
1357 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1358 REGEX_CHECK_STATUS;
1359 UnicodeString data = ".abc..abc...abc..";
1360 // 012345678901234567
1361 RegexMatcher *matcher = pat->matcher(data, status);
1362
1363 //
1364 // Plain vanilla matches.
1365 //
1366 UnicodeString dest;
1367 dest = matcher->replaceFirst("yz", status);
1368 REGEX_CHECK_STATUS;
1369 REGEX_ASSERT(dest == ".yz..abc...abc..");
1370
1371 dest = matcher->replaceAll("yz", status);
1372 REGEX_CHECK_STATUS;
1373 REGEX_ASSERT(dest == ".yz..yz...yz..");
1374
1375 //
1376 // Plain vanilla non-matches.
1377 //
1378 UnicodeString d2 = ".abx..abx...abx..";
1379 matcher->reset(d2);
1380 dest = matcher->replaceFirst("yz", status);
1381 REGEX_CHECK_STATUS;
1382 REGEX_ASSERT(dest == ".abx..abx...abx..");
1383
1384 dest = matcher->replaceAll("yz", status);
1385 REGEX_CHECK_STATUS;
1386 REGEX_ASSERT(dest == ".abx..abx...abx..");
1387
1388 //
1389 // Empty source string
1390 //
1391 UnicodeString d3 = "";
1392 matcher->reset(d3);
1393 dest = matcher->replaceFirst("yz", status);
1394 REGEX_CHECK_STATUS;
1395 REGEX_ASSERT(dest == "");
1396
1397 dest = matcher->replaceAll("yz", status);
1398 REGEX_CHECK_STATUS;
1399 REGEX_ASSERT(dest == "");
1400
1401 //
1402 // Empty substitution string
1403 //
1404 matcher->reset(data); // ".abc..abc...abc.."
1405 dest = matcher->replaceFirst("", status);
1406 REGEX_CHECK_STATUS;
1407 REGEX_ASSERT(dest == "...abc...abc..");
1408
1409 dest = matcher->replaceAll("", status);
1410 REGEX_CHECK_STATUS;
1411 REGEX_ASSERT(dest == "........");
1412
1413 //
1414 // match whole string
1415 //
1416 UnicodeString d4 = "abc";
1417 matcher->reset(d4);
1418 dest = matcher->replaceFirst("xyz", status);
1419 REGEX_CHECK_STATUS;
1420 REGEX_ASSERT(dest == "xyz");
1421
1422 dest = matcher->replaceAll("xyz", status);
1423 REGEX_CHECK_STATUS;
1424 REGEX_ASSERT(dest == "xyz");
1425
1426 //
1427 // Capture Group, simple case
1428 //
1429 UnicodeString re2("a(..)");
1430 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1431 REGEX_CHECK_STATUS;
1432 UnicodeString d5 = "abcdefg";
1433 RegexMatcher *matcher2 = pat2->matcher(d5, status);
1434 REGEX_CHECK_STATUS;
1435 dest = matcher2->replaceFirst("$1$1", status);
1436 REGEX_CHECK_STATUS;
1437 REGEX_ASSERT(dest == "bcbcdefg");
1438
1439 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1440 REGEX_CHECK_STATUS;
1441 REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1442
1443 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1444 REGEX_ASSERT(U_FAILURE(status));
1445 status = U_ZERO_ERROR;
1446
1447 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1448 replacement = replacement.unescape();
1449 dest = matcher2->replaceFirst(replacement, status);
1450 REGEX_CHECK_STATUS;
1451 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1452
1453 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1454
1455
1456 //
1457 // Replacement String with \u hex escapes
1458 //
1459 {
1460 UnicodeString src = "abc 1 abc 2 abc 3";
1461 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1462 matcher->reset(src);
1463 UnicodeString result = matcher->replaceAll(substitute, status);
1464 REGEX_CHECK_STATUS;
1465 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1466 }
1467 {
1468 UnicodeString src = "abc !";
1469 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1470 matcher->reset(src);
1471 UnicodeString result = matcher->replaceAll(substitute, status);
1472 REGEX_CHECK_STATUS;
1473 UnicodeString expected = UnicodeString("--");
1474 expected.append((UChar32)0x10000);
1475 expected.append("-- !");
1476 REGEX_ASSERT(result == expected);
1477 }
1478 // TODO: need more through testing of capture substitutions.
1479
1480 // Bug 4057
1481 //
1482 {
1483 status = U_ZERO_ERROR;
1484 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1485 RegexMatcher m("ss(.*?)ee", 0, status);
1486 REGEX_CHECK_STATUS;
1487 UnicodeString result;
1488
1489 // Multiple finds do NOT bump up the previous appendReplacement postion.
1490 m.reset(s);
1491 m.find();
1492 m.find();
1493 m.appendReplacement(result, "ooh", status);
1494 REGEX_CHECK_STATUS;
1495 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1496
1497 // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1498 status = U_ZERO_ERROR;
1499 result.truncate(0);
1500 m.reset(10, status);
1501 m.find();
1502 m.find();
1503 m.appendReplacement(result, "ooh", status);
1504 REGEX_CHECK_STATUS;
1505 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1506
1507 // find() at interior of string, appendReplacemnt still starts at beginning.
1508 status = U_ZERO_ERROR;
1509 result.truncate(0);
1510 m.reset();
1511 m.find(10, status);
1512 m.find();
1513 m.appendReplacement(result, "ooh", status);
1514 REGEX_CHECK_STATUS;
1515 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1516
1517 m.appendTail(result);
1518 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1519
1520 }
1521
1522 delete matcher2;
1523 delete pat2;
1524 delete matcher;
1525 delete pat;
1526 }
1527
1528
1529 //---------------------------------------------------------------------------
1530 //
1531 // API_Pattern Test that the API for class RegexPattern is
1532 // present and nominally working.
1533 //
1534 //---------------------------------------------------------------------------
API_Pattern()1535 void RegexTest::API_Pattern() {
1536 RegexPattern pata; // Test default constructor to not crash.
1537 RegexPattern patb;
1538
1539 REGEX_ASSERT(pata == patb);
1540 REGEX_ASSERT(pata == pata);
1541
1542 UnicodeString re1("abc[a-l][m-z]");
1543 UnicodeString re2("def");
1544 UErrorCode status = U_ZERO_ERROR;
1545 UParseError pe;
1546
1547 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status);
1548 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status);
1549 REGEX_CHECK_STATUS;
1550 REGEX_ASSERT(*pat1 == *pat1);
1551 REGEX_ASSERT(*pat1 != pata);
1552
1553 // Assign
1554 patb = *pat1;
1555 REGEX_ASSERT(patb == *pat1);
1556
1557 // Copy Construct
1558 RegexPattern patc(*pat1);
1559 REGEX_ASSERT(patc == *pat1);
1560 REGEX_ASSERT(patb == patc);
1561 REGEX_ASSERT(pat1 != pat2);
1562 patb = *pat2;
1563 REGEX_ASSERT(patb != patc);
1564 REGEX_ASSERT(patb == *pat2);
1565
1566 // Compile with no flags.
1567 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status);
1568 REGEX_ASSERT(*pat1a == *pat1);
1569
1570 REGEX_ASSERT(pat1a->flags() == 0);
1571
1572 // Compile with different flags should be not equal
1573 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1574 REGEX_CHECK_STATUS;
1575
1576 REGEX_ASSERT(*pat1b != *pat1a);
1577 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1578 REGEX_ASSERT(pat1a->flags() == 0);
1579 delete pat1b;
1580
1581 // clone
1582 RegexPattern *pat1c = pat1->clone();
1583 REGEX_ASSERT(*pat1c == *pat1);
1584 REGEX_ASSERT(*pat1c != *pat2);
1585
1586 delete pat1c;
1587 delete pat1a;
1588 delete pat1;
1589 delete pat2;
1590
1591
1592 //
1593 // Verify that a matcher created from a cloned pattern works.
1594 // (Jitterbug 3423)
1595 //
1596 {
1597 UErrorCode status = U_ZERO_ERROR;
1598 RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1599 RegexPattern *pClone = pSource->clone();
1600 delete pSource;
1601 RegexMatcher *mFromClone = pClone->matcher(status);
1602 REGEX_CHECK_STATUS;
1603 UnicodeString s = "Hello World";
1604 mFromClone->reset(s);
1605 REGEX_ASSERT(mFromClone->find() == TRUE);
1606 REGEX_ASSERT(mFromClone->group(status) == "Hello");
1607 REGEX_ASSERT(mFromClone->find() == TRUE);
1608 REGEX_ASSERT(mFromClone->group(status) == "World");
1609 REGEX_ASSERT(mFromClone->find() == FALSE);
1610 delete mFromClone;
1611 delete pClone;
1612 }
1613
1614 //
1615 // matches convenience API
1616 //
1617 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1618 REGEX_CHECK_STATUS;
1619 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1620 REGEX_CHECK_STATUS;
1621 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1622 REGEX_CHECK_STATUS;
1623 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1624 REGEX_CHECK_STATUS;
1625 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1626 REGEX_CHECK_STATUS;
1627 status = U_INDEX_OUTOFBOUNDS_ERROR;
1628 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1629 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1630
1631
1632 //
1633 // Split()
1634 //
1635 status = U_ZERO_ERROR;
1636 pat1 = RegexPattern::compile(" +", pe, status);
1637 REGEX_CHECK_STATUS;
1638 UnicodeString fields[10];
1639
1640 int32_t n;
1641 n = pat1->split("Now is the time", fields, 10, status);
1642 REGEX_CHECK_STATUS;
1643 REGEX_ASSERT(n==4);
1644 REGEX_ASSERT(fields[0]=="Now");
1645 REGEX_ASSERT(fields[1]=="is");
1646 REGEX_ASSERT(fields[2]=="the");
1647 REGEX_ASSERT(fields[3]=="time");
1648 REGEX_ASSERT(fields[4]=="");
1649
1650 n = pat1->split("Now is the time", fields, 2, status);
1651 REGEX_CHECK_STATUS;
1652 REGEX_ASSERT(n==2);
1653 REGEX_ASSERT(fields[0]=="Now");
1654 REGEX_ASSERT(fields[1]=="is the time");
1655 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
1656
1657 fields[1] = "*";
1658 status = U_ZERO_ERROR;
1659 n = pat1->split("Now is the time", fields, 1, status);
1660 REGEX_CHECK_STATUS;
1661 REGEX_ASSERT(n==1);
1662 REGEX_ASSERT(fields[0]=="Now is the time");
1663 REGEX_ASSERT(fields[1]=="*");
1664 status = U_ZERO_ERROR;
1665
1666 n = pat1->split(" Now is the time ", fields, 10, status);
1667 REGEX_CHECK_STATUS;
1668 REGEX_ASSERT(n==6);
1669 REGEX_ASSERT(fields[0]=="");
1670 REGEX_ASSERT(fields[1]=="Now");
1671 REGEX_ASSERT(fields[2]=="is");
1672 REGEX_ASSERT(fields[3]=="the");
1673 REGEX_ASSERT(fields[4]=="time");
1674 REGEX_ASSERT(fields[5]=="");
1675
1676 n = pat1->split(" ", fields, 10, status);
1677 REGEX_CHECK_STATUS;
1678 REGEX_ASSERT(n==2);
1679 REGEX_ASSERT(fields[0]=="");
1680 REGEX_ASSERT(fields[1]=="");
1681
1682 fields[0] = "foo";
1683 n = pat1->split("", fields, 10, status);
1684 REGEX_CHECK_STATUS;
1685 REGEX_ASSERT(n==0);
1686 REGEX_ASSERT(fields[0]=="foo");
1687
1688 delete pat1;
1689
1690 // split, with a pattern with (capture)
1691 pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status);
1692 REGEX_CHECK_STATUS;
1693
1694 status = U_ZERO_ERROR;
1695 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1696 REGEX_CHECK_STATUS;
1697 REGEX_ASSERT(n==7);
1698 REGEX_ASSERT(fields[0]=="");
1699 REGEX_ASSERT(fields[1]=="a");
1700 REGEX_ASSERT(fields[2]=="Now is ");
1701 REGEX_ASSERT(fields[3]=="b");
1702 REGEX_ASSERT(fields[4]=="the time");
1703 REGEX_ASSERT(fields[5]=="c");
1704 REGEX_ASSERT(fields[6]=="");
1705 REGEX_ASSERT(status==U_ZERO_ERROR);
1706
1707 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
1708 REGEX_CHECK_STATUS;
1709 REGEX_ASSERT(n==7);
1710 REGEX_ASSERT(fields[0]==" ");
1711 REGEX_ASSERT(fields[1]=="a");
1712 REGEX_ASSERT(fields[2]=="Now is ");
1713 REGEX_ASSERT(fields[3]=="b");
1714 REGEX_ASSERT(fields[4]=="the time");
1715 REGEX_ASSERT(fields[5]=="c");
1716 REGEX_ASSERT(fields[6]=="");
1717
1718 status = U_ZERO_ERROR;
1719 fields[6] = "foo";
1720 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
1721 REGEX_CHECK_STATUS;
1722 REGEX_ASSERT(n==6);
1723 REGEX_ASSERT(fields[0]==" ");
1724 REGEX_ASSERT(fields[1]=="a");
1725 REGEX_ASSERT(fields[2]=="Now is ");
1726 REGEX_ASSERT(fields[3]=="b");
1727 REGEX_ASSERT(fields[4]=="the time");
1728 REGEX_ASSERT(fields[5]==""); // All text following "<c>" field delimiter.
1729 REGEX_ASSERT(fields[6]=="foo");
1730
1731 status = U_ZERO_ERROR;
1732 fields[5] = "foo";
1733 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
1734 REGEX_CHECK_STATUS;
1735 REGEX_ASSERT(n==5);
1736 REGEX_ASSERT(fields[0]==" ");
1737 REGEX_ASSERT(fields[1]=="a");
1738 REGEX_ASSERT(fields[2]=="Now is ");
1739 REGEX_ASSERT(fields[3]=="b");
1740 REGEX_ASSERT(fields[4]=="the time<c>");
1741 REGEX_ASSERT(fields[5]=="foo");
1742
1743 status = U_ZERO_ERROR;
1744 fields[5] = "foo";
1745 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
1746 REGEX_CHECK_STATUS;
1747 REGEX_ASSERT(n==5);
1748 REGEX_ASSERT(fields[0]==" ");
1749 REGEX_ASSERT(fields[1]=="a");
1750 REGEX_ASSERT(fields[2]=="Now is ");
1751 REGEX_ASSERT(fields[3]=="b");
1752 REGEX_ASSERT(fields[4]=="the time");
1753 REGEX_ASSERT(fields[5]=="foo");
1754
1755 status = U_ZERO_ERROR;
1756 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
1757 REGEX_CHECK_STATUS;
1758 REGEX_ASSERT(n==4);
1759 REGEX_ASSERT(fields[0]==" ");
1760 REGEX_ASSERT(fields[1]=="a");
1761 REGEX_ASSERT(fields[2]=="Now is ");
1762 REGEX_ASSERT(fields[3]=="the time<c>");
1763 status = U_ZERO_ERROR;
1764 delete pat1;
1765
1766 pat1 = RegexPattern::compile("([-,])", pe, status);
1767 REGEX_CHECK_STATUS;
1768 n = pat1->split("1-10,20", fields, 10, status);
1769 REGEX_CHECK_STATUS;
1770 REGEX_ASSERT(n==5);
1771 REGEX_ASSERT(fields[0]=="1");
1772 REGEX_ASSERT(fields[1]=="-");
1773 REGEX_ASSERT(fields[2]=="10");
1774 REGEX_ASSERT(fields[3]==",");
1775 REGEX_ASSERT(fields[4]=="20");
1776 delete pat1;
1777
1778 // Test split of string with empty trailing fields
1779 pat1 = RegexPattern::compile(",", pe, status);
1780 REGEX_CHECK_STATUS;
1781 n = pat1->split("a,b,c,", fields, 10, status);
1782 REGEX_CHECK_STATUS;
1783 REGEX_ASSERT(n==4);
1784 REGEX_ASSERT(fields[0]=="a");
1785 REGEX_ASSERT(fields[1]=="b");
1786 REGEX_ASSERT(fields[2]=="c");
1787 REGEX_ASSERT(fields[3]=="");
1788
1789 n = pat1->split("a,,,", fields, 10, status);
1790 REGEX_CHECK_STATUS;
1791 REGEX_ASSERT(n==4);
1792 REGEX_ASSERT(fields[0]=="a");
1793 REGEX_ASSERT(fields[1]=="");
1794 REGEX_ASSERT(fields[2]=="");
1795 REGEX_ASSERT(fields[3]=="");
1796 delete pat1;
1797
1798 // Split Separator with zero length match.
1799 pat1 = RegexPattern::compile(":?", pe, status);
1800 REGEX_CHECK_STATUS;
1801 n = pat1->split("abc", fields, 10, status);
1802 REGEX_CHECK_STATUS;
1803 REGEX_ASSERT(n==5);
1804 REGEX_ASSERT(fields[0]=="");
1805 REGEX_ASSERT(fields[1]=="a");
1806 REGEX_ASSERT(fields[2]=="b");
1807 REGEX_ASSERT(fields[3]=="c");
1808 REGEX_ASSERT(fields[4]=="");
1809
1810 delete pat1;
1811
1812 //
1813 // RegexPattern::pattern()
1814 //
1815 pat1 = new RegexPattern();
1816 REGEX_ASSERT(pat1->pattern() == "");
1817 delete pat1;
1818
1819 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1820 REGEX_CHECK_STATUS;
1821 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1822 delete pat1;
1823
1824
1825 //
1826 // classID functions
1827 //
1828 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1829 REGEX_CHECK_STATUS;
1830 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1831 REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1832 UnicodeString Hello("Hello, world.");
1833 RegexMatcher *m = pat1->matcher(Hello, status);
1834 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1835 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1836 REGEX_ASSERT(m->getDynamicClassID() != NULL);
1837 delete m;
1838 delete pat1;
1839
1840 }
1841
1842 //---------------------------------------------------------------------------
1843 //
1844 // API_Match_UTF8 Test that the alternate engine for class RegexMatcher
1845 // is present and working, but excluding functions
1846 // implementing replace operations.
1847 //
1848 //---------------------------------------------------------------------------
API_Match_UTF8()1849 void RegexTest::API_Match_UTF8() {
1850 UParseError pe;
1851 UErrorCode status=U_ZERO_ERROR;
1852 int32_t flags = 0;
1853
1854 //
1855 // Debug - slide failing test cases early
1856 //
1857 #if 0
1858 {
1859 }
1860 return;
1861 #endif
1862
1863 //
1864 // Simple pattern compilation
1865 //
1866 {
1867 UText re = UTEXT_INITIALIZER;
1868 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1869 REGEX_VERBOSE_TEXT(&re);
1870 RegexPattern *pat2;
1871 pat2 = RegexPattern::compile(&re, flags, pe, status);
1872 REGEX_CHECK_STATUS;
1873
1874 UText input1 = UTEXT_INITIALIZER;
1875 UText input2 = UTEXT_INITIALIZER;
1876 UText empty = UTEXT_INITIALIZER;
1877 regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1878 REGEX_VERBOSE_TEXT(&input1);
1879 regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1880 REGEX_VERBOSE_TEXT(&input2);
1881 utext_openUChars(&empty, NULL, 0, &status);
1882
1883 int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1884 int32_t input2Len = strlen("not abc");
1885
1886
1887 //
1888 // Matcher creation and reset.
1889 //
1890 RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1891 REGEX_CHECK_STATUS;
1892 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1893 const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1894 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1895 m1->reset(&input2);
1896 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1897 const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1898 REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1899 m1->reset(&input1);
1900 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1901 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1902 m1->reset(&empty);
1903 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1904 REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1905
1906 //
1907 // reset(pos, status)
1908 //
1909 m1->reset(&input1);
1910 m1->reset(4, status);
1911 REGEX_CHECK_STATUS;
1912 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1913 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1914
1915 m1->reset(-1, status);
1916 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1917 status = U_ZERO_ERROR;
1918
1919 m1->reset(0, status);
1920 REGEX_CHECK_STATUS;
1921 status = U_ZERO_ERROR;
1922
1923 m1->reset(input1Len-1, status);
1924 REGEX_CHECK_STATUS;
1925 status = U_ZERO_ERROR;
1926
1927 m1->reset(input1Len, status);
1928 REGEX_CHECK_STATUS;
1929 status = U_ZERO_ERROR;
1930
1931 m1->reset(input1Len+1, status);
1932 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1933 status = U_ZERO_ERROR;
1934
1935 //
1936 // match(pos, status)
1937 //
1938 m1->reset(&input2);
1939 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1940 m1->reset();
1941 REGEX_ASSERT(m1->matches(3, status) == FALSE);
1942 m1->reset();
1943 REGEX_ASSERT(m1->matches(5, status) == FALSE);
1944 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1945 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1946 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1947
1948 // Match() at end of string should fail, but should not
1949 // be an error.
1950 status = U_ZERO_ERROR;
1951 REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1952 REGEX_CHECK_STATUS;
1953
1954 // Match beyond end of string should fail with an error.
1955 status = U_ZERO_ERROR;
1956 REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1957 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1958
1959 // Successful match at end of string.
1960 {
1961 status = U_ZERO_ERROR;
1962 RegexMatcher m("A?", 0, status); // will match zero length string.
1963 REGEX_CHECK_STATUS;
1964 m.reset(&input1);
1965 REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1966 REGEX_CHECK_STATUS;
1967 m.reset(&empty);
1968 REGEX_ASSERT(m.matches(0, status) == TRUE);
1969 REGEX_CHECK_STATUS;
1970 }
1971
1972
1973 //
1974 // lookingAt(pos, status)
1975 //
1976 status = U_ZERO_ERROR;
1977 m1->reset(&input2); // "not abc"
1978 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1979 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1980 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1981 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1982 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1983 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1984 status = U_ZERO_ERROR;
1985 REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1986 REGEX_CHECK_STATUS;
1987 REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1988 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1989
1990 delete m1;
1991 delete pat2;
1992
1993 utext_close(&re);
1994 utext_close(&input1);
1995 utext_close(&input2);
1996 utext_close(&empty);
1997 }
1998
1999
2000 //
2001 // Capture Group.
2002 // RegexMatcher::start();
2003 // RegexMatcher::end();
2004 // RegexMatcher::groupCount();
2005 //
2006 { // Scope: capture-group offsets (start/end) and group() text retrieval after one successful lookingAt().
2007 int32_t flags=0;
2008 UParseError pe;
2009 UErrorCode status=U_ZERO_ERROR;
2010 UText re=UTEXT_INITIALIZER;
2011 const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
2012 utext_openUTF8(&re, str_01234567_pat, -1, &status);
2013
2014 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2015 REGEX_CHECK_STATUS;
2016
2017 UText input = UTEXT_INITIALIZER;
2018 const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2019 utext_openUTF8(&input, str_0123456789, -1, &status);
2020
2021 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2022 REGEX_CHECK_STATUS;
2023 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
2024 static const int32_t matchStarts[] = {0, 2, 4, 8}; // expected start offsets of groups 0..3
2025 static const int32_t matchEnds[] = {10, 8, 6, 10}; // expected end offsets of groups 0..3
2026 int32_t i;
2027 for (i=0; i<4; i++) { // group 0 (whole match) plus the pattern's three capture groups
2028 int32_t actualStart = matcher->start(i, status);
2029 REGEX_CHECK_STATUS;
2030 if (actualStart != matchStarts[i]) {
2031 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
2032 __FILE__, __LINE__, i, matchStarts[i], actualStart);
2033 }
2034 int32_t actualEnd = matcher->end(i, status);
2035 REGEX_CHECK_STATUS;
2036 if (actualEnd != matchEnds[i]) {
2037 errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
2038 __FILE__, __LINE__, i, matchEnds[i], actualEnd);
2039 }
2040 }
2041
2042 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status)); // the no-group overloads must agree with group 0
2043 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
2044
2045 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); // group numbers outside 0..3 are rejected
2046 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2047 matcher->reset();
2048 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE); // after reset() start() is expected to fail: no current match
2049
2050 matcher->lookingAt(status); // match again so the group() calls below have a current match (result not asserted here)
2051
2052 UnicodeString dest;
2053 UText destText = UTEXT_INITIALIZER;
2054 utext_openUnicodeString(&destText, &dest, &status);
2055 UText *result;
2056 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2057 // Test shallow-clone API
2058 int64_t group_len;
2059 result = matcher->group((UText *)NULL, group_len, status); // NULL destination: a UText is returned and closed by the test
2060 REGEX_CHECK_STATUS;
2061 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2062 utext_close(result);
2063 result = matcher->group(0, &destText, group_len, status); // caller-supplied destination: the same pointer is expected back
2064 REGEX_CHECK_STATUS;
2065 REGEX_ASSERT(result == &destText);
2066 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2067 // destText is now immutable, reopen it
2068 utext_close(&destText);
2069 utext_openUnicodeString(&destText, &dest, &status);
2070
2071 int64_t length;
2072 result = matcher->group(0, NULL, length, status);
2073 REGEX_CHECK_STATUS;
2074 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2075 utext_close(result);
2076 result = matcher->group(0, &destText, length, status);
2077 REGEX_CHECK_STATUS;
2078 REGEX_ASSERT(result == &destText);
2079 REGEX_ASSERT(utext_getNativeIndex(result) == 0); // returned text is expected to be positioned at the group's start offset
2080 REGEX_ASSERT(length == 10); // length receives the group's length (end - start)
2081 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2082
2083 // Capture Group 1 == "234567"
2084 result = matcher->group(1, NULL, length, status);
2085 REGEX_CHECK_STATUS;
2086 REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2087 REGEX_ASSERT(length == 6);
2088 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); // compared against the whole input, not only the group's text
2089 utext_close(result);
2090
2091 result = matcher->group(1, &destText, length, status);
2092 REGEX_CHECK_STATUS;
2093 REGEX_ASSERT(result == &destText);
2094 REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2095 REGEX_ASSERT(length == 6);
2096 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2097 utext_close(result); // result == &destText here, so this closes destText; it is handed to group() again below
2098
2099 // Capture Group 2 == "45"
2100 result = matcher->group(2, NULL, length, status);
2101 REGEX_CHECK_STATUS;
2102 REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2103 REGEX_ASSERT(length == 2);
2104 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2105 utext_close(result);
2106
2107 result = matcher->group(2, &destText, length, status);
2108 REGEX_CHECK_STATUS;
2109 REGEX_ASSERT(result == &destText);
2110 REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2111 REGEX_ASSERT(length == 2);
2112 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2113 utext_close(result);
2114
2115 // Capture Group 3 == "89"
2116 result = matcher->group(3, NULL, length, status);
2117 REGEX_CHECK_STATUS;
2118 REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2119 REGEX_ASSERT(length == 2);
2120 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2121 utext_close(result);
2122
2123 result = matcher->group(3, &destText, length, status);
2124 REGEX_CHECK_STATUS;
2125 REGEX_ASSERT(result == &destText);
2126 REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2127 REGEX_ASSERT(length == 2);
2128 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2129 utext_close(result);
2130
2131 // Capture Group number out of range.
2132 status = U_ZERO_ERROR;
2133 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2134 status = U_ZERO_ERROR;
2135 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2136 status = U_ZERO_ERROR;
2137 matcher->reset();
2138 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE); // no current match after reset()
2139
2140 delete matcher;
2141 delete pat;
2142
2143 utext_close(&destText);
2144 utext_close(&input);
2145 utext_close(&re);
2146 }
2147
2148 //
2149 // find
2150 //
2151 { // Scope: find() and find(pos, status) with the pattern "abc" over the 17-byte input ".abc..abc...abc..".
2152 int32_t flags=0;
2153 UParseError pe;
2154 UErrorCode status=U_ZERO_ERROR;
2155 UText re=UTEXT_INITIALIZER;
2156 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2157 utext_openUTF8(&re, str_abc, -1, &status);
2158
2159 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2160 REGEX_CHECK_STATUS;
2161 UText input = UTEXT_INITIALIZER;
2162 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2163 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2164 // 012345678901234567
2165
2166 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2167 REGEX_CHECK_STATUS;
2168 REGEX_ASSERT(matcher->find()); // successive find() calls are expected to report the occurrences at offsets 1, 6 and 12
2169 REGEX_ASSERT(matcher->start(status) == 1);
2170 REGEX_ASSERT(matcher->find());
2171 REGEX_ASSERT(matcher->start(status) == 6);
2172 REGEX_ASSERT(matcher->find());
2173 REGEX_ASSERT(matcher->start(status) == 12);
2174 REGEX_ASSERT(matcher->find() == FALSE); // no fourth occurrence
2175 REGEX_ASSERT(matcher->find() == FALSE); // and find() keeps failing when called again
2176
2177 matcher->reset(); // after reset() the search is expected to start over from the first occurrence
2178 REGEX_ASSERT(matcher->find());
2179 REGEX_ASSERT(matcher->start(status) == 1);
2180
2181 REGEX_ASSERT(matcher->find(0, status)); // find(pos, status): the match reported is the first one at or after pos
2182 REGEX_ASSERT(matcher->start(status) == 1);
2183 REGEX_ASSERT(matcher->find(1, status));
2184 REGEX_ASSERT(matcher->start(status) == 1);
2185 REGEX_ASSERT(matcher->find(2, status));
2186 REGEX_ASSERT(matcher->start(status) == 6);
2187 REGEX_ASSERT(matcher->find(12, status));
2188 REGEX_ASSERT(matcher->start(status) == 12);
2189 REGEX_ASSERT(matcher->find(13, status) == FALSE);
2190 REGEX_ASSERT(matcher->find(16, status) == FALSE);
2191 REGEX_ASSERT(matcher->find(17, status) == FALSE); // 17 == input length: no match expected (the error cases are -1 and 18 below)
2192 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE); // the preceding find failed, so there is no match to query
2193
2194 status = U_ZERO_ERROR;
2195 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2196 status = U_ZERO_ERROR;
2197 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR); // one past the input length
2198
2199 REGEX_ASSERT(matcher->groupCount() == 0); // the pattern "abc" contains no capture groups
2200
2201 delete matcher;
2202 delete pat;
2203
2204 utext_close(&input);
2205 utext_close(&re);
2206 }
2207
2208
2209 //
2210 // find, with \G in pattern (true if at the end of a previous match).
2211 //
2212 { // Scope: two consecutive find() calls with a pattern whose first alternative is anchored by \G.
2213 int32_t flags=0;
2214 UParseError pe;
2215 UErrorCode status=U_ZERO_ERROR;
2216 UText re=UTEXT_INITIALIZER;
2217 const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\Gabc)|(abc)) -- the array holds a single backslash byte (0x5c) before G */
2218 utext_openUTF8(&re, str_Gabcabc, -1, &status);
2219
2220 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2221
2222 REGEX_CHECK_STATUS;
2223 UText input = UTEXT_INITIALIZER;
2224 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2225 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2226 // 0123456789012 (byte offsets into the 13-byte input)
2227
2228 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2229 REGEX_CHECK_STATUS;
2230 REGEX_ASSERT(matcher->find()); // first find: expected to match through the plain (abc) alternative
2231 REGEX_ASSERT(matcher->start(status) == 0); // overall match starts at 0; the leading '.' is taken by the .*? prefix
2232 REGEX_ASSERT(matcher->start(1, status) == -1); // group 1 (\Gabc) did not participate
2233 REGEX_ASSERT(matcher->start(2, status) == 1);
2234
2235 REGEX_ASSERT(matcher->find()); // second find: "abc" begins at 4, where the first match ended, so \G is expected to hold
2236 REGEX_ASSERT(matcher->start(status) == 4);
2237 REGEX_ASSERT(matcher->start(1, status) == 4);
2238 REGEX_ASSERT(matcher->start(2, status) == -1); // group 2 did not participate this time
2239 REGEX_CHECK_STATUS;
2240
2241 delete matcher;
2242 delete pat;
2243
2244 utext_close(&input);
2245 utext_close(&re);
2246 }
2247
2248 //
2249 // find with zero length matches, match position should bump ahead
2250 // to prevent loops.
2251 //
2252 {
2253 int32_t i;
2254 UErrorCode status=U_ZERO_ERROR;
2255 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
2256 // using an always-true look-ahead.
2257 REGEX_CHECK_STATUS;
2258 UText s = UTEXT_INITIALIZER;
2259 utext_openUTF8(&s, " ", -1, &status);
2260 m.reset(&s);
2261 for (i=0; ; i++) {
2262 if (m.find() == FALSE) {
2263 break;
2264 }
2265 REGEX_ASSERT(m.start(status) == i);
2266 REGEX_ASSERT(m.end(status) == i);
2267 }
2268 REGEX_ASSERT(i==5);
2269
2270 // Check that the bump goes over characters outside the BMP OK
2271 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2272 unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2273 utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2274 m.reset(&s);
2275 for (i=0; ; i+=4) {
2276 if (m.find() == FALSE) {
2277 break;
2278 }
2279 REGEX_ASSERT(m.start(status) == i);
2280 REGEX_ASSERT(m.end(status) == i);
2281 }
2282 REGEX_ASSERT(i==20);
2283
2284 utext_close(&s);
2285 }
2286 {
2287 // find() loop breaking test.
2288 // with pattern of /.?/, should see a series of one char matches, then a single
2289 // match of zero length at the end of the input string.
2290 int32_t i;
2291 UErrorCode status=U_ZERO_ERROR;
2292 RegexMatcher m(".?", 0, status);
2293 REGEX_CHECK_STATUS;
2294 UText s = UTEXT_INITIALIZER;
2295 utext_openUTF8(&s, " ", -1, &status);
2296 m.reset(&s);
2297 for (i=0; ; i++) {
2298 if (m.find() == FALSE) {
2299 break;
2300 }
2301 REGEX_ASSERT(m.start(status) == i);
2302 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2303 }
2304 REGEX_ASSERT(i==5);
2305
2306 utext_close(&s);
2307 }
2308
2309
2310 //
2311 // Matchers with no input string behave as if they had an empty input string.
2312 //
2313
2314 { // Scope: a matcher built from a pattern alone; no input text is ever supplied.
2315 UErrorCode status = U_ZERO_ERROR;
2316 RegexMatcher m(".?", 0, status);
2317 REGEX_CHECK_STATUS;
2318 REGEX_ASSERT(m.find()); // ".?" may match zero characters, so a match is still expected
2319 REGEX_ASSERT(m.start(status) == 0);
2320 REGEX_ASSERT(m.input() == ""); // the reported input is expected to compare equal to the empty string
2321 }
2322 {
2323 UErrorCode status = U_ZERO_ERROR;
2324 RegexPattern *p = RegexPattern::compile(".", 0, status);
2325 RegexMatcher *m = p->matcher(status);
2326 REGEX_CHECK_STATUS;
2327
2328 REGEX_ASSERT(m->find() == FALSE);
2329 REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2330 delete m;
2331 delete p;
2332 }
2333
2334 //
2335 // Regions
2336 //
2337 { // Scope: default region, region(start, end, status), and the anchoring/transparent bounds flags.
2338 UErrorCode status = U_ZERO_ERROR;
2339 UText testPattern = UTEXT_INITIALIZER;
2340 UText testText = UTEXT_INITIALIZER;
2341 regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2342 REGEX_VERBOSE_TEXT(&testPattern);
2343 regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2344 REGEX_VERBOSE_TEXT(&testText);
2345
2346 RegexMatcher m(&testPattern, &testText, 0, status);
2347 REGEX_CHECK_STATUS;
2348 REGEX_ASSERT(m.regionStart() == 0); // a new matcher's region is expected to span the whole input
2349 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2350 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); // expected defaults: non-transparent, anchoring bounds
2351 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2352
2353 m.region(2,4, status);
2354 REGEX_CHECK_STATUS;
2355 REGEX_ASSERT(m.matches(status)); // with the region set, matches() is expected to succeed over offsets 2..4 only
2356 REGEX_ASSERT(m.start(status)==2);
2357 REGEX_ASSERT(m.end(status)==4);
2358 REGEX_CHECK_STATUS;
2359
2360 m.reset(); // reset() is expected to restore the region to the whole input
2361 REGEX_ASSERT(m.regionStart() == 0);
2362 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2363
2364 regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2365 REGEX_VERBOSE_TEXT(&testText);
2366 m.reset(&testText); // new input: the region is expected to span the new text
2367 REGEX_ASSERT(m.regionStart() == 0);
2368 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2369
2370 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2371 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE)); // the setter is expected to return the matcher itself
2372 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2373 REGEX_ASSERT(&m == &m.reset());
2374 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); // the flag is expected to survive reset()
2375
2376 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2377 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2378 REGEX_ASSERT(&m == &m.reset());
2379 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2380
2381 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); // same set / reset / re-check sequence for transparent bounds
2382 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2383 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2384 REGEX_ASSERT(&m == &m.reset());
2385 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2386
2387 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2388 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2389 REGEX_ASSERT(&m == &m.reset());
2390 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2391
2392 utext_close(&testText);
2393 utext_close(&testPattern);
2394 }
2395
2396 //
2397 // hitEnd() and requireEnd()
2398 //
2399 { // Scope: hitEnd() and requireEnd() after lookingAt() for three patterns over the same input "aabb".
2400 UErrorCode status = U_ZERO_ERROR;
2401 UText testPattern = UTEXT_INITIALIZER;
2402 UText testText = UTEXT_INITIALIZER;
2403 const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2404 const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2405 utext_openUTF8(&testPattern, str_, -1, &status);
2406 utext_openUTF8(&testText, str_aabb, -1, &status);
2407
2408 RegexMatcher m1(&testPattern, &testText, 0, status);
2409 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2410 REGEX_ASSERT(m1.hitEnd() == TRUE); // ".*" is expected to run to the end of the input
2411 REGEX_ASSERT(m1.requireEnd() == FALSE);
2412 REGEX_CHECK_STATUS;
2413
2414 status = U_ZERO_ERROR;
2415 const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2416 utext_openUTF8(&testPattern, str_a, -1, &status); // testPattern is re-opened on the next pattern; testText is shared
2417 RegexMatcher m2(&testPattern, &testText, 0, status);
2418 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2419 REGEX_ASSERT(m2.hitEnd() == FALSE); // "a*" can only cover the leading "aa", so the end is expected not to be hit
2420 REGEX_ASSERT(m2.requireEnd() == FALSE);
2421 REGEX_CHECK_STATUS;
2422
2423 status = U_ZERO_ERROR;
2424 const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2425 utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2426 RegexMatcher m3(&testPattern, &testText, 0, status);
2427 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2428 REGEX_ASSERT(m3.hitEnd() == TRUE);
2429 REGEX_ASSERT(m3.requireEnd() == TRUE); // the only case here with a trailing '$': requireEnd() is expected to be TRUE
2430 REGEX_CHECK_STATUS;
2431
2432 utext_close(&testText);
2433 utext_close(&testPattern);
2434 }
2435 }
2436
2437
2438 //---------------------------------------------------------------------------
2439 //
2440 // API_Replace_UTF8 API test for class RegexMatcher, testing the
2441 // Replace family of functions.
2442 //
2443 //---------------------------------------------------------------------------
API_Replace_UTF8()2444 void RegexTest::API_Replace_UTF8() {
2445 //
2446 // Replace
2447 //
2448 int32_t flags=0;
2449 UParseError pe;
2450 UErrorCode status=U_ZERO_ERROR;
2451
2452 UText re=UTEXT_INITIALIZER;
2453 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2454 REGEX_VERBOSE_TEXT(&re);
2455 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2456 REGEX_CHECK_STATUS;
2457
2458 char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2459 // 012345678901234567
2460 UText dataText = UTEXT_INITIALIZER;
2461 utext_openUTF8(&dataText, data, -1, &status);
2462 REGEX_CHECK_STATUS;
2463 REGEX_VERBOSE_TEXT(&dataText);
2464 RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2465
2466 //
2467 // Plain vanilla matches.
2468 //
2469 UnicodeString dest;
2470 UText destText = UTEXT_INITIALIZER;
2471 utext_openUnicodeString(&destText, &dest, &status);
2472 UText *result;
2473
2474 UText replText = UTEXT_INITIALIZER;
2475
2476 const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2477 utext_openUTF8(&replText, str_yz, -1, &status);
2478 REGEX_VERBOSE_TEXT(&replText);
2479 result = matcher->replaceFirst(&replText, NULL, status);
2480 REGEX_CHECK_STATUS;
2481 const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2482 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2483 utext_close(result);
2484 result = matcher->replaceFirst(&replText, &destText, status);
2485 REGEX_CHECK_STATUS;
2486 REGEX_ASSERT(result == &destText);
2487 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2488
2489 result = matcher->replaceAll(&replText, NULL, status);
2490 REGEX_CHECK_STATUS;
2491 const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2492 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2493 utext_close(result);
2494
2495 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2496 result = matcher->replaceAll(&replText, &destText, status);
2497 REGEX_CHECK_STATUS;
2498 REGEX_ASSERT(result == &destText);
2499 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2500
2501 //
2502 // Plain vanilla non-matches.
2503 //
2504 const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2505 utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2506 matcher->reset(&dataText);
2507
2508 result = matcher->replaceFirst(&replText, NULL, status);
2509 REGEX_CHECK_STATUS;
2510 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2511 utext_close(result);
2512 result = matcher->replaceFirst(&replText, &destText, status);
2513 REGEX_CHECK_STATUS;
2514 REGEX_ASSERT(result == &destText);
2515 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2516
2517 result = matcher->replaceAll(&replText, NULL, status);
2518 REGEX_CHECK_STATUS;
2519 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2520 utext_close(result);
2521 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2522 result = matcher->replaceAll(&replText, &destText, status);
2523 REGEX_CHECK_STATUS;
2524 REGEX_ASSERT(result == &destText);
2525 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2526
2527 //
2528 // Empty source string
2529 //
2530 utext_openUTF8(&dataText, NULL, 0, &status);
2531 matcher->reset(&dataText);
2532
2533 result = matcher->replaceFirst(&replText, NULL, status);
2534 REGEX_CHECK_STATUS;
2535 REGEX_ASSERT_UTEXT_UTF8("", result);
2536 utext_close(result);
2537 result = matcher->replaceFirst(&replText, &destText, status);
2538 REGEX_CHECK_STATUS;
2539 REGEX_ASSERT(result == &destText);
2540 REGEX_ASSERT_UTEXT_UTF8("", result);
2541
2542 result = matcher->replaceAll(&replText, NULL, status);
2543 REGEX_CHECK_STATUS;
2544 REGEX_ASSERT_UTEXT_UTF8("", result);
2545 utext_close(result);
2546 result = matcher->replaceAll(&replText, &destText, status);
2547 REGEX_CHECK_STATUS;
2548 REGEX_ASSERT(result == &destText);
2549 REGEX_ASSERT_UTEXT_UTF8("", result);
2550
2551 //
2552 // Empty substitution string
2553 //
2554 utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2555 matcher->reset(&dataText);
2556
2557 utext_openUTF8(&replText, NULL, 0, &status);
2558 result = matcher->replaceFirst(&replText, NULL, status);
2559 REGEX_CHECK_STATUS;
2560 const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2561 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2562 utext_close(result);
2563 result = matcher->replaceFirst(&replText, &destText, status);
2564 REGEX_CHECK_STATUS;
2565 REGEX_ASSERT(result == &destText);
2566 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2567
2568 result = matcher->replaceAll(&replText, NULL, status);
2569 REGEX_CHECK_STATUS;
2570 const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2571 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2572 utext_close(result);
2573 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2574 result = matcher->replaceAll(&replText, &destText, status);
2575 REGEX_CHECK_STATUS;
2576 REGEX_ASSERT(result == &destText);
2577 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2578
2579 //
2580 // match whole string
2581 //
2582 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2583 utext_openUTF8(&dataText, str_abc, -1, &status);
2584 matcher->reset(&dataText);
2585
2586 const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2587 utext_openUTF8(&replText, str_xyz, -1, &status);
2588 result = matcher->replaceFirst(&replText, NULL, status);
2589 REGEX_CHECK_STATUS;
2590 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2591 utext_close(result);
2592 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2593 result = matcher->replaceFirst(&replText, &destText, status);
2594 REGEX_CHECK_STATUS;
2595 REGEX_ASSERT(result == &destText);
2596 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2597
2598 result = matcher->replaceAll(&replText, NULL, status);
2599 REGEX_CHECK_STATUS;
2600 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2601 utext_close(result);
2602 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2603 result = matcher->replaceAll(&replText, &destText, status);
2604 REGEX_CHECK_STATUS;
2605 REGEX_ASSERT(result == &destText);
2606 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2607
2608 //
2609 // Capture Group, simple case
2610 //
2611 const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2612 utext_openUTF8(&re, str_add, -1, &status);
2613 RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2614 REGEX_CHECK_STATUS;
2615
2616 const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2617 utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2618 RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2619 REGEX_CHECK_STATUS;
2620
2621 const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2622 utext_openUTF8(&replText, str_11, -1, &status);
2623 result = matcher2->replaceFirst(&replText, NULL, status);
2624 REGEX_CHECK_STATUS;
2625 const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2626 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2627 utext_close(result);
2628 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2629 result = matcher2->replaceFirst(&replText, &destText, status);
2630 REGEX_CHECK_STATUS;
2631 REGEX_ASSERT(result == &destText);
2632 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2633
2634 const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2635 utext_openUTF8(&replText, str_v, -1, &status);
2636 REGEX_VERBOSE_TEXT(&replText);
2637 result = matcher2->replaceFirst(&replText, NULL, status);
2638 REGEX_CHECK_STATUS;
2639 const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2640 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2641 utext_close(result);
2642 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2643 result = matcher2->replaceFirst(&replText, &destText, status);
2644 REGEX_CHECK_STATUS;
2645 REGEX_ASSERT(result == &destText);
2646 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2647
2648 const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2649 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2650 0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2651 utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2652 result = matcher2->replaceFirst(&replText, NULL, status);
2653 REGEX_CHECK_STATUS;
2654 const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2655 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2656 utext_close(result);
2657 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2658 result = matcher2->replaceFirst(&replText, &destText, status);
2659 REGEX_CHECK_STATUS;
2660 REGEX_ASSERT(result == &destText);
2661 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2662
2663 unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2664 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2665 // 012345678901234567890123456
2666 supplDigitChars[22] = 0xF0;
2667 supplDigitChars[23] = 0x9D;
2668 supplDigitChars[24] = 0x9F;
2669 supplDigitChars[25] = 0x8F;
2670 utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2671
2672 result = matcher2->replaceFirst(&replText, NULL, status);
2673 REGEX_CHECK_STATUS;
2674 const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2675 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2676 utext_close(result);
2677 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2678 result = matcher2->replaceFirst(&replText, &destText, status);
2679 REGEX_CHECK_STATUS;
2680 REGEX_ASSERT(result == &destText);
2681 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2682 const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5..." */
2683 utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2684 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2685 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2686 utext_close(result);
2687 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2688 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2689 REGEX_ASSERT(result == &destText);
2690 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2691
2692 //
2693 // Replacement String with \u hex escapes
2694 //
2695 {
2696 const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2697 const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2698 utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2699 utext_openUTF8(&replText, str_u0043, -1, &status);
2700 matcher->reset(&dataText);
2701
2702 result = matcher->replaceAll(&replText, NULL, status);
2703 REGEX_CHECK_STATUS;
2704 const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2705 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2706 utext_close(result);
2707 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2708 result = matcher->replaceAll(&replText, &destText, status);
2709 REGEX_CHECK_STATUS;
2710 REGEX_ASSERT(result == &destText);
2711 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2712 }
2713 {
2714 const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2715 utext_openUTF8(&dataText, str_abc, -1, &status);
2716 const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2717 utext_openUTF8(&replText, str_U00010000, -1, &status);
2718 matcher->reset(&dataText);
2719
2720 unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2721 // 0123456789
2722 expected[2] = 0xF0;
2723 expected[3] = 0x90;
2724 expected[4] = 0x80;
2725 expected[5] = 0x80;
2726
2727 result = matcher->replaceAll(&replText, NULL, status);
2728 REGEX_CHECK_STATUS;
2729 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2730 utext_close(result);
2731 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2732 result = matcher->replaceAll(&replText, &destText, status);
2733 REGEX_CHECK_STATUS;
2734 REGEX_ASSERT(result == &destText);
2735 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2736 }
    // TODO: need more thorough testing of capture substitutions.
2738
2739 // Bug 4057
2740 //
2741 {
2742 status = U_ZERO_ERROR;
2743 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2744 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2745 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2746 utext_openUTF8(&re, str_ssee, -1, &status);
2747 utext_openUTF8(&dataText, str_blah, -1, &status);
2748 utext_openUTF8(&replText, str_ooh, -1, &status);
2749
2750 RegexMatcher m(&re, 0, status);
2751 REGEX_CHECK_STATUS;
2752
2753 UnicodeString result;
2754 UText resultText = UTEXT_INITIALIZER;
2755 utext_openUnicodeString(&resultText, &result, &status);
2756
        // Multiple finds do NOT bump up the previous appendReplacement position.
2758 m.reset(&dataText);
2759 m.find();
2760 m.find();
2761 m.appendReplacement(&resultText, &replText, status);
2762 REGEX_CHECK_STATUS;
2763 const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2764 REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2765
2766 // After a reset into the interior of a string, appendReplacement still starts at beginning.
2767 status = U_ZERO_ERROR;
2768 result.truncate(0);
2769 utext_openUnicodeString(&resultText, &result, &status);
2770 m.reset(10, status);
2771 m.find();
2772 m.find();
2773 m.appendReplacement(&resultText, &replText, status);
2774 REGEX_CHECK_STATUS;
2775 const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2776 REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2777
2778 // find() at interior of string, appendReplacement still starts at beginning.
2779 status = U_ZERO_ERROR;
2780 result.truncate(0);
2781 utext_openUnicodeString(&resultText, &result, &status);
2782 m.reset();
2783 m.find(10, status);
2784 m.find();
2785 m.appendReplacement(&resultText, &replText, status);
2786 REGEX_CHECK_STATUS;
2787 const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2788 REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2789
2790 m.appendTail(&resultText, status);
2791 const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2792 REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2793
2794 utext_close(&resultText);
2795 }
2796
2797 delete matcher2;
2798 delete pat2;
2799 delete matcher;
2800 delete pat;
2801
2802 utext_close(&dataText);
2803 utext_close(&replText);
2804 utext_close(&destText);
2805 utext_close(&re);
2806 }
2807
2808
2809 //---------------------------------------------------------------------------
2810 //
2811 // API_Pattern_UTF8 Test that the API for class RegexPattern is
2812 // present and nominally working.
2813 //
2814 //---------------------------------------------------------------------------
API_Pattern_UTF8()2815 void RegexTest::API_Pattern_UTF8() {
2816 RegexPattern pata; // Test default constructor to not crash.
2817 RegexPattern patb;
2818
2819 REGEX_ASSERT(pata == patb);
2820 REGEX_ASSERT(pata == pata);
2821
2822 UText re1 = UTEXT_INITIALIZER;
2823 UText re2 = UTEXT_INITIALIZER;
2824 UErrorCode status = U_ZERO_ERROR;
2825 UParseError pe;
2826
2827 const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2828 const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2829 utext_openUTF8(&re1, str_abcalmz, -1, &status);
2830 utext_openUTF8(&re2, str_def, -1, &status);
2831
2832 RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2833 RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2834 REGEX_CHECK_STATUS;
2835 REGEX_ASSERT(*pat1 == *pat1);
2836 REGEX_ASSERT(*pat1 != pata);
2837
2838 // Assign
2839 patb = *pat1;
2840 REGEX_ASSERT(patb == *pat1);
2841
2842 // Copy Construct
2843 RegexPattern patc(*pat1);
2844 REGEX_ASSERT(patc == *pat1);
2845 REGEX_ASSERT(patb == patc);
2846 REGEX_ASSERT(pat1 != pat2);
2847 patb = *pat2;
2848 REGEX_ASSERT(patb != patc);
2849 REGEX_ASSERT(patb == *pat2);
2850
2851 // Compile with no flags.
2852 RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status);
2853 REGEX_ASSERT(*pat1a == *pat1);
2854
2855 REGEX_ASSERT(pat1a->flags() == 0);
2856
2857 // Compile with different flags should be not equal
2858 RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2859 REGEX_CHECK_STATUS;
2860
2861 REGEX_ASSERT(*pat1b != *pat1a);
2862 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2863 REGEX_ASSERT(pat1a->flags() == 0);
2864 delete pat1b;
2865
2866 // clone
2867 RegexPattern *pat1c = pat1->clone();
2868 REGEX_ASSERT(*pat1c == *pat1);
2869 REGEX_ASSERT(*pat1c != *pat2);
2870
2871 delete pat1c;
2872 delete pat1a;
2873 delete pat1;
2874 delete pat2;
2875
2876 utext_close(&re1);
2877 utext_close(&re2);
2878
2879
2880 //
2881 // Verify that a matcher created from a cloned pattern works.
2882 // (Jitterbug 3423)
2883 //
2884 {
2885 UErrorCode status = U_ZERO_ERROR;
2886 UText pattern = UTEXT_INITIALIZER;
2887 const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2888 utext_openUTF8(&pattern, str_pL, -1, &status);
2889
2890 RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status);
2891 RegexPattern *pClone = pSource->clone();
2892 delete pSource;
2893 RegexMatcher *mFromClone = pClone->matcher(status);
2894 REGEX_CHECK_STATUS;
2895
2896 UText input = UTEXT_INITIALIZER;
2897 const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2898 utext_openUTF8(&input, str_HelloWorld, -1, &status);
2899 mFromClone->reset(&input);
2900 REGEX_ASSERT(mFromClone->find() == TRUE);
2901 REGEX_ASSERT(mFromClone->group(status) == "Hello");
2902 REGEX_ASSERT(mFromClone->find() == TRUE);
2903 REGEX_ASSERT(mFromClone->group(status) == "World");
2904 REGEX_ASSERT(mFromClone->find() == FALSE);
2905 delete mFromClone;
2906 delete pClone;
2907
2908 utext_close(&input);
2909 utext_close(&pattern);
2910 }
2911
2912 //
2913 // matches convenience API
2914 //
2915 {
2916 UErrorCode status = U_ZERO_ERROR;
2917 UText pattern = UTEXT_INITIALIZER;
2918 UText input = UTEXT_INITIALIZER;
2919
2920 const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2921 utext_openUTF8(&input, str_randominput, -1, &status);
2922
2923 const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2924 utext_openUTF8(&pattern, str_dotstar, -1, &status);
2925 REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2926 REGEX_CHECK_STATUS;
2927
2928 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2929 utext_openUTF8(&pattern, str_abc, -1, &status);
2930 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2931 REGEX_CHECK_STATUS;
2932
2933 const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2934 utext_openUTF8(&pattern, str_nput, -1, &status);
2935 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2936 REGEX_CHECK_STATUS;
2937
2938 utext_openUTF8(&pattern, str_randominput, -1, &status);
2939 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2940 REGEX_CHECK_STATUS;
2941
2942 const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2943 utext_openUTF8(&pattern, str_u, -1, &status);
2944 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2945 REGEX_CHECK_STATUS;
2946
2947 utext_openUTF8(&input, str_abc, -1, &status);
2948 utext_openUTF8(&pattern, str_abc, -1, &status);
2949 status = U_INDEX_OUTOFBOUNDS_ERROR;
2950 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2951 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2952
2953 utext_close(&input);
2954 utext_close(&pattern);
2955 }
2956
2957
2958 //
2959 // Split()
2960 //
2961 status = U_ZERO_ERROR;
2962 const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /* + */
2963 utext_openUTF8(&re1, str_spaceplus, -1, &status);
2964 pat1 = RegexPattern::compile(&re1, pe, status);
2965 REGEX_CHECK_STATUS;
2966 UnicodeString fields[10];
2967
2968 int32_t n;
2969 n = pat1->split("Now is the time", fields, 10, status);
2970 REGEX_CHECK_STATUS;
2971 REGEX_ASSERT(n==4);
2972 REGEX_ASSERT(fields[0]=="Now");
2973 REGEX_ASSERT(fields[1]=="is");
2974 REGEX_ASSERT(fields[2]=="the");
2975 REGEX_ASSERT(fields[3]=="time");
2976 REGEX_ASSERT(fields[4]=="");
2977
2978 n = pat1->split("Now is the time", fields, 2, status);
2979 REGEX_CHECK_STATUS;
2980 REGEX_ASSERT(n==2);
2981 REGEX_ASSERT(fields[0]=="Now");
2982 REGEX_ASSERT(fields[1]=="is the time");
2983 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
2984
2985 fields[1] = "*";
2986 status = U_ZERO_ERROR;
2987 n = pat1->split("Now is the time", fields, 1, status);
2988 REGEX_CHECK_STATUS;
2989 REGEX_ASSERT(n==1);
2990 REGEX_ASSERT(fields[0]=="Now is the time");
2991 REGEX_ASSERT(fields[1]=="*");
2992 status = U_ZERO_ERROR;
2993
2994 n = pat1->split(" Now is the time ", fields, 10, status);
2995 REGEX_CHECK_STATUS;
2996 REGEX_ASSERT(n==6);
2997 REGEX_ASSERT(fields[0]=="");
2998 REGEX_ASSERT(fields[1]=="Now");
2999 REGEX_ASSERT(fields[2]=="is");
3000 REGEX_ASSERT(fields[3]=="the");
3001 REGEX_ASSERT(fields[4]=="time");
3002 REGEX_ASSERT(fields[5]=="");
3003 REGEX_ASSERT(fields[6]=="");
3004
3005 fields[2] = "*";
3006 n = pat1->split(" ", fields, 10, status);
3007 REGEX_CHECK_STATUS;
3008 REGEX_ASSERT(n==2);
3009 REGEX_ASSERT(fields[0]=="");
3010 REGEX_ASSERT(fields[1]=="");
3011 REGEX_ASSERT(fields[2]=="*");
3012
3013 fields[0] = "foo";
3014 n = pat1->split("", fields, 10, status);
3015 REGEX_CHECK_STATUS;
3016 REGEX_ASSERT(n==0);
3017 REGEX_ASSERT(fields[0]=="foo");
3018
3019 delete pat1;
3020
3021 // split, with a pattern with (capture)
3022 regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
3023 pat1 = RegexPattern::compile(&re1, pe, status);
3024 REGEX_CHECK_STATUS;
3025
3026 status = U_ZERO_ERROR;
3027 fields[6] = fields[7] = "*";
3028 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
3029 REGEX_CHECK_STATUS;
3030 REGEX_ASSERT(n==7);
3031 REGEX_ASSERT(fields[0]=="");
3032 REGEX_ASSERT(fields[1]=="a");
3033 REGEX_ASSERT(fields[2]=="Now is ");
3034 REGEX_ASSERT(fields[3]=="b");
3035 REGEX_ASSERT(fields[4]=="the time");
3036 REGEX_ASSERT(fields[5]=="c");
3037 REGEX_ASSERT(fields[6]=="");
3038 REGEX_ASSERT(fields[7]=="*");
3039 REGEX_ASSERT(status==U_ZERO_ERROR);
3040
3041 fields[6] = fields[7] = "*";
3042 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
3043 REGEX_CHECK_STATUS;
3044 REGEX_ASSERT(n==7);
3045 REGEX_ASSERT(fields[0]==" ");
3046 REGEX_ASSERT(fields[1]=="a");
3047 REGEX_ASSERT(fields[2]=="Now is ");
3048 REGEX_ASSERT(fields[3]=="b");
3049 REGEX_ASSERT(fields[4]=="the time");
3050 REGEX_ASSERT(fields[5]=="c");
3051 REGEX_ASSERT(fields[6]=="");
3052 REGEX_ASSERT(fields[7]=="*");
3053
3054 status = U_ZERO_ERROR;
3055 fields[6] = "foo";
3056 n = pat1->split(" <a>Now is <b>the time<c> ", fields, 6, status);
3057 REGEX_CHECK_STATUS;
3058 REGEX_ASSERT(n==6);
3059 REGEX_ASSERT(fields[0]==" ");
3060 REGEX_ASSERT(fields[1]=="a");
3061 REGEX_ASSERT(fields[2]=="Now is ");
3062 REGEX_ASSERT(fields[3]=="b");
3063 REGEX_ASSERT(fields[4]=="the time");
3064 REGEX_ASSERT(fields[5]==" ");
3065 REGEX_ASSERT(fields[6]=="foo");
3066
3067 status = U_ZERO_ERROR;
3068 fields[5] = "foo";
3069 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
3070 REGEX_CHECK_STATUS;
3071 REGEX_ASSERT(n==5);
3072 REGEX_ASSERT(fields[0]==" ");
3073 REGEX_ASSERT(fields[1]=="a");
3074 REGEX_ASSERT(fields[2]=="Now is ");
3075 REGEX_ASSERT(fields[3]=="b");
3076 REGEX_ASSERT(fields[4]=="the time<c>");
3077 REGEX_ASSERT(fields[5]=="foo");
3078
3079 status = U_ZERO_ERROR;
3080 fields[5] = "foo";
3081 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
3082 REGEX_CHECK_STATUS;
3083 REGEX_ASSERT(n==5);
3084 REGEX_ASSERT(fields[0]==" ");
3085 REGEX_ASSERT(fields[1]=="a");
3086 REGEX_ASSERT(fields[2]=="Now is ");
3087 REGEX_ASSERT(fields[3]=="b");
3088 REGEX_ASSERT(fields[4]=="the time");
3089 REGEX_ASSERT(fields[5]=="foo");
3090
3091 status = U_ZERO_ERROR;
3092 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
3093 REGEX_CHECK_STATUS;
3094 REGEX_ASSERT(n==4);
3095 REGEX_ASSERT(fields[0]==" ");
3096 REGEX_ASSERT(fields[1]=="a");
3097 REGEX_ASSERT(fields[2]=="Now is ");
3098 REGEX_ASSERT(fields[3]=="the time<c>");
3099 status = U_ZERO_ERROR;
3100 delete pat1;
3101
3102 regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3103 pat1 = RegexPattern::compile(&re1, pe, status);
3104 REGEX_CHECK_STATUS;
3105 n = pat1->split("1-10,20", fields, 10, status);
3106 REGEX_CHECK_STATUS;
3107 REGEX_ASSERT(n==5);
3108 REGEX_ASSERT(fields[0]=="1");
3109 REGEX_ASSERT(fields[1]=="-");
3110 REGEX_ASSERT(fields[2]=="10");
3111 REGEX_ASSERT(fields[3]==",");
3112 REGEX_ASSERT(fields[4]=="20");
3113 delete pat1;
3114
3115
3116 //
3117 // split of a UText based string, with library allocating output UTexts.
3118 //
3119 {
3120 status = U_ZERO_ERROR;
3121 RegexMatcher matcher(UnicodeString("(:)"), 0, status);
3122 UnicodeString stringToSplit("first:second:third");
3123 UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
3124 REGEX_CHECK_STATUS;
3125
3126 UText *splits[10] = {NULL};
3127 int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
3128 REGEX_CHECK_STATUS;
3129 REGEX_ASSERT(numFields == 5);
3130 REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
3131 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
3132 REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
3133 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
3134 REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
3135 REGEX_ASSERT(splits[5] == NULL);
3136
3137 for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
3138 if (splits[i]) {
3139 utext_close(splits[i]);
3140 splits[i] = NULL;
3141 }
3142 }
3143 utext_close(textToSplit);
3144 }
3145
3146
3147 //
3148 // RegexPattern::pattern() and patternText()
3149 //
3150 pat1 = new RegexPattern();
3151 REGEX_ASSERT(pat1->pattern() == "");
3152 REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3153 delete pat1;
3154 const char *helloWorldInvariant = "(Hello, world)*";
3155 regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3156 pat1 = RegexPattern::compile(&re1, pe, status);
3157 REGEX_CHECK_STATUS;
3158 REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
3159 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3160 delete pat1;
3161
3162 utext_close(&re1);
3163 }
3164
3165
3166 //---------------------------------------------------------------------------
3167 //
3168 // Extended A more thorough check for features of regex patterns
3169 // The test cases are in a separate data file,
//                   source/test/testdata/regextst.txt
3171 // A description of the test data format is included in that file.
3172 //
3173 //---------------------------------------------------------------------------
3174
3175 const char *
getPath(char buffer[2048],const char * filename)3176 RegexTest::getPath(char buffer[2048], const char *filename) {
3177 UErrorCode status=U_ZERO_ERROR;
3178 const char *testDataDirectory = IntlTest::getSourceTestData(status);
3179 if (U_FAILURE(status)) {
3180 errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3181 return NULL;
3182 }
3183
3184 strcpy(buffer, testDataDirectory);
3185 strcat(buffer, filename);
3186 return buffer;
3187 }
3188
Extended()3189 void RegexTest::Extended() {
3190 char tdd[2048];
3191 const char *srcPath;
3192 UErrorCode status = U_ZERO_ERROR;
3193 int32_t lineNum = 0;
3194
3195 //
3196 // Open and read the test data file.
3197 //
3198 srcPath=getPath(tdd, "regextst.txt");
3199 if(srcPath==NULL) {
3200 return; /* something went wrong, error already output */
3201 }
3202
3203 int32_t len;
3204 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3205 if (U_FAILURE(status)) {
3206 return; /* something went wrong, error already output */
3207 }
3208
3209 //
3210 // Put the test data into a UnicodeString
3211 //
3212 UnicodeString testString(FALSE, testData, len);
3213
3214 RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3215 RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3216 RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3217
3218 RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3219 UnicodeString testPattern; // The pattern for test from the test file.
3220 UnicodeString testFlags; // the flags for a test.
3221 UnicodeString matchString; // The marked up string to be used as input
3222
3223 if (U_FAILURE(status)){
3224 dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
3225 delete [] testData;
3226 return;
3227 }
3228
3229 //
3230 // Loop over the test data file, once per line.
3231 //
3232 while (lineMat.find()) {
3233 lineNum++;
3234 if (U_FAILURE(status)) {
3235 errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3236 }
3237
3238 status = U_ZERO_ERROR;
3239 UnicodeString testLine = lineMat.group(1, status);
3240 if (testLine.length() == 0) {
3241 continue;
3242 }
3243
3244 //
3245 // Parse the test line. Skip blank and comment only lines.
3246 // Separate out the three main fields - pattern, flags, target.
3247 //
3248
3249 commentMat.reset(testLine);
3250 if (commentMat.lookingAt(status)) {
3251 // This line is a comment, or blank.
3252 continue;
3253 }
3254
3255 //
3256 // Pull out the pattern field, remove it from the test file line.
3257 //
3258 quotedStuffMat.reset(testLine);
3259 if (quotedStuffMat.lookingAt(status)) {
3260 testPattern = quotedStuffMat.group(2, status);
3261 testLine.remove(0, quotedStuffMat.end(0, status));
3262 } else {
3263 errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3264 continue;
3265 }
3266
3267
3268 //
3269 // Pull out the flags from the test file line.
3270 //
3271 flagsMat.reset(testLine);
3272 flagsMat.lookingAt(status); // Will always match, possibly an empty string.
3273 testFlags = flagsMat.group(1, status);
3274 if (flagsMat.group(2, status).length() > 0) {
3275 errln("Bad Match flag at line %d. Scanning %c\n",
3276 lineNum, flagsMat.group(2, status).charAt(0));
3277 continue;
3278 }
3279 testLine.remove(0, flagsMat.end(0, status));
3280
3281 //
3282 // Pull out the match string, as a whole.
3283 // We'll process the <tags> later.
3284 //
3285 quotedStuffMat.reset(testLine);
3286 if (quotedStuffMat.lookingAt(status)) {
3287 matchString = quotedStuffMat.group(2, status);
3288 testLine.remove(0, quotedStuffMat.end(0, status));
3289 } else {
3290 errln("Bad match string at test file line %d", lineNum);
3291 continue;
3292 }
3293
3294 //
3295 // The only thing left from the input line should be an optional trailing comment.
3296 //
3297 commentMat.reset(testLine);
3298 if (commentMat.lookingAt(status) == FALSE) {
3299 errln("Line %d: unexpected characters at end of test line.", lineNum);
3300 continue;
3301 }
3302
3303 //
3304 // Run the test
3305 //
3306 regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3307 }
3308
3309 delete [] testData;
3310
3311 }
3312
3313
3314
3315 //---------------------------------------------------------------------------
3316 //
3317 // regex_find(pattern, flags, inputString, lineNumber)
3318 //
3319 // Function to run a single test from the Extended (data driven) tests.
3320 // See file test/testdata/regextst.txt for a description of the
3321 // pattern and inputString fields, and the allowed flags.
3322 // lineNumber is the source line in regextst.txt of the test.
3323 //
3324 //---------------------------------------------------------------------------
3325
3326
3327 // Set a value into a UVector at position specified by a decimal number in
3328 // a UnicodeString. This is a utility function needed by the actual test function,
3329 // which follows.
set(UVector & vec,int32_t val,UnicodeString index)3330 static void set(UVector &vec, int32_t val, UnicodeString index) {
3331 UErrorCode status=U_ZERO_ERROR;
3332 int32_t idx = 0;
3333 for (int32_t i=0; i<index.length(); i++) {
3334 int32_t d=u_charDigitValue(index.charAt(i));
3335 if (d<0) {return;}
3336 idx = idx*10 + d;
3337 }
3338 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3339 vec.setElementAt(val, idx);
3340 }
3341
setInt(UVector & vec,int32_t val,int32_t idx)3342 static void setInt(UVector &vec, int32_t val, int32_t idx) {
3343 UErrorCode status=U_ZERO_ERROR;
3344 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3345 vec.setElementAt(val, idx);
3346 }
3347
utextOffsetToNative(UText * utext,int32_t unistrOffset,int32_t & nativeIndex)3348 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3349 {
3350 UBool couldFind = TRUE;
3351 UTEXT_SETNATIVEINDEX(utext, 0);
3352 int32_t i = 0;
3353 while (i < unistrOffset) {
3354 UChar32 c = UTEXT_NEXT32(utext);
3355 if (c != U_SENTINEL) {
3356 i += U16_LENGTH(c);
3357 } else {
3358 couldFind = FALSE;
3359 break;
3360 }
3361 }
3362 nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3363 return couldFind;
3364 }
3365
3366
regex_find(const UnicodeString & pattern,const UnicodeString & flags,const UnicodeString & inputString,const char * srcPath,int32_t line)3367 void RegexTest::regex_find(const UnicodeString &pattern,
3368 const UnicodeString &flags,
3369 const UnicodeString &inputString,
3370 const char *srcPath,
3371 int32_t line) {
3372 UnicodeString unEscapedInput;
3373 UnicodeString deTaggedInput;
3374
3375 int32_t patternUTF8Length, inputUTF8Length;
3376 char *patternChars = NULL, *inputChars = NULL;
3377 UText patternText = UTEXT_INITIALIZER;
3378 UText inputText = UTEXT_INITIALIZER;
3379 UConverter *UTF8Converter = NULL;
3380
3381 UErrorCode status = U_ZERO_ERROR;
3382 UParseError pe;
3383 RegexPattern *parsePat = NULL;
3384 RegexMatcher *parseMatcher = NULL;
3385 RegexPattern *callerPattern = NULL, *UTF8Pattern = NULL;
3386 RegexMatcher *matcher = NULL, *UTF8Matcher = NULL;
3387 UVector groupStarts(status);
3388 UVector groupEnds(status);
3389 UVector groupStartsUTF8(status);
3390 UVector groupEndsUTF8(status);
3391 UBool isMatch = FALSE, isUTF8Match = FALSE;
3392 UBool failed = FALSE;
3393 int32_t numFinds;
3394 int32_t i;
3395 UBool useMatchesFunc = FALSE;
3396 UBool useLookingAtFunc = FALSE;
3397 int32_t regionStart = -1;
3398 int32_t regionEnd = -1;
3399 int32_t regionStartUTF8 = -1;
3400 int32_t regionEndUTF8 = -1;
3401
3402
3403 //
3404 // Compile the caller's pattern
3405 //
3406 uint32_t bflags = 0;
3407 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag
3408 bflags |= UREGEX_CASE_INSENSITIVE;
3409 }
3410 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag
3411 bflags |= UREGEX_COMMENTS;
3412 }
3413 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag
3414 bflags |= UREGEX_DOTALL;
3415 }
3416 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag
3417 bflags |= UREGEX_MULTILINE;
3418 }
3419
3420 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3421 bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3422 }
3423 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3424 bflags |= UREGEX_UNIX_LINES;
3425 }
3426 if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3427 bflags |= UREGEX_LITERAL;
3428 }
3429
3430
3431 callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3432 if (status != U_ZERO_ERROR) {
3433 #if UCONFIG_NO_BREAK_ITERATION==1
3434 // 'v' test flag means that the test pattern should not compile if ICU was configured
3435 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3436 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3437 goto cleanupAndReturn;
3438 }
3439 #endif
3440 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3441 // Expected pattern compilation error.
3442 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3443 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3444 }
3445 goto cleanupAndReturn;
3446 } else {
3447 // Unexpected pattern compilation error.
3448 dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3449 goto cleanupAndReturn;
3450 }
3451 }
3452
3453 UTF8Converter = ucnv_open("UTF8", &status);
3454 ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3455
3456 patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3457 status = U_ZERO_ERROR; // buffer overflow
3458 patternChars = new char[patternUTF8Length+1];
3459 pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3460 utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3461
3462 if (status == U_ZERO_ERROR) {
3463 UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3464
3465 if (status != U_ZERO_ERROR) {
3466 #if UCONFIG_NO_BREAK_ITERATION==1
3467 // 'v' test flag means that the test pattern should not compile if ICU was configured
3468 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3469 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3470 goto cleanupAndReturn;
3471 }
3472 #endif
3473 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3474 // Expected pattern compilation error.
3475 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3476 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3477 }
3478 goto cleanupAndReturn;
3479 } else {
3480 // Unexpected pattern compilation error.
3481 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3482 goto cleanupAndReturn;
3483 }
3484 }
3485 }
3486
3487 if (UTF8Pattern == NULL) {
3488 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3489 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3490 status = U_ZERO_ERROR;
3491 }
3492
3493 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag
3494 callerPattern->dumpPattern();
3495 }
3496
3497 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag
3498 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3499 goto cleanupAndReturn;
3500 }
3501
3502
3503 //
3504 // Number of times find() should be called on the test string, default to 1
3505 //
3506 numFinds = 1;
3507 for (i=2; i<=9; i++) {
3508 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
3509 if (numFinds != 1) {
3510 errln("Line %d: more than one digit flag. Scanning %d.", line, i);
3511 goto cleanupAndReturn;
3512 }
3513 numFinds = i;
3514 }
3515 }
3516
3517 // 'M' flag. Use matches() instead of find()
3518 if (flags.indexOf((UChar)0x4d) >= 0) {
3519 useMatchesFunc = TRUE;
3520 }
3521 if (flags.indexOf((UChar)0x4c) >= 0) {
3522 useLookingAtFunc = TRUE;
3523 }
3524
3525 //
3526 // Find the tags in the input data, remove them, and record the group boundary
3527 // positions.
3528 //
3529 parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3530 REGEX_CHECK_STATUS_L(line);
3531
3532 unEscapedInput = inputString.unescape();
3533 parseMatcher = parsePat->matcher(unEscapedInput, status);
3534 REGEX_CHECK_STATUS_L(line);
3535 while(parseMatcher->find()) {
3536 parseMatcher->appendReplacement(deTaggedInput, "", status);
3537 REGEX_CHECK_STATUS;
3538 UnicodeString groupNum = parseMatcher->group(2, status);
3539 if (groupNum == "r") {
3540 // <r> or </r>, a region specification within the string
3541 if (parseMatcher->group(1, status) == "/") {
3542 regionEnd = deTaggedInput.length();
3543 } else {
3544 regionStart = deTaggedInput.length();
3545 }
3546 } else {
3547 // <digits> or </digits>, a group match boundary tag.
3548 if (parseMatcher->group(1, status) == "/") {
3549 set(groupEnds, deTaggedInput.length(), groupNum);
3550 } else {
3551 set(groupStarts, deTaggedInput.length(), groupNum);
3552 }
3553 }
3554 }
3555 parseMatcher->appendTail(deTaggedInput);
3556 REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3557 if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3558 errln("mismatched <r> tags");
3559 failed = TRUE;
3560 goto cleanupAndReturn;
3561 }
3562
3563 //
3564 // Configure the matcher according to the flags specified with this test.
3565 //
3566 matcher = callerPattern->matcher(deTaggedInput, status);
3567 REGEX_CHECK_STATUS_L(line);
3568 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
3569 matcher->setTrace(TRUE);
3570 }
3571
3572 if (UTF8Pattern != NULL) {
3573 inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3574 status = U_ZERO_ERROR; // buffer overflow
3575 inputChars = new char[inputUTF8Length+1];
3576 deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3577 utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3578
3579 if (status == U_ZERO_ERROR) {
3580 UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3581 REGEX_CHECK_STATUS_L(line);
3582 }
3583
3584 if (UTF8Matcher == NULL) {
3585 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3586 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3587 status = U_ZERO_ERROR;
3588 }
3589 }
3590
3591 //
3592 // Generate native indices for UTF8 versions of region and capture group info
3593 //
3594 if (UTF8Matcher != NULL) {
3595 if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3596 if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3597
3598 // Fill out the native index UVector info.
3599 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3600 for (i=0; i<groupStarts.size(); i++) {
3601 int32_t start = groupStarts.elementAti(i);
3602 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3603 if (start >= 0) {
3604 int32_t startUTF8;
3605 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3606 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line, i, start);
3607 failed = TRUE;
3608 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3609 }
3610 setInt(groupStartsUTF8, startUTF8, i);
3611 }
3612
3613 int32_t end = groupEnds.elementAti(i);
3614 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3615 if (end >= 0) {
3616 int32_t endUTF8;
3617 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3618 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line, i, end);
3619 failed = TRUE;
3620 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3621 }
3622 setInt(groupEndsUTF8, endUTF8, i);
3623 }
3624 }
3625 }
3626
3627 if (regionStart>=0) {
3628 matcher->region(regionStart, regionEnd, status);
3629 REGEX_CHECK_STATUS_L(line);
3630 if (UTF8Matcher != NULL) {
3631 UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3632 REGEX_CHECK_STATUS_L(line);
3633 }
3634 }
3635 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag
3636 matcher->useAnchoringBounds(FALSE);
3637 if (UTF8Matcher != NULL) {
3638 UTF8Matcher->useAnchoringBounds(FALSE);
3639 }
3640 }
3641 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag
3642 matcher->useTransparentBounds(TRUE);
3643 if (UTF8Matcher != NULL) {
3644 UTF8Matcher->useTransparentBounds(TRUE);
3645 }
3646 }
3647
3648
3649
3650 //
3651 // Do a find on the de-tagged input using the caller's pattern
3652 // TODO: error on count>1 and not find().
3653 // error on both matches() and lookingAt().
3654 //
3655 for (i=0; i<numFinds; i++) {
3656 if (useMatchesFunc) {
3657 isMatch = matcher->matches(status);
3658 if (UTF8Matcher != NULL) {
3659 isUTF8Match = UTF8Matcher->matches(status);
3660 }
3661 } else if (useLookingAtFunc) {
3662 isMatch = matcher->lookingAt(status);
3663 if (UTF8Matcher != NULL) {
3664 isUTF8Match = UTF8Matcher->lookingAt(status);
3665 }
3666 } else {
3667 isMatch = matcher->find();
3668 if (UTF8Matcher != NULL) {
3669 isUTF8Match = UTF8Matcher->find();
3670 }
3671 }
3672 }
3673 matcher->setTrace(FALSE);
3674 if (U_FAILURE(status)) {
3675 errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3676 }
3677
3678 //
3679 // Match up the groups from the find() with the groups from the tags
3680 //
3681
3682 // number of tags should match number of groups from find operation.
3683 // matcher->groupCount does not include group 0, the entire match, hence the +1.
3684 // G option in test means that capture group data is not available in the
3685 // expected results, so the check needs to be suppressed.
3686 if (isMatch == FALSE && groupStarts.size() != 0) {
3687 dataerrln("Error at line %d: Match expected, but none found.", line);
3688 failed = TRUE;
3689 goto cleanupAndReturn;
3690 } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3691 errln("Error at line %d: Match expected, but none found. (UTF8)", line);
3692 failed = TRUE;
3693 goto cleanupAndReturn;
3694 }
3695
3696 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3697 // Only check for match / no match. Don't check capture groups.
3698 if (isMatch && groupStarts.size() == 0) {
3699 errln("Error at line %d: No match expected, but one found.", line);
3700 failed = TRUE;
3701 } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
3702 errln("Error at line %d: No match expected, but one found. (UTF8)", line);
3703 failed = TRUE;
3704 }
3705 goto cleanupAndReturn;
3706 }
3707
3708 REGEX_CHECK_STATUS_L(line);
3709 for (i=0; i<=matcher->groupCount(); i++) {
3710 int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3711 int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3712 if (matcher->start(i, status) != expectedStart) {
3713 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
3714 line, i, expectedStart, matcher->start(i, status));
3715 failed = TRUE;
3716 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3717 } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3718 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
3719 line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3720 failed = TRUE;
3721 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3722 }
3723
3724 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3725 int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3726 if (matcher->end(i, status) != expectedEnd) {
3727 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
3728 line, i, expectedEnd, matcher->end(i, status));
3729 failed = TRUE;
3730 // Error on end position; keep going; real error is probably yet to come as group
3731 // end positions work from end of the input data towards the front.
3732 } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3733 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
3734 line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3735 failed = TRUE;
3736 // Error on end position; keep going; real error is probably yet to come as group
3737 // end positions work from end of the input data towards the front.
3738 }
3739 }
3740 if ( matcher->groupCount()+1 < groupStarts.size()) {
3741 errln("Error at line %d: Expected %d capture groups, found %d.",
3742 line, groupStarts.size()-1, matcher->groupCount());
3743 failed = TRUE;
3744 }
3745 else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3746 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3747 line, groupStarts.size()-1, UTF8Matcher->groupCount());
3748 failed = TRUE;
3749 }
3750
3751 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3752 matcher->requireEnd() == TRUE) {
3753 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line);
3754 failed = TRUE;
3755 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3756 UTF8Matcher->requireEnd() == TRUE) {
3757 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line);
3758 failed = TRUE;
3759 }
3760
3761 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3762 matcher->requireEnd() == FALSE) {
3763 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line);
3764 failed = TRUE;
3765 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false
3766 UTF8Matcher->requireEnd() == FALSE) {
3767 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line);
3768 failed = TRUE;
3769 }
3770
3771 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3772 matcher->hitEnd() == TRUE) {
3773 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line);
3774 failed = TRUE;
3775 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3776 UTF8Matcher->hitEnd() == TRUE) {
3777 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line);
3778 failed = TRUE;
3779 }
3780
3781 if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3782 matcher->hitEnd() == FALSE) {
3783 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line);
3784 failed = TRUE;
3785 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3786 UTF8Matcher->hitEnd() == FALSE) {
3787 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line);
3788 failed = TRUE;
3789 }
3790
3791
3792 cleanupAndReturn:
3793 if (failed) {
3794 infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" "
3795 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");
3796 // callerPattern->dump();
3797 }
3798 delete parseMatcher;
3799 delete parsePat;
3800 delete UTF8Matcher;
3801 delete UTF8Pattern;
3802 delete matcher;
3803 delete callerPattern;
3804
3805 utext_close(&inputText);
3806 delete[] inputChars;
3807 utext_close(&patternText);
3808 delete[] patternChars;
3809 ucnv_close(UTF8Converter);
3810 }
3811
3812
3813
3814
3815 //---------------------------------------------------------------------------
3816 //
3817 // Errors Check for error handling in patterns.
3818 //
3819 //---------------------------------------------------------------------------
Errors()3820 void RegexTest::Errors() {
3821 // \escape sequences that aren't implemented yet.
3822 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3823
3824 // Missing close parentheses
3825 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3826 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3827 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3828
3829 // Extra close paren
3830 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3831 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3832 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3833
3834 // Look-ahead, Look-behind
3835 // TODO: add tests for unbounded length look-behinds.
3836 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
3837
3838 // Attempt to use non-default flags
3839 {
3840 UParseError pe;
3841 UErrorCode status = U_ZERO_ERROR;
3842 int32_t flags = UREGEX_CANON_EQ |
3843 UREGEX_COMMENTS | UREGEX_DOTALL |
3844 UREGEX_MULTILINE;
3845 RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3846 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3847 delete pat1;
3848 }
3849
3850
3851 // Quantifiers are allowed only after something that can be quantified.
3852 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3853 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3854 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3855
3856 // Mal-formed {min,max} quantifiers
3857 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3858 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3859 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3860 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3861 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3862 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3863 REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan
3864 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format
3865 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3866
3867 // Ticket 5389
3868 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3869
3870 // Invalid Back Reference \0
3871 // For ICU 3.8 and earlier
3872 // For ICU versions newer than 3.8, \0 introduces an octal escape.
3873 //
3874 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3875
3876 }
3877
3878
3879 //-------------------------------------------------------------------------------
3880 //
3881 // Read a text data file, convert it to UChars, and return the data
3882 // in one big UChar * buffer, which the caller must release with delete[].
3883 //
3884 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int32_t & ulen,const char * defEncoding,UErrorCode & status)3885 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3886 const char *defEncoding, UErrorCode &status) {
3887 UChar *retPtr = NULL;
3888 char *fileBuf = NULL;
3889 UConverter* conv = NULL;
3890 FILE *f = NULL;
3891
3892 ulen = 0;
3893 if (U_FAILURE(status)) {
3894 return retPtr;
3895 }
3896
3897 //
3898 // Open the file.
3899 //
3900 f = fopen(fileName, "rb");
3901 if (f == 0) {
3902 dataerrln("Error opening test data file %s\n", fileName);
3903 status = U_FILE_ACCESS_ERROR;
3904 return NULL;
3905 }
3906 //
3907 // Read it in
3908 //
3909 int32_t fileSize;
3910 int32_t amt_read;
3911
3912 fseek( f, 0, SEEK_END);
3913 fileSize = ftell(f);
3914 fileBuf = new char[fileSize];
3915 fseek(f, 0, SEEK_SET);
3916 amt_read = fread(fileBuf, 1, fileSize, f);
3917 if (amt_read != fileSize || fileSize <= 0) {
3918 errln("Error reading test data file.");
3919 goto cleanUpAndReturn;
3920 }
3921
3922 //
3923 // Look for a Unicode Signature (BOM) on the data just read
3924 //
3925 int32_t signatureLength;
3926 const char * fileBufC;
3927 const char* encoding;
3928
3929 fileBufC = fileBuf;
3930 encoding = ucnv_detectUnicodeSignature(
3931 fileBuf, fileSize, &signatureLength, &status);
3932 if(encoding!=NULL ){
3933 fileBufC += signatureLength;
3934 fileSize -= signatureLength;
3935 } else {
3936 encoding = defEncoding;
3937 if (strcmp(encoding, "utf-8") == 0) {
3938 errln("file %s is missing its BOM", fileName);
3939 }
3940 }
3941
3942 //
3943 // Open a converter to take the rule file to UTF-16
3944 //
3945 conv = ucnv_open(encoding, &status);
3946 if (U_FAILURE(status)) {
3947 goto cleanUpAndReturn;
3948 }
3949
3950 //
3951 // Convert the rules to UChar.
3952 // Preflight first to determine required buffer size.
3953 //
3954 ulen = ucnv_toUChars(conv,
3955 NULL, // dest,
3956 0, // destCapacity,
3957 fileBufC,
3958 fileSize,
3959 &status);
3960 if (status == U_BUFFER_OVERFLOW_ERROR) {
3961 // Buffer Overflow is expected from the preflight operation.
3962 status = U_ZERO_ERROR;
3963
3964 retPtr = new UChar[ulen+1];
3965 ucnv_toUChars(conv,
3966 retPtr, // dest,
3967 ulen+1,
3968 fileBufC,
3969 fileSize,
3970 &status);
3971 }
3972
3973 cleanUpAndReturn:
3974 fclose(f);
3975 delete[] fileBuf;
3976 ucnv_close(conv);
3977 if (U_FAILURE(status)) {
3978 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3979 delete []retPtr;
3980 retPtr = 0;
3981 ulen = 0;
3982 };
3983 return retPtr;
3984 }
3985
3986
3987 //-------------------------------------------------------------------------------
3988 //
3989 // PerlTests - Run Perl's regular expression tests
3990 // The input file for this test is re_tests, the standard regular
3991 // expression test data distributed with the Perl source code.
3992 //
3993 // Here is Perl's description of the test data file:
3994 //
3995 // # The tests are in a separate file 't/op/re_tests'.
3996 // # Each line in that file is a separate test.
3997 // # There are five columns, separated by tabs.
3998 // #
3999 // # Column 1 contains the pattern, optionally enclosed in C<''>.
4000 // # Modifiers can be put after the closing C<'>.
4001 // #
4002 // # Column 2 contains the string to be matched.
4003 // #
4004 // # Column 3 contains the expected result:
4005 // # y expect a match
4006 // # n expect no match
4007 // # c expect an error
4008 // # B test exposes a known bug in Perl, should be skipped
4009 // # b test exposes a known bug in Perl, should be skipped if noamp
4010 // #
4011 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
4012 // #
4013 // # Column 4 contains a string, usually C<$&>.
4014 // #
4015 // # Column 5 contains the expected result of double-quote
4016 // # interpolating that string after the match, or start of error message.
4017 // #
4018 // # Column 6, if present, contains a reason why the test is skipped.
4019 // # This is printed with "skipped", for harness to pick up.
4020 // #
4021 // # \n in the tests are interpolated, as are variables of the form ${\w+}.
4022 // #
4023 // # If you want to add a regular expression test that can't be expressed
4024 // # in this format, don't add it here: put it in op/pat.t instead.
4025 //
4026 // For ICU, if field 3 contains an 'i', the test will be skipped.
4027 // The test exposes some known incompatibility between ICU and Perl regexps.
4028 // (The i is in addition to whatever was there before.)
4029 //
4030 //-------------------------------------------------------------------------------
PerlTests()4031 void RegexTest::PerlTests() {
4032 char tdd[2048];
4033 const char *srcPath;
4034 UErrorCode status = U_ZERO_ERROR;
4035 UParseError pe;
4036
4037 //
4038 // Open and read the test data file.
4039 //
4040 srcPath=getPath(tdd, "re_tests.txt");
4041 if(srcPath==NULL) {
4042 return; /* something went wrong, error already output */
4043 }
4044
4045 int32_t len;
4046 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4047 if (U_FAILURE(status)) {
4048 return; /* something went wrong, error already output */
4049 }
4050
4051 //
4052 // Put the test data into a UnicodeString
4053 //
4054 UnicodeString testDataString(FALSE, testData, len);
4055
4056 //
4057 // Regex to break the input file into lines, and strip the new lines.
4058 // One line per match, capture group one is the desired data.
4059 //
4060 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4061 if (U_FAILURE(status)) {
4062 dataerrln("RegexPattern::compile() error");
4063 return;
4064 }
4065 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4066
4067 //
4068 // Regex to split a test file line into fields.
4069 // There are six fields, separated by tabs.
4070 //
4071 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4072
4073 //
4074 // Regex to identify test patterns with flag settings, and to separate them.
4075 // Test patterns with flags look like 'pattern'i
4076 // Test patterns without flags are not quoted: pattern
4077 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4078 //
4079 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4080 RegexMatcher* flagMat = flagPat->matcher(status);
4081
4082 //
4083 // The Perl tests reference several perl-isms, which are evaluated/substituted
4084 // in the test data. Not being perl, this must be done explicitly. Here
4085 // are string constants and REs for these constructs.
4086 //
4087 UnicodeString nulnulSrc("${nulnul}");
4088 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4089 nulnul = nulnul.unescape();
4090
4091 UnicodeString ffffSrc("${ffff}");
4092 UnicodeString ffff("\\uffff", -1, US_INV);
4093 ffff = ffff.unescape();
4094
4095 // regexp for $-[0], $+[2], etc.
4096 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4097 RegexMatcher *groupsMat = groupsPat->matcher(status);
4098
4099 // regexp for $0, $1, $2, etc.
4100 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4101 RegexMatcher *cgMat = cgPat->matcher(status);
4102
4103
4104 //
4105 // Main Loop for the Perl Tests, runs once per line from the
4106 // test data file.
4107 //
4108 int32_t lineNum = 0;
4109 int32_t skippedUnimplementedCount = 0;
4110 while (lineMat->find()) {
4111 lineNum++;
4112
4113 //
4114 // Get a line, break it into its fields, do the Perl
4115 // variable substitutions.
4116 //
4117 UnicodeString line = lineMat->group(1, status);
4118 UnicodeString fields[7];
4119 fieldPat->split(line, fields, 7, status);
4120
4121 flagMat->reset(fields[0]);
4122 flagMat->matches(status);
4123 UnicodeString pattern = flagMat->group(2, status);
4124 pattern.findAndReplace("${bang}", "!");
4125 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4126 pattern.findAndReplace(ffffSrc, ffff);
4127
4128 //
4129 // Identify patterns that include match flag settings,
4130 // split off the flags, remove the extra quotes.
4131 //
4132 UnicodeString flagStr = flagMat->group(3, status);
4133 if (U_FAILURE(status)) {
4134 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4135 return;
4136 }
4137 int32_t flags = 0;
4138 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4139 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4140 const UChar UChar_m = 0x6d;
4141 const UChar UChar_x = 0x78;
4142 const UChar UChar_y = 0x79;
4143 if (flagStr.indexOf(UChar_i) != -1) {
4144 flags |= UREGEX_CASE_INSENSITIVE;
4145 }
4146 if (flagStr.indexOf(UChar_m) != -1) {
4147 flags |= UREGEX_MULTILINE;
4148 }
4149 if (flagStr.indexOf(UChar_x) != -1) {
4150 flags |= UREGEX_COMMENTS;
4151 }
4152
4153 //
4154 // Compile the test pattern.
4155 //
4156 status = U_ZERO_ERROR;
4157 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4158 if (status == U_REGEX_UNIMPLEMENTED) {
4159 //
4160 // Test of a feature that is planned for ICU, but not yet implemented.
4161 // skip the test.
4162 skippedUnimplementedCount++;
4163 delete testPat;
4164 status = U_ZERO_ERROR;
4165 continue;
4166 }
4167
4168 if (U_FAILURE(status)) {
4169 // Some tests are supposed to generate errors.
4170 // Only report an error for tests that are supposed to succeed.
4171 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4172 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4173 {
4174 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4175 }
4176 status = U_ZERO_ERROR;
4177 delete testPat;
4178 continue;
4179 }
4180
4181 if (fields[2].indexOf(UChar_i) >= 0) {
4182 // ICU should skip this test.
4183 delete testPat;
4184 continue;
4185 }
4186
4187 if (fields[2].indexOf(UChar_c) >= 0) {
4188 // This pattern should have caused a compilation error, but didn't/
4189 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4190 delete testPat;
4191 continue;
4192 }
4193
4194 //
4195 // replace the Perl variables that appear in some of the
4196 // match data strings.
4197 //
4198 UnicodeString matchString = fields[1];
4199 matchString.findAndReplace(nulnulSrc, nulnul);
4200 matchString.findAndReplace(ffffSrc, ffff);
4201
4202 // Replace any \n in the match string with an actual new-line char.
4203 // Don't do full unescape, as this unescapes more than Perl does, which
4204 // causes other spurious failures in the tests.
4205 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4206
4207
4208
4209 //
4210 // Run the test, check for expected match/don't match result.
4211 //
4212 RegexMatcher *testMat = testPat->matcher(matchString, status);
4213 UBool found = testMat->find();
4214 UBool expected = FALSE;
4215 if (fields[2].indexOf(UChar_y) >=0) {
4216 expected = TRUE;
4217 }
4218 if (expected != found) {
4219 errln("line %d: Expected %smatch, got %smatch",
4220 lineNum, expected?"":"no ", found?"":"no " );
4221 continue;
4222 }
4223
4224 // Don't try to check expected results if there is no match.
4225 // (Some have stuff in the expected fields)
4226 if (!found) {
4227 delete testMat;
4228 delete testPat;
4229 continue;
4230 }
4231
4232 //
4233 // Interpret the Perl expression from the fourth field of the data file,
4234 // building up an ICU string from the results of the ICU match.
4235 // The Perl expression will contain references to the results of
4236 // a regex match, including the matched string, capture group strings,
4237 // group starting and ending indicies, etc.
4238 //
4239 UnicodeString resultString;
4240 UnicodeString perlExpr = fields[3];
4241 #if SUPPORT_MUTATING_INPUT_STRING
4242 groupsMat->reset(perlExpr);
4243 cgMat->reset(perlExpr);
4244 #endif
4245
4246 while (perlExpr.length() > 0) {
4247 #if !SUPPORT_MUTATING_INPUT_STRING
4248 // Perferred usage. Reset after any modification to input string.
4249 groupsMat->reset(perlExpr);
4250 cgMat->reset(perlExpr);
4251 #endif
4252
4253 if (perlExpr.startsWith("$&")) {
4254 resultString.append(testMat->group(status));
4255 perlExpr.remove(0, 2);
4256 }
4257
4258 else if (groupsMat->lookingAt(status)) {
4259 // $-[0] $+[2] etc.
4260 UnicodeString digitString = groupsMat->group(2, status);
4261 int32_t t = 0;
4262 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4263 UnicodeString plusOrMinus = groupsMat->group(1, status);
4264 int32_t matchPosition;
4265 if (plusOrMinus.compare("+") == 0) {
4266 matchPosition = testMat->end(groupNum, status);
4267 } else {
4268 matchPosition = testMat->start(groupNum, status);
4269 }
4270 if (matchPosition != -1) {
4271 ICU_Utility::appendNumber(resultString, matchPosition);
4272 }
4273 perlExpr.remove(0, groupsMat->end(status));
4274 }
4275
4276 else if (cgMat->lookingAt(status)) {
4277 // $1, $2, $3, etc.
4278 UnicodeString digitString = cgMat->group(1, status);
4279 int32_t t = 0;
4280 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4281 if (U_SUCCESS(status)) {
4282 resultString.append(testMat->group(groupNum, status));
4283 status = U_ZERO_ERROR;
4284 }
4285 perlExpr.remove(0, cgMat->end(status));
4286 }
4287
4288 else if (perlExpr.startsWith("@-")) {
4289 int32_t i;
4290 for (i=0; i<=testMat->groupCount(); i++) {
4291 if (i>0) {
4292 resultString.append(" ");
4293 }
4294 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4295 }
4296 perlExpr.remove(0, 2);
4297 }
4298
4299 else if (perlExpr.startsWith("@+")) {
4300 int32_t i;
4301 for (i=0; i<=testMat->groupCount(); i++) {
4302 if (i>0) {
4303 resultString.append(" ");
4304 }
4305 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4306 }
4307 perlExpr.remove(0, 2);
4308 }
4309
4310 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4311 // or as an escaped sequence (e.g. \n)
4312 if (perlExpr.length() > 1) {
4313 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4314 }
4315 UChar c = perlExpr.charAt(0);
4316 switch (c) {
4317 case 'n': c = '\n'; break;
4318 // add any other escape sequences that show up in the test expected results.
4319 }
4320 resultString.append(c);
4321 perlExpr.remove(0, 1);
4322 }
4323
4324 else {
4325 // Any characters from the perl expression that we don't explicitly
4326 // recognize before here are assumed to be literals and copied
4327 // as-is to the expected results.
4328 resultString.append(perlExpr.charAt(0));
4329 perlExpr.remove(0, 1);
4330 }
4331
4332 if (U_FAILURE(status)) {
4333 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4334 break;
4335 }
4336 }
4337
4338 //
4339 // Expected Results Compare
4340 //
4341 UnicodeString expectedS(fields[4]);
4342 expectedS.findAndReplace(nulnulSrc, nulnul);
4343 expectedS.findAndReplace(ffffSrc, ffff);
4344 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4345
4346
4347 if (expectedS.compare(resultString) != 0) {
4348 err("Line %d: Incorrect perl expression results.", lineNum);
4349 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4350 }
4351
4352 delete testMat;
4353 delete testPat;
4354 }
4355
4356 //
4357 // All done. Clean up allocated stuff.
4358 //
4359 delete cgMat;
4360 delete cgPat;
4361
4362 delete groupsMat;
4363 delete groupsPat;
4364
4365 delete flagMat;
4366 delete flagPat;
4367
4368 delete lineMat;
4369 delete linePat;
4370
4371 delete fieldPat;
4372 delete [] testData;
4373
4374
4375 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4376
4377 }
4378
4379
4380 //-------------------------------------------------------------------------------
4381 //
4382 // PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
4383 // (instead of using UnicodeStrings) to test the alternate engine.
4384 // The input file for this test is re_tests, the standard regular
4385 // expression test data distributed with the Perl source code.
4386 // See PerlTests() for more information.
4387 //
4388 //-------------------------------------------------------------------------------
PerlTestsUTF8()4389 void RegexTest::PerlTestsUTF8() {
4390 char tdd[2048];
4391 const char *srcPath;
4392 UErrorCode status = U_ZERO_ERROR;
4393 UParseError pe;
4394 LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4395 UText patternText = UTEXT_INITIALIZER;
4396 char *patternChars = NULL;
4397 int32_t patternLength;
4398 int32_t patternCapacity = 0;
4399 UText inputText = UTEXT_INITIALIZER;
4400 char *inputChars = NULL;
4401 int32_t inputLength;
4402 int32_t inputCapacity = 0;
4403
4404 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4405
4406 //
4407 // Open and read the test data file.
4408 //
4409 srcPath=getPath(tdd, "re_tests.txt");
4410 if(srcPath==NULL) {
4411 return; /* something went wrong, error already output */
4412 }
4413
4414 int32_t len;
4415 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4416 if (U_FAILURE(status)) {
4417 return; /* something went wrong, error already output */
4418 }
4419
4420 //
4421 // Put the test data into a UnicodeString
4422 //
4423 UnicodeString testDataString(FALSE, testData, len);
4424
4425 //
4426 // Regex to break the input file into lines, and strip the new lines.
4427 // One line per match, capture group one is the desired data.
4428 //
4429 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4430 if (U_FAILURE(status)) {
4431 dataerrln("RegexPattern::compile() error");
4432 return;
4433 }
4434 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4435
4436 //
4437 // Regex to split a test file line into fields.
4438 // There are six fields, separated by tabs.
4439 //
4440 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4441
4442 //
4443 // Regex to identify test patterns with flag settings, and to separate them.
4444 // Test patterns with flags look like 'pattern'i
4445 // Test patterns without flags are not quoted: pattern
4446 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4447 //
4448 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4449 RegexMatcher* flagMat = flagPat->matcher(status);
4450
4451 //
4452 // The Perl tests reference several perl-isms, which are evaluated/substituted
4453 // in the test data. Not being perl, this must be done explicitly. Here
4454 // are string constants and REs for these constructs.
4455 //
4456 UnicodeString nulnulSrc("${nulnul}");
4457 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4458 nulnul = nulnul.unescape();
4459
4460 UnicodeString ffffSrc("${ffff}");
4461 UnicodeString ffff("\\uffff", -1, US_INV);
4462 ffff = ffff.unescape();
4463
4464 // regexp for $-[0], $+[2], etc.
4465 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4466 RegexMatcher *groupsMat = groupsPat->matcher(status);
4467
4468 // regexp for $0, $1, $2, etc.
4469 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4470 RegexMatcher *cgMat = cgPat->matcher(status);
4471
4472
4473 //
4474 // Main Loop for the Perl Tests, runs once per line from the
4475 // test data file.
4476 //
4477 int32_t lineNum = 0;
4478 int32_t skippedUnimplementedCount = 0;
4479 while (lineMat->find()) {
4480 lineNum++;
4481
4482 //
4483 // Get a line, break it into its fields, do the Perl
4484 // variable substitutions.
4485 //
4486 UnicodeString line = lineMat->group(1, status);
4487 UnicodeString fields[7];
4488 fieldPat->split(line, fields, 7, status);
4489
4490 flagMat->reset(fields[0]);
4491 flagMat->matches(status);
4492 UnicodeString pattern = flagMat->group(2, status);
4493 pattern.findAndReplace("${bang}", "!");
4494 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4495 pattern.findAndReplace(ffffSrc, ffff);
4496
4497 //
4498 // Identify patterns that include match flag settings,
4499 // split off the flags, remove the extra quotes.
4500 //
4501 UnicodeString flagStr = flagMat->group(3, status);
4502 if (U_FAILURE(status)) {
4503 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4504 return;
4505 }
4506 int32_t flags = 0;
4507 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4508 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4509 const UChar UChar_m = 0x6d;
4510 const UChar UChar_x = 0x78;
4511 const UChar UChar_y = 0x79;
4512 if (flagStr.indexOf(UChar_i) != -1) {
4513 flags |= UREGEX_CASE_INSENSITIVE;
4514 }
4515 if (flagStr.indexOf(UChar_m) != -1) {
4516 flags |= UREGEX_MULTILINE;
4517 }
4518 if (flagStr.indexOf(UChar_x) != -1) {
4519 flags |= UREGEX_COMMENTS;
4520 }
4521
4522 //
4523 // Put the pattern in a UTF-8 UText
4524 //
4525 status = U_ZERO_ERROR;
4526 patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4527 if (status == U_BUFFER_OVERFLOW_ERROR) {
4528 status = U_ZERO_ERROR;
4529 delete[] patternChars;
4530 patternCapacity = patternLength + 1;
4531 patternChars = new char[patternCapacity];
4532 pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4533 }
4534 utext_openUTF8(&patternText, patternChars, patternLength, &status);
4535
4536 //
4537 // Compile the test pattern.
4538 //
4539 RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4540 if (status == U_REGEX_UNIMPLEMENTED) {
4541 //
4542 // Test of a feature that is planned for ICU, but not yet implemented.
4543 // skip the test.
4544 skippedUnimplementedCount++;
4545 delete testPat;
4546 status = U_ZERO_ERROR;
4547 continue;
4548 }
4549
4550 if (U_FAILURE(status)) {
4551 // Some tests are supposed to generate errors.
4552 // Only report an error for tests that are supposed to succeed.
4553 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4554 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4555 {
4556 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4557 }
4558 status = U_ZERO_ERROR;
4559 delete testPat;
4560 continue;
4561 }
4562
4563 if (fields[2].indexOf(UChar_i) >= 0) {
4564 // ICU should skip this test.
4565 delete testPat;
4566 continue;
4567 }
4568
4569 if (fields[2].indexOf(UChar_c) >= 0) {
4570 // This pattern should have caused a compilation error, but didn't/
4571 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4572 delete testPat;
4573 continue;
4574 }
4575
4576
4577 //
4578 // replace the Perl variables that appear in some of the
4579 // match data strings.
4580 //
4581 UnicodeString matchString = fields[1];
4582 matchString.findAndReplace(nulnulSrc, nulnul);
4583 matchString.findAndReplace(ffffSrc, ffff);
4584
4585 // Replace any \n in the match string with an actual new-line char.
4586 // Don't do full unescape, as this unescapes more than Perl does, which
4587 // causes other spurious failures in the tests.
4588 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4589
4590 //
4591 // Put the input in a UTF-8 UText
4592 //
4593 status = U_ZERO_ERROR;
4594 inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4595 if (status == U_BUFFER_OVERFLOW_ERROR) {
4596 status = U_ZERO_ERROR;
4597 delete[] inputChars;
4598 inputCapacity = inputLength + 1;
4599 inputChars = new char[inputCapacity];
4600 matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4601 }
4602 utext_openUTF8(&inputText, inputChars, inputLength, &status);
4603
4604 //
4605 // Run the test, check for expected match/don't match result.
4606 //
4607 RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4608 UBool found = testMat->find();
4609 UBool expected = FALSE;
4610 if (fields[2].indexOf(UChar_y) >=0) {
4611 expected = TRUE;
4612 }
4613 if (expected != found) {
4614 errln("line %d: Expected %smatch, got %smatch",
4615 lineNum, expected?"":"no ", found?"":"no " );
4616 continue;
4617 }
4618
4619 // Don't try to check expected results if there is no match.
4620 // (Some have stuff in the expected fields)
4621 if (!found) {
4622 delete testMat;
4623 delete testPat;
4624 continue;
4625 }
4626
4627 //
4628 // Interpret the Perl expression from the fourth field of the data file,
4629 // building up an ICU string from the results of the ICU match.
4630 // The Perl expression will contain references to the results of
4631 // a regex match, including the matched string, capture group strings,
4632 // group starting and ending indicies, etc.
4633 //
4634 UnicodeString resultString;
4635 UnicodeString perlExpr = fields[3];
4636
4637 while (perlExpr.length() > 0) {
4638 groupsMat->reset(perlExpr);
4639 cgMat->reset(perlExpr);
4640
4641 if (perlExpr.startsWith("$&")) {
4642 resultString.append(testMat->group(status));
4643 perlExpr.remove(0, 2);
4644 }
4645
4646 else if (groupsMat->lookingAt(status)) {
4647 // $-[0] $+[2] etc.
4648 UnicodeString digitString = groupsMat->group(2, status);
4649 int32_t t = 0;
4650 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4651 UnicodeString plusOrMinus = groupsMat->group(1, status);
4652 int32_t matchPosition;
4653 if (plusOrMinus.compare("+") == 0) {
4654 matchPosition = testMat->end(groupNum, status);
4655 } else {
4656 matchPosition = testMat->start(groupNum, status);
4657 }
4658 if (matchPosition != -1) {
4659 ICU_Utility::appendNumber(resultString, matchPosition);
4660 }
4661 perlExpr.remove(0, groupsMat->end(status));
4662 }
4663
4664 else if (cgMat->lookingAt(status)) {
4665 // $1, $2, $3, etc.
4666 UnicodeString digitString = cgMat->group(1, status);
4667 int32_t t = 0;
4668 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4669 if (U_SUCCESS(status)) {
4670 resultString.append(testMat->group(groupNum, status));
4671 status = U_ZERO_ERROR;
4672 }
4673 perlExpr.remove(0, cgMat->end(status));
4674 }
4675
4676 else if (perlExpr.startsWith("@-")) {
4677 int32_t i;
4678 for (i=0; i<=testMat->groupCount(); i++) {
4679 if (i>0) {
4680 resultString.append(" ");
4681 }
4682 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4683 }
4684 perlExpr.remove(0, 2);
4685 }
4686
4687 else if (perlExpr.startsWith("@+")) {
4688 int32_t i;
4689 for (i=0; i<=testMat->groupCount(); i++) {
4690 if (i>0) {
4691 resultString.append(" ");
4692 }
4693 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4694 }
4695 perlExpr.remove(0, 2);
4696 }
4697
4698 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4699 // or as an escaped sequence (e.g. \n)
4700 if (perlExpr.length() > 1) {
4701 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4702 }
4703 UChar c = perlExpr.charAt(0);
4704 switch (c) {
4705 case 'n': c = '\n'; break;
4706 // add any other escape sequences that show up in the test expected results.
4707 }
4708 resultString.append(c);
4709 perlExpr.remove(0, 1);
4710 }
4711
4712 else {
4713 // Any characters from the perl expression that we don't explicitly
4714 // recognize before here are assumed to be literals and copied
4715 // as-is to the expected results.
4716 resultString.append(perlExpr.charAt(0));
4717 perlExpr.remove(0, 1);
4718 }
4719
4720 if (U_FAILURE(status)) {
4721 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4722 break;
4723 }
4724 }
4725
4726 //
4727 // Expected Results Compare
4728 //
4729 UnicodeString expectedS(fields[4]);
4730 expectedS.findAndReplace(nulnulSrc, nulnul);
4731 expectedS.findAndReplace(ffffSrc, ffff);
4732 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4733
4734
4735 if (expectedS.compare(resultString) != 0) {
4736 err("Line %d: Incorrect perl expression results.", lineNum);
4737 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4738 }
4739
4740 delete testMat;
4741 delete testPat;
4742 }
4743
4744 //
4745 // All done. Clean up allocated stuff.
4746 //
4747 delete cgMat;
4748 delete cgPat;
4749
4750 delete groupsMat;
4751 delete groupsPat;
4752
4753 delete flagMat;
4754 delete flagPat;
4755
4756 delete lineMat;
4757 delete linePat;
4758
4759 delete fieldPat;
4760 delete [] testData;
4761
4762 utext_close(&patternText);
4763 utext_close(&inputText);
4764
4765 delete [] patternChars;
4766 delete [] inputChars;
4767
4768
4769 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4770
4771 }
4772
4773
4774 //--------------------------------------------------------------
4775 //
4776 // Bug6149 Verify limits to heap expansion for backtrack stack.
4777 // Use this pattern,
4778 // "(a?){1,8000000}"
4779 // Note: was an unbounded upperbounds, but that now has loop-breaking enabled.
4780 // This test is likely to be fragile, as further optimizations stop
4781 // more cases of pointless looping in the match engine.
4782 //
4783 //---------------------------------------------------------------
Bug6149()4784 void RegexTest::Bug6149() {
4785 UnicodeString pattern("(a?){1,8000000}");
4786 UnicodeString s("xyz");
4787 uint32_t flags = 0;
4788 UErrorCode status = U_ZERO_ERROR;
4789
4790 RegexMatcher matcher(pattern, s, flags, status);
4791 UBool result = false;
4792 REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4793 REGEX_ASSERT(result == FALSE);
4794 }
4795
4796
//
//   Callbacks()    Test the callback function.
//                  When set, callbacks occur periodically during matching operations,
//                  giving the application code the ability to abort the operation
//                  before its normal completion.
//
4803
4804 struct callBackContext {
4805 RegexTest *test;
4806 int32_t maxCalls;
4807 int32_t numCalls;
4808 int32_t lastSteps;
resetcallBackContext4809 void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4810 };
4811
4812 U_CDECL_BEGIN
4813 static UBool U_CALLCONV
testCallBackFn(const void * context,int32_t steps)4814 testCallBackFn(const void *context, int32_t steps) {
4815 callBackContext *info = (callBackContext *)context;
4816 if (info->lastSteps+1 != steps) {
4817 info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps);
4818 }
4819 info->lastSteps = steps;
4820 info->numCalls++;
4821 return (info->numCalls < info->maxCalls);
4822 }
4823 U_CDECL_END
4824
Callbacks()4825 void RegexTest::Callbacks() {
4826 {
4827 // Getter returns NULLs if no callback has been set
4828
4829 // The variables that the getter will fill in.
4830 // Init to non-null values so that the action of the getter can be seen.
4831 const void *returnedContext = &returnedContext;
4832 URegexMatchCallback *returnedFn = &testCallBackFn;
4833
4834 UErrorCode status = U_ZERO_ERROR;
4835 RegexMatcher matcher("x", 0, status);
4836 REGEX_CHECK_STATUS;
4837 matcher.getMatchCallback(returnedFn, returnedContext, status);
4838 REGEX_CHECK_STATUS;
4839 REGEX_ASSERT(returnedFn == NULL);
4840 REGEX_ASSERT(returnedContext == NULL);
4841 }
4842
4843 {
4844 // Set and Get work
4845 callBackContext cbInfo = {this, 0, 0, 0};
4846 const void *returnedContext;
4847 URegexMatchCallback *returnedFn;
4848 UErrorCode status = U_ZERO_ERROR;
4849 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.
4850 REGEX_CHECK_STATUS;
4851 matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4852 REGEX_CHECK_STATUS;
4853 matcher.getMatchCallback(returnedFn, returnedContext, status);
4854 REGEX_CHECK_STATUS;
4855 REGEX_ASSERT(returnedFn == testCallBackFn);
4856 REGEX_ASSERT(returnedContext == &cbInfo);
4857
4858 // A short-running match shouldn't invoke the callback
4859 status = U_ZERO_ERROR;
4860 cbInfo.reset(1);
4861 UnicodeString s = "xxx";
4862 matcher.reset(s);
4863 REGEX_ASSERT(matcher.matches(status));
4864 REGEX_CHECK_STATUS;
4865 REGEX_ASSERT(cbInfo.numCalls == 0);
4866
4867 // A medium-length match that runs long enough to invoke the
4868 // callback, but not so long that the callback aborts it.
4869 status = U_ZERO_ERROR;
4870 cbInfo.reset(4);
4871 s = "aaaaaaaaaaaaaaaaaaab";
4872 matcher.reset(s);
4873 REGEX_ASSERT(matcher.matches(status)==FALSE);
4874 REGEX_CHECK_STATUS;
4875 REGEX_ASSERT(cbInfo.numCalls > 0);
4876
4877 // A longer running match that the callback function will abort.
4878 status = U_ZERO_ERROR;
4879 cbInfo.reset(4);
4880 s = "aaaaaaaaaaaaaaaaaaaaaaab";
4881 matcher.reset(s);
4882 REGEX_ASSERT(matcher.matches(status)==FALSE);
4883 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4884 REGEX_ASSERT(cbInfo.numCalls == 4);
4885
4886 // A longer running find that the callback function will abort.
4887 status = U_ZERO_ERROR;
4888 cbInfo.reset(4);
4889 s = "aaaaaaaaaaaaaaaaaaaaaaab";
4890 matcher.reset(s);
4891 REGEX_ASSERT(matcher.find(status)==FALSE);
4892 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4893 REGEX_ASSERT(cbInfo.numCalls == 4);
4894 }
4895
4896
4897 }
4898
4899
//
//   FindProgressCallbacks()    Test the find "progress" callback function.
//                  When set, the find progress callback will be invoked during a find operation
//                  after each return from a match attempt, giving the application the opportunity
//                  to terminate a long-running find operation before its normal completion.
//
4906
4907 struct progressCallBackContext {
4908 RegexTest *test;
4909 int64_t lastIndex;
4910 int32_t maxCalls;
4911 int32_t numCalls;
resetprogressCallBackContext4912 void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4913 };
4914
4915 // call-back function for find().
4916 // Return TRUE to continue the find().
4917 // Return FALSE to stop the find().
4918 U_CDECL_BEGIN
4919 static UBool U_CALLCONV
testProgressCallBackFn(const void * context,int64_t matchIndex)4920 testProgressCallBackFn(const void *context, int64_t matchIndex) {
4921 progressCallBackContext *info = (progressCallBackContext *)context;
4922 info->numCalls++;
4923 info->lastIndex = matchIndex;
4924 // info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4925 return (info->numCalls < info->maxCalls);
4926 }
4927 U_CDECL_END
4928
FindProgressCallbacks()4929 void RegexTest::FindProgressCallbacks() {
4930 {
4931 // Getter returns NULLs if no callback has been set
4932
4933 // The variables that the getter will fill in.
4934 // Init to non-null values so that the action of the getter can be seen.
4935 const void *returnedContext = &returnedContext;
4936 URegexFindProgressCallback *returnedFn = &testProgressCallBackFn;
4937
4938 UErrorCode status = U_ZERO_ERROR;
4939 RegexMatcher matcher("x", 0, status);
4940 REGEX_CHECK_STATUS;
4941 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4942 REGEX_CHECK_STATUS;
4943 REGEX_ASSERT(returnedFn == NULL);
4944 REGEX_ASSERT(returnedContext == NULL);
4945 }
4946
4947 {
4948 // Set and Get work
4949 progressCallBackContext cbInfo = {this, 0, 0, 0};
4950 const void *returnedContext;
4951 URegexFindProgressCallback *returnedFn;
4952 UErrorCode status = U_ZERO_ERROR;
4953 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
4954 REGEX_CHECK_STATUS;
4955 matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4956 REGEX_CHECK_STATUS;
4957 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4958 REGEX_CHECK_STATUS;
4959 REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4960 REGEX_ASSERT(returnedContext == &cbInfo);
4961
4962 // A find that matches on the initial position does NOT invoke the callback.
4963 status = U_ZERO_ERROR;
4964 cbInfo.reset(100);
4965 UnicodeString s = "aaxxx";
4966 matcher.reset(s);
4967 #if 0
4968 matcher.setTrace(TRUE);
4969 #endif
4970 REGEX_ASSERT(matcher.find(0, status));
4971 REGEX_CHECK_STATUS;
4972 REGEX_ASSERT(cbInfo.numCalls == 0);
4973
4974 // A medium running find() that causes matcher.find() to invoke our callback for each index,
4975 // but not so many times that we interrupt the operation.
4976 status = U_ZERO_ERROR;
4977 s = "aaaaaaaaaaaaaaaaaaab";
4978 cbInfo.reset(s.length()); // Some upper limit for number of calls that is greater than size of our input string
4979 matcher.reset(s);
4980 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4981 REGEX_CHECK_STATUS;
4982 REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4983
4984 // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4985 status = U_ZERO_ERROR;
4986 UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4987 cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string
4988 matcher.reset(s1);
4989 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4990 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4991 REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4992
4993 // Now a match that will succeed, but after an interruption
4994 status = U_ZERO_ERROR;
4995 UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4996 cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string
4997 matcher.reset(s2);
4998 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4999 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
5000 // Now retry the match from where left off
5001 cbInfo.maxCalls = 100; // No callback limit
5002 status = U_ZERO_ERROR;
5003 REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
5004 REGEX_CHECK_STATUS;
5005 }
5006
5007
5008 }
5009
5010
5011 //---------------------------------------------------------------------------
5012 //
5013 // PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
5014 // UTexts. The pure-C implementation of UText
5015 // has no mutable backing stores, but we can
5016 // use UnicodeString here to test the functionality.
5017 //
5018 //---------------------------------------------------------------------------
PreAllocatedUTextCAPI()5019 void RegexTest::PreAllocatedUTextCAPI () {
5020 UErrorCode status = U_ZERO_ERROR;
5021 URegularExpression *re;
5022 UText patternText = UTEXT_INITIALIZER;
5023 UnicodeString buffer;
5024 UText bufferText = UTEXT_INITIALIZER;
5025
5026 utext_openUnicodeString(&bufferText, &buffer, &status);
5027
5028 /*
5029 * getText() and getUText()
5030 */
5031 {
5032 UText text1 = UTEXT_INITIALIZER;
5033 UText text2 = UTEXT_INITIALIZER;
5034 UChar text2Chars[20];
5035 UText *resultText;
5036
5037 status = U_ZERO_ERROR;
5038 regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
5039 regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
5040 u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
5041 utext_openUChars(&text2, text2Chars, -1, &status);
5042
5043 regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
5044 re = uregex_openUText(&patternText, 0, NULL, &status);
5045
5046 /* First set a UText */
5047 uregex_setUText(re, &text1, &status);
5048 resultText = uregex_getUText(re, &bufferText, &status);
5049 REGEX_CHECK_STATUS;
5050 REGEX_ASSERT(resultText == &bufferText);
5051 utext_setNativeIndex(resultText, 0);
5052 utext_setNativeIndex(&text1, 0);
5053 REGEX_ASSERT(testUTextEqual(resultText, &text1));
5054
5055 resultText = uregex_getUText(re, &bufferText, &status);
5056 REGEX_CHECK_STATUS;
5057 REGEX_ASSERT(resultText == &bufferText);
5058 utext_setNativeIndex(resultText, 0);
5059 utext_setNativeIndex(&text1, 0);
5060 REGEX_ASSERT(testUTextEqual(resultText, &text1));
5061
5062 /* Then set a UChar * */
5063 uregex_setText(re, text2Chars, 7, &status);
5064 resultText = uregex_getUText(re, &bufferText, &status);
5065 REGEX_CHECK_STATUS;
5066 REGEX_ASSERT(resultText == &bufferText);
5067 utext_setNativeIndex(resultText, 0);
5068 utext_setNativeIndex(&text2, 0);
5069 REGEX_ASSERT(testUTextEqual(resultText, &text2));
5070
5071 uregex_close(re);
5072 utext_close(&text1);
5073 utext_close(&text2);
5074 }
5075
5076 /*
5077 * group()
5078 */
5079 {
5080 UChar text1[80];
5081 UText *actual;
5082 UBool result;
5083 int64_t length = 0;
5084
5085 u_uastrncpy(text1, "noise abc interior def, and this is off the end", UPRV_LENGTHOF(text1));
5086 // 012345678901234567890123456789012345678901234567
5087 // 0 1 2 3 4
5088
5089 status = U_ZERO_ERROR;
5090 re = uregex_openC("abc(.*?)def", 0, NULL, &status);
5091 REGEX_CHECK_STATUS;
5092
5093 uregex_setText(re, text1, -1, &status);
5094 result = uregex_find(re, 0, &status);
5095 REGEX_ASSERT(result==TRUE);
5096
5097 /* Capture Group 0, the full match. Should succeed. "abc interior def" */
5098 status = U_ZERO_ERROR;
5099 actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
5100 REGEX_CHECK_STATUS;
5101 REGEX_ASSERT(actual == &bufferText);
5102 REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
5103 REGEX_ASSERT(length == 16);
5104 REGEX_ASSERT(utext_nativeLength(actual) == 47);
5105
5106 /* Capture group #1. Should succeed, matching " interior ". */
5107 status = U_ZERO_ERROR;
5108 actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
5109 REGEX_CHECK_STATUS;
5110 REGEX_ASSERT(actual == &bufferText);
5111 REGEX_ASSERT(utext_getNativeIndex(actual) == 9); // position of " interior "
5112 REGEX_ASSERT(length == 10);
5113 REGEX_ASSERT(utext_nativeLength(actual) == 47);
5114
5115 /* Capture group out of range. Error. */
5116 status = U_ZERO_ERROR;
5117 actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
5118 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5119 REGEX_ASSERT(actual == &bufferText);
5120 uregex_close(re);
5121
5122 }
5123
5124 /*
5125 * replaceFirst()
5126 */
5127 {
5128 UChar text1[80];
5129 UChar text2[80];
5130 UText replText = UTEXT_INITIALIZER;
5131 UText *result;
5132 status = U_ZERO_ERROR;
5133 utext_openUnicodeString(&bufferText, &buffer, &status);
5134
5135 status = U_ZERO_ERROR;
5136 u_uastrncpy(text1, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1));
5137 u_uastrncpy(text2, "No match here.", UPRV_LENGTHOF(text2)/2);
5138 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5139
5140 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5141 REGEX_CHECK_STATUS;
5142
5143 /* Normal case, with match */
5144 uregex_setText(re, text1, -1, &status);
5145 REGEX_CHECK_STATUS;
5146 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5147 REGEX_CHECK_STATUS;
5148 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5149 REGEX_CHECK_STATUS;
5150 REGEX_ASSERT(result == &bufferText);
5151 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5152
5153 /* No match. Text should copy to output with no changes. */
5154 uregex_setText(re, text2, -1, &status);
5155 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5156 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5157 REGEX_CHECK_STATUS;
5158 REGEX_ASSERT(result == &bufferText);
5159 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5160
5161 /* Unicode escapes */
5162 uregex_setText(re, text1, -1, &status);
5163 regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
5164 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5165 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5166 REGEX_CHECK_STATUS;
5167 REGEX_ASSERT(result == &bufferText);
5168 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5169
5170 uregex_close(re);
5171 utext_close(&replText);
5172 }
5173
5174
5175 /*
5176 * replaceAll()
5177 */
5178 {
5179 UChar text1[80];
5180 UChar text2[80];
5181 UText replText = UTEXT_INITIALIZER;
5182 UText *result;
5183
5184 status = U_ZERO_ERROR;
5185 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
5186 u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
5187 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5188
5189 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5190 REGEX_CHECK_STATUS;
5191
5192 /* Normal case, with match */
5193 uregex_setText(re, text1, -1, &status);
5194 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5195 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5196 REGEX_CHECK_STATUS;
5197 REGEX_ASSERT(result == &bufferText);
5198 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5199
5200 /* No match. Text should copy to output with no changes. */
5201 uregex_setText(re, text2, -1, &status);
5202 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5203 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5204 REGEX_CHECK_STATUS;
5205 REGEX_ASSERT(result == &bufferText);
5206 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5207
5208 uregex_close(re);
5209 utext_close(&replText);
5210 }
5211
5212
5213 /*
5214 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5215 * so we don't need to test it here.
5216 */
5217
5218 utext_close(&bufferText);
5219 utext_close(&patternText);
5220 }
5221
5222
5223 //--------------------------------------------------------------
5224 //
5225 // NamedCapture Check basic named capture group functionality
5226 //
5227 //--------------------------------------------------------------
NamedCapture()5228 void RegexTest::NamedCapture() {
5229 UErrorCode status = U_ZERO_ERROR;
5230 RegexPattern *pat = RegexPattern::compile(UnicodeString(
5231 "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
5232 REGEX_CHECK_STATUS;
5233 int32_t group = pat->groupNumberFromName("five", -1, status);
5234 REGEX_CHECK_STATUS;
5235 REGEX_ASSERT(5 == group);
5236 group = pat->groupNumberFromName("three", -1, status);
5237 REGEX_CHECK_STATUS;
5238 REGEX_ASSERT(3 == group);
5239
5240 status = U_ZERO_ERROR;
5241 group = pat->groupNumberFromName(UnicodeString("six"), status);
5242 REGEX_CHECK_STATUS;
5243 REGEX_ASSERT(6 == group);
5244
5245 status = U_ZERO_ERROR;
5246 group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
5247 U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5248
5249 status = U_ZERO_ERROR;
5250
5251 // After copying a pattern, named capture should still work in the copy.
5252 RegexPattern *copiedPat = new RegexPattern(*pat);
5253 REGEX_ASSERT(*copiedPat == *pat);
5254 delete pat; pat = NULL; // Delete original, copy should have no references back to it.
5255
5256 group = copiedPat->groupNumberFromName("five", -1, status);
5257 REGEX_CHECK_STATUS;
5258 REGEX_ASSERT(5 == group);
5259 group = copiedPat->groupNumberFromName("three", -1, status);
5260 REGEX_CHECK_STATUS;
5261 REGEX_ASSERT(3 == group);
5262 delete copiedPat;
5263
5264 // ReplaceAll with named capture group.
5265 status = U_ZERO_ERROR;
5266 UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
5267 RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
5268 REGEX_CHECK_STATUS;
5269 // m.pattern().dumpPattern();
5270 UnicodeString replacedText = m->replaceAll("'${mid}'", status);
5271 REGEX_CHECK_STATUS;
5272 REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
5273 delete m;
5274
5275 // ReplaceAll, allowed capture group numbers.
5276 text = UnicodeString("abcmxyz");
5277 m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
5278 REGEX_CHECK_STATUS;
5279
5280 status = U_ZERO_ERROR;
5281 replacedText = m->replaceAll(UnicodeString("<$0>"), status); // group 0, full match, is allowed.
5282 REGEX_CHECK_STATUS;
5283 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
5284
5285 status = U_ZERO_ERROR;
5286 replacedText = m->replaceAll(UnicodeString("<$1>"), status); // group 1 by number.
5287 REGEX_CHECK_STATUS;
5288 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5289
5290 status = U_ZERO_ERROR;
5291 replacedText = m->replaceAll(UnicodeString("<${one}>"), status); // group 1 by name.
5292 REGEX_CHECK_STATUS;
5293 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5294
5295 status = U_ZERO_ERROR;
5296 replacedText = m->replaceAll(UnicodeString("<$2>"), status); // group 2.
5297 REGEX_CHECK_STATUS;
5298 REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
5299
5300 status = U_ZERO_ERROR;
5301 replacedText = m->replaceAll(UnicodeString("<$3>"), status);
5302 REGEX_CHECK_STATUS;
5303 REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
5304
5305 status = U_ZERO_ERROR;
5306 replacedText = m->replaceAll(UnicodeString("<$4>"), status);
5307 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5308
5309 status = U_ZERO_ERROR;
5310 replacedText = m->replaceAll(UnicodeString("<$04>"), status); // group 0, leading 0,
5311 REGEX_CHECK_STATUS; // trailing out-of-range 4 passes through.
5312 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
5313
5314 status = U_ZERO_ERROR;
5315 replacedText = m->replaceAll(UnicodeString("<$000016>"), status); // Consume leading zeroes. Don't consume digits
5316 REGEX_CHECK_STATUS; // that push group num out of range.
5317 REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText); // This is group 1.
5318
5319 status = U_ZERO_ERROR;
5320 replacedText = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
5321 REGEX_CHECK_STATUS;
5322 REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
5323
5324 status = U_ZERO_ERROR;
5325 replacedText = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
5326 REGEX_CHECK_STATUS;
5327 REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
5328
5329 status = U_ZERO_ERROR;
5330 replacedText = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
5331 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5332
5333 status = U_ZERO_ERROR;
5334 replacedText = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
5335 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5336
5337 status = U_ZERO_ERROR;
5338 replacedText = m->replaceAll(UnicodeString("<${one"), status);
5339 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5340
5341 status = U_ZERO_ERROR;
5342 replacedText = m->replaceAll(UnicodeString("$not a capture group"), status);
5343 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5344
5345 delete m;
5346
5347 // Repeat the above replaceAll() tests using the plain C API, which
5348 // has a separate implementation internally.
5349 // TODO: factor out the test data.
5350
5351 status = U_ZERO_ERROR;
5352 URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
5353 REGEX_CHECK_STATUS;
5354 text = UnicodeString("abcmxyz");
5355 uregex_setText(re, text.getBuffer(), text.length(), &status);
5356 REGEX_CHECK_STATUS;
5357
5358 UChar resultBuf[100];
5359 int32_t resultLength;
5360 UnicodeString repl;
5361
5362 status = U_ZERO_ERROR;
5363 repl = UnicodeString("<$0>");
5364 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5365 REGEX_CHECK_STATUS;
5366 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
5367
5368 status = U_ZERO_ERROR;
5369 repl = UnicodeString("<$1>");
5370 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5371 REGEX_CHECK_STATUS;
5372 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5373
5374 status = U_ZERO_ERROR;
5375 repl = UnicodeString("<${one}>");
5376 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5377 REGEX_CHECK_STATUS;
5378 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5379
5380 status = U_ZERO_ERROR;
5381 repl = UnicodeString("<$2>");
5382 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5383 REGEX_CHECK_STATUS;
5384 REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
5385
5386 status = U_ZERO_ERROR;
5387 repl = UnicodeString("<$3>");
5388 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5389 REGEX_CHECK_STATUS;
5390 REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
5391
5392 status = U_ZERO_ERROR;
5393 repl = UnicodeString("<$4>");
5394 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5395 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5396
5397 status = U_ZERO_ERROR;
5398 repl = UnicodeString("<$04>");
5399 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5400 REGEX_CHECK_STATUS;
5401 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
5402
5403 status = U_ZERO_ERROR;
5404 repl = UnicodeString("<$000016>");
5405 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5406 REGEX_CHECK_STATUS;
5407 REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
5408
5409 status = U_ZERO_ERROR;
5410 repl = UnicodeString("<$3$2$1${one}>");
5411 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5412 REGEX_CHECK_STATUS;
5413 REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
5414
5415 status = U_ZERO_ERROR;
5416 repl = UnicodeString("$3$2$1${one}");
5417 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5418 REGEX_CHECK_STATUS;
5419 REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
5420
5421 status = U_ZERO_ERROR;
5422 repl = UnicodeString("<${noSuchName}>");
5423 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5424 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5425
5426 status = U_ZERO_ERROR;
5427 repl = UnicodeString("<${invalid-name}>");
5428 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5429 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5430
5431 status = U_ZERO_ERROR;
5432 repl = UnicodeString("<${one");
5433 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5434 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5435
5436 status = U_ZERO_ERROR;
5437 repl = UnicodeString("$not a capture group");
5438 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5439 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5440
5441 uregex_close(re);
5442 }
5443
5444 //--------------------------------------------------------------
5445 //
5446 // NamedCaptureLimits Patterns with huge numbers of named capture groups.
5447 // The point is not so much what the exact limit is,
5448 // but that a largish number doesn't hit bad non-linear performance,
5449 // and that exceeding the limit fails cleanly.
5450 //
5451 //--------------------------------------------------------------
NamedCaptureLimits()5452 void RegexTest::NamedCaptureLimits() {
5453 if (quick) {
5454 logln("Skipping test. Runs in exhuastive mode only.");
5455 return;
5456 }
5457 const int32_t goodLimit = 1000000; // Pattern w this many groups builds successfully.
5458 const int32_t failLimit = 10000000; // Pattern exceeds internal limits, fails to compile.
5459 char nnbuf[100];
5460 UnicodeString pattern;
5461 int32_t nn;
5462
5463 for (nn=1; nn<goodLimit; nn++) {
5464 sprintf(nnbuf, "(?<nn%d>)", nn);
5465 pattern.append(UnicodeString(nnbuf, -1, US_INV));
5466 }
5467 UErrorCode status = U_ZERO_ERROR;
5468 RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
5469 REGEX_CHECK_STATUS;
5470 for (nn=1; nn<goodLimit; nn++) {
5471 sprintf(nnbuf, "nn%d", nn);
5472 int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
5473 REGEX_ASSERT(nn == groupNum);
5474 if (nn != groupNum) {
5475 break;
5476 }
5477 }
5478 delete pat;
5479
5480 pattern.remove();
5481 for (nn=1; nn<failLimit; nn++) {
5482 sprintf(nnbuf, "(?<nn%d>)", nn);
5483 pattern.append(UnicodeString(nnbuf, -1, US_INV));
5484 }
5485 status = U_ZERO_ERROR;
5486 pat = RegexPattern::compile(pattern, 0, status);
5487 REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
5488 delete pat;
5489 }
5490
5491
5492 //--------------------------------------------------------------
5493 //
5494 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
5495 //
5496 //---------------------------------------------------------------
Bug7651()5497 void RegexTest::Bug7651() {
5498 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5499 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5500 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5501 UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5502 UnicodeString s("#ff @abcd This is test");
5503 RegexPattern *REPattern = NULL;
5504 RegexMatcher *REMatcher = NULL;
5505 UErrorCode status = U_ZERO_ERROR;
5506 UParseError pe;
5507
5508 REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5509 REGEX_CHECK_STATUS;
5510 REMatcher = REPattern->matcher(s, status);
5511 REGEX_CHECK_STATUS;
5512 REGEX_ASSERT(REMatcher->find());
5513 REGEX_ASSERT(REMatcher->start(status) == 0);
5514 delete REPattern;
5515 delete REMatcher;
5516 status = U_ZERO_ERROR;
5517
5518 REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5519 REGEX_CHECK_STATUS;
5520 REMatcher = REPattern->matcher(s, status);
5521 REGEX_CHECK_STATUS;
5522 REGEX_ASSERT(REMatcher->find());
5523 REGEX_ASSERT(REMatcher->start(status) == 0);
5524 delete REPattern;
5525 delete REMatcher;
5526 status = U_ZERO_ERROR;
5527 }
5528
Bug7740()5529 void RegexTest::Bug7740() {
5530 UErrorCode status = U_ZERO_ERROR;
5531 UnicodeString pattern = "(a)";
5532 UnicodeString text = "abcdef";
5533 RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5534 REGEX_CHECK_STATUS;
5535 REGEX_ASSERT(m->lookingAt(status));
5536 REGEX_CHECK_STATUS;
5537 status = U_ILLEGAL_ARGUMENT_ERROR;
5538 UnicodeString s = m->group(1, status); // Bug 7740: segfault here.
5539 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5540 REGEX_ASSERT(s == "");
5541 delete m;
5542 }
5543
5544 // Bug 8479: was crashing whith a Bogus UnicodeString as input.
5545
Bug8479()5546 void RegexTest::Bug8479() {
5547 UErrorCode status = U_ZERO_ERROR;
5548
5549 RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5550 REGEX_CHECK_STATUS;
5551 if (U_SUCCESS(status))
5552 {
5553 UnicodeString str;
5554 str.setToBogus();
5555 pMatcher->reset(str);
5556 status = U_ZERO_ERROR;
5557 pMatcher->matches(status);
5558 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5559 delete pMatcher;
5560 }
5561 }
5562
5563
5564 // Bug 7029
Bug7029()5565 void RegexTest::Bug7029() {
5566 UErrorCode status = U_ZERO_ERROR;
5567
5568 RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5569 UnicodeString text = "abc.def";
5570 UnicodeString splits[10];
5571 REGEX_CHECK_STATUS;
5572 int32_t numFields = pMatcher->split(text, splits, 10, status);
5573 REGEX_CHECK_STATUS;
5574 REGEX_ASSERT(numFields == 8);
5575 delete pMatcher;
5576 }
5577
5578 // Bug 9283
5579 // This test is checking for the existance of any supplemental characters that case-fold
5580 // to a bmp character.
5581 //
5582 // At the time of this writing there are none. If any should appear in a subsequent release
5583 // of Unicode, the code in regular expressions compilation that determines the longest
5584 // posssible match for a literal string will need to be enhanced.
5585 //
5586 // See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5587 // for details on what to do in case of a failure of this test.
5588 //
Bug9283()5589 void RegexTest::Bug9283() {
5590 #if !UCONFIG_NO_NORMALIZATION
5591 UErrorCode status = U_ZERO_ERROR;
5592 UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5593 REGEX_CHECK_STATUS;
5594 int32_t index;
5595 UChar32 c;
5596 for (index=0; ; index++) {
5597 c = supplementalsWithCaseFolding.charAt(index);
5598 if (c == -1) {
5599 break;
5600 }
5601 UnicodeString cf = UnicodeString(c).foldCase();
5602 REGEX_ASSERT(cf.length() >= 2);
5603 }
5604 #endif /* #if !UCONFIG_NO_NORMALIZATION */
5605 }
5606
5607
CheckInvBufSize()5608 void RegexTest::CheckInvBufSize() {
5609 if(inv_next>=INV_BUFSIZ) {
5610 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5611 __FILE__, INV_BUFSIZ, inv_next);
5612 } else {
5613 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5614 }
5615 }
5616
5617
Bug10459()5618 void RegexTest::Bug10459() {
5619 UErrorCode status = U_ZERO_ERROR;
5620 UnicodeString patternString("(txt)");
5621 UnicodeString txtString("txt");
5622
5623 UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
5624 REGEX_CHECK_STATUS;
5625 UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
5626 REGEX_CHECK_STATUS;
5627
5628 URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
5629 REGEX_CHECK_STATUS;
5630
5631 uregex_setUText(icu_re, utext_txt, &status);
5632 REGEX_CHECK_STATUS;
5633
5634 // The bug was that calling uregex_group() before doing a matching operation
5635 // was causing a segfault. Only for Regular Expressions created from UText.
5636 // It should set an U_REGEX_INVALID_STATE.
5637
5638 UChar buf[100];
5639 int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
5640 REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
5641 REGEX_ASSERT(len == 0);
5642
5643 uregex_close(icu_re);
5644 utext_close(utext_pat);
5645 utext_close(utext_txt);
5646 }
5647
TestCaseInsensitiveStarters()5648 void RegexTest::TestCaseInsensitiveStarters() {
5649 // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5650 // become stale because of new Unicode characters.
5651 // If it is stale, rerun the generation tool
5652 // svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5653 // and replace the embedded data in i18n/regexcmp.cpp
5654
5655 for (UChar32 cp=0; cp<=0x10ffff; cp++) {
5656 if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
5657 continue;
5658 }
5659 UnicodeSet s(cp, cp);
5660 s.closeOver(USET_CASE_INSENSITIVE);
5661 UnicodeSetIterator setIter(s);
5662 while (setIter.next()) {
5663 if (!setIter.isString()) {
5664 continue;
5665 }
5666 const UnicodeString &str = setIter.getString();
5667 UChar32 firstChar = str.char32At(0);
5668 UnicodeSet starters;
5669 RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
5670 if (!starters.contains(cp)) {
5671 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
5672 return;
5673 }
5674 }
5675 }
5676 }
5677
5678
TestBug11049()5679 void RegexTest::TestBug11049() {
5680 // Original bug report: pattern with match start consisting of one of several individual characters,
5681 // and the text being matched ending with a supplementary character. find() would read past the
5682 // end of the input text when searching for potential match starting points.
5683
5684 // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5685 // detect the bad read.
5686
5687 TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
5688 TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
5689
5690 // Test again with a pattern starting with a single character,
5691 // which takes a different code path than starting with an OR expression,
5692 // but with similar logic.
5693 TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
5694 TestCase11049("C", "string matches at end C", TRUE, __LINE__);
5695 }
5696
5697 // Run a single test case from TestBug11049(). Internal function.
TestCase11049(const char * pattern,const char * data,UBool expectMatch,int32_t lineNumber)5698 void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
5699 UErrorCode status = U_ZERO_ERROR;
5700 UnicodeString patternString = UnicodeString(pattern).unescape();
5701 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5702
5703 UnicodeString dataString = UnicodeString(data).unescape();
5704 UChar *exactBuffer = new UChar[dataString.length()];
5705 dataString.extract(exactBuffer, dataString.length(), status);
5706 UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
5707
5708 LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
5709 REGEX_CHECK_STATUS;
5710 matcher->reset(ut);
5711 UBool result = matcher->find();
5712 if (result != expectMatch) {
5713 errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5714 __FILE__, lineNumber, expectMatch, result, pattern, data);
5715 }
5716
5717 // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5718 // off-by-one on find() with match at the last code point.
5719 // Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5720 // because string.unescape() will only shrink it.
5721 char * utf8Buffer = new char[uprv_strlen(data)+1];
5722 u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status);
5723 REGEX_CHECK_STATUS;
5724 ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
5725 REGEX_CHECK_STATUS;
5726 matcher->reset(ut);
5727 result = matcher->find();
5728 if (result != expectMatch) {
5729 errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5730 __FILE__, lineNumber, expectMatch, result, pattern, data);
5731 }
5732 delete [] utf8Buffer;
5733
5734 utext_close(ut);
5735 delete [] exactBuffer;
5736 }
5737
5738
TestBug11371()5739 void RegexTest::TestBug11371() {
5740 if (quick) {
5741 logln("Skipping test. Runs in exhuastive mode only.");
5742 return;
5743 }
5744 UErrorCode status = U_ZERO_ERROR;
5745 UnicodeString patternString;
5746
5747 for (int i=0; i<8000000; i++) {
5748 patternString.append(UnicodeString("()"));
5749 }
5750 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5751 if (status != U_REGEX_PATTERN_TOO_BIG) {
5752 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5753 __FILE__, __LINE__, u_errorName(status));
5754 }
5755
5756 status = U_ZERO_ERROR;
5757 patternString = "(";
5758 for (int i=0; i<20000000; i++) {
5759 patternString.append(UnicodeString("A++"));
5760 }
5761 patternString.append(UnicodeString("){0}B++"));
5762 LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
5763 if (status != U_REGEX_PATTERN_TOO_BIG) {
5764 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5765 __FILE__, __LINE__, u_errorName(status));
5766 }
5767
5768 // Pattern with too much string data, such that string indexes overflow operand data field size
5769 // in compiled instruction.
5770 status = U_ZERO_ERROR;
5771 patternString = "";
5772 while (patternString.length() < 0x00ffffff) {
5773 patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5774 }
5775 patternString.append(UnicodeString("X? trailing string"));
5776 LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
5777 if (status != U_REGEX_PATTERN_TOO_BIG) {
5778 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5779 __FILE__, __LINE__, u_errorName(status));
5780 }
5781 }
5782
TestBug11480()5783 void RegexTest::TestBug11480() {
5784 // C API, get capture group of a group that does not participate in the match.
5785 // (Returns a zero length string, with nul termination,
5786 // indistinguishable from a group with a zero length match.)
5787
5788 UErrorCode status = U_ZERO_ERROR;
5789 URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
5790 REGEX_CHECK_STATUS;
5791 UnicodeString text = UNICODE_STRING_SIMPLE("A");
5792 uregex_setText(re, text.getBuffer(), text.length(), &status);
5793 REGEX_CHECK_STATUS;
5794 REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
5795 UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
5796 int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
5797 REGEX_ASSERT(length == 0);
5798 REGEX_ASSERT(buf[0] == 13);
5799 REGEX_ASSERT(buf[1] == 0);
5800 REGEX_ASSERT(buf[2] == 13);
5801 uregex_close(re);
5802
5803 // UText C++ API, length of match is 0 for non-participating matches.
5804 UText ut = UTEXT_INITIALIZER;
5805 utext_openUnicodeString(&ut, &text, &status);
5806 RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status);
5807 REGEX_CHECK_STATUS;
5808 matcher.reset(&ut);
5809 REGEX_ASSERT(matcher.lookingAt(0, status));
5810
5811 // UText C++ API, Capture group 1 matches "A", position 0, length 1.
5812 int64_t groupLen = -666;
5813 UText group = UTEXT_INITIALIZER;
5814 matcher.group(1, &group, groupLen, status);
5815 REGEX_CHECK_STATUS;
5816 REGEX_ASSERT(groupLen == 1);
5817 REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
5818
5819 // Capture group 2, the (B), does not participate in the match.
5820 matcher.group(2, &group, groupLen, status);
5821 REGEX_CHECK_STATUS;
5822 REGEX_ASSERT(groupLen == 0);
5823 REGEX_ASSERT(matcher.start(2, status) == -1);
5824 REGEX_CHECK_STATUS;
5825 }
5826
5827
5828 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
5829