1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 2002-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8
9 //
10 // regextst.cpp
11 //
12 // ICU Regular Expressions test, part of intltest.
13 //
14
15 /*
16 NOTE!!
17
18 PLEASE be careful about ASCII assumptions in this test.
19 This test is one of the worst repeat offenders.
20 If you have questions, contact someone on the ICU PMC
21 who has access to an EBCDIC system.
22
23 */
24
25 #include "intltest.h"
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
27
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <string.h>
31
32 #include "unicode/localpointer.h"
33 #include "unicode/regex.h"
34 #include "unicode/uchar.h"
35 #include "unicode/ucnv.h"
36 #include "unicode/uniset.h"
37 #include "unicode/uregex.h"
38 #include "unicode/usetiter.h"
39 #include "unicode/ustring.h"
40 #include "unicode/utext.h"
41 #include "unicode/utf16.h"
42 #include "cstr.h"
43 #include "regextst.h"
44 #include "regexcmp.h"
45 #include "uvector.h"
46 #include "util.h"
47 #include "cmemory.h"
48 #include "cstring.h"
49 #include "uinvchar.h"
50
51 #define SUPPORT_MUTATING_INPUT_STRING 0
52
53 //---------------------------------------------------------------------------
54 //
55 // Test class boilerplate
56 //
57 //---------------------------------------------------------------------------
RegexTest()58 RegexTest::RegexTest()
59 {
60 }
61
62
~RegexTest()63 RegexTest::~RegexTest()
64 {
65 }
66
67
68
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)69 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
70 {
71 if (exec) logln("TestSuite RegexTest: ");
72 TESTCASE_AUTO_BEGIN;
73 TESTCASE_AUTO(Basic);
74 TESTCASE_AUTO(API_Match);
75 TESTCASE_AUTO(API_Replace);
76 TESTCASE_AUTO(API_Pattern);
77 #if !UCONFIG_NO_FILE_IO
78 TESTCASE_AUTO(Extended);
79 #endif
80 TESTCASE_AUTO(Errors);
81 TESTCASE_AUTO(PerlTests);
82 TESTCASE_AUTO(Callbacks);
83 TESTCASE_AUTO(FindProgressCallbacks);
84 TESTCASE_AUTO(Bug6149);
85 TESTCASE_AUTO(UTextBasic);
86 TESTCASE_AUTO(API_Match_UTF8);
87 TESTCASE_AUTO(API_Replace_UTF8);
88 TESTCASE_AUTO(API_Pattern_UTF8);
89 TESTCASE_AUTO(PerlTestsUTF8);
90 TESTCASE_AUTO(PreAllocatedUTextCAPI);
91 TESTCASE_AUTO(Bug7651);
92 TESTCASE_AUTO(Bug7740);
93 TESTCASE_AUTO(Bug8479);
94 TESTCASE_AUTO(Bug7029);
95 TESTCASE_AUTO(CheckInvBufSize);
96 TESTCASE_AUTO(Bug9283);
97 TESTCASE_AUTO(Bug10459);
98 TESTCASE_AUTO(TestCaseInsensitiveStarters);
99 TESTCASE_AUTO(TestBug11049);
100 TESTCASE_AUTO(TestBug11371);
101 TESTCASE_AUTO(TestBug11480);
102 TESTCASE_AUTO(NamedCapture);
103 TESTCASE_AUTO(NamedCaptureLimits);
104 TESTCASE_AUTO(TestBug12884);
105 TESTCASE_AUTO(TestBug13631);
106 TESTCASE_AUTO(TestBug13632);
107 TESTCASE_AUTO_END;
108 }
109
110
111 /**
112 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
113 * into ASCII.
114 * @see utext_openUTF8
115 */
116 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
117
118 //---------------------------------------------------------------------------
119 //
120 // Error Checking / Reporting macros used in all of the tests.
121 //
122 //---------------------------------------------------------------------------
123
utextToPrintable(char * buf,int32_t bufLen,UText * text)124 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
125 int64_t oldIndex = utext_getNativeIndex(text);
126 utext_setNativeIndex(text, 0);
127 char *bufPtr = buf;
128 UChar32 c = utext_next32From(text, 0);
129 while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
130 if (0x000020<=c && c<0x00007e) {
131 *bufPtr = c;
132 } else {
133 #if 0
134 sprintf(bufPtr,"U+%04X", c);
135 bufPtr+= strlen(bufPtr)-1;
136 #else
137 *bufPtr = '%';
138 #endif
139 }
140 bufPtr++;
141 c = UTEXT_NEXT32(text);
142 }
143 *bufPtr = 0;
144 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
145 char *ebuf = (char*)malloc(bufLen);
146 uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
147 uprv_strncpy(buf, ebuf, bufLen);
148 free((void*)ebuf);
149 #endif
150 utext_setNativeIndex(text, oldIndex);
151 }
152
153
154 static char ASSERT_BUF[1024];
155
extractToAssertBuf(const UnicodeString & message)156 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
157 if(message.length()==0) {
158 strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
159 } else {
160 UnicodeString buf;
161 IntlTest::prettify(message,buf);
162 if(buf.length()==0) {
163 strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
164 } else {
165 buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
166 if(ASSERT_BUF[0]==0) {
167 ASSERT_BUF[0]=0;
168 for(int32_t i=0;i<buf.length();i++) {
169 UChar ch = buf[i];
170 sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
171 }
172 }
173 }
174 }
175 ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
176 return ASSERT_BUF;
177 }
178
179 #define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,UPRV_LENGTHOF(buf),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}
180
181 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure. status=%s", \
182 __FILE__, __LINE__, u_errorName(status)); return;}}
183
184 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}
185
186 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
187 if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \
188 __LINE__, u_errorName(errcode), u_errorName(status));};}
189
190 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
191 "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }}
192
193 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
194 errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
195
196 // expected: const char * , restricted to invariant characters.
197 // actual: const UnicodeString &
198 #define REGEX_ASSERT_UNISTR(expected, actual) { \
199 if (UnicodeString(expected, -1, US_INV) != (actual)) { \
200 errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \
201 __FILE__, __LINE__, expected, extractToAssertBuf(actual));};}
202
203
testUTextEqual(UText * uta,UText * utb)204 static UBool testUTextEqual(UText *uta, UText *utb) {
205 UChar32 ca = 0;
206 UChar32 cb = 0;
207 utext_setNativeIndex(uta, 0);
208 utext_setNativeIndex(utb, 0);
209 do {
210 ca = utext_next32(uta);
211 cb = utext_next32(utb);
212 if (ca != cb) {
213 break;
214 }
215 } while (ca != U_SENTINEL);
216 return ca == cb;
217 }
218
219
220 /**
221 * @param expected expected text in UTF-8 (not platform) codepage
222 */
assertUText(const char * expected,UText * actual,const char * file,int line)223 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
224 UErrorCode status = U_ZERO_ERROR;
225 UText expectedText = UTEXT_INITIALIZER;
226 utext_openUTF8(&expectedText, expected, -1, &status);
227 if(U_FAILURE(status)) {
228 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
229 return;
230 }
231 if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
232 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
233 return;
234 }
235 utext_setNativeIndex(actual, 0);
236 if (!testUTextEqual(&expectedText, actual)) {
237 char buf[201 /*21*/];
238 char expectedBuf[201];
239 utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
240 utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
241 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
242 }
243 utext_close(&expectedText);
244 }
245 /**
246 * @param expected invariant (platform local text) input
247 */
248
assertUTextInvariant(const char * expected,UText * actual,const char * file,int line)249 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
250 UErrorCode status = U_ZERO_ERROR;
251 UText expectedText = UTEXT_INITIALIZER;
252 regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
253 if(U_FAILURE(status)) {
254 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
255 return;
256 }
257 utext_setNativeIndex(actual, 0);
258 if (!testUTextEqual(&expectedText, actual)) {
259 char buf[201 /*21*/];
260 char expectedBuf[201];
261 utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
262 utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
263 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
264 }
265 utext_close(&expectedText);
266 }
267
268 /**
269 * Assumes utf-8 input
270 */
271 #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
272 /**
273 * Assumes Invariant input
274 */
275 #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
276
277 /**
278 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
279 * passed into utext_openUTF8. An error will be given if
280 * INV_BUFSIZ is too small. It's only used on EBCDIC systems.
281 */
282
283 #define INV_BUFSIZ 2048 /* increase this if too small */
284
285 static int64_t inv_next=0;
286
287 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
288 static char inv_buf[INV_BUFSIZ];
289 #endif
290
regextst_openUTF8FromInvariant(UText * ut,const char * inv,int64_t length,UErrorCode * status)291 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
292 if(length==-1) length=strlen(inv);
293 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
294 inv_next+=length;
295 return utext_openUTF8(ut, inv, length, status);
296 #else
297 if(inv_next+length+1>INV_BUFSIZ) {
298 fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
299 __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
300 *status = U_MEMORY_ALLOCATION_ERROR;
301 return NULL;
302 }
303
304 unsigned char *buf = (unsigned char*)inv_buf+inv_next;
305 uprv_aestrncpy(buf, (const uint8_t*)inv, length);
306 inv_next+=length;
307
308 #if 0
309 fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
310 #endif
311
312 return utext_openUTF8(ut, (const char*)buf, length, status);
313 #endif
314 }
315
316
317 //---------------------------------------------------------------------------
318 //
319 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests
320 // for the LookingAt() and Match() functions.
321 //
322 // usage:
323 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
324 //
325 // The expected results are UBool - TRUE or FALSE.
326 // The input text is unescaped. The pattern is not.
327 //
328 //
329 //---------------------------------------------------------------------------
330
331 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
332
doRegexLMTest(const char * pat,const char * text,UBool looking,UBool match,int32_t line)333 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
334 const UnicodeString pattern(pat, -1, US_INV);
335 const UnicodeString inputText(text, -1, US_INV);
336 UErrorCode status = U_ZERO_ERROR;
337 UParseError pe;
338 RegexPattern *REPattern = NULL;
339 RegexMatcher *REMatcher = NULL;
340 UBool retVal = TRUE;
341
342 UnicodeString patString(pat, -1, US_INV);
343 REPattern = RegexPattern::compile(patString, 0, pe, status);
344 if (U_FAILURE(status)) {
345 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
346 line, u_errorName(status));
347 return FALSE;
348 }
349 if (line==376) { REPattern->dumpPattern();}
350
351 UnicodeString inputString(inputText);
352 UnicodeString unEscapedInput = inputString.unescape();
353 REMatcher = REPattern->matcher(unEscapedInput, status);
354 if (U_FAILURE(status)) {
355 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
356 line, u_errorName(status));
357 return FALSE;
358 }
359
360 UBool actualmatch;
361 actualmatch = REMatcher->lookingAt(status);
362 if (U_FAILURE(status)) {
363 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
364 line, u_errorName(status));
365 retVal = FALSE;
366 }
367 if (actualmatch != looking) {
368 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
369 retVal = FALSE;
370 }
371
372 status = U_ZERO_ERROR;
373 actualmatch = REMatcher->matches(status);
374 if (U_FAILURE(status)) {
375 errln("RegexTest failure in matches() at line %d. Status = %s\n",
376 line, u_errorName(status));
377 retVal = FALSE;
378 }
379 if (actualmatch != match) {
380 errln("RegexTest: wrong return from matches() at line %d.\n", line);
381 retVal = FALSE;
382 }
383
384 if (retVal == FALSE) {
385 REPattern->dumpPattern();
386 }
387
388 delete REPattern;
389 delete REMatcher;
390 return retVal;
391 }
392
393
doRegexLMTestUTF8(const char * pat,const char * text,UBool looking,UBool match,int32_t line)394 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
395 UText pattern = UTEXT_INITIALIZER;
396 int32_t inputUTF8Length;
397 char *textChars = NULL;
398 UText inputText = UTEXT_INITIALIZER;
399 UErrorCode status = U_ZERO_ERROR;
400 UParseError pe;
401 RegexPattern *REPattern = NULL;
402 RegexMatcher *REMatcher = NULL;
403 UBool retVal = TRUE;
404
405 regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
406 REPattern = RegexPattern::compile(&pattern, 0, pe, status);
407 if (U_FAILURE(status)) {
408 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
409 line, u_errorName(status));
410 return FALSE;
411 }
412
413 UnicodeString inputString(text, -1, US_INV);
414 UnicodeString unEscapedInput = inputString.unescape();
415 LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
416 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
417
418 inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
419 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
420 // UTF-8 does not allow unpaired surrogates, so this could actually happen
421 logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line, u_errorName(status));
422 return TRUE; // not a failure of the Regex engine
423 }
424 status = U_ZERO_ERROR; // buffer overflow
425 textChars = new char[inputUTF8Length+1];
426 unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
427 utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
428
429 REMatcher = &REPattern->matcher(status)->reset(&inputText);
430 if (U_FAILURE(status)) {
431 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
432 line, u_errorName(status));
433 return FALSE;
434 }
435
436 UBool actualmatch;
437 actualmatch = REMatcher->lookingAt(status);
438 if (U_FAILURE(status)) {
439 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
440 line, u_errorName(status));
441 retVal = FALSE;
442 }
443 if (actualmatch != looking) {
444 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
445 retVal = FALSE;
446 }
447
448 status = U_ZERO_ERROR;
449 actualmatch = REMatcher->matches(status);
450 if (U_FAILURE(status)) {
451 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
452 line, u_errorName(status));
453 retVal = FALSE;
454 }
455 if (actualmatch != match) {
456 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
457 retVal = FALSE;
458 }
459
460 if (retVal == FALSE) {
461 REPattern->dumpPattern();
462 }
463
464 delete REPattern;
465 delete REMatcher;
466 utext_close(&inputText);
467 utext_close(&pattern);
468 delete[] textChars;
469 return retVal;
470 }
471
472
473
474 //---------------------------------------------------------------------------
475 //
//   REGEX_ERR   Macro + invocation function to simplify writing
//               regex tests for incorrect patterns
478 //
479 // usage:
480 // REGEX_ERR("pattern", expected error line, column, expected status);
481 //
482 //---------------------------------------------------------------------------
483 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
484
regex_err(const char * pat,int32_t errLine,int32_t errCol,UErrorCode expectedStatus,int32_t line)485 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
486 UErrorCode expectedStatus, int32_t line) {
487 UnicodeString pattern(pat);
488
489 UErrorCode status = U_ZERO_ERROR;
490 UParseError pe;
491 RegexPattern *callerPattern = NULL;
492
493 //
494 // Compile the caller's pattern
495 //
496 UnicodeString patString(pat);
497 callerPattern = RegexPattern::compile(patString, 0, pe, status);
498 if (status != expectedStatus) {
499 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
500 } else {
501 if (status != U_ZERO_ERROR) {
502 if (pe.line != errLine || pe.offset != errCol) {
503 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
504 line, errLine, errCol, pe.line, pe.offset);
505 }
506 }
507 }
508
509 delete callerPattern;
510
511 //
512 // Compile again, using a UTF-8-based UText
513 //
514 UText patternText = UTEXT_INITIALIZER;
515 regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
516 callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
517 if (status != expectedStatus) {
518 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
519 } else {
520 if (status != U_ZERO_ERROR) {
521 if (pe.line != errLine || pe.offset != errCol) {
522 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
523 line, errLine, errCol, pe.line, pe.offset);
524 }
525 }
526 }
527
528 delete callerPattern;
529 utext_close(&patternText);
530 }
531
532
533
534 //---------------------------------------------------------------------------
535 //
536 // Basic Check for basic functionality of regex pattern matching.
537 // Avoid the use of REGEX_FIND test macro, which has
538 // substantial dependencies on basic Regex functionality.
539 //
540 //---------------------------------------------------------------------------
Basic()541 void RegexTest::Basic() {
542
543
544 //
545 // Debug - slide failing test cases early
546 //
547 #if 0
548 {
549 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
550 UParseError pe;
551 UErrorCode status = U_ZERO_ERROR;
552 RegexPattern *pattern;
553 pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
554 pattern->dumpPattern();
555 RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
556 UBool result = m->find();
557 printf("result = %d\n", result);
558 // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
559 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
560 }
561 exit(1);
562 #endif
563
564
565 //
566 // Pattern with parentheses
567 //
568 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
569 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
570 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);
571
572 //
573 // Patterns with *
574 //
575 REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
576 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
577 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
578 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
579 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
580
581 REGEX_TESTLM("a*", "", TRUE, TRUE);
582 REGEX_TESTLM("a*", "b", TRUE, FALSE);
583
584
585 //
586 // Patterns with "."
587 //
588 REGEX_TESTLM(".", "abc", TRUE, FALSE);
589 REGEX_TESTLM("...", "abc", TRUE, TRUE);
590 REGEX_TESTLM("....", "abc", FALSE, FALSE);
591 REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
592 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
593 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
594 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
595 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
596
597 //
598 // Patterns with * applied to chars at end of literal string
599 //
600 REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
601 REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
602
603 //
604 // Supplemental chars match as single chars, not a pair of surrogates.
605 //
606 REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
607 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
608 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
609
610
611 //
612 // UnicodeSets in the pattern
613 //
614 REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
615 REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
616 REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
617 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
618 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
619 REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
620
621 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
622 REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
623 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
624 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurences.
625 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);
626
627 //
628 // OR operator in patterns
629 //
630 REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
631 REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
632 REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
633 REGEX_TESTLM("a|b", "b", TRUE, TRUE);
634
635 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
636 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
637 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
638 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
639 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
640 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
641
642 //
643 // +
644 //
645 REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
646 REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
647 REGEX_TESTLM("b+", "", FALSE, FALSE);
648 REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
649 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
650 REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
651
652 //
653 // ?
654 //
655 REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
656 REGEX_TESTLM("ab?", "a", TRUE, TRUE);
657 REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
658 REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
659 REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
660 REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
661 REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
662 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
663 REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
664
665 //
666 // Escape sequences that become single literal chars, handled internally
667 // by ICU's Unescape.
668 //
669
670 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
671 REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
672 REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
673 REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
674 REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
675 REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
676 REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
677 REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
678 REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
679 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
680
681 REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
682 REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input
683
684 // Escape of special chars in patterns
685 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
686 }
687
688
689 //---------------------------------------------------------------------------
690 //
691 // UTextBasic Check for quirks that are specific to the UText
692 // implementation.
693 //
694 //---------------------------------------------------------------------------
UTextBasic()695 void RegexTest::UTextBasic() {
696 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
697 UErrorCode status = U_ZERO_ERROR;
698 UText pattern = UTEXT_INITIALIZER;
699 utext_openUTF8(&pattern, str_abc, -1, &status);
700 RegexMatcher matcher(&pattern, 0, status);
701 REGEX_CHECK_STATUS;
702
703 UText input = UTEXT_INITIALIZER;
704 utext_openUTF8(&input, str_abc, -1, &status);
705 REGEX_CHECK_STATUS;
706 matcher.reset(&input);
707 REGEX_CHECK_STATUS;
708 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
709
710 matcher.reset(matcher.inputText());
711 REGEX_CHECK_STATUS;
712 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
713
714 utext_close(&pattern);
715 utext_close(&input);
716 }
717
718
719 //---------------------------------------------------------------------------
720 //
721 // API_Match Test that the API for class RegexMatcher
722 // is present and nominally working, but excluding functions
723 // implementing replace operations.
724 //
725 //---------------------------------------------------------------------------
API_Match()726 void RegexTest::API_Match() {
727 UParseError pe;
728 UErrorCode status=U_ZERO_ERROR;
729 int32_t flags = 0;
730
731 //
732 // Debug - slide failing test cases early
733 //
734 #if 0
735 {
736 }
737 return;
738 #endif
739
740 //
741 // Simple pattern compilation
742 //
743 {
744 UnicodeString re("abc");
745 RegexPattern *pat2;
746 pat2 = RegexPattern::compile(re, flags, pe, status);
747 REGEX_CHECK_STATUS;
748
749 UnicodeString inStr1 = "abcdef this is a test";
750 UnicodeString instr2 = "not abc";
751 UnicodeString empty = "";
752
753
754 //
755 // Matcher creation and reset.
756 //
757 RegexMatcher *m1 = pat2->matcher(inStr1, status);
758 REGEX_CHECK_STATUS;
759 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
760 REGEX_ASSERT(m1->input() == inStr1);
761 m1->reset(instr2);
762 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
763 REGEX_ASSERT(m1->input() == instr2);
764 m1->reset(inStr1);
765 REGEX_ASSERT(m1->input() == inStr1);
766 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
767 m1->reset(empty);
768 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
769 REGEX_ASSERT(m1->input() == empty);
770 REGEX_ASSERT(&m1->pattern() == pat2);
771
772 //
773 // reset(pos, status)
774 //
775 m1->reset(inStr1);
776 m1->reset(4, status);
777 REGEX_CHECK_STATUS;
778 REGEX_ASSERT(m1->input() == inStr1);
779 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
780
781 m1->reset(-1, status);
782 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
783 status = U_ZERO_ERROR;
784
785 m1->reset(0, status);
786 REGEX_CHECK_STATUS;
787 status = U_ZERO_ERROR;
788
789 int32_t len = m1->input().length();
790 m1->reset(len-1, status);
791 REGEX_CHECK_STATUS;
792 status = U_ZERO_ERROR;
793
794 m1->reset(len, status);
795 REGEX_CHECK_STATUS;
796 status = U_ZERO_ERROR;
797
798 m1->reset(len+1, status);
799 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
800 status = U_ZERO_ERROR;
801
802 //
803 // match(pos, status)
804 //
805 m1->reset(instr2);
806 REGEX_ASSERT(m1->matches(4, status) == TRUE);
807 m1->reset();
808 REGEX_ASSERT(m1->matches(3, status) == FALSE);
809 m1->reset();
810 REGEX_ASSERT(m1->matches(5, status) == FALSE);
811 REGEX_ASSERT(m1->matches(4, status) == TRUE);
812 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
813 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
814
815 // Match() at end of string should fail, but should not
816 // be an error.
817 status = U_ZERO_ERROR;
818 len = m1->input().length();
819 REGEX_ASSERT(m1->matches(len, status) == FALSE);
820 REGEX_CHECK_STATUS;
821
822 // Match beyond end of string should fail with an error.
823 status = U_ZERO_ERROR;
824 REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
825 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
826
827 // Successful match at end of string.
828 {
829 status = U_ZERO_ERROR;
830 RegexMatcher m("A?", 0, status); // will match zero length string.
831 REGEX_CHECK_STATUS;
832 m.reset(inStr1);
833 len = inStr1.length();
834 REGEX_ASSERT(m.matches(len, status) == TRUE);
835 REGEX_CHECK_STATUS;
836 m.reset(empty);
837 REGEX_ASSERT(m.matches(0, status) == TRUE);
838 REGEX_CHECK_STATUS;
839 }
840
841
842 //
843 // lookingAt(pos, status)
844 //
845 status = U_ZERO_ERROR;
846 m1->reset(instr2); // "not abc"
847 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
848 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
849 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
850 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
851 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
852 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
853 status = U_ZERO_ERROR;
854 len = m1->input().length();
855 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
856 REGEX_CHECK_STATUS;
857 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
858 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
859
860 delete m1;
861 delete pat2;
862 }
863
864
865 //
866 // Capture Group.
867 // RegexMatcher::start();
868 // RegexMatcher::end();
869 // RegexMatcher::groupCount();
870 //
871 {
872 int32_t flags=0;
873 UParseError pe;
874 UErrorCode status=U_ZERO_ERROR;
875
876 UnicodeString re("01(23(45)67)(.*)");
877 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
878 REGEX_CHECK_STATUS;
879 UnicodeString data = "0123456789";
880
881 RegexMatcher *matcher = pat->matcher(data, status);
882 REGEX_CHECK_STATUS;
883 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
884 static const int32_t matchStarts[] = {0, 2, 4, 8};
885 static const int32_t matchEnds[] = {10, 8, 6, 10};
886 int32_t i;
887 for (i=0; i<4; i++) {
888 int32_t actualStart = matcher->start(i, status);
889 REGEX_CHECK_STATUS;
890 if (actualStart != matchStarts[i]) {
891 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
892 __LINE__, i, matchStarts[i], actualStart);
893 }
894 int32_t actualEnd = matcher->end(i, status);
895 REGEX_CHECK_STATUS;
896 if (actualEnd != matchEnds[i]) {
897 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
898 __LINE__, i, matchEnds[i], actualEnd);
899 }
900 }
901
902 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
903 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
904
905 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
906 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
907 matcher->reset();
908 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
909
910 matcher->lookingAt(status);
911 REGEX_ASSERT(matcher->group(status) == "0123456789");
912 REGEX_ASSERT(matcher->group(0, status) == "0123456789");
913 REGEX_ASSERT(matcher->group(1, status) == "234567" );
914 REGEX_ASSERT(matcher->group(2, status) == "45" );
915 REGEX_ASSERT(matcher->group(3, status) == "89" );
916 REGEX_CHECK_STATUS;
917 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
918 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
919 matcher->reset();
920 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
921
922 delete matcher;
923 delete pat;
924
925 }
926
927 //
928 // find
929 //
930 {
931 int32_t flags=0;
932 UParseError pe;
933 UErrorCode status=U_ZERO_ERROR;
934
935 UnicodeString re("abc");
936 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
937 REGEX_CHECK_STATUS;
938 UnicodeString data = ".abc..abc...abc..";
939 // 012345678901234567
940
941 RegexMatcher *matcher = pat->matcher(data, status);
942 REGEX_CHECK_STATUS;
943 REGEX_ASSERT(matcher->find());
944 REGEX_ASSERT(matcher->start(status) == 1);
945 REGEX_ASSERT(matcher->find());
946 REGEX_ASSERT(matcher->start(status) == 6);
947 REGEX_ASSERT(matcher->find());
948 REGEX_ASSERT(matcher->start(status) == 12);
949 REGEX_ASSERT(matcher->find() == FALSE);
950 REGEX_ASSERT(matcher->find() == FALSE);
951
952 matcher->reset();
953 REGEX_ASSERT(matcher->find());
954 REGEX_ASSERT(matcher->start(status) == 1);
955
956 REGEX_ASSERT(matcher->find(0, status));
957 REGEX_ASSERT(matcher->start(status) == 1);
958 REGEX_ASSERT(matcher->find(1, status));
959 REGEX_ASSERT(matcher->start(status) == 1);
960 REGEX_ASSERT(matcher->find(2, status));
961 REGEX_ASSERT(matcher->start(status) == 6);
962 REGEX_ASSERT(matcher->find(12, status));
963 REGEX_ASSERT(matcher->start(status) == 12);
964 REGEX_ASSERT(matcher->find(13, status) == FALSE);
965 REGEX_ASSERT(matcher->find(16, status) == FALSE);
966 REGEX_ASSERT(matcher->find(17, status) == FALSE);
967 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
968
969 status = U_ZERO_ERROR;
970 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
971 status = U_ZERO_ERROR;
972 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
973
974 REGEX_ASSERT(matcher->groupCount() == 0);
975
976 delete matcher;
977 delete pat;
978 }
979
980
981 //
982 // find, with \G in pattern (true if at the end of a previous match).
983 //
984 {
985 int32_t flags=0;
986 UParseError pe;
987 UErrorCode status=U_ZERO_ERROR;
988
989 UnicodeString re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
990 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
991 REGEX_CHECK_STATUS;
992 UnicodeString data = ".abcabc.abc..";
993 // 012345678901234567
994
995 RegexMatcher *matcher = pat->matcher(data, status);
996 REGEX_CHECK_STATUS;
997 REGEX_ASSERT(matcher->find());
998 REGEX_ASSERT(matcher->start(status) == 0);
999 REGEX_ASSERT(matcher->start(1, status) == -1);
1000 REGEX_ASSERT(matcher->start(2, status) == 1);
1001
1002 REGEX_ASSERT(matcher->find());
1003 REGEX_ASSERT(matcher->start(status) == 4);
1004 REGEX_ASSERT(matcher->start(1, status) == 4);
1005 REGEX_ASSERT(matcher->start(2, status) == -1);
1006 REGEX_CHECK_STATUS;
1007
1008 delete matcher;
1009 delete pat;
1010 }
1011
1012 //
1013 // find with zero length matches, match position should bump ahead
1014 // to prevent loops.
1015 //
1016 {
1017 int32_t i;
1018 UErrorCode status=U_ZERO_ERROR;
1019 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
1020 // using an always-true look-ahead.
1021 REGEX_CHECK_STATUS;
1022 UnicodeString s(" ");
1023 m.reset(s);
1024 for (i=0; ; i++) {
1025 if (m.find() == FALSE) {
1026 break;
1027 }
1028 REGEX_ASSERT(m.start(status) == i);
1029 REGEX_ASSERT(m.end(status) == i);
1030 }
1031 REGEX_ASSERT(i==5);
1032
1033 // Check that the bump goes over surrogate pairs OK
1034 s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1035 s = s.unescape();
1036 m.reset(s);
1037 for (i=0; ; i+=2) {
1038 if (m.find() == FALSE) {
1039 break;
1040 }
1041 REGEX_ASSERT(m.start(status) == i);
1042 REGEX_ASSERT(m.end(status) == i);
1043 }
1044 REGEX_ASSERT(i==10);
1045 }
1046 {
1047 // find() loop breaking test.
1048 // with pattern of /.?/, should see a series of one char matches, then a single
1049 // match of zero length at the end of the input string.
1050 int32_t i;
1051 UErrorCode status=U_ZERO_ERROR;
1052 RegexMatcher m(".?", 0, status);
1053 REGEX_CHECK_STATUS;
1054 UnicodeString s(" ");
1055 m.reset(s);
1056 for (i=0; ; i++) {
1057 if (m.find() == FALSE) {
1058 break;
1059 }
1060 REGEX_ASSERT(m.start(status) == i);
1061 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1062 }
1063 REGEX_ASSERT(i==5);
1064 }
1065
1066
1067 //
1068 // Matchers with no input string behave as if they had an empty input string.
1069 //
1070
1071 {
1072 UErrorCode status = U_ZERO_ERROR;
1073 RegexMatcher m(".?", 0, status);
1074 REGEX_CHECK_STATUS;
1075 REGEX_ASSERT(m.find());
1076 REGEX_ASSERT(m.start(status) == 0);
1077 REGEX_ASSERT(m.input() == "");
1078 }
1079 {
1080 UErrorCode status = U_ZERO_ERROR;
1081 RegexPattern *p = RegexPattern::compile(".", 0, status);
1082 RegexMatcher *m = p->matcher(status);
1083 REGEX_CHECK_STATUS;
1084
1085 REGEX_ASSERT(m->find() == FALSE);
1086 REGEX_ASSERT(m->input() == "");
1087 delete m;
1088 delete p;
1089 }
1090
1091 //
1092 // Regions
1093 //
1094 {
1095 UErrorCode status = U_ZERO_ERROR;
1096 UnicodeString testString("This is test data");
1097 RegexMatcher m(".*", testString, 0, status);
1098 REGEX_CHECK_STATUS;
1099 REGEX_ASSERT(m.regionStart() == 0);
1100 REGEX_ASSERT(m.regionEnd() == testString.length());
1101 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1102 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1103
1104 m.region(2,4, status);
1105 REGEX_CHECK_STATUS;
1106 REGEX_ASSERT(m.matches(status));
1107 REGEX_ASSERT(m.start(status)==2);
1108 REGEX_ASSERT(m.end(status)==4);
1109 REGEX_CHECK_STATUS;
1110
1111 m.reset();
1112 REGEX_ASSERT(m.regionStart() == 0);
1113 REGEX_ASSERT(m.regionEnd() == testString.length());
1114
1115 UnicodeString shorterString("short");
1116 m.reset(shorterString);
1117 REGEX_ASSERT(m.regionStart() == 0);
1118 REGEX_ASSERT(m.regionEnd() == shorterString.length());
1119
1120 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1121 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1122 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1123 REGEX_ASSERT(&m == &m.reset());
1124 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1125
1126 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1127 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1128 REGEX_ASSERT(&m == &m.reset());
1129 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1130
1131 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1132 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1133 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1134 REGEX_ASSERT(&m == &m.reset());
1135 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1136
1137 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1138 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1139 REGEX_ASSERT(&m == &m.reset());
1140 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1141
1142 }
1143
1144 //
1145 // hitEnd() and requireEnd()
1146 //
1147 {
1148 UErrorCode status = U_ZERO_ERROR;
1149 UnicodeString testString("aabb");
1150 RegexMatcher m1(".*", testString, 0, status);
1151 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1152 REGEX_ASSERT(m1.hitEnd() == TRUE);
1153 REGEX_ASSERT(m1.requireEnd() == FALSE);
1154 REGEX_CHECK_STATUS;
1155
1156 status = U_ZERO_ERROR;
1157 RegexMatcher m2("a*", testString, 0, status);
1158 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1159 REGEX_ASSERT(m2.hitEnd() == FALSE);
1160 REGEX_ASSERT(m2.requireEnd() == FALSE);
1161 REGEX_CHECK_STATUS;
1162
1163 status = U_ZERO_ERROR;
1164 RegexMatcher m3(".*$", testString, 0, status);
1165 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1166 REGEX_ASSERT(m3.hitEnd() == TRUE);
1167 REGEX_ASSERT(m3.requireEnd() == TRUE);
1168 REGEX_CHECK_STATUS;
1169 }
1170
1171
1172 //
1173 // Compilation error on reset with UChar *
1174 // These were a hazard that people were stumbling over with runtime errors.
1175 // Changed them to compiler errors by adding private methods that more closely
1176 // matched the incorrect use of the functions.
1177 //
1178 #if 0
1179 {
1180 UErrorCode status = U_ZERO_ERROR;
1181 UChar ucharString[20];
1182 RegexMatcher m(".", 0, status);
1183 m.reset(ucharString); // should not compile.
1184
1185 RegexPattern *p = RegexPattern::compile(".", 0, status);
1186 RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile.
1187
1188 RegexMatcher m3(".", ucharString, 0, status); // Should not compile
1189 }
1190 #endif
1191
1192 //
1193 // Time Outs.
1194 // Note: These tests will need to be changed when the regexp engine is
1195 // able to detect and cut short the exponential time behavior on
1196 // this type of match.
1197 //
1198 {
1199 UErrorCode status = U_ZERO_ERROR;
1200 // Enough 'a's in the string to cause the match to time out.
1201 // (Each on additonal 'a' doubles the time)
1202 UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1203 RegexMatcher matcher("(a+)+b", testString, 0, status);
1204 REGEX_CHECK_STATUS;
1205 REGEX_ASSERT(matcher.getTimeLimit() == 0);
1206 matcher.setTimeLimit(100, status);
1207 REGEX_ASSERT(matcher.getTimeLimit() == 100);
1208 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1209 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1210 }
1211 {
1212 UErrorCode status = U_ZERO_ERROR;
1213 // Few enough 'a's to slip in under the time limit.
1214 UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1215 RegexMatcher matcher("(a+)+b", testString, 0, status);
1216 REGEX_CHECK_STATUS;
1217 matcher.setTimeLimit(100, status);
1218 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1219 REGEX_CHECK_STATUS;
1220 }
1221
1222 //
1223 // Stack Limits
1224 //
1225 {
1226 UErrorCode status = U_ZERO_ERROR;
1227 UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
1228
1229 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1230 // of the '+', and makes the stack frames larger.
1231 RegexMatcher matcher("(A)+A$", testString, 0, status);
1232
1233 // With the default stack, this match should fail to run
1234 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1235 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1236
1237 // With unlimited stack, it should run
1238 status = U_ZERO_ERROR;
1239 matcher.setStackLimit(0, status);
1240 REGEX_CHECK_STATUS;
1241 REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1242 REGEX_CHECK_STATUS;
1243 REGEX_ASSERT(matcher.getStackLimit() == 0);
1244
1245 // With a limited stack, it the match should fail
1246 status = U_ZERO_ERROR;
1247 matcher.setStackLimit(10000, status);
1248 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1249 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1250 REGEX_ASSERT(matcher.getStackLimit() == 10000);
1251 }
1252
1253 // A pattern that doesn't save state should work with
1254 // a minimal sized stack
1255 {
1256 UErrorCode status = U_ZERO_ERROR;
1257 UnicodeString testString = "abc";
1258 RegexMatcher matcher("abc", testString, 0, status);
1259 REGEX_CHECK_STATUS;
1260 matcher.setStackLimit(30, status);
1261 REGEX_CHECK_STATUS;
1262 REGEX_ASSERT(matcher.matches(status) == TRUE);
1263 REGEX_CHECK_STATUS;
1264 REGEX_ASSERT(matcher.getStackLimit() == 30);
1265
1266 // Negative stack sizes should fail
1267 status = U_ZERO_ERROR;
1268 matcher.setStackLimit(1000, status);
1269 REGEX_CHECK_STATUS;
1270 matcher.setStackLimit(-1, status);
1271 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1272 REGEX_ASSERT(matcher.getStackLimit() == 1000);
1273 }
1274
1275
1276 }
1277
1278
1279
1280
1281
1282
1283 //---------------------------------------------------------------------------
1284 //
1285 // API_Replace API test for class RegexMatcher, testing the
1286 // Replace family of functions.
1287 //
1288 //---------------------------------------------------------------------------
API_Replace()1289 void RegexTest::API_Replace() {
1290 //
1291 // Replace
1292 //
1293 int32_t flags=0;
1294 UParseError pe;
1295 UErrorCode status=U_ZERO_ERROR;
1296
1297 UnicodeString re("abc");
1298 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1299 REGEX_CHECK_STATUS;
1300 UnicodeString data = ".abc..abc...abc..";
1301 // 012345678901234567
1302 RegexMatcher *matcher = pat->matcher(data, status);
1303
1304 //
1305 // Plain vanilla matches.
1306 //
1307 UnicodeString dest;
1308 dest = matcher->replaceFirst("yz", status);
1309 REGEX_CHECK_STATUS;
1310 REGEX_ASSERT(dest == ".yz..abc...abc..");
1311
1312 dest = matcher->replaceAll("yz", status);
1313 REGEX_CHECK_STATUS;
1314 REGEX_ASSERT(dest == ".yz..yz...yz..");
1315
1316 //
1317 // Plain vanilla non-matches.
1318 //
1319 UnicodeString d2 = ".abx..abx...abx..";
1320 matcher->reset(d2);
1321 dest = matcher->replaceFirst("yz", status);
1322 REGEX_CHECK_STATUS;
1323 REGEX_ASSERT(dest == ".abx..abx...abx..");
1324
1325 dest = matcher->replaceAll("yz", status);
1326 REGEX_CHECK_STATUS;
1327 REGEX_ASSERT(dest == ".abx..abx...abx..");
1328
1329 //
1330 // Empty source string
1331 //
1332 UnicodeString d3 = "";
1333 matcher->reset(d3);
1334 dest = matcher->replaceFirst("yz", status);
1335 REGEX_CHECK_STATUS;
1336 REGEX_ASSERT(dest == "");
1337
1338 dest = matcher->replaceAll("yz", status);
1339 REGEX_CHECK_STATUS;
1340 REGEX_ASSERT(dest == "");
1341
1342 //
1343 // Empty substitution string
1344 //
1345 matcher->reset(data); // ".abc..abc...abc.."
1346 dest = matcher->replaceFirst("", status);
1347 REGEX_CHECK_STATUS;
1348 REGEX_ASSERT(dest == "...abc...abc..");
1349
1350 dest = matcher->replaceAll("", status);
1351 REGEX_CHECK_STATUS;
1352 REGEX_ASSERT(dest == "........");
1353
1354 //
1355 // match whole string
1356 //
1357 UnicodeString d4 = "abc";
1358 matcher->reset(d4);
1359 dest = matcher->replaceFirst("xyz", status);
1360 REGEX_CHECK_STATUS;
1361 REGEX_ASSERT(dest == "xyz");
1362
1363 dest = matcher->replaceAll("xyz", status);
1364 REGEX_CHECK_STATUS;
1365 REGEX_ASSERT(dest == "xyz");
1366
1367 //
1368 // Capture Group, simple case
1369 //
1370 UnicodeString re2("a(..)");
1371 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1372 REGEX_CHECK_STATUS;
1373 UnicodeString d5 = "abcdefg";
1374 RegexMatcher *matcher2 = pat2->matcher(d5, status);
1375 REGEX_CHECK_STATUS;
1376 dest = matcher2->replaceFirst("$1$1", status);
1377 REGEX_CHECK_STATUS;
1378 REGEX_ASSERT(dest == "bcbcdefg");
1379
1380 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1381 REGEX_CHECK_STATUS;
1382 REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1383
1384 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1385 REGEX_ASSERT(U_FAILURE(status));
1386 status = U_ZERO_ERROR;
1387
1388 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1389 replacement = replacement.unescape();
1390 dest = matcher2->replaceFirst(replacement, status);
1391 REGEX_CHECK_STATUS;
1392 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1393
1394 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1395
1396
1397 //
1398 // Replacement String with \u hex escapes
1399 //
1400 {
1401 UnicodeString src = "abc 1 abc 2 abc 3";
1402 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1403 matcher->reset(src);
1404 UnicodeString result = matcher->replaceAll(substitute, status);
1405 REGEX_CHECK_STATUS;
1406 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1407 }
1408 {
1409 UnicodeString src = "abc !";
1410 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1411 matcher->reset(src);
1412 UnicodeString result = matcher->replaceAll(substitute, status);
1413 REGEX_CHECK_STATUS;
1414 UnicodeString expected = UnicodeString("--");
1415 expected.append((UChar32)0x10000);
1416 expected.append("-- !");
1417 REGEX_ASSERT(result == expected);
1418 }
1419 // TODO: need more through testing of capture substitutions.
1420
1421 // Bug 4057
1422 //
1423 {
1424 status = U_ZERO_ERROR;
1425 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1426 RegexMatcher m("ss(.*?)ee", 0, status);
1427 REGEX_CHECK_STATUS;
1428 UnicodeString result;
1429
1430 // Multiple finds do NOT bump up the previous appendReplacement postion.
1431 m.reset(s);
1432 m.find();
1433 m.find();
1434 m.appendReplacement(result, "ooh", status);
1435 REGEX_CHECK_STATUS;
1436 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1437
1438 // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1439 status = U_ZERO_ERROR;
1440 result.truncate(0);
1441 m.reset(10, status);
1442 m.find();
1443 m.find();
1444 m.appendReplacement(result, "ooh", status);
1445 REGEX_CHECK_STATUS;
1446 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1447
1448 // find() at interior of string, appendReplacemnt still starts at beginning.
1449 status = U_ZERO_ERROR;
1450 result.truncate(0);
1451 m.reset();
1452 m.find(10, status);
1453 m.find();
1454 m.appendReplacement(result, "ooh", status);
1455 REGEX_CHECK_STATUS;
1456 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1457
1458 m.appendTail(result);
1459 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1460
1461 }
1462
1463 delete matcher2;
1464 delete pat2;
1465 delete matcher;
1466 delete pat;
1467 }
1468
1469
1470 //---------------------------------------------------------------------------
1471 //
1472 // API_Pattern Test that the API for class RegexPattern is
1473 // present and nominally working.
1474 //
1475 //---------------------------------------------------------------------------
API_Pattern()1476 void RegexTest::API_Pattern() {
1477 RegexPattern pata; // Test default constructor to not crash.
1478 RegexPattern patb;
1479
1480 REGEX_ASSERT(pata == patb);
1481 REGEX_ASSERT(pata == pata);
1482
1483 UnicodeString re1("abc[a-l][m-z]");
1484 UnicodeString re2("def");
1485 UErrorCode status = U_ZERO_ERROR;
1486 UParseError pe;
1487
1488 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status);
1489 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status);
1490 REGEX_CHECK_STATUS;
1491 REGEX_ASSERT(*pat1 == *pat1);
1492 REGEX_ASSERT(*pat1 != pata);
1493
1494 // Assign
1495 patb = *pat1;
1496 REGEX_ASSERT(patb == *pat1);
1497
1498 // Copy Construct
1499 RegexPattern patc(*pat1);
1500 REGEX_ASSERT(patc == *pat1);
1501 REGEX_ASSERT(patb == patc);
1502 REGEX_ASSERT(pat1 != pat2);
1503 patb = *pat2;
1504 REGEX_ASSERT(patb != patc);
1505 REGEX_ASSERT(patb == *pat2);
1506
1507 // Compile with no flags.
1508 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status);
1509 REGEX_ASSERT(*pat1a == *pat1);
1510
1511 REGEX_ASSERT(pat1a->flags() == 0);
1512
1513 // Compile with different flags should be not equal
1514 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1515 REGEX_CHECK_STATUS;
1516
1517 REGEX_ASSERT(*pat1b != *pat1a);
1518 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1519 REGEX_ASSERT(pat1a->flags() == 0);
1520 delete pat1b;
1521
1522 // clone
1523 RegexPattern *pat1c = pat1->clone();
1524 REGEX_ASSERT(*pat1c == *pat1);
1525 REGEX_ASSERT(*pat1c != *pat2);
1526
1527 delete pat1c;
1528 delete pat1a;
1529 delete pat1;
1530 delete pat2;
1531
1532
1533 //
1534 // Verify that a matcher created from a cloned pattern works.
1535 // (Jitterbug 3423)
1536 //
1537 {
1538 UErrorCode status = U_ZERO_ERROR;
1539 RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1540 RegexPattern *pClone = pSource->clone();
1541 delete pSource;
1542 RegexMatcher *mFromClone = pClone->matcher(status);
1543 REGEX_CHECK_STATUS;
1544 UnicodeString s = "Hello World";
1545 mFromClone->reset(s);
1546 REGEX_ASSERT(mFromClone->find() == TRUE);
1547 REGEX_ASSERT(mFromClone->group(status) == "Hello");
1548 REGEX_ASSERT(mFromClone->find() == TRUE);
1549 REGEX_ASSERT(mFromClone->group(status) == "World");
1550 REGEX_ASSERT(mFromClone->find() == FALSE);
1551 delete mFromClone;
1552 delete pClone;
1553 }
1554
1555 //
1556 // matches convenience API
1557 //
1558 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1559 REGEX_CHECK_STATUS;
1560 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1561 REGEX_CHECK_STATUS;
1562 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1563 REGEX_CHECK_STATUS;
1564 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1565 REGEX_CHECK_STATUS;
1566 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1567 REGEX_CHECK_STATUS;
1568 status = U_INDEX_OUTOFBOUNDS_ERROR;
1569 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1570 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1571
1572
1573 //
1574 // Split()
1575 //
1576 status = U_ZERO_ERROR;
1577 pat1 = RegexPattern::compile(" +", pe, status);
1578 REGEX_CHECK_STATUS;
1579 UnicodeString fields[10];
1580
1581 int32_t n;
1582 n = pat1->split("Now is the time", fields, 10, status);
1583 REGEX_CHECK_STATUS;
1584 REGEX_ASSERT(n==4);
1585 REGEX_ASSERT(fields[0]=="Now");
1586 REGEX_ASSERT(fields[1]=="is");
1587 REGEX_ASSERT(fields[2]=="the");
1588 REGEX_ASSERT(fields[3]=="time");
1589 REGEX_ASSERT(fields[4]=="");
1590
1591 n = pat1->split("Now is the time", fields, 2, status);
1592 REGEX_CHECK_STATUS;
1593 REGEX_ASSERT(n==2);
1594 REGEX_ASSERT(fields[0]=="Now");
1595 REGEX_ASSERT(fields[1]=="is the time");
1596 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
1597
1598 fields[1] = "*";
1599 status = U_ZERO_ERROR;
1600 n = pat1->split("Now is the time", fields, 1, status);
1601 REGEX_CHECK_STATUS;
1602 REGEX_ASSERT(n==1);
1603 REGEX_ASSERT(fields[0]=="Now is the time");
1604 REGEX_ASSERT(fields[1]=="*");
1605 status = U_ZERO_ERROR;
1606
1607 n = pat1->split(" Now is the time ", fields, 10, status);
1608 REGEX_CHECK_STATUS;
1609 REGEX_ASSERT(n==6);
1610 REGEX_ASSERT(fields[0]=="");
1611 REGEX_ASSERT(fields[1]=="Now");
1612 REGEX_ASSERT(fields[2]=="is");
1613 REGEX_ASSERT(fields[3]=="the");
1614 REGEX_ASSERT(fields[4]=="time");
1615 REGEX_ASSERT(fields[5]=="");
1616
1617 n = pat1->split(" ", fields, 10, status);
1618 REGEX_CHECK_STATUS;
1619 REGEX_ASSERT(n==2);
1620 REGEX_ASSERT(fields[0]=="");
1621 REGEX_ASSERT(fields[1]=="");
1622
1623 fields[0] = "foo";
1624 n = pat1->split("", fields, 10, status);
1625 REGEX_CHECK_STATUS;
1626 REGEX_ASSERT(n==0);
1627 REGEX_ASSERT(fields[0]=="foo");
1628
1629 delete pat1;
1630
1631 // split, with a pattern with (capture)
1632 pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status);
1633 REGEX_CHECK_STATUS;
1634
1635 status = U_ZERO_ERROR;
1636 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1637 REGEX_CHECK_STATUS;
1638 REGEX_ASSERT(n==7);
1639 REGEX_ASSERT(fields[0]=="");
1640 REGEX_ASSERT(fields[1]=="a");
1641 REGEX_ASSERT(fields[2]=="Now is ");
1642 REGEX_ASSERT(fields[3]=="b");
1643 REGEX_ASSERT(fields[4]=="the time");
1644 REGEX_ASSERT(fields[5]=="c");
1645 REGEX_ASSERT(fields[6]=="");
1646 REGEX_ASSERT(status==U_ZERO_ERROR);
1647
1648 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
1649 REGEX_CHECK_STATUS;
1650 REGEX_ASSERT(n==7);
1651 REGEX_ASSERT(fields[0]==" ");
1652 REGEX_ASSERT(fields[1]=="a");
1653 REGEX_ASSERT(fields[2]=="Now is ");
1654 REGEX_ASSERT(fields[3]=="b");
1655 REGEX_ASSERT(fields[4]=="the time");
1656 REGEX_ASSERT(fields[5]=="c");
1657 REGEX_ASSERT(fields[6]=="");
1658
1659 status = U_ZERO_ERROR;
1660 fields[6] = "foo";
1661 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
1662 REGEX_CHECK_STATUS;
1663 REGEX_ASSERT(n==6);
1664 REGEX_ASSERT(fields[0]==" ");
1665 REGEX_ASSERT(fields[1]=="a");
1666 REGEX_ASSERT(fields[2]=="Now is ");
1667 REGEX_ASSERT(fields[3]=="b");
1668 REGEX_ASSERT(fields[4]=="the time");
1669 REGEX_ASSERT(fields[5]==""); // All text following "<c>" field delimiter.
1670 REGEX_ASSERT(fields[6]=="foo");
1671
1672 status = U_ZERO_ERROR;
1673 fields[5] = "foo";
1674 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
1675 REGEX_CHECK_STATUS;
1676 REGEX_ASSERT(n==5);
1677 REGEX_ASSERT(fields[0]==" ");
1678 REGEX_ASSERT(fields[1]=="a");
1679 REGEX_ASSERT(fields[2]=="Now is ");
1680 REGEX_ASSERT(fields[3]=="b");
1681 REGEX_ASSERT(fields[4]=="the time<c>");
1682 REGEX_ASSERT(fields[5]=="foo");
1683
1684 status = U_ZERO_ERROR;
1685 fields[5] = "foo";
1686 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
1687 REGEX_CHECK_STATUS;
1688 REGEX_ASSERT(n==5);
1689 REGEX_ASSERT(fields[0]==" ");
1690 REGEX_ASSERT(fields[1]=="a");
1691 REGEX_ASSERT(fields[2]=="Now is ");
1692 REGEX_ASSERT(fields[3]=="b");
1693 REGEX_ASSERT(fields[4]=="the time");
1694 REGEX_ASSERT(fields[5]=="foo");
1695
1696 status = U_ZERO_ERROR;
1697 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
1698 REGEX_CHECK_STATUS;
1699 REGEX_ASSERT(n==4);
1700 REGEX_ASSERT(fields[0]==" ");
1701 REGEX_ASSERT(fields[1]=="a");
1702 REGEX_ASSERT(fields[2]=="Now is ");
1703 REGEX_ASSERT(fields[3]=="the time<c>");
1704 status = U_ZERO_ERROR;
1705 delete pat1;
1706
1707 pat1 = RegexPattern::compile("([-,])", pe, status);
1708 REGEX_CHECK_STATUS;
1709 n = pat1->split("1-10,20", fields, 10, status);
1710 REGEX_CHECK_STATUS;
1711 REGEX_ASSERT(n==5);
1712 REGEX_ASSERT(fields[0]=="1");
1713 REGEX_ASSERT(fields[1]=="-");
1714 REGEX_ASSERT(fields[2]=="10");
1715 REGEX_ASSERT(fields[3]==",");
1716 REGEX_ASSERT(fields[4]=="20");
1717 delete pat1;
1718
1719 // Test split of string with empty trailing fields
1720 pat1 = RegexPattern::compile(",", pe, status);
1721 REGEX_CHECK_STATUS;
1722 n = pat1->split("a,b,c,", fields, 10, status);
1723 REGEX_CHECK_STATUS;
1724 REGEX_ASSERT(n==4);
1725 REGEX_ASSERT(fields[0]=="a");
1726 REGEX_ASSERT(fields[1]=="b");
1727 REGEX_ASSERT(fields[2]=="c");
1728 REGEX_ASSERT(fields[3]=="");
1729
1730 n = pat1->split("a,,,", fields, 10, status);
1731 REGEX_CHECK_STATUS;
1732 REGEX_ASSERT(n==4);
1733 REGEX_ASSERT(fields[0]=="a");
1734 REGEX_ASSERT(fields[1]=="");
1735 REGEX_ASSERT(fields[2]=="");
1736 REGEX_ASSERT(fields[3]=="");
1737 delete pat1;
1738
1739 // Split Separator with zero length match.
1740 pat1 = RegexPattern::compile(":?", pe, status);
1741 REGEX_CHECK_STATUS;
1742 n = pat1->split("abc", fields, 10, status);
1743 REGEX_CHECK_STATUS;
1744 REGEX_ASSERT(n==5);
1745 REGEX_ASSERT(fields[0]=="");
1746 REGEX_ASSERT(fields[1]=="a");
1747 REGEX_ASSERT(fields[2]=="b");
1748 REGEX_ASSERT(fields[3]=="c");
1749 REGEX_ASSERT(fields[4]=="");
1750
1751 delete pat1;
1752
1753 //
1754 // RegexPattern::pattern()
1755 //
1756 pat1 = new RegexPattern();
1757 REGEX_ASSERT(pat1->pattern() == "");
1758 delete pat1;
1759
1760 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1761 REGEX_CHECK_STATUS;
1762 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1763 delete pat1;
1764
1765
1766 //
1767 // classID functions
1768 //
1769 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1770 REGEX_CHECK_STATUS;
1771 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1772 REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1773 UnicodeString Hello("Hello, world.");
1774 RegexMatcher *m = pat1->matcher(Hello, status);
1775 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1776 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1777 REGEX_ASSERT(m->getDynamicClassID() != NULL);
1778 delete m;
1779 delete pat1;
1780
1781 }
1782
1783 //---------------------------------------------------------------------------
1784 //
1785 // API_Match_UTF8 Test that the alternate engine for class RegexMatcher
1786 // is present and working, but excluding functions
1787 // implementing replace operations.
1788 //
1789 //---------------------------------------------------------------------------
API_Match_UTF8()1790 void RegexTest::API_Match_UTF8() {
1791 UParseError pe;
1792 UErrorCode status=U_ZERO_ERROR;
1793 int32_t flags = 0;
1794
1795 //
1796 // Debug - slide failing test cases early
1797 //
1798 #if 0
1799 {
1800 }
1801 return;
1802 #endif
1803
1804 //
1805 // Simple pattern compilation
1806 //
1807 {
1808 UText re = UTEXT_INITIALIZER;
1809 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1810 REGEX_VERBOSE_TEXT(&re);
1811 RegexPattern *pat2;
1812 pat2 = RegexPattern::compile(&re, flags, pe, status);
1813 REGEX_CHECK_STATUS;
1814
1815 UText input1 = UTEXT_INITIALIZER;
1816 UText input2 = UTEXT_INITIALIZER;
1817 UText empty = UTEXT_INITIALIZER;
1818 regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1819 REGEX_VERBOSE_TEXT(&input1);
1820 regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1821 REGEX_VERBOSE_TEXT(&input2);
1822 utext_openUChars(&empty, NULL, 0, &status);
1823
1824 int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1825 int32_t input2Len = strlen("not abc");
1826
1827
1828 //
1829 // Matcher creation and reset.
1830 //
1831 RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1832 REGEX_CHECK_STATUS;
1833 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1834 const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1835 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1836 m1->reset(&input2);
1837 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1838 const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1839 REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1840 m1->reset(&input1);
1841 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1842 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1843 m1->reset(&empty);
1844 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1845 REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1846
1847 //
1848 // reset(pos, status)
1849 //
1850 m1->reset(&input1);
1851 m1->reset(4, status);
1852 REGEX_CHECK_STATUS;
1853 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1854 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1855
1856 m1->reset(-1, status);
1857 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1858 status = U_ZERO_ERROR;
1859
1860 m1->reset(0, status);
1861 REGEX_CHECK_STATUS;
1862 status = U_ZERO_ERROR;
1863
1864 m1->reset(input1Len-1, status);
1865 REGEX_CHECK_STATUS;
1866 status = U_ZERO_ERROR;
1867
1868 m1->reset(input1Len, status);
1869 REGEX_CHECK_STATUS;
1870 status = U_ZERO_ERROR;
1871
1872 m1->reset(input1Len+1, status);
1873 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1874 status = U_ZERO_ERROR;
1875
1876 //
1877 // match(pos, status)
1878 //
1879 m1->reset(&input2);
1880 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1881 m1->reset();
1882 REGEX_ASSERT(m1->matches(3, status) == FALSE);
1883 m1->reset();
1884 REGEX_ASSERT(m1->matches(5, status) == FALSE);
1885 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1886 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1887 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1888
1889 // Match() at end of string should fail, but should not
1890 // be an error.
1891 status = U_ZERO_ERROR;
1892 REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1893 REGEX_CHECK_STATUS;
1894
1895 // Match beyond end of string should fail with an error.
1896 status = U_ZERO_ERROR;
1897 REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1898 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1899
1900 // Successful match at end of string.
1901 {
1902 status = U_ZERO_ERROR;
1903 RegexMatcher m("A?", 0, status); // will match zero length string.
1904 REGEX_CHECK_STATUS;
1905 m.reset(&input1);
1906 REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1907 REGEX_CHECK_STATUS;
1908 m.reset(&empty);
1909 REGEX_ASSERT(m.matches(0, status) == TRUE);
1910 REGEX_CHECK_STATUS;
1911 }
1912
1913
1914 //
1915 // lookingAt(pos, status)
1916 //
1917 status = U_ZERO_ERROR;
1918 m1->reset(&input2); // "not abc"
1919 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1920 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1921 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1922 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1923 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1924 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1925 status = U_ZERO_ERROR;
1926 REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1927 REGEX_CHECK_STATUS;
1928 REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1929 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1930
1931 delete m1;
1932 delete pat2;
1933
1934 utext_close(&re);
1935 utext_close(&input1);
1936 utext_close(&input2);
1937 utext_close(&empty);
1938 }
1939
1940
1941 //
1942 // Capture Group.
1943 // RegexMatcher::start();
1944 // RegexMatcher::end();
1945 // RegexMatcher::groupCount();
1946 //
1947 {
1948 int32_t flags=0;
1949 UParseError pe;
1950 UErrorCode status=U_ZERO_ERROR;
1951 UText re=UTEXT_INITIALIZER;
1952 const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1953 utext_openUTF8(&re, str_01234567_pat, -1, &status);
1954
1955 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1956 REGEX_CHECK_STATUS;
1957
1958 UText input = UTEXT_INITIALIZER;
1959 const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1960 utext_openUTF8(&input, str_0123456789, -1, &status);
1961
1962 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
1963 REGEX_CHECK_STATUS;
1964 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
1965 static const int32_t matchStarts[] = {0, 2, 4, 8};
1966 static const int32_t matchEnds[] = {10, 8, 6, 10};
1967 int32_t i;
1968 for (i=0; i<4; i++) {
1969 int32_t actualStart = matcher->start(i, status);
1970 REGEX_CHECK_STATUS;
1971 if (actualStart != matchStarts[i]) {
1972 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
1973 __FILE__, __LINE__, i, matchStarts[i], actualStart);
1974 }
1975 int32_t actualEnd = matcher->end(i, status);
1976 REGEX_CHECK_STATUS;
1977 if (actualEnd != matchEnds[i]) {
1978 errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
1979 __FILE__, __LINE__, i, matchEnds[i], actualEnd);
1980 }
1981 }
1982
1983 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
1984 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
1985
1986 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1987 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
1988 matcher->reset();
1989 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
1990
1991 matcher->lookingAt(status);
1992
1993 UnicodeString dest;
1994 UText destText = UTEXT_INITIALIZER;
1995 utext_openUnicodeString(&destText, &dest, &status);
1996 UText *result;
1997 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1998 // Test shallow-clone API
1999 int64_t group_len;
2000 result = matcher->group((UText *)NULL, group_len, status);
2001 REGEX_CHECK_STATUS;
2002 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2003 utext_close(result);
2004 result = matcher->group(0, &destText, group_len, status);
2005 REGEX_CHECK_STATUS;
2006 REGEX_ASSERT(result == &destText);
2007 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2008 // destText is now immutable, reopen it
2009 utext_close(&destText);
2010 utext_openUnicodeString(&destText, &dest, &status);
2011
2012 int64_t length;
2013 result = matcher->group(0, NULL, length, status);
2014 REGEX_CHECK_STATUS;
2015 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2016 utext_close(result);
2017 result = matcher->group(0, &destText, length, status);
2018 REGEX_CHECK_STATUS;
2019 REGEX_ASSERT(result == &destText);
2020 REGEX_ASSERT(utext_getNativeIndex(result) == 0);
2021 REGEX_ASSERT(length == 10);
2022 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2023
2024 // Capture Group 1 == "234567"
2025 result = matcher->group(1, NULL, length, status);
2026 REGEX_CHECK_STATUS;
2027 REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2028 REGEX_ASSERT(length == 6);
2029 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2030 utext_close(result);
2031
2032 result = matcher->group(1, &destText, length, status);
2033 REGEX_CHECK_STATUS;
2034 REGEX_ASSERT(result == &destText);
2035 REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2036 REGEX_ASSERT(length == 6);
2037 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2038 utext_close(result);
2039
2040 // Capture Group 2 == "45"
2041 result = matcher->group(2, NULL, length, status);
2042 REGEX_CHECK_STATUS;
2043 REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2044 REGEX_ASSERT(length == 2);
2045 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2046 utext_close(result);
2047
2048 result = matcher->group(2, &destText, length, status);
2049 REGEX_CHECK_STATUS;
2050 REGEX_ASSERT(result == &destText);
2051 REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2052 REGEX_ASSERT(length == 2);
2053 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2054 utext_close(result);
2055
2056 // Capture Group 3 == "89"
2057 result = matcher->group(3, NULL, length, status);
2058 REGEX_CHECK_STATUS;
2059 REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2060 REGEX_ASSERT(length == 2);
2061 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2062 utext_close(result);
2063
2064 result = matcher->group(3, &destText, length, status);
2065 REGEX_CHECK_STATUS;
2066 REGEX_ASSERT(result == &destText);
2067 REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2068 REGEX_ASSERT(length == 2);
2069 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2070 utext_close(result);
2071
2072 // Capture Group number out of range.
2073 status = U_ZERO_ERROR;
2074 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2075 status = U_ZERO_ERROR;
2076 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2077 status = U_ZERO_ERROR;
2078 matcher->reset();
2079 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2080
2081 delete matcher;
2082 delete pat;
2083
2084 utext_close(&destText);
2085 utext_close(&input);
2086 utext_close(&re);
2087 }
2088
2089 //
2090 // find
2091 //
2092 {
2093 int32_t flags=0;
2094 UParseError pe;
2095 UErrorCode status=U_ZERO_ERROR;
2096 UText re=UTEXT_INITIALIZER;
2097 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2098 utext_openUTF8(&re, str_abc, -1, &status);
2099
2100 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2101 REGEX_CHECK_STATUS;
2102 UText input = UTEXT_INITIALIZER;
2103 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2104 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2105 // 012345678901234567
2106
2107 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2108 REGEX_CHECK_STATUS;
2109 REGEX_ASSERT(matcher->find());
2110 REGEX_ASSERT(matcher->start(status) == 1);
2111 REGEX_ASSERT(matcher->find());
2112 REGEX_ASSERT(matcher->start(status) == 6);
2113 REGEX_ASSERT(matcher->find());
2114 REGEX_ASSERT(matcher->start(status) == 12);
2115 REGEX_ASSERT(matcher->find() == FALSE);
2116 REGEX_ASSERT(matcher->find() == FALSE);
2117
2118 matcher->reset();
2119 REGEX_ASSERT(matcher->find());
2120 REGEX_ASSERT(matcher->start(status) == 1);
2121
2122 REGEX_ASSERT(matcher->find(0, status));
2123 REGEX_ASSERT(matcher->start(status) == 1);
2124 REGEX_ASSERT(matcher->find(1, status));
2125 REGEX_ASSERT(matcher->start(status) == 1);
2126 REGEX_ASSERT(matcher->find(2, status));
2127 REGEX_ASSERT(matcher->start(status) == 6);
2128 REGEX_ASSERT(matcher->find(12, status));
2129 REGEX_ASSERT(matcher->start(status) == 12);
2130 REGEX_ASSERT(matcher->find(13, status) == FALSE);
2131 REGEX_ASSERT(matcher->find(16, status) == FALSE);
2132 REGEX_ASSERT(matcher->find(17, status) == FALSE);
2133 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2134
2135 status = U_ZERO_ERROR;
2136 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2137 status = U_ZERO_ERROR;
2138 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2139
2140 REGEX_ASSERT(matcher->groupCount() == 0);
2141
2142 delete matcher;
2143 delete pat;
2144
2145 utext_close(&input);
2146 utext_close(&re);
2147 }
2148
2149
2150 //
2151 // find, with \G in pattern (true if at the end of a previous match).
2152 //
2153 {
2154 int32_t flags=0;
2155 UParseError pe;
2156 UErrorCode status=U_ZERO_ERROR;
2157 UText re=UTEXT_INITIALIZER;
2158 const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2159 utext_openUTF8(&re, str_Gabcabc, -1, &status);
2160
2161 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2162
2163 REGEX_CHECK_STATUS;
2164 UText input = UTEXT_INITIALIZER;
2165 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2166 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2167 // 012345678901234567
2168
2169 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2170 REGEX_CHECK_STATUS;
2171 REGEX_ASSERT(matcher->find());
2172 REGEX_ASSERT(matcher->start(status) == 0);
2173 REGEX_ASSERT(matcher->start(1, status) == -1);
2174 REGEX_ASSERT(matcher->start(2, status) == 1);
2175
2176 REGEX_ASSERT(matcher->find());
2177 REGEX_ASSERT(matcher->start(status) == 4);
2178 REGEX_ASSERT(matcher->start(1, status) == 4);
2179 REGEX_ASSERT(matcher->start(2, status) == -1);
2180 REGEX_CHECK_STATUS;
2181
2182 delete matcher;
2183 delete pat;
2184
2185 utext_close(&input);
2186 utext_close(&re);
2187 }
2188
2189 //
2190 // find with zero length matches, match position should bump ahead
2191 // to prevent loops.
2192 //
2193 {
2194 int32_t i;
2195 UErrorCode status=U_ZERO_ERROR;
2196 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
2197 // using an always-true look-ahead.
2198 REGEX_CHECK_STATUS;
2199 UText s = UTEXT_INITIALIZER;
2200 utext_openUTF8(&s, " ", -1, &status);
2201 m.reset(&s);
2202 for (i=0; ; i++) {
2203 if (m.find() == FALSE) {
2204 break;
2205 }
2206 REGEX_ASSERT(m.start(status) == i);
2207 REGEX_ASSERT(m.end(status) == i);
2208 }
2209 REGEX_ASSERT(i==5);
2210
2211 // Check that the bump goes over characters outside the BMP OK
2212 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2213 unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2214 utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2215 m.reset(&s);
2216 for (i=0; ; i+=4) {
2217 if (m.find() == FALSE) {
2218 break;
2219 }
2220 REGEX_ASSERT(m.start(status) == i);
2221 REGEX_ASSERT(m.end(status) == i);
2222 }
2223 REGEX_ASSERT(i==20);
2224
2225 utext_close(&s);
2226 }
2227 {
2228 // find() loop breaking test.
2229 // with pattern of /.?/, should see a series of one char matches, then a single
2230 // match of zero length at the end of the input string.
2231 int32_t i;
2232 UErrorCode status=U_ZERO_ERROR;
2233 RegexMatcher m(".?", 0, status);
2234 REGEX_CHECK_STATUS;
2235 UText s = UTEXT_INITIALIZER;
2236 utext_openUTF8(&s, " ", -1, &status);
2237 m.reset(&s);
2238 for (i=0; ; i++) {
2239 if (m.find() == FALSE) {
2240 break;
2241 }
2242 REGEX_ASSERT(m.start(status) == i);
2243 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2244 }
2245 REGEX_ASSERT(i==5);
2246
2247 utext_close(&s);
2248 }
2249
2250
2251 //
2252 // Matchers with no input string behave as if they had an empty input string.
2253 //
2254
2255 {
2256 UErrorCode status = U_ZERO_ERROR;
2257 RegexMatcher m(".?", 0, status);
2258 REGEX_CHECK_STATUS;
2259 REGEX_ASSERT(m.find());
2260 REGEX_ASSERT(m.start(status) == 0);
2261 REGEX_ASSERT(m.input() == "");
2262 }
2263 {
2264 UErrorCode status = U_ZERO_ERROR;
2265 RegexPattern *p = RegexPattern::compile(".", 0, status);
2266 RegexMatcher *m = p->matcher(status);
2267 REGEX_CHECK_STATUS;
2268
2269 REGEX_ASSERT(m->find() == FALSE);
2270 REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2271 delete m;
2272 delete p;
2273 }
2274
2275 //
2276 // Regions
2277 //
2278 {
2279 UErrorCode status = U_ZERO_ERROR;
2280 UText testPattern = UTEXT_INITIALIZER;
2281 UText testText = UTEXT_INITIALIZER;
2282 regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2283 REGEX_VERBOSE_TEXT(&testPattern);
2284 regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2285 REGEX_VERBOSE_TEXT(&testText);
2286
2287 RegexMatcher m(&testPattern, &testText, 0, status);
2288 REGEX_CHECK_STATUS;
2289 REGEX_ASSERT(m.regionStart() == 0);
2290 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2291 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2292 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2293
2294 m.region(2,4, status);
2295 REGEX_CHECK_STATUS;
2296 REGEX_ASSERT(m.matches(status));
2297 REGEX_ASSERT(m.start(status)==2);
2298 REGEX_ASSERT(m.end(status)==4);
2299 REGEX_CHECK_STATUS;
2300
2301 m.reset();
2302 REGEX_ASSERT(m.regionStart() == 0);
2303 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2304
2305 regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2306 REGEX_VERBOSE_TEXT(&testText);
2307 m.reset(&testText);
2308 REGEX_ASSERT(m.regionStart() == 0);
2309 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2310
2311 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2312 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2313 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2314 REGEX_ASSERT(&m == &m.reset());
2315 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2316
2317 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2318 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2319 REGEX_ASSERT(&m == &m.reset());
2320 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2321
2322 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2323 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2324 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2325 REGEX_ASSERT(&m == &m.reset());
2326 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2327
2328 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2329 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2330 REGEX_ASSERT(&m == &m.reset());
2331 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2332
2333 utext_close(&testText);
2334 utext_close(&testPattern);
2335 }
2336
2337 //
2338 // hitEnd() and requireEnd()
2339 //
2340 {
2341 UErrorCode status = U_ZERO_ERROR;
2342 UText testPattern = UTEXT_INITIALIZER;
2343 UText testText = UTEXT_INITIALIZER;
2344 const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2345 const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2346 utext_openUTF8(&testPattern, str_, -1, &status);
2347 utext_openUTF8(&testText, str_aabb, -1, &status);
2348
2349 RegexMatcher m1(&testPattern, &testText, 0, status);
2350 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2351 REGEX_ASSERT(m1.hitEnd() == TRUE);
2352 REGEX_ASSERT(m1.requireEnd() == FALSE);
2353 REGEX_CHECK_STATUS;
2354
2355 status = U_ZERO_ERROR;
2356 const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2357 utext_openUTF8(&testPattern, str_a, -1, &status);
2358 RegexMatcher m2(&testPattern, &testText, 0, status);
2359 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2360 REGEX_ASSERT(m2.hitEnd() == FALSE);
2361 REGEX_ASSERT(m2.requireEnd() == FALSE);
2362 REGEX_CHECK_STATUS;
2363
2364 status = U_ZERO_ERROR;
2365 const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2366 utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2367 RegexMatcher m3(&testPattern, &testText, 0, status);
2368 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2369 REGEX_ASSERT(m3.hitEnd() == TRUE);
2370 REGEX_ASSERT(m3.requireEnd() == TRUE);
2371 REGEX_CHECK_STATUS;
2372
2373 utext_close(&testText);
2374 utext_close(&testPattern);
2375 }
2376 }
2377
2378
2379 //---------------------------------------------------------------------------
2380 //
2381 // API_Replace_UTF8 API test for class RegexMatcher, testing the
2382 // Replace family of functions.
2383 //
2384 //---------------------------------------------------------------------------
API_Replace_UTF8()2385 void RegexTest::API_Replace_UTF8() {
2386 //
2387 // Replace
2388 //
2389 int32_t flags=0;
2390 UParseError pe;
2391 UErrorCode status=U_ZERO_ERROR;
2392
2393 UText re=UTEXT_INITIALIZER;
2394 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2395 REGEX_VERBOSE_TEXT(&re);
2396 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2397 REGEX_CHECK_STATUS;
2398
2399 char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2400 // 012345678901234567
2401 UText dataText = UTEXT_INITIALIZER;
2402 utext_openUTF8(&dataText, data, -1, &status);
2403 REGEX_CHECK_STATUS;
2404 REGEX_VERBOSE_TEXT(&dataText);
2405 RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2406
2407 //
2408 // Plain vanilla matches.
2409 //
2410 UnicodeString dest;
2411 UText destText = UTEXT_INITIALIZER;
2412 utext_openUnicodeString(&destText, &dest, &status);
2413 UText *result;
2414
2415 UText replText = UTEXT_INITIALIZER;
2416
2417 const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2418 utext_openUTF8(&replText, str_yz, -1, &status);
2419 REGEX_VERBOSE_TEXT(&replText);
2420 result = matcher->replaceFirst(&replText, NULL, status);
2421 REGEX_CHECK_STATUS;
2422 const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2423 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2424 utext_close(result);
2425 result = matcher->replaceFirst(&replText, &destText, status);
2426 REGEX_CHECK_STATUS;
2427 REGEX_ASSERT(result == &destText);
2428 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2429
2430 result = matcher->replaceAll(&replText, NULL, status);
2431 REGEX_CHECK_STATUS;
2432 const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2433 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2434 utext_close(result);
2435
2436 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2437 result = matcher->replaceAll(&replText, &destText, status);
2438 REGEX_CHECK_STATUS;
2439 REGEX_ASSERT(result == &destText);
2440 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2441
2442 //
2443 // Plain vanilla non-matches.
2444 //
2445 const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2446 utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2447 matcher->reset(&dataText);
2448
2449 result = matcher->replaceFirst(&replText, NULL, status);
2450 REGEX_CHECK_STATUS;
2451 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2452 utext_close(result);
2453 result = matcher->replaceFirst(&replText, &destText, status);
2454 REGEX_CHECK_STATUS;
2455 REGEX_ASSERT(result == &destText);
2456 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2457
2458 result = matcher->replaceAll(&replText, NULL, status);
2459 REGEX_CHECK_STATUS;
2460 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2461 utext_close(result);
2462 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2463 result = matcher->replaceAll(&replText, &destText, status);
2464 REGEX_CHECK_STATUS;
2465 REGEX_ASSERT(result == &destText);
2466 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2467
2468 //
2469 // Empty source string
2470 //
2471 utext_openUTF8(&dataText, NULL, 0, &status);
2472 matcher->reset(&dataText);
2473
2474 result = matcher->replaceFirst(&replText, NULL, status);
2475 REGEX_CHECK_STATUS;
2476 REGEX_ASSERT_UTEXT_UTF8("", result);
2477 utext_close(result);
2478 result = matcher->replaceFirst(&replText, &destText, status);
2479 REGEX_CHECK_STATUS;
2480 REGEX_ASSERT(result == &destText);
2481 REGEX_ASSERT_UTEXT_UTF8("", result);
2482
2483 result = matcher->replaceAll(&replText, NULL, status);
2484 REGEX_CHECK_STATUS;
2485 REGEX_ASSERT_UTEXT_UTF8("", result);
2486 utext_close(result);
2487 result = matcher->replaceAll(&replText, &destText, status);
2488 REGEX_CHECK_STATUS;
2489 REGEX_ASSERT(result == &destText);
2490 REGEX_ASSERT_UTEXT_UTF8("", result);
2491
2492 //
2493 // Empty substitution string
2494 //
2495 utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2496 matcher->reset(&dataText);
2497
2498 utext_openUTF8(&replText, NULL, 0, &status);
2499 result = matcher->replaceFirst(&replText, NULL, status);
2500 REGEX_CHECK_STATUS;
2501 const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2502 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2503 utext_close(result);
2504 result = matcher->replaceFirst(&replText, &destText, status);
2505 REGEX_CHECK_STATUS;
2506 REGEX_ASSERT(result == &destText);
2507 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2508
2509 result = matcher->replaceAll(&replText, NULL, status);
2510 REGEX_CHECK_STATUS;
2511 const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2512 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2513 utext_close(result);
2514 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2515 result = matcher->replaceAll(&replText, &destText, status);
2516 REGEX_CHECK_STATUS;
2517 REGEX_ASSERT(result == &destText);
2518 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2519
2520 //
2521 // match whole string
2522 //
2523 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2524 utext_openUTF8(&dataText, str_abc, -1, &status);
2525 matcher->reset(&dataText);
2526
2527 const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2528 utext_openUTF8(&replText, str_xyz, -1, &status);
2529 result = matcher->replaceFirst(&replText, NULL, status);
2530 REGEX_CHECK_STATUS;
2531 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2532 utext_close(result);
2533 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2534 result = matcher->replaceFirst(&replText, &destText, status);
2535 REGEX_CHECK_STATUS;
2536 REGEX_ASSERT(result == &destText);
2537 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2538
2539 result = matcher->replaceAll(&replText, NULL, status);
2540 REGEX_CHECK_STATUS;
2541 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2542 utext_close(result);
2543 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2544 result = matcher->replaceAll(&replText, &destText, status);
2545 REGEX_CHECK_STATUS;
2546 REGEX_ASSERT(result == &destText);
2547 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2548
2549 //
2550 // Capture Group, simple case
2551 //
2552 const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2553 utext_openUTF8(&re, str_add, -1, &status);
2554 RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2555 REGEX_CHECK_STATUS;
2556
2557 const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2558 utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2559 RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2560 REGEX_CHECK_STATUS;
2561
2562 const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2563 utext_openUTF8(&replText, str_11, -1, &status);
2564 result = matcher2->replaceFirst(&replText, NULL, status);
2565 REGEX_CHECK_STATUS;
2566 const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2567 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2568 utext_close(result);
2569 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2570 result = matcher2->replaceFirst(&replText, &destText, status);
2571 REGEX_CHECK_STATUS;
2572 REGEX_ASSERT(result == &destText);
2573 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2574
2575 const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2576 utext_openUTF8(&replText, str_v, -1, &status);
2577 REGEX_VERBOSE_TEXT(&replText);
2578 result = matcher2->replaceFirst(&replText, NULL, status);
2579 REGEX_CHECK_STATUS;
2580 const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2581 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2582 utext_close(result);
2583 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2584 result = matcher2->replaceFirst(&replText, &destText, status);
2585 REGEX_CHECK_STATUS;
2586 REGEX_ASSERT(result == &destText);
2587 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2588
2589 const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2590 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2591 0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2592 utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2593 result = matcher2->replaceFirst(&replText, NULL, status);
2594 REGEX_CHECK_STATUS;
2595 const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2596 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2597 utext_close(result);
2598 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2599 result = matcher2->replaceFirst(&replText, &destText, status);
2600 REGEX_CHECK_STATUS;
2601 REGEX_ASSERT(result == &destText);
2602 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2603
2604 unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2605 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2606 // 012345678901234567890123456
2607 supplDigitChars[22] = 0xF0;
2608 supplDigitChars[23] = 0x9D;
2609 supplDigitChars[24] = 0x9F;
2610 supplDigitChars[25] = 0x8F;
2611 utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2612
2613 result = matcher2->replaceFirst(&replText, NULL, status);
2614 REGEX_CHECK_STATUS;
2615 const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2616 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2617 utext_close(result);
2618 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2619 result = matcher2->replaceFirst(&replText, &destText, status);
2620 REGEX_CHECK_STATUS;
2621 REGEX_ASSERT(result == &destText);
2622 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2623 const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5..." */
2624 utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2625 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2626 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2627 utext_close(result);
2628 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2629 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2630 REGEX_ASSERT(result == &destText);
2631 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2632
2633 //
2634 // Replacement String with \u hex escapes
2635 //
2636 {
2637 const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2638 const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2639 utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2640 utext_openUTF8(&replText, str_u0043, -1, &status);
2641 matcher->reset(&dataText);
2642
2643 result = matcher->replaceAll(&replText, NULL, status);
2644 REGEX_CHECK_STATUS;
2645 const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2646 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2647 utext_close(result);
2648 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2649 result = matcher->replaceAll(&replText, &destText, status);
2650 REGEX_CHECK_STATUS;
2651 REGEX_ASSERT(result == &destText);
2652 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2653 }
2654 {
2655 const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2656 utext_openUTF8(&dataText, str_abc, -1, &status);
2657 const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2658 utext_openUTF8(&replText, str_U00010000, -1, &status);
2659 matcher->reset(&dataText);
2660
2661 unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2662 // 0123456789
2663 expected[2] = 0xF0;
2664 expected[3] = 0x90;
2665 expected[4] = 0x80;
2666 expected[5] = 0x80;
2667
2668 result = matcher->replaceAll(&replText, NULL, status);
2669 REGEX_CHECK_STATUS;
2670 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2671 utext_close(result);
2672 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2673 result = matcher->replaceAll(&replText, &destText, status);
2674 REGEX_CHECK_STATUS;
2675 REGEX_ASSERT(result == &destText);
2676 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2677 }
2678 // TODO: need more thorough testing of capture substitutions.
2679
2680 // Bug 4057
2681 //
2682 {
2683 status = U_ZERO_ERROR;
2684 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2685 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2686 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2687 utext_openUTF8(&re, str_ssee, -1, &status);
2688 utext_openUTF8(&dataText, str_blah, -1, &status);
2689 utext_openUTF8(&replText, str_ooh, -1, &status);
2690
2691 RegexMatcher m(&re, 0, status);
2692 REGEX_CHECK_STATUS;
2693
2694 UnicodeString result;
2695 UText resultText = UTEXT_INITIALIZER;
2696 utext_openUnicodeString(&resultText, &result, &status);
2697
2698 // Multiple finds do NOT bump up the previous appendReplacement postion.
2699 m.reset(&dataText);
2700 m.find();
2701 m.find();
2702 m.appendReplacement(&resultText, &replText, status);
2703 REGEX_CHECK_STATUS;
2704 const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2705 REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2706
2707 // After a reset into the interior of a string, appendReplacement still starts at beginning.
2708 status = U_ZERO_ERROR;
2709 result.truncate(0);
2710 utext_openUnicodeString(&resultText, &result, &status);
2711 m.reset(10, status);
2712 m.find();
2713 m.find();
2714 m.appendReplacement(&resultText, &replText, status);
2715 REGEX_CHECK_STATUS;
2716 const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2717 REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2718
2719 // find() at interior of string, appendReplacement still starts at beginning.
2720 status = U_ZERO_ERROR;
2721 result.truncate(0);
2722 utext_openUnicodeString(&resultText, &result, &status);
2723 m.reset();
2724 m.find(10, status);
2725 m.find();
2726 m.appendReplacement(&resultText, &replText, status);
2727 REGEX_CHECK_STATUS;
2728 const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2729 REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2730
2731 m.appendTail(&resultText, status);
2732 const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2733 REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2734
2735 utext_close(&resultText);
2736 }
2737
2738 delete matcher2;
2739 delete pat2;
2740 delete matcher;
2741 delete pat;
2742
2743 utext_close(&dataText);
2744 utext_close(&replText);
2745 utext_close(&destText);
2746 utext_close(&re);
2747 }
2748
2749
2750 //---------------------------------------------------------------------------
2751 //
2752 // API_Pattern_UTF8 Test that the API for class RegexPattern is
2753 // present and nominally working.
2754 //
2755 //---------------------------------------------------------------------------
API_Pattern_UTF8()2756 void RegexTest::API_Pattern_UTF8() {
2757 RegexPattern pata; // Test default constructor to not crash.
2758 RegexPattern patb;
2759
2760 REGEX_ASSERT(pata == patb);
2761 REGEX_ASSERT(pata == pata);
2762
2763 UText re1 = UTEXT_INITIALIZER;
2764 UText re2 = UTEXT_INITIALIZER;
2765 UErrorCode status = U_ZERO_ERROR;
2766 UParseError pe;
2767
2768 const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2769 const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2770 utext_openUTF8(&re1, str_abcalmz, -1, &status);
2771 utext_openUTF8(&re2, str_def, -1, &status);
2772
2773 RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2774 RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2775 REGEX_CHECK_STATUS;
2776 REGEX_ASSERT(*pat1 == *pat1);
2777 REGEX_ASSERT(*pat1 != pata);
2778
2779 // Assign
2780 patb = *pat1;
2781 REGEX_ASSERT(patb == *pat1);
2782
2783 // Copy Construct
2784 RegexPattern patc(*pat1);
2785 REGEX_ASSERT(patc == *pat1);
2786 REGEX_ASSERT(patb == patc);
2787 REGEX_ASSERT(pat1 != pat2);
2788 patb = *pat2;
2789 REGEX_ASSERT(patb != patc);
2790 REGEX_ASSERT(patb == *pat2);
2791
2792 // Compile with no flags.
2793 RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status);
2794 REGEX_ASSERT(*pat1a == *pat1);
2795
2796 REGEX_ASSERT(pat1a->flags() == 0);
2797
2798 // Compile with different flags should be not equal
2799 RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2800 REGEX_CHECK_STATUS;
2801
2802 REGEX_ASSERT(*pat1b != *pat1a);
2803 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2804 REGEX_ASSERT(pat1a->flags() == 0);
2805 delete pat1b;
2806
2807 // clone
2808 RegexPattern *pat1c = pat1->clone();
2809 REGEX_ASSERT(*pat1c == *pat1);
2810 REGEX_ASSERT(*pat1c != *pat2);
2811
2812 delete pat1c;
2813 delete pat1a;
2814 delete pat1;
2815 delete pat2;
2816
2817 utext_close(&re1);
2818 utext_close(&re2);
2819
2820
2821 //
2822 // Verify that a matcher created from a cloned pattern works.
2823 // (Jitterbug 3423)
2824 //
2825 {
2826 UErrorCode status = U_ZERO_ERROR;
2827 UText pattern = UTEXT_INITIALIZER;
2828 const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2829 utext_openUTF8(&pattern, str_pL, -1, &status);
2830
2831 RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status);
2832 RegexPattern *pClone = pSource->clone();
2833 delete pSource;
2834 RegexMatcher *mFromClone = pClone->matcher(status);
2835 REGEX_CHECK_STATUS;
2836
2837 UText input = UTEXT_INITIALIZER;
2838 const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2839 utext_openUTF8(&input, str_HelloWorld, -1, &status);
2840 mFromClone->reset(&input);
2841 REGEX_ASSERT(mFromClone->find() == TRUE);
2842 REGEX_ASSERT(mFromClone->group(status) == "Hello");
2843 REGEX_ASSERT(mFromClone->find() == TRUE);
2844 REGEX_ASSERT(mFromClone->group(status) == "World");
2845 REGEX_ASSERT(mFromClone->find() == FALSE);
2846 delete mFromClone;
2847 delete pClone;
2848
2849 utext_close(&input);
2850 utext_close(&pattern);
2851 }
2852
2853 //
2854 // matches convenience API
2855 //
2856 {
2857 UErrorCode status = U_ZERO_ERROR;
2858 UText pattern = UTEXT_INITIALIZER;
2859 UText input = UTEXT_INITIALIZER;
2860
2861 const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2862 utext_openUTF8(&input, str_randominput, -1, &status);
2863
2864 const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2865 utext_openUTF8(&pattern, str_dotstar, -1, &status);
2866 REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2867 REGEX_CHECK_STATUS;
2868
2869 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2870 utext_openUTF8(&pattern, str_abc, -1, &status);
2871 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2872 REGEX_CHECK_STATUS;
2873
2874 const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2875 utext_openUTF8(&pattern, str_nput, -1, &status);
2876 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2877 REGEX_CHECK_STATUS;
2878
2879 utext_openUTF8(&pattern, str_randominput, -1, &status);
2880 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2881 REGEX_CHECK_STATUS;
2882
2883 const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2884 utext_openUTF8(&pattern, str_u, -1, &status);
2885 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2886 REGEX_CHECK_STATUS;
2887
2888 utext_openUTF8(&input, str_abc, -1, &status);
2889 utext_openUTF8(&pattern, str_abc, -1, &status);
2890 status = U_INDEX_OUTOFBOUNDS_ERROR;
2891 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2892 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2893
2894 utext_close(&input);
2895 utext_close(&pattern);
2896 }
2897
2898
2899 //
2900 // Split()
2901 //
2902 status = U_ZERO_ERROR;
2903 const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /* + */
2904 utext_openUTF8(&re1, str_spaceplus, -1, &status);
2905 pat1 = RegexPattern::compile(&re1, pe, status);
2906 REGEX_CHECK_STATUS;
2907 UnicodeString fields[10];
2908
2909 int32_t n;
2910 n = pat1->split("Now is the time", fields, 10, status);
2911 REGEX_CHECK_STATUS;
2912 REGEX_ASSERT(n==4);
2913 REGEX_ASSERT(fields[0]=="Now");
2914 REGEX_ASSERT(fields[1]=="is");
2915 REGEX_ASSERT(fields[2]=="the");
2916 REGEX_ASSERT(fields[3]=="time");
2917 REGEX_ASSERT(fields[4]=="");
2918
2919 n = pat1->split("Now is the time", fields, 2, status);
2920 REGEX_CHECK_STATUS;
2921 REGEX_ASSERT(n==2);
2922 REGEX_ASSERT(fields[0]=="Now");
2923 REGEX_ASSERT(fields[1]=="is the time");
2924 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
2925
2926 fields[1] = "*";
2927 status = U_ZERO_ERROR;
2928 n = pat1->split("Now is the time", fields, 1, status);
2929 REGEX_CHECK_STATUS;
2930 REGEX_ASSERT(n==1);
2931 REGEX_ASSERT(fields[0]=="Now is the time");
2932 REGEX_ASSERT(fields[1]=="*");
2933 status = U_ZERO_ERROR;
2934
2935 n = pat1->split(" Now is the time ", fields, 10, status);
2936 REGEX_CHECK_STATUS;
2937 REGEX_ASSERT(n==6);
2938 REGEX_ASSERT(fields[0]=="");
2939 REGEX_ASSERT(fields[1]=="Now");
2940 REGEX_ASSERT(fields[2]=="is");
2941 REGEX_ASSERT(fields[3]=="the");
2942 REGEX_ASSERT(fields[4]=="time");
2943 REGEX_ASSERT(fields[5]=="");
2944 REGEX_ASSERT(fields[6]=="");
2945
2946 fields[2] = "*";
2947 n = pat1->split(" ", fields, 10, status);
2948 REGEX_CHECK_STATUS;
2949 REGEX_ASSERT(n==2);
2950 REGEX_ASSERT(fields[0]=="");
2951 REGEX_ASSERT(fields[1]=="");
2952 REGEX_ASSERT(fields[2]=="*");
2953
2954 fields[0] = "foo";
2955 n = pat1->split("", fields, 10, status);
2956 REGEX_CHECK_STATUS;
2957 REGEX_ASSERT(n==0);
2958 REGEX_ASSERT(fields[0]=="foo");
2959
2960 delete pat1;
2961
2962 // split, with a pattern with (capture)
2963 regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
2964 pat1 = RegexPattern::compile(&re1, pe, status);
2965 REGEX_CHECK_STATUS;
2966
2967 status = U_ZERO_ERROR;
2968 fields[6] = fields[7] = "*";
2969 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
2970 REGEX_CHECK_STATUS;
2971 REGEX_ASSERT(n==7);
2972 REGEX_ASSERT(fields[0]=="");
2973 REGEX_ASSERT(fields[1]=="a");
2974 REGEX_ASSERT(fields[2]=="Now is ");
2975 REGEX_ASSERT(fields[3]=="b");
2976 REGEX_ASSERT(fields[4]=="the time");
2977 REGEX_ASSERT(fields[5]=="c");
2978 REGEX_ASSERT(fields[6]=="");
2979 REGEX_ASSERT(fields[7]=="*");
2980 REGEX_ASSERT(status==U_ZERO_ERROR);
2981
2982 fields[6] = fields[7] = "*";
2983 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
2984 REGEX_CHECK_STATUS;
2985 REGEX_ASSERT(n==7);
2986 REGEX_ASSERT(fields[0]==" ");
2987 REGEX_ASSERT(fields[1]=="a");
2988 REGEX_ASSERT(fields[2]=="Now is ");
2989 REGEX_ASSERT(fields[3]=="b");
2990 REGEX_ASSERT(fields[4]=="the time");
2991 REGEX_ASSERT(fields[5]=="c");
2992 REGEX_ASSERT(fields[6]=="");
2993 REGEX_ASSERT(fields[7]=="*");
2994
2995 status = U_ZERO_ERROR;
2996 fields[6] = "foo";
2997 n = pat1->split(" <a>Now is <b>the time<c> ", fields, 6, status);
2998 REGEX_CHECK_STATUS;
2999 REGEX_ASSERT(n==6);
3000 REGEX_ASSERT(fields[0]==" ");
3001 REGEX_ASSERT(fields[1]=="a");
3002 REGEX_ASSERT(fields[2]=="Now is ");
3003 REGEX_ASSERT(fields[3]=="b");
3004 REGEX_ASSERT(fields[4]=="the time");
3005 REGEX_ASSERT(fields[5]==" ");
3006 REGEX_ASSERT(fields[6]=="foo");
3007
3008 status = U_ZERO_ERROR;
3009 fields[5] = "foo";
3010 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
3011 REGEX_CHECK_STATUS;
3012 REGEX_ASSERT(n==5);
3013 REGEX_ASSERT(fields[0]==" ");
3014 REGEX_ASSERT(fields[1]=="a");
3015 REGEX_ASSERT(fields[2]=="Now is ");
3016 REGEX_ASSERT(fields[3]=="b");
3017 REGEX_ASSERT(fields[4]=="the time<c>");
3018 REGEX_ASSERT(fields[5]=="foo");
3019
3020 status = U_ZERO_ERROR;
3021 fields[5] = "foo";
3022 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
3023 REGEX_CHECK_STATUS;
3024 REGEX_ASSERT(n==5);
3025 REGEX_ASSERT(fields[0]==" ");
3026 REGEX_ASSERT(fields[1]=="a");
3027 REGEX_ASSERT(fields[2]=="Now is ");
3028 REGEX_ASSERT(fields[3]=="b");
3029 REGEX_ASSERT(fields[4]=="the time");
3030 REGEX_ASSERT(fields[5]=="foo");
3031
3032 status = U_ZERO_ERROR;
3033 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
3034 REGEX_CHECK_STATUS;
3035 REGEX_ASSERT(n==4);
3036 REGEX_ASSERT(fields[0]==" ");
3037 REGEX_ASSERT(fields[1]=="a");
3038 REGEX_ASSERT(fields[2]=="Now is ");
3039 REGEX_ASSERT(fields[3]=="the time<c>");
3040 status = U_ZERO_ERROR;
3041 delete pat1;
3042
3043 regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3044 pat1 = RegexPattern::compile(&re1, pe, status);
3045 REGEX_CHECK_STATUS;
3046 n = pat1->split("1-10,20", fields, 10, status);
3047 REGEX_CHECK_STATUS;
3048 REGEX_ASSERT(n==5);
3049 REGEX_ASSERT(fields[0]=="1");
3050 REGEX_ASSERT(fields[1]=="-");
3051 REGEX_ASSERT(fields[2]=="10");
3052 REGEX_ASSERT(fields[3]==",");
3053 REGEX_ASSERT(fields[4]=="20");
3054 delete pat1;
3055
3056
3057 //
3058 // split of a UText based string, with library allocating output UTexts.
3059 //
3060 {
3061 status = U_ZERO_ERROR;
3062 RegexMatcher matcher(UnicodeString("(:)"), 0, status);
3063 UnicodeString stringToSplit("first:second:third");
3064 UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
3065 REGEX_CHECK_STATUS;
3066
3067 UText *splits[10] = {NULL};
3068 int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
3069 REGEX_CHECK_STATUS;
3070 REGEX_ASSERT(numFields == 5);
3071 REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
3072 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
3073 REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
3074 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
3075 REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
3076 REGEX_ASSERT(splits[5] == NULL);
3077
3078 for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
3079 if (splits[i]) {
3080 utext_close(splits[i]);
3081 splits[i] = NULL;
3082 }
3083 }
3084 utext_close(textToSplit);
3085 }
3086
3087
3088 //
3089 // RegexPattern::pattern() and patternText()
3090 //
3091 pat1 = new RegexPattern();
3092 REGEX_ASSERT(pat1->pattern() == "");
3093 REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3094 delete pat1;
3095 const char *helloWorldInvariant = "(Hello, world)*";
3096 regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3097 pat1 = RegexPattern::compile(&re1, pe, status);
3098 REGEX_CHECK_STATUS;
3099 REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
3100 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3101 delete pat1;
3102
3103 utext_close(&re1);
3104 }
3105
3106
//---------------------------------------------------------------------------
//
//      Extended       A more thorough check for features of regex patterns.
//                     The test cases are in a separate data file,
//                       source/test/testdata/regextst.txt
//                     A description of the test data format is included in that file.
//
//---------------------------------------------------------------------------
3115
3116 const char *
getPath(char buffer[2048],const char * filename)3117 RegexTest::getPath(char buffer[2048], const char *filename) {
3118 UErrorCode status=U_ZERO_ERROR;
3119 const char *testDataDirectory = IntlTest::getSourceTestData(status);
3120 if (U_FAILURE(status)) {
3121 errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3122 return NULL;
3123 }
3124
3125 strcpy(buffer, testDataDirectory);
3126 strcat(buffer, filename);
3127 return buffer;
3128 }
3129
Extended()3130 void RegexTest::Extended() {
3131 char tdd[2048];
3132 const char *srcPath;
3133 UErrorCode status = U_ZERO_ERROR;
3134 int32_t lineNum = 0;
3135
3136 //
3137 // Open and read the test data file.
3138 //
3139 srcPath=getPath(tdd, "regextst.txt");
3140 if(srcPath==NULL) {
3141 return; /* something went wrong, error already output */
3142 }
3143
3144 int32_t len;
3145 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3146 if (U_FAILURE(status)) {
3147 return; /* something went wrong, error already output */
3148 }
3149
3150 //
3151 // Put the test data into a UnicodeString
3152 //
3153 UnicodeString testString(FALSE, testData, len);
3154
3155 RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3156 RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3157 RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3158
3159 RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3160 UnicodeString testPattern; // The pattern for test from the test file.
3161 UnicodeString testFlags; // the flags for a test.
3162 UnicodeString matchString; // The marked up string to be used as input
3163
3164 if (U_FAILURE(status)){
3165 dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
3166 delete [] testData;
3167 return;
3168 }
3169
3170 //
3171 // Loop over the test data file, once per line.
3172 //
3173 while (lineMat.find()) {
3174 lineNum++;
3175 if (U_FAILURE(status)) {
3176 errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3177 }
3178
3179 status = U_ZERO_ERROR;
3180 UnicodeString testLine = lineMat.group(1, status);
3181 if (testLine.length() == 0) {
3182 continue;
3183 }
3184
3185 //
3186 // Parse the test line. Skip blank and comment only lines.
3187 // Separate out the three main fields - pattern, flags, target.
3188 //
3189
3190 commentMat.reset(testLine);
3191 if (commentMat.lookingAt(status)) {
3192 // This line is a comment, or blank.
3193 continue;
3194 }
3195
3196 //
3197 // Pull out the pattern field, remove it from the test file line.
3198 //
3199 quotedStuffMat.reset(testLine);
3200 if (quotedStuffMat.lookingAt(status)) {
3201 testPattern = quotedStuffMat.group(2, status);
3202 testLine.remove(0, quotedStuffMat.end(0, status));
3203 } else {
3204 errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3205 continue;
3206 }
3207
3208
3209 //
3210 // Pull out the flags from the test file line.
3211 //
3212 flagsMat.reset(testLine);
3213 flagsMat.lookingAt(status); // Will always match, possibly an empty string.
3214 testFlags = flagsMat.group(1, status);
3215 if (flagsMat.group(2, status).length() > 0) {
3216 errln("Bad Match flag at line %d. Scanning %c\n",
3217 lineNum, flagsMat.group(2, status).charAt(0));
3218 continue;
3219 }
3220 testLine.remove(0, flagsMat.end(0, status));
3221
3222 //
3223 // Pull out the match string, as a whole.
3224 // We'll process the <tags> later.
3225 //
3226 quotedStuffMat.reset(testLine);
3227 if (quotedStuffMat.lookingAt(status)) {
3228 matchString = quotedStuffMat.group(2, status);
3229 testLine.remove(0, quotedStuffMat.end(0, status));
3230 } else {
3231 errln("Bad match string at test file line %d", lineNum);
3232 continue;
3233 }
3234
3235 //
3236 // The only thing left from the input line should be an optional trailing comment.
3237 //
3238 commentMat.reset(testLine);
3239 if (commentMat.lookingAt(status) == FALSE) {
3240 errln("Line %d: unexpected characters at end of test line.", lineNum);
3241 continue;
3242 }
3243
3244 //
3245 // Run the test
3246 //
3247 regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3248 }
3249
3250 delete [] testData;
3251
3252 }
3253
3254
3255
3256 //---------------------------------------------------------------------------
3257 //
3258 // regex_find(pattern, flags, inputString, lineNumber)
3259 //
3260 // Function to run a single test from the Extended (data driven) tests.
3261 // See file test/testdata/regextst.txt for a description of the
3262 // pattern and inputString fields, and the allowed flags.
3263 // lineNumber is the source line in regextst.txt of the test.
3264 //
3265 //---------------------------------------------------------------------------
3266
3267
3268 // Set a value into a UVector at position specified by a decimal number in
3269 // a UnicodeString. This is a utility function needed by the actual test function,
3270 // which follows.
set(UVector & vec,int32_t val,UnicodeString index)3271 static void set(UVector &vec, int32_t val, UnicodeString index) {
3272 UErrorCode status=U_ZERO_ERROR;
3273 int32_t idx = 0;
3274 for (int32_t i=0; i<index.length(); i++) {
3275 int32_t d=u_charDigitValue(index.charAt(i));
3276 if (d<0) {return;}
3277 idx = idx*10 + d;
3278 }
3279 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3280 vec.setElementAt(val, idx);
3281 }
3282
setInt(UVector & vec,int32_t val,int32_t idx)3283 static void setInt(UVector &vec, int32_t val, int32_t idx) {
3284 UErrorCode status=U_ZERO_ERROR;
3285 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3286 vec.setElementAt(val, idx);
3287 }
3288
utextOffsetToNative(UText * utext,int32_t unistrOffset,int32_t & nativeIndex)3289 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3290 {
3291 UBool couldFind = TRUE;
3292 UTEXT_SETNATIVEINDEX(utext, 0);
3293 int32_t i = 0;
3294 while (i < unistrOffset) {
3295 UChar32 c = UTEXT_NEXT32(utext);
3296 if (c != U_SENTINEL) {
3297 i += U16_LENGTH(c);
3298 } else {
3299 couldFind = FALSE;
3300 break;
3301 }
3302 }
3303 nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3304 return couldFind;
3305 }
3306
3307
regex_find(const UnicodeString & pattern,const UnicodeString & flags,const UnicodeString & inputString,const char * srcPath,int32_t line)3308 void RegexTest::regex_find(const UnicodeString &pattern,
3309 const UnicodeString &flags,
3310 const UnicodeString &inputString,
3311 const char *srcPath,
3312 int32_t line) {
3313 UnicodeString unEscapedInput;
3314 UnicodeString deTaggedInput;
3315
3316 int32_t patternUTF8Length, inputUTF8Length;
3317 char *patternChars = NULL, *inputChars = NULL;
3318 UText patternText = UTEXT_INITIALIZER;
3319 UText inputText = UTEXT_INITIALIZER;
3320 UConverter *UTF8Converter = NULL;
3321
3322 UErrorCode status = U_ZERO_ERROR;
3323 UParseError pe;
3324 RegexPattern *parsePat = NULL;
3325 RegexMatcher *parseMatcher = NULL;
3326 RegexPattern *callerPattern = NULL, *UTF8Pattern = NULL;
3327 RegexMatcher *matcher = NULL, *UTF8Matcher = NULL;
3328 UVector groupStarts(status);
3329 UVector groupEnds(status);
3330 UVector groupStartsUTF8(status);
3331 UVector groupEndsUTF8(status);
3332 UBool isMatch = FALSE, isUTF8Match = FALSE;
3333 UBool failed = FALSE;
3334 int32_t numFinds;
3335 int32_t i;
3336 UBool useMatchesFunc = FALSE;
3337 UBool useLookingAtFunc = FALSE;
3338 int32_t regionStart = -1;
3339 int32_t regionEnd = -1;
3340 int32_t regionStartUTF8 = -1;
3341 int32_t regionEndUTF8 = -1;
3342
3343
3344 //
3345 // Compile the caller's pattern
3346 //
3347 uint32_t bflags = 0;
3348 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag
3349 bflags |= UREGEX_CASE_INSENSITIVE;
3350 }
3351 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag
3352 bflags |= UREGEX_COMMENTS;
3353 }
3354 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag
3355 bflags |= UREGEX_DOTALL;
3356 }
3357 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag
3358 bflags |= UREGEX_MULTILINE;
3359 }
3360
3361 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3362 bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3363 }
3364 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3365 bflags |= UREGEX_UNIX_LINES;
3366 }
3367 if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3368 bflags |= UREGEX_LITERAL;
3369 }
3370
3371
3372 callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3373 if (status != U_ZERO_ERROR) {
3374 #if UCONFIG_NO_BREAK_ITERATION==1
3375 // 'v' test flag means that the test pattern should not compile if ICU was configured
3376 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3377 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3378 goto cleanupAndReturn;
3379 }
3380 #endif
3381 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3382 // Expected pattern compilation error.
3383 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3384 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3385 }
3386 goto cleanupAndReturn;
3387 } else {
3388 // Unexpected pattern compilation error.
3389 dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3390 goto cleanupAndReturn;
3391 }
3392 }
3393
3394 UTF8Converter = ucnv_open("UTF8", &status);
3395 ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3396
3397 patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3398 status = U_ZERO_ERROR; // buffer overflow
3399 patternChars = new char[patternUTF8Length+1];
3400 pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3401 utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3402
3403 if (status == U_ZERO_ERROR) {
3404 UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3405
3406 if (status != U_ZERO_ERROR) {
3407 #if UCONFIG_NO_BREAK_ITERATION==1
3408 // 'v' test flag means that the test pattern should not compile if ICU was configured
3409 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3410 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3411 goto cleanupAndReturn;
3412 }
3413 #endif
3414 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3415 // Expected pattern compilation error.
3416 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3417 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3418 }
3419 goto cleanupAndReturn;
3420 } else {
3421 // Unexpected pattern compilation error.
3422 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3423 goto cleanupAndReturn;
3424 }
3425 }
3426 }
3427
3428 if (UTF8Pattern == NULL) {
3429 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3430 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3431 status = U_ZERO_ERROR;
3432 }
3433
3434 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag
3435 callerPattern->dumpPattern();
3436 }
3437
3438 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag
3439 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3440 goto cleanupAndReturn;
3441 }
3442
3443
3444 //
3445 // Number of times find() should be called on the test string, default to 1
3446 //
3447 numFinds = 1;
3448 for (i=2; i<=9; i++) {
3449 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
3450 if (numFinds != 1) {
3451 errln("Line %d: more than one digit flag. Scanning %d.", line, i);
3452 goto cleanupAndReturn;
3453 }
3454 numFinds = i;
3455 }
3456 }
3457
3458 // 'M' flag. Use matches() instead of find()
3459 if (flags.indexOf((UChar)0x4d) >= 0) {
3460 useMatchesFunc = TRUE;
3461 }
3462 if (flags.indexOf((UChar)0x4c) >= 0) {
3463 useLookingAtFunc = TRUE;
3464 }
3465
3466 //
3467 // Find the tags in the input data, remove them, and record the group boundary
3468 // positions.
3469 //
3470 parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3471 REGEX_CHECK_STATUS_L(line);
3472
3473 unEscapedInput = inputString.unescape();
3474 parseMatcher = parsePat->matcher(unEscapedInput, status);
3475 REGEX_CHECK_STATUS_L(line);
3476 while(parseMatcher->find()) {
3477 parseMatcher->appendReplacement(deTaggedInput, "", status);
3478 REGEX_CHECK_STATUS;
3479 UnicodeString groupNum = parseMatcher->group(2, status);
3480 if (groupNum == "r") {
3481 // <r> or </r>, a region specification within the string
3482 if (parseMatcher->group(1, status) == "/") {
3483 regionEnd = deTaggedInput.length();
3484 } else {
3485 regionStart = deTaggedInput.length();
3486 }
3487 } else {
3488 // <digits> or </digits>, a group match boundary tag.
3489 if (parseMatcher->group(1, status) == "/") {
3490 set(groupEnds, deTaggedInput.length(), groupNum);
3491 } else {
3492 set(groupStarts, deTaggedInput.length(), groupNum);
3493 }
3494 }
3495 }
3496 parseMatcher->appendTail(deTaggedInput);
3497 REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3498 if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3499 errln("mismatched <r> tags");
3500 failed = TRUE;
3501 goto cleanupAndReturn;
3502 }
3503
3504 //
3505 // Configure the matcher according to the flags specified with this test.
3506 //
3507 matcher = callerPattern->matcher(deTaggedInput, status);
3508 REGEX_CHECK_STATUS_L(line);
3509 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
3510 matcher->setTrace(TRUE);
3511 }
3512
3513 if (UTF8Pattern != NULL) {
3514 inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3515 status = U_ZERO_ERROR; // buffer overflow
3516 inputChars = new char[inputUTF8Length+1];
3517 deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3518 utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3519
3520 if (status == U_ZERO_ERROR) {
3521 UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3522 REGEX_CHECK_STATUS_L(line);
3523 }
3524
3525 if (UTF8Matcher == NULL) {
3526 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3527 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3528 status = U_ZERO_ERROR;
3529 }
3530 }
3531
3532 //
3533 // Generate native indices for UTF8 versions of region and capture group info
3534 //
3535 if (UTF8Matcher != NULL) {
3536 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
3537 UTF8Matcher->setTrace(TRUE);
3538 }
3539 if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3540 if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3541
3542 // Fill out the native index UVector info.
3543 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3544 for (i=0; i<groupStarts.size(); i++) {
3545 int32_t start = groupStarts.elementAti(i);
3546 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3547 if (start >= 0) {
3548 int32_t startUTF8;
3549 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3550 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line, i, start);
3551 failed = TRUE;
3552 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3553 }
3554 setInt(groupStartsUTF8, startUTF8, i);
3555 }
3556
3557 int32_t end = groupEnds.elementAti(i);
3558 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3559 if (end >= 0) {
3560 int32_t endUTF8;
3561 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3562 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line, i, end);
3563 failed = TRUE;
3564 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3565 }
3566 setInt(groupEndsUTF8, endUTF8, i);
3567 }
3568 }
3569 }
3570
3571 if (regionStart>=0) {
3572 matcher->region(regionStart, regionEnd, status);
3573 REGEX_CHECK_STATUS_L(line);
3574 if (UTF8Matcher != NULL) {
3575 UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3576 REGEX_CHECK_STATUS_L(line);
3577 }
3578 }
3579 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag
3580 matcher->useAnchoringBounds(FALSE);
3581 if (UTF8Matcher != NULL) {
3582 UTF8Matcher->useAnchoringBounds(FALSE);
3583 }
3584 }
3585 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag
3586 matcher->useTransparentBounds(TRUE);
3587 if (UTF8Matcher != NULL) {
3588 UTF8Matcher->useTransparentBounds(TRUE);
3589 }
3590 }
3591
3592
3593
3594 //
3595 // Do a find on the de-tagged input using the caller's pattern
3596 // TODO: error on count>1 and not find().
3597 // error on both matches() and lookingAt().
3598 //
3599 for (i=0; i<numFinds; i++) {
3600 if (useMatchesFunc) {
3601 isMatch = matcher->matches(status);
3602 if (UTF8Matcher != NULL) {
3603 isUTF8Match = UTF8Matcher->matches(status);
3604 }
3605 } else if (useLookingAtFunc) {
3606 isMatch = matcher->lookingAt(status);
3607 if (UTF8Matcher != NULL) {
3608 isUTF8Match = UTF8Matcher->lookingAt(status);
3609 }
3610 } else {
3611 isMatch = matcher->find();
3612 if (UTF8Matcher != NULL) {
3613 isUTF8Match = UTF8Matcher->find();
3614 }
3615 }
3616 }
3617 matcher->setTrace(FALSE);
3618 if (UTF8Matcher) {
3619 UTF8Matcher->setTrace(FALSE);
3620 }
3621 if (U_FAILURE(status)) {
3622 errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3623 }
3624
3625 //
3626 // Match up the groups from the find() with the groups from the tags
3627 //
3628
3629 // number of tags should match number of groups from find operation.
3630 // matcher->groupCount does not include group 0, the entire match, hence the +1.
3631 // G option in test means that capture group data is not available in the
3632 // expected results, so the check needs to be suppressed.
3633 if (isMatch == FALSE && groupStarts.size() != 0) {
3634 dataerrln("Error at line %d: Match expected, but none found.", line);
3635 failed = TRUE;
3636 goto cleanupAndReturn;
3637 } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3638 errln("Error at line %d: Match expected, but none found. (UTF8)", line);
3639 failed = TRUE;
3640 goto cleanupAndReturn;
3641 }
3642 if (isMatch && groupStarts.size() == 0) {
3643 errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status));
3644 failed = TRUE;
3645 }
3646 if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) {
3647 errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status));
3648 failed = TRUE;
3649 }
3650
3651 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3652 // Only check for match / no match. Don't check capture groups.
3653 goto cleanupAndReturn;
3654 }
3655
3656 REGEX_CHECK_STATUS_L(line);
3657 for (i=0; i<=matcher->groupCount(); i++) {
3658 int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3659 int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3660 if (matcher->start(i, status) != expectedStart) {
3661 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
3662 line, i, expectedStart, matcher->start(i, status));
3663 failed = TRUE;
3664 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3665 } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3666 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
3667 line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3668 failed = TRUE;
3669 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3670 }
3671
3672 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3673 int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3674 if (matcher->end(i, status) != expectedEnd) {
3675 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
3676 line, i, expectedEnd, matcher->end(i, status));
3677 failed = TRUE;
3678 // Error on end position; keep going; real error is probably yet to come as group
3679 // end positions work from end of the input data towards the front.
3680 } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3681 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
3682 line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3683 failed = TRUE;
3684 // Error on end position; keep going; real error is probably yet to come as group
3685 // end positions work from end of the input data towards the front.
3686 }
3687 }
3688 if ( matcher->groupCount()+1 < groupStarts.size()) {
3689 errln("Error at line %d: Expected %d capture groups, found %d.",
3690 line, groupStarts.size()-1, matcher->groupCount());
3691 failed = TRUE;
3692 }
3693 else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3694 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3695 line, groupStarts.size()-1, UTF8Matcher->groupCount());
3696 failed = TRUE;
3697 }
3698
3699 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3700 matcher->requireEnd() == TRUE) {
3701 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line);
3702 failed = TRUE;
3703 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3704 UTF8Matcher->requireEnd() == TRUE) {
3705 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line);
3706 failed = TRUE;
3707 }
3708
3709 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3710 matcher->requireEnd() == FALSE) {
3711 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line);
3712 failed = TRUE;
3713 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false
3714 UTF8Matcher->requireEnd() == FALSE) {
3715 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line);
3716 failed = TRUE;
3717 }
3718
3719 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3720 matcher->hitEnd() == TRUE) {
3721 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line);
3722 failed = TRUE;
3723 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3724 UTF8Matcher->hitEnd() == TRUE) {
3725 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line);
3726 failed = TRUE;
3727 }
3728
3729 if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3730 matcher->hitEnd() == FALSE) {
3731 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line);
3732 failed = TRUE;
3733 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3734 UTF8Matcher->hitEnd() == FALSE) {
3735 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line);
3736 failed = TRUE;
3737 }
3738
3739
3740 cleanupAndReturn:
3741 if (failed) {
3742 infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" "
3743 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");
3744 // callerPattern->dump();
3745 }
3746 delete parseMatcher;
3747 delete parsePat;
3748 delete UTF8Matcher;
3749 delete UTF8Pattern;
3750 delete matcher;
3751 delete callerPattern;
3752
3753 utext_close(&inputText);
3754 delete[] inputChars;
3755 utext_close(&patternText);
3756 delete[] patternChars;
3757 ucnv_close(UTF8Converter);
3758 }
3759
3760
3761
3762
3763 //---------------------------------------------------------------------------
3764 //
3765 // Errors Check for error handling in patterns.
3766 //
3767 //---------------------------------------------------------------------------
Errors()3768 void RegexTest::Errors() {
3769 // \escape sequences that aren't implemented yet.
3770 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3771
3772 // Missing close parentheses
3773 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3774 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3775 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3776
3777 // Extra close paren
3778 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3779 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3780 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3781
3782 // Look-ahead, Look-behind
3783 // TODO: add tests for unbounded length look-behinds.
3784 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
3785
3786 // Attempt to use non-default flags
3787 {
3788 UParseError pe;
3789 UErrorCode status = U_ZERO_ERROR;
3790 int32_t flags = UREGEX_CANON_EQ |
3791 UREGEX_COMMENTS | UREGEX_DOTALL |
3792 UREGEX_MULTILINE;
3793 RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3794 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3795 delete pat1;
3796 }
3797
3798
3799 // Quantifiers are allowed only after something that can be quantified.
3800 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3801 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3802 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3803
3804 // Mal-formed {min,max} quantifiers
3805 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3806 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3807 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3808 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3809 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3810 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3811 REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan
3812 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format
3813 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3814
3815 // Ticket 5389
3816 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3817
3818 // Invalid Back Reference \0
3819 // For ICU 3.8 and earlier
3820 // For ICU versions newer than 3.8, \0 introduces an octal escape.
3821 //
3822 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3823
3824 }
3825
3826
3827 //-------------------------------------------------------------------------------
3828 //
3829 // Read a text data file, convert it to UChars, and return the data
3830 // in one big UChar * buffer, which the caller must delete.
3831 //
3832 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int32_t & ulen,const char * defEncoding,UErrorCode & status)3833 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3834 const char *defEncoding, UErrorCode &status) {
3835 UChar *retPtr = NULL;
3836 char *fileBuf = NULL;
3837 UConverter* conv = NULL;
3838 FILE *f = NULL;
3839
3840 ulen = 0;
3841 if (U_FAILURE(status)) {
3842 return retPtr;
3843 }
3844
3845 //
3846 // Open the file.
3847 //
3848 f = fopen(fileName, "rb");
3849 if (f == 0) {
3850 dataerrln("Error opening test data file %s\n", fileName);
3851 status = U_FILE_ACCESS_ERROR;
3852 return NULL;
3853 }
3854 //
3855 // Read it in
3856 //
3857 int32_t fileSize;
3858 int32_t amt_read;
3859
3860 fseek( f, 0, SEEK_END);
3861 fileSize = ftell(f);
3862 fileBuf = new char[fileSize];
3863 fseek(f, 0, SEEK_SET);
3864 amt_read = fread(fileBuf, 1, fileSize, f);
3865 if (amt_read != fileSize || fileSize <= 0) {
3866 errln("Error reading test data file.");
3867 goto cleanUpAndReturn;
3868 }
3869
3870 //
3871 // Look for a Unicode Signature (BOM) on the data just read
3872 //
3873 int32_t signatureLength;
3874 const char * fileBufC;
3875 const char* encoding;
3876
3877 fileBufC = fileBuf;
3878 encoding = ucnv_detectUnicodeSignature(
3879 fileBuf, fileSize, &signatureLength, &status);
3880 if(encoding!=NULL ){
3881 fileBufC += signatureLength;
3882 fileSize -= signatureLength;
3883 } else {
3884 encoding = defEncoding;
3885 if (strcmp(encoding, "utf-8") == 0) {
3886 errln("file %s is missing its BOM", fileName);
3887 }
3888 }
3889
3890 //
3891 // Open a converter to take the rule file to UTF-16
3892 //
3893 conv = ucnv_open(encoding, &status);
3894 if (U_FAILURE(status)) {
3895 goto cleanUpAndReturn;
3896 }
3897
3898 //
3899 // Convert the rules to UChar.
3900 // Preflight first to determine required buffer size.
3901 //
3902 ulen = ucnv_toUChars(conv,
3903 NULL, // dest,
3904 0, // destCapacity,
3905 fileBufC,
3906 fileSize,
3907 &status);
3908 if (status == U_BUFFER_OVERFLOW_ERROR) {
3909 // Buffer Overflow is expected from the preflight operation.
3910 status = U_ZERO_ERROR;
3911
3912 retPtr = new UChar[ulen+1];
3913 ucnv_toUChars(conv,
3914 retPtr, // dest,
3915 ulen+1,
3916 fileBufC,
3917 fileSize,
3918 &status);
3919 }
3920
3921 cleanUpAndReturn:
3922 fclose(f);
3923 delete[] fileBuf;
3924 ucnv_close(conv);
3925 if (U_FAILURE(status)) {
3926 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3927 delete []retPtr;
3928 retPtr = 0;
3929 ulen = 0;
3930 };
3931 return retPtr;
3932 }
3933
3934
3935 //-------------------------------------------------------------------------------
3936 //
3937 // PerlTests - Run Perl's regular expression tests
3938 // The input file for this test is re_tests, the standard regular
3939 // expression test data distributed with the Perl source code.
3940 //
3941 // Here is Perl's description of the test data file:
3942 //
3943 // # The tests are in a separate file 't/op/re_tests'.
3944 // # Each line in that file is a separate test.
3945 // # There are five columns, separated by tabs.
3946 // #
3947 // # Column 1 contains the pattern, optionally enclosed in C<''>.
3948 // # Modifiers can be put after the closing C<'>.
3949 // #
3950 // # Column 2 contains the string to be matched.
3951 // #
3952 // # Column 3 contains the expected result:
3953 // # y expect a match
3954 // # n expect no match
3955 // # c expect an error
3956 // # B test exposes a known bug in Perl, should be skipped
3957 // # b test exposes a known bug in Perl, should be skipped if noamp
3958 // #
3959 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3960 // #
3961 // # Column 4 contains a string, usually C<$&>.
3962 // #
3963 // # Column 5 contains the expected result of double-quote
3964 // # interpolating that string after the match, or start of error message.
3965 // #
3966 // # Column 6, if present, contains a reason why the test is skipped.
3967 // # This is printed with "skipped", for harness to pick up.
3968 // #
3969 // # \n in the tests are interpolated, as are variables of the form ${\w+}.
3970 // #
3971 // # If you want to add a regular expression test that can't be expressed
3972 // # in this format, don't add it here: put it in op/pat.t instead.
3973 //
3974 // For ICU, if field 3 contains an 'i', the test will be skipped.
//        Such a test exposes some known incompatibility between ICU and Perl regexps.
3976 // (The i is in addition to whatever was there before.)
3977 //
3978 //-------------------------------------------------------------------------------
PerlTests()3979 void RegexTest::PerlTests() {
3980 char tdd[2048];
3981 const char *srcPath;
3982 UErrorCode status = U_ZERO_ERROR;
3983 UParseError pe;
3984
3985 //
3986 // Open and read the test data file.
3987 //
3988 srcPath=getPath(tdd, "re_tests.txt");
3989 if(srcPath==NULL) {
3990 return; /* something went wrong, error already output */
3991 }
3992
3993 int32_t len;
3994 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3995 if (U_FAILURE(status)) {
3996 return; /* something went wrong, error already output */
3997 }
3998
3999 //
4000 // Put the test data into a UnicodeString
4001 //
4002 UnicodeString testDataString(FALSE, testData, len);
4003
4004 //
4005 // Regex to break the input file into lines, and strip the new lines.
4006 // One line per match, capture group one is the desired data.
4007 //
4008 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4009 if (U_FAILURE(status)) {
4010 dataerrln("RegexPattern::compile() error");
4011 return;
4012 }
4013 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4014
4015 //
4016 // Regex to split a test file line into fields.
4017 // There are six fields, separated by tabs.
4018 //
4019 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4020
4021 //
4022 // Regex to identify test patterns with flag settings, and to separate them.
4023 // Test patterns with flags look like 'pattern'i
4024 // Test patterns without flags are not quoted: pattern
4025 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4026 //
4027 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4028 RegexMatcher* flagMat = flagPat->matcher(status);
4029
4030 //
4031 // The Perl tests reference several perl-isms, which are evaluated/substituted
4032 // in the test data. Not being perl, this must be done explicitly. Here
4033 // are string constants and REs for these constructs.
4034 //
4035 UnicodeString nulnulSrc("${nulnul}");
4036 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4037 nulnul = nulnul.unescape();
4038
4039 UnicodeString ffffSrc("${ffff}");
4040 UnicodeString ffff("\\uffff", -1, US_INV);
4041 ffff = ffff.unescape();
4042
4043 // regexp for $-[0], $+[2], etc.
4044 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4045 RegexMatcher *groupsMat = groupsPat->matcher(status);
4046
4047 // regexp for $0, $1, $2, etc.
4048 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4049 RegexMatcher *cgMat = cgPat->matcher(status);
4050
4051
4052 //
4053 // Main Loop for the Perl Tests, runs once per line from the
4054 // test data file.
4055 //
4056 int32_t lineNum = 0;
4057 int32_t skippedUnimplementedCount = 0;
4058 while (lineMat->find()) {
4059 lineNum++;
4060
4061 //
4062 // Get a line, break it into its fields, do the Perl
4063 // variable substitutions.
4064 //
4065 UnicodeString line = lineMat->group(1, status);
4066 UnicodeString fields[7];
4067 fieldPat->split(line, fields, 7, status);
4068
4069 flagMat->reset(fields[0]);
4070 flagMat->matches(status);
4071 UnicodeString pattern = flagMat->group(2, status);
4072 pattern.findAndReplace("${bang}", "!");
4073 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4074 pattern.findAndReplace(ffffSrc, ffff);
4075
4076 //
4077 // Identify patterns that include match flag settings,
4078 // split off the flags, remove the extra quotes.
4079 //
4080 UnicodeString flagStr = flagMat->group(3, status);
4081 if (U_FAILURE(status)) {
4082 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4083 return;
4084 }
4085 int32_t flags = 0;
4086 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4087 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4088 const UChar UChar_m = 0x6d;
4089 const UChar UChar_x = 0x78;
4090 const UChar UChar_y = 0x79;
4091 if (flagStr.indexOf(UChar_i) != -1) {
4092 flags |= UREGEX_CASE_INSENSITIVE;
4093 }
4094 if (flagStr.indexOf(UChar_m) != -1) {
4095 flags |= UREGEX_MULTILINE;
4096 }
4097 if (flagStr.indexOf(UChar_x) != -1) {
4098 flags |= UREGEX_COMMENTS;
4099 }
4100
4101 //
4102 // Compile the test pattern.
4103 //
4104 status = U_ZERO_ERROR;
4105 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4106 if (status == U_REGEX_UNIMPLEMENTED) {
4107 //
4108 // Test of a feature that is planned for ICU, but not yet implemented.
4109 // skip the test.
4110 skippedUnimplementedCount++;
4111 delete testPat;
4112 status = U_ZERO_ERROR;
4113 continue;
4114 }
4115
4116 if (U_FAILURE(status)) {
4117 // Some tests are supposed to generate errors.
4118 // Only report an error for tests that are supposed to succeed.
4119 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4120 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4121 {
4122 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4123 }
4124 status = U_ZERO_ERROR;
4125 delete testPat;
4126 continue;
4127 }
4128
4129 if (fields[2].indexOf(UChar_i) >= 0) {
4130 // ICU should skip this test.
4131 delete testPat;
4132 continue;
4133 }
4134
4135 if (fields[2].indexOf(UChar_c) >= 0) {
4136 // This pattern should have caused a compilation error, but didn't/
4137 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4138 delete testPat;
4139 continue;
4140 }
4141
4142 //
4143 // replace the Perl variables that appear in some of the
4144 // match data strings.
4145 //
4146 UnicodeString matchString = fields[1];
4147 matchString.findAndReplace(nulnulSrc, nulnul);
4148 matchString.findAndReplace(ffffSrc, ffff);
4149
4150 // Replace any \n in the match string with an actual new-line char.
4151 // Don't do full unescape, as this unescapes more than Perl does, which
4152 // causes other spurious failures in the tests.
4153 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4154
4155
4156
4157 //
4158 // Run the test, check for expected match/don't match result.
4159 //
4160 RegexMatcher *testMat = testPat->matcher(matchString, status);
4161 UBool found = testMat->find();
4162 UBool expected = FALSE;
4163 if (fields[2].indexOf(UChar_y) >=0) {
4164 expected = TRUE;
4165 }
4166 if (expected != found) {
4167 errln("line %d: Expected %smatch, got %smatch",
4168 lineNum, expected?"":"no ", found?"":"no " );
4169 continue;
4170 }
4171
4172 // Don't try to check expected results if there is no match.
4173 // (Some have stuff in the expected fields)
4174 if (!found) {
4175 delete testMat;
4176 delete testPat;
4177 continue;
4178 }
4179
4180 //
4181 // Interpret the Perl expression from the fourth field of the data file,
4182 // building up an ICU string from the results of the ICU match.
4183 // The Perl expression will contain references to the results of
4184 // a regex match, including the matched string, capture group strings,
4185 // group starting and ending indicies, etc.
4186 //
4187 UnicodeString resultString;
4188 UnicodeString perlExpr = fields[3];
4189 #if SUPPORT_MUTATING_INPUT_STRING
4190 groupsMat->reset(perlExpr);
4191 cgMat->reset(perlExpr);
4192 #endif
4193
4194 while (perlExpr.length() > 0) {
4195 #if !SUPPORT_MUTATING_INPUT_STRING
4196 // Perferred usage. Reset after any modification to input string.
4197 groupsMat->reset(perlExpr);
4198 cgMat->reset(perlExpr);
4199 #endif
4200
4201 if (perlExpr.startsWith("$&")) {
4202 resultString.append(testMat->group(status));
4203 perlExpr.remove(0, 2);
4204 }
4205
4206 else if (groupsMat->lookingAt(status)) {
4207 // $-[0] $+[2] etc.
4208 UnicodeString digitString = groupsMat->group(2, status);
4209 int32_t t = 0;
4210 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4211 UnicodeString plusOrMinus = groupsMat->group(1, status);
4212 int32_t matchPosition;
4213 if (plusOrMinus.compare("+") == 0) {
4214 matchPosition = testMat->end(groupNum, status);
4215 } else {
4216 matchPosition = testMat->start(groupNum, status);
4217 }
4218 if (matchPosition != -1) {
4219 ICU_Utility::appendNumber(resultString, matchPosition);
4220 }
4221 perlExpr.remove(0, groupsMat->end(status));
4222 }
4223
4224 else if (cgMat->lookingAt(status)) {
4225 // $1, $2, $3, etc.
4226 UnicodeString digitString = cgMat->group(1, status);
4227 int32_t t = 0;
4228 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4229 if (U_SUCCESS(status)) {
4230 resultString.append(testMat->group(groupNum, status));
4231 status = U_ZERO_ERROR;
4232 }
4233 perlExpr.remove(0, cgMat->end(status));
4234 }
4235
4236 else if (perlExpr.startsWith("@-")) {
4237 int32_t i;
4238 for (i=0; i<=testMat->groupCount(); i++) {
4239 if (i>0) {
4240 resultString.append(" ");
4241 }
4242 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4243 }
4244 perlExpr.remove(0, 2);
4245 }
4246
4247 else if (perlExpr.startsWith("@+")) {
4248 int32_t i;
4249 for (i=0; i<=testMat->groupCount(); i++) {
4250 if (i>0) {
4251 resultString.append(" ");
4252 }
4253 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4254 }
4255 perlExpr.remove(0, 2);
4256 }
4257
4258 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4259 // or as an escaped sequence (e.g. \n)
4260 if (perlExpr.length() > 1) {
4261 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4262 }
4263 UChar c = perlExpr.charAt(0);
4264 switch (c) {
4265 case 'n': c = '\n'; break;
4266 // add any other escape sequences that show up in the test expected results.
4267 }
4268 resultString.append(c);
4269 perlExpr.remove(0, 1);
4270 }
4271
4272 else {
4273 // Any characters from the perl expression that we don't explicitly
4274 // recognize before here are assumed to be literals and copied
4275 // as-is to the expected results.
4276 resultString.append(perlExpr.charAt(0));
4277 perlExpr.remove(0, 1);
4278 }
4279
4280 if (U_FAILURE(status)) {
4281 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4282 break;
4283 }
4284 }
4285
4286 //
4287 // Expected Results Compare
4288 //
4289 UnicodeString expectedS(fields[4]);
4290 expectedS.findAndReplace(nulnulSrc, nulnul);
4291 expectedS.findAndReplace(ffffSrc, ffff);
4292 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4293
4294
4295 if (expectedS.compare(resultString) != 0) {
4296 err("Line %d: Incorrect perl expression results.", lineNum);
4297 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4298 }
4299
4300 delete testMat;
4301 delete testPat;
4302 }
4303
4304 //
4305 // All done. Clean up allocated stuff.
4306 //
4307 delete cgMat;
4308 delete cgPat;
4309
4310 delete groupsMat;
4311 delete groupsPat;
4312
4313 delete flagMat;
4314 delete flagPat;
4315
4316 delete lineMat;
4317 delete linePat;
4318
4319 delete fieldPat;
4320 delete [] testData;
4321
4322
4323 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4324
4325 }
4326
4327
4328 //-------------------------------------------------------------------------------
4329 //
4330 // PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
4331 // (instead of using UnicodeStrings) to test the alternate engine.
4332 // The input file for this test is re_tests, the standard regular
4333 // expression test data distributed with the Perl source code.
4334 // See PerlTests() for more information.
4335 //
4336 //-------------------------------------------------------------------------------
PerlTestsUTF8()4337 void RegexTest::PerlTestsUTF8() {
4338 char tdd[2048];
4339 const char *srcPath;
4340 UErrorCode status = U_ZERO_ERROR;
4341 UParseError pe;
4342 LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4343 UText patternText = UTEXT_INITIALIZER;
4344 char *patternChars = NULL;
4345 int32_t patternLength;
4346 int32_t patternCapacity = 0;
4347 UText inputText = UTEXT_INITIALIZER;
4348 char *inputChars = NULL;
4349 int32_t inputLength;
4350 int32_t inputCapacity = 0;
4351
4352 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4353
4354 //
4355 // Open and read the test data file.
4356 //
4357 srcPath=getPath(tdd, "re_tests.txt");
4358 if(srcPath==NULL) {
4359 return; /* something went wrong, error already output */
4360 }
4361
4362 int32_t len;
4363 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4364 if (U_FAILURE(status)) {
4365 return; /* something went wrong, error already output */
4366 }
4367
4368 //
4369 // Put the test data into a UnicodeString
4370 //
4371 UnicodeString testDataString(FALSE, testData, len);
4372
4373 //
4374 // Regex to break the input file into lines, and strip the new lines.
4375 // One line per match, capture group one is the desired data.
4376 //
4377 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4378 if (U_FAILURE(status)) {
4379 dataerrln("RegexPattern::compile() error");
4380 return;
4381 }
4382 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4383
4384 //
4385 // Regex to split a test file line into fields.
4386 // There are six fields, separated by tabs.
4387 //
4388 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4389
4390 //
4391 // Regex to identify test patterns with flag settings, and to separate them.
4392 // Test patterns with flags look like 'pattern'i
4393 // Test patterns without flags are not quoted: pattern
4394 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4395 //
4396 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4397 RegexMatcher* flagMat = flagPat->matcher(status);
4398
4399 //
4400 // The Perl tests reference several perl-isms, which are evaluated/substituted
4401 // in the test data. Not being perl, this must be done explicitly. Here
4402 // are string constants and REs for these constructs.
4403 //
4404 UnicodeString nulnulSrc("${nulnul}");
4405 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4406 nulnul = nulnul.unescape();
4407
4408 UnicodeString ffffSrc("${ffff}");
4409 UnicodeString ffff("\\uffff", -1, US_INV);
4410 ffff = ffff.unescape();
4411
4412 // regexp for $-[0], $+[2], etc.
4413 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4414 RegexMatcher *groupsMat = groupsPat->matcher(status);
4415
4416 // regexp for $0, $1, $2, etc.
4417 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4418 RegexMatcher *cgMat = cgPat->matcher(status);
4419
4420
4421 //
4422 // Main Loop for the Perl Tests, runs once per line from the
4423 // test data file.
4424 //
4425 int32_t lineNum = 0;
4426 int32_t skippedUnimplementedCount = 0;
4427 while (lineMat->find()) {
4428 lineNum++;
4429
4430 //
4431 // Get a line, break it into its fields, do the Perl
4432 // variable substitutions.
4433 //
4434 UnicodeString line = lineMat->group(1, status);
4435 UnicodeString fields[7];
4436 fieldPat->split(line, fields, 7, status);
4437
4438 flagMat->reset(fields[0]);
4439 flagMat->matches(status);
4440 UnicodeString pattern = flagMat->group(2, status);
4441 pattern.findAndReplace("${bang}", "!");
4442 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4443 pattern.findAndReplace(ffffSrc, ffff);
4444
4445 //
4446 // Identify patterns that include match flag settings,
4447 // split off the flags, remove the extra quotes.
4448 //
4449 UnicodeString flagStr = flagMat->group(3, status);
4450 if (U_FAILURE(status)) {
4451 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4452 return;
4453 }
4454 int32_t flags = 0;
4455 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4456 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4457 const UChar UChar_m = 0x6d;
4458 const UChar UChar_x = 0x78;
4459 const UChar UChar_y = 0x79;
4460 if (flagStr.indexOf(UChar_i) != -1) {
4461 flags |= UREGEX_CASE_INSENSITIVE;
4462 }
4463 if (flagStr.indexOf(UChar_m) != -1) {
4464 flags |= UREGEX_MULTILINE;
4465 }
4466 if (flagStr.indexOf(UChar_x) != -1) {
4467 flags |= UREGEX_COMMENTS;
4468 }
4469
4470 //
4471 // Put the pattern in a UTF-8 UText
4472 //
4473 status = U_ZERO_ERROR;
4474 patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4475 if (status == U_BUFFER_OVERFLOW_ERROR) {
4476 status = U_ZERO_ERROR;
4477 delete[] patternChars;
4478 patternCapacity = patternLength + 1;
4479 patternChars = new char[patternCapacity];
4480 pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4481 }
4482 utext_openUTF8(&patternText, patternChars, patternLength, &status);
4483
4484 //
4485 // Compile the test pattern.
4486 //
4487 RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4488 if (status == U_REGEX_UNIMPLEMENTED) {
4489 //
4490 // Test of a feature that is planned for ICU, but not yet implemented.
4491 // skip the test.
4492 skippedUnimplementedCount++;
4493 delete testPat;
4494 status = U_ZERO_ERROR;
4495 continue;
4496 }
4497
4498 if (U_FAILURE(status)) {
4499 // Some tests are supposed to generate errors.
4500 // Only report an error for tests that are supposed to succeed.
4501 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4502 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4503 {
4504 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4505 }
4506 status = U_ZERO_ERROR;
4507 delete testPat;
4508 continue;
4509 }
4510
4511 if (fields[2].indexOf(UChar_i) >= 0) {
4512 // ICU should skip this test.
4513 delete testPat;
4514 continue;
4515 }
4516
4517 if (fields[2].indexOf(UChar_c) >= 0) {
4518 // This pattern should have caused a compilation error, but didn't/
4519 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4520 delete testPat;
4521 continue;
4522 }
4523
4524
4525 //
4526 // replace the Perl variables that appear in some of the
4527 // match data strings.
4528 //
4529 UnicodeString matchString = fields[1];
4530 matchString.findAndReplace(nulnulSrc, nulnul);
4531 matchString.findAndReplace(ffffSrc, ffff);
4532
4533 // Replace any \n in the match string with an actual new-line char.
4534 // Don't do full unescape, as this unescapes more than Perl does, which
4535 // causes other spurious failures in the tests.
4536 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4537
4538 //
4539 // Put the input in a UTF-8 UText
4540 //
4541 status = U_ZERO_ERROR;
4542 inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4543 if (status == U_BUFFER_OVERFLOW_ERROR) {
4544 status = U_ZERO_ERROR;
4545 delete[] inputChars;
4546 inputCapacity = inputLength + 1;
4547 inputChars = new char[inputCapacity];
4548 matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4549 }
4550 utext_openUTF8(&inputText, inputChars, inputLength, &status);
4551
4552 //
4553 // Run the test, check for expected match/don't match result.
4554 //
4555 RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4556 UBool found = testMat->find();
4557 UBool expected = FALSE;
4558 if (fields[2].indexOf(UChar_y) >=0) {
4559 expected = TRUE;
4560 }
4561 if (expected != found) {
4562 errln("line %d: Expected %smatch, got %smatch",
4563 lineNum, expected?"":"no ", found?"":"no " );
4564 continue;
4565 }
4566
4567 // Don't try to check expected results if there is no match.
4568 // (Some have stuff in the expected fields)
4569 if (!found) {
4570 delete testMat;
4571 delete testPat;
4572 continue;
4573 }
4574
4575 //
4576 // Interpret the Perl expression from the fourth field of the data file,
4577 // building up an ICU string from the results of the ICU match.
4578 // The Perl expression will contain references to the results of
4579 // a regex match, including the matched string, capture group strings,
4580 // group starting and ending indicies, etc.
4581 //
4582 UnicodeString resultString;
4583 UnicodeString perlExpr = fields[3];
4584
4585 while (perlExpr.length() > 0) {
4586 groupsMat->reset(perlExpr);
4587 cgMat->reset(perlExpr);
4588
4589 if (perlExpr.startsWith("$&")) {
4590 resultString.append(testMat->group(status));
4591 perlExpr.remove(0, 2);
4592 }
4593
4594 else if (groupsMat->lookingAt(status)) {
4595 // $-[0] $+[2] etc.
4596 UnicodeString digitString = groupsMat->group(2, status);
4597 int32_t t = 0;
4598 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4599 UnicodeString plusOrMinus = groupsMat->group(1, status);
4600 int32_t matchPosition;
4601 if (plusOrMinus.compare("+") == 0) {
4602 matchPosition = testMat->end(groupNum, status);
4603 } else {
4604 matchPosition = testMat->start(groupNum, status);
4605 }
4606 if (matchPosition != -1) {
4607 ICU_Utility::appendNumber(resultString, matchPosition);
4608 }
4609 perlExpr.remove(0, groupsMat->end(status));
4610 }
4611
4612 else if (cgMat->lookingAt(status)) {
4613 // $1, $2, $3, etc.
4614 UnicodeString digitString = cgMat->group(1, status);
4615 int32_t t = 0;
4616 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4617 if (U_SUCCESS(status)) {
4618 resultString.append(testMat->group(groupNum, status));
4619 status = U_ZERO_ERROR;
4620 }
4621 perlExpr.remove(0, cgMat->end(status));
4622 }
4623
4624 else if (perlExpr.startsWith("@-")) {
4625 int32_t i;
4626 for (i=0; i<=testMat->groupCount(); i++) {
4627 if (i>0) {
4628 resultString.append(" ");
4629 }
4630 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4631 }
4632 perlExpr.remove(0, 2);
4633 }
4634
4635 else if (perlExpr.startsWith("@+")) {
4636 int32_t i;
4637 for (i=0; i<=testMat->groupCount(); i++) {
4638 if (i>0) {
4639 resultString.append(" ");
4640 }
4641 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4642 }
4643 perlExpr.remove(0, 2);
4644 }
4645
4646 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4647 // or as an escaped sequence (e.g. \n)
4648 if (perlExpr.length() > 1) {
4649 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4650 }
4651 UChar c = perlExpr.charAt(0);
4652 switch (c) {
4653 case 'n': c = '\n'; break;
4654 // add any other escape sequences that show up in the test expected results.
4655 }
4656 resultString.append(c);
4657 perlExpr.remove(0, 1);
4658 }
4659
4660 else {
4661 // Any characters from the perl expression that we don't explicitly
4662 // recognize before here are assumed to be literals and copied
4663 // as-is to the expected results.
4664 resultString.append(perlExpr.charAt(0));
4665 perlExpr.remove(0, 1);
4666 }
4667
4668 if (U_FAILURE(status)) {
4669 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4670 break;
4671 }
4672 }
4673
4674 //
4675 // Expected Results Compare
4676 //
4677 UnicodeString expectedS(fields[4]);
4678 expectedS.findAndReplace(nulnulSrc, nulnul);
4679 expectedS.findAndReplace(ffffSrc, ffff);
4680 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4681
4682
4683 if (expectedS.compare(resultString) != 0) {
4684 err("Line %d: Incorrect perl expression results.", lineNum);
4685 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4686 }
4687
4688 delete testMat;
4689 delete testPat;
4690 }
4691
4692 //
4693 // All done. Clean up allocated stuff.
4694 //
4695 delete cgMat;
4696 delete cgPat;
4697
4698 delete groupsMat;
4699 delete groupsPat;
4700
4701 delete flagMat;
4702 delete flagPat;
4703
4704 delete lineMat;
4705 delete linePat;
4706
4707 delete fieldPat;
4708 delete [] testData;
4709
4710 utext_close(&patternText);
4711 utext_close(&inputText);
4712
4713 delete [] patternChars;
4714 delete [] inputChars;
4715
4716
4717 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4718
4719 }
4720
4721
4722 //--------------------------------------------------------------
4723 //
4724 // Bug6149 Verify limits to heap expansion for backtrack stack.
4725 // Use this pattern,
4726 // "(a?){1,8000000}"
//               Note: the upper bound used to be unbounded, but unbounded loops now have loop-breaking enabled.
4728 // This test is likely to be fragile, as further optimizations stop
4729 // more cases of pointless looping in the match engine.
4730 //
4731 //---------------------------------------------------------------
Bug6149()4732 void RegexTest::Bug6149() {
4733 UnicodeString pattern("(a?){1,8000000}");
4734 UnicodeString s("xyz");
4735 uint32_t flags = 0;
4736 UErrorCode status = U_ZERO_ERROR;
4737
4738 RegexMatcher matcher(pattern, s, flags, status);
4739 UBool result = false;
4740 REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4741 REGEX_ASSERT(result == FALSE);
4742 }
4743
4744
4745 //
//   Callbacks()    Test the callback function.
//                  When set, callbacks occur periodically during matching operations,
//                  giving the application code the ability to abort the operation
//                  before its normal completion.
4750 //
4751
4752 struct callBackContext {
4753 RegexTest *test;
4754 int32_t maxCalls;
4755 int32_t numCalls;
4756 int32_t lastSteps;
resetcallBackContext4757 void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4758 };
4759
4760 U_CDECL_BEGIN
4761 static UBool U_CALLCONV
testCallBackFn(const void * context,int32_t steps)4762 testCallBackFn(const void *context, int32_t steps) {
4763 callBackContext *info = (callBackContext *)context;
4764 if (info->lastSteps+1 != steps) {
4765 info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps);
4766 }
4767 info->lastSteps = steps;
4768 info->numCalls++;
4769 return (info->numCalls < info->maxCalls);
4770 }
4771 U_CDECL_END
4772
Callbacks()4773 void RegexTest::Callbacks() {
4774 {
4775 // Getter returns NULLs if no callback has been set
4776
4777 // The variables that the getter will fill in.
4778 // Init to non-null values so that the action of the getter can be seen.
4779 const void *returnedContext = &returnedContext;
4780 URegexMatchCallback *returnedFn = &testCallBackFn;
4781
4782 UErrorCode status = U_ZERO_ERROR;
4783 RegexMatcher matcher("x", 0, status);
4784 REGEX_CHECK_STATUS;
4785 matcher.getMatchCallback(returnedFn, returnedContext, status);
4786 REGEX_CHECK_STATUS;
4787 REGEX_ASSERT(returnedFn == NULL);
4788 REGEX_ASSERT(returnedContext == NULL);
4789 }
4790
4791 {
4792 // Set and Get work
4793 callBackContext cbInfo = {this, 0, 0, 0};
4794 const void *returnedContext;
4795 URegexMatchCallback *returnedFn;
4796 UErrorCode status = U_ZERO_ERROR;
4797 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.
4798 REGEX_CHECK_STATUS;
4799 matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4800 REGEX_CHECK_STATUS;
4801 matcher.getMatchCallback(returnedFn, returnedContext, status);
4802 REGEX_CHECK_STATUS;
4803 REGEX_ASSERT(returnedFn == testCallBackFn);
4804 REGEX_ASSERT(returnedContext == &cbInfo);
4805
4806 // A short-running match shouldn't invoke the callback
4807 status = U_ZERO_ERROR;
4808 cbInfo.reset(1);
4809 UnicodeString s = "xxx";
4810 matcher.reset(s);
4811 REGEX_ASSERT(matcher.matches(status));
4812 REGEX_CHECK_STATUS;
4813 REGEX_ASSERT(cbInfo.numCalls == 0);
4814
4815 // A medium-length match that runs long enough to invoke the
4816 // callback, but not so long that the callback aborts it.
4817 status = U_ZERO_ERROR;
4818 cbInfo.reset(4);
4819 s = "aaaaaaaaaaaaaaaaaaab";
4820 matcher.reset(s);
4821 REGEX_ASSERT(matcher.matches(status)==FALSE);
4822 REGEX_CHECK_STATUS;
4823 REGEX_ASSERT(cbInfo.numCalls > 0);
4824
4825 // A longer running match that the callback function will abort.
4826 status = U_ZERO_ERROR;
4827 cbInfo.reset(4);
4828 s = "aaaaaaaaaaaaaaaaaaaaaaab";
4829 matcher.reset(s);
4830 REGEX_ASSERT(matcher.matches(status)==FALSE);
4831 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4832 REGEX_ASSERT(cbInfo.numCalls == 4);
4833
4834 // A longer running find that the callback function will abort.
4835 status = U_ZERO_ERROR;
4836 cbInfo.reset(4);
4837 s = "aaaaaaaaaaaaaaaaaaaaaaab";
4838 matcher.reset(s);
4839 REGEX_ASSERT(matcher.find(status)==FALSE);
4840 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4841 REGEX_ASSERT(cbInfo.numCalls == 4);
4842 }
4843
4844
4845 }
4846
4847
4848 //
//   FindProgressCallbacks()    Test the find "progress" callback function.
//                  When set, the find progress callback will be invoked during find operations
//                  after each return from a match attempt, giving the application the opportunity
//                  to terminate a long-running find operation before its normal completion.
4853 //
4854
4855 struct progressCallBackContext {
4856 RegexTest *test;
4857 int64_t lastIndex;
4858 int32_t maxCalls;
4859 int32_t numCalls;
resetprogressCallBackContext4860 void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4861 };
4862
4863 // call-back function for find().
4864 // Return TRUE to continue the find().
4865 // Return FALSE to stop the find().
4866 U_CDECL_BEGIN
4867 static UBool U_CALLCONV
testProgressCallBackFn(const void * context,int64_t matchIndex)4868 testProgressCallBackFn(const void *context, int64_t matchIndex) {
4869 progressCallBackContext *info = (progressCallBackContext *)context;
4870 info->numCalls++;
4871 info->lastIndex = matchIndex;
4872 // info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4873 return (info->numCalls < info->maxCalls);
4874 }
4875 U_CDECL_END
4876
FindProgressCallbacks()4877 void RegexTest::FindProgressCallbacks() {
4878 {
4879 // Getter returns NULLs if no callback has been set
4880
4881 // The variables that the getter will fill in.
4882 // Init to non-null values so that the action of the getter can be seen.
4883 const void *returnedContext = &returnedContext;
4884 URegexFindProgressCallback *returnedFn = &testProgressCallBackFn;
4885
4886 UErrorCode status = U_ZERO_ERROR;
4887 RegexMatcher matcher("x", 0, status);
4888 REGEX_CHECK_STATUS;
4889 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4890 REGEX_CHECK_STATUS;
4891 REGEX_ASSERT(returnedFn == NULL);
4892 REGEX_ASSERT(returnedContext == NULL);
4893 }
4894
4895 {
4896 // Set and Get work
4897 progressCallBackContext cbInfo = {this, 0, 0, 0};
4898 const void *returnedContext;
4899 URegexFindProgressCallback *returnedFn;
4900 UErrorCode status = U_ZERO_ERROR;
4901 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
4902 REGEX_CHECK_STATUS;
4903 matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4904 REGEX_CHECK_STATUS;
4905 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4906 REGEX_CHECK_STATUS;
4907 REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4908 REGEX_ASSERT(returnedContext == &cbInfo);
4909
4910 // A find that matches on the initial position does NOT invoke the callback.
4911 status = U_ZERO_ERROR;
4912 cbInfo.reset(100);
4913 UnicodeString s = "aaxxx";
4914 matcher.reset(s);
4915 #if 0
4916 matcher.setTrace(TRUE);
4917 #endif
4918 REGEX_ASSERT(matcher.find(0, status));
4919 REGEX_CHECK_STATUS;
4920 REGEX_ASSERT(cbInfo.numCalls == 0);
4921
4922 // A medium running find() that causes matcher.find() to invoke our callback for each index,
4923 // but not so many times that we interrupt the operation.
4924 status = U_ZERO_ERROR;
4925 s = "aaaaaaaaaaaaaaaaaaab";
4926 cbInfo.reset(s.length()); // Some upper limit for number of calls that is greater than size of our input string
4927 matcher.reset(s);
4928 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4929 REGEX_CHECK_STATUS;
4930 REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4931
4932 // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4933 status = U_ZERO_ERROR;
4934 UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4935 cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string
4936 matcher.reset(s1);
4937 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4938 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4939 REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4940
4941 // Now a match that will succeed, but after an interruption
4942 status = U_ZERO_ERROR;
4943 UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4944 cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string
4945 matcher.reset(s2);
4946 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4947 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4948 // Now retry the match from where left off
4949 cbInfo.maxCalls = 100; // No callback limit
4950 status = U_ZERO_ERROR;
4951 REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
4952 REGEX_CHECK_STATUS;
4953 }
4954
4955
4956 }
4957
4958
4959 //---------------------------------------------------------------------------
4960 //
4961 // PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
4962 // UTexts. The pure-C implementation of UText
4963 // has no mutable backing stores, but we can
4964 // use UnicodeString here to test the functionality.
4965 //
4966 //---------------------------------------------------------------------------
PreAllocatedUTextCAPI()4967 void RegexTest::PreAllocatedUTextCAPI () {
4968 UErrorCode status = U_ZERO_ERROR;
4969 URegularExpression *re;
4970 UText patternText = UTEXT_INITIALIZER;
4971 UnicodeString buffer;
4972 UText bufferText = UTEXT_INITIALIZER;
4973
4974 utext_openUnicodeString(&bufferText, &buffer, &status);
4975
4976 /*
4977 * getText() and getUText()
4978 */
4979 {
4980 UText text1 = UTEXT_INITIALIZER;
4981 UText text2 = UTEXT_INITIALIZER;
4982 UChar text2Chars[20];
4983 UText *resultText;
4984
4985 status = U_ZERO_ERROR;
4986 regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
4987 regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
4988 u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
4989 utext_openUChars(&text2, text2Chars, -1, &status);
4990
4991 regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
4992 re = uregex_openUText(&patternText, 0, NULL, &status);
4993
4994 /* First set a UText */
4995 uregex_setUText(re, &text1, &status);
4996 resultText = uregex_getUText(re, &bufferText, &status);
4997 REGEX_CHECK_STATUS;
4998 REGEX_ASSERT(resultText == &bufferText);
4999 utext_setNativeIndex(resultText, 0);
5000 utext_setNativeIndex(&text1, 0);
5001 REGEX_ASSERT(testUTextEqual(resultText, &text1));
5002
5003 resultText = uregex_getUText(re, &bufferText, &status);
5004 REGEX_CHECK_STATUS;
5005 REGEX_ASSERT(resultText == &bufferText);
5006 utext_setNativeIndex(resultText, 0);
5007 utext_setNativeIndex(&text1, 0);
5008 REGEX_ASSERT(testUTextEqual(resultText, &text1));
5009
5010 /* Then set a UChar * */
5011 uregex_setText(re, text2Chars, 7, &status);
5012 resultText = uregex_getUText(re, &bufferText, &status);
5013 REGEX_CHECK_STATUS;
5014 REGEX_ASSERT(resultText == &bufferText);
5015 utext_setNativeIndex(resultText, 0);
5016 utext_setNativeIndex(&text2, 0);
5017 REGEX_ASSERT(testUTextEqual(resultText, &text2));
5018
5019 uregex_close(re);
5020 utext_close(&text1);
5021 utext_close(&text2);
5022 }
5023
5024 /*
5025 * group()
5026 */
5027 {
5028 UChar text1[80];
5029 UText *actual;
5030 UBool result;
5031 int64_t length = 0;
5032
5033 u_uastrncpy(text1, "noise abc interior def, and this is off the end", UPRV_LENGTHOF(text1));
5034 // 012345678901234567890123456789012345678901234567
5035 // 0 1 2 3 4
5036
5037 status = U_ZERO_ERROR;
5038 re = uregex_openC("abc(.*?)def", 0, NULL, &status);
5039 REGEX_CHECK_STATUS;
5040
5041 uregex_setText(re, text1, -1, &status);
5042 result = uregex_find(re, 0, &status);
5043 REGEX_ASSERT(result==TRUE);
5044
5045 /* Capture Group 0, the full match. Should succeed. "abc interior def" */
5046 status = U_ZERO_ERROR;
5047 actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
5048 REGEX_CHECK_STATUS;
5049 REGEX_ASSERT(actual == &bufferText);
5050 REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
5051 REGEX_ASSERT(length == 16);
5052 REGEX_ASSERT(utext_nativeLength(actual) == 47);
5053
5054 /* Capture group #1. Should succeed, matching " interior ". */
5055 status = U_ZERO_ERROR;
5056 actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
5057 REGEX_CHECK_STATUS;
5058 REGEX_ASSERT(actual == &bufferText);
5059 REGEX_ASSERT(utext_getNativeIndex(actual) == 9); // position of " interior "
5060 REGEX_ASSERT(length == 10);
5061 REGEX_ASSERT(utext_nativeLength(actual) == 47);
5062
5063 /* Capture group out of range. Error. */
5064 status = U_ZERO_ERROR;
5065 actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
5066 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5067 REGEX_ASSERT(actual == &bufferText);
5068 uregex_close(re);
5069
5070 }
5071
5072 /*
5073 * replaceFirst()
5074 */
5075 {
5076 UChar text1[80];
5077 UChar text2[80];
5078 UText replText = UTEXT_INITIALIZER;
5079 UText *result;
5080 status = U_ZERO_ERROR;
5081 utext_openUnicodeString(&bufferText, &buffer, &status);
5082
5083 status = U_ZERO_ERROR;
5084 u_uastrncpy(text1, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1));
5085 u_uastrncpy(text2, "No match here.", UPRV_LENGTHOF(text2)/2);
5086 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5087
5088 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5089 REGEX_CHECK_STATUS;
5090
5091 /* Normal case, with match */
5092 uregex_setText(re, text1, -1, &status);
5093 REGEX_CHECK_STATUS;
5094 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5095 REGEX_CHECK_STATUS;
5096 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5097 REGEX_CHECK_STATUS;
5098 REGEX_ASSERT(result == &bufferText);
5099 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5100
5101 /* No match. Text should copy to output with no changes. */
5102 uregex_setText(re, text2, -1, &status);
5103 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5104 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5105 REGEX_CHECK_STATUS;
5106 REGEX_ASSERT(result == &bufferText);
5107 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5108
5109 /* Unicode escapes */
5110 uregex_setText(re, text1, -1, &status);
5111 regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
5112 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5113 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5114 REGEX_CHECK_STATUS;
5115 REGEX_ASSERT(result == &bufferText);
5116 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5117
5118 uregex_close(re);
5119 utext_close(&replText);
5120 }
5121
5122
5123 /*
5124 * replaceAll()
5125 */
5126 {
5127 UChar text1[80];
5128 UChar text2[80];
5129 UText replText = UTEXT_INITIALIZER;
5130 UText *result;
5131
5132 status = U_ZERO_ERROR;
5133 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
5134 u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
5135 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5136
5137 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5138 REGEX_CHECK_STATUS;
5139
5140 /* Normal case, with match */
5141 uregex_setText(re, text1, -1, &status);
5142 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5143 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5144 REGEX_CHECK_STATUS;
5145 REGEX_ASSERT(result == &bufferText);
5146 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5147
5148 /* No match. Text should copy to output with no changes. */
5149 uregex_setText(re, text2, -1, &status);
5150 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5151 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5152 REGEX_CHECK_STATUS;
5153 REGEX_ASSERT(result == &bufferText);
5154 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5155
5156 uregex_close(re);
5157 utext_close(&replText);
5158 }
5159
5160
5161 /*
5162 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5163 * so we don't need to test it here.
5164 */
5165
5166 utext_close(&bufferText);
5167 utext_close(&patternText);
5168 }
5169
5170
5171 //--------------------------------------------------------------
5172 //
5173 // NamedCapture Check basic named capture group functionality
5174 //
5175 //--------------------------------------------------------------
NamedCapture()5176 void RegexTest::NamedCapture() {
5177 UErrorCode status = U_ZERO_ERROR;
5178 RegexPattern *pat = RegexPattern::compile(UnicodeString(
5179 "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
5180 REGEX_CHECK_STATUS;
5181 int32_t group = pat->groupNumberFromName("five", -1, status);
5182 REGEX_CHECK_STATUS;
5183 REGEX_ASSERT(5 == group);
5184 group = pat->groupNumberFromName("three", -1, status);
5185 REGEX_CHECK_STATUS;
5186 REGEX_ASSERT(3 == group);
5187
5188 status = U_ZERO_ERROR;
5189 group = pat->groupNumberFromName(UnicodeString("six"), status);
5190 REGEX_CHECK_STATUS;
5191 REGEX_ASSERT(6 == group);
5192
5193 status = U_ZERO_ERROR;
5194 group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
5195 U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5196
5197 status = U_ZERO_ERROR;
5198
5199 // After copying a pattern, named capture should still work in the copy.
5200 RegexPattern *copiedPat = new RegexPattern(*pat);
5201 REGEX_ASSERT(*copiedPat == *pat);
5202 delete pat; pat = NULL; // Delete original, copy should have no references back to it.
5203
5204 group = copiedPat->groupNumberFromName("five", -1, status);
5205 REGEX_CHECK_STATUS;
5206 REGEX_ASSERT(5 == group);
5207 group = copiedPat->groupNumberFromName("three", -1, status);
5208 REGEX_CHECK_STATUS;
5209 REGEX_ASSERT(3 == group);
5210 delete copiedPat;
5211
5212 // ReplaceAll with named capture group.
5213 status = U_ZERO_ERROR;
5214 UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
5215 RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
5216 REGEX_CHECK_STATUS;
5217 // m.pattern().dumpPattern();
5218 UnicodeString replacedText = m->replaceAll("'${mid}'", status);
5219 REGEX_CHECK_STATUS;
5220 REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
5221 delete m;
5222
5223 // ReplaceAll, allowed capture group numbers.
5224 text = UnicodeString("abcmxyz");
5225 m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
5226 REGEX_CHECK_STATUS;
5227
5228 status = U_ZERO_ERROR;
5229 replacedText = m->replaceAll(UnicodeString("<$0>"), status); // group 0, full match, is allowed.
5230 REGEX_CHECK_STATUS;
5231 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
5232
5233 status = U_ZERO_ERROR;
5234 replacedText = m->replaceAll(UnicodeString("<$1>"), status); // group 1 by number.
5235 REGEX_CHECK_STATUS;
5236 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5237
5238 status = U_ZERO_ERROR;
5239 replacedText = m->replaceAll(UnicodeString("<${one}>"), status); // group 1 by name.
5240 REGEX_CHECK_STATUS;
5241 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5242
5243 status = U_ZERO_ERROR;
5244 replacedText = m->replaceAll(UnicodeString("<$2>"), status); // group 2.
5245 REGEX_CHECK_STATUS;
5246 REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
5247
5248 status = U_ZERO_ERROR;
5249 replacedText = m->replaceAll(UnicodeString("<$3>"), status);
5250 REGEX_CHECK_STATUS;
5251 REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
5252
5253 status = U_ZERO_ERROR;
5254 replacedText = m->replaceAll(UnicodeString("<$4>"), status);
5255 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5256
5257 status = U_ZERO_ERROR;
5258 replacedText = m->replaceAll(UnicodeString("<$04>"), status); // group 0, leading 0,
5259 REGEX_CHECK_STATUS; // trailing out-of-range 4 passes through.
5260 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
5261
5262 status = U_ZERO_ERROR;
5263 replacedText = m->replaceAll(UnicodeString("<$000016>"), status); // Consume leading zeroes. Don't consume digits
5264 REGEX_CHECK_STATUS; // that push group num out of range.
5265 REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText); // This is group 1.
5266
5267 status = U_ZERO_ERROR;
5268 replacedText = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
5269 REGEX_CHECK_STATUS;
5270 REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
5271
5272 status = U_ZERO_ERROR;
5273 replacedText = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
5274 REGEX_CHECK_STATUS;
5275 REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
5276
5277 status = U_ZERO_ERROR;
5278 replacedText = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
5279 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5280
5281 status = U_ZERO_ERROR;
5282 replacedText = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
5283 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5284
5285 status = U_ZERO_ERROR;
5286 replacedText = m->replaceAll(UnicodeString("<${one"), status);
5287 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5288
5289 status = U_ZERO_ERROR;
5290 replacedText = m->replaceAll(UnicodeString("$not a capture group"), status);
5291 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5292
5293 delete m;
5294
5295 // Repeat the above replaceAll() tests using the plain C API, which
5296 // has a separate implementation internally.
5297 // TODO: factor out the test data.
5298
5299 status = U_ZERO_ERROR;
5300 URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
5301 REGEX_CHECK_STATUS;
5302 text = UnicodeString("abcmxyz");
5303 uregex_setText(re, text.getBuffer(), text.length(), &status);
5304 REGEX_CHECK_STATUS;
5305
5306 UChar resultBuf[100];
5307 int32_t resultLength;
5308 UnicodeString repl;
5309
5310 status = U_ZERO_ERROR;
5311 repl = UnicodeString("<$0>");
5312 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5313 REGEX_CHECK_STATUS;
5314 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
5315
5316 status = U_ZERO_ERROR;
5317 repl = UnicodeString("<$1>");
5318 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5319 REGEX_CHECK_STATUS;
5320 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5321
5322 status = U_ZERO_ERROR;
5323 repl = UnicodeString("<${one}>");
5324 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5325 REGEX_CHECK_STATUS;
5326 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5327
5328 status = U_ZERO_ERROR;
5329 repl = UnicodeString("<$2>");
5330 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5331 REGEX_CHECK_STATUS;
5332 REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
5333
5334 status = U_ZERO_ERROR;
5335 repl = UnicodeString("<$3>");
5336 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5337 REGEX_CHECK_STATUS;
5338 REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
5339
5340 status = U_ZERO_ERROR;
5341 repl = UnicodeString("<$4>");
5342 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5343 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5344
5345 status = U_ZERO_ERROR;
5346 repl = UnicodeString("<$04>");
5347 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5348 REGEX_CHECK_STATUS;
5349 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
5350
5351 status = U_ZERO_ERROR;
5352 repl = UnicodeString("<$000016>");
5353 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5354 REGEX_CHECK_STATUS;
5355 REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
5356
5357 status = U_ZERO_ERROR;
5358 repl = UnicodeString("<$3$2$1${one}>");
5359 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5360 REGEX_CHECK_STATUS;
5361 REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
5362
5363 status = U_ZERO_ERROR;
5364 repl = UnicodeString("$3$2$1${one}");
5365 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5366 REGEX_CHECK_STATUS;
5367 REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
5368
5369 status = U_ZERO_ERROR;
5370 repl = UnicodeString("<${noSuchName}>");
5371 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5372 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5373
5374 status = U_ZERO_ERROR;
5375 repl = UnicodeString("<${invalid-name}>");
5376 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5377 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5378
5379 status = U_ZERO_ERROR;
5380 repl = UnicodeString("<${one");
5381 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5382 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5383
5384 status = U_ZERO_ERROR;
5385 repl = UnicodeString("$not a capture group");
5386 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5387 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5388
5389 uregex_close(re);
5390 }
5391
5392 //--------------------------------------------------------------
5393 //
5394 // NamedCaptureLimits Patterns with huge numbers of named capture groups.
5395 // The point is not so much what the exact limit is,
5396 // but that a largish number doesn't hit bad non-linear performance,
5397 // and that exceeding the limit fails cleanly.
5398 //
5399 //--------------------------------------------------------------
NamedCaptureLimits()5400 void RegexTest::NamedCaptureLimits() {
5401 if (quick) {
5402 logln("Skipping test. Runs in exhuastive mode only.");
5403 return;
5404 }
5405 const int32_t goodLimit = 1000000; // Pattern w this many groups builds successfully.
5406 const int32_t failLimit = 10000000; // Pattern exceeds internal limits, fails to compile.
5407 char nnbuf[100];
5408 UnicodeString pattern;
5409 int32_t nn;
5410
5411 for (nn=1; nn<goodLimit; nn++) {
5412 sprintf(nnbuf, "(?<nn%d>)", nn);
5413 pattern.append(UnicodeString(nnbuf, -1, US_INV));
5414 }
5415 UErrorCode status = U_ZERO_ERROR;
5416 RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
5417 REGEX_CHECK_STATUS;
5418 for (nn=1; nn<goodLimit; nn++) {
5419 sprintf(nnbuf, "nn%d", nn);
5420 int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
5421 REGEX_ASSERT(nn == groupNum);
5422 if (nn != groupNum) {
5423 break;
5424 }
5425 }
5426 delete pat;
5427
5428 pattern.remove();
5429 for (nn=1; nn<failLimit; nn++) {
5430 sprintf(nnbuf, "(?<nn%d>)", nn);
5431 pattern.append(UnicodeString(nnbuf, -1, US_INV));
5432 }
5433 status = U_ZERO_ERROR;
5434 pat = RegexPattern::compile(pattern, 0, status);
5435 REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
5436 delete pat;
5437 }
5438
5439
5440 //--------------------------------------------------------------
5441 //
5442 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
5443 //
5444 //---------------------------------------------------------------
Bug7651()5445 void RegexTest::Bug7651() {
5446 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5447 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5448 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5449 UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5450 UnicodeString s("#ff @abcd This is test");
5451 RegexPattern *REPattern = NULL;
5452 RegexMatcher *REMatcher = NULL;
5453 UErrorCode status = U_ZERO_ERROR;
5454 UParseError pe;
5455
5456 REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5457 REGEX_CHECK_STATUS;
5458 REMatcher = REPattern->matcher(s, status);
5459 REGEX_CHECK_STATUS;
5460 REGEX_ASSERT(REMatcher->find());
5461 REGEX_ASSERT(REMatcher->start(status) == 0);
5462 delete REPattern;
5463 delete REMatcher;
5464 status = U_ZERO_ERROR;
5465
5466 REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5467 REGEX_CHECK_STATUS;
5468 REMatcher = REPattern->matcher(s, status);
5469 REGEX_CHECK_STATUS;
5470 REGEX_ASSERT(REMatcher->find());
5471 REGEX_ASSERT(REMatcher->start(status) == 0);
5472 delete REPattern;
5473 delete REMatcher;
5474 status = U_ZERO_ERROR;
5475 }
5476
Bug7740()5477 void RegexTest::Bug7740() {
5478 UErrorCode status = U_ZERO_ERROR;
5479 UnicodeString pattern = "(a)";
5480 UnicodeString text = "abcdef";
5481 RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5482 REGEX_CHECK_STATUS;
5483 REGEX_ASSERT(m->lookingAt(status));
5484 REGEX_CHECK_STATUS;
5485 status = U_ILLEGAL_ARGUMENT_ERROR;
5486 UnicodeString s = m->group(1, status); // Bug 7740: segfault here.
5487 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5488 REGEX_ASSERT(s == "");
5489 delete m;
5490 }
5491
// Bug 8479:  was crashing with a Bogus UnicodeString as input.
5493
Bug8479()5494 void RegexTest::Bug8479() {
5495 UErrorCode status = U_ZERO_ERROR;
5496
5497 RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5498 REGEX_CHECK_STATUS;
5499 if (U_SUCCESS(status))
5500 {
5501 UnicodeString str;
5502 str.setToBogus();
5503 pMatcher->reset(str);
5504 status = U_ZERO_ERROR;
5505 pMatcher->matches(status);
5506 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5507 delete pMatcher;
5508 }
5509 }
5510
5511
5512 // Bug 7029
Bug7029()5513 void RegexTest::Bug7029() {
5514 UErrorCode status = U_ZERO_ERROR;
5515
5516 RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5517 UnicodeString text = "abc.def";
5518 UnicodeString splits[10];
5519 REGEX_CHECK_STATUS;
5520 int32_t numFields = pMatcher->split(text, splits, 10, status);
5521 REGEX_CHECK_STATUS;
5522 REGEX_ASSERT(numFields == 8);
5523 delete pMatcher;
5524 }
5525
5526 // Bug 9283
// This test is checking for the existence of any supplemental characters that case-fold
5528 // to a bmp character.
5529 //
5530 // At the time of this writing there are none. If any should appear in a subsequent release
5531 // of Unicode, the code in regular expressions compilation that determines the longest
// possible match for a literal string will need to be enhanced.
5533 //
5534 // See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5535 // for details on what to do in case of a failure of this test.
5536 //
Bug9283()5537 void RegexTest::Bug9283() {
5538 #if !UCONFIG_NO_NORMALIZATION
5539 UErrorCode status = U_ZERO_ERROR;
5540 UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5541 REGEX_CHECK_STATUS;
5542 int32_t index;
5543 UChar32 c;
5544 for (index=0; ; index++) {
5545 c = supplementalsWithCaseFolding.charAt(index);
5546 if (c == -1) {
5547 break;
5548 }
5549 UnicodeString cf = UnicodeString(c).foldCase();
5550 REGEX_ASSERT(cf.length() >= 2);
5551 }
5552 #endif /* #if !UCONFIG_NO_NORMALIZATION */
5553 }
5554
5555
CheckInvBufSize()5556 void RegexTest::CheckInvBufSize() {
5557 if(inv_next>=INV_BUFSIZ) {
5558 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5559 __FILE__, INV_BUFSIZ, inv_next);
5560 } else {
5561 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5562 }
5563 }
5564
5565
Bug10459()5566 void RegexTest::Bug10459() {
5567 UErrorCode status = U_ZERO_ERROR;
5568 UnicodeString patternString("(txt)");
5569 UnicodeString txtString("txt");
5570
5571 UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
5572 REGEX_CHECK_STATUS;
5573 UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
5574 REGEX_CHECK_STATUS;
5575
5576 URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
5577 REGEX_CHECK_STATUS;
5578
5579 uregex_setUText(icu_re, utext_txt, &status);
5580 REGEX_CHECK_STATUS;
5581
5582 // The bug was that calling uregex_group() before doing a matching operation
5583 // was causing a segfault. Only for Regular Expressions created from UText.
5584 // It should set an U_REGEX_INVALID_STATE.
5585
5586 UChar buf[100];
5587 int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
5588 REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
5589 REGEX_ASSERT(len == 0);
5590
5591 uregex_close(icu_re);
5592 utext_close(utext_pat);
5593 utext_close(utext_txt);
5594 }
5595
TestCaseInsensitiveStarters()5596 void RegexTest::TestCaseInsensitiveStarters() {
5597 // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5598 // become stale because of new Unicode characters.
5599 // If it is stale, rerun the generation tool
5600 // svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5601 // and replace the embedded data in i18n/regexcmp.cpp
5602
5603 for (UChar32 cp=0; cp<=0x10ffff; cp++) {
5604 if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
5605 continue;
5606 }
5607 UnicodeSet s(cp, cp);
5608 s.closeOver(USET_CASE_INSENSITIVE);
5609 UnicodeSetIterator setIter(s);
5610 while (setIter.next()) {
5611 if (!setIter.isString()) {
5612 continue;
5613 }
5614 const UnicodeString &str = setIter.getString();
5615 UChar32 firstChar = str.char32At(0);
5616 UnicodeSet starters;
5617 RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
5618 if (!starters.contains(cp)) {
5619 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
5620 return;
5621 }
5622 }
5623 }
5624 }
5625
5626
TestBug11049()5627 void RegexTest::TestBug11049() {
5628 // Original bug report: pattern with match start consisting of one of several individual characters,
5629 // and the text being matched ending with a supplementary character. find() would read past the
5630 // end of the input text when searching for potential match starting points.
5631
5632 // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5633 // detect the bad read.
5634
5635 TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
5636 TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
5637
5638 // Test again with a pattern starting with a single character,
5639 // which takes a different code path than starting with an OR expression,
5640 // but with similar logic.
5641 TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
5642 TestCase11049("C", "string matches at end C", TRUE, __LINE__);
5643 }
5644
5645 // Run a single test case from TestBug11049(). Internal function.
TestCase11049(const char * pattern,const char * data,UBool expectMatch,int32_t lineNumber)5646 void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
5647 UErrorCode status = U_ZERO_ERROR;
5648 UnicodeString patternString = UnicodeString(pattern).unescape();
5649 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5650
5651 UnicodeString dataString = UnicodeString(data).unescape();
5652 UChar *exactBuffer = new UChar[dataString.length()];
5653 dataString.extract(exactBuffer, dataString.length(), status);
5654 UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
5655
5656 LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
5657 REGEX_CHECK_STATUS;
5658 matcher->reset(ut);
5659 UBool result = matcher->find();
5660 if (result != expectMatch) {
5661 errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5662 __FILE__, lineNumber, expectMatch, result, pattern, data);
5663 }
5664
5665 // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5666 // off-by-one on find() with match at the last code point.
5667 // Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5668 // because string.unescape() will only shrink it.
5669 char * utf8Buffer = new char[uprv_strlen(data)+1];
5670 u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status);
5671 REGEX_CHECK_STATUS;
5672 ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
5673 REGEX_CHECK_STATUS;
5674 matcher->reset(ut);
5675 result = matcher->find();
5676 if (result != expectMatch) {
5677 errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5678 __FILE__, lineNumber, expectMatch, result, pattern, data);
5679 }
5680 delete [] utf8Buffer;
5681
5682 utext_close(ut);
5683 delete [] exactBuffer;
5684 }
5685
5686
TestBug11371()5687 void RegexTest::TestBug11371() {
5688 if (quick) {
5689 logln("Skipping test. Runs in exhuastive mode only.");
5690 return;
5691 }
5692 UErrorCode status = U_ZERO_ERROR;
5693 UnicodeString patternString;
5694
5695 for (int i=0; i<8000000; i++) {
5696 patternString.append(UnicodeString("()"));
5697 }
5698 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5699 if (status != U_REGEX_PATTERN_TOO_BIG) {
5700 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5701 __FILE__, __LINE__, u_errorName(status));
5702 }
5703
5704 status = U_ZERO_ERROR;
5705 patternString = "(";
5706 for (int i=0; i<20000000; i++) {
5707 patternString.append(UnicodeString("A++"));
5708 }
5709 patternString.append(UnicodeString("){0}B++"));
5710 LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
5711 if (status != U_REGEX_PATTERN_TOO_BIG) {
5712 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5713 __FILE__, __LINE__, u_errorName(status));
5714 }
5715
5716 // Pattern with too much string data, such that string indexes overflow operand data field size
5717 // in compiled instruction.
5718 status = U_ZERO_ERROR;
5719 patternString = "";
5720 while (patternString.length() < 0x00ffffff) {
5721 patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5722 }
5723 patternString.append(UnicodeString("X? trailing string"));
5724 LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
5725 if (status != U_REGEX_PATTERN_TOO_BIG) {
5726 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5727 __FILE__, __LINE__, u_errorName(status));
5728 }
5729 }
5730
TestBug11480()5731 void RegexTest::TestBug11480() {
5732 // C API, get capture group of a group that does not participate in the match.
5733 // (Returns a zero length string, with nul termination,
5734 // indistinguishable from a group with a zero length match.)
5735
5736 UErrorCode status = U_ZERO_ERROR;
5737 URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
5738 REGEX_CHECK_STATUS;
5739 UnicodeString text = UNICODE_STRING_SIMPLE("A");
5740 uregex_setText(re, text.getBuffer(), text.length(), &status);
5741 REGEX_CHECK_STATUS;
5742 REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
5743 UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
5744 int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
5745 REGEX_ASSERT(length == 0);
5746 REGEX_ASSERT(buf[0] == 13);
5747 REGEX_ASSERT(buf[1] == 0);
5748 REGEX_ASSERT(buf[2] == 13);
5749 uregex_close(re);
5750
5751 // UText C++ API, length of match is 0 for non-participating matches.
5752 UText ut = UTEXT_INITIALIZER;
5753 utext_openUnicodeString(&ut, &text, &status);
5754 RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status);
5755 REGEX_CHECK_STATUS;
5756 matcher.reset(&ut);
5757 REGEX_ASSERT(matcher.lookingAt(0, status));
5758
5759 // UText C++ API, Capture group 1 matches "A", position 0, length 1.
5760 int64_t groupLen = -666;
5761 UText group = UTEXT_INITIALIZER;
5762 matcher.group(1, &group, groupLen, status);
5763 REGEX_CHECK_STATUS;
5764 REGEX_ASSERT(groupLen == 1);
5765 REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
5766
5767 // Capture group 2, the (B), does not participate in the match.
5768 matcher.group(2, &group, groupLen, status);
5769 REGEX_CHECK_STATUS;
5770 REGEX_ASSERT(groupLen == 0);
5771 REGEX_ASSERT(matcher.start(2, status) == -1);
5772 REGEX_CHECK_STATUS;
5773 }
5774
TestBug12884()5775 void RegexTest::TestBug12884() {
5776 // setTimeLimit() was not effective for empty sub-patterns with large {minimum counts}
5777 UnicodeString pattern(u"(((((((){120}){11}){11}){11}){80}){11}){4}");
5778 UnicodeString text(u"hello");
5779 UErrorCode status = U_ZERO_ERROR;
5780 RegexMatcher m(pattern, text, 0, status);
5781 REGEX_CHECK_STATUS;
5782 m.setTimeLimit(5, status);
5783 m.find(status);
5784 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5785
5786 // Non-greedy loops. They take a different code path during matching.
5787 UnicodeString ngPattern(u"(((((((){120}?){11}?){11}?){11}?){80}?){11}?){4}?");
5788 status = U_ZERO_ERROR;
5789 RegexMatcher ngM(ngPattern, text, 0, status);
5790 REGEX_CHECK_STATUS;
5791 ngM.setTimeLimit(5, status);
5792 ngM.find(status);
5793 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5794
5795 // UText, wrapping non-UTF-16 text, also takes a different execution path.
5796 const char *text8 = u8"¿Qué es Unicode? Unicode proporciona un número único para cada"
5797 "carácter, sin importar la plataforma, sin importar el programa,"
5798 "sin importar el idioma.";
5799 status = U_ZERO_ERROR;
5800 LocalUTextPointer ut(utext_openUTF8(NULL, text8, -1, &status));
5801 REGEX_CHECK_STATUS;
5802 m.reset(ut.getAlias());
5803 m.find(status);
5804 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5805
5806 status = U_ZERO_ERROR;
5807 ngM.reset(ut.getAlias());
5808 ngM.find(status);
5809 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5810 }
5811
// Bug 13631. A find() of a pattern with a zero length look-behind assertion
//            can cause a read past the end of the input text.
//            The failure is seen when running this test with Clang's Address Sanitizer.
5815
TestBug13631()5816 void RegexTest::TestBug13631() {
5817 const UChar *pats[] = { u"(?<!^)",
5818 u"(?<=^)",
5819 nullptr
5820 };
5821 for (const UChar **pat=pats; *pat; ++pat) {
5822 UErrorCode status = U_ZERO_ERROR;
5823 UnicodeString upat(*pat);
5824 RegexMatcher matcher(upat, 0, status);
5825 const UChar s =u'a';
5826 UText *ut = utext_openUChars(nullptr, &s, 1, &status);
5827 REGEX_CHECK_STATUS;
5828 matcher.reset(ut);
5829 while (matcher.find()) {
5830 }
5831 utext_close(ut);
5832 }
5833 }
5834
5835 // Bug 13632 Out of bounds memory reference if a replacement string ends with a '$',
5836 // where a following group specification would be expected.
5837 // Failure shows when running the test under Clang's Address Sanitizer.
5838
TestBug13632()5839 void RegexTest::TestBug13632() {
5840 UErrorCode status = U_ZERO_ERROR;
5841 URegularExpression *re = uregex_openC(" ", 0, nullptr, &status);
5842 const char16_t *sourceString = u"Hello, world.";
5843 uregex_setText(re, sourceString, u_strlen(sourceString), &status);
5844
5845 const int32_t destCap = 20;
5846 char16_t dest[destCap] = {};
5847 const char16_t replacement[] = {u'x', u'$'}; // Not nul terminated string.
5848 uregex_replaceAll(re, replacement, 2, dest, destCap, &status);
5849
5850 assertEquals("", U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
5851 uregex_close(re);
5852 }
5853
5854 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
5855