1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 2002-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8
9 //
10 // regextst.cpp
11 //
12 // ICU Regular Expressions test, part of intltest.
13 //
14
15 /*
16 NOTE!!
17
18 PLEASE be careful about ASCII assumptions in this test.
19 This test is one of the worst repeat offenders.
20 If you have questions, contact someone on the ICU PMC
21 who has access to an EBCDIC system.
22
23 */
24
25 #include "intltest.h"
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
27
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <string.h>
31
32 #include "unicode/localpointer.h"
33 #include "unicode/regex.h"
34 #include "unicode/stringpiece.h"
35 #include "unicode/uchar.h"
36 #include "unicode/ucnv.h"
37 #include "unicode/uniset.h"
38 #include "unicode/uregex.h"
39 #include "unicode/usetiter.h"
40 #include "unicode/ustring.h"
41 #include "unicode/utext.h"
42 #include "unicode/utf16.h"
43 #include "cstr.h"
44 #include "regextst.h"
45 #include "regexcmp.h"
46 #include "uvector.h"
47 #include "util.h"
48 #include "cmemory.h"
49 #include "cstring.h"
50 #include "uinvchar.h"
51
52 #define SUPPORT_MUTATING_INPUT_STRING 0
53
54 //---------------------------------------------------------------------------
55 //
56 // Test class boilerplate
57 //
58 //---------------------------------------------------------------------------
RegexTest()59 RegexTest::RegexTest()
60 {
61 }
62
63
~RegexTest()64 RegexTest::~RegexTest()
65 {
66 }
67
68
69
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)70 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
71 {
72 if (exec) logln("TestSuite RegexTest: ");
73 TESTCASE_AUTO_BEGIN;
74 TESTCASE_AUTO(Basic);
75 TESTCASE_AUTO(API_Match);
76 TESTCASE_AUTO(API_Replace);
77 TESTCASE_AUTO(API_Pattern);
78 #if !UCONFIG_NO_FILE_IO
79 TESTCASE_AUTO(Extended);
80 #endif
81 TESTCASE_AUTO(Errors);
82 TESTCASE_AUTO(PerlTests);
83 TESTCASE_AUTO(Callbacks);
84 TESTCASE_AUTO(FindProgressCallbacks);
85 TESTCASE_AUTO(Bug6149);
86 TESTCASE_AUTO(UTextBasic);
87 TESTCASE_AUTO(API_Match_UTF8);
88 TESTCASE_AUTO(API_Replace_UTF8);
89 TESTCASE_AUTO(API_Pattern_UTF8);
90 TESTCASE_AUTO(PerlTestsUTF8);
91 TESTCASE_AUTO(PreAllocatedUTextCAPI);
92 TESTCASE_AUTO(Bug7651);
93 TESTCASE_AUTO(Bug7740);
94 TESTCASE_AUTO(Bug8479);
95 TESTCASE_AUTO(Bug7029);
96 TESTCASE_AUTO(CheckInvBufSize);
97 TESTCASE_AUTO(Bug9283);
98 TESTCASE_AUTO(Bug10459);
99 TESTCASE_AUTO(TestCaseInsensitiveStarters);
100 TESTCASE_AUTO(TestBug11049);
101 TESTCASE_AUTO(TestBug11371);
102 TESTCASE_AUTO(TestBug11480);
103 TESTCASE_AUTO(NamedCapture);
104 TESTCASE_AUTO(NamedCaptureLimits);
105 TESTCASE_AUTO(TestBug12884);
106 TESTCASE_AUTO(TestBug13631);
107 TESTCASE_AUTO(TestBug13632);
108 TESTCASE_AUTO(TestBug20359);
109 TESTCASE_AUTO(TestBug20863);
110 TESTCASE_AUTO_END;
111 }
112
113
114 /**
115 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
116 * into ASCII.
117 * @see utext_openUTF8
118 */
119 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
120
121 //---------------------------------------------------------------------------
122 //
123 // Error Checking / Reporting macros used in all of the tests.
124 //
125 //---------------------------------------------------------------------------
126
utextToPrintable(char * buf,int32_t bufLen,UText * text)127 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
128 int64_t oldIndex = utext_getNativeIndex(text);
129 utext_setNativeIndex(text, 0);
130 char *bufPtr = buf;
131 UChar32 c = utext_next32From(text, 0);
132 while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
133 if (0x000020<=c && c<0x00007e) {
134 *bufPtr = c;
135 } else {
136 #if 0
137 sprintf(bufPtr,"U+%04X", c);
138 bufPtr+= strlen(bufPtr)-1;
139 #else
140 *bufPtr = '%';
141 #endif
142 }
143 bufPtr++;
144 c = UTEXT_NEXT32(text);
145 }
146 *bufPtr = 0;
147 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
148 char *ebuf = (char*)malloc(bufLen);
149 uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
150 uprv_strncpy(buf, ebuf, bufLen);
151 free((void*)ebuf);
152 #endif
153 utext_setNativeIndex(text, oldIndex);
154 }
155
156
157 static char ASSERT_BUF[1024];
158
extractToAssertBuf(const UnicodeString & message)159 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
160 if(message.length()==0) {
161 strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
162 } else {
163 UnicodeString buf;
164 IntlTest::prettify(message,buf);
165 if(buf.length()==0) {
166 strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
167 } else {
168 buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
169 if(ASSERT_BUF[0]==0) {
170 ASSERT_BUF[0]=0;
171 for(int32_t i=0;i<buf.length();i++) {
172 UChar ch = buf[i];
173 sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
174 }
175 }
176 }
177 }
178 ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
179 return ASSERT_BUF;
180 }
181
// Logs, via logln(), a printable rendering of the UText `text`, tagged with
// the source location and the spelling of the argument expression.
#define REGEX_VERBOSE_TEXT(text) UPRV_BLOCK_MACRO_BEGIN { \
    char verboseTextBuf[200]; \
    utextToPrintable(verboseTextBuf, UPRV_LENGTHOF(verboseTextBuf), text); \
    logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, verboseTextBuf); \
} UPRV_BLOCK_MACRO_END
187
// If the in-scope variable `status` holds a failure code, reports it through
// dataerrln() with the source location and returns from the enclosing
// function. Only usable in functions returning void.
#define REGEX_CHECK_STATUS UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(status)) { \
        dataerrln("%s:%d: RegexTest failure. status=%s", __FILE__, __LINE__, u_errorName(status)); \
        return; \
    } \
} UPRV_BLOCK_MACRO_END
195
// Reports a test error (source location plus the text of `expr`) when `expr`
// evaluates to false. Execution continues after the report.
#define REGEX_ASSERT(expr) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(expr)) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr); \
    } \
} UPRV_BLOCK_MACRO_END
201
// Evaluates `expr` against a fresh, macro-local `status` (which hides any
// `status` of the caller, so `expr` should reference `status` by that name)
// and reports through dataerrln() unless the resulting code equals `errcode`.
#define REGEX_ASSERT_FAIL(expr, errcode) UPRV_BLOCK_MACRO_BEGIN { \
    UErrorCode status = U_ZERO_ERROR; \
    (expr); \
    if (status != errcode) { \
        dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \
                  __LINE__, u_errorName(errcode), u_errorName(status)); \
    } \
} UPRV_BLOCK_MACRO_END
210
// Variant of the status check for helper functions that receive the caller's
// line number: reports both this line and `line` when the in-scope `status`
// holds a failure code. Does not return from the enclosing function.
#define REGEX_CHECK_STATUS_L(line) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(status)) { \
        errln("RegexTest failure at line %d, from %d. status=%d\n", __LINE__, (line), status); \
    } \
} UPRV_BLOCK_MACRO_END
216
// Assertion for helper functions that receive the caller's line number:
// when `expr` is false, reports this line and `line`, then returns from the
// enclosing function. Only usable in functions returning void.
#define REGEX_ASSERT_L(expr, line) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(expr)) { \
        errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
        return; \
    } \
} UPRV_BLOCK_MACRO_END
223
// Compares a UnicodeString against an expected invariant-character C string
// and reports a test error when they differ.
//   expected:  const char * , restricted to invariant characters.
//   actual:    const UnicodeString &
// Both arguments may be evaluated more than once.
#define REGEX_ASSERT_UNISTR(expected, actual) UPRV_BLOCK_MACRO_BEGIN { \
    if (UnicodeString(expected, -1, US_INV) != (actual)) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \
              __FILE__, __LINE__, expected, extractToAssertBuf(actual)); \
    } \
} UPRV_BLOCK_MACRO_END
232
233
testUTextEqual(UText * uta,UText * utb)234 static UBool testUTextEqual(UText *uta, UText *utb) {
235 UChar32 ca = 0;
236 UChar32 cb = 0;
237 utext_setNativeIndex(uta, 0);
238 utext_setNativeIndex(utb, 0);
239 do {
240 ca = utext_next32(uta);
241 cb = utext_next32(utb);
242 if (ca != cb) {
243 break;
244 }
245 } while (ca != U_SENTINEL);
246 return ca == cb;
247 }
248
249
250 /**
251 * @param expected expected text in UTF-8 (not platform) codepage
252 */
assertUText(const char * expected,UText * actual,const char * file,int line)253 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
254 UErrorCode status = U_ZERO_ERROR;
255 UText expectedText = UTEXT_INITIALIZER;
256 utext_openUTF8(&expectedText, expected, -1, &status);
257 if(U_FAILURE(status)) {
258 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
259 return;
260 }
261 if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
262 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
263 return;
264 }
265 utext_setNativeIndex(actual, 0);
266 if (!testUTextEqual(&expectedText, actual)) {
267 char buf[201 /*21*/];
268 char expectedBuf[201];
269 utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
270 utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
271 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
272 }
273 utext_close(&expectedText);
274 }
275 /**
276 * @param expected invariant (platform local text) input
277 */
278
assertUTextInvariant(const char * expected,UText * actual,const char * file,int line)279 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
280 UErrorCode status = U_ZERO_ERROR;
281 UText expectedText = UTEXT_INITIALIZER;
282 regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
283 if(U_FAILURE(status)) {
284 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
285 return;
286 }
287 utext_setNativeIndex(actual, 0);
288 if (!testUTextEqual(&expectedText, actual)) {
289 char buf[201 /*21*/];
290 char expectedBuf[201];
291 utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
292 utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
293 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
294 }
295 utext_close(&expectedText);
296 }
297
/**
 * Asserts that the UText `actual` holds the text `expected`.
 * Assumes utf-8 input for `expected`. Forwards to assertUText() with the
 * source location of the invocation.
 */
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) \
    assertUText((expected), (actual), __FILE__, __LINE__)
/**
 * Asserts that the UText `actual` holds the text `expected`.
 * Assumes Invariant input for `expected`. Forwards to assertUTextInvariant()
 * with the source location of the invocation.
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) \
    assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
306
307 /**
308 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
309 * passed into utext_openUTF8. An error will be given if
310 * INV_BUFSIZ is too small. It's only used on EBCDIC systems.
311 */
312
313 #define INV_BUFSIZ 2048 /* increase this if too small */
314
315 static int64_t inv_next=0;
316
317 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
318 static char inv_buf[INV_BUFSIZ];
319 #endif
320
regextst_openUTF8FromInvariant(UText * ut,const char * inv,int64_t length,UErrorCode * status)321 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
322 if(length==-1) length=strlen(inv);
323 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
324 inv_next+=length;
325 return utext_openUTF8(ut, inv, length, status);
326 #else
327 if(inv_next+length+1>INV_BUFSIZ) {
328 fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
329 __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
330 *status = U_MEMORY_ALLOCATION_ERROR;
331 return NULL;
332 }
333
334 unsigned char *buf = (unsigned char*)inv_buf+inv_next;
335 uprv_aestrncpy(buf, (const uint8_t*)inv, length);
336 inv_next+=length;
337
338 #if 0
339 fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
340 #endif
341
342 return utext_openUTF8(ut, (const char*)buf, length, status);
343 #endif
344 }
345
346
347 //---------------------------------------------------------------------------
348 //
349 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests
350 // for the LookingAt() and Match() functions.
351 //
352 // usage:
353 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
354 //
355 // The expected results are UBool - TRUE or FALSE.
356 // The input text is unescaped. The pattern is not.
357 //
358 //
359 //---------------------------------------------------------------------------
360
// Runs one lookingAt()/matches() expectation twice: first through the
// UnicodeString based helper, then through the UTF-8 UText based helper.
// Both receive the line number of the invocation for their error messages.
#define REGEX_TESTLM(pat, text, looking, match) UPRV_BLOCK_MACRO_BEGIN { \
    doRegexLMTest    (pat, text, looking, match, __LINE__); \
    doRegexLMTestUTF8(pat, text, looking, match, __LINE__); \
} UPRV_BLOCK_MACRO_END
365
doRegexLMTest(const char * pat,const char * text,UBool looking,UBool match,int32_t line)366 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
367 const UnicodeString pattern(pat, -1, US_INV);
368 const UnicodeString inputText(text, -1, US_INV);
369 UErrorCode status = U_ZERO_ERROR;
370 UParseError pe;
371 RegexPattern *REPattern = NULL;
372 RegexMatcher *REMatcher = NULL;
373 UBool retVal = TRUE;
374
375 UnicodeString patString(pat, -1, US_INV);
376 REPattern = RegexPattern::compile(patString, 0, pe, status);
377 if (U_FAILURE(status)) {
378 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
379 line, u_errorName(status));
380 return FALSE;
381 }
382 if (line==376) { REPattern->dumpPattern();}
383
384 UnicodeString inputString(inputText);
385 UnicodeString unEscapedInput = inputString.unescape();
386 REMatcher = REPattern->matcher(unEscapedInput, status);
387 if (U_FAILURE(status)) {
388 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
389 line, u_errorName(status));
390 return FALSE;
391 }
392
393 UBool actualmatch;
394 actualmatch = REMatcher->lookingAt(status);
395 if (U_FAILURE(status)) {
396 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
397 line, u_errorName(status));
398 retVal = FALSE;
399 }
400 if (actualmatch != looking) {
401 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
402 retVal = FALSE;
403 }
404
405 status = U_ZERO_ERROR;
406 actualmatch = REMatcher->matches(status);
407 if (U_FAILURE(status)) {
408 errln("RegexTest failure in matches() at line %d. Status = %s\n",
409 line, u_errorName(status));
410 retVal = FALSE;
411 }
412 if (actualmatch != match) {
413 errln("RegexTest: wrong return from matches() at line %d.\n", line);
414 retVal = FALSE;
415 }
416
417 if (retVal == FALSE) {
418 REPattern->dumpPattern();
419 }
420
421 delete REPattern;
422 delete REMatcher;
423 return retVal;
424 }
425
426
doRegexLMTestUTF8(const char * pat,const char * text,UBool looking,UBool match,int32_t line)427 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
428 UText pattern = UTEXT_INITIALIZER;
429 int32_t inputUTF8Length;
430 char *textChars = NULL;
431 UText inputText = UTEXT_INITIALIZER;
432 UErrorCode status = U_ZERO_ERROR;
433 UParseError pe;
434 RegexPattern *REPattern = NULL;
435 RegexMatcher *REMatcher = NULL;
436 UBool retVal = TRUE;
437
438 regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
439 REPattern = RegexPattern::compile(&pattern, 0, pe, status);
440 if (U_FAILURE(status)) {
441 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
442 line, u_errorName(status));
443 return FALSE;
444 }
445
446 UnicodeString inputString(text, -1, US_INV);
447 UnicodeString unEscapedInput = inputString.unescape();
448 LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
449 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
450
451 inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
452 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
453 // UTF-8 does not allow unpaired surrogates, so this could actually happen
454 logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line, u_errorName(status));
455 return TRUE; // not a failure of the Regex engine
456 }
457 status = U_ZERO_ERROR; // buffer overflow
458 textChars = new char[inputUTF8Length+1];
459 unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
460 utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
461
462 REMatcher = &REPattern->matcher(status)->reset(&inputText);
463 if (U_FAILURE(status)) {
464 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
465 line, u_errorName(status));
466 return FALSE;
467 }
468
469 UBool actualmatch;
470 actualmatch = REMatcher->lookingAt(status);
471 if (U_FAILURE(status)) {
472 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
473 line, u_errorName(status));
474 retVal = FALSE;
475 }
476 if (actualmatch != looking) {
477 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
478 retVal = FALSE;
479 }
480
481 status = U_ZERO_ERROR;
482 actualmatch = REMatcher->matches(status);
483 if (U_FAILURE(status)) {
484 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
485 line, u_errorName(status));
486 retVal = FALSE;
487 }
488 if (actualmatch != match) {
489 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
490 retVal = FALSE;
491 }
492
493 if (retVal == FALSE) {
494 REPattern->dumpPattern();
495 }
496
497 delete REPattern;
498 delete REMatcher;
499 utext_close(&inputText);
500 utext_close(&pattern);
501 delete[] textChars;
502 return retVal;
503 }
504
505
506
507 //---------------------------------------------------------------------------
508 //
//    REGEX_ERR    Macro + invocation function to simplify writing
//                 regex tests for incorrect patterns
511 //
512 // usage:
513 // REGEX_ERR("pattern", expected error line, column, expected status);
514 //
515 //---------------------------------------------------------------------------
// Forwards to regex_err(), adding the line number of the invocation.
#define REGEX_ERR(pat, line, col, status) \
    regex_err(pat, line, col, status, __LINE__)
517
regex_err(const char * pat,int32_t errLine,int32_t errCol,UErrorCode expectedStatus,int32_t line)518 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
519 UErrorCode expectedStatus, int32_t line) {
520 UnicodeString pattern(pat);
521
522 UErrorCode status = U_ZERO_ERROR;
523 UParseError pe;
524 RegexPattern *callerPattern = NULL;
525
526 //
527 // Compile the caller's pattern
528 //
529 UnicodeString patString(pat);
530 callerPattern = RegexPattern::compile(patString, 0, pe, status);
531 if (status != expectedStatus) {
532 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
533 } else {
534 if (status != U_ZERO_ERROR) {
535 if (pe.line != errLine || pe.offset != errCol) {
536 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
537 line, errLine, errCol, pe.line, pe.offset);
538 }
539 }
540 }
541
542 delete callerPattern;
543
544 //
545 // Compile again, using a UTF-8-based UText
546 //
547 UText patternText = UTEXT_INITIALIZER;
548 regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
549 callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
550 if (status != expectedStatus) {
551 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
552 } else {
553 if (status != U_ZERO_ERROR) {
554 if (pe.line != errLine || pe.offset != errCol) {
555 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
556 line, errLine, errCol, pe.line, pe.offset);
557 }
558 }
559 }
560
561 delete callerPattern;
562 utext_close(&patternText);
563 }
564
565
566
567 //---------------------------------------------------------------------------
568 //
569 // Basic Check for basic functionality of regex pattern matching.
570 // Avoid the use of REGEX_FIND test macro, which has
571 // substantial dependencies on basic Regex functionality.
572 //
573 //---------------------------------------------------------------------------
Basic()574 void RegexTest::Basic() {
575
576
577 //
578 // Debug - slide failing test cases early
579 //
580 #if 0
581 {
582 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
583 UParseError pe;
584 UErrorCode status = U_ZERO_ERROR;
585 RegexPattern *pattern;
586 pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
587 pattern->dumpPattern();
588 RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
589 UBool result = m->find();
590 printf("result = %d\n", result);
591 // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
592 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
593 }
594 exit(1);
595 #endif
596
597
598 //
599 // Pattern with parentheses
600 //
601 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
602 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
603 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);
604
605 //
606 // Patterns with *
607 //
608 REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
609 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
610 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
611 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
612 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
613
614 REGEX_TESTLM("a*", "", TRUE, TRUE);
615 REGEX_TESTLM("a*", "b", TRUE, FALSE);
616
617
618 //
619 // Patterns with "."
620 //
621 REGEX_TESTLM(".", "abc", TRUE, FALSE);
622 REGEX_TESTLM("...", "abc", TRUE, TRUE);
623 REGEX_TESTLM("....", "abc", FALSE, FALSE);
624 REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
625 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
626 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
627 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
628 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
629
630 //
631 // Patterns with * applied to chars at end of literal string
632 //
633 REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
634 REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
635
636 //
637 // Supplemental chars match as single chars, not a pair of surrogates.
638 //
639 REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
640 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
641 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
642
643
644 //
645 // UnicodeSets in the pattern
646 //
647 REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
648 REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
649 REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
650 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
651 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
652 REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
653
654 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
655 REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
656 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
657 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurences.
658 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);
659
660 //
661 // OR operator in patterns
662 //
663 REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
664 REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
665 REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
666 REGEX_TESTLM("a|b", "b", TRUE, TRUE);
667
668 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
669 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
670 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
671 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
672 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
673 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
674
675 //
676 // +
677 //
678 REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
679 REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
680 REGEX_TESTLM("b+", "", FALSE, FALSE);
681 REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
682 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
683 REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
684
685 //
686 // ?
687 //
688 REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
689 REGEX_TESTLM("ab?", "a", TRUE, TRUE);
690 REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
691 REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
692 REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
693 REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
694 REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
695 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
696 REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
697
698 //
699 // Escape sequences that become single literal chars, handled internally
700 // by ICU's Unescape.
701 //
702
703 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
704 REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
705 REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
706 REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
707 REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
708 REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
709 REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
710 REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
711 REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
712 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
713
714 REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
715 REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input
716
717 // Escape of special chars in patterns
718 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
719 }
720
721
722 //---------------------------------------------------------------------------
723 //
724 // UTextBasic Check for quirks that are specific to the UText
725 // implementation.
726 //
727 //---------------------------------------------------------------------------
UTextBasic()728 void RegexTest::UTextBasic() {
729 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
730 UErrorCode status = U_ZERO_ERROR;
731 UText pattern = UTEXT_INITIALIZER;
732 utext_openUTF8(&pattern, str_abc, -1, &status);
733 RegexMatcher matcher(&pattern, 0, status);
734 REGEX_CHECK_STATUS;
735
736 UText input = UTEXT_INITIALIZER;
737 utext_openUTF8(&input, str_abc, -1, &status);
738 REGEX_CHECK_STATUS;
739 matcher.reset(&input);
740 REGEX_CHECK_STATUS;
741 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
742
743 matcher.reset(matcher.inputText());
744 REGEX_CHECK_STATUS;
745 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
746
747 utext_close(&pattern);
748 utext_close(&input);
749 }
750
751
752 //---------------------------------------------------------------------------
753 //
754 // API_Match Test that the API for class RegexMatcher
755 // is present and nominally working, but excluding functions
756 // implementing replace operations.
757 //
758 //---------------------------------------------------------------------------
API_Match()759 void RegexTest::API_Match() {
760 UParseError pe;
761 UErrorCode status=U_ZERO_ERROR;
762 int32_t flags = 0;
763
764 //
765 // Debug - slide failing test cases early
766 //
767 #if 0
768 {
769 }
770 return;
771 #endif
772
773 //
774 // Simple pattern compilation
775 //
776 {
777 UnicodeString re("abc");
778 RegexPattern *pat2;
779 pat2 = RegexPattern::compile(re, flags, pe, status);
780 REGEX_CHECK_STATUS;
781
782 UnicodeString inStr1 = "abcdef this is a test";
783 UnicodeString instr2 = "not abc";
784 UnicodeString empty = "";
785
786
787 //
788 // Matcher creation and reset.
789 //
790 RegexMatcher *m1 = pat2->matcher(inStr1, status);
791 REGEX_CHECK_STATUS;
792 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
793 REGEX_ASSERT(m1->input() == inStr1);
794 m1->reset(instr2);
795 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
796 REGEX_ASSERT(m1->input() == instr2);
797 m1->reset(inStr1);
798 REGEX_ASSERT(m1->input() == inStr1);
799 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
800 m1->reset(empty);
801 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
802 REGEX_ASSERT(m1->input() == empty);
803 REGEX_ASSERT(&m1->pattern() == pat2);
804
805 //
806 // reset(pos, status)
807 //
808 m1->reset(inStr1);
809 m1->reset(4, status);
810 REGEX_CHECK_STATUS;
811 REGEX_ASSERT(m1->input() == inStr1);
812 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
813
814 m1->reset(-1, status);
815 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
816 status = U_ZERO_ERROR;
817
818 m1->reset(0, status);
819 REGEX_CHECK_STATUS;
820 status = U_ZERO_ERROR;
821
822 int32_t len = m1->input().length();
823 m1->reset(len-1, status);
824 REGEX_CHECK_STATUS;
825 status = U_ZERO_ERROR;
826
827 m1->reset(len, status);
828 REGEX_CHECK_STATUS;
829 status = U_ZERO_ERROR;
830
831 m1->reset(len+1, status);
832 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
833 status = U_ZERO_ERROR;
834
835 //
836 // match(pos, status)
837 //
838 m1->reset(instr2);
839 REGEX_ASSERT(m1->matches(4, status) == TRUE);
840 m1->reset();
841 REGEX_ASSERT(m1->matches(3, status) == FALSE);
842 m1->reset();
843 REGEX_ASSERT(m1->matches(5, status) == FALSE);
844 REGEX_ASSERT(m1->matches(4, status) == TRUE);
845 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
846 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
847
848 // Match() at end of string should fail, but should not
849 // be an error.
850 status = U_ZERO_ERROR;
851 len = m1->input().length();
852 REGEX_ASSERT(m1->matches(len, status) == FALSE);
853 REGEX_CHECK_STATUS;
854
855 // Match beyond end of string should fail with an error.
856 status = U_ZERO_ERROR;
857 REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
858 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
859
860 // Successful match at end of string.
861 {
862 status = U_ZERO_ERROR;
863 RegexMatcher m("A?", 0, status); // will match zero length string.
864 REGEX_CHECK_STATUS;
865 m.reset(inStr1);
866 len = inStr1.length();
867 REGEX_ASSERT(m.matches(len, status) == TRUE);
868 REGEX_CHECK_STATUS;
869 m.reset(empty);
870 REGEX_ASSERT(m.matches(0, status) == TRUE);
871 REGEX_CHECK_STATUS;
872 }
873
874
875 //
876 // lookingAt(pos, status)
877 //
878 status = U_ZERO_ERROR;
879 m1->reset(instr2); // "not abc"
880 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
881 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
882 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
883 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
884 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
885 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
886 status = U_ZERO_ERROR;
887 len = m1->input().length();
888 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
889 REGEX_CHECK_STATUS;
890 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
891 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
892
893 delete m1;
894 delete pat2;
895 }
896
897
898 //
899 // Capture Group.
900 // RegexMatcher::start();
901 // RegexMatcher::end();
902 // RegexMatcher::groupCount();
903 //
904 {
905 int32_t flags=0;
906 UParseError pe;
907 UErrorCode status=U_ZERO_ERROR;
908
909 UnicodeString re("01(23(45)67)(.*)");
910 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
911 REGEX_CHECK_STATUS;
912 UnicodeString data = "0123456789";
913
914 RegexMatcher *matcher = pat->matcher(data, status);
915 REGEX_CHECK_STATUS;
916 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
917 static const int32_t matchStarts[] = {0, 2, 4, 8};
918 static const int32_t matchEnds[] = {10, 8, 6, 10};
919 int32_t i;
920 for (i=0; i<4; i++) {
921 int32_t actualStart = matcher->start(i, status);
922 REGEX_CHECK_STATUS;
923 if (actualStart != matchStarts[i]) {
924 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
925 __LINE__, i, matchStarts[i], actualStart);
926 }
927 int32_t actualEnd = matcher->end(i, status);
928 REGEX_CHECK_STATUS;
929 if (actualEnd != matchEnds[i]) {
930 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
931 __LINE__, i, matchEnds[i], actualEnd);
932 }
933 }
934
935 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
936 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
937
938 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
939 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
940 matcher->reset();
941 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
942
943 matcher->lookingAt(status);
944 REGEX_ASSERT(matcher->group(status) == "0123456789");
945 REGEX_ASSERT(matcher->group(0, status) == "0123456789");
946 REGEX_ASSERT(matcher->group(1, status) == "234567" );
947 REGEX_ASSERT(matcher->group(2, status) == "45" );
948 REGEX_ASSERT(matcher->group(3, status) == "89" );
949 REGEX_CHECK_STATUS;
950 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
951 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
952 matcher->reset();
953 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
954
955 delete matcher;
956 delete pat;
957
958 }
959
960 //
961 // find
962 //
963 {
964 int32_t flags=0;
965 UParseError pe;
966 UErrorCode status=U_ZERO_ERROR;
967
968 UnicodeString re("abc");
969 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
970 REGEX_CHECK_STATUS;
971 UnicodeString data = ".abc..abc...abc..";
972 // 012345678901234567
973
974 RegexMatcher *matcher = pat->matcher(data, status);
975 REGEX_CHECK_STATUS;
976 REGEX_ASSERT(matcher->find());
977 REGEX_ASSERT(matcher->start(status) == 1);
978 REGEX_ASSERT(matcher->find());
979 REGEX_ASSERT(matcher->start(status) == 6);
980 REGEX_ASSERT(matcher->find());
981 REGEX_ASSERT(matcher->start(status) == 12);
982 REGEX_ASSERT(matcher->find() == FALSE);
983 REGEX_ASSERT(matcher->find() == FALSE);
984
985 matcher->reset();
986 REGEX_ASSERT(matcher->find());
987 REGEX_ASSERT(matcher->start(status) == 1);
988
989 REGEX_ASSERT(matcher->find(0, status));
990 REGEX_ASSERT(matcher->start(status) == 1);
991 REGEX_ASSERT(matcher->find(1, status));
992 REGEX_ASSERT(matcher->start(status) == 1);
993 REGEX_ASSERT(matcher->find(2, status));
994 REGEX_ASSERT(matcher->start(status) == 6);
995 REGEX_ASSERT(matcher->find(12, status));
996 REGEX_ASSERT(matcher->start(status) == 12);
997 REGEX_ASSERT(matcher->find(13, status) == FALSE);
998 REGEX_ASSERT(matcher->find(16, status) == FALSE);
999 REGEX_ASSERT(matcher->find(17, status) == FALSE);
1000 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
1001
1002 status = U_ZERO_ERROR;
1003 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1004 status = U_ZERO_ERROR;
1005 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
1006
1007 REGEX_ASSERT(matcher->groupCount() == 0);
1008
1009 delete matcher;
1010 delete pat;
1011 }
1012
1013
1014 //
1015 // find, with \G in pattern (true if at the end of a previous match).
1016 //
1017 {
1018 int32_t flags=0;
1019 UParseError pe;
1020 UErrorCode status=U_ZERO_ERROR;
1021
1022 UnicodeString re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
1023 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1024 REGEX_CHECK_STATUS;
1025 UnicodeString data = ".abcabc.abc..";
1026 // 012345678901234567
1027
1028 RegexMatcher *matcher = pat->matcher(data, status);
1029 REGEX_CHECK_STATUS;
1030 REGEX_ASSERT(matcher->find());
1031 REGEX_ASSERT(matcher->start(status) == 0);
1032 REGEX_ASSERT(matcher->start(1, status) == -1);
1033 REGEX_ASSERT(matcher->start(2, status) == 1);
1034
1035 REGEX_ASSERT(matcher->find());
1036 REGEX_ASSERT(matcher->start(status) == 4);
1037 REGEX_ASSERT(matcher->start(1, status) == 4);
1038 REGEX_ASSERT(matcher->start(2, status) == -1);
1039 REGEX_CHECK_STATUS;
1040
1041 delete matcher;
1042 delete pat;
1043 }
1044
1045 //
1046 // find with zero length matches, match position should bump ahead
1047 // to prevent loops.
1048 //
1049 {
1050 int32_t i;
1051 UErrorCode status=U_ZERO_ERROR;
1052 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
1053 // using an always-true look-ahead.
1054 REGEX_CHECK_STATUS;
1055 UnicodeString s(" ");
1056 m.reset(s);
1057 for (i=0; ; i++) {
1058 if (m.find() == FALSE) {
1059 break;
1060 }
1061 REGEX_ASSERT(m.start(status) == i);
1062 REGEX_ASSERT(m.end(status) == i);
1063 }
1064 REGEX_ASSERT(i==5);
1065
1066 // Check that the bump goes over surrogate pairs OK
1067 s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1068 s = s.unescape();
1069 m.reset(s);
1070 for (i=0; ; i+=2) {
1071 if (m.find() == FALSE) {
1072 break;
1073 }
1074 REGEX_ASSERT(m.start(status) == i);
1075 REGEX_ASSERT(m.end(status) == i);
1076 }
1077 REGEX_ASSERT(i==10);
1078 }
1079 {
1080 // find() loop breaking test.
1081 // with pattern of /.?/, should see a series of one char matches, then a single
1082 // match of zero length at the end of the input string.
1083 int32_t i;
1084 UErrorCode status=U_ZERO_ERROR;
1085 RegexMatcher m(".?", 0, status);
1086 REGEX_CHECK_STATUS;
1087 UnicodeString s(" ");
1088 m.reset(s);
1089 for (i=0; ; i++) {
1090 if (m.find() == FALSE) {
1091 break;
1092 }
1093 REGEX_ASSERT(m.start(status) == i);
1094 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1095 }
1096 REGEX_ASSERT(i==5);
1097 }
1098
1099
1100 //
1101 // Matchers with no input string behave as if they had an empty input string.
1102 //
1103
1104 {
1105 UErrorCode status = U_ZERO_ERROR;
1106 RegexMatcher m(".?", 0, status);
1107 REGEX_CHECK_STATUS;
1108 REGEX_ASSERT(m.find());
1109 REGEX_ASSERT(m.start(status) == 0);
1110 REGEX_ASSERT(m.input() == "");
1111 }
1112 {
1113 UErrorCode status = U_ZERO_ERROR;
1114 RegexPattern *p = RegexPattern::compile(".", 0, status);
1115 RegexMatcher *m = p->matcher(status);
1116 REGEX_CHECK_STATUS;
1117
1118 REGEX_ASSERT(m->find() == FALSE);
1119 REGEX_ASSERT(m->input() == "");
1120 delete m;
1121 delete p;
1122 }
1123
1124 //
1125 // Regions
1126 //
1127 {
1128 UErrorCode status = U_ZERO_ERROR;
1129 UnicodeString testString("This is test data");
1130 RegexMatcher m(".*", testString, 0, status);
1131 REGEX_CHECK_STATUS;
1132 REGEX_ASSERT(m.regionStart() == 0);
1133 REGEX_ASSERT(m.regionEnd() == testString.length());
1134 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1135 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1136
1137 m.region(2,4, status);
1138 REGEX_CHECK_STATUS;
1139 REGEX_ASSERT(m.matches(status));
1140 REGEX_ASSERT(m.start(status)==2);
1141 REGEX_ASSERT(m.end(status)==4);
1142 REGEX_CHECK_STATUS;
1143
1144 m.reset();
1145 REGEX_ASSERT(m.regionStart() == 0);
1146 REGEX_ASSERT(m.regionEnd() == testString.length());
1147
1148 UnicodeString shorterString("short");
1149 m.reset(shorterString);
1150 REGEX_ASSERT(m.regionStart() == 0);
1151 REGEX_ASSERT(m.regionEnd() == shorterString.length());
1152
1153 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1154 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1155 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1156 REGEX_ASSERT(&m == &m.reset());
1157 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1158
1159 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1160 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1161 REGEX_ASSERT(&m == &m.reset());
1162 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1163
1164 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1165 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1166 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1167 REGEX_ASSERT(&m == &m.reset());
1168 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1169
1170 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1171 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1172 REGEX_ASSERT(&m == &m.reset());
1173 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1174
1175 }
1176
1177 //
1178 // hitEnd() and requireEnd()
1179 //
1180 {
1181 UErrorCode status = U_ZERO_ERROR;
1182 UnicodeString testString("aabb");
1183 RegexMatcher m1(".*", testString, 0, status);
1184 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1185 REGEX_ASSERT(m1.hitEnd() == TRUE);
1186 REGEX_ASSERT(m1.requireEnd() == FALSE);
1187 REGEX_CHECK_STATUS;
1188
1189 status = U_ZERO_ERROR;
1190 RegexMatcher m2("a*", testString, 0, status);
1191 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1192 REGEX_ASSERT(m2.hitEnd() == FALSE);
1193 REGEX_ASSERT(m2.requireEnd() == FALSE);
1194 REGEX_CHECK_STATUS;
1195
1196 status = U_ZERO_ERROR;
1197 RegexMatcher m3(".*$", testString, 0, status);
1198 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1199 REGEX_ASSERT(m3.hitEnd() == TRUE);
1200 REGEX_ASSERT(m3.requireEnd() == TRUE);
1201 REGEX_CHECK_STATUS;
1202 }
1203
1204
1205 //
1206 // Compilation error on reset with UChar *
1207 // These were a hazard that people were stumbling over with runtime errors.
1208 // Changed them to compiler errors by adding private methods that more closely
1209 // matched the incorrect use of the functions.
1210 //
1211 #if 0
1212 {
1213 UErrorCode status = U_ZERO_ERROR;
1214 UChar ucharString[20];
1215 RegexMatcher m(".", 0, status);
1216 m.reset(ucharString); // should not compile.
1217
1218 RegexPattern *p = RegexPattern::compile(".", 0, status);
1219 RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile.
1220
1221 RegexMatcher m3(".", ucharString, 0, status); // Should not compile
1222 }
1223 #endif
1224
1225 //
1226 // Time Outs.
1227 // Note: These tests will need to be changed when the regexp engine is
1228 // able to detect and cut short the exponential time behavior on
1229 // this type of match.
1230 //
1231 {
1232 UErrorCode status = U_ZERO_ERROR;
1233 // Enough 'a's in the string to cause the match to time out.
        //   (Each additional 'a' doubles the time)
1235 UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1236 RegexMatcher matcher("(a+)+b", testString, 0, status);
1237 REGEX_CHECK_STATUS;
1238 REGEX_ASSERT(matcher.getTimeLimit() == 0);
1239 matcher.setTimeLimit(100, status);
1240 REGEX_ASSERT(matcher.getTimeLimit() == 100);
1241 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1242 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1243 }
1244 {
1245 UErrorCode status = U_ZERO_ERROR;
1246 // Few enough 'a's to slip in under the time limit.
1247 UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1248 RegexMatcher matcher("(a+)+b", testString, 0, status);
1249 REGEX_CHECK_STATUS;
1250 matcher.setTimeLimit(100, status);
1251 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1252 REGEX_CHECK_STATUS;
1253 }
1254
1255 //
1256 // Stack Limits
1257 //
1258 {
1259 UErrorCode status = U_ZERO_ERROR;
1260 UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
1261
1262 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1263 // of the '+', and makes the stack frames larger.
1264 RegexMatcher matcher("(A)+A$", testString, 0, status);
1265
1266 // With the default stack, this match should fail to run
1267 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1268 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1269
1270 // With unlimited stack, it should run
1271 status = U_ZERO_ERROR;
1272 matcher.setStackLimit(0, status);
1273 REGEX_CHECK_STATUS;
1274 REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1275 REGEX_CHECK_STATUS;
1276 REGEX_ASSERT(matcher.getStackLimit() == 0);
1277
        // With a limited stack, the match should fail
1279 status = U_ZERO_ERROR;
1280 matcher.setStackLimit(10000, status);
1281 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1282 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1283 REGEX_ASSERT(matcher.getStackLimit() == 10000);
1284 }
1285
1286 // A pattern that doesn't save state should work with
1287 // a minimal sized stack
1288 {
1289 UErrorCode status = U_ZERO_ERROR;
1290 UnicodeString testString = "abc";
1291 RegexMatcher matcher("abc", testString, 0, status);
1292 REGEX_CHECK_STATUS;
1293 matcher.setStackLimit(30, status);
1294 REGEX_CHECK_STATUS;
1295 REGEX_ASSERT(matcher.matches(status) == TRUE);
1296 REGEX_CHECK_STATUS;
1297 REGEX_ASSERT(matcher.getStackLimit() == 30);
1298
1299 // Negative stack sizes should fail
1300 status = U_ZERO_ERROR;
1301 matcher.setStackLimit(1000, status);
1302 REGEX_CHECK_STATUS;
1303 matcher.setStackLimit(-1, status);
1304 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1305 REGEX_ASSERT(matcher.getStackLimit() == 1000);
1306 }
1307
1308
1309 }
1310
1311
1312
1313
1314
1315
1316 //---------------------------------------------------------------------------
1317 //
1318 // API_Replace API test for class RegexMatcher, testing the
1319 // Replace family of functions.
1320 //
1321 //---------------------------------------------------------------------------
API_Replace()1322 void RegexTest::API_Replace() {
1323 //
1324 // Replace
1325 //
1326 int32_t flags=0;
1327 UParseError pe;
1328 UErrorCode status=U_ZERO_ERROR;
1329
1330 UnicodeString re("abc");
1331 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1332 REGEX_CHECK_STATUS;
1333 UnicodeString data = ".abc..abc...abc..";
1334 // 012345678901234567
1335 RegexMatcher *matcher = pat->matcher(data, status);
1336
1337 //
1338 // Plain vanilla matches.
1339 //
1340 UnicodeString dest;
1341 dest = matcher->replaceFirst("yz", status);
1342 REGEX_CHECK_STATUS;
1343 REGEX_ASSERT(dest == ".yz..abc...abc..");
1344
1345 dest = matcher->replaceAll("yz", status);
1346 REGEX_CHECK_STATUS;
1347 REGEX_ASSERT(dest == ".yz..yz...yz..");
1348
1349 //
1350 // Plain vanilla non-matches.
1351 //
1352 UnicodeString d2 = ".abx..abx...abx..";
1353 matcher->reset(d2);
1354 dest = matcher->replaceFirst("yz", status);
1355 REGEX_CHECK_STATUS;
1356 REGEX_ASSERT(dest == ".abx..abx...abx..");
1357
1358 dest = matcher->replaceAll("yz", status);
1359 REGEX_CHECK_STATUS;
1360 REGEX_ASSERT(dest == ".abx..abx...abx..");
1361
1362 //
1363 // Empty source string
1364 //
1365 UnicodeString d3 = "";
1366 matcher->reset(d3);
1367 dest = matcher->replaceFirst("yz", status);
1368 REGEX_CHECK_STATUS;
1369 REGEX_ASSERT(dest == "");
1370
1371 dest = matcher->replaceAll("yz", status);
1372 REGEX_CHECK_STATUS;
1373 REGEX_ASSERT(dest == "");
1374
1375 //
1376 // Empty substitution string
1377 //
1378 matcher->reset(data); // ".abc..abc...abc.."
1379 dest = matcher->replaceFirst("", status);
1380 REGEX_CHECK_STATUS;
1381 REGEX_ASSERT(dest == "...abc...abc..");
1382
1383 dest = matcher->replaceAll("", status);
1384 REGEX_CHECK_STATUS;
1385 REGEX_ASSERT(dest == "........");
1386
1387 //
1388 // match whole string
1389 //
1390 UnicodeString d4 = "abc";
1391 matcher->reset(d4);
1392 dest = matcher->replaceFirst("xyz", status);
1393 REGEX_CHECK_STATUS;
1394 REGEX_ASSERT(dest == "xyz");
1395
1396 dest = matcher->replaceAll("xyz", status);
1397 REGEX_CHECK_STATUS;
1398 REGEX_ASSERT(dest == "xyz");
1399
1400 //
1401 // Capture Group, simple case
1402 //
1403 UnicodeString re2("a(..)");
1404 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1405 REGEX_CHECK_STATUS;
1406 UnicodeString d5 = "abcdefg";
1407 RegexMatcher *matcher2 = pat2->matcher(d5, status);
1408 REGEX_CHECK_STATUS;
1409 dest = matcher2->replaceFirst("$1$1", status);
1410 REGEX_CHECK_STATUS;
1411 REGEX_ASSERT(dest == "bcbcdefg");
1412
1413 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1414 REGEX_CHECK_STATUS;
1415 REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1416
1417 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1418 REGEX_ASSERT(U_FAILURE(status));
1419 status = U_ZERO_ERROR;
1420
1421 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1422 replacement = replacement.unescape();
1423 dest = matcher2->replaceFirst(replacement, status);
1424 REGEX_CHECK_STATUS;
1425 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1426
1427 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1428
1429
1430 //
1431 // Replacement String with \u hex escapes
1432 //
1433 {
1434 UnicodeString src = "abc 1 abc 2 abc 3";
1435 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1436 matcher->reset(src);
1437 UnicodeString result = matcher->replaceAll(substitute, status);
1438 REGEX_CHECK_STATUS;
1439 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1440 }
1441 {
1442 UnicodeString src = "abc !";
1443 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1444 matcher->reset(src);
1445 UnicodeString result = matcher->replaceAll(substitute, status);
1446 REGEX_CHECK_STATUS;
1447 UnicodeString expected = UnicodeString("--");
1448 expected.append((UChar32)0x10000);
1449 expected.append("-- !");
1450 REGEX_ASSERT(result == expected);
1451 }
1452 // TODO: need more through testing of capture substitutions.
1453
1454 // Bug 4057
1455 //
1456 {
1457 status = U_ZERO_ERROR;
1458 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1459 RegexMatcher m("ss(.*?)ee", 0, status);
1460 REGEX_CHECK_STATUS;
1461 UnicodeString result;
1462
1463 // Multiple finds do NOT bump up the previous appendReplacement postion.
1464 m.reset(s);
1465 m.find();
1466 m.find();
1467 m.appendReplacement(result, "ooh", status);
1468 REGEX_CHECK_STATUS;
1469 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1470
1471 // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1472 status = U_ZERO_ERROR;
1473 result.truncate(0);
1474 m.reset(10, status);
1475 m.find();
1476 m.find();
1477 m.appendReplacement(result, "ooh", status);
1478 REGEX_CHECK_STATUS;
1479 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1480
1481 // find() at interior of string, appendReplacemnt still starts at beginning.
1482 status = U_ZERO_ERROR;
1483 result.truncate(0);
1484 m.reset();
1485 m.find(10, status);
1486 m.find();
1487 m.appendReplacement(result, "ooh", status);
1488 REGEX_CHECK_STATUS;
1489 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1490
1491 m.appendTail(result);
1492 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1493
1494 }
1495
1496 delete matcher2;
1497 delete pat2;
1498 delete matcher;
1499 delete pat;
1500 }
1501
1502
1503 //---------------------------------------------------------------------------
1504 //
1505 // API_Pattern Test that the API for class RegexPattern is
1506 // present and nominally working.
1507 //
1508 //---------------------------------------------------------------------------
API_Pattern()1509 void RegexTest::API_Pattern() {
1510 RegexPattern pata; // Test default constructor to not crash.
1511 RegexPattern patb;
1512
1513 REGEX_ASSERT(pata == patb);
1514 REGEX_ASSERT(pata == pata);
1515
1516 UnicodeString re1("abc[a-l][m-z]");
1517 UnicodeString re2("def");
1518 UErrorCode status = U_ZERO_ERROR;
1519 UParseError pe;
1520
1521 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status);
1522 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status);
1523 REGEX_CHECK_STATUS;
1524 REGEX_ASSERT(*pat1 == *pat1);
1525 REGEX_ASSERT(*pat1 != pata);
1526
1527 // Assign
1528 patb = *pat1;
1529 REGEX_ASSERT(patb == *pat1);
1530
1531 // Copy Construct
1532 RegexPattern patc(*pat1);
1533 REGEX_ASSERT(patc == *pat1);
1534 REGEX_ASSERT(patb == patc);
1535 REGEX_ASSERT(pat1 != pat2);
1536 patb = *pat2;
1537 REGEX_ASSERT(patb != patc);
1538 REGEX_ASSERT(patb == *pat2);
1539
1540 // Compile with no flags.
1541 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status);
1542 REGEX_ASSERT(*pat1a == *pat1);
1543
1544 REGEX_ASSERT(pat1a->flags() == 0);
1545
1546 // Compile with different flags should be not equal
1547 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1548 REGEX_CHECK_STATUS;
1549
1550 REGEX_ASSERT(*pat1b != *pat1a);
1551 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1552 REGEX_ASSERT(pat1a->flags() == 0);
1553 delete pat1b;
1554
1555 // clone
1556 RegexPattern *pat1c = pat1->clone();
1557 REGEX_ASSERT(*pat1c == *pat1);
1558 REGEX_ASSERT(*pat1c != *pat2);
1559
1560 delete pat1c;
1561 delete pat1a;
1562 delete pat1;
1563 delete pat2;
1564
1565
1566 //
1567 // Verify that a matcher created from a cloned pattern works.
1568 // (Jitterbug 3423)
1569 //
1570 {
1571 UErrorCode status = U_ZERO_ERROR;
1572 RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1573 RegexPattern *pClone = pSource->clone();
1574 delete pSource;
1575 RegexMatcher *mFromClone = pClone->matcher(status);
1576 REGEX_CHECK_STATUS;
1577 UnicodeString s = "Hello World";
1578 mFromClone->reset(s);
1579 REGEX_ASSERT(mFromClone->find() == TRUE);
1580 REGEX_ASSERT(mFromClone->group(status) == "Hello");
1581 REGEX_ASSERT(mFromClone->find() == TRUE);
1582 REGEX_ASSERT(mFromClone->group(status) == "World");
1583 REGEX_ASSERT(mFromClone->find() == FALSE);
1584 delete mFromClone;
1585 delete pClone;
1586 }
1587
1588 //
1589 // matches convenience API
1590 //
1591 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1592 REGEX_CHECK_STATUS;
1593 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1594 REGEX_CHECK_STATUS;
1595 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1596 REGEX_CHECK_STATUS;
1597 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1598 REGEX_CHECK_STATUS;
1599 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1600 REGEX_CHECK_STATUS;
1601 status = U_INDEX_OUTOFBOUNDS_ERROR;
1602 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1603 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1604
1605
1606 //
1607 // Split()
1608 //
1609 status = U_ZERO_ERROR;
1610 pat1 = RegexPattern::compile(" +", pe, status);
1611 REGEX_CHECK_STATUS;
1612 UnicodeString fields[10];
1613
1614 int32_t n;
1615 n = pat1->split("Now is the time", fields, 10, status);
1616 REGEX_CHECK_STATUS;
1617 REGEX_ASSERT(n==4);
1618 REGEX_ASSERT(fields[0]=="Now");
1619 REGEX_ASSERT(fields[1]=="is");
1620 REGEX_ASSERT(fields[2]=="the");
1621 REGEX_ASSERT(fields[3]=="time");
1622 REGEX_ASSERT(fields[4]=="");
1623
1624 n = pat1->split("Now is the time", fields, 2, status);
1625 REGEX_CHECK_STATUS;
1626 REGEX_ASSERT(n==2);
1627 REGEX_ASSERT(fields[0]=="Now");
1628 REGEX_ASSERT(fields[1]=="is the time");
1629 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
1630
1631 fields[1] = "*";
1632 status = U_ZERO_ERROR;
1633 n = pat1->split("Now is the time", fields, 1, status);
1634 REGEX_CHECK_STATUS;
1635 REGEX_ASSERT(n==1);
1636 REGEX_ASSERT(fields[0]=="Now is the time");
1637 REGEX_ASSERT(fields[1]=="*");
1638 status = U_ZERO_ERROR;
1639
1640 n = pat1->split(" Now is the time ", fields, 10, status);
1641 REGEX_CHECK_STATUS;
1642 REGEX_ASSERT(n==6);
1643 REGEX_ASSERT(fields[0]=="");
1644 REGEX_ASSERT(fields[1]=="Now");
1645 REGEX_ASSERT(fields[2]=="is");
1646 REGEX_ASSERT(fields[3]=="the");
1647 REGEX_ASSERT(fields[4]=="time");
1648 REGEX_ASSERT(fields[5]=="");
1649
1650 n = pat1->split(" ", fields, 10, status);
1651 REGEX_CHECK_STATUS;
1652 REGEX_ASSERT(n==2);
1653 REGEX_ASSERT(fields[0]=="");
1654 REGEX_ASSERT(fields[1]=="");
1655
1656 fields[0] = "foo";
1657 n = pat1->split("", fields, 10, status);
1658 REGEX_CHECK_STATUS;
1659 REGEX_ASSERT(n==0);
1660 REGEX_ASSERT(fields[0]=="foo");
1661
1662 delete pat1;
1663
1664 // split, with a pattern with (capture)
1665 pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status);
1666 REGEX_CHECK_STATUS;
1667
1668 status = U_ZERO_ERROR;
1669 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1670 REGEX_CHECK_STATUS;
1671 REGEX_ASSERT(n==7);
1672 REGEX_ASSERT(fields[0]=="");
1673 REGEX_ASSERT(fields[1]=="a");
1674 REGEX_ASSERT(fields[2]=="Now is ");
1675 REGEX_ASSERT(fields[3]=="b");
1676 REGEX_ASSERT(fields[4]=="the time");
1677 REGEX_ASSERT(fields[5]=="c");
1678 REGEX_ASSERT(fields[6]=="");
1679 REGEX_ASSERT(status==U_ZERO_ERROR);
1680
1681 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
1682 REGEX_CHECK_STATUS;
1683 REGEX_ASSERT(n==7);
1684 REGEX_ASSERT(fields[0]==" ");
1685 REGEX_ASSERT(fields[1]=="a");
1686 REGEX_ASSERT(fields[2]=="Now is ");
1687 REGEX_ASSERT(fields[3]=="b");
1688 REGEX_ASSERT(fields[4]=="the time");
1689 REGEX_ASSERT(fields[5]=="c");
1690 REGEX_ASSERT(fields[6]=="");
1691
1692 status = U_ZERO_ERROR;
1693 fields[6] = "foo";
1694 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
1695 REGEX_CHECK_STATUS;
1696 REGEX_ASSERT(n==6);
1697 REGEX_ASSERT(fields[0]==" ");
1698 REGEX_ASSERT(fields[1]=="a");
1699 REGEX_ASSERT(fields[2]=="Now is ");
1700 REGEX_ASSERT(fields[3]=="b");
1701 REGEX_ASSERT(fields[4]=="the time");
1702 REGEX_ASSERT(fields[5]==""); // All text following "<c>" field delimiter.
1703 REGEX_ASSERT(fields[6]=="foo");
1704
1705 status = U_ZERO_ERROR;
1706 fields[5] = "foo";
1707 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
1708 REGEX_CHECK_STATUS;
1709 REGEX_ASSERT(n==5);
1710 REGEX_ASSERT(fields[0]==" ");
1711 REGEX_ASSERT(fields[1]=="a");
1712 REGEX_ASSERT(fields[2]=="Now is ");
1713 REGEX_ASSERT(fields[3]=="b");
1714 REGEX_ASSERT(fields[4]=="the time<c>");
1715 REGEX_ASSERT(fields[5]=="foo");
1716
1717 status = U_ZERO_ERROR;
1718 fields[5] = "foo";
1719 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
1720 REGEX_CHECK_STATUS;
1721 REGEX_ASSERT(n==5);
1722 REGEX_ASSERT(fields[0]==" ");
1723 REGEX_ASSERT(fields[1]=="a");
1724 REGEX_ASSERT(fields[2]=="Now is ");
1725 REGEX_ASSERT(fields[3]=="b");
1726 REGEX_ASSERT(fields[4]=="the time");
1727 REGEX_ASSERT(fields[5]=="foo");
1728
1729 status = U_ZERO_ERROR;
1730 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
1731 REGEX_CHECK_STATUS;
1732 REGEX_ASSERT(n==4);
1733 REGEX_ASSERT(fields[0]==" ");
1734 REGEX_ASSERT(fields[1]=="a");
1735 REGEX_ASSERT(fields[2]=="Now is ");
1736 REGEX_ASSERT(fields[3]=="the time<c>");
1737 status = U_ZERO_ERROR;
1738 delete pat1;
1739
1740 pat1 = RegexPattern::compile("([-,])", pe, status);
1741 REGEX_CHECK_STATUS;
1742 n = pat1->split("1-10,20", fields, 10, status);
1743 REGEX_CHECK_STATUS;
1744 REGEX_ASSERT(n==5);
1745 REGEX_ASSERT(fields[0]=="1");
1746 REGEX_ASSERT(fields[1]=="-");
1747 REGEX_ASSERT(fields[2]=="10");
1748 REGEX_ASSERT(fields[3]==",");
1749 REGEX_ASSERT(fields[4]=="20");
1750 delete pat1;
1751
1752 // Test split of string with empty trailing fields
1753 pat1 = RegexPattern::compile(",", pe, status);
1754 REGEX_CHECK_STATUS;
1755 n = pat1->split("a,b,c,", fields, 10, status);
1756 REGEX_CHECK_STATUS;
1757 REGEX_ASSERT(n==4);
1758 REGEX_ASSERT(fields[0]=="a");
1759 REGEX_ASSERT(fields[1]=="b");
1760 REGEX_ASSERT(fields[2]=="c");
1761 REGEX_ASSERT(fields[3]=="");
1762
1763 n = pat1->split("a,,,", fields, 10, status);
1764 REGEX_CHECK_STATUS;
1765 REGEX_ASSERT(n==4);
1766 REGEX_ASSERT(fields[0]=="a");
1767 REGEX_ASSERT(fields[1]=="");
1768 REGEX_ASSERT(fields[2]=="");
1769 REGEX_ASSERT(fields[3]=="");
1770 delete pat1;
1771
1772 // Split Separator with zero length match.
1773 pat1 = RegexPattern::compile(":?", pe, status);
1774 REGEX_CHECK_STATUS;
1775 n = pat1->split("abc", fields, 10, status);
1776 REGEX_CHECK_STATUS;
1777 REGEX_ASSERT(n==5);
1778 REGEX_ASSERT(fields[0]=="");
1779 REGEX_ASSERT(fields[1]=="a");
1780 REGEX_ASSERT(fields[2]=="b");
1781 REGEX_ASSERT(fields[3]=="c");
1782 REGEX_ASSERT(fields[4]=="");
1783
1784 delete pat1;
1785
1786 //
1787 // RegexPattern::pattern()
1788 //
1789 pat1 = new RegexPattern();
1790 REGEX_ASSERT(pat1->pattern() == "");
1791 delete pat1;
1792
1793 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1794 REGEX_CHECK_STATUS;
1795 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1796 delete pat1;
1797
1798
1799 //
1800 // classID functions
1801 //
1802 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1803 REGEX_CHECK_STATUS;
1804 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1805 REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1806 UnicodeString Hello("Hello, world.");
1807 RegexMatcher *m = pat1->matcher(Hello, status);
1808 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1809 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1810 REGEX_ASSERT(m->getDynamicClassID() != NULL);
1811 delete m;
1812 delete pat1;
1813
1814 }
1815
1816 //---------------------------------------------------------------------------
1817 //
1818 // API_Match_UTF8 Test that the alternate engine for class RegexMatcher
1819 // is present and working, but excluding functions
1820 // implementing replace operations.
1821 //
1822 //---------------------------------------------------------------------------
API_Match_UTF8()1823 void RegexTest::API_Match_UTF8() {
1824 UParseError pe;
1825 UErrorCode status=U_ZERO_ERROR;
1826 int32_t flags = 0;
1827
1828 //
1829 // Debug - slide failing test cases early
1830 //
1831 #if 0
1832 {
1833 }
1834 return;
1835 #endif
1836
1837 //
1838 // Simple pattern compilation
1839 //
1840 {
1841 UText re = UTEXT_INITIALIZER;
1842 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1843 REGEX_VERBOSE_TEXT(&re);
1844 RegexPattern *pat2;
1845 pat2 = RegexPattern::compile(&re, flags, pe, status);
1846 REGEX_CHECK_STATUS;
1847
1848 UText input1 = UTEXT_INITIALIZER;
1849 UText input2 = UTEXT_INITIALIZER;
1850 UText empty = UTEXT_INITIALIZER;
1851 regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1852 REGEX_VERBOSE_TEXT(&input1);
1853 regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1854 REGEX_VERBOSE_TEXT(&input2);
1855 utext_openUChars(&empty, NULL, 0, &status);
1856
1857 int32_t input1Len = static_cast<int32_t>(strlen("abcdef this is a test")); /* TODO: why not nativelen (input1) ? */
1858 int32_t input2Len = static_cast<int32_t>(strlen("not abc"));
1859
1860
1861 //
1862 // Matcher creation and reset.
1863 //
1864 RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1865 REGEX_CHECK_STATUS;
1866 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1867 const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1868 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1869 m1->reset(&input2);
1870 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1871 const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1872 REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1873 m1->reset(&input1);
1874 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1875 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1876 m1->reset(&empty);
1877 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1878 REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1879
1880 //
1881 // reset(pos, status)
1882 //
1883 m1->reset(&input1);
1884 m1->reset(4, status);
1885 REGEX_CHECK_STATUS;
1886 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1887 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1888
1889 m1->reset(-1, status);
1890 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1891 status = U_ZERO_ERROR;
1892
1893 m1->reset(0, status);
1894 REGEX_CHECK_STATUS;
1895 status = U_ZERO_ERROR;
1896
1897 m1->reset(input1Len-1, status);
1898 REGEX_CHECK_STATUS;
1899 status = U_ZERO_ERROR;
1900
1901 m1->reset(input1Len, status);
1902 REGEX_CHECK_STATUS;
1903 status = U_ZERO_ERROR;
1904
1905 m1->reset(input1Len+1, status);
1906 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1907 status = U_ZERO_ERROR;
1908
1909 //
1910 // match(pos, status)
1911 //
1912 m1->reset(&input2);
1913 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1914 m1->reset();
1915 REGEX_ASSERT(m1->matches(3, status) == FALSE);
1916 m1->reset();
1917 REGEX_ASSERT(m1->matches(5, status) == FALSE);
1918 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1919 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1920 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1921
1922 // Match() at end of string should fail, but should not
1923 // be an error.
1924 status = U_ZERO_ERROR;
1925 REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1926 REGEX_CHECK_STATUS;
1927
1928 // Match beyond end of string should fail with an error.
1929 status = U_ZERO_ERROR;
1930 REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1931 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1932
1933 // Successful match at end of string.
1934 {
1935 status = U_ZERO_ERROR;
1936 RegexMatcher m("A?", 0, status); // will match zero length string.
1937 REGEX_CHECK_STATUS;
1938 m.reset(&input1);
1939 REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1940 REGEX_CHECK_STATUS;
1941 m.reset(&empty);
1942 REGEX_ASSERT(m.matches(0, status) == TRUE);
1943 REGEX_CHECK_STATUS;
1944 }
1945
1946
1947 //
1948 // lookingAt(pos, status)
1949 //
1950 status = U_ZERO_ERROR;
1951 m1->reset(&input2); // "not abc"
1952 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1953 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1954 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1955 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1956 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1957 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1958 status = U_ZERO_ERROR;
1959 REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1960 REGEX_CHECK_STATUS;
1961 REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1962 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1963
1964 delete m1;
1965 delete pat2;
1966
1967 utext_close(&re);
1968 utext_close(&input1);
1969 utext_close(&input2);
1970 utext_close(&empty);
1971 }
1972
1973
1974 //
1975 // Capture Group.
1976 // RegexMatcher::start();
1977 // RegexMatcher::end();
1978 // RegexMatcher::groupCount();
1979 //
1980 {
1981 int32_t flags=0;
1982 UParseError pe;
1983 UErrorCode status=U_ZERO_ERROR;
1984 UText re=UTEXT_INITIALIZER;
1985 const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1986 utext_openUTF8(&re, str_01234567_pat, -1, &status);
1987
1988 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1989 REGEX_CHECK_STATUS;
1990
1991 UText input = UTEXT_INITIALIZER;
1992 const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1993 utext_openUTF8(&input, str_0123456789, -1, &status);
1994
1995 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
1996 REGEX_CHECK_STATUS;
1997 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
1998 static const int32_t matchStarts[] = {0, 2, 4, 8};
1999 static const int32_t matchEnds[] = {10, 8, 6, 10};
2000 int32_t i;
2001 for (i=0; i<4; i++) {
2002 int32_t actualStart = matcher->start(i, status);
2003 REGEX_CHECK_STATUS;
2004 if (actualStart != matchStarts[i]) {
2005 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
2006 __FILE__, __LINE__, i, matchStarts[i], actualStart);
2007 }
2008 int32_t actualEnd = matcher->end(i, status);
2009 REGEX_CHECK_STATUS;
2010 if (actualEnd != matchEnds[i]) {
2011 errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
2012 __FILE__, __LINE__, i, matchEnds[i], actualEnd);
2013 }
2014 }
2015
2016 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
2017 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
2018
2019 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2020 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2021 matcher->reset();
2022 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
2023
2024 matcher->lookingAt(status);
2025
2026 UnicodeString dest;
2027 UText destText = UTEXT_INITIALIZER;
2028 utext_openUnicodeString(&destText, &dest, &status);
2029 UText *result;
2030 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2031 // Test shallow-clone API
2032 int64_t group_len;
2033 result = matcher->group((UText *)NULL, group_len, status);
2034 REGEX_CHECK_STATUS;
2035 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2036 utext_close(result);
2037 result = matcher->group(0, &destText, group_len, status);
2038 REGEX_CHECK_STATUS;
2039 REGEX_ASSERT(result == &destText);
2040 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2041 // destText is now immutable, reopen it
2042 utext_close(&destText);
2043 utext_openUnicodeString(&destText, &dest, &status);
2044
2045 int64_t length;
2046 result = matcher->group(0, NULL, length, status);
2047 REGEX_CHECK_STATUS;
2048 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2049 utext_close(result);
2050 result = matcher->group(0, &destText, length, status);
2051 REGEX_CHECK_STATUS;
2052 REGEX_ASSERT(result == &destText);
2053 REGEX_ASSERT(utext_getNativeIndex(result) == 0);
2054 REGEX_ASSERT(length == 10);
2055 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2056
2057 // Capture Group 1 == "234567"
2058 result = matcher->group(1, NULL, length, status);
2059 REGEX_CHECK_STATUS;
2060 REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2061 REGEX_ASSERT(length == 6);
2062 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2063 utext_close(result);
2064
2065 result = matcher->group(1, &destText, length, status);
2066 REGEX_CHECK_STATUS;
2067 REGEX_ASSERT(result == &destText);
2068 REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2069 REGEX_ASSERT(length == 6);
2070 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2071 utext_close(result);
2072
2073 // Capture Group 2 == "45"
2074 result = matcher->group(2, NULL, length, status);
2075 REGEX_CHECK_STATUS;
2076 REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2077 REGEX_ASSERT(length == 2);
2078 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2079 utext_close(result);
2080
2081 result = matcher->group(2, &destText, length, status);
2082 REGEX_CHECK_STATUS;
2083 REGEX_ASSERT(result == &destText);
2084 REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2085 REGEX_ASSERT(length == 2);
2086 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2087 utext_close(result);
2088
2089 // Capture Group 3 == "89"
2090 result = matcher->group(3, NULL, length, status);
2091 REGEX_CHECK_STATUS;
2092 REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2093 REGEX_ASSERT(length == 2);
2094 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2095 utext_close(result);
2096
2097 result = matcher->group(3, &destText, length, status);
2098 REGEX_CHECK_STATUS;
2099 REGEX_ASSERT(result == &destText);
2100 REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2101 REGEX_ASSERT(length == 2);
2102 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2103 utext_close(result);
2104
2105 // Capture Group number out of range.
2106 status = U_ZERO_ERROR;
2107 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2108 status = U_ZERO_ERROR;
2109 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2110 status = U_ZERO_ERROR;
2111 matcher->reset();
2112 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2113
2114 delete matcher;
2115 delete pat;
2116
2117 utext_close(&destText);
2118 utext_close(&input);
2119 utext_close(&re);
2120 }
2121
2122 //
2123 // find
2124 //
2125 {
2126 int32_t flags=0;
2127 UParseError pe;
2128 UErrorCode status=U_ZERO_ERROR;
2129 UText re=UTEXT_INITIALIZER;
2130 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2131 utext_openUTF8(&re, str_abc, -1, &status);
2132
2133 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2134 REGEX_CHECK_STATUS;
2135 UText input = UTEXT_INITIALIZER;
2136 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2137 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2138 // 012345678901234567
2139
2140 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2141 REGEX_CHECK_STATUS;
2142 REGEX_ASSERT(matcher->find());
2143 REGEX_ASSERT(matcher->start(status) == 1);
2144 REGEX_ASSERT(matcher->find());
2145 REGEX_ASSERT(matcher->start(status) == 6);
2146 REGEX_ASSERT(matcher->find());
2147 REGEX_ASSERT(matcher->start(status) == 12);
2148 REGEX_ASSERT(matcher->find() == FALSE);
2149 REGEX_ASSERT(matcher->find() == FALSE);
2150
2151 matcher->reset();
2152 REGEX_ASSERT(matcher->find());
2153 REGEX_ASSERT(matcher->start(status) == 1);
2154
2155 REGEX_ASSERT(matcher->find(0, status));
2156 REGEX_ASSERT(matcher->start(status) == 1);
2157 REGEX_ASSERT(matcher->find(1, status));
2158 REGEX_ASSERT(matcher->start(status) == 1);
2159 REGEX_ASSERT(matcher->find(2, status));
2160 REGEX_ASSERT(matcher->start(status) == 6);
2161 REGEX_ASSERT(matcher->find(12, status));
2162 REGEX_ASSERT(matcher->start(status) == 12);
2163 REGEX_ASSERT(matcher->find(13, status) == FALSE);
2164 REGEX_ASSERT(matcher->find(16, status) == FALSE);
2165 REGEX_ASSERT(matcher->find(17, status) == FALSE);
2166 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2167
2168 status = U_ZERO_ERROR;
2169 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2170 status = U_ZERO_ERROR;
2171 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2172
2173 REGEX_ASSERT(matcher->groupCount() == 0);
2174
2175 delete matcher;
2176 delete pat;
2177
2178 utext_close(&input);
2179 utext_close(&re);
2180 }
2181
2182
2183 //
2184 // find, with \G in pattern (true if at the end of a previous match).
2185 //
2186 {
2187 int32_t flags=0;
2188 UParseError pe;
2189 UErrorCode status=U_ZERO_ERROR;
2190 UText re=UTEXT_INITIALIZER;
2191 const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2192 utext_openUTF8(&re, str_Gabcabc, -1, &status);
2193
2194 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2195
2196 REGEX_CHECK_STATUS;
2197 UText input = UTEXT_INITIALIZER;
2198 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2199 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2200 // 012345678901234567
2201
2202 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2203 REGEX_CHECK_STATUS;
2204 REGEX_ASSERT(matcher->find());
2205 REGEX_ASSERT(matcher->start(status) == 0);
2206 REGEX_ASSERT(matcher->start(1, status) == -1);
2207 REGEX_ASSERT(matcher->start(2, status) == 1);
2208
2209 REGEX_ASSERT(matcher->find());
2210 REGEX_ASSERT(matcher->start(status) == 4);
2211 REGEX_ASSERT(matcher->start(1, status) == 4);
2212 REGEX_ASSERT(matcher->start(2, status) == -1);
2213 REGEX_CHECK_STATUS;
2214
2215 delete matcher;
2216 delete pat;
2217
2218 utext_close(&input);
2219 utext_close(&re);
2220 }
2221
2222 //
2223 // find with zero length matches, match position should bump ahead
2224 // to prevent loops.
2225 //
2226 {
2227 int32_t i;
2228 UErrorCode status=U_ZERO_ERROR;
2229 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
2230 // using an always-true look-ahead.
2231 REGEX_CHECK_STATUS;
2232 UText s = UTEXT_INITIALIZER;
2233 utext_openUTF8(&s, " ", -1, &status);
2234 m.reset(&s);
2235 for (i=0; ; i++) {
2236 if (m.find() == FALSE) {
2237 break;
2238 }
2239 REGEX_ASSERT(m.start(status) == i);
2240 REGEX_ASSERT(m.end(status) == i);
2241 }
2242 REGEX_ASSERT(i==5);
2243
2244 // Check that the bump goes over characters outside the BMP OK
2245 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2246 unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2247 utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2248 m.reset(&s);
2249 for (i=0; ; i+=4) {
2250 if (m.find() == FALSE) {
2251 break;
2252 }
2253 REGEX_ASSERT(m.start(status) == i);
2254 REGEX_ASSERT(m.end(status) == i);
2255 }
2256 REGEX_ASSERT(i==20);
2257
2258 utext_close(&s);
2259 }
2260 {
2261 // find() loop breaking test.
2262 // with pattern of /.?/, should see a series of one char matches, then a single
2263 // match of zero length at the end of the input string.
2264 int32_t i;
2265 UErrorCode status=U_ZERO_ERROR;
2266 RegexMatcher m(".?", 0, status);
2267 REGEX_CHECK_STATUS;
2268 UText s = UTEXT_INITIALIZER;
2269 utext_openUTF8(&s, " ", -1, &status);
2270 m.reset(&s);
2271 for (i=0; ; i++) {
2272 if (m.find() == FALSE) {
2273 break;
2274 }
2275 REGEX_ASSERT(m.start(status) == i);
2276 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2277 }
2278 REGEX_ASSERT(i==5);
2279
2280 utext_close(&s);
2281 }
2282
2283
2284 //
2285 // Matchers with no input string behave as if they had an empty input string.
2286 //
2287
2288 {
2289 UErrorCode status = U_ZERO_ERROR;
2290 RegexMatcher m(".?", 0, status);
2291 REGEX_CHECK_STATUS;
2292 REGEX_ASSERT(m.find());
2293 REGEX_ASSERT(m.start(status) == 0);
2294 REGEX_ASSERT(m.input() == "");
2295 }
2296 {
2297 UErrorCode status = U_ZERO_ERROR;
2298 RegexPattern *p = RegexPattern::compile(".", 0, status);
2299 RegexMatcher *m = p->matcher(status);
2300 REGEX_CHECK_STATUS;
2301
2302 REGEX_ASSERT(m->find() == FALSE);
2303 REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2304 delete m;
2305 delete p;
2306 }
2307
2308 //
2309 // Regions
2310 //
2311 {
2312 UErrorCode status = U_ZERO_ERROR;
2313 UText testPattern = UTEXT_INITIALIZER;
2314 UText testText = UTEXT_INITIALIZER;
2315 regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2316 REGEX_VERBOSE_TEXT(&testPattern);
2317 regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2318 REGEX_VERBOSE_TEXT(&testText);
2319
2320 RegexMatcher m(&testPattern, &testText, 0, status);
2321 REGEX_CHECK_STATUS;
2322 REGEX_ASSERT(m.regionStart() == 0);
2323 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2324 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2325 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2326
2327 m.region(2,4, status);
2328 REGEX_CHECK_STATUS;
2329 REGEX_ASSERT(m.matches(status));
2330 REGEX_ASSERT(m.start(status)==2);
2331 REGEX_ASSERT(m.end(status)==4);
2332 REGEX_CHECK_STATUS;
2333
2334 m.reset();
2335 REGEX_ASSERT(m.regionStart() == 0);
2336 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2337
2338 regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2339 REGEX_VERBOSE_TEXT(&testText);
2340 m.reset(&testText);
2341 REGEX_ASSERT(m.regionStart() == 0);
2342 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2343
2344 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2345 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2346 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2347 REGEX_ASSERT(&m == &m.reset());
2348 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2349
2350 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2351 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2352 REGEX_ASSERT(&m == &m.reset());
2353 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2354
2355 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2356 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2357 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2358 REGEX_ASSERT(&m == &m.reset());
2359 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2360
2361 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2362 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2363 REGEX_ASSERT(&m == &m.reset());
2364 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2365
2366 utext_close(&testText);
2367 utext_close(&testPattern);
2368 }
2369
2370 //
2371 // hitEnd() and requireEnd()
2372 //
2373 {
2374 UErrorCode status = U_ZERO_ERROR;
2375 UText testPattern = UTEXT_INITIALIZER;
2376 UText testText = UTEXT_INITIALIZER;
2377 const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2378 const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2379 utext_openUTF8(&testPattern, str_, -1, &status);
2380 utext_openUTF8(&testText, str_aabb, -1, &status);
2381
2382 RegexMatcher m1(&testPattern, &testText, 0, status);
2383 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2384 REGEX_ASSERT(m1.hitEnd() == TRUE);
2385 REGEX_ASSERT(m1.requireEnd() == FALSE);
2386 REGEX_CHECK_STATUS;
2387
2388 status = U_ZERO_ERROR;
2389 const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2390 utext_openUTF8(&testPattern, str_a, -1, &status);
2391 RegexMatcher m2(&testPattern, &testText, 0, status);
2392 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2393 REGEX_ASSERT(m2.hitEnd() == FALSE);
2394 REGEX_ASSERT(m2.requireEnd() == FALSE);
2395 REGEX_CHECK_STATUS;
2396
2397 status = U_ZERO_ERROR;
2398 const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2399 utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2400 RegexMatcher m3(&testPattern, &testText, 0, status);
2401 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2402 REGEX_ASSERT(m3.hitEnd() == TRUE);
2403 REGEX_ASSERT(m3.requireEnd() == TRUE);
2404 REGEX_CHECK_STATUS;
2405
2406 utext_close(&testText);
2407 utext_close(&testPattern);
2408 }
2409 }
2410
2411
2412 //---------------------------------------------------------------------------
2413 //
2414 // API_Replace_UTF8 API test for class RegexMatcher, testing the
2415 // Replace family of functions.
2416 //
2417 //---------------------------------------------------------------------------
API_Replace_UTF8()2418 void RegexTest::API_Replace_UTF8() {
2419 //
2420 // Replace
2421 //
2422 int32_t flags=0;
2423 UParseError pe;
2424 UErrorCode status=U_ZERO_ERROR;
2425
2426 UText re=UTEXT_INITIALIZER;
2427 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2428 REGEX_VERBOSE_TEXT(&re);
2429 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2430 REGEX_CHECK_STATUS;
2431
2432 char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2433 // 012345678901234567
2434 UText dataText = UTEXT_INITIALIZER;
2435 utext_openUTF8(&dataText, data, -1, &status);
2436 REGEX_CHECK_STATUS;
2437 REGEX_VERBOSE_TEXT(&dataText);
2438 RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2439
2440 //
2441 // Plain vanilla matches.
2442 //
2443 UnicodeString dest;
2444 UText destText = UTEXT_INITIALIZER;
2445 utext_openUnicodeString(&destText, &dest, &status);
2446 UText *result;
2447
2448 UText replText = UTEXT_INITIALIZER;
2449
2450 const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2451 utext_openUTF8(&replText, str_yz, -1, &status);
2452 REGEX_VERBOSE_TEXT(&replText);
2453 result = matcher->replaceFirst(&replText, NULL, status);
2454 REGEX_CHECK_STATUS;
2455 const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2456 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2457 utext_close(result);
2458 result = matcher->replaceFirst(&replText, &destText, status);
2459 REGEX_CHECK_STATUS;
2460 REGEX_ASSERT(result == &destText);
2461 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2462
2463 result = matcher->replaceAll(&replText, NULL, status);
2464 REGEX_CHECK_STATUS;
2465 const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2466 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2467 utext_close(result);
2468
2469 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2470 result = matcher->replaceAll(&replText, &destText, status);
2471 REGEX_CHECK_STATUS;
2472 REGEX_ASSERT(result == &destText);
2473 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2474
2475 //
2476 // Plain vanilla non-matches.
2477 //
2478 const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2479 utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2480 matcher->reset(&dataText);
2481
2482 result = matcher->replaceFirst(&replText, NULL, status);
2483 REGEX_CHECK_STATUS;
2484 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2485 utext_close(result);
2486 result = matcher->replaceFirst(&replText, &destText, status);
2487 REGEX_CHECK_STATUS;
2488 REGEX_ASSERT(result == &destText);
2489 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2490
2491 result = matcher->replaceAll(&replText, NULL, status);
2492 REGEX_CHECK_STATUS;
2493 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2494 utext_close(result);
2495 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2496 result = matcher->replaceAll(&replText, &destText, status);
2497 REGEX_CHECK_STATUS;
2498 REGEX_ASSERT(result == &destText);
2499 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2500
2501 //
2502 // Empty source string
2503 //
2504 utext_openUTF8(&dataText, NULL, 0, &status);
2505 matcher->reset(&dataText);
2506
2507 result = matcher->replaceFirst(&replText, NULL, status);
2508 REGEX_CHECK_STATUS;
2509 REGEX_ASSERT_UTEXT_UTF8("", result);
2510 utext_close(result);
2511 result = matcher->replaceFirst(&replText, &destText, status);
2512 REGEX_CHECK_STATUS;
2513 REGEX_ASSERT(result == &destText);
2514 REGEX_ASSERT_UTEXT_UTF8("", result);
2515
2516 result = matcher->replaceAll(&replText, NULL, status);
2517 REGEX_CHECK_STATUS;
2518 REGEX_ASSERT_UTEXT_UTF8("", result);
2519 utext_close(result);
2520 result = matcher->replaceAll(&replText, &destText, status);
2521 REGEX_CHECK_STATUS;
2522 REGEX_ASSERT(result == &destText);
2523 REGEX_ASSERT_UTEXT_UTF8("", result);
2524
2525 //
2526 // Empty substitution string
2527 //
2528 utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2529 matcher->reset(&dataText);
2530
2531 utext_openUTF8(&replText, NULL, 0, &status);
2532 result = matcher->replaceFirst(&replText, NULL, status);
2533 REGEX_CHECK_STATUS;
2534 const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2535 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2536 utext_close(result);
2537 result = matcher->replaceFirst(&replText, &destText, status);
2538 REGEX_CHECK_STATUS;
2539 REGEX_ASSERT(result == &destText);
2540 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2541
2542 result = matcher->replaceAll(&replText, NULL, status);
2543 REGEX_CHECK_STATUS;
2544 const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2545 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2546 utext_close(result);
2547 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2548 result = matcher->replaceAll(&replText, &destText, status);
2549 REGEX_CHECK_STATUS;
2550 REGEX_ASSERT(result == &destText);
2551 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2552
2553 //
2554 // match whole string
2555 //
2556 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2557 utext_openUTF8(&dataText, str_abc, -1, &status);
2558 matcher->reset(&dataText);
2559
2560 const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2561 utext_openUTF8(&replText, str_xyz, -1, &status);
2562 result = matcher->replaceFirst(&replText, NULL, status);
2563 REGEX_CHECK_STATUS;
2564 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2565 utext_close(result);
2566 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2567 result = matcher->replaceFirst(&replText, &destText, status);
2568 REGEX_CHECK_STATUS;
2569 REGEX_ASSERT(result == &destText);
2570 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2571
2572 result = matcher->replaceAll(&replText, NULL, status);
2573 REGEX_CHECK_STATUS;
2574 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2575 utext_close(result);
2576 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2577 result = matcher->replaceAll(&replText, &destText, status);
2578 REGEX_CHECK_STATUS;
2579 REGEX_ASSERT(result == &destText);
2580 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2581
2582 //
2583 // Capture Group, simple case
2584 //
2585 const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2586 utext_openUTF8(&re, str_add, -1, &status);
2587 RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2588 REGEX_CHECK_STATUS;
2589
2590 const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2591 utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2592 RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2593 REGEX_CHECK_STATUS;
2594
2595 const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2596 utext_openUTF8(&replText, str_11, -1, &status);
2597 result = matcher2->replaceFirst(&replText, NULL, status);
2598 REGEX_CHECK_STATUS;
2599 const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2600 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2601 utext_close(result);
2602 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2603 result = matcher2->replaceFirst(&replText, &destText, status);
2604 REGEX_CHECK_STATUS;
2605 REGEX_ASSERT(result == &destText);
2606 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2607
2608 const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2609 utext_openUTF8(&replText, str_v, -1, &status);
2610 REGEX_VERBOSE_TEXT(&replText);
2611 result = matcher2->replaceFirst(&replText, NULL, status);
2612 REGEX_CHECK_STATUS;
2613 const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2614 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2615 utext_close(result);
2616 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2617 result = matcher2->replaceFirst(&replText, &destText, status);
2618 REGEX_CHECK_STATUS;
2619 REGEX_ASSERT(result == &destText);
2620 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2621
2622 const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2623 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2624 0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2625 utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2626 result = matcher2->replaceFirst(&replText, NULL, status);
2627 REGEX_CHECK_STATUS;
2628 const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2629 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2630 utext_close(result);
2631 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2632 result = matcher2->replaceFirst(&replText, &destText, status);
2633 REGEX_CHECK_STATUS;
2634 REGEX_ASSERT(result == &destText);
2635 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2636
2637 unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2638 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2639 // 012345678901234567890123456
2640 supplDigitChars[22] = 0xF0;
2641 supplDigitChars[23] = 0x9D;
2642 supplDigitChars[24] = 0x9F;
2643 supplDigitChars[25] = 0x8F;
2644 utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2645
2646 result = matcher2->replaceFirst(&replText, NULL, status);
2647 REGEX_CHECK_STATUS;
2648 const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2649 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2650 utext_close(result);
2651 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2652 result = matcher2->replaceFirst(&replText, &destText, status);
2653 REGEX_CHECK_STATUS;
2654 REGEX_ASSERT(result == &destText);
2655 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2656 const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5..." */
2657 utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2658 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2659 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2660 utext_close(result);
2661 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2662 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2663 REGEX_ASSERT(result == &destText);
2664 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2665
2666 //
2667 // Replacement String with \u hex escapes
2668 //
2669 {
2670 const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2671 const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2672 utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2673 utext_openUTF8(&replText, str_u0043, -1, &status);
2674 matcher->reset(&dataText);
2675
2676 result = matcher->replaceAll(&replText, NULL, status);
2677 REGEX_CHECK_STATUS;
2678 const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2679 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2680 utext_close(result);
2681 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2682 result = matcher->replaceAll(&replText, &destText, status);
2683 REGEX_CHECK_STATUS;
2684 REGEX_ASSERT(result == &destText);
2685 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2686 }
2687 {
2688 const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2689 utext_openUTF8(&dataText, str_abc, -1, &status);
2690 const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2691 utext_openUTF8(&replText, str_U00010000, -1, &status);
2692 matcher->reset(&dataText);
2693
2694 unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2695 // 0123456789
2696 expected[2] = 0xF0;
2697 expected[3] = 0x90;
2698 expected[4] = 0x80;
2699 expected[5] = 0x80;
2700
2701 result = matcher->replaceAll(&replText, NULL, status);
2702 REGEX_CHECK_STATUS;
2703 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2704 utext_close(result);
2705 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2706 result = matcher->replaceAll(&replText, &destText, status);
2707 REGEX_CHECK_STATUS;
2708 REGEX_ASSERT(result == &destText);
2709 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2710 }
2711 // TODO: need more thorough testing of capture substitutions.
2712
2713 // Bug 4057
2714 //
2715 {
2716 status = U_ZERO_ERROR;
2717 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2718 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2719 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2720 utext_openUTF8(&re, str_ssee, -1, &status);
2721 utext_openUTF8(&dataText, str_blah, -1, &status);
2722 utext_openUTF8(&replText, str_ooh, -1, &status);
2723
2724 RegexMatcher m(&re, 0, status);
2725 REGEX_CHECK_STATUS;
2726
2727 UnicodeString result;
2728 UText resultText = UTEXT_INITIALIZER;
2729 utext_openUnicodeString(&resultText, &result, &status);
2730
        // Multiple finds do NOT bump up the previous appendReplacement position.
2732 m.reset(&dataText);
2733 m.find();
2734 m.find();
2735 m.appendReplacement(&resultText, &replText, status);
2736 REGEX_CHECK_STATUS;
2737 const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2738 REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2739
2740 // After a reset into the interior of a string, appendReplacement still starts at beginning.
2741 status = U_ZERO_ERROR;
2742 result.truncate(0);
2743 utext_openUnicodeString(&resultText, &result, &status);
2744 m.reset(10, status);
2745 m.find();
2746 m.find();
2747 m.appendReplacement(&resultText, &replText, status);
2748 REGEX_CHECK_STATUS;
2749 const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2750 REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2751
2752 // find() at interior of string, appendReplacement still starts at beginning.
2753 status = U_ZERO_ERROR;
2754 result.truncate(0);
2755 utext_openUnicodeString(&resultText, &result, &status);
2756 m.reset();
2757 m.find(10, status);
2758 m.find();
2759 m.appendReplacement(&resultText, &replText, status);
2760 REGEX_CHECK_STATUS;
2761 const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2762 REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2763
2764 m.appendTail(&resultText, status);
2765 const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2766 REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2767
2768 utext_close(&resultText);
2769 }
2770
2771 delete matcher2;
2772 delete pat2;
2773 delete matcher;
2774 delete pat;
2775
2776 utext_close(&dataText);
2777 utext_close(&replText);
2778 utext_close(&destText);
2779 utext_close(&re);
2780 }
2781
2782
2783 //---------------------------------------------------------------------------
2784 //
2785 // API_Pattern_UTF8 Test that the API for class RegexPattern is
2786 // present and nominally working.
2787 //
2788 //---------------------------------------------------------------------------
API_Pattern_UTF8()2789 void RegexTest::API_Pattern_UTF8() {
2790 RegexPattern pata; // Test default constructor to not crash.
2791 RegexPattern patb;
2792
2793 REGEX_ASSERT(pata == patb);
2794 REGEX_ASSERT(pata == pata);
2795
2796 UText re1 = UTEXT_INITIALIZER;
2797 UText re2 = UTEXT_INITIALIZER;
2798 UErrorCode status = U_ZERO_ERROR;
2799 UParseError pe;
2800
2801 const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2802 const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2803 utext_openUTF8(&re1, str_abcalmz, -1, &status);
2804 utext_openUTF8(&re2, str_def, -1, &status);
2805
2806 RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2807 RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2808 REGEX_CHECK_STATUS;
2809 REGEX_ASSERT(*pat1 == *pat1);
2810 REGEX_ASSERT(*pat1 != pata);
2811
2812 // Assign
2813 patb = *pat1;
2814 REGEX_ASSERT(patb == *pat1);
2815
2816 // Copy Construct
2817 RegexPattern patc(*pat1);
2818 REGEX_ASSERT(patc == *pat1);
2819 REGEX_ASSERT(patb == patc);
2820 REGEX_ASSERT(pat1 != pat2);
2821 patb = *pat2;
2822 REGEX_ASSERT(patb != patc);
2823 REGEX_ASSERT(patb == *pat2);
2824
2825 // Compile with no flags.
2826 RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status);
2827 REGEX_ASSERT(*pat1a == *pat1);
2828
2829 REGEX_ASSERT(pat1a->flags() == 0);
2830
2831 // Compile with different flags should be not equal
2832 RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2833 REGEX_CHECK_STATUS;
2834
2835 REGEX_ASSERT(*pat1b != *pat1a);
2836 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2837 REGEX_ASSERT(pat1a->flags() == 0);
2838 delete pat1b;
2839
2840 // clone
2841 RegexPattern *pat1c = pat1->clone();
2842 REGEX_ASSERT(*pat1c == *pat1);
2843 REGEX_ASSERT(*pat1c != *pat2);
2844
2845 delete pat1c;
2846 delete pat1a;
2847 delete pat1;
2848 delete pat2;
2849
2850 utext_close(&re1);
2851 utext_close(&re2);
2852
2853
2854 //
2855 // Verify that a matcher created from a cloned pattern works.
2856 // (Jitterbug 3423)
2857 //
2858 {
2859 UErrorCode status = U_ZERO_ERROR;
2860 UText pattern = UTEXT_INITIALIZER;
2861 const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2862 utext_openUTF8(&pattern, str_pL, -1, &status);
2863
2864 RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status);
2865 RegexPattern *pClone = pSource->clone();
2866 delete pSource;
2867 RegexMatcher *mFromClone = pClone->matcher(status);
2868 REGEX_CHECK_STATUS;
2869
2870 UText input = UTEXT_INITIALIZER;
2871 const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2872 utext_openUTF8(&input, str_HelloWorld, -1, &status);
2873 mFromClone->reset(&input);
2874 REGEX_ASSERT(mFromClone->find() == TRUE);
2875 REGEX_ASSERT(mFromClone->group(status) == "Hello");
2876 REGEX_ASSERT(mFromClone->find() == TRUE);
2877 REGEX_ASSERT(mFromClone->group(status) == "World");
2878 REGEX_ASSERT(mFromClone->find() == FALSE);
2879 delete mFromClone;
2880 delete pClone;
2881
2882 utext_close(&input);
2883 utext_close(&pattern);
2884 }
2885
2886 //
2887 // matches convenience API
2888 //
2889 {
2890 UErrorCode status = U_ZERO_ERROR;
2891 UText pattern = UTEXT_INITIALIZER;
2892 UText input = UTEXT_INITIALIZER;
2893
2894 const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2895 utext_openUTF8(&input, str_randominput, -1, &status);
2896
2897 const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2898 utext_openUTF8(&pattern, str_dotstar, -1, &status);
2899 REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2900 REGEX_CHECK_STATUS;
2901
2902 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2903 utext_openUTF8(&pattern, str_abc, -1, &status);
2904 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2905 REGEX_CHECK_STATUS;
2906
2907 const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2908 utext_openUTF8(&pattern, str_nput, -1, &status);
2909 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2910 REGEX_CHECK_STATUS;
2911
2912 utext_openUTF8(&pattern, str_randominput, -1, &status);
2913 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2914 REGEX_CHECK_STATUS;
2915
2916 const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2917 utext_openUTF8(&pattern, str_u, -1, &status);
2918 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2919 REGEX_CHECK_STATUS;
2920
2921 utext_openUTF8(&input, str_abc, -1, &status);
2922 utext_openUTF8(&pattern, str_abc, -1, &status);
2923 status = U_INDEX_OUTOFBOUNDS_ERROR;
2924 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2925 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2926
2927 utext_close(&input);
2928 utext_close(&pattern);
2929 }
2930
2931
2932 //
2933 // Split()
2934 //
2935 status = U_ZERO_ERROR;
2936 const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /* + */
2937 utext_openUTF8(&re1, str_spaceplus, -1, &status);
2938 pat1 = RegexPattern::compile(&re1, pe, status);
2939 REGEX_CHECK_STATUS;
2940 UnicodeString fields[10];
2941
2942 int32_t n;
2943 n = pat1->split("Now is the time", fields, 10, status);
2944 REGEX_CHECK_STATUS;
2945 REGEX_ASSERT(n==4);
2946 REGEX_ASSERT(fields[0]=="Now");
2947 REGEX_ASSERT(fields[1]=="is");
2948 REGEX_ASSERT(fields[2]=="the");
2949 REGEX_ASSERT(fields[3]=="time");
2950 REGEX_ASSERT(fields[4]=="");
2951
2952 n = pat1->split("Now is the time", fields, 2, status);
2953 REGEX_CHECK_STATUS;
2954 REGEX_ASSERT(n==2);
2955 REGEX_ASSERT(fields[0]=="Now");
2956 REGEX_ASSERT(fields[1]=="is the time");
2957 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
2958
2959 fields[1] = "*";
2960 status = U_ZERO_ERROR;
2961 n = pat1->split("Now is the time", fields, 1, status);
2962 REGEX_CHECK_STATUS;
2963 REGEX_ASSERT(n==1);
2964 REGEX_ASSERT(fields[0]=="Now is the time");
2965 REGEX_ASSERT(fields[1]=="*");
2966 status = U_ZERO_ERROR;
2967
2968 n = pat1->split(" Now is the time ", fields, 10, status);
2969 REGEX_CHECK_STATUS;
2970 REGEX_ASSERT(n==6);
2971 REGEX_ASSERT(fields[0]=="");
2972 REGEX_ASSERT(fields[1]=="Now");
2973 REGEX_ASSERT(fields[2]=="is");
2974 REGEX_ASSERT(fields[3]=="the");
2975 REGEX_ASSERT(fields[4]=="time");
2976 REGEX_ASSERT(fields[5]=="");
2977 REGEX_ASSERT(fields[6]=="");
2978
2979 fields[2] = "*";
2980 n = pat1->split(" ", fields, 10, status);
2981 REGEX_CHECK_STATUS;
2982 REGEX_ASSERT(n==2);
2983 REGEX_ASSERT(fields[0]=="");
2984 REGEX_ASSERT(fields[1]=="");
2985 REGEX_ASSERT(fields[2]=="*");
2986
2987 fields[0] = "foo";
2988 n = pat1->split("", fields, 10, status);
2989 REGEX_CHECK_STATUS;
2990 REGEX_ASSERT(n==0);
2991 REGEX_ASSERT(fields[0]=="foo");
2992
2993 delete pat1;
2994
2995 // split, with a pattern with (capture)
2996 regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
2997 pat1 = RegexPattern::compile(&re1, pe, status);
2998 REGEX_CHECK_STATUS;
2999
3000 status = U_ZERO_ERROR;
3001 fields[6] = fields[7] = "*";
3002 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
3003 REGEX_CHECK_STATUS;
3004 REGEX_ASSERT(n==7);
3005 REGEX_ASSERT(fields[0]=="");
3006 REGEX_ASSERT(fields[1]=="a");
3007 REGEX_ASSERT(fields[2]=="Now is ");
3008 REGEX_ASSERT(fields[3]=="b");
3009 REGEX_ASSERT(fields[4]=="the time");
3010 REGEX_ASSERT(fields[5]=="c");
3011 REGEX_ASSERT(fields[6]=="");
3012 REGEX_ASSERT(fields[7]=="*");
3013 REGEX_ASSERT(status==U_ZERO_ERROR);
3014
3015 fields[6] = fields[7] = "*";
3016 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
3017 REGEX_CHECK_STATUS;
3018 REGEX_ASSERT(n==7);
3019 REGEX_ASSERT(fields[0]==" ");
3020 REGEX_ASSERT(fields[1]=="a");
3021 REGEX_ASSERT(fields[2]=="Now is ");
3022 REGEX_ASSERT(fields[3]=="b");
3023 REGEX_ASSERT(fields[4]=="the time");
3024 REGEX_ASSERT(fields[5]=="c");
3025 REGEX_ASSERT(fields[6]=="");
3026 REGEX_ASSERT(fields[7]=="*");
3027
3028 status = U_ZERO_ERROR;
3029 fields[6] = "foo";
3030 n = pat1->split(" <a>Now is <b>the time<c> ", fields, 6, status);
3031 REGEX_CHECK_STATUS;
3032 REGEX_ASSERT(n==6);
3033 REGEX_ASSERT(fields[0]==" ");
3034 REGEX_ASSERT(fields[1]=="a");
3035 REGEX_ASSERT(fields[2]=="Now is ");
3036 REGEX_ASSERT(fields[3]=="b");
3037 REGEX_ASSERT(fields[4]=="the time");
3038 REGEX_ASSERT(fields[5]==" ");
3039 REGEX_ASSERT(fields[6]=="foo");
3040
3041 status = U_ZERO_ERROR;
3042 fields[5] = "foo";
3043 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
3044 REGEX_CHECK_STATUS;
3045 REGEX_ASSERT(n==5);
3046 REGEX_ASSERT(fields[0]==" ");
3047 REGEX_ASSERT(fields[1]=="a");
3048 REGEX_ASSERT(fields[2]=="Now is ");
3049 REGEX_ASSERT(fields[3]=="b");
3050 REGEX_ASSERT(fields[4]=="the time<c>");
3051 REGEX_ASSERT(fields[5]=="foo");
3052
3053 status = U_ZERO_ERROR;
3054 fields[5] = "foo";
3055 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
3056 REGEX_CHECK_STATUS;
3057 REGEX_ASSERT(n==5);
3058 REGEX_ASSERT(fields[0]==" ");
3059 REGEX_ASSERT(fields[1]=="a");
3060 REGEX_ASSERT(fields[2]=="Now is ");
3061 REGEX_ASSERT(fields[3]=="b");
3062 REGEX_ASSERT(fields[4]=="the time");
3063 REGEX_ASSERT(fields[5]=="foo");
3064
3065 status = U_ZERO_ERROR;
3066 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
3067 REGEX_CHECK_STATUS;
3068 REGEX_ASSERT(n==4);
3069 REGEX_ASSERT(fields[0]==" ");
3070 REGEX_ASSERT(fields[1]=="a");
3071 REGEX_ASSERT(fields[2]=="Now is ");
3072 REGEX_ASSERT(fields[3]=="the time<c>");
3073 status = U_ZERO_ERROR;
3074 delete pat1;
3075
3076 regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3077 pat1 = RegexPattern::compile(&re1, pe, status);
3078 REGEX_CHECK_STATUS;
3079 n = pat1->split("1-10,20", fields, 10, status);
3080 REGEX_CHECK_STATUS;
3081 REGEX_ASSERT(n==5);
3082 REGEX_ASSERT(fields[0]=="1");
3083 REGEX_ASSERT(fields[1]=="-");
3084 REGEX_ASSERT(fields[2]=="10");
3085 REGEX_ASSERT(fields[3]==",");
3086 REGEX_ASSERT(fields[4]=="20");
3087 delete pat1;
3088
3089
3090 //
3091 // split of a UText based string, with library allocating output UTexts.
3092 //
3093 {
3094 status = U_ZERO_ERROR;
3095 RegexMatcher matcher(UnicodeString("(:)"), 0, status);
3096 UnicodeString stringToSplit("first:second:third");
3097 UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
3098 REGEX_CHECK_STATUS;
3099
3100 UText *splits[10] = {NULL};
3101 int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
3102 REGEX_CHECK_STATUS;
3103 REGEX_ASSERT(numFields == 5);
3104 REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
3105 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
3106 REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
3107 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
3108 REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
3109 REGEX_ASSERT(splits[5] == NULL);
3110
3111 for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
3112 if (splits[i]) {
3113 utext_close(splits[i]);
3114 splits[i] = NULL;
3115 }
3116 }
3117 utext_close(textToSplit);
3118 }
3119
3120
3121 //
3122 // RegexPattern::pattern() and patternText()
3123 //
3124 pat1 = new RegexPattern();
3125 REGEX_ASSERT(pat1->pattern() == "");
3126 REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3127 delete pat1;
3128 const char *helloWorldInvariant = "(Hello, world)*";
3129 regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3130 pat1 = RegexPattern::compile(&re1, pe, status);
3131 REGEX_CHECK_STATUS;
3132 REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
3133 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3134 delete pat1;
3135
3136 utext_close(&re1);
3137 }
3138
3139
3140 //---------------------------------------------------------------------------
3141 //
3142 // Extended A more thorough check for features of regex patterns
3143 // The test cases are in a separate data file,
3144 // source/tests/testdata/regextst.txt
3145 // A description of the test data format is included in that file.
3146 //
3147 //---------------------------------------------------------------------------
3148
3149 const char *
getPath(char buffer[2048],const char * filename)3150 RegexTest::getPath(char buffer[2048], const char *filename) {
3151 UErrorCode status=U_ZERO_ERROR;
3152 const char *testDataDirectory = IntlTest::getSourceTestData(status);
3153 if (U_FAILURE(status)) {
3154 errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3155 return NULL;
3156 }
3157
3158 strcpy(buffer, testDataDirectory);
3159 strcat(buffer, filename);
3160 return buffer;
3161 }
3162
Extended()3163 void RegexTest::Extended() {
3164 char tdd[2048];
3165 const char *srcPath;
3166 UErrorCode status = U_ZERO_ERROR;
3167 int32_t lineNum = 0;
3168
3169 //
3170 // Open and read the test data file.
3171 //
3172 srcPath=getPath(tdd, "regextst.txt");
3173 if(srcPath==NULL) {
3174 return; /* something went wrong, error already output */
3175 }
3176
3177 int32_t len;
3178 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3179 if (U_FAILURE(status)) {
3180 return; /* something went wrong, error already output */
3181 }
3182
3183 //
3184 // Put the test data into a UnicodeString
3185 //
3186 UnicodeString testString(FALSE, testData, len);
3187
3188 RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3189 RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3190 RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3191
3192 RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3193 UnicodeString testPattern; // The pattern for test from the test file.
3194 UnicodeString testFlags; // the flags for a test.
3195 UnicodeString matchString; // The marked up string to be used as input
3196
3197 if (U_FAILURE(status)){
3198 dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
3199 delete [] testData;
3200 return;
3201 }
3202
3203 //
3204 // Loop over the test data file, once per line.
3205 //
3206 while (lineMat.find()) {
3207 lineNum++;
3208 if (U_FAILURE(status)) {
3209 errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3210 }
3211
3212 status = U_ZERO_ERROR;
3213 UnicodeString testLine = lineMat.group(1, status);
3214 if (testLine.length() == 0) {
3215 continue;
3216 }
3217
3218 //
3219 // Parse the test line. Skip blank and comment only lines.
3220 // Separate out the three main fields - pattern, flags, target.
3221 //
3222
3223 commentMat.reset(testLine);
3224 if (commentMat.lookingAt(status)) {
3225 // This line is a comment, or blank.
3226 continue;
3227 }
3228
3229 //
3230 // Pull out the pattern field, remove it from the test file line.
3231 //
3232 quotedStuffMat.reset(testLine);
3233 if (quotedStuffMat.lookingAt(status)) {
3234 testPattern = quotedStuffMat.group(2, status);
3235 testLine.remove(0, quotedStuffMat.end(0, status));
3236 } else {
3237 errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3238 continue;
3239 }
3240
3241
3242 //
3243 // Pull out the flags from the test file line.
3244 //
3245 flagsMat.reset(testLine);
3246 flagsMat.lookingAt(status); // Will always match, possibly an empty string.
3247 testFlags = flagsMat.group(1, status);
3248 if (flagsMat.group(2, status).length() > 0) {
3249 errln("Bad Match flag at line %d. Scanning %c\n",
3250 lineNum, flagsMat.group(2, status).charAt(0));
3251 continue;
3252 }
3253 testLine.remove(0, flagsMat.end(0, status));
3254
3255 //
3256 // Pull out the match string, as a whole.
3257 // We'll process the <tags> later.
3258 //
3259 quotedStuffMat.reset(testLine);
3260 if (quotedStuffMat.lookingAt(status)) {
3261 matchString = quotedStuffMat.group(2, status);
3262 testLine.remove(0, quotedStuffMat.end(0, status));
3263 } else {
3264 errln("Bad match string at test file line %d", lineNum);
3265 continue;
3266 }
3267
3268 //
3269 // The only thing left from the input line should be an optional trailing comment.
3270 //
3271 commentMat.reset(testLine);
3272 if (commentMat.lookingAt(status) == FALSE) {
3273 errln("Line %d: unexpected characters at end of test line.", lineNum);
3274 continue;
3275 }
3276
3277 //
3278 // Run the test
3279 //
3280 regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3281 }
3282
3283 delete [] testData;
3284
3285 }
3286
3287
3288
3289 //---------------------------------------------------------------------------
3290 //
3291 // regex_find(pattern, flags, inputString, lineNumber)
3292 //
3293 // Function to run a single test from the Extended (data driven) tests.
3294 // See file test/testdata/regextst.txt for a description of the
3295 // pattern and inputString fields, and the allowed flags.
3296 // lineNumber is the source line in regextst.txt of the test.
3297 //
3298 //---------------------------------------------------------------------------
3299
3300
3301 // Set a value into a UVector at position specified by a decimal number in
3302 // a UnicodeString. This is a utility function needed by the actual test function,
3303 // which follows.
set(UVector & vec,int32_t val,UnicodeString index)3304 static void set(UVector &vec, int32_t val, UnicodeString index) {
3305 UErrorCode status=U_ZERO_ERROR;
3306 int32_t idx = 0;
3307 for (int32_t i=0; i<index.length(); i++) {
3308 int32_t d=u_charDigitValue(index.charAt(i));
3309 if (d<0) {return;}
3310 idx = idx*10 + d;
3311 }
3312 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3313 vec.setElementAt(val, idx);
3314 }
3315
setInt(UVector & vec,int32_t val,int32_t idx)3316 static void setInt(UVector &vec, int32_t val, int32_t idx) {
3317 UErrorCode status=U_ZERO_ERROR;
3318 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3319 vec.setElementAt(val, idx);
3320 }
3321
utextOffsetToNative(UText * utext,int32_t unistrOffset,int32_t & nativeIndex)3322 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3323 {
3324 UBool couldFind = TRUE;
3325 UTEXT_SETNATIVEINDEX(utext, 0);
3326 int32_t i = 0;
3327 while (i < unistrOffset) {
3328 UChar32 c = UTEXT_NEXT32(utext);
3329 if (c != U_SENTINEL) {
3330 i += U16_LENGTH(c);
3331 } else {
3332 couldFind = FALSE;
3333 break;
3334 }
3335 }
3336 nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3337 return couldFind;
3338 }
3339
3340
regex_find(const UnicodeString & pattern,const UnicodeString & flags,const UnicodeString & inputString,const char * srcPath,int32_t line)3341 void RegexTest::regex_find(const UnicodeString &pattern,
3342 const UnicodeString &flags,
3343 const UnicodeString &inputString,
3344 const char *srcPath,
3345 int32_t line) {
3346 UnicodeString unEscapedInput;
3347 UnicodeString deTaggedInput;
3348
3349 int32_t patternUTF8Length, inputUTF8Length;
3350 char *patternChars = NULL, *inputChars = NULL;
3351 UText patternText = UTEXT_INITIALIZER;
3352 UText inputText = UTEXT_INITIALIZER;
3353 UConverter *UTF8Converter = NULL;
3354
3355 UErrorCode status = U_ZERO_ERROR;
3356 UParseError pe;
3357 RegexPattern *parsePat = NULL;
3358 RegexMatcher *parseMatcher = NULL;
3359 RegexPattern *callerPattern = NULL, *UTF8Pattern = NULL;
3360 RegexMatcher *matcher = NULL, *UTF8Matcher = NULL;
3361 UVector groupStarts(status);
3362 UVector groupEnds(status);
3363 UVector groupStartsUTF8(status);
3364 UVector groupEndsUTF8(status);
3365 UBool isMatch = FALSE, isUTF8Match = FALSE;
3366 UBool failed = FALSE;
3367 int32_t numFinds;
3368 int32_t i;
3369 UBool useMatchesFunc = FALSE;
3370 UBool useLookingAtFunc = FALSE;
3371 int32_t regionStart = -1;
3372 int32_t regionEnd = -1;
3373 int32_t regionStartUTF8 = -1;
3374 int32_t regionEndUTF8 = -1;
3375
3376
3377 //
3378 // Compile the caller's pattern
3379 //
3380 uint32_t bflags = 0;
3381 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag
3382 bflags |= UREGEX_CASE_INSENSITIVE;
3383 }
3384 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag
3385 bflags |= UREGEX_COMMENTS;
3386 }
3387 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag
3388 bflags |= UREGEX_DOTALL;
3389 }
3390 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag
3391 bflags |= UREGEX_MULTILINE;
3392 }
3393
3394 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3395 bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3396 }
3397 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3398 bflags |= UREGEX_UNIX_LINES;
3399 }
3400 if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3401 bflags |= UREGEX_LITERAL;
3402 }
3403
3404
3405 callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3406 if (status != U_ZERO_ERROR) {
3407 #if UCONFIG_NO_BREAK_ITERATION==1
3408 // 'v' test flag means that the test pattern should not compile if ICU was configured
3409 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3410 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3411 goto cleanupAndReturn;
3412 }
3413 #endif
3414 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3415 // Expected pattern compilation error.
3416 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3417 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3418 }
3419 goto cleanupAndReturn;
3420 } else {
3421 // Unexpected pattern compilation error.
3422 dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3423 goto cleanupAndReturn;
3424 }
3425 }
3426
3427 UTF8Converter = ucnv_open("UTF8", &status);
3428 ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3429
3430 patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3431 status = U_ZERO_ERROR; // buffer overflow
3432 patternChars = new char[patternUTF8Length+1];
3433 pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3434 utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3435
3436 if (status == U_ZERO_ERROR) {
3437 UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3438
3439 if (status != U_ZERO_ERROR) {
3440 #if UCONFIG_NO_BREAK_ITERATION==1
3441 // 'v' test flag means that the test pattern should not compile if ICU was configured
3442 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3443 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3444 goto cleanupAndReturn;
3445 }
3446 #endif
3447 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3448 // Expected pattern compilation error.
3449 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3450 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3451 }
3452 goto cleanupAndReturn;
3453 } else {
3454 // Unexpected pattern compilation error.
3455 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3456 goto cleanupAndReturn;
3457 }
3458 }
3459 }
3460
3461 if (UTF8Pattern == NULL) {
3462 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3463 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3464 status = U_ZERO_ERROR;
3465 }
3466
3467 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag
3468 callerPattern->dumpPattern();
3469 }
3470
3471 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag
3472 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3473 goto cleanupAndReturn;
3474 }
3475
3476
3477 //
3478 // Number of times find() should be called on the test string, default to 1
3479 //
3480 numFinds = 1;
3481 for (i=2; i<=9; i++) {
3482 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
3483 if (numFinds != 1) {
3484 errln("Line %d: more than one digit flag. Scanning %d.", line, i);
3485 goto cleanupAndReturn;
3486 }
3487 numFinds = i;
3488 }
3489 }
3490
3491 // 'M' flag. Use matches() instead of find()
3492 if (flags.indexOf((UChar)0x4d) >= 0) {
3493 useMatchesFunc = TRUE;
3494 }
3495 if (flags.indexOf((UChar)0x4c) >= 0) {
3496 useLookingAtFunc = TRUE;
3497 }
3498
3499 //
3500 // Find the tags in the input data, remove them, and record the group boundary
3501 // positions.
3502 //
3503 parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3504 if (!assertSuccess(WHERE, status) ) {
3505 goto cleanupAndReturn;
3506 }
3507
3508 unEscapedInput = inputString.unescape();
3509 parseMatcher = parsePat->matcher(unEscapedInput, status);
3510 if (!assertSuccess(WHERE, status) ) {
3511 goto cleanupAndReturn;
3512 }
3513 while(parseMatcher->find()) {
3514 parseMatcher->appendReplacement(deTaggedInput, "", status);
3515 REGEX_CHECK_STATUS;
3516 UnicodeString groupNum = parseMatcher->group(2, status);
3517 if (groupNum == "r") {
3518 // <r> or </r>, a region specification within the string
3519 if (parseMatcher->group(1, status) == "/") {
3520 regionEnd = deTaggedInput.length();
3521 } else {
3522 regionStart = deTaggedInput.length();
3523 }
3524 } else {
3525 // <digits> or </digits>, a group match boundary tag.
3526 if (parseMatcher->group(1, status) == "/") {
3527 set(groupEnds, deTaggedInput.length(), groupNum);
3528 } else {
3529 set(groupStarts, deTaggedInput.length(), groupNum);
3530 }
3531 }
3532 }
3533 parseMatcher->appendTail(deTaggedInput);
3534
3535 if (groupStarts.size() != groupEnds.size()) {
3536 errln("Error at line %d: mismatched <n> group tags in expected results.", line);
3537 failed = true;
3538 goto cleanupAndReturn;
3539 }
3540 if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3541 errln("mismatched <r> tags");
3542 failed = TRUE;
3543 goto cleanupAndReturn;
3544 }
3545
3546 //
3547 // Configure the matcher according to the flags specified with this test.
3548 //
3549 matcher = callerPattern->matcher(deTaggedInput, status);
3550 REGEX_CHECK_STATUS_L(line);
3551 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
3552 matcher->setTrace(TRUE);
3553 }
3554
3555 if (UTF8Pattern != NULL) {
3556 inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3557 status = U_ZERO_ERROR; // buffer overflow
3558 inputChars = new char[inputUTF8Length+1];
3559 deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3560 utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3561
3562 if (status == U_ZERO_ERROR) {
3563 UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3564 REGEX_CHECK_STATUS_L(line);
3565 }
3566
3567 if (UTF8Matcher == NULL) {
3568 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3569 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3570 status = U_ZERO_ERROR;
3571 }
3572 }
3573
3574 //
3575 // Generate native indices for UTF8 versions of region and capture group info
3576 //
3577 if (UTF8Matcher != NULL) {
3578 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
3579 UTF8Matcher->setTrace(TRUE);
3580 }
3581 if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3582 if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3583
3584 // Fill out the native index UVector info.
3585 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3586 for (i=0; i<groupStarts.size(); i++) {
3587 int32_t start = groupStarts.elementAti(i);
3588 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3589 if (start >= 0) {
3590 int32_t startUTF8;
3591 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3592 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line, i, start);
3593 failed = TRUE;
3594 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3595 }
3596 setInt(groupStartsUTF8, startUTF8, i);
3597 }
3598
3599 int32_t end = groupEnds.elementAti(i);
3600 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3601 if (end >= 0) {
3602 int32_t endUTF8;
3603 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3604 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line, i, end);
3605 failed = TRUE;
3606 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3607 }
3608 setInt(groupEndsUTF8, endUTF8, i);
3609 }
3610 }
3611 }
3612
3613 if (regionStart>=0) {
3614 matcher->region(regionStart, regionEnd, status);
3615 REGEX_CHECK_STATUS_L(line);
3616 if (UTF8Matcher != NULL) {
3617 UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3618 REGEX_CHECK_STATUS_L(line);
3619 }
3620 }
3621 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag
3622 matcher->useAnchoringBounds(FALSE);
3623 if (UTF8Matcher != NULL) {
3624 UTF8Matcher->useAnchoringBounds(FALSE);
3625 }
3626 }
3627 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag
3628 matcher->useTransparentBounds(TRUE);
3629 if (UTF8Matcher != NULL) {
3630 UTF8Matcher->useTransparentBounds(TRUE);
3631 }
3632 }
3633
3634
3635
3636 //
3637 // Do a find on the de-tagged input using the caller's pattern
3638 // TODO: error on count>1 and not find().
3639 // error on both matches() and lookingAt().
3640 //
3641 for (i=0; i<numFinds; i++) {
3642 if (useMatchesFunc) {
3643 isMatch = matcher->matches(status);
3644 if (UTF8Matcher != NULL) {
3645 isUTF8Match = UTF8Matcher->matches(status);
3646 }
3647 } else if (useLookingAtFunc) {
3648 isMatch = matcher->lookingAt(status);
3649 if (UTF8Matcher != NULL) {
3650 isUTF8Match = UTF8Matcher->lookingAt(status);
3651 }
3652 } else {
3653 isMatch = matcher->find();
3654 if (UTF8Matcher != NULL) {
3655 isUTF8Match = UTF8Matcher->find();
3656 }
3657 }
3658 }
3659 matcher->setTrace(FALSE);
3660 if (UTF8Matcher) {
3661 UTF8Matcher->setTrace(FALSE);
3662 }
3663 if (U_FAILURE(status)) {
3664 errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3665 }
3666
3667 //
3668 // Match up the groups from the find() with the groups from the tags
3669 //
3670
3671 // number of tags should match number of groups from find operation.
3672 // matcher->groupCount does not include group 0, the entire match, hence the +1.
3673 // G option in test means that capture group data is not available in the
3674 // expected results, so the check needs to be suppressed.
3675 if (isMatch == FALSE && groupStarts.size() != 0) {
3676 dataerrln("Error at line %d: Match expected, but none found.", line);
3677 failed = TRUE;
3678 goto cleanupAndReturn;
3679 } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3680 errln("Error at line %d: Match expected, but none found. (UTF8)", line);
3681 failed = TRUE;
3682 goto cleanupAndReturn;
3683 }
3684 if (isMatch && groupStarts.size() == 0) {
3685 errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status));
3686 failed = TRUE;
3687 }
3688 if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) {
3689 errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status));
3690 failed = TRUE;
3691 }
3692
3693 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3694 // Only check for match / no match. Don't check capture groups.
3695 goto cleanupAndReturn;
3696 }
3697
3698 REGEX_CHECK_STATUS_L(line);
3699 for (i=0; i<=matcher->groupCount(); i++) {
3700 int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3701 int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3702 if (matcher->start(i, status) != expectedStart) {
3703 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
3704 line, i, expectedStart, matcher->start(i, status));
3705 failed = TRUE;
3706 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3707 } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3708 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
3709 line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3710 failed = TRUE;
3711 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3712 }
3713
3714 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3715 int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3716 if (matcher->end(i, status) != expectedEnd) {
3717 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
3718 line, i, expectedEnd, matcher->end(i, status));
3719 failed = TRUE;
3720 // Error on end position; keep going; real error is probably yet to come as group
3721 // end positions work from end of the input data towards the front.
3722 } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3723 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
3724 line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3725 failed = TRUE;
3726 // Error on end position; keep going; real error is probably yet to come as group
3727 // end positions work from end of the input data towards the front.
3728 }
3729 }
3730 if ( matcher->groupCount()+1 < groupStarts.size()) {
3731 errln("Error at line %d: Expected %d capture groups, found %d.",
3732 line, groupStarts.size()-1, matcher->groupCount());
3733 failed = TRUE;
3734 }
3735 else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3736 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3737 line, groupStarts.size()-1, UTF8Matcher->groupCount());
3738 failed = TRUE;
3739 }
3740
3741 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3742 matcher->requireEnd() == TRUE) {
3743 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line);
3744 failed = TRUE;
3745 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3746 UTF8Matcher->requireEnd() == TRUE) {
3747 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line);
3748 failed = TRUE;
3749 }
3750
3751 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3752 matcher->requireEnd() == FALSE) {
3753 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line);
3754 failed = TRUE;
3755 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false
3756 UTF8Matcher->requireEnd() == FALSE) {
3757 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line);
3758 failed = TRUE;
3759 }
3760
3761 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3762 matcher->hitEnd() == TRUE) {
3763 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line);
3764 failed = TRUE;
3765 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3766 UTF8Matcher->hitEnd() == TRUE) {
3767 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line);
3768 failed = TRUE;
3769 }
3770
3771 if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3772 matcher->hitEnd() == FALSE) {
3773 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line);
3774 failed = TRUE;
3775 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3776 UTF8Matcher->hitEnd() == FALSE) {
3777 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line);
3778 failed = TRUE;
3779 }
3780
3781
3782 cleanupAndReturn:
3783 if (failed) {
3784 infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" "
3785 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");
3786 // callerPattern->dump();
3787 }
3788 delete parseMatcher;
3789 delete parsePat;
3790 delete UTF8Matcher;
3791 delete UTF8Pattern;
3792 delete matcher;
3793 delete callerPattern;
3794
3795 utext_close(&inputText);
3796 delete[] inputChars;
3797 utext_close(&patternText);
3798 delete[] patternChars;
3799 ucnv_close(UTF8Converter);
3800 }
3801
3802
3803
3804
3805 //---------------------------------------------------------------------------
3806 //
3807 // Errors Check for error handling in patterns.
3808 //
3809 //---------------------------------------------------------------------------
Errors()3810 void RegexTest::Errors() {
3811 // \escape sequences that aren't implemented yet.
3812 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3813
3814 // Missing close parentheses
3815 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3816 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3817 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3818
3819 // Extra close paren
3820 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3821 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3822 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3823
3824 // Look-ahead, Look-behind
3825 // TODO: add tests for unbounded length look-behinds.
3826 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
3827
3828 // Attempt to use non-default flags
3829 {
3830 UParseError pe;
3831 UErrorCode status = U_ZERO_ERROR;
3832 int32_t flags = UREGEX_CANON_EQ |
3833 UREGEX_COMMENTS | UREGEX_DOTALL |
3834 UREGEX_MULTILINE;
3835 RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3836 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3837 delete pat1;
3838 }
3839
3840
3841 // Quantifiers are allowed only after something that can be quantified.
3842 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3843 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3844 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3845
3846 // Mal-formed {min,max} quantifiers
3847 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3848 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3849 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3850 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3851 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3852 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3853 REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan
3854 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format
3855 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3856
3857 // Ticket 5389
3858 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3859
3860 // Invalid Back Reference \0
3861 // For ICU 3.8 and earlier
3862 // For ICU versions newer than 3.8, \0 introduces an octal escape.
3863 //
3864 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3865
3866 }
3867
3868
3869 //-------------------------------------------------------------------------------
3870 //
3871 // Read a text data file, convert it to UChars, and return the data
3872 // in one big UChar * buffer, which the caller must delete.
3873 //
3874 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int32_t & ulen,const char * defEncoding,UErrorCode & status)3875 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3876 const char *defEncoding, UErrorCode &status) {
3877 UChar *retPtr = NULL;
3878 char *fileBuf = NULL;
3879 UConverter* conv = NULL;
3880 FILE *f = NULL;
3881
3882 ulen = 0;
3883 if (U_FAILURE(status)) {
3884 return retPtr;
3885 }
3886
3887 //
3888 // Open the file.
3889 //
3890 f = fopen(fileName, "rb");
3891 if (f == 0) {
3892 dataerrln("Error opening test data file %s\n", fileName);
3893 status = U_FILE_ACCESS_ERROR;
3894 return NULL;
3895 }
3896 //
3897 // Read it in
3898 //
3899 int32_t fileSize;
3900 int32_t amt_read;
3901
3902 fseek( f, 0, SEEK_END);
3903 fileSize = ftell(f);
3904 fileBuf = new char[fileSize];
3905 fseek(f, 0, SEEK_SET);
3906 amt_read = static_cast<int32_t>(fread(fileBuf, 1, fileSize, f));
3907 if (amt_read != fileSize || fileSize <= 0) {
3908 errln("Error reading test data file.");
3909 goto cleanUpAndReturn;
3910 }
3911
3912 //
3913 // Look for a Unicode Signature (BOM) on the data just read
3914 //
3915 int32_t signatureLength;
3916 const char * fileBufC;
3917 const char* encoding;
3918
3919 fileBufC = fileBuf;
3920 encoding = ucnv_detectUnicodeSignature(
3921 fileBuf, fileSize, &signatureLength, &status);
3922 if(encoding!=NULL ){
3923 fileBufC += signatureLength;
3924 fileSize -= signatureLength;
3925 } else {
3926 encoding = defEncoding;
3927 if (strcmp(encoding, "utf-8") == 0) {
3928 errln("file %s is missing its BOM", fileName);
3929 }
3930 }
3931
3932 //
3933 // Open a converter to take the rule file to UTF-16
3934 //
3935 conv = ucnv_open(encoding, &status);
3936 if (U_FAILURE(status)) {
3937 goto cleanUpAndReturn;
3938 }
3939
3940 //
3941 // Convert the rules to UChar.
3942 // Preflight first to determine required buffer size.
3943 //
3944 ulen = ucnv_toUChars(conv,
3945 NULL, // dest,
3946 0, // destCapacity,
3947 fileBufC,
3948 fileSize,
3949 &status);
3950 if (status == U_BUFFER_OVERFLOW_ERROR) {
3951 // Buffer Overflow is expected from the preflight operation.
3952 status = U_ZERO_ERROR;
3953
3954 retPtr = new UChar[ulen+1];
3955 ucnv_toUChars(conv,
3956 retPtr, // dest,
3957 ulen+1,
3958 fileBufC,
3959 fileSize,
3960 &status);
3961 }
3962
3963 cleanUpAndReturn:
3964 fclose(f);
3965 delete[] fileBuf;
3966 ucnv_close(conv);
3967 if (U_FAILURE(status)) {
3968 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3969 delete []retPtr;
3970 retPtr = 0;
3971 ulen = 0;
3972 }
3973 return retPtr;
3974 }
3975
3976
3977 //-------------------------------------------------------------------------------
3978 //
3979 // PerlTests - Run Perl's regular expression tests
3980 // The input file for this test is re_tests, the standard regular
3981 // expression test data distributed with the Perl source code.
3982 //
3983 // Here is Perl's description of the test data file:
3984 //
3985 // # The tests are in a separate file 't/op/re_tests'.
3986 // # Each line in that file is a separate test.
3987 // # There are five columns, separated by tabs.
3988 // #
3989 // # Column 1 contains the pattern, optionally enclosed in C<''>.
3990 // # Modifiers can be put after the closing C<'>.
3991 // #
3992 // # Column 2 contains the string to be matched.
3993 // #
3994 // # Column 3 contains the expected result:
3995 // # y expect a match
3996 // # n expect no match
3997 // # c expect an error
3998 // # B test exposes a known bug in Perl, should be skipped
3999 // # b test exposes a known bug in Perl, should be skipped if noamp
4000 // #
4001 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
4002 // #
4003 // # Column 4 contains a string, usually C<$&>.
4004 // #
4005 // # Column 5 contains the expected result of double-quote
4006 // # interpolating that string after the match, or start of error message.
4007 // #
4008 // # Column 6, if present, contains a reason why the test is skipped.
4009 // # This is printed with "skipped", for harness to pick up.
4010 // #
4011 // # \n in the tests are interpolated, as are variables of the form ${\w+}.
4012 // #
4013 // # If you want to add a regular expression test that can't be expressed
4014 // # in this format, don't add it here: put it in op/pat.t instead.
4015 //
4016 // For ICU, if field 3 contains an 'i', the test will be skipped.
// The test exposes some known incompatibility between ICU and Perl regexps.
4018 // (The i is in addition to whatever was there before.)
4019 //
4020 //-------------------------------------------------------------------------------
PerlTests()4021 void RegexTest::PerlTests() {
4022 char tdd[2048];
4023 const char *srcPath;
4024 UErrorCode status = U_ZERO_ERROR;
4025 UParseError pe;
4026
4027 //
4028 // Open and read the test data file.
4029 //
4030 srcPath=getPath(tdd, "re_tests.txt");
4031 if(srcPath==NULL) {
4032 return; /* something went wrong, error already output */
4033 }
4034
4035 int32_t len;
4036 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4037 if (U_FAILURE(status)) {
4038 return; /* something went wrong, error already output */
4039 }
4040
4041 //
4042 // Put the test data into a UnicodeString
4043 //
4044 UnicodeString testDataString(FALSE, testData, len);
4045
4046 //
4047 // Regex to break the input file into lines, and strip the new lines.
4048 // One line per match, capture group one is the desired data.
4049 //
4050 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4051 if (U_FAILURE(status)) {
4052 dataerrln("RegexPattern::compile() error");
4053 return;
4054 }
4055 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4056
4057 //
4058 // Regex to split a test file line into fields.
4059 // There are six fields, separated by tabs.
4060 //
4061 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4062
4063 //
4064 // Regex to identify test patterns with flag settings, and to separate them.
4065 // Test patterns with flags look like 'pattern'i
4066 // Test patterns without flags are not quoted: pattern
4067 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4068 //
4069 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4070 RegexMatcher* flagMat = flagPat->matcher(status);
4071
4072 //
4073 // The Perl tests reference several perl-isms, which are evaluated/substituted
4074 // in the test data. Not being perl, this must be done explicitly. Here
4075 // are string constants and REs for these constructs.
4076 //
4077 UnicodeString nulnulSrc("${nulnul}");
4078 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4079 nulnul = nulnul.unescape();
4080
4081 UnicodeString ffffSrc("${ffff}");
4082 UnicodeString ffff("\\uffff", -1, US_INV);
4083 ffff = ffff.unescape();
4084
4085 // regexp for $-[0], $+[2], etc.
4086 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4087 RegexMatcher *groupsMat = groupsPat->matcher(status);
4088
4089 // regexp for $0, $1, $2, etc.
4090 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4091 RegexMatcher *cgMat = cgPat->matcher(status);
4092
4093
4094 //
4095 // Main Loop for the Perl Tests, runs once per line from the
4096 // test data file.
4097 //
4098 int32_t lineNum = 0;
4099 int32_t skippedUnimplementedCount = 0;
4100 while (lineMat->find()) {
4101 lineNum++;
4102
4103 //
4104 // Get a line, break it into its fields, do the Perl
4105 // variable substitutions.
4106 //
4107 UnicodeString line = lineMat->group(1, status);
4108 UnicodeString fields[7];
4109 fieldPat->split(line, fields, 7, status);
4110
4111 flagMat->reset(fields[0]);
4112 flagMat->matches(status);
4113 UnicodeString pattern = flagMat->group(2, status);
4114 pattern.findAndReplace("${bang}", "!");
4115 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4116 pattern.findAndReplace(ffffSrc, ffff);
4117
4118 //
4119 // Identify patterns that include match flag settings,
4120 // split off the flags, remove the extra quotes.
4121 //
4122 UnicodeString flagStr = flagMat->group(3, status);
4123 if (U_FAILURE(status)) {
4124 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4125 return;
4126 }
4127 int32_t flags = 0;
4128 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4129 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4130 const UChar UChar_m = 0x6d;
4131 const UChar UChar_x = 0x78;
4132 const UChar UChar_y = 0x79;
4133 if (flagStr.indexOf(UChar_i) != -1) {
4134 flags |= UREGEX_CASE_INSENSITIVE;
4135 }
4136 if (flagStr.indexOf(UChar_m) != -1) {
4137 flags |= UREGEX_MULTILINE;
4138 }
4139 if (flagStr.indexOf(UChar_x) != -1) {
4140 flags |= UREGEX_COMMENTS;
4141 }
4142
4143 //
4144 // Compile the test pattern.
4145 //
4146 status = U_ZERO_ERROR;
4147 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4148 if (status == U_REGEX_UNIMPLEMENTED) {
4149 //
4150 // Test of a feature that is planned for ICU, but not yet implemented.
4151 // skip the test.
4152 skippedUnimplementedCount++;
4153 delete testPat;
4154 status = U_ZERO_ERROR;
4155 continue;
4156 }
4157
4158 if (U_FAILURE(status)) {
4159 // Some tests are supposed to generate errors.
4160 // Only report an error for tests that are supposed to succeed.
4161 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4162 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4163 {
4164 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4165 }
4166 status = U_ZERO_ERROR;
4167 delete testPat;
4168 continue;
4169 }
4170
4171 if (fields[2].indexOf(UChar_i) >= 0) {
4172 // ICU should skip this test.
4173 delete testPat;
4174 continue;
4175 }
4176
4177 if (fields[2].indexOf(UChar_c) >= 0) {
4178 // This pattern should have caused a compilation error, but didn't/
4179 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4180 delete testPat;
4181 continue;
4182 }
4183
4184 //
4185 // replace the Perl variables that appear in some of the
4186 // match data strings.
4187 //
4188 UnicodeString matchString = fields[1];
4189 matchString.findAndReplace(nulnulSrc, nulnul);
4190 matchString.findAndReplace(ffffSrc, ffff);
4191
4192 // Replace any \n in the match string with an actual new-line char.
4193 // Don't do full unescape, as this unescapes more than Perl does, which
4194 // causes other spurious failures in the tests.
4195 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4196
4197
4198
4199 //
4200 // Run the test, check for expected match/don't match result.
4201 //
4202 RegexMatcher *testMat = testPat->matcher(matchString, status);
4203 UBool found = testMat->find();
4204 UBool expected = FALSE;
4205 if (fields[2].indexOf(UChar_y) >=0) {
4206 expected = TRUE;
4207 }
4208 if (expected != found) {
4209 errln("line %d: Expected %smatch, got %smatch",
4210 lineNum, expected?"":"no ", found?"":"no " );
4211 delete testMat;
4212 delete testPat;
4213 continue;
4214 }
4215
4216 // Don't try to check expected results if there is no match.
4217 // (Some have stuff in the expected fields)
4218 if (!found) {
4219 delete testMat;
4220 delete testPat;
4221 continue;
4222 }
4223
4224 //
4225 // Interpret the Perl expression from the fourth field of the data file,
4226 // building up an ICU string from the results of the ICU match.
4227 // The Perl expression will contain references to the results of
4228 // a regex match, including the matched string, capture group strings,
4229 // group starting and ending indicies, etc.
4230 //
4231 UnicodeString resultString;
4232 UnicodeString perlExpr = fields[3];
4233 #if SUPPORT_MUTATING_INPUT_STRING
4234 groupsMat->reset(perlExpr);
4235 cgMat->reset(perlExpr);
4236 #endif
4237
4238 while (perlExpr.length() > 0) {
4239 #if !SUPPORT_MUTATING_INPUT_STRING
4240 // Perferred usage. Reset after any modification to input string.
4241 groupsMat->reset(perlExpr);
4242 cgMat->reset(perlExpr);
4243 #endif
4244
4245 if (perlExpr.startsWith("$&")) {
4246 resultString.append(testMat->group(status));
4247 perlExpr.remove(0, 2);
4248 }
4249
4250 else if (groupsMat->lookingAt(status)) {
4251 // $-[0] $+[2] etc.
4252 UnicodeString digitString = groupsMat->group(2, status);
4253 int32_t t = 0;
4254 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4255 UnicodeString plusOrMinus = groupsMat->group(1, status);
4256 int32_t matchPosition;
4257 if (plusOrMinus.compare("+") == 0) {
4258 matchPosition = testMat->end(groupNum, status);
4259 } else {
4260 matchPosition = testMat->start(groupNum, status);
4261 }
4262 if (matchPosition != -1) {
4263 ICU_Utility::appendNumber(resultString, matchPosition);
4264 }
4265 perlExpr.remove(0, groupsMat->end(status));
4266 }
4267
4268 else if (cgMat->lookingAt(status)) {
4269 // $1, $2, $3, etc.
4270 UnicodeString digitString = cgMat->group(1, status);
4271 int32_t t = 0;
4272 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4273 if (U_SUCCESS(status)) {
4274 resultString.append(testMat->group(groupNum, status));
4275 status = U_ZERO_ERROR;
4276 }
4277 perlExpr.remove(0, cgMat->end(status));
4278 }
4279
4280 else if (perlExpr.startsWith("@-")) {
4281 int32_t i;
4282 for (i=0; i<=testMat->groupCount(); i++) {
4283 if (i>0) {
4284 resultString.append(" ");
4285 }
4286 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4287 }
4288 perlExpr.remove(0, 2);
4289 }
4290
4291 else if (perlExpr.startsWith("@+")) {
4292 int32_t i;
4293 for (i=0; i<=testMat->groupCount(); i++) {
4294 if (i>0) {
4295 resultString.append(" ");
4296 }
4297 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4298 }
4299 perlExpr.remove(0, 2);
4300 }
4301
4302 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4303 // or as an escaped sequence (e.g. \n)
4304 if (perlExpr.length() > 1) {
4305 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4306 }
4307 UChar c = perlExpr.charAt(0);
4308 switch (c) {
4309 case 'n': c = '\n'; break;
4310 // add any other escape sequences that show up in the test expected results.
4311 }
4312 resultString.append(c);
4313 perlExpr.remove(0, 1);
4314 }
4315
4316 else {
4317 // Any characters from the perl expression that we don't explicitly
4318 // recognize before here are assumed to be literals and copied
4319 // as-is to the expected results.
4320 resultString.append(perlExpr.charAt(0));
4321 perlExpr.remove(0, 1);
4322 }
4323
4324 if (U_FAILURE(status)) {
4325 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4326 break;
4327 }
4328 }
4329
4330 //
4331 // Expected Results Compare
4332 //
4333 UnicodeString expectedS(fields[4]);
4334 expectedS.findAndReplace(nulnulSrc, nulnul);
4335 expectedS.findAndReplace(ffffSrc, ffff);
4336 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4337
4338
4339 if (expectedS.compare(resultString) != 0) {
4340 err("Line %d: Incorrect perl expression results.", lineNum);
4341 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4342 }
4343
4344 delete testMat;
4345 delete testPat;
4346 }
4347
4348 //
4349 // All done. Clean up allocated stuff.
4350 //
4351 delete cgMat;
4352 delete cgPat;
4353
4354 delete groupsMat;
4355 delete groupsPat;
4356
4357 delete flagMat;
4358 delete flagPat;
4359
4360 delete lineMat;
4361 delete linePat;
4362
4363 delete fieldPat;
4364 delete [] testData;
4365
4366
4367 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4368
4369 }
4370
4371
4372 //-------------------------------------------------------------------------------
4373 //
4374 // PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
4375 // (instead of using UnicodeStrings) to test the alternate engine.
4376 // The input file for this test is re_tests, the standard regular
4377 // expression test data distributed with the Perl source code.
4378 // See PerlTests() for more information.
4379 //
4380 //-------------------------------------------------------------------------------
PerlTestsUTF8()4381 void RegexTest::PerlTestsUTF8() {
4382 char tdd[2048];
4383 const char *srcPath;
4384 UErrorCode status = U_ZERO_ERROR;
4385 UParseError pe;
4386 LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4387 UText patternText = UTEXT_INITIALIZER;
4388 char *patternChars = NULL;
4389 int32_t patternLength;
4390 int32_t patternCapacity = 0;
4391 UText inputText = UTEXT_INITIALIZER;
4392 char *inputChars = NULL;
4393 int32_t inputLength;
4394 int32_t inputCapacity = 0;
4395
4396 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4397
4398 //
4399 // Open and read the test data file.
4400 //
4401 srcPath=getPath(tdd, "re_tests.txt");
4402 if(srcPath==NULL) {
4403 return; /* something went wrong, error already output */
4404 }
4405
4406 int32_t len;
4407 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4408 if (U_FAILURE(status)) {
4409 return; /* something went wrong, error already output */
4410 }
4411
4412 //
4413 // Put the test data into a UnicodeString
4414 //
4415 UnicodeString testDataString(FALSE, testData, len);
4416
4417 //
4418 // Regex to break the input file into lines, and strip the new lines.
4419 // One line per match, capture group one is the desired data.
4420 //
4421 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4422 if (U_FAILURE(status)) {
4423 dataerrln("RegexPattern::compile() error");
4424 return;
4425 }
4426 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4427
4428 //
4429 // Regex to split a test file line into fields.
4430 // There are six fields, separated by tabs.
4431 //
4432 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4433
4434 //
4435 // Regex to identify test patterns with flag settings, and to separate them.
4436 // Test patterns with flags look like 'pattern'i
4437 // Test patterns without flags are not quoted: pattern
4438 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4439 //
4440 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4441 RegexMatcher* flagMat = flagPat->matcher(status);
4442
4443 //
4444 // The Perl tests reference several perl-isms, which are evaluated/substituted
4445 // in the test data. Not being perl, this must be done explicitly. Here
4446 // are string constants and REs for these constructs.
4447 //
4448 UnicodeString nulnulSrc("${nulnul}");
4449 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4450 nulnul = nulnul.unescape();
4451
4452 UnicodeString ffffSrc("${ffff}");
4453 UnicodeString ffff("\\uffff", -1, US_INV);
4454 ffff = ffff.unescape();
4455
4456 // regexp for $-[0], $+[2], etc.
4457 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4458 RegexMatcher *groupsMat = groupsPat->matcher(status);
4459
4460 // regexp for $0, $1, $2, etc.
4461 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4462 RegexMatcher *cgMat = cgPat->matcher(status);
4463
4464
4465 //
4466 // Main Loop for the Perl Tests, runs once per line from the
4467 // test data file.
4468 //
4469 int32_t lineNum = 0;
4470 int32_t skippedUnimplementedCount = 0;
4471 while (lineMat->find()) {
4472 lineNum++;
4473
4474 //
4475 // Get a line, break it into its fields, do the Perl
4476 // variable substitutions.
4477 //
4478 UnicodeString line = lineMat->group(1, status);
4479 UnicodeString fields[7];
4480 fieldPat->split(line, fields, 7, status);
4481
4482 flagMat->reset(fields[0]);
4483 flagMat->matches(status);
4484 UnicodeString pattern = flagMat->group(2, status);
4485 pattern.findAndReplace("${bang}", "!");
4486 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4487 pattern.findAndReplace(ffffSrc, ffff);
4488
4489 //
4490 // Identify patterns that include match flag settings,
4491 // split off the flags, remove the extra quotes.
4492 //
4493 UnicodeString flagStr = flagMat->group(3, status);
4494 if (U_FAILURE(status)) {
4495 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4496 return;
4497 }
4498 int32_t flags = 0;
4499 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4500 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4501 const UChar UChar_m = 0x6d;
4502 const UChar UChar_x = 0x78;
4503 const UChar UChar_y = 0x79;
4504 if (flagStr.indexOf(UChar_i) != -1) {
4505 flags |= UREGEX_CASE_INSENSITIVE;
4506 }
4507 if (flagStr.indexOf(UChar_m) != -1) {
4508 flags |= UREGEX_MULTILINE;
4509 }
4510 if (flagStr.indexOf(UChar_x) != -1) {
4511 flags |= UREGEX_COMMENTS;
4512 }
4513
4514 //
4515 // Put the pattern in a UTF-8 UText
4516 //
4517 status = U_ZERO_ERROR;
4518 patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4519 if (status == U_BUFFER_OVERFLOW_ERROR) {
4520 status = U_ZERO_ERROR;
4521 delete[] patternChars;
4522 patternCapacity = patternLength + 1;
4523 patternChars = new char[patternCapacity];
4524 pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4525 }
4526 utext_openUTF8(&patternText, patternChars, patternLength, &status);
4527
4528 //
4529 // Compile the test pattern.
4530 //
4531 RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4532 if (status == U_REGEX_UNIMPLEMENTED) {
4533 //
4534 // Test of a feature that is planned for ICU, but not yet implemented.
4535 // skip the test.
4536 skippedUnimplementedCount++;
4537 delete testPat;
4538 status = U_ZERO_ERROR;
4539 continue;
4540 }
4541
4542 if (U_FAILURE(status)) {
4543 // Some tests are supposed to generate errors.
4544 // Only report an error for tests that are supposed to succeed.
4545 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4546 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4547 {
4548 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4549 }
4550 status = U_ZERO_ERROR;
4551 delete testPat;
4552 continue;
4553 }
4554
4555 if (fields[2].indexOf(UChar_i) >= 0) {
4556 // ICU should skip this test.
4557 delete testPat;
4558 continue;
4559 }
4560
4561 if (fields[2].indexOf(UChar_c) >= 0) {
4562 // This pattern should have caused a compilation error, but didn't/
4563 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4564 delete testPat;
4565 continue;
4566 }
4567
4568
4569 //
4570 // replace the Perl variables that appear in some of the
4571 // match data strings.
4572 //
4573 UnicodeString matchString = fields[1];
4574 matchString.findAndReplace(nulnulSrc, nulnul);
4575 matchString.findAndReplace(ffffSrc, ffff);
4576
4577 // Replace any \n in the match string with an actual new-line char.
4578 // Don't do full unescape, as this unescapes more than Perl does, which
4579 // causes other spurious failures in the tests.
4580 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4581
4582 //
4583 // Put the input in a UTF-8 UText
4584 //
4585 status = U_ZERO_ERROR;
4586 inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4587 if (status == U_BUFFER_OVERFLOW_ERROR) {
4588 status = U_ZERO_ERROR;
4589 delete[] inputChars;
4590 inputCapacity = inputLength + 1;
4591 inputChars = new char[inputCapacity];
4592 matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4593 }
4594 utext_openUTF8(&inputText, inputChars, inputLength, &status);
4595
4596 //
4597 // Run the test, check for expected match/don't match result.
4598 //
4599 RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4600 UBool found = testMat->find();
4601 UBool expected = FALSE;
4602 if (fields[2].indexOf(UChar_y) >=0) {
4603 expected = TRUE;
4604 }
4605 if (expected != found) {
4606 errln("line %d: Expected %smatch, got %smatch",
4607 lineNum, expected?"":"no ", found?"":"no " );
4608 delete testMat;
4609 delete testPat;
4610 continue;
4611 }
4612
4613 // Don't try to check expected results if there is no match.
4614 // (Some have stuff in the expected fields)
4615 if (!found) {
4616 delete testMat;
4617 delete testPat;
4618 continue;
4619 }
4620
4621 //
4622 // Interpret the Perl expression from the fourth field of the data file,
4623 // building up an ICU string from the results of the ICU match.
4624 // The Perl expression will contain references to the results of
4625 // a regex match, including the matched string, capture group strings,
4626 // group starting and ending indicies, etc.
4627 //
4628 UnicodeString resultString;
4629 UnicodeString perlExpr = fields[3];
4630
4631 while (perlExpr.length() > 0) {
4632 groupsMat->reset(perlExpr);
4633 cgMat->reset(perlExpr);
4634
4635 if (perlExpr.startsWith("$&")) {
4636 resultString.append(testMat->group(status));
4637 perlExpr.remove(0, 2);
4638 }
4639
4640 else if (groupsMat->lookingAt(status)) {
4641 // $-[0] $+[2] etc.
4642 UnicodeString digitString = groupsMat->group(2, status);
4643 int32_t t = 0;
4644 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4645 UnicodeString plusOrMinus = groupsMat->group(1, status);
4646 int32_t matchPosition;
4647 if (plusOrMinus.compare("+") == 0) {
4648 matchPosition = testMat->end(groupNum, status);
4649 } else {
4650 matchPosition = testMat->start(groupNum, status);
4651 }
4652 if (matchPosition != -1) {
4653 ICU_Utility::appendNumber(resultString, matchPosition);
4654 }
4655 perlExpr.remove(0, groupsMat->end(status));
4656 }
4657
4658 else if (cgMat->lookingAt(status)) {
4659 // $1, $2, $3, etc.
4660 UnicodeString digitString = cgMat->group(1, status);
4661 int32_t t = 0;
4662 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4663 if (U_SUCCESS(status)) {
4664 resultString.append(testMat->group(groupNum, status));
4665 status = U_ZERO_ERROR;
4666 }
4667 perlExpr.remove(0, cgMat->end(status));
4668 }
4669
4670 else if (perlExpr.startsWith("@-")) {
4671 int32_t i;
4672 for (i=0; i<=testMat->groupCount(); i++) {
4673 if (i>0) {
4674 resultString.append(" ");
4675 }
4676 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4677 }
4678 perlExpr.remove(0, 2);
4679 }
4680
4681 else if (perlExpr.startsWith("@+")) {
4682 int32_t i;
4683 for (i=0; i<=testMat->groupCount(); i++) {
4684 if (i>0) {
4685 resultString.append(" ");
4686 }
4687 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4688 }
4689 perlExpr.remove(0, 2);
4690 }
4691
4692 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4693 // or as an escaped sequence (e.g. \n)
4694 if (perlExpr.length() > 1) {
4695 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4696 }
4697 UChar c = perlExpr.charAt(0);
4698 switch (c) {
4699 case 'n': c = '\n'; break;
4700 // add any other escape sequences that show up in the test expected results.
4701 }
4702 resultString.append(c);
4703 perlExpr.remove(0, 1);
4704 }
4705
4706 else {
4707 // Any characters from the perl expression that we don't explicitly
4708 // recognize before here are assumed to be literals and copied
4709 // as-is to the expected results.
4710 resultString.append(perlExpr.charAt(0));
4711 perlExpr.remove(0, 1);
4712 }
4713
4714 if (U_FAILURE(status)) {
4715 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4716 break;
4717 }
4718 }
4719
4720 //
4721 // Expected Results Compare
4722 //
4723 UnicodeString expectedS(fields[4]);
4724 expectedS.findAndReplace(nulnulSrc, nulnul);
4725 expectedS.findAndReplace(ffffSrc, ffff);
4726 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4727
4728
4729 if (expectedS.compare(resultString) != 0) {
4730 err("Line %d: Incorrect perl expression results.", lineNum);
4731 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4732 }
4733
4734 delete testMat;
4735 delete testPat;
4736 }
4737
4738 //
4739 // All done. Clean up allocated stuff.
4740 //
4741 delete cgMat;
4742 delete cgPat;
4743
4744 delete groupsMat;
4745 delete groupsPat;
4746
4747 delete flagMat;
4748 delete flagPat;
4749
4750 delete lineMat;
4751 delete linePat;
4752
4753 delete fieldPat;
4754 delete [] testData;
4755
4756 utext_close(&patternText);
4757 utext_close(&inputText);
4758
4759 delete [] patternChars;
4760 delete [] inputChars;
4761
4762
4763 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4764
4765 }
4766
4767
4768 //--------------------------------------------------------------
4769 //
4770 // Bug6149 Verify limits to heap expansion for backtrack stack.
4771 // Use this pattern,
4772 // "(a?){1,8000000}"
//        Note: the upper bound was originally unbounded, but that form now has loop-breaking enabled.
4774 // This test is likely to be fragile, as further optimizations stop
4775 // more cases of pointless looping in the match engine.
4776 //
4777 //---------------------------------------------------------------
Bug6149()4778 void RegexTest::Bug6149() {
4779 UnicodeString pattern("(a?){1,8000000}");
4780 UnicodeString s("xyz");
4781 uint32_t flags = 0;
4782 UErrorCode status = U_ZERO_ERROR;
4783
4784 RegexMatcher matcher(pattern, s, flags, status);
4785 UBool result = false;
4786 REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4787 REGEX_ASSERT(result == FALSE);
4788 }
4789
4790
4791 //
4792 // Callbacks() Test the callback function.
4793 // When set, callbacks occur periodically during matching operations,
4794 // giving the application code the ability to abort the operation
//               before its normal completion.
4796 //
4797
4798 struct callBackContext {
4799 RegexTest *test;
4800 int32_t maxCalls;
4801 int32_t numCalls;
4802 int32_t lastSteps;
resetcallBackContext4803 void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;}
4804 };
4805
4806 U_CDECL_BEGIN
4807 static UBool U_CALLCONV
testCallBackFn(const void * context,int32_t steps)4808 testCallBackFn(const void *context, int32_t steps) {
4809 callBackContext *info = (callBackContext *)context;
4810 if (info->lastSteps+1 != steps) {
4811 info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps);
4812 }
4813 info->lastSteps = steps;
4814 info->numCalls++;
4815 return (info->numCalls < info->maxCalls);
4816 }
4817 U_CDECL_END
4818
Callbacks()4819 void RegexTest::Callbacks() {
4820 {
4821 // Getter returns NULLs if no callback has been set
4822
4823 // The variables that the getter will fill in.
4824 // Init to non-null values so that the action of the getter can be seen.
4825 const void *returnedContext = &returnedContext;
4826 URegexMatchCallback *returnedFn = &testCallBackFn;
4827
4828 UErrorCode status = U_ZERO_ERROR;
4829 RegexMatcher matcher("x", 0, status);
4830 REGEX_CHECK_STATUS;
4831 matcher.getMatchCallback(returnedFn, returnedContext, status);
4832 REGEX_CHECK_STATUS;
4833 REGEX_ASSERT(returnedFn == NULL);
4834 REGEX_ASSERT(returnedContext == NULL);
4835 }
4836
4837 {
4838 // Set and Get work
4839 callBackContext cbInfo = {this, 0, 0, 0};
4840 const void *returnedContext;
4841 URegexMatchCallback *returnedFn;
4842 UErrorCode status = U_ZERO_ERROR;
4843 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.
4844 REGEX_CHECK_STATUS;
4845 matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4846 REGEX_CHECK_STATUS;
4847 matcher.getMatchCallback(returnedFn, returnedContext, status);
4848 REGEX_CHECK_STATUS;
4849 REGEX_ASSERT(returnedFn == testCallBackFn);
4850 REGEX_ASSERT(returnedContext == &cbInfo);
4851
4852 // A short-running match shouldn't invoke the callback
4853 status = U_ZERO_ERROR;
4854 cbInfo.reset(1);
4855 UnicodeString s = "xxx";
4856 matcher.reset(s);
4857 REGEX_ASSERT(matcher.matches(status));
4858 REGEX_CHECK_STATUS;
4859 REGEX_ASSERT(cbInfo.numCalls == 0);
4860
4861 // A medium-length match that runs long enough to invoke the
4862 // callback, but not so long that the callback aborts it.
4863 status = U_ZERO_ERROR;
4864 cbInfo.reset(4);
4865 s = "aaaaaaaaaaaaaaaaaaab";
4866 matcher.reset(s);
4867 REGEX_ASSERT(matcher.matches(status)==FALSE);
4868 REGEX_CHECK_STATUS;
4869 REGEX_ASSERT(cbInfo.numCalls > 0);
4870
4871 // A longer running match that the callback function will abort.
4872 status = U_ZERO_ERROR;
4873 cbInfo.reset(4);
4874 s = "aaaaaaaaaaaaaaaaaaaaaaab";
4875 matcher.reset(s);
4876 REGEX_ASSERT(matcher.matches(status)==FALSE);
4877 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4878 REGEX_ASSERT(cbInfo.numCalls == 4);
4879
4880 // A longer running find that the callback function will abort.
4881 status = U_ZERO_ERROR;
4882 cbInfo.reset(4);
4883 s = "aaaaaaaaaaaaaaaaaaaaaaab";
4884 matcher.reset(s);
4885 REGEX_ASSERT(matcher.find(status)==FALSE);
4886 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4887 REGEX_ASSERT(cbInfo.numCalls == 4);
4888 }
4889
4890
4891 }
4892
4893
4894 //
4895 // FindProgressCallbacks() Test the find "progress" callback function.
//               When set, the find progress callback will be invoked during find operations
//               after each return from a match attempt, giving the application the opportunity
//               to terminate a long-running find operation before its normal completion.
4899 //
4900
4901 struct progressCallBackContext {
4902 RegexTest *test;
4903 int64_t lastIndex;
4904 int32_t maxCalls;
4905 int32_t numCalls;
resetprogressCallBackContext4906 void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;}
4907 };
4908
4909 // call-back function for find().
4910 // Return TRUE to continue the find().
4911 // Return FALSE to stop the find().
4912 U_CDECL_BEGIN
4913 static UBool U_CALLCONV
testProgressCallBackFn(const void * context,int64_t matchIndex)4914 testProgressCallBackFn(const void *context, int64_t matchIndex) {
4915 progressCallBackContext *info = (progressCallBackContext *)context;
4916 info->numCalls++;
4917 info->lastIndex = matchIndex;
4918 // info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4919 return (info->numCalls < info->maxCalls);
4920 }
4921 U_CDECL_END
4922
FindProgressCallbacks()4923 void RegexTest::FindProgressCallbacks() {
4924 {
4925 // Getter returns NULLs if no callback has been set
4926
4927 // The variables that the getter will fill in.
4928 // Init to non-null values so that the action of the getter can be seen.
4929 const void *returnedContext = &returnedContext;
4930 URegexFindProgressCallback *returnedFn = &testProgressCallBackFn;
4931
4932 UErrorCode status = U_ZERO_ERROR;
4933 RegexMatcher matcher("x", 0, status);
4934 REGEX_CHECK_STATUS;
4935 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4936 REGEX_CHECK_STATUS;
4937 REGEX_ASSERT(returnedFn == NULL);
4938 REGEX_ASSERT(returnedContext == NULL);
4939 }
4940
4941 {
4942 // Set and Get work
4943 progressCallBackContext cbInfo = {this, 0, 0, 0};
4944 const void *returnedContext;
4945 URegexFindProgressCallback *returnedFn;
4946 UErrorCode status = U_ZERO_ERROR;
4947 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
4948 REGEX_CHECK_STATUS;
4949 matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4950 REGEX_CHECK_STATUS;
4951 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4952 REGEX_CHECK_STATUS;
4953 REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4954 REGEX_ASSERT(returnedContext == &cbInfo);
4955
4956 // A find that matches on the initial position does NOT invoke the callback.
4957 status = U_ZERO_ERROR;
4958 cbInfo.reset(100);
4959 UnicodeString s = "aaxxx";
4960 matcher.reset(s);
4961 #if 0
4962 matcher.setTrace(TRUE);
4963 #endif
4964 REGEX_ASSERT(matcher.find(0, status));
4965 REGEX_CHECK_STATUS;
4966 REGEX_ASSERT(cbInfo.numCalls == 0);
4967
4968 // A medium running find() that causes matcher.find() to invoke our callback for each index,
4969 // but not so many times that we interrupt the operation.
4970 status = U_ZERO_ERROR;
4971 s = "aaaaaaaaaaaaaaaaaaab";
4972 cbInfo.reset(s.length()); // Some upper limit for number of calls that is greater than size of our input string
4973 matcher.reset(s);
4974 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4975 REGEX_CHECK_STATUS;
4976 REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4977
4978 // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4979 status = U_ZERO_ERROR;
4980 UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4981 cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string
4982 matcher.reset(s1);
4983 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4984 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4985 REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4986
4987 // Now a match that will succeed, but after an interruption
4988 status = U_ZERO_ERROR;
4989 UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4990 cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string
4991 matcher.reset(s2);
4992 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4993 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4994 // Now retry the match from where left off
4995 cbInfo.maxCalls = 100; // No callback limit
4996 status = U_ZERO_ERROR;
4997 REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
4998 REGEX_CHECK_STATUS;
4999 }
5000
5001
5002 }
5003
5004
5005 //---------------------------------------------------------------------------
5006 //
5007 // PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
5008 // UTexts. The pure-C implementation of UText
5009 // has no mutable backing stores, but we can
5010 // use UnicodeString here to test the functionality.
5011 //
5012 //---------------------------------------------------------------------------
PreAllocatedUTextCAPI()5013 void RegexTest::PreAllocatedUTextCAPI () {
5014 UErrorCode status = U_ZERO_ERROR;
5015 URegularExpression *re;
5016 UText patternText = UTEXT_INITIALIZER;
5017 UnicodeString buffer;
5018 UText bufferText = UTEXT_INITIALIZER;
5019
5020 utext_openUnicodeString(&bufferText, &buffer, &status);
5021
5022 /*
5023 * getText() and getUText()
5024 */
5025 {
5026 UText text1 = UTEXT_INITIALIZER;
5027 UText text2 = UTEXT_INITIALIZER;
5028 UChar text2Chars[20];
5029 UText *resultText;
5030
5031 status = U_ZERO_ERROR;
5032 regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
5033 regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
5034 u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
5035 utext_openUChars(&text2, text2Chars, -1, &status);
5036
5037 regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
5038 re = uregex_openUText(&patternText, 0, NULL, &status);
5039
5040 /* First set a UText */
5041 uregex_setUText(re, &text1, &status);
5042 resultText = uregex_getUText(re, &bufferText, &status);
5043 REGEX_CHECK_STATUS;
5044 REGEX_ASSERT(resultText == &bufferText);
5045 utext_setNativeIndex(resultText, 0);
5046 utext_setNativeIndex(&text1, 0);
5047 REGEX_ASSERT(testUTextEqual(resultText, &text1));
5048
5049 resultText = uregex_getUText(re, &bufferText, &status);
5050 REGEX_CHECK_STATUS;
5051 REGEX_ASSERT(resultText == &bufferText);
5052 utext_setNativeIndex(resultText, 0);
5053 utext_setNativeIndex(&text1, 0);
5054 REGEX_ASSERT(testUTextEqual(resultText, &text1));
5055
5056 /* Then set a UChar * */
5057 uregex_setText(re, text2Chars, 7, &status);
5058 resultText = uregex_getUText(re, &bufferText, &status);
5059 REGEX_CHECK_STATUS;
5060 REGEX_ASSERT(resultText == &bufferText);
5061 utext_setNativeIndex(resultText, 0);
5062 utext_setNativeIndex(&text2, 0);
5063 REGEX_ASSERT(testUTextEqual(resultText, &text2));
5064
5065 uregex_close(re);
5066 utext_close(&text1);
5067 utext_close(&text2);
5068 }
5069
5070 /*
5071 * group()
5072 */
5073 {
5074 UChar text1[80];
5075 UText *actual;
5076 UBool result;
5077 int64_t length = 0;
5078
5079 u_uastrncpy(text1, "noise abc interior def, and this is off the end", UPRV_LENGTHOF(text1));
5080 // 012345678901234567890123456789012345678901234567
5081 // 0 1 2 3 4
5082
5083 status = U_ZERO_ERROR;
5084 re = uregex_openC("abc(.*?)def", 0, NULL, &status);
5085 REGEX_CHECK_STATUS;
5086
5087 uregex_setText(re, text1, -1, &status);
5088 result = uregex_find(re, 0, &status);
5089 REGEX_ASSERT(result==TRUE);
5090
5091 /* Capture Group 0, the full match. Should succeed. "abc interior def" */
5092 status = U_ZERO_ERROR;
5093 actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
5094 REGEX_CHECK_STATUS;
5095 REGEX_ASSERT(actual == &bufferText);
5096 REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
5097 REGEX_ASSERT(length == 16);
5098 REGEX_ASSERT(utext_nativeLength(actual) == 47);
5099
5100 /* Capture group #1. Should succeed, matching " interior ". */
5101 status = U_ZERO_ERROR;
5102 actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
5103 REGEX_CHECK_STATUS;
5104 REGEX_ASSERT(actual == &bufferText);
5105 REGEX_ASSERT(utext_getNativeIndex(actual) == 9); // position of " interior "
5106 REGEX_ASSERT(length == 10);
5107 REGEX_ASSERT(utext_nativeLength(actual) == 47);
5108
5109 /* Capture group out of range. Error. */
5110 status = U_ZERO_ERROR;
5111 actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
5112 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5113 REGEX_ASSERT(actual == &bufferText);
5114 uregex_close(re);
5115
5116 }
5117
5118 /*
5119 * replaceFirst()
5120 */
5121 {
5122 UChar text1[80];
5123 UChar text2[80];
5124 UText replText = UTEXT_INITIALIZER;
5125 UText *result;
5126 status = U_ZERO_ERROR;
5127 utext_openUnicodeString(&bufferText, &buffer, &status);
5128
5129 status = U_ZERO_ERROR;
5130 u_uastrncpy(text1, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1));
5131 u_uastrncpy(text2, "No match here.", UPRV_LENGTHOF(text2)/2);
5132 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5133
5134 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5135 REGEX_CHECK_STATUS;
5136
5137 /* Normal case, with match */
5138 uregex_setText(re, text1, -1, &status);
5139 REGEX_CHECK_STATUS;
5140 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5141 REGEX_CHECK_STATUS;
5142 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5143 REGEX_CHECK_STATUS;
5144 REGEX_ASSERT(result == &bufferText);
5145 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5146
5147 /* No match. Text should copy to output with no changes. */
5148 uregex_setText(re, text2, -1, &status);
5149 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5150 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5151 REGEX_CHECK_STATUS;
5152 REGEX_ASSERT(result == &bufferText);
5153 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5154
5155 /* Unicode escapes */
5156 uregex_setText(re, text1, -1, &status);
5157 regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
5158 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5159 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5160 REGEX_CHECK_STATUS;
5161 REGEX_ASSERT(result == &bufferText);
5162 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5163
5164 uregex_close(re);
5165 utext_close(&replText);
5166 }
5167
5168
5169 /*
5170 * replaceAll()
5171 */
5172 {
5173 UChar text1[80];
5174 UChar text2[80];
5175 UText replText = UTEXT_INITIALIZER;
5176 UText *result;
5177
5178 status = U_ZERO_ERROR;
5179 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
5180 u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
5181 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5182
5183 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5184 REGEX_CHECK_STATUS;
5185
5186 /* Normal case, with match */
5187 uregex_setText(re, text1, -1, &status);
5188 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5189 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5190 REGEX_CHECK_STATUS;
5191 REGEX_ASSERT(result == &bufferText);
5192 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5193
5194 /* No match. Text should copy to output with no changes. */
5195 uregex_setText(re, text2, -1, &status);
5196 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5197 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5198 REGEX_CHECK_STATUS;
5199 REGEX_ASSERT(result == &bufferText);
5200 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5201
5202 uregex_close(re);
5203 utext_close(&replText);
5204 }
5205
5206
5207 /*
5208 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5209 * so we don't need to test it here.
5210 */
5211
5212 utext_close(&bufferText);
5213 utext_close(&patternText);
5214 }
5215
5216
5217 //--------------------------------------------------------------
5218 //
5219 // NamedCapture Check basic named capture group functionality
5220 //
5221 //--------------------------------------------------------------
NamedCapture()5222 void RegexTest::NamedCapture() {
5223 UErrorCode status = U_ZERO_ERROR;
5224 RegexPattern *pat = RegexPattern::compile(UnicodeString(
5225 "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
5226 REGEX_CHECK_STATUS;
5227 int32_t group = pat->groupNumberFromName("five", -1, status);
5228 REGEX_CHECK_STATUS;
5229 REGEX_ASSERT(5 == group);
5230 group = pat->groupNumberFromName("three", -1, status);
5231 REGEX_CHECK_STATUS;
5232 REGEX_ASSERT(3 == group);
5233
5234 status = U_ZERO_ERROR;
5235 group = pat->groupNumberFromName(UnicodeString("six"), status);
5236 REGEX_CHECK_STATUS;
5237 REGEX_ASSERT(6 == group);
5238
5239 status = U_ZERO_ERROR;
5240 group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
5241 U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5242
5243 status = U_ZERO_ERROR;
5244
5245 // After copying a pattern, named capture should still work in the copy.
5246 RegexPattern *copiedPat = new RegexPattern(*pat);
5247 REGEX_ASSERT(*copiedPat == *pat);
5248 delete pat; pat = NULL; // Delete original, copy should have no references back to it.
5249
5250 group = copiedPat->groupNumberFromName("five", -1, status);
5251 REGEX_CHECK_STATUS;
5252 REGEX_ASSERT(5 == group);
5253 group = copiedPat->groupNumberFromName("three", -1, status);
5254 REGEX_CHECK_STATUS;
5255 REGEX_ASSERT(3 == group);
5256 delete copiedPat;
5257
5258 // ReplaceAll with named capture group.
5259 status = U_ZERO_ERROR;
5260 UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
5261 RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
5262 REGEX_CHECK_STATUS;
5263 // m.pattern().dumpPattern();
5264 UnicodeString replacedText = m->replaceAll("'${mid}'", status);
5265 REGEX_CHECK_STATUS;
5266 REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
5267 delete m;
5268
5269 // ReplaceAll, allowed capture group numbers.
5270 text = UnicodeString("abcmxyz");
5271 m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
5272 REGEX_CHECK_STATUS;
5273
5274 status = U_ZERO_ERROR;
5275 replacedText = m->replaceAll(UnicodeString("<$0>"), status); // group 0, full match, is allowed.
5276 REGEX_CHECK_STATUS;
5277 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
5278
5279 status = U_ZERO_ERROR;
5280 replacedText = m->replaceAll(UnicodeString("<$1>"), status); // group 1 by number.
5281 REGEX_CHECK_STATUS;
5282 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5283
5284 status = U_ZERO_ERROR;
5285 replacedText = m->replaceAll(UnicodeString("<${one}>"), status); // group 1 by name.
5286 REGEX_CHECK_STATUS;
5287 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5288
5289 status = U_ZERO_ERROR;
5290 replacedText = m->replaceAll(UnicodeString("<$2>"), status); // group 2.
5291 REGEX_CHECK_STATUS;
5292 REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
5293
5294 status = U_ZERO_ERROR;
5295 replacedText = m->replaceAll(UnicodeString("<$3>"), status);
5296 REGEX_CHECK_STATUS;
5297 REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
5298
5299 status = U_ZERO_ERROR;
5300 replacedText = m->replaceAll(UnicodeString("<$4>"), status);
5301 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5302
5303 status = U_ZERO_ERROR;
5304 replacedText = m->replaceAll(UnicodeString("<$04>"), status); // group 0, leading 0,
5305 REGEX_CHECK_STATUS; // trailing out-of-range 4 passes through.
5306 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
5307
5308 status = U_ZERO_ERROR;
5309 replacedText = m->replaceAll(UnicodeString("<$000016>"), status); // Consume leading zeroes. Don't consume digits
5310 REGEX_CHECK_STATUS; // that push group num out of range.
5311 REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText); // This is group 1.
5312
5313 status = U_ZERO_ERROR;
5314 replacedText = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
5315 REGEX_CHECK_STATUS;
5316 REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
5317
5318 status = U_ZERO_ERROR;
5319 replacedText = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
5320 REGEX_CHECK_STATUS;
5321 REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
5322
5323 status = U_ZERO_ERROR;
5324 replacedText = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
5325 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5326
5327 status = U_ZERO_ERROR;
5328 replacedText = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
5329 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5330
5331 status = U_ZERO_ERROR;
5332 replacedText = m->replaceAll(UnicodeString("<${one"), status);
5333 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5334
5335 status = U_ZERO_ERROR;
5336 replacedText = m->replaceAll(UnicodeString("$not a capture group"), status);
5337 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5338
5339 delete m;
5340
5341 // Repeat the above replaceAll() tests using the plain C API, which
5342 // has a separate implementation internally.
5343 // TODO: factor out the test data.
5344
5345 status = U_ZERO_ERROR;
5346 URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
5347 REGEX_CHECK_STATUS;
5348 text = UnicodeString("abcmxyz");
5349 uregex_setText(re, text.getBuffer(), text.length(), &status);
5350 REGEX_CHECK_STATUS;
5351
5352 UChar resultBuf[100];
5353 int32_t resultLength;
5354 UnicodeString repl;
5355
5356 status = U_ZERO_ERROR;
5357 repl = UnicodeString("<$0>");
5358 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5359 REGEX_CHECK_STATUS;
5360 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
5361
5362 status = U_ZERO_ERROR;
5363 repl = UnicodeString("<$1>");
5364 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5365 REGEX_CHECK_STATUS;
5366 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5367
5368 status = U_ZERO_ERROR;
5369 repl = UnicodeString("<${one}>");
5370 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5371 REGEX_CHECK_STATUS;
5372 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5373
5374 status = U_ZERO_ERROR;
5375 repl = UnicodeString("<$2>");
5376 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5377 REGEX_CHECK_STATUS;
5378 REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
5379
5380 status = U_ZERO_ERROR;
5381 repl = UnicodeString("<$3>");
5382 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5383 REGEX_CHECK_STATUS;
5384 REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
5385
5386 status = U_ZERO_ERROR;
5387 repl = UnicodeString("<$4>");
5388 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5389 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5390
5391 status = U_ZERO_ERROR;
5392 repl = UnicodeString("<$04>");
5393 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5394 REGEX_CHECK_STATUS;
5395 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
5396
5397 status = U_ZERO_ERROR;
5398 repl = UnicodeString("<$000016>");
5399 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5400 REGEX_CHECK_STATUS;
5401 REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
5402
5403 status = U_ZERO_ERROR;
5404 repl = UnicodeString("<$3$2$1${one}>");
5405 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5406 REGEX_CHECK_STATUS;
5407 REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
5408
5409 status = U_ZERO_ERROR;
5410 repl = UnicodeString("$3$2$1${one}");
5411 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5412 REGEX_CHECK_STATUS;
5413 REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
5414
5415 status = U_ZERO_ERROR;
5416 repl = UnicodeString("<${noSuchName}>");
5417 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5418 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5419
5420 status = U_ZERO_ERROR;
5421 repl = UnicodeString("<${invalid-name}>");
5422 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5423 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5424
5425 status = U_ZERO_ERROR;
5426 repl = UnicodeString("<${one");
5427 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5428 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5429
5430 status = U_ZERO_ERROR;
5431 repl = UnicodeString("$not a capture group");
5432 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5433 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5434
5435 uregex_close(re);
5436 }
5437
5438 //--------------------------------------------------------------
5439 //
5440 // NamedCaptureLimits Patterns with huge numbers of named capture groups.
5441 // The point is not so much what the exact limit is,
5442 // but that a largish number doesn't hit bad non-linear performance,
5443 // and that exceeding the limit fails cleanly.
5444 //
5445 //--------------------------------------------------------------
NamedCaptureLimits()5446 void RegexTest::NamedCaptureLimits() {
5447 if (quick) {
5448 logln("Skipping test. Runs in exhuastive mode only.");
5449 return;
5450 }
5451 const int32_t goodLimit = 1000000; // Pattern w this many groups builds successfully.
5452 const int32_t failLimit = 10000000; // Pattern exceeds internal limits, fails to compile.
5453 char nnbuf[100];
5454 UnicodeString pattern;
5455 int32_t nn;
5456
5457 for (nn=1; nn<goodLimit; nn++) {
5458 sprintf(nnbuf, "(?<nn%d>)", nn);
5459 pattern.append(UnicodeString(nnbuf, -1, US_INV));
5460 }
5461 UErrorCode status = U_ZERO_ERROR;
5462 RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
5463 REGEX_CHECK_STATUS;
5464 for (nn=1; nn<goodLimit; nn++) {
5465 sprintf(nnbuf, "nn%d", nn);
5466 int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
5467 REGEX_ASSERT(nn == groupNum);
5468 if (nn != groupNum) {
5469 break;
5470 }
5471 }
5472 delete pat;
5473
5474 pattern.remove();
5475 for (nn=1; nn<failLimit; nn++) {
5476 sprintf(nnbuf, "(?<nn%d>)", nn);
5477 pattern.append(UnicodeString(nnbuf, -1, US_INV));
5478 }
5479 status = U_ZERO_ERROR;
5480 pat = RegexPattern::compile(pattern, 0, status);
5481 REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
5482 delete pat;
5483 }
5484
5485
5486 //--------------------------------------------------------------
5487 //
5488 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
5489 //
5490 //---------------------------------------------------------------
Bug7651()5491 void RegexTest::Bug7651() {
5492 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5493 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5494 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5495 UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5496 UnicodeString s("#ff @abcd This is test");
5497 RegexPattern *REPattern = NULL;
5498 RegexMatcher *REMatcher = NULL;
5499 UErrorCode status = U_ZERO_ERROR;
5500 UParseError pe;
5501
5502 REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5503 REGEX_CHECK_STATUS;
5504 REMatcher = REPattern->matcher(s, status);
5505 REGEX_CHECK_STATUS;
5506 REGEX_ASSERT(REMatcher->find());
5507 REGEX_ASSERT(REMatcher->start(status) == 0);
5508 delete REPattern;
5509 delete REMatcher;
5510 status = U_ZERO_ERROR;
5511
5512 REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5513 REGEX_CHECK_STATUS;
5514 REMatcher = REPattern->matcher(s, status);
5515 REGEX_CHECK_STATUS;
5516 REGEX_ASSERT(REMatcher->find());
5517 REGEX_ASSERT(REMatcher->start(status) == 0);
5518 delete REPattern;
5519 delete REMatcher;
5520 status = U_ZERO_ERROR;
5521 }
5522
Bug7740()5523 void RegexTest::Bug7740() {
5524 UErrorCode status = U_ZERO_ERROR;
5525 UnicodeString pattern = "(a)";
5526 UnicodeString text = "abcdef";
5527 RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5528 REGEX_CHECK_STATUS;
5529 REGEX_ASSERT(m->lookingAt(status));
5530 REGEX_CHECK_STATUS;
5531 status = U_ILLEGAL_ARGUMENT_ERROR;
5532 UnicodeString s = m->group(1, status); // Bug 7740: segfault here.
5533 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5534 REGEX_ASSERT(s == "");
5535 delete m;
5536 }
5537
5538 // Bug 8479: was crashing whith a Bogus UnicodeString as input.
5539
Bug8479()5540 void RegexTest::Bug8479() {
5541 UErrorCode status = U_ZERO_ERROR;
5542
5543 RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5544 REGEX_CHECK_STATUS;
5545 if (U_SUCCESS(status))
5546 {
5547 UnicodeString str;
5548 str.setToBogus();
5549 pMatcher->reset(str);
5550 status = U_ZERO_ERROR;
5551 pMatcher->matches(status);
5552 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5553 delete pMatcher;
5554 }
5555 }
5556
5557
5558 // Bug 7029
Bug7029()5559 void RegexTest::Bug7029() {
5560 UErrorCode status = U_ZERO_ERROR;
5561
5562 RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5563 UnicodeString text = "abc.def";
5564 UnicodeString splits[10];
5565 REGEX_CHECK_STATUS;
5566 int32_t numFields = pMatcher->split(text, splits, 10, status);
5567 REGEX_CHECK_STATUS;
5568 REGEX_ASSERT(numFields == 8);
5569 delete pMatcher;
5570 }
5571
5572 // Bug 9283
5573 // This test is checking for the existance of any supplemental characters that case-fold
5574 // to a bmp character.
5575 //
5576 // At the time of this writing there are none. If any should appear in a subsequent release
5577 // of Unicode, the code in regular expressions compilation that determines the longest
5578 // posssible match for a literal string will need to be enhanced.
5579 //
5580 // See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5581 // for details on what to do in case of a failure of this test.
5582 //
Bug9283()5583 void RegexTest::Bug9283() {
5584 #if !UCONFIG_NO_NORMALIZATION
5585 UErrorCode status = U_ZERO_ERROR;
5586 UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5587 REGEX_CHECK_STATUS;
5588 int32_t index;
5589 UChar32 c;
5590 for (index=0; ; index++) {
5591 c = supplementalsWithCaseFolding.charAt(index);
5592 if (c == -1) {
5593 break;
5594 }
5595 UnicodeString cf = UnicodeString(c).foldCase();
5596 REGEX_ASSERT(cf.length() >= 2);
5597 }
5598 #endif /* #if !UCONFIG_NO_NORMALIZATION */
5599 }
5600
5601
CheckInvBufSize()5602 void RegexTest::CheckInvBufSize() {
5603 if(inv_next>=INV_BUFSIZ) {
5604 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5605 __FILE__, INV_BUFSIZ, inv_next);
5606 } else {
5607 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5608 }
5609 }
5610
5611
Bug10459()5612 void RegexTest::Bug10459() {
5613 UErrorCode status = U_ZERO_ERROR;
5614 UnicodeString patternString("(txt)");
5615 UnicodeString txtString("txt");
5616
5617 UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
5618 REGEX_CHECK_STATUS;
5619 UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
5620 REGEX_CHECK_STATUS;
5621
5622 URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
5623 REGEX_CHECK_STATUS;
5624
5625 uregex_setUText(icu_re, utext_txt, &status);
5626 REGEX_CHECK_STATUS;
5627
5628 // The bug was that calling uregex_group() before doing a matching operation
5629 // was causing a segfault. Only for Regular Expressions created from UText.
5630 // It should set an U_REGEX_INVALID_STATE.
5631
5632 UChar buf[100];
5633 int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
5634 REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
5635 REGEX_ASSERT(len == 0);
5636
5637 uregex_close(icu_re);
5638 utext_close(utext_pat);
5639 utext_close(utext_txt);
5640 }
5641
TestCaseInsensitiveStarters()5642 void RegexTest::TestCaseInsensitiveStarters() {
5643 // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5644 // become stale because of new Unicode characters.
5645 // If it is stale, rerun the generation tool
5646 // svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5647 // and replace the embedded data in i18n/regexcmp.cpp
5648
5649 for (UChar32 cp=0; cp<=0x10ffff; cp++) {
5650 if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
5651 continue;
5652 }
5653 UnicodeSet s(cp, cp);
5654 s.closeOver(USET_CASE_INSENSITIVE);
5655 UnicodeSetIterator setIter(s);
5656 while (setIter.next()) {
5657 if (!setIter.isString()) {
5658 continue;
5659 }
5660 const UnicodeString &str = setIter.getString();
5661 UChar32 firstChar = str.char32At(0);
5662 UnicodeSet starters;
5663 RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
5664 if (!starters.contains(cp)) {
5665 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
5666 return;
5667 }
5668 }
5669 }
5670 }
5671
5672
TestBug11049()5673 void RegexTest::TestBug11049() {
5674 // Original bug report: pattern with match start consisting of one of several individual characters,
5675 // and the text being matched ending with a supplementary character. find() would read past the
5676 // end of the input text when searching for potential match starting points.
5677
5678 // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5679 // detect the bad read.
5680
5681 TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
5682 TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
5683
5684 // Test again with a pattern starting with a single character,
5685 // which takes a different code path than starting with an OR expression,
5686 // but with similar logic.
5687 TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
5688 TestCase11049("C", "string matches at end C", TRUE, __LINE__);
5689 }
5690
5691 // Run a single test case from TestBug11049(). Internal function.
TestCase11049(const char * pattern,const char * data,UBool expectMatch,int32_t lineNumber)5692 void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
5693 UErrorCode status = U_ZERO_ERROR;
5694 UnicodeString patternString = UnicodeString(pattern).unescape();
5695 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5696
5697 UnicodeString dataString = UnicodeString(data).unescape();
5698 UChar *exactBuffer = new UChar[dataString.length()];
5699 dataString.extract(exactBuffer, dataString.length(), status);
5700 UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
5701
5702 LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
5703 REGEX_CHECK_STATUS;
5704 matcher->reset(ut);
5705 UBool result = matcher->find();
5706 if (result != expectMatch) {
5707 errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5708 __FILE__, lineNumber, expectMatch, result, pattern, data);
5709 }
5710
5711 // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5712 // off-by-one on find() with match at the last code point.
5713 // Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5714 // because string.unescape() will only shrink it.
5715 char * utf8Buffer = new char[uprv_strlen(data)+1];
5716 u_strToUTF8(utf8Buffer, static_cast<int32_t>(uprv_strlen(data)+1), NULL, dataString.getBuffer(), dataString.length(), &status);
5717 REGEX_CHECK_STATUS;
5718 ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
5719 REGEX_CHECK_STATUS;
5720 matcher->reset(ut);
5721 result = matcher->find();
5722 if (result != expectMatch) {
5723 errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5724 __FILE__, lineNumber, expectMatch, result, pattern, data);
5725 }
5726 delete [] utf8Buffer;
5727
5728 utext_close(ut);
5729 delete [] exactBuffer;
5730 }
5731
5732
TestBug11371()5733 void RegexTest::TestBug11371() {
5734 if (quick) {
5735 logln("Skipping test. Runs in exhuastive mode only.");
5736 return;
5737 }
5738 UErrorCode status = U_ZERO_ERROR;
5739 UnicodeString patternString;
5740
5741 for (int i=0; i<8000000; i++) {
5742 patternString.append(UnicodeString("()"));
5743 }
5744 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5745 if (status != U_REGEX_PATTERN_TOO_BIG) {
5746 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5747 __FILE__, __LINE__, u_errorName(status));
5748 }
5749
5750 status = U_ZERO_ERROR;
5751 patternString = "(";
5752 for (int i=0; i<20000000; i++) {
5753 patternString.append(UnicodeString("A++"));
5754 }
5755 patternString.append(UnicodeString("){0}B++"));
5756 LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
5757 if (status != U_REGEX_PATTERN_TOO_BIG) {
5758 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5759 __FILE__, __LINE__, u_errorName(status));
5760 }
5761
5762 // Pattern with too much string data, such that string indexes overflow operand data field size
5763 // in compiled instruction.
5764 status = U_ZERO_ERROR;
5765 patternString = "";
5766 while (patternString.length() < 0x00ffffff) {
5767 patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5768 }
5769 patternString.append(UnicodeString("X? trailing string"));
5770 LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
5771 if (status != U_REGEX_PATTERN_TOO_BIG) {
5772 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5773 __FILE__, __LINE__, u_errorName(status));
5774 }
5775 }
5776
TestBug11480()5777 void RegexTest::TestBug11480() {
5778 // C API, get capture group of a group that does not participate in the match.
5779 // (Returns a zero length string, with nul termination,
5780 // indistinguishable from a group with a zero length match.)
5781
5782 UErrorCode status = U_ZERO_ERROR;
5783 URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
5784 REGEX_CHECK_STATUS;
5785 UnicodeString text = UNICODE_STRING_SIMPLE("A");
5786 uregex_setText(re, text.getBuffer(), text.length(), &status);
5787 REGEX_CHECK_STATUS;
5788 REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
5789 UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
5790 int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
5791 REGEX_ASSERT(length == 0);
5792 REGEX_ASSERT(buf[0] == 13);
5793 REGEX_ASSERT(buf[1] == 0);
5794 REGEX_ASSERT(buf[2] == 13);
5795 uregex_close(re);
5796
5797 // UText C++ API, length of match is 0 for non-participating matches.
5798 UText ut = UTEXT_INITIALIZER;
5799 utext_openUnicodeString(&ut, &text, &status);
5800 RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status);
5801 REGEX_CHECK_STATUS;
5802 matcher.reset(&ut);
5803 REGEX_ASSERT(matcher.lookingAt(0, status));
5804
5805 // UText C++ API, Capture group 1 matches "A", position 0, length 1.
5806 int64_t groupLen = -666;
5807 UText group = UTEXT_INITIALIZER;
5808 matcher.group(1, &group, groupLen, status);
5809 REGEX_CHECK_STATUS;
5810 REGEX_ASSERT(groupLen == 1);
5811 REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
5812
5813 // Capture group 2, the (B), does not participate in the match.
5814 matcher.group(2, &group, groupLen, status);
5815 REGEX_CHECK_STATUS;
5816 REGEX_ASSERT(groupLen == 0);
5817 REGEX_ASSERT(matcher.start(2, status) == -1);
5818 REGEX_CHECK_STATUS;
5819 }
5820
TestBug12884()5821 void RegexTest::TestBug12884() {
5822 // setTimeLimit() was not effective for empty sub-patterns with large {minimum counts}
5823 UnicodeString pattern(u"(((((((){120}){11}){11}){11}){80}){11}){4}");
5824 UnicodeString text(u"hello");
5825 UErrorCode status = U_ZERO_ERROR;
5826 RegexMatcher m(pattern, text, 0, status);
5827 REGEX_CHECK_STATUS;
5828 m.setTimeLimit(5, status);
5829 m.find(status);
5830 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5831
5832 // Non-greedy loops. They take a different code path during matching.
5833 UnicodeString ngPattern(u"(((((((){120}?){11}?){11}?){11}?){80}?){11}?){4}?");
5834 status = U_ZERO_ERROR;
5835 RegexMatcher ngM(ngPattern, text, 0, status);
5836 REGEX_CHECK_STATUS;
5837 ngM.setTimeLimit(5, status);
5838 ngM.find(status);
5839 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5840
5841 // UText, wrapping non-UTF-16 text, also takes a different execution path.
5842 StringPiece text8(u8"¿Qué es Unicode? Unicode proporciona un número único para cada"
5843 "carácter, sin importar la plataforma, sin importar el programa,"
5844 "sin importar el idioma.");
5845 status = U_ZERO_ERROR;
5846 LocalUTextPointer ut(utext_openUTF8(NULL, text8.data(), text8.length(), &status));
5847 REGEX_CHECK_STATUS;
5848 m.reset(ut.getAlias());
5849 m.find(status);
5850 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5851
5852 status = U_ZERO_ERROR;
5853 ngM.reset(ut.getAlias());
5854 ngM.find(status);
5855 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5856 }
5857
5858 // Bug 13631. A find() of a pattern with a zero length look-behind assertions
5859 // can cause a read past the end of the input text.
5860 // The failure is seen when running this test with Clang's Addresss Sanitizer.
5861
TestBug13631()5862 void RegexTest::TestBug13631() {
5863 const UChar *pats[] = { u"(?<!^)",
5864 u"(?<=^)",
5865 nullptr
5866 };
5867 for (const UChar **pat=pats; *pat; ++pat) {
5868 UErrorCode status = U_ZERO_ERROR;
5869 UnicodeString upat(*pat);
5870 RegexMatcher matcher(upat, 0, status);
5871 const UChar s =u'a';
5872 UText *ut = utext_openUChars(nullptr, &s, 1, &status);
5873 REGEX_CHECK_STATUS;
5874 matcher.reset(ut);
5875 while (matcher.find()) {
5876 }
5877 utext_close(ut);
5878 }
5879 }
5880
5881 // Bug 13632 Out of bounds memory reference if a replacement string ends with a '$',
5882 // where a following group specification would be expected.
5883 // Failure shows when running the test under Clang's Address Sanitizer.
5884
TestBug13632()5885 void RegexTest::TestBug13632() {
5886 UErrorCode status = U_ZERO_ERROR;
5887 URegularExpression *re = uregex_openC(" ", 0, nullptr, &status);
5888 const char16_t *sourceString = u"Hello, world.";
5889 uregex_setText(re, sourceString, u_strlen(sourceString), &status);
5890
5891 const int32_t destCap = 20;
5892 char16_t dest[destCap] = {};
5893 const char16_t replacement[] = {u'x', u'$'}; // Not nul terminated string.
5894 uregex_replaceAll(re, replacement, 2, dest, destCap, &status);
5895
5896 assertEquals("", U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
5897 uregex_close(re);
5898 }
5899
TestBug20359()5900 void RegexTest::TestBug20359() {
5901 // The bug was stack overflow while parsing a pattern with a huge number of adjacent \Q\E
5902 // pairs. (Enter and exit pattern literal quote mode). Logic was correct.
5903 // Changed implementation to loop instead of recursing.
5904
5905 UnicodeString pattern;
5906 for (int i=0; i<50000; ++i) {
5907 pattern += u"\\Q\\E";
5908 }
5909 pattern += u"x";
5910
5911 UErrorCode status = U_ZERO_ERROR;
5912 LocalURegularExpressionPointer re(uregex_open(pattern.getBuffer(), pattern.length(),
5913 0, nullptr, &status));
5914 assertSuccess(WHERE, status);
5915
5916 // We have passed the point where the bug crashed. The following is a small sanity
5917 // check that the pattern works, that all the \Q\E\Q\E... didn't cause other problems.
5918
5919 uregex_setText(re.getAlias(), u"abcxyz", -1, &status);
5920 assertSuccess(WHERE, status);
5921 assertTrue(WHERE, uregex_find(re.getAlias(), 0, &status));
5922 assertEquals(WHERE, 3, uregex_start(re.getAlias(), 0, &status));
5923 assertSuccess(WHERE, status);
5924 }
5925
5926
TestBug20863()5927 void RegexTest::TestBug20863() {
5928 // Test that patterns with a large number of named capture groups work correctly.
5929 //
5930 // The ticket was not for a bug per se, but to reduce memory usage by using lazy
5931 // construction of the map from capture names to numbers, and decreasing the
5932 // default size of the map.
5933
5934 constexpr int GROUP_COUNT = 2000;
5935 std::vector<UnicodeString> groupNames;
5936 for (int32_t i=0; i<GROUP_COUNT; ++i) {
5937 UnicodeString name;
5938 name.append(u"name");
5939 name.append(Int64ToUnicodeString(i));
5940 groupNames.push_back(name);
5941 }
5942
5943 UnicodeString patternString;
5944 for (UnicodeString name: groupNames) {
5945 patternString.append(u"(?<");
5946 patternString.append(name);
5947 patternString.append(u">.)");
5948 }
5949
5950 UErrorCode status = U_ZERO_ERROR;
5951 UParseError pe;
5952 LocalPointer<RegexPattern> pattern(RegexPattern::compile(patternString, pe, status), status);
5953 if (!assertSuccess(WHERE, status)) {
5954 return;
5955 }
5956
5957 for (int32_t i=0; i<GROUP_COUNT; ++i) {
5958 int32_t group = pattern->groupNumberFromName(groupNames[i], status);
5959 if (!assertSuccess(WHERE, status)) {
5960 return;
5961 }
5962 assertEquals(WHERE, i+1, group);
5963 // Note: group 0 is the overall match; group 1 is the first separate capture group.
5964 }
5965
5966 // Verify that assignment of patterns with various combinations of named capture work.
5967 // Lazy creation of the internal named capture map changed the implementation logic here.
5968 {
5969 LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"abc", pe, status), status);
5970 LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name>b)c", pe, status), status);
5971 assertSuccess(WHERE, status);
5972 assertFalse(WHERE, *pat1 == *pat2);
5973 *pat1 = *pat2;
5974 assertTrue(WHERE, *pat1 == *pat2);
5975 assertEquals(WHERE, 1, pat1->groupNumberFromName(u"name", status));
5976 assertEquals(WHERE, 1, pat2->groupNumberFromName(u"name", status));
5977 assertSuccess(WHERE, status);
5978 }
5979
5980 {
5981 LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"abc", pe, status), status);
5982 LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name>b)c", pe, status), status);
5983 assertSuccess(WHERE, status);
5984 assertFalse(WHERE, *pat1 == *pat2);
5985 *pat2 = *pat1;
5986 assertTrue(WHERE, *pat1 == *pat2);
5987 assertEquals(WHERE, 0, pat1->groupNumberFromName(u"name", status));
5988 assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
5989 status = U_ZERO_ERROR;
5990 assertEquals(WHERE, 0, pat2->groupNumberFromName(u"name", status));
5991 assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
5992 status = U_ZERO_ERROR;
5993 }
5994
5995 {
5996 LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"a(?<name1>b)c", pe, status), status);
5997 LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name2>b)c", pe, status), status);
5998 assertSuccess(WHERE, status);
5999 assertFalse(WHERE, *pat1 == *pat2);
6000 *pat2 = *pat1;
6001 assertTrue(WHERE, *pat1 == *pat2);
6002 assertEquals(WHERE, 1, pat1->groupNumberFromName(u"name1", status));
6003 assertSuccess(WHERE, status);
6004 assertEquals(WHERE, 1, pat2->groupNumberFromName(u"name1", status));
6005 assertSuccess(WHERE, status);
6006 assertEquals(WHERE, 0, pat1->groupNumberFromName(u"name2", status));
6007 assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
6008 status = U_ZERO_ERROR;
6009 assertEquals(WHERE, 0, pat2->groupNumberFromName(u"name2", status));
6010 assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
6011 status = U_ZERO_ERROR;
6012 }
6013
6014 }
6015
6016
6017 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
6018