1 /********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 2002-2013, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6
7 //
8 // regextst.cpp
9 //
10 // ICU Regular Expressions test, part of intltest.
11 //
12
13 /*
14 NOTE!!
15
16 PLEASE be careful about ASCII assumptions in this test.
17 This test is one of the worst repeat offenders.
18 If you have questions, contact someone on the ICU PMC
19 who has access to an EBCDIC system.
20
21 */
22
23 #include "intltest.h"
24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
25
26 #include "unicode/regex.h"
27 #include "unicode/uchar.h"
28 #include "unicode/ucnv.h"
29 #include "unicode/uniset.h"
30 #include "unicode/ustring.h"
31 #include "regextst.h"
32 #include "uvector.h"
33 #include "util.h"
34 #include <stdlib.h>
35 #include <string.h>
36 #include <stdio.h>
37 #include "cstring.h"
38 #include "uinvchar.h"
39
40 #define SUPPORT_MUTATING_INPUT_STRING 0
41
42 //---------------------------------------------------------------------------
43 //
44 // Test class boilerplate
45 //
46 //---------------------------------------------------------------------------
RegexTest()47 RegexTest::RegexTest()
48 {
49 }
50
51
~RegexTest()52 RegexTest::~RegexTest()
53 {
54 }
55
56
57
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)58 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
59 {
60 if (exec) logln("TestSuite RegexTest: ");
61 switch (index) {
62
63 case 0: name = "Basic";
64 if (exec) Basic();
65 break;
66 case 1: name = "API_Match";
67 if (exec) API_Match();
68 break;
69 case 2: name = "API_Replace";
70 if (exec) API_Replace();
71 break;
72 case 3: name = "API_Pattern";
73 if (exec) API_Pattern();
74 break;
75 case 4:
76 #if !UCONFIG_NO_FILE_IO
77 name = "Extended";
78 if (exec) Extended();
79 #else
80 name = "skip";
81 #endif
82 break;
83 case 5: name = "Errors";
84 if (exec) Errors();
85 break;
86 case 6: name = "PerlTests";
87 if (exec) PerlTests();
88 break;
89 case 7: name = "Callbacks";
90 if (exec) Callbacks();
91 break;
92 case 8: name = "FindProgressCallbacks";
93 if (exec) FindProgressCallbacks();
94 break;
95 case 9: name = "Bug 6149";
96 if (exec) Bug6149();
97 break;
98 case 10: name = "UTextBasic";
99 if (exec) UTextBasic();
100 break;
101 case 11: name = "API_Match_UTF8";
102 if (exec) API_Match_UTF8();
103 break;
104 case 12: name = "API_Replace_UTF8";
105 if (exec) API_Replace_UTF8();
106 break;
107 case 13: name = "API_Pattern_UTF8";
108 if (exec) API_Pattern_UTF8();
109 break;
110 case 14: name = "PerlTestsUTF8";
111 if (exec) PerlTestsUTF8();
112 break;
113 case 15: name = "PreAllocatedUTextCAPI";
114 if (exec) PreAllocatedUTextCAPI();
115 break;
116 case 16: name = "Bug 7651";
117 if (exec) Bug7651();
118 break;
119 case 17: name = "Bug 7740";
120 if (exec) Bug7740();
121 break;
122 case 18: name = "Bug 8479";
123 if (exec) Bug8479();
124 break;
125 case 19: name = "Bug 7029";
126 if (exec) Bug7029();
127 break;
128 case 20: name = "CheckInvBufSize";
129 if (exec) CheckInvBufSize();
130 break;
131 case 21: name = "Bug 9283";
132 if (exec) Bug9283();
133 break;
134
135 default: name = "";
136 break; //needed to end loop
137 }
138 }
139
140
141
142 /**
143 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
144 * into ASCII.
145 * @see utext_openUTF8
146 */
147 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
148
149 //---------------------------------------------------------------------------
150 //
151 // Error Checking / Reporting macros used in all of the tests.
152 //
153 //---------------------------------------------------------------------------
154
utextToPrintable(char * buf,int32_t bufLen,UText * text)155 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
156 int64_t oldIndex = utext_getNativeIndex(text);
157 utext_setNativeIndex(text, 0);
158 char *bufPtr = buf;
159 UChar32 c = utext_next32From(text, 0);
160 while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
161 if (0x000020<=c && c<0x00007e) {
162 *bufPtr = c;
163 } else {
164 #if 0
165 sprintf(bufPtr,"U+%04X", c);
166 bufPtr+= strlen(bufPtr)-1;
167 #else
168 *bufPtr = '%';
169 #endif
170 }
171 bufPtr++;
172 c = UTEXT_NEXT32(text);
173 }
174 *bufPtr = 0;
175 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
176 char *ebuf = (char*)malloc(bufLen);
177 uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
178 uprv_strncpy(buf, ebuf, bufLen);
179 free((void*)ebuf);
180 #endif
181 utext_setNativeIndex(text, oldIndex);
182 }
183
184
185 static char ASSERT_BUF[1024];
186
extractToAssertBuf(const UnicodeString & message)187 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
188 if(message.length()==0) {
189 strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
190 } else {
191 UnicodeString buf;
192 IntlTest::prettify(message,buf);
193 if(buf.length()==0) {
194 strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
195 } else {
196 buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
197 if(ASSERT_BUF[0]==0) {
198 ASSERT_BUF[0]=0;
199 for(int32_t i=0;i<buf.length();i++) {
200 UChar ch = buf[i];
201 sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
202 }
203 }
204 }
205 }
206 ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
207 return ASSERT_BUF;
208 }
209
210
// Logs a printable rendering of a UText, tagged with the invoking file and line.
#define REGEX_VERBOSE_TEXT(text) { \
    char buf[200]; \
    utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text); \
    logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf); \
}

// If the in-scope `status` holds a failure, reports it and returns from the
// enclosing (void) function.
#define REGEX_CHECK_STATUS { \
    if (U_FAILURE(status)) { \
        dataerrln("%s:%d: RegexTest failure. status=%s", \
                  __FILE__, __LINE__, u_errorName(status)); \
        return; \
    } \
}

// Reports an error when expr evaluates to FALSE; execution continues.
#define REGEX_ASSERT(expr) { \
    if ((expr)==FALSE) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr); \
    } \
}

// Evaluates expr against a fresh local `status` (which shadows any outer one)
// and reports an error unless that status ends up equal to errcode.
#define REGEX_ASSERT_FAIL(expr, errcode) { \
    UErrorCode status=U_ZERO_ERROR; \
    (expr); \
    if (status!=errcode) { \
        dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \
                  __LINE__, u_errorName(errcode), u_errorName(status)); \
    } \
}

// Like REGEX_CHECK_STATUS, but also prints a caller-supplied line number and
// does not return.
#define REGEX_CHECK_STATUS_L(line) { \
    if (U_FAILURE(status)) { \
        errln("RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); \
    } \
}

// Like REGEX_ASSERT, but also prints a caller-supplied line number and returns
// from the enclosing (void) function on failure.
#define REGEX_ASSERT_L(expr, line) { \
    if ((expr)==FALSE) { \
        errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
        return; \
    } \
}

// Reports an error unless ustr == inv; the failing string is rendered through
// extractToAssertBuf().
#define REGEX_ASSERT_UNISTR(ustr,inv) { \
    if (!(ustr==inv)) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s,%s) failed \n", __FILE__, __LINE__, extractToAssertBuf(ustr),inv); \
    } \
}
229
230
testUTextEqual(UText * uta,UText * utb)231 static UBool testUTextEqual(UText *uta, UText *utb) {
232 UChar32 ca = 0;
233 UChar32 cb = 0;
234 utext_setNativeIndex(uta, 0);
235 utext_setNativeIndex(utb, 0);
236 do {
237 ca = utext_next32(uta);
238 cb = utext_next32(utb);
239 if (ca != cb) {
240 break;
241 }
242 } while (ca != U_SENTINEL);
243 return ca == cb;
244 }
245
246
247 /**
248 * @param expected expected text in UTF-8 (not platform) codepage
249 */
assertUText(const char * expected,UText * actual,const char * file,int line)250 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
251 UErrorCode status = U_ZERO_ERROR;
252 UText expectedText = UTEXT_INITIALIZER;
253 utext_openUTF8(&expectedText, expected, -1, &status);
254 if(U_FAILURE(status)) {
255 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
256 return;
257 }
258 if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
259 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
260 return;
261 }
262 utext_setNativeIndex(actual, 0);
263 if (!testUTextEqual(&expectedText, actual)) {
264 char buf[201 /*21*/];
265 char expectedBuf[201];
266 utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
267 utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
268 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
269 }
270 utext_close(&expectedText);
271 }
272 /**
273 * @param expected invariant (platform local text) input
274 */
275
assertUTextInvariant(const char * expected,UText * actual,const char * file,int line)276 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
277 UErrorCode status = U_ZERO_ERROR;
278 UText expectedText = UTEXT_INITIALIZER;
279 regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
280 if(U_FAILURE(status)) {
281 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
282 return;
283 }
284 utext_setNativeIndex(actual, 0);
285 if (!testUTextEqual(&expectedText, actual)) {
286 char buf[201 /*21*/];
287 char expectedBuf[201];
288 utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
289 utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
290 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
291 }
292 utext_close(&expectedText);
293 }
294
295 /**
296 * Assumes utf-8 input
297 */
298 #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
299 /**
300 * Assumes Invariant input
301 */
302 #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
303
304 /**
305 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
306 * passed into utext_openUTF8. An error will be given if
307 * INV_BUFSIZ is too small. It's only used on EBCDIC systems.
308 */
309
310 #define INV_BUFSIZ 2048 /* increase this if too small */
311
312 static int64_t inv_next=0;
313
314 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
315 static char inv_buf[INV_BUFSIZ];
316 #endif
317
regextst_openUTF8FromInvariant(UText * ut,const char * inv,int64_t length,UErrorCode * status)318 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
319 if(length==-1) length=strlen(inv);
320 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
321 inv_next+=length;
322 return utext_openUTF8(ut, inv, length, status);
323 #else
324 if(inv_next+length+1>INV_BUFSIZ) {
325 fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
326 __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
327 *status = U_MEMORY_ALLOCATION_ERROR;
328 return NULL;
329 }
330
331 unsigned char *buf = (unsigned char*)inv_buf+inv_next;
332 uprv_aestrncpy(buf, (const uint8_t*)inv, length);
333 inv_next+=length;
334
335 #if 0
336 fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
337 #endif
338
339 return utext_openUTF8(ut, (const char*)buf, length, status);
340 #endif
341 }
342
343
344 //---------------------------------------------------------------------------
345 //
346 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests
347 // for the LookingAt() and Match() functions.
348 //
349 // usage:
350 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
351 //
352 // The expected results are UBool - TRUE or FALSE.
353 // The input text is unescaped. The pattern is not.
354 //
355 //
356 //---------------------------------------------------------------------------
357
358 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
359
doRegexLMTest(const char * pat,const char * text,UBool looking,UBool match,int32_t line)360 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
361 const UnicodeString pattern(pat, -1, US_INV);
362 const UnicodeString inputText(text, -1, US_INV);
363 UErrorCode status = U_ZERO_ERROR;
364 UParseError pe;
365 RegexPattern *REPattern = NULL;
366 RegexMatcher *REMatcher = NULL;
367 UBool retVal = TRUE;
368
369 UnicodeString patString(pat, -1, US_INV);
370 REPattern = RegexPattern::compile(patString, 0, pe, status);
371 if (U_FAILURE(status)) {
372 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
373 line, u_errorName(status));
374 return FALSE;
375 }
376 if (line==376) { RegexPatternDump(REPattern);}
377
378 UnicodeString inputString(inputText);
379 UnicodeString unEscapedInput = inputString.unescape();
380 REMatcher = REPattern->matcher(unEscapedInput, status);
381 if (U_FAILURE(status)) {
382 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
383 line, u_errorName(status));
384 return FALSE;
385 }
386
387 UBool actualmatch;
388 actualmatch = REMatcher->lookingAt(status);
389 if (U_FAILURE(status)) {
390 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
391 line, u_errorName(status));
392 retVal = FALSE;
393 }
394 if (actualmatch != looking) {
395 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
396 retVal = FALSE;
397 }
398
399 status = U_ZERO_ERROR;
400 actualmatch = REMatcher->matches(status);
401 if (U_FAILURE(status)) {
402 errln("RegexTest failure in matches() at line %d. Status = %s\n",
403 line, u_errorName(status));
404 retVal = FALSE;
405 }
406 if (actualmatch != match) {
407 errln("RegexTest: wrong return from matches() at line %d.\n", line);
408 retVal = FALSE;
409 }
410
411 if (retVal == FALSE) {
412 RegexPatternDump(REPattern);
413 }
414
415 delete REPattern;
416 delete REMatcher;
417 return retVal;
418 }
419
420
doRegexLMTestUTF8(const char * pat,const char * text,UBool looking,UBool match,int32_t line)421 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
422 UText pattern = UTEXT_INITIALIZER;
423 int32_t inputUTF8Length;
424 char *textChars = NULL;
425 UText inputText = UTEXT_INITIALIZER;
426 UErrorCode status = U_ZERO_ERROR;
427 UParseError pe;
428 RegexPattern *REPattern = NULL;
429 RegexMatcher *REMatcher = NULL;
430 UBool retVal = TRUE;
431
432 regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
433 REPattern = RegexPattern::compile(&pattern, 0, pe, status);
434 if (U_FAILURE(status)) {
435 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
436 line, u_errorName(status));
437 return FALSE;
438 }
439
440 UnicodeString inputString(text, -1, US_INV);
441 UnicodeString unEscapedInput = inputString.unescape();
442 LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
443 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
444
445 inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
446 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
447 // UTF-8 does not allow unpaired surrogates, so this could actually happen
448 logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line, u_errorName(status));
449 return TRUE; // not a failure of the Regex engine
450 }
451 status = U_ZERO_ERROR; // buffer overflow
452 textChars = new char[inputUTF8Length+1];
453 unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
454 utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
455
456 REMatcher = &REPattern->matcher(status)->reset(&inputText);
457 if (U_FAILURE(status)) {
458 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
459 line, u_errorName(status));
460 return FALSE;
461 }
462
463 UBool actualmatch;
464 actualmatch = REMatcher->lookingAt(status);
465 if (U_FAILURE(status)) {
466 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
467 line, u_errorName(status));
468 retVal = FALSE;
469 }
470 if (actualmatch != looking) {
471 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
472 retVal = FALSE;
473 }
474
475 status = U_ZERO_ERROR;
476 actualmatch = REMatcher->matches(status);
477 if (U_FAILURE(status)) {
478 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
479 line, u_errorName(status));
480 retVal = FALSE;
481 }
482 if (actualmatch != match) {
483 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
484 retVal = FALSE;
485 }
486
487 if (retVal == FALSE) {
488 RegexPatternDump(REPattern);
489 }
490
491 delete REPattern;
492 delete REMatcher;
493 utext_close(&inputText);
494 utext_close(&pattern);
495 delete[] textChars;
496 return retVal;
497 }
498
499
500
501 //---------------------------------------------------------------------------
502 //
503 // REGEX_ERR Macro + invocation function to simplify writing tests
504 // regex tests for incorrect patterns
505 //
506 // usage:
507 // REGEX_ERR("pattern", expected error line, column, expected status);
508 //
509 //---------------------------------------------------------------------------
510 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
511
regex_err(const char * pat,int32_t errLine,int32_t errCol,UErrorCode expectedStatus,int32_t line)512 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
513 UErrorCode expectedStatus, int32_t line) {
514 UnicodeString pattern(pat);
515
516 UErrorCode status = U_ZERO_ERROR;
517 UParseError pe;
518 RegexPattern *callerPattern = NULL;
519
520 //
521 // Compile the caller's pattern
522 //
523 UnicodeString patString(pat);
524 callerPattern = RegexPattern::compile(patString, 0, pe, status);
525 if (status != expectedStatus) {
526 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
527 } else {
528 if (status != U_ZERO_ERROR) {
529 if (pe.line != errLine || pe.offset != errCol) {
530 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
531 line, errLine, errCol, pe.line, pe.offset);
532 }
533 }
534 }
535
536 delete callerPattern;
537
538 //
539 // Compile again, using a UTF-8-based UText
540 //
541 UText patternText = UTEXT_INITIALIZER;
542 regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
543 callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
544 if (status != expectedStatus) {
545 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
546 } else {
547 if (status != U_ZERO_ERROR) {
548 if (pe.line != errLine || pe.offset != errCol) {
549 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
550 line, errLine, errCol, pe.line, pe.offset);
551 }
552 }
553 }
554
555 delete callerPattern;
556 utext_close(&patternText);
557 }
558
559
560
561 //---------------------------------------------------------------------------
562 //
563 // Basic Check for basic functionality of regex pattern matching.
564 // Avoid the use of REGEX_FIND test macro, which has
565 // substantial dependencies on basic Regex functionality.
566 //
567 //---------------------------------------------------------------------------
Basic()568 void RegexTest::Basic() {
569
570
571 //
572 // Debug - slide failing test cases early
573 //
574 #if 0
575 {
576 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
577 UParseError pe;
578 UErrorCode status = U_ZERO_ERROR;
579 RegexPattern *pattern;
580 pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
581 RegexPatternDump(pattern);
582 RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
583 UBool result = m->find();
584 printf("result = %d\n", result);
585 // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
586 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
587 }
588 exit(1);
589 #endif
590
591
592 //
593 // Pattern with parentheses
594 //
595 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
596 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
597 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);
598
599 //
600 // Patterns with *
601 //
602 REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
603 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
604 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
605 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
606 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
607
608 REGEX_TESTLM("a*", "", TRUE, TRUE);
609 REGEX_TESTLM("a*", "b", TRUE, FALSE);
610
611
612 //
613 // Patterns with "."
614 //
615 REGEX_TESTLM(".", "abc", TRUE, FALSE);
616 REGEX_TESTLM("...", "abc", TRUE, TRUE);
617 REGEX_TESTLM("....", "abc", FALSE, FALSE);
618 REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
619 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
620 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
621 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
622 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
623
624 //
625 // Patterns with * applied to chars at end of literal string
626 //
627 REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
628 REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
629
630 //
631 // Supplemental chars match as single chars, not a pair of surrogates.
632 //
633 REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
634 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
635 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
636
637
638 //
639 // UnicodeSets in the pattern
640 //
641 REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
642 REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
643 REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
644 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
645 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
646 REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
647
648 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
649 REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
650 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
651 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurences.
652 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);
653
654 //
655 // OR operator in patterns
656 //
657 REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
658 REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
659 REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
660 REGEX_TESTLM("a|b", "b", TRUE, TRUE);
661
662 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
663 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
664 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
665 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
666 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
667 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
668
669 //
670 // +
671 //
672 REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
673 REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
674 REGEX_TESTLM("b+", "", FALSE, FALSE);
675 REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
676 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
677 REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
678
679 //
680 // ?
681 //
682 REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
683 REGEX_TESTLM("ab?", "a", TRUE, TRUE);
684 REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
685 REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
686 REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
687 REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
688 REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
689 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
690 REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
691
692 //
693 // Escape sequences that become single literal chars, handled internally
694 // by ICU's Unescape.
695 //
696
697 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
698 REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
699 REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
700 REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
701 REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
702 REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
703 REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
704 REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
705 REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
706 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
707
708 REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
709 REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input
710
711 // Escape of special chars in patterns
712 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
713 }
714
715
716 //---------------------------------------------------------------------------
717 //
718 // UTextBasic Check for quirks that are specific to the UText
719 // implementation.
720 //
721 //---------------------------------------------------------------------------
UTextBasic()722 void RegexTest::UTextBasic() {
723 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
724 UErrorCode status = U_ZERO_ERROR;
725 UText pattern = UTEXT_INITIALIZER;
726 utext_openUTF8(&pattern, str_abc, -1, &status);
727 RegexMatcher matcher(&pattern, 0, status);
728 REGEX_CHECK_STATUS;
729
730 UText input = UTEXT_INITIALIZER;
731 utext_openUTF8(&input, str_abc, -1, &status);
732 REGEX_CHECK_STATUS;
733 matcher.reset(&input);
734 REGEX_CHECK_STATUS;
735 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
736
737 matcher.reset(matcher.inputText());
738 REGEX_CHECK_STATUS;
739 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
740
741 utext_close(&pattern);
742 utext_close(&input);
743 }
744
745
746 //---------------------------------------------------------------------------
747 //
748 // API_Match Test that the API for class RegexMatcher
749 // is present and nominally working, but excluding functions
750 // implementing replace operations.
751 //
752 //---------------------------------------------------------------------------
API_Match()753 void RegexTest::API_Match() {
754 UParseError pe;
755 UErrorCode status=U_ZERO_ERROR;
756 int32_t flags = 0;
757
758 //
759 // Debug - slide failing test cases early
760 //
761 #if 0
762 {
763 }
764 return;
765 #endif
766
767 //
768 // Simple pattern compilation
769 //
770 {
771 UnicodeString re("abc");
772 RegexPattern *pat2;
773 pat2 = RegexPattern::compile(re, flags, pe, status);
774 REGEX_CHECK_STATUS;
775
776 UnicodeString inStr1 = "abcdef this is a test";
777 UnicodeString instr2 = "not abc";
778 UnicodeString empty = "";
779
780
781 //
782 // Matcher creation and reset.
783 //
784 RegexMatcher *m1 = pat2->matcher(inStr1, status);
785 REGEX_CHECK_STATUS;
786 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
787 REGEX_ASSERT(m1->input() == inStr1);
788 m1->reset(instr2);
789 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
790 REGEX_ASSERT(m1->input() == instr2);
791 m1->reset(inStr1);
792 REGEX_ASSERT(m1->input() == inStr1);
793 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
794 m1->reset(empty);
795 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
796 REGEX_ASSERT(m1->input() == empty);
797 REGEX_ASSERT(&m1->pattern() == pat2);
798
799 //
800 // reset(pos, status)
801 //
802 m1->reset(inStr1);
803 m1->reset(4, status);
804 REGEX_CHECK_STATUS;
805 REGEX_ASSERT(m1->input() == inStr1);
806 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
807
808 m1->reset(-1, status);
809 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
810 status = U_ZERO_ERROR;
811
812 m1->reset(0, status);
813 REGEX_CHECK_STATUS;
814 status = U_ZERO_ERROR;
815
816 int32_t len = m1->input().length();
817 m1->reset(len-1, status);
818 REGEX_CHECK_STATUS;
819 status = U_ZERO_ERROR;
820
821 m1->reset(len, status);
822 REGEX_CHECK_STATUS;
823 status = U_ZERO_ERROR;
824
825 m1->reset(len+1, status);
826 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
827 status = U_ZERO_ERROR;
828
829 //
830 // match(pos, status)
831 //
832 m1->reset(instr2);
833 REGEX_ASSERT(m1->matches(4, status) == TRUE);
834 m1->reset();
835 REGEX_ASSERT(m1->matches(3, status) == FALSE);
836 m1->reset();
837 REGEX_ASSERT(m1->matches(5, status) == FALSE);
838 REGEX_ASSERT(m1->matches(4, status) == TRUE);
839 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
840 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
841
842 // Match() at end of string should fail, but should not
843 // be an error.
844 status = U_ZERO_ERROR;
845 len = m1->input().length();
846 REGEX_ASSERT(m1->matches(len, status) == FALSE);
847 REGEX_CHECK_STATUS;
848
849 // Match beyond end of string should fail with an error.
850 status = U_ZERO_ERROR;
851 REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
852 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
853
854 // Successful match at end of string.
855 {
856 status = U_ZERO_ERROR;
857 RegexMatcher m("A?", 0, status); // will match zero length string.
858 REGEX_CHECK_STATUS;
859 m.reset(inStr1);
860 len = inStr1.length();
861 REGEX_ASSERT(m.matches(len, status) == TRUE);
862 REGEX_CHECK_STATUS;
863 m.reset(empty);
864 REGEX_ASSERT(m.matches(0, status) == TRUE);
865 REGEX_CHECK_STATUS;
866 }
867
868
869 //
870 // lookingAt(pos, status)
871 //
872 status = U_ZERO_ERROR;
873 m1->reset(instr2); // "not abc"
874 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
875 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
876 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
877 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
878 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
879 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
880 status = U_ZERO_ERROR;
881 len = m1->input().length();
882 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
883 REGEX_CHECK_STATUS;
884 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
885 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
886
887 delete m1;
888 delete pat2;
889 }
890
891
892 //
893 // Capture Group.
894 // RegexMatcher::start();
895 // RegexMatcher::end();
896 // RegexMatcher::groupCount();
897 //
898 {
899 int32_t flags=0;
900 UParseError pe;
901 UErrorCode status=U_ZERO_ERROR;
902
903 UnicodeString re("01(23(45)67)(.*)");
904 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
905 REGEX_CHECK_STATUS;
906 UnicodeString data = "0123456789";
907
908 RegexMatcher *matcher = pat->matcher(data, status);
909 REGEX_CHECK_STATUS;
910 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
911 static const int32_t matchStarts[] = {0, 2, 4, 8};
912 static const int32_t matchEnds[] = {10, 8, 6, 10};
913 int32_t i;
914 for (i=0; i<4; i++) {
915 int32_t actualStart = matcher->start(i, status);
916 REGEX_CHECK_STATUS;
917 if (actualStart != matchStarts[i]) {
918 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
919 __LINE__, i, matchStarts[i], actualStart);
920 }
921 int32_t actualEnd = matcher->end(i, status);
922 REGEX_CHECK_STATUS;
923 if (actualEnd != matchEnds[i]) {
924 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
925 __LINE__, i, matchEnds[i], actualEnd);
926 }
927 }
928
929 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
930 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
931
932 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
933 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
934 matcher->reset();
935 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
936
937 matcher->lookingAt(status);
938 REGEX_ASSERT(matcher->group(status) == "0123456789");
939 REGEX_ASSERT(matcher->group(0, status) == "0123456789");
940 REGEX_ASSERT(matcher->group(1, status) == "234567" );
941 REGEX_ASSERT(matcher->group(2, status) == "45" );
942 REGEX_ASSERT(matcher->group(3, status) == "89" );
943 REGEX_CHECK_STATUS;
944 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
945 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
946 matcher->reset();
947 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
948
949 delete matcher;
950 delete pat;
951
952 }
953
954 //
955 // find
956 //
957 {
958 int32_t flags=0;
959 UParseError pe;
960 UErrorCode status=U_ZERO_ERROR;
961
962 UnicodeString re("abc");
963 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
964 REGEX_CHECK_STATUS;
965 UnicodeString data = ".abc..abc...abc..";
966 // 012345678901234567
967
968 RegexMatcher *matcher = pat->matcher(data, status);
969 REGEX_CHECK_STATUS;
970 REGEX_ASSERT(matcher->find());
971 REGEX_ASSERT(matcher->start(status) == 1);
972 REGEX_ASSERT(matcher->find());
973 REGEX_ASSERT(matcher->start(status) == 6);
974 REGEX_ASSERT(matcher->find());
975 REGEX_ASSERT(matcher->start(status) == 12);
976 REGEX_ASSERT(matcher->find() == FALSE);
977 REGEX_ASSERT(matcher->find() == FALSE);
978
979 matcher->reset();
980 REGEX_ASSERT(matcher->find());
981 REGEX_ASSERT(matcher->start(status) == 1);
982
983 REGEX_ASSERT(matcher->find(0, status));
984 REGEX_ASSERT(matcher->start(status) == 1);
985 REGEX_ASSERT(matcher->find(1, status));
986 REGEX_ASSERT(matcher->start(status) == 1);
987 REGEX_ASSERT(matcher->find(2, status));
988 REGEX_ASSERT(matcher->start(status) == 6);
989 REGEX_ASSERT(matcher->find(12, status));
990 REGEX_ASSERT(matcher->start(status) == 12);
991 REGEX_ASSERT(matcher->find(13, status) == FALSE);
992 REGEX_ASSERT(matcher->find(16, status) == FALSE);
993 REGEX_ASSERT(matcher->find(17, status) == FALSE);
994 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
995
996 status = U_ZERO_ERROR;
997 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
998 status = U_ZERO_ERROR;
999 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
1000
1001 REGEX_ASSERT(matcher->groupCount() == 0);
1002
1003 delete matcher;
1004 delete pat;
1005 }
1006
1007
1008 //
1009 // find, with \G in pattern (true if at the end of a previous match).
1010 //
1011 {
1012 int32_t flags=0;
1013 UParseError pe;
1014 UErrorCode status=U_ZERO_ERROR;
1015
1016 UnicodeString re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
1017 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1018 REGEX_CHECK_STATUS;
1019 UnicodeString data = ".abcabc.abc..";
1020 // 012345678901234567
1021
1022 RegexMatcher *matcher = pat->matcher(data, status);
1023 REGEX_CHECK_STATUS;
1024 REGEX_ASSERT(matcher->find());
1025 REGEX_ASSERT(matcher->start(status) == 0);
1026 REGEX_ASSERT(matcher->start(1, status) == -1);
1027 REGEX_ASSERT(matcher->start(2, status) == 1);
1028
1029 REGEX_ASSERT(matcher->find());
1030 REGEX_ASSERT(matcher->start(status) == 4);
1031 REGEX_ASSERT(matcher->start(1, status) == 4);
1032 REGEX_ASSERT(matcher->start(2, status) == -1);
1033 REGEX_CHECK_STATUS;
1034
1035 delete matcher;
1036 delete pat;
1037 }
1038
1039 //
1040 // find with zero length matches, match position should bump ahead
1041 // to prevent loops.
1042 //
1043 {
1044 int32_t i;
1045 UErrorCode status=U_ZERO_ERROR;
1046 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
1047 // using an always-true look-ahead.
1048 REGEX_CHECK_STATUS;
1049 UnicodeString s(" ");
1050 m.reset(s);
1051 for (i=0; ; i++) {
1052 if (m.find() == FALSE) {
1053 break;
1054 }
1055 REGEX_ASSERT(m.start(status) == i);
1056 REGEX_ASSERT(m.end(status) == i);
1057 }
1058 REGEX_ASSERT(i==5);
1059
1060 // Check that the bump goes over surrogate pairs OK
1061 s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1062 s = s.unescape();
1063 m.reset(s);
1064 for (i=0; ; i+=2) {
1065 if (m.find() == FALSE) {
1066 break;
1067 }
1068 REGEX_ASSERT(m.start(status) == i);
1069 REGEX_ASSERT(m.end(status) == i);
1070 }
1071 REGEX_ASSERT(i==10);
1072 }
1073 {
1074 // find() loop breaking test.
1075 // with pattern of /.?/, should see a series of one char matches, then a single
1076 // match of zero length at the end of the input string.
1077 int32_t i;
1078 UErrorCode status=U_ZERO_ERROR;
1079 RegexMatcher m(".?", 0, status);
1080 REGEX_CHECK_STATUS;
1081 UnicodeString s(" ");
1082 m.reset(s);
1083 for (i=0; ; i++) {
1084 if (m.find() == FALSE) {
1085 break;
1086 }
1087 REGEX_ASSERT(m.start(status) == i);
1088 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1089 }
1090 REGEX_ASSERT(i==5);
1091 }
1092
1093
1094 //
1095 // Matchers with no input string behave as if they had an empty input string.
1096 //
1097
1098 {
1099 UErrorCode status = U_ZERO_ERROR;
1100 RegexMatcher m(".?", 0, status);
1101 REGEX_CHECK_STATUS;
1102 REGEX_ASSERT(m.find());
1103 REGEX_ASSERT(m.start(status) == 0);
1104 REGEX_ASSERT(m.input() == "");
1105 }
1106 {
1107 UErrorCode status = U_ZERO_ERROR;
1108 RegexPattern *p = RegexPattern::compile(".", 0, status);
1109 RegexMatcher *m = p->matcher(status);
1110 REGEX_CHECK_STATUS;
1111
1112 REGEX_ASSERT(m->find() == FALSE);
1113 REGEX_ASSERT(m->input() == "");
1114 delete m;
1115 delete p;
1116 }
1117
1118 //
1119 // Regions
1120 //
1121 {
1122 UErrorCode status = U_ZERO_ERROR;
1123 UnicodeString testString("This is test data");
1124 RegexMatcher m(".*", testString, 0, status);
1125 REGEX_CHECK_STATUS;
1126 REGEX_ASSERT(m.regionStart() == 0);
1127 REGEX_ASSERT(m.regionEnd() == testString.length());
1128 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1129 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1130
1131 m.region(2,4, status);
1132 REGEX_CHECK_STATUS;
1133 REGEX_ASSERT(m.matches(status));
1134 REGEX_ASSERT(m.start(status)==2);
1135 REGEX_ASSERT(m.end(status)==4);
1136 REGEX_CHECK_STATUS;
1137
1138 m.reset();
1139 REGEX_ASSERT(m.regionStart() == 0);
1140 REGEX_ASSERT(m.regionEnd() == testString.length());
1141
1142 UnicodeString shorterString("short");
1143 m.reset(shorterString);
1144 REGEX_ASSERT(m.regionStart() == 0);
1145 REGEX_ASSERT(m.regionEnd() == shorterString.length());
1146
1147 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1148 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1149 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1150 REGEX_ASSERT(&m == &m.reset());
1151 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1152
1153 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1154 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1155 REGEX_ASSERT(&m == &m.reset());
1156 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1157
1158 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1159 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1160 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1161 REGEX_ASSERT(&m == &m.reset());
1162 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1163
1164 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1165 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1166 REGEX_ASSERT(&m == &m.reset());
1167 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1168
1169 }
1170
1171 //
1172 // hitEnd() and requireEnd()
1173 //
1174 {
1175 UErrorCode status = U_ZERO_ERROR;
1176 UnicodeString testString("aabb");
1177 RegexMatcher m1(".*", testString, 0, status);
1178 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1179 REGEX_ASSERT(m1.hitEnd() == TRUE);
1180 REGEX_ASSERT(m1.requireEnd() == FALSE);
1181 REGEX_CHECK_STATUS;
1182
1183 status = U_ZERO_ERROR;
1184 RegexMatcher m2("a*", testString, 0, status);
1185 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1186 REGEX_ASSERT(m2.hitEnd() == FALSE);
1187 REGEX_ASSERT(m2.requireEnd() == FALSE);
1188 REGEX_CHECK_STATUS;
1189
1190 status = U_ZERO_ERROR;
1191 RegexMatcher m3(".*$", testString, 0, status);
1192 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1193 REGEX_ASSERT(m3.hitEnd() == TRUE);
1194 REGEX_ASSERT(m3.requireEnd() == TRUE);
1195 REGEX_CHECK_STATUS;
1196 }
1197
1198
1199 //
1200 // Compilation error on reset with UChar *
1201 // These were a hazard that people were stumbling over with runtime errors.
1202 // Changed them to compiler errors by adding private methods that more closely
1203 // matched the incorrect use of the functions.
1204 //
1205 #if 0
1206 {
1207 UErrorCode status = U_ZERO_ERROR;
1208 UChar ucharString[20];
1209 RegexMatcher m(".", 0, status);
1210 m.reset(ucharString); // should not compile.
1211
1212 RegexPattern *p = RegexPattern::compile(".", 0, status);
1213 RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile.
1214
1215 RegexMatcher m3(".", ucharString, 0, status); // Should not compile
1216 }
1217 #endif
1218
1219 //
1220 // Time Outs.
1221 // Note: These tests will need to be changed when the regexp engine is
1222 // able to detect and cut short the exponential time behavior on
1223 // this type of match.
1224 //
1225 {
1226 UErrorCode status = U_ZERO_ERROR;
1227 // Enough 'a's in the string to cause the match to time out.
1228 // (Each additional 'a' doubles the time)
1229 UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1230 RegexMatcher matcher("(a+)+b", testString, 0, status);
1231 REGEX_CHECK_STATUS;
1232 REGEX_ASSERT(matcher.getTimeLimit() == 0);
1233 matcher.setTimeLimit(100, status);
1234 REGEX_ASSERT(matcher.getTimeLimit() == 100);
1235 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1236 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1237 }
1238 {
1239 UErrorCode status = U_ZERO_ERROR;
1240 // Few enough 'a's to slip in under the time limit.
1241 UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1242 RegexMatcher matcher("(a+)+b", testString, 0, status);
1243 REGEX_CHECK_STATUS;
1244 matcher.setTimeLimit(100, status);
1245 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1246 REGEX_CHECK_STATUS;
1247 }
1248
1249 //
1250 // Stack Limits
1251 //
1252 {
1253 UErrorCode status = U_ZERO_ERROR;
1254 UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
1255
1256 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1257 // of the '+', and makes the stack frames larger.
1258 RegexMatcher matcher("(A)+A$", testString, 0, status);
1259
1260 // With the default stack, this match should fail to run
1261 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1262 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1263
1264 // With unlimited stack, it should run
1265 status = U_ZERO_ERROR;
1266 matcher.setStackLimit(0, status);
1267 REGEX_CHECK_STATUS;
1268 REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1269 REGEX_CHECK_STATUS;
1270 REGEX_ASSERT(matcher.getStackLimit() == 0);
1271
1272 // With a limited stack, the match should fail
1273 status = U_ZERO_ERROR;
1274 matcher.setStackLimit(10000, status);
1275 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1276 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1277 REGEX_ASSERT(matcher.getStackLimit() == 10000);
1278 }
1279
1280 // A pattern that doesn't save state should work with
1281 // a minimal sized stack
1282 {
1283 UErrorCode status = U_ZERO_ERROR;
1284 UnicodeString testString = "abc";
1285 RegexMatcher matcher("abc", testString, 0, status);
1286 REGEX_CHECK_STATUS;
1287 matcher.setStackLimit(30, status);
1288 REGEX_CHECK_STATUS;
1289 REGEX_ASSERT(matcher.matches(status) == TRUE);
1290 REGEX_CHECK_STATUS;
1291 REGEX_ASSERT(matcher.getStackLimit() == 30);
1292
1293 // Negative stack sizes should fail
1294 status = U_ZERO_ERROR;
1295 matcher.setStackLimit(1000, status);
1296 REGEX_CHECK_STATUS;
1297 matcher.setStackLimit(-1, status);
1298 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1299 REGEX_ASSERT(matcher.getStackLimit() == 1000);
1300 }
1301
1302
1303 }
1304
1305
1306
1307
1308
1309
1310 //---------------------------------------------------------------------------
1311 //
1312 // API_Replace API test for class RegexMatcher, testing the
1313 // Replace family of functions.
1314 //
1315 //---------------------------------------------------------------------------
API_Replace()1316 void RegexTest::API_Replace() {
1317 //
1318 // Replace
1319 //
1320 int32_t flags=0;
1321 UParseError pe;
1322 UErrorCode status=U_ZERO_ERROR;
1323
1324 UnicodeString re("abc");
1325 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1326 REGEX_CHECK_STATUS;
1327 UnicodeString data = ".abc..abc...abc..";
1328 // 012345678901234567
1329 RegexMatcher *matcher = pat->matcher(data, status);
1330
1331 //
1332 // Plain vanilla matches.
1333 //
1334 UnicodeString dest;
1335 dest = matcher->replaceFirst("yz", status);
1336 REGEX_CHECK_STATUS;
1337 REGEX_ASSERT(dest == ".yz..abc...abc..");
1338
1339 dest = matcher->replaceAll("yz", status);
1340 REGEX_CHECK_STATUS;
1341 REGEX_ASSERT(dest == ".yz..yz...yz..");
1342
1343 //
1344 // Plain vanilla non-matches.
1345 //
1346 UnicodeString d2 = ".abx..abx...abx..";
1347 matcher->reset(d2);
1348 dest = matcher->replaceFirst("yz", status);
1349 REGEX_CHECK_STATUS;
1350 REGEX_ASSERT(dest == ".abx..abx...abx..");
1351
1352 dest = matcher->replaceAll("yz", status);
1353 REGEX_CHECK_STATUS;
1354 REGEX_ASSERT(dest == ".abx..abx...abx..");
1355
1356 //
1357 // Empty source string
1358 //
1359 UnicodeString d3 = "";
1360 matcher->reset(d3);
1361 dest = matcher->replaceFirst("yz", status);
1362 REGEX_CHECK_STATUS;
1363 REGEX_ASSERT(dest == "");
1364
1365 dest = matcher->replaceAll("yz", status);
1366 REGEX_CHECK_STATUS;
1367 REGEX_ASSERT(dest == "");
1368
1369 //
1370 // Empty substitution string
1371 //
1372 matcher->reset(data); // ".abc..abc...abc.."
1373 dest = matcher->replaceFirst("", status);
1374 REGEX_CHECK_STATUS;
1375 REGEX_ASSERT(dest == "...abc...abc..");
1376
1377 dest = matcher->replaceAll("", status);
1378 REGEX_CHECK_STATUS;
1379 REGEX_ASSERT(dest == "........");
1380
1381 //
1382 // match whole string
1383 //
1384 UnicodeString d4 = "abc";
1385 matcher->reset(d4);
1386 dest = matcher->replaceFirst("xyz", status);
1387 REGEX_CHECK_STATUS;
1388 REGEX_ASSERT(dest == "xyz");
1389
1390 dest = matcher->replaceAll("xyz", status);
1391 REGEX_CHECK_STATUS;
1392 REGEX_ASSERT(dest == "xyz");
1393
1394 //
1395 // Capture Group, simple case
1396 //
1397 UnicodeString re2("a(..)");
1398 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1399 REGEX_CHECK_STATUS;
1400 UnicodeString d5 = "abcdefg";
1401 RegexMatcher *matcher2 = pat2->matcher(d5, status);
1402 REGEX_CHECK_STATUS;
1403 dest = matcher2->replaceFirst("$1$1", status);
1404 REGEX_CHECK_STATUS;
1405 REGEX_ASSERT(dest == "bcbcdefg");
1406
1407 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1408 REGEX_CHECK_STATUS;
1409 REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1410
1411 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1412 REGEX_CHECK_STATUS;
1413 REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
1414
1415 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1416 replacement = replacement.unescape();
1417 dest = matcher2->replaceFirst(replacement, status);
1418 REGEX_CHECK_STATUS;
1419 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1420
1421 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1422
1423
1424 //
1425 // Replacement String with \u hex escapes
1426 //
1427 {
1428 UnicodeString src = "abc 1 abc 2 abc 3";
1429 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1430 matcher->reset(src);
1431 UnicodeString result = matcher->replaceAll(substitute, status);
1432 REGEX_CHECK_STATUS;
1433 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1434 }
1435 {
1436 UnicodeString src = "abc !";
1437 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1438 matcher->reset(src);
1439 UnicodeString result = matcher->replaceAll(substitute, status);
1440 REGEX_CHECK_STATUS;
1441 UnicodeString expected = UnicodeString("--");
1442 expected.append((UChar32)0x10000);
1443 expected.append("-- !");
1444 REGEX_ASSERT(result == expected);
1445 }
1446 // TODO: need more through testing of capture substitutions.
1447
1448 // Bug 4057
1449 //
1450 {
1451 status = U_ZERO_ERROR;
1452 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1453 RegexMatcher m("ss(.*?)ee", 0, status);
1454 REGEX_CHECK_STATUS;
1455 UnicodeString result;
1456
1457 // Multiple finds do NOT bump up the previous appendReplacement postion.
1458 m.reset(s);
1459 m.find();
1460 m.find();
1461 m.appendReplacement(result, "ooh", status);
1462 REGEX_CHECK_STATUS;
1463 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1464
1465 // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1466 status = U_ZERO_ERROR;
1467 result.truncate(0);
1468 m.reset(10, status);
1469 m.find();
1470 m.find();
1471 m.appendReplacement(result, "ooh", status);
1472 REGEX_CHECK_STATUS;
1473 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1474
1475 // find() at interior of string, appendReplacemnt still starts at beginning.
1476 status = U_ZERO_ERROR;
1477 result.truncate(0);
1478 m.reset();
1479 m.find(10, status);
1480 m.find();
1481 m.appendReplacement(result, "ooh", status);
1482 REGEX_CHECK_STATUS;
1483 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1484
1485 m.appendTail(result);
1486 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1487
1488 }
1489
1490 delete matcher2;
1491 delete pat2;
1492 delete matcher;
1493 delete pat;
1494 }
1495
1496
1497 //---------------------------------------------------------------------------
1498 //
1499 // API_Pattern Test that the API for class RegexPattern is
1500 // present and nominally working.
1501 //
1502 //---------------------------------------------------------------------------
API_Pattern()1503 void RegexTest::API_Pattern() {
1504 RegexPattern pata; // Test default constructor to not crash.
1505 RegexPattern patb;
1506
1507 REGEX_ASSERT(pata == patb);
1508 REGEX_ASSERT(pata == pata);
1509
1510 UnicodeString re1("abc[a-l][m-z]");
1511 UnicodeString re2("def");
1512 UErrorCode status = U_ZERO_ERROR;
1513 UParseError pe;
1514
1515 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status);
1516 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status);
1517 REGEX_CHECK_STATUS;
1518 REGEX_ASSERT(*pat1 == *pat1);
1519 REGEX_ASSERT(*pat1 != pata);
1520
1521 // Assign
1522 patb = *pat1;
1523 REGEX_ASSERT(patb == *pat1);
1524
1525 // Copy Construct
1526 RegexPattern patc(*pat1);
1527 REGEX_ASSERT(patc == *pat1);
1528 REGEX_ASSERT(patb == patc);
1529 REGEX_ASSERT(pat1 != pat2);
1530 patb = *pat2;
1531 REGEX_ASSERT(patb != patc);
1532 REGEX_ASSERT(patb == *pat2);
1533
1534 // Compile with no flags.
1535 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status);
1536 REGEX_ASSERT(*pat1a == *pat1);
1537
1538 REGEX_ASSERT(pat1a->flags() == 0);
1539
1540 // Compile with different flags should be not equal
1541 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1542 REGEX_CHECK_STATUS;
1543
1544 REGEX_ASSERT(*pat1b != *pat1a);
1545 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1546 REGEX_ASSERT(pat1a->flags() == 0);
1547 delete pat1b;
1548
1549 // clone
1550 RegexPattern *pat1c = pat1->clone();
1551 REGEX_ASSERT(*pat1c == *pat1);
1552 REGEX_ASSERT(*pat1c != *pat2);
1553
1554 delete pat1c;
1555 delete pat1a;
1556 delete pat1;
1557 delete pat2;
1558
1559
1560 //
1561 // Verify that a matcher created from a cloned pattern works.
1562 // (Jitterbug 3423)
1563 //
1564 {
1565 UErrorCode status = U_ZERO_ERROR;
1566 RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1567 RegexPattern *pClone = pSource->clone();
1568 delete pSource;
1569 RegexMatcher *mFromClone = pClone->matcher(status);
1570 REGEX_CHECK_STATUS;
1571 UnicodeString s = "Hello World";
1572 mFromClone->reset(s);
1573 REGEX_ASSERT(mFromClone->find() == TRUE);
1574 REGEX_ASSERT(mFromClone->group(status) == "Hello");
1575 REGEX_ASSERT(mFromClone->find() == TRUE);
1576 REGEX_ASSERT(mFromClone->group(status) == "World");
1577 REGEX_ASSERT(mFromClone->find() == FALSE);
1578 delete mFromClone;
1579 delete pClone;
1580 }
1581
1582 //
1583 // matches convenience API
1584 //
1585 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1586 REGEX_CHECK_STATUS;
1587 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1588 REGEX_CHECK_STATUS;
1589 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1590 REGEX_CHECK_STATUS;
1591 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1592 REGEX_CHECK_STATUS;
1593 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1594 REGEX_CHECK_STATUS;
1595 status = U_INDEX_OUTOFBOUNDS_ERROR;
1596 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1597 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1598
1599
1600 //
1601 // Split()
1602 //
1603 status = U_ZERO_ERROR;
1604 pat1 = RegexPattern::compile(" +", pe, status);
1605 REGEX_CHECK_STATUS;
1606 UnicodeString fields[10];
1607
1608 int32_t n;
1609 n = pat1->split("Now is the time", fields, 10, status);
1610 REGEX_CHECK_STATUS;
1611 REGEX_ASSERT(n==4);
1612 REGEX_ASSERT(fields[0]=="Now");
1613 REGEX_ASSERT(fields[1]=="is");
1614 REGEX_ASSERT(fields[2]=="the");
1615 REGEX_ASSERT(fields[3]=="time");
1616 REGEX_ASSERT(fields[4]=="");
1617
1618 n = pat1->split("Now is the time", fields, 2, status);
1619 REGEX_CHECK_STATUS;
1620 REGEX_ASSERT(n==2);
1621 REGEX_ASSERT(fields[0]=="Now");
1622 REGEX_ASSERT(fields[1]=="is the time");
1623 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
1624
1625 fields[1] = "*";
1626 status = U_ZERO_ERROR;
1627 n = pat1->split("Now is the time", fields, 1, status);
1628 REGEX_CHECK_STATUS;
1629 REGEX_ASSERT(n==1);
1630 REGEX_ASSERT(fields[0]=="Now is the time");
1631 REGEX_ASSERT(fields[1]=="*");
1632 status = U_ZERO_ERROR;
1633
1634 n = pat1->split(" Now is the time ", fields, 10, status);
1635 REGEX_CHECK_STATUS;
1636 REGEX_ASSERT(n==6);
1637 REGEX_ASSERT(fields[0]=="");
1638 REGEX_ASSERT(fields[1]=="Now");
1639 REGEX_ASSERT(fields[2]=="is");
1640 REGEX_ASSERT(fields[3]=="the");
1641 REGEX_ASSERT(fields[4]=="time");
1642 REGEX_ASSERT(fields[5]=="");
1643
1644 n = pat1->split(" ", fields, 10, status);
1645 REGEX_CHECK_STATUS;
1646 REGEX_ASSERT(n==2);
1647 REGEX_ASSERT(fields[0]=="");
1648 REGEX_ASSERT(fields[1]=="");
1649
1650 fields[0] = "foo";
1651 n = pat1->split("", fields, 10, status);
1652 REGEX_CHECK_STATUS;
1653 REGEX_ASSERT(n==0);
1654 REGEX_ASSERT(fields[0]=="foo");
1655
1656 delete pat1;
1657
1658 // split, with a pattern with (capture)
1659 pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status);
1660 REGEX_CHECK_STATUS;
1661
1662 status = U_ZERO_ERROR;
1663 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1664 REGEX_CHECK_STATUS;
1665 REGEX_ASSERT(n==7);
1666 REGEX_ASSERT(fields[0]=="");
1667 REGEX_ASSERT(fields[1]=="a");
1668 REGEX_ASSERT(fields[2]=="Now is ");
1669 REGEX_ASSERT(fields[3]=="b");
1670 REGEX_ASSERT(fields[4]=="the time");
1671 REGEX_ASSERT(fields[5]=="c");
1672 REGEX_ASSERT(fields[6]=="");
1673 REGEX_ASSERT(status==U_ZERO_ERROR);
1674
1675 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
1676 REGEX_CHECK_STATUS;
1677 REGEX_ASSERT(n==7);
1678 REGEX_ASSERT(fields[0]==" ");
1679 REGEX_ASSERT(fields[1]=="a");
1680 REGEX_ASSERT(fields[2]=="Now is ");
1681 REGEX_ASSERT(fields[3]=="b");
1682 REGEX_ASSERT(fields[4]=="the time");
1683 REGEX_ASSERT(fields[5]=="c");
1684 REGEX_ASSERT(fields[6]=="");
1685
1686 status = U_ZERO_ERROR;
1687 fields[6] = "foo";
1688 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
1689 REGEX_CHECK_STATUS;
1690 REGEX_ASSERT(n==6);
1691 REGEX_ASSERT(fields[0]==" ");
1692 REGEX_ASSERT(fields[1]=="a");
1693 REGEX_ASSERT(fields[2]=="Now is ");
1694 REGEX_ASSERT(fields[3]=="b");
1695 REGEX_ASSERT(fields[4]=="the time");
1696 REGEX_ASSERT(fields[5]==""); // All text following "<c>" field delimiter.
1697 REGEX_ASSERT(fields[6]=="foo");
1698
1699 status = U_ZERO_ERROR;
1700 fields[5] = "foo";
1701 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
1702 REGEX_CHECK_STATUS;
1703 REGEX_ASSERT(n==5);
1704 REGEX_ASSERT(fields[0]==" ");
1705 REGEX_ASSERT(fields[1]=="a");
1706 REGEX_ASSERT(fields[2]=="Now is ");
1707 REGEX_ASSERT(fields[3]=="b");
1708 REGEX_ASSERT(fields[4]=="the time<c>");
1709 REGEX_ASSERT(fields[5]=="foo");
1710
1711 status = U_ZERO_ERROR;
1712 fields[5] = "foo";
1713 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
1714 REGEX_CHECK_STATUS;
1715 REGEX_ASSERT(n==5);
1716 REGEX_ASSERT(fields[0]==" ");
1717 REGEX_ASSERT(fields[1]=="a");
1718 REGEX_ASSERT(fields[2]=="Now is ");
1719 REGEX_ASSERT(fields[3]=="b");
1720 REGEX_ASSERT(fields[4]=="the time");
1721 REGEX_ASSERT(fields[5]=="foo");
1722
1723 status = U_ZERO_ERROR;
1724 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
1725 REGEX_CHECK_STATUS;
1726 REGEX_ASSERT(n==4);
1727 REGEX_ASSERT(fields[0]==" ");
1728 REGEX_ASSERT(fields[1]=="a");
1729 REGEX_ASSERT(fields[2]=="Now is ");
1730 REGEX_ASSERT(fields[3]=="the time<c>");
1731 status = U_ZERO_ERROR;
1732 delete pat1;
1733
1734 pat1 = RegexPattern::compile("([-,])", pe, status);
1735 REGEX_CHECK_STATUS;
1736 n = pat1->split("1-10,20", fields, 10, status);
1737 REGEX_CHECK_STATUS;
1738 REGEX_ASSERT(n==5);
1739 REGEX_ASSERT(fields[0]=="1");
1740 REGEX_ASSERT(fields[1]=="-");
1741 REGEX_ASSERT(fields[2]=="10");
1742 REGEX_ASSERT(fields[3]==",");
1743 REGEX_ASSERT(fields[4]=="20");
1744 delete pat1;
1745
1746 // Test split of string with empty trailing fields
1747 pat1 = RegexPattern::compile(",", pe, status);
1748 REGEX_CHECK_STATUS;
1749 n = pat1->split("a,b,c,", fields, 10, status);
1750 REGEX_CHECK_STATUS;
1751 REGEX_ASSERT(n==4);
1752 REGEX_ASSERT(fields[0]=="a");
1753 REGEX_ASSERT(fields[1]=="b");
1754 REGEX_ASSERT(fields[2]=="c");
1755 REGEX_ASSERT(fields[3]=="");
1756
1757 n = pat1->split("a,,,", fields, 10, status);
1758 REGEX_CHECK_STATUS;
1759 REGEX_ASSERT(n==4);
1760 REGEX_ASSERT(fields[0]=="a");
1761 REGEX_ASSERT(fields[1]=="");
1762 REGEX_ASSERT(fields[2]=="");
1763 REGEX_ASSERT(fields[3]=="");
1764 delete pat1;
1765
1766 // Split Separator with zero length match.
1767 pat1 = RegexPattern::compile(":?", pe, status);
1768 REGEX_CHECK_STATUS;
1769 n = pat1->split("abc", fields, 10, status);
1770 REGEX_CHECK_STATUS;
1771 REGEX_ASSERT(n==5);
1772 REGEX_ASSERT(fields[0]=="");
1773 REGEX_ASSERT(fields[1]=="a");
1774 REGEX_ASSERT(fields[2]=="b");
1775 REGEX_ASSERT(fields[3]=="c");
1776 REGEX_ASSERT(fields[4]=="");
1777
1778 delete pat1;
1779
1780 //
1781 // RegexPattern::pattern()
1782 //
1783 pat1 = new RegexPattern();
1784 REGEX_ASSERT(pat1->pattern() == "");
1785 delete pat1;
1786
1787 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1788 REGEX_CHECK_STATUS;
1789 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1790 delete pat1;
1791
1792
1793 //
1794 // classID functions
1795 //
1796 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1797 REGEX_CHECK_STATUS;
1798 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1799 REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1800 UnicodeString Hello("Hello, world.");
1801 RegexMatcher *m = pat1->matcher(Hello, status);
1802 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1803 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1804 REGEX_ASSERT(m->getDynamicClassID() != NULL);
1805 delete m;
1806 delete pat1;
1807
1808 }
1809
1810 //---------------------------------------------------------------------------
1811 //
1812 // API_Match_UTF8 Test that the alternate engine for class RegexMatcher
1813 // is present and working, but excluding functions
1814 // implementing replace operations.
1815 //
1816 //---------------------------------------------------------------------------
API_Match_UTF8()1817 void RegexTest::API_Match_UTF8() {
1818 UParseError pe;
1819 UErrorCode status=U_ZERO_ERROR;
1820 int32_t flags = 0;
1821
1822 //
1823 // Debug - slide failing test cases early
1824 //
1825 #if 0
1826 {
1827 }
1828 return;
1829 #endif
1830
1831 //
1832 // Simple pattern compilation
1833 //
1834 {
1835 UText re = UTEXT_INITIALIZER;
1836 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1837 REGEX_VERBOSE_TEXT(&re);
1838 RegexPattern *pat2;
1839 pat2 = RegexPattern::compile(&re, flags, pe, status);
1840 REGEX_CHECK_STATUS;
1841
1842 UText input1 = UTEXT_INITIALIZER;
1843 UText input2 = UTEXT_INITIALIZER;
1844 UText empty = UTEXT_INITIALIZER;
1845 regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1846 REGEX_VERBOSE_TEXT(&input1);
1847 regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1848 REGEX_VERBOSE_TEXT(&input2);
1849 utext_openUChars(&empty, NULL, 0, &status);
1850
1851 int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1852 int32_t input2Len = strlen("not abc");
1853
1854
1855 //
1856 // Matcher creation and reset.
1857 //
1858 RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1859 REGEX_CHECK_STATUS;
1860 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1861 const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1862 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1863 m1->reset(&input2);
1864 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1865 const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1866 REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1867 m1->reset(&input1);
1868 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1869 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1870 m1->reset(&empty);
1871 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1872 REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1873
1874 //
1875 // reset(pos, status)
1876 //
1877 m1->reset(&input1);
1878 m1->reset(4, status);
1879 REGEX_CHECK_STATUS;
1880 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1881 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1882
1883 m1->reset(-1, status);
1884 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1885 status = U_ZERO_ERROR;
1886
1887 m1->reset(0, status);
1888 REGEX_CHECK_STATUS;
1889 status = U_ZERO_ERROR;
1890
1891 m1->reset(input1Len-1, status);
1892 REGEX_CHECK_STATUS;
1893 status = U_ZERO_ERROR;
1894
1895 m1->reset(input1Len, status);
1896 REGEX_CHECK_STATUS;
1897 status = U_ZERO_ERROR;
1898
1899 m1->reset(input1Len+1, status);
1900 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1901 status = U_ZERO_ERROR;
1902
1903 //
1904 // match(pos, status)
1905 //
1906 m1->reset(&input2);
1907 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1908 m1->reset();
1909 REGEX_ASSERT(m1->matches(3, status) == FALSE);
1910 m1->reset();
1911 REGEX_ASSERT(m1->matches(5, status) == FALSE);
1912 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1913 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1914 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1915
1916 // Match() at end of string should fail, but should not
1917 // be an error.
1918 status = U_ZERO_ERROR;
1919 REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1920 REGEX_CHECK_STATUS;
1921
1922 // Match beyond end of string should fail with an error.
1923 status = U_ZERO_ERROR;
1924 REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1925 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1926
1927 // Successful match at end of string.
1928 {
1929 status = U_ZERO_ERROR;
1930 RegexMatcher m("A?", 0, status); // will match zero length string.
1931 REGEX_CHECK_STATUS;
1932 m.reset(&input1);
1933 REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1934 REGEX_CHECK_STATUS;
1935 m.reset(&empty);
1936 REGEX_ASSERT(m.matches(0, status) == TRUE);
1937 REGEX_CHECK_STATUS;
1938 }
1939
1940
1941 //
1942 // lookingAt(pos, status)
1943 //
1944 status = U_ZERO_ERROR;
1945 m1->reset(&input2); // "not abc"
1946 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1947 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1948 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1949 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1950 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1951 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1952 status = U_ZERO_ERROR;
1953 REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1954 REGEX_CHECK_STATUS;
1955 REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1956 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1957
1958 delete m1;
1959 delete pat2;
1960
1961 utext_close(&re);
1962 utext_close(&input1);
1963 utext_close(&input2);
1964 utext_close(&empty);
1965 }
1966
1967
1968 //
1969 // Capture Group.
1970 // RegexMatcher::start();
1971 // RegexMatcher::end();
1972 // RegexMatcher::groupCount();
1973 //
1974 {
1975 int32_t flags=0;
1976 UParseError pe;
1977 UErrorCode status=U_ZERO_ERROR;
1978 UText re=UTEXT_INITIALIZER;
1979 const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1980 utext_openUTF8(&re, str_01234567_pat, -1, &status);
1981
1982 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1983 REGEX_CHECK_STATUS;
1984
1985 UText input = UTEXT_INITIALIZER;
1986 const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1987 utext_openUTF8(&input, str_0123456789, -1, &status);
1988
1989 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
1990 REGEX_CHECK_STATUS;
1991 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
1992 static const int32_t matchStarts[] = {0, 2, 4, 8};
1993 static const int32_t matchEnds[] = {10, 8, 6, 10};
1994 int32_t i;
1995 for (i=0; i<4; i++) {
1996 int32_t actualStart = matcher->start(i, status);
1997 REGEX_CHECK_STATUS;
1998 if (actualStart != matchStarts[i]) {
1999 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
2000 __FILE__, __LINE__, i, matchStarts[i], actualStart);
2001 }
2002 int32_t actualEnd = matcher->end(i, status);
2003 REGEX_CHECK_STATUS;
2004 if (actualEnd != matchEnds[i]) {
2005 errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
2006 __FILE__, __LINE__, i, matchEnds[i], actualEnd);
2007 }
2008 }
2009
2010 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
2011 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
2012
2013 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2014 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2015 matcher->reset();
2016 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
2017
2018 matcher->lookingAt(status);
2019
2020 UnicodeString dest;
2021 UText destText = UTEXT_INITIALIZER;
2022 utext_openUnicodeString(&destText, &dest, &status);
2023 UText *result;
2024 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2025 // Test shallow-clone API
2026 int64_t group_len;
2027 result = matcher->group((UText *)NULL, group_len, status);
2028 REGEX_CHECK_STATUS;
2029 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2030 utext_close(result);
2031 result = matcher->group(0, &destText, group_len, status);
2032 REGEX_CHECK_STATUS;
2033 REGEX_ASSERT(result == &destText);
2034 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2035 // destText is now immutable, reopen it
2036 utext_close(&destText);
2037 utext_openUnicodeString(&destText, &dest, &status);
2038
2039 result = matcher->group(0, NULL, status);
2040 REGEX_CHECK_STATUS;
2041 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2042 utext_close(result);
2043 result = matcher->group(0, &destText, status);
2044 REGEX_CHECK_STATUS;
2045 REGEX_ASSERT(result == &destText);
2046 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2047
2048 result = matcher->group(1, NULL, status);
2049 REGEX_CHECK_STATUS;
2050 const char str_234567[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */
2051 REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
2052 utext_close(result);
2053 result = matcher->group(1, &destText, status);
2054 REGEX_CHECK_STATUS;
2055 REGEX_ASSERT(result == &destText);
2056 REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
2057
2058 result = matcher->group(2, NULL, status);
2059 REGEX_CHECK_STATUS;
2060 const char str_45[] = { 0x34, 0x35, 0x00 }; /* 45 */
2061 REGEX_ASSERT_UTEXT_UTF8(str_45, result);
2062 utext_close(result);
2063 result = matcher->group(2, &destText, status);
2064 REGEX_CHECK_STATUS;
2065 REGEX_ASSERT(result == &destText);
2066 REGEX_ASSERT_UTEXT_UTF8(str_45, result);
2067
2068 result = matcher->group(3, NULL, status);
2069 REGEX_CHECK_STATUS;
2070 const char str_89[] = { 0x38, 0x39, 0x00 }; /* 89 */
2071 REGEX_ASSERT_UTEXT_UTF8(str_89, result);
2072 utext_close(result);
2073 result = matcher->group(3, &destText, status);
2074 REGEX_CHECK_STATUS;
2075 REGEX_ASSERT(result == &destText);
2076 REGEX_ASSERT_UTEXT_UTF8(str_89, result);
2077
2078 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2079 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2080 matcher->reset();
2081 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2082
2083 delete matcher;
2084 delete pat;
2085
2086 utext_close(&destText);
2087 utext_close(&input);
2088 utext_close(&re);
2089 }
2090
2091 //
2092 // find
2093 //
2094 {
2095 int32_t flags=0;
2096 UParseError pe;
2097 UErrorCode status=U_ZERO_ERROR;
2098 UText re=UTEXT_INITIALIZER;
2099 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2100 utext_openUTF8(&re, str_abc, -1, &status);
2101
2102 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2103 REGEX_CHECK_STATUS;
2104 UText input = UTEXT_INITIALIZER;
2105 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2106 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2107 // 012345678901234567
2108
2109 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2110 REGEX_CHECK_STATUS;
2111 REGEX_ASSERT(matcher->find());
2112 REGEX_ASSERT(matcher->start(status) == 1);
2113 REGEX_ASSERT(matcher->find());
2114 REGEX_ASSERT(matcher->start(status) == 6);
2115 REGEX_ASSERT(matcher->find());
2116 REGEX_ASSERT(matcher->start(status) == 12);
2117 REGEX_ASSERT(matcher->find() == FALSE);
2118 REGEX_ASSERT(matcher->find() == FALSE);
2119
2120 matcher->reset();
2121 REGEX_ASSERT(matcher->find());
2122 REGEX_ASSERT(matcher->start(status) == 1);
2123
2124 REGEX_ASSERT(matcher->find(0, status));
2125 REGEX_ASSERT(matcher->start(status) == 1);
2126 REGEX_ASSERT(matcher->find(1, status));
2127 REGEX_ASSERT(matcher->start(status) == 1);
2128 REGEX_ASSERT(matcher->find(2, status));
2129 REGEX_ASSERT(matcher->start(status) == 6);
2130 REGEX_ASSERT(matcher->find(12, status));
2131 REGEX_ASSERT(matcher->start(status) == 12);
2132 REGEX_ASSERT(matcher->find(13, status) == FALSE);
2133 REGEX_ASSERT(matcher->find(16, status) == FALSE);
2134 REGEX_ASSERT(matcher->find(17, status) == FALSE);
2135 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2136
2137 status = U_ZERO_ERROR;
2138 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2139 status = U_ZERO_ERROR;
2140 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2141
2142 REGEX_ASSERT(matcher->groupCount() == 0);
2143
2144 delete matcher;
2145 delete pat;
2146
2147 utext_close(&input);
2148 utext_close(&re);
2149 }
2150
2151
2152 //
2153 // find, with \G in pattern (true if at the end of a previous match).
2154 //
2155 {
2156 int32_t flags=0;
2157 UParseError pe;
2158 UErrorCode status=U_ZERO_ERROR;
2159 UText re=UTEXT_INITIALIZER;
2160 const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2161 utext_openUTF8(&re, str_Gabcabc, -1, &status);
2162
2163 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2164
2165 REGEX_CHECK_STATUS;
2166 UText input = UTEXT_INITIALIZER;
2167 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2168 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2169 // 012345678901234567
2170
2171 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2172 REGEX_CHECK_STATUS;
2173 REGEX_ASSERT(matcher->find());
2174 REGEX_ASSERT(matcher->start(status) == 0);
2175 REGEX_ASSERT(matcher->start(1, status) == -1);
2176 REGEX_ASSERT(matcher->start(2, status) == 1);
2177
2178 REGEX_ASSERT(matcher->find());
2179 REGEX_ASSERT(matcher->start(status) == 4);
2180 REGEX_ASSERT(matcher->start(1, status) == 4);
2181 REGEX_ASSERT(matcher->start(2, status) == -1);
2182 REGEX_CHECK_STATUS;
2183
2184 delete matcher;
2185 delete pat;
2186
2187 utext_close(&input);
2188 utext_close(&re);
2189 }
2190
2191 //
2192 // find with zero length matches, match position should bump ahead
2193 // to prevent loops.
2194 //
2195 {
2196 int32_t i;
2197 UErrorCode status=U_ZERO_ERROR;
2198 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
2199 // using an always-true look-ahead.
2200 REGEX_CHECK_STATUS;
2201 UText s = UTEXT_INITIALIZER;
2202 utext_openUTF8(&s, " ", -1, &status);
2203 m.reset(&s);
2204 for (i=0; ; i++) {
2205 if (m.find() == FALSE) {
2206 break;
2207 }
2208 REGEX_ASSERT(m.start(status) == i);
2209 REGEX_ASSERT(m.end(status) == i);
2210 }
2211 REGEX_ASSERT(i==5);
2212
2213 // Check that the bump goes over characters outside the BMP OK
2214 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2215 unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2216 utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2217 m.reset(&s);
2218 for (i=0; ; i+=4) {
2219 if (m.find() == FALSE) {
2220 break;
2221 }
2222 REGEX_ASSERT(m.start(status) == i);
2223 REGEX_ASSERT(m.end(status) == i);
2224 }
2225 REGEX_ASSERT(i==20);
2226
2227 utext_close(&s);
2228 }
2229 {
2230 // find() loop breaking test.
2231 // with pattern of /.?/, should see a series of one char matches, then a single
2232 // match of zero length at the end of the input string.
2233 int32_t i;
2234 UErrorCode status=U_ZERO_ERROR;
2235 RegexMatcher m(".?", 0, status);
2236 REGEX_CHECK_STATUS;
2237 UText s = UTEXT_INITIALIZER;
2238 utext_openUTF8(&s, " ", -1, &status);
2239 m.reset(&s);
2240 for (i=0; ; i++) {
2241 if (m.find() == FALSE) {
2242 break;
2243 }
2244 REGEX_ASSERT(m.start(status) == i);
2245 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2246 }
2247 REGEX_ASSERT(i==5);
2248
2249 utext_close(&s);
2250 }
2251
2252
2253 //
2254 // Matchers with no input string behave as if they had an empty input string.
2255 //
2256
2257 {
2258 UErrorCode status = U_ZERO_ERROR;
2259 RegexMatcher m(".?", 0, status);
2260 REGEX_CHECK_STATUS;
2261 REGEX_ASSERT(m.find());
2262 REGEX_ASSERT(m.start(status) == 0);
2263 REGEX_ASSERT(m.input() == "");
2264 }
2265 {
2266 UErrorCode status = U_ZERO_ERROR;
2267 RegexPattern *p = RegexPattern::compile(".", 0, status);
2268 RegexMatcher *m = p->matcher(status);
2269 REGEX_CHECK_STATUS;
2270
2271 REGEX_ASSERT(m->find() == FALSE);
2272 REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2273 delete m;
2274 delete p;
2275 }
2276
2277 //
2278 // Regions
2279 //
2280 {
2281 UErrorCode status = U_ZERO_ERROR;
2282 UText testPattern = UTEXT_INITIALIZER;
2283 UText testText = UTEXT_INITIALIZER;
2284 regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2285 REGEX_VERBOSE_TEXT(&testPattern);
2286 regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2287 REGEX_VERBOSE_TEXT(&testText);
2288
2289 RegexMatcher m(&testPattern, &testText, 0, status);
2290 REGEX_CHECK_STATUS;
2291 REGEX_ASSERT(m.regionStart() == 0);
2292 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2293 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2294 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2295
2296 m.region(2,4, status);
2297 REGEX_CHECK_STATUS;
2298 REGEX_ASSERT(m.matches(status));
2299 REGEX_ASSERT(m.start(status)==2);
2300 REGEX_ASSERT(m.end(status)==4);
2301 REGEX_CHECK_STATUS;
2302
2303 m.reset();
2304 REGEX_ASSERT(m.regionStart() == 0);
2305 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2306
2307 regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2308 REGEX_VERBOSE_TEXT(&testText);
2309 m.reset(&testText);
2310 REGEX_ASSERT(m.regionStart() == 0);
2311 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2312
2313 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2314 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2315 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2316 REGEX_ASSERT(&m == &m.reset());
2317 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2318
2319 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2320 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2321 REGEX_ASSERT(&m == &m.reset());
2322 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2323
2324 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2325 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2326 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2327 REGEX_ASSERT(&m == &m.reset());
2328 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2329
2330 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2331 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2332 REGEX_ASSERT(&m == &m.reset());
2333 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2334
2335 utext_close(&testText);
2336 utext_close(&testPattern);
2337 }
2338
2339 //
2340 // hitEnd() and requireEnd()
2341 //
2342 {
2343 UErrorCode status = U_ZERO_ERROR;
2344 UText testPattern = UTEXT_INITIALIZER;
2345 UText testText = UTEXT_INITIALIZER;
2346 const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2347 const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2348 utext_openUTF8(&testPattern, str_, -1, &status);
2349 utext_openUTF8(&testText, str_aabb, -1, &status);
2350
2351 RegexMatcher m1(&testPattern, &testText, 0, status);
2352 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2353 REGEX_ASSERT(m1.hitEnd() == TRUE);
2354 REGEX_ASSERT(m1.requireEnd() == FALSE);
2355 REGEX_CHECK_STATUS;
2356
2357 status = U_ZERO_ERROR;
2358 const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2359 utext_openUTF8(&testPattern, str_a, -1, &status);
2360 RegexMatcher m2(&testPattern, &testText, 0, status);
2361 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2362 REGEX_ASSERT(m2.hitEnd() == FALSE);
2363 REGEX_ASSERT(m2.requireEnd() == FALSE);
2364 REGEX_CHECK_STATUS;
2365
2366 status = U_ZERO_ERROR;
2367 const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2368 utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2369 RegexMatcher m3(&testPattern, &testText, 0, status);
2370 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2371 REGEX_ASSERT(m3.hitEnd() == TRUE);
2372 REGEX_ASSERT(m3.requireEnd() == TRUE);
2373 REGEX_CHECK_STATUS;
2374
2375 utext_close(&testText);
2376 utext_close(&testPattern);
2377 }
2378 }
2379
2380
2381 //---------------------------------------------------------------------------
2382 //
2383 // API_Replace_UTF8 API test for class RegexMatcher, testing the
2384 // Replace family of functions.
2385 //
2386 //---------------------------------------------------------------------------
API_Replace_UTF8()2387 void RegexTest::API_Replace_UTF8() {
2388 //
2389 // Replace
2390 //
2391 int32_t flags=0;
2392 UParseError pe;
2393 UErrorCode status=U_ZERO_ERROR;
2394
2395 UText re=UTEXT_INITIALIZER;
2396 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2397 REGEX_VERBOSE_TEXT(&re);
2398 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2399 REGEX_CHECK_STATUS;
2400
2401 char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2402 // 012345678901234567
2403 UText dataText = UTEXT_INITIALIZER;
2404 utext_openUTF8(&dataText, data, -1, &status);
2405 REGEX_CHECK_STATUS;
2406 REGEX_VERBOSE_TEXT(&dataText);
2407 RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2408
2409 //
2410 // Plain vanilla matches.
2411 //
2412 UnicodeString dest;
2413 UText destText = UTEXT_INITIALIZER;
2414 utext_openUnicodeString(&destText, &dest, &status);
2415 UText *result;
2416
2417 UText replText = UTEXT_INITIALIZER;
2418
2419 const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2420 utext_openUTF8(&replText, str_yz, -1, &status);
2421 REGEX_VERBOSE_TEXT(&replText);
2422 result = matcher->replaceFirst(&replText, NULL, status);
2423 REGEX_CHECK_STATUS;
2424 const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2425 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2426 utext_close(result);
2427 result = matcher->replaceFirst(&replText, &destText, status);
2428 REGEX_CHECK_STATUS;
2429 REGEX_ASSERT(result == &destText);
2430 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2431
2432 result = matcher->replaceAll(&replText, NULL, status);
2433 REGEX_CHECK_STATUS;
2434 const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2435 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2436 utext_close(result);
2437
2438 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2439 result = matcher->replaceAll(&replText, &destText, status);
2440 REGEX_CHECK_STATUS;
2441 REGEX_ASSERT(result == &destText);
2442 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2443
2444 //
2445 // Plain vanilla non-matches.
2446 //
2447 const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2448 utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2449 matcher->reset(&dataText);
2450
2451 result = matcher->replaceFirst(&replText, NULL, status);
2452 REGEX_CHECK_STATUS;
2453 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2454 utext_close(result);
2455 result = matcher->replaceFirst(&replText, &destText, status);
2456 REGEX_CHECK_STATUS;
2457 REGEX_ASSERT(result == &destText);
2458 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2459
2460 result = matcher->replaceAll(&replText, NULL, status);
2461 REGEX_CHECK_STATUS;
2462 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2463 utext_close(result);
2464 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2465 result = matcher->replaceAll(&replText, &destText, status);
2466 REGEX_CHECK_STATUS;
2467 REGEX_ASSERT(result == &destText);
2468 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2469
2470 //
2471 // Empty source string
2472 //
2473 utext_openUTF8(&dataText, NULL, 0, &status);
2474 matcher->reset(&dataText);
2475
2476 result = matcher->replaceFirst(&replText, NULL, status);
2477 REGEX_CHECK_STATUS;
2478 REGEX_ASSERT_UTEXT_UTF8("", result);
2479 utext_close(result);
2480 result = matcher->replaceFirst(&replText, &destText, status);
2481 REGEX_CHECK_STATUS;
2482 REGEX_ASSERT(result == &destText);
2483 REGEX_ASSERT_UTEXT_UTF8("", result);
2484
2485 result = matcher->replaceAll(&replText, NULL, status);
2486 REGEX_CHECK_STATUS;
2487 REGEX_ASSERT_UTEXT_UTF8("", result);
2488 utext_close(result);
2489 result = matcher->replaceAll(&replText, &destText, status);
2490 REGEX_CHECK_STATUS;
2491 REGEX_ASSERT(result == &destText);
2492 REGEX_ASSERT_UTEXT_UTF8("", result);
2493
2494 //
2495 // Empty substitution string
2496 //
2497 utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2498 matcher->reset(&dataText);
2499
2500 utext_openUTF8(&replText, NULL, 0, &status);
2501 result = matcher->replaceFirst(&replText, NULL, status);
2502 REGEX_CHECK_STATUS;
2503 const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2504 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2505 utext_close(result);
2506 result = matcher->replaceFirst(&replText, &destText, status);
2507 REGEX_CHECK_STATUS;
2508 REGEX_ASSERT(result == &destText);
2509 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2510
2511 result = matcher->replaceAll(&replText, NULL, status);
2512 REGEX_CHECK_STATUS;
2513 const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2514 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2515 utext_close(result);
2516 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2517 result = matcher->replaceAll(&replText, &destText, status);
2518 REGEX_CHECK_STATUS;
2519 REGEX_ASSERT(result == &destText);
2520 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2521
2522 //
2523 // match whole string
2524 //
2525 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2526 utext_openUTF8(&dataText, str_abc, -1, &status);
2527 matcher->reset(&dataText);
2528
2529 const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2530 utext_openUTF8(&replText, str_xyz, -1, &status);
2531 result = matcher->replaceFirst(&replText, NULL, status);
2532 REGEX_CHECK_STATUS;
2533 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2534 utext_close(result);
2535 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2536 result = matcher->replaceFirst(&replText, &destText, status);
2537 REGEX_CHECK_STATUS;
2538 REGEX_ASSERT(result == &destText);
2539 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2540
2541 result = matcher->replaceAll(&replText, NULL, status);
2542 REGEX_CHECK_STATUS;
2543 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2544 utext_close(result);
2545 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2546 result = matcher->replaceAll(&replText, &destText, status);
2547 REGEX_CHECK_STATUS;
2548 REGEX_ASSERT(result == &destText);
2549 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2550
2551 //
2552 // Capture Group, simple case
2553 //
2554 const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2555 utext_openUTF8(&re, str_add, -1, &status);
2556 RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2557 REGEX_CHECK_STATUS;
2558
2559 const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2560 utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2561 RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2562 REGEX_CHECK_STATUS;
2563
2564 const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2565 utext_openUTF8(&replText, str_11, -1, &status);
2566 result = matcher2->replaceFirst(&replText, NULL, status);
2567 REGEX_CHECK_STATUS;
2568 const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2569 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2570 utext_close(result);
2571 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2572 result = matcher2->replaceFirst(&replText, &destText, status);
2573 REGEX_CHECK_STATUS;
2574 REGEX_ASSERT(result == &destText);
2575 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2576
2577 const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2578 utext_openUTF8(&replText, str_v, -1, &status);
2579 REGEX_VERBOSE_TEXT(&replText);
2580 result = matcher2->replaceFirst(&replText, NULL, status);
2581 REGEX_CHECK_STATUS;
2582 const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2583 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2584 utext_close(result);
2585 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2586 result = matcher2->replaceFirst(&replText, &destText, status);
2587 REGEX_CHECK_STATUS;
2588 REGEX_ASSERT(result == &destText);
2589 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2590
2591 const char str_byitselfnogroupnumber[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x00 }; /* $ by itself, no group number $$$ */
2592 utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2593 result = matcher2->replaceFirst(&replText, NULL, status);
2594 REGEX_CHECK_STATUS;
2595 const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2596 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2597 utext_close(result);
2598 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2599 result = matcher2->replaceFirst(&replText, &destText, status);
2600 REGEX_CHECK_STATUS;
2601 REGEX_ASSERT(result == &destText);
2602 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2603
2604 unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2605 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2606 // 012345678901234567890123456
2607 supplDigitChars[22] = 0xF0;
2608 supplDigitChars[23] = 0x9D;
2609 supplDigitChars[24] = 0x9F;
2610 supplDigitChars[25] = 0x8F;
2611 utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2612
2613 result = matcher2->replaceFirst(&replText, NULL, status);
2614 REGEX_CHECK_STATUS;
2615 const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2616 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2617 utext_close(result);
2618 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2619 result = matcher2->replaceFirst(&replText, &destText, status);
2620 REGEX_CHECK_STATUS;
2621 REGEX_ASSERT(result == &destText);
2622 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2623 const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5..." */
2624 utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2625 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2626 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2627 utext_close(result);
2628 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2629 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2630 REGEX_ASSERT(result == &destText);
2631 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2632
2633 //
2634 // Replacement String with \u hex escapes
2635 //
2636 {
2637 const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2638 const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2639 utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2640 utext_openUTF8(&replText, str_u0043, -1, &status);
2641 matcher->reset(&dataText);
2642
2643 result = matcher->replaceAll(&replText, NULL, status);
2644 REGEX_CHECK_STATUS;
2645 const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2646 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2647 utext_close(result);
2648 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2649 result = matcher->replaceAll(&replText, &destText, status);
2650 REGEX_CHECK_STATUS;
2651 REGEX_ASSERT(result == &destText);
2652 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2653 }
2654 {
2655 const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2656 utext_openUTF8(&dataText, str_abc, -1, &status);
2657 const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2658 utext_openUTF8(&replText, str_U00010000, -1, &status);
2659 matcher->reset(&dataText);
2660
2661 unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2662 // 0123456789
2663 expected[2] = 0xF0;
2664 expected[3] = 0x90;
2665 expected[4] = 0x80;
2666 expected[5] = 0x80;
2667
2668 result = matcher->replaceAll(&replText, NULL, status);
2669 REGEX_CHECK_STATUS;
2670 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2671 utext_close(result);
2672 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2673 result = matcher->replaceAll(&replText, &destText, status);
2674 REGEX_CHECK_STATUS;
2675 REGEX_ASSERT(result == &destText);
2676 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2677 }
2678 // TODO: need more thorough testing of capture substitutions.
2679
2680 // Bug 4057
2681 //
2682 {
2683 status = U_ZERO_ERROR;
2684 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2685 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2686 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2687 utext_openUTF8(&re, str_ssee, -1, &status);
2688 utext_openUTF8(&dataText, str_blah, -1, &status);
2689 utext_openUTF8(&replText, str_ooh, -1, &status);
2690
2691 RegexMatcher m(&re, 0, status);
2692 REGEX_CHECK_STATUS;
2693
2694 UnicodeString result;
2695 UText resultText = UTEXT_INITIALIZER;
2696 utext_openUnicodeString(&resultText, &result, &status);
2697
2698 // Multiple finds do NOT bump up the previous appendReplacement position.
2699 m.reset(&dataText);
2700 m.find();
2701 m.find();
2702 m.appendReplacement(&resultText, &replText, status);
2703 REGEX_CHECK_STATUS;
2704 const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2705 REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2706
2707 // After a reset into the interior of a string, appendReplacement still starts at beginning.
2708 status = U_ZERO_ERROR;
2709 result.truncate(0);
2710 utext_openUnicodeString(&resultText, &result, &status);
2711 m.reset(10, status);
2712 m.find();
2713 m.find();
2714 m.appendReplacement(&resultText, &replText, status);
2715 REGEX_CHECK_STATUS;
2716 const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2717 REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2718
2719 // find() at interior of string, appendReplacement still starts at beginning.
2720 status = U_ZERO_ERROR;
2721 result.truncate(0);
2722 utext_openUnicodeString(&resultText, &result, &status);
2723 m.reset();
2724 m.find(10, status);
2725 m.find();
2726 m.appendReplacement(&resultText, &replText, status);
2727 REGEX_CHECK_STATUS;
2728 const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2729 REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2730
2731 m.appendTail(&resultText, status);
2732 const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2733 REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2734
2735 utext_close(&resultText);
2736 }
2737
2738 delete matcher2;
2739 delete pat2;
2740 delete matcher;
2741 delete pat;
2742
2743 utext_close(&dataText);
2744 utext_close(&replText);
2745 utext_close(&destText);
2746 utext_close(&re);
2747 }
2748
2749
2750 //---------------------------------------------------------------------------
2751 //
2752 // API_Pattern_UTF8 Test that the API for class RegexPattern is
2753 // present and nominally working.
2754 //
2755 //---------------------------------------------------------------------------
API_Pattern_UTF8()2756 void RegexTest::API_Pattern_UTF8() {
2757 RegexPattern pata; // Test default constructor to not crash.
2758 RegexPattern patb;
2759
2760 REGEX_ASSERT(pata == patb);
2761 REGEX_ASSERT(pata == pata);
2762
2763 UText re1 = UTEXT_INITIALIZER;
2764 UText re2 = UTEXT_INITIALIZER;
2765 UErrorCode status = U_ZERO_ERROR;
2766 UParseError pe;
2767
2768 const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2769 const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2770 utext_openUTF8(&re1, str_abcalmz, -1, &status);
2771 utext_openUTF8(&re2, str_def, -1, &status);
2772
2773 RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2774 RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2775 REGEX_CHECK_STATUS;
2776 REGEX_ASSERT(*pat1 == *pat1);
2777 REGEX_ASSERT(*pat1 != pata);
2778
2779 // Assign
2780 patb = *pat1;
2781 REGEX_ASSERT(patb == *pat1);
2782
2783 // Copy Construct
2784 RegexPattern patc(*pat1);
2785 REGEX_ASSERT(patc == *pat1);
2786 REGEX_ASSERT(patb == patc);
2787 REGEX_ASSERT(pat1 != pat2);
2788 patb = *pat2;
2789 REGEX_ASSERT(patb != patc);
2790 REGEX_ASSERT(patb == *pat2);
2791
2792 // Compile with no flags.
2793 RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status);
2794 REGEX_ASSERT(*pat1a == *pat1);
2795
2796 REGEX_ASSERT(pat1a->flags() == 0);
2797
2798 // Compile with different flags should be not equal
2799 RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2800 REGEX_CHECK_STATUS;
2801
2802 REGEX_ASSERT(*pat1b != *pat1a);
2803 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2804 REGEX_ASSERT(pat1a->flags() == 0);
2805 delete pat1b;
2806
2807 // clone
2808 RegexPattern *pat1c = pat1->clone();
2809 REGEX_ASSERT(*pat1c == *pat1);
2810 REGEX_ASSERT(*pat1c != *pat2);
2811
2812 delete pat1c;
2813 delete pat1a;
2814 delete pat1;
2815 delete pat2;
2816
2817 utext_close(&re1);
2818 utext_close(&re2);
2819
2820
2821 //
2822 // Verify that a matcher created from a cloned pattern works.
2823 // (Jitterbug 3423)
2824 //
2825 {
2826 UErrorCode status = U_ZERO_ERROR;
2827 UText pattern = UTEXT_INITIALIZER;
2828 const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2829 utext_openUTF8(&pattern, str_pL, -1, &status);
2830
2831 RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status);
2832 RegexPattern *pClone = pSource->clone();
2833 delete pSource;
2834 RegexMatcher *mFromClone = pClone->matcher(status);
2835 REGEX_CHECK_STATUS;
2836
2837 UText input = UTEXT_INITIALIZER;
2838 const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2839 utext_openUTF8(&input, str_HelloWorld, -1, &status);
2840 mFromClone->reset(&input);
2841 REGEX_ASSERT(mFromClone->find() == TRUE);
2842 REGEX_ASSERT(mFromClone->group(status) == "Hello");
2843 REGEX_ASSERT(mFromClone->find() == TRUE);
2844 REGEX_ASSERT(mFromClone->group(status) == "World");
2845 REGEX_ASSERT(mFromClone->find() == FALSE);
2846 delete mFromClone;
2847 delete pClone;
2848
2849 utext_close(&input);
2850 utext_close(&pattern);
2851 }
2852
2853 //
2854 // matches convenience API
2855 //
2856 {
2857 UErrorCode status = U_ZERO_ERROR;
2858 UText pattern = UTEXT_INITIALIZER;
2859 UText input = UTEXT_INITIALIZER;
2860
2861 const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2862 utext_openUTF8(&input, str_randominput, -1, &status);
2863
2864 const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2865 utext_openUTF8(&pattern, str_dotstar, -1, &status);
2866 REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2867 REGEX_CHECK_STATUS;
2868
2869 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2870 utext_openUTF8(&pattern, str_abc, -1, &status);
2871 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2872 REGEX_CHECK_STATUS;
2873
2874 const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2875 utext_openUTF8(&pattern, str_nput, -1, &status);
2876 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2877 REGEX_CHECK_STATUS;
2878
2879 utext_openUTF8(&pattern, str_randominput, -1, &status);
2880 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2881 REGEX_CHECK_STATUS;
2882
2883 const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2884 utext_openUTF8(&pattern, str_u, -1, &status);
2885 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2886 REGEX_CHECK_STATUS;
2887
2888 utext_openUTF8(&input, str_abc, -1, &status);
2889 utext_openUTF8(&pattern, str_abc, -1, &status);
2890 status = U_INDEX_OUTOFBOUNDS_ERROR;
2891 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2892 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2893
2894 utext_close(&input);
2895 utext_close(&pattern);
2896 }
2897
2898
2899 //
2900 // Split()
2901 //
2902 status = U_ZERO_ERROR;
2903 const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /* + */
2904 utext_openUTF8(&re1, str_spaceplus, -1, &status);
2905 pat1 = RegexPattern::compile(&re1, pe, status);
2906 REGEX_CHECK_STATUS;
2907 UnicodeString fields[10];
2908
2909 int32_t n;
2910 n = pat1->split("Now is the time", fields, 10, status);
2911 REGEX_CHECK_STATUS;
2912 REGEX_ASSERT(n==4);
2913 REGEX_ASSERT(fields[0]=="Now");
2914 REGEX_ASSERT(fields[1]=="is");
2915 REGEX_ASSERT(fields[2]=="the");
2916 REGEX_ASSERT(fields[3]=="time");
2917 REGEX_ASSERT(fields[4]=="");
2918
2919 n = pat1->split("Now is the time", fields, 2, status);
2920 REGEX_CHECK_STATUS;
2921 REGEX_ASSERT(n==2);
2922 REGEX_ASSERT(fields[0]=="Now");
2923 REGEX_ASSERT(fields[1]=="is the time");
2924 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
2925
2926 fields[1] = "*";
2927 status = U_ZERO_ERROR;
2928 n = pat1->split("Now is the time", fields, 1, status);
2929 REGEX_CHECK_STATUS;
2930 REGEX_ASSERT(n==1);
2931 REGEX_ASSERT(fields[0]=="Now is the time");
2932 REGEX_ASSERT(fields[1]=="*");
2933 status = U_ZERO_ERROR;
2934
2935 n = pat1->split(" Now is the time ", fields, 10, status);
2936 REGEX_CHECK_STATUS;
2937 REGEX_ASSERT(n==6);
2938 REGEX_ASSERT(fields[0]=="");
2939 REGEX_ASSERT(fields[1]=="Now");
2940 REGEX_ASSERT(fields[2]=="is");
2941 REGEX_ASSERT(fields[3]=="the");
2942 REGEX_ASSERT(fields[4]=="time");
2943 REGEX_ASSERT(fields[5]=="");
2944 REGEX_ASSERT(fields[6]=="");
2945
2946 fields[2] = "*";
2947 n = pat1->split(" ", fields, 10, status);
2948 REGEX_CHECK_STATUS;
2949 REGEX_ASSERT(n==2);
2950 REGEX_ASSERT(fields[0]=="");
2951 REGEX_ASSERT(fields[1]=="");
2952 REGEX_ASSERT(fields[2]=="*");
2953
2954 fields[0] = "foo";
2955 n = pat1->split("", fields, 10, status);
2956 REGEX_CHECK_STATUS;
2957 REGEX_ASSERT(n==0);
2958 REGEX_ASSERT(fields[0]=="foo");
2959
2960 delete pat1;
2961
2962 // split, with a pattern with (capture)
2963 regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
2964 pat1 = RegexPattern::compile(&re1, pe, status);
2965 REGEX_CHECK_STATUS;
2966
2967 status = U_ZERO_ERROR;
2968 fields[6] = fields[7] = "*";
2969 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
2970 REGEX_CHECK_STATUS;
2971 REGEX_ASSERT(n==7);
2972 REGEX_ASSERT(fields[0]=="");
2973 REGEX_ASSERT(fields[1]=="a");
2974 REGEX_ASSERT(fields[2]=="Now is ");
2975 REGEX_ASSERT(fields[3]=="b");
2976 REGEX_ASSERT(fields[4]=="the time");
2977 REGEX_ASSERT(fields[5]=="c");
2978 REGEX_ASSERT(fields[6]=="");
2979 REGEX_ASSERT(fields[7]=="*");
2980 REGEX_ASSERT(status==U_ZERO_ERROR);
2981
2982 fields[6] = fields[7] = "*";
2983 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
2984 REGEX_CHECK_STATUS;
2985 REGEX_ASSERT(n==7);
2986 REGEX_ASSERT(fields[0]==" ");
2987 REGEX_ASSERT(fields[1]=="a");
2988 REGEX_ASSERT(fields[2]=="Now is ");
2989 REGEX_ASSERT(fields[3]=="b");
2990 REGEX_ASSERT(fields[4]=="the time");
2991 REGEX_ASSERT(fields[5]=="c");
2992 REGEX_ASSERT(fields[6]=="");
2993 REGEX_ASSERT(fields[7]=="*");
2994
2995 status = U_ZERO_ERROR;
2996 fields[6] = "foo";
2997 n = pat1->split(" <a>Now is <b>the time<c> ", fields, 6, status);
2998 REGEX_CHECK_STATUS;
2999 REGEX_ASSERT(n==6);
3000 REGEX_ASSERT(fields[0]==" ");
3001 REGEX_ASSERT(fields[1]=="a");
3002 REGEX_ASSERT(fields[2]=="Now is ");
3003 REGEX_ASSERT(fields[3]=="b");
3004 REGEX_ASSERT(fields[4]=="the time");
3005 REGEX_ASSERT(fields[5]==" ");
3006 REGEX_ASSERT(fields[6]=="foo");
3007
3008 status = U_ZERO_ERROR;
3009 fields[5] = "foo";
3010 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
3011 REGEX_CHECK_STATUS;
3012 REGEX_ASSERT(n==5);
3013 REGEX_ASSERT(fields[0]==" ");
3014 REGEX_ASSERT(fields[1]=="a");
3015 REGEX_ASSERT(fields[2]=="Now is ");
3016 REGEX_ASSERT(fields[3]=="b");
3017 REGEX_ASSERT(fields[4]=="the time<c>");
3018 REGEX_ASSERT(fields[5]=="foo");
3019
3020 status = U_ZERO_ERROR;
3021 fields[5] = "foo";
3022 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
3023 REGEX_CHECK_STATUS;
3024 REGEX_ASSERT(n==5);
3025 REGEX_ASSERT(fields[0]==" ");
3026 REGEX_ASSERT(fields[1]=="a");
3027 REGEX_ASSERT(fields[2]=="Now is ");
3028 REGEX_ASSERT(fields[3]=="b");
3029 REGEX_ASSERT(fields[4]=="the time");
3030 REGEX_ASSERT(fields[5]=="foo");
3031
3032 status = U_ZERO_ERROR;
3033 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
3034 REGEX_CHECK_STATUS;
3035 REGEX_ASSERT(n==4);
3036 REGEX_ASSERT(fields[0]==" ");
3037 REGEX_ASSERT(fields[1]=="a");
3038 REGEX_ASSERT(fields[2]=="Now is ");
3039 REGEX_ASSERT(fields[3]=="the time<c>");
3040 status = U_ZERO_ERROR;
3041 delete pat1;
3042
3043 regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3044 pat1 = RegexPattern::compile(&re1, pe, status);
3045 REGEX_CHECK_STATUS;
3046 n = pat1->split("1-10,20", fields, 10, status);
3047 REGEX_CHECK_STATUS;
3048 REGEX_ASSERT(n==5);
3049 REGEX_ASSERT(fields[0]=="1");
3050 REGEX_ASSERT(fields[1]=="-");
3051 REGEX_ASSERT(fields[2]=="10");
3052 REGEX_ASSERT(fields[3]==",");
3053 REGEX_ASSERT(fields[4]=="20");
3054 delete pat1;
3055
3056
3057 //
3058 // RegexPattern::pattern() and patternText()
3059 //
3060 pat1 = new RegexPattern();
3061 REGEX_ASSERT(pat1->pattern() == "");
3062 REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3063 delete pat1;
3064 const char *helloWorldInvariant = "(Hello, world)*";
3065 regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3066 pat1 = RegexPattern::compile(&re1, pe, status);
3067 REGEX_CHECK_STATUS;
3068 REGEX_ASSERT_UNISTR(pat1->pattern(),"(Hello, world)*");
3069 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3070 delete pat1;
3071
3072 utext_close(&re1);
3073 }
3074
3075
3076 //---------------------------------------------------------------------------
3077 //
3078 // Extended A more thorough check for features of regex patterns
3079 // The test cases are in a separate data file,
//             source/test/testdata/regextst.txt
3081 // A description of the test data format is included in that file.
3082 //
3083 //---------------------------------------------------------------------------
3084
3085 const char *
getPath(char buffer[2048],const char * filename)3086 RegexTest::getPath(char buffer[2048], const char *filename) {
3087 UErrorCode status=U_ZERO_ERROR;
3088 const char *testDataDirectory = IntlTest::getSourceTestData(status);
3089 if (U_FAILURE(status)) {
3090 errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3091 return NULL;
3092 }
3093
3094 strcpy(buffer, testDataDirectory);
3095 strcat(buffer, filename);
3096 return buffer;
3097 }
3098
Extended()3099 void RegexTest::Extended() {
3100 char tdd[2048];
3101 const char *srcPath;
3102 UErrorCode status = U_ZERO_ERROR;
3103 int32_t lineNum = 0;
3104
3105 //
3106 // Open and read the test data file.
3107 //
3108 srcPath=getPath(tdd, "regextst.txt");
3109 if(srcPath==NULL) {
3110 return; /* something went wrong, error already output */
3111 }
3112
3113 int32_t len;
3114 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3115 if (U_FAILURE(status)) {
3116 return; /* something went wrong, error already output */
3117 }
3118
3119 //
3120 // Put the test data into a UnicodeString
3121 //
3122 UnicodeString testString(FALSE, testData, len);
3123
3124 RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3125 RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3126 RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3127
3128 RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3129 UnicodeString testPattern; // The pattern for test from the test file.
3130 UnicodeString testFlags; // the flags for a test.
3131 UnicodeString matchString; // The marked up string to be used as input
3132
3133 if (U_FAILURE(status)){
3134 dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
3135 delete [] testData;
3136 return;
3137 }
3138
3139 //
3140 // Loop over the test data file, once per line.
3141 //
3142 while (lineMat.find()) {
3143 lineNum++;
3144 if (U_FAILURE(status)) {
3145 errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3146 }
3147
3148 status = U_ZERO_ERROR;
3149 UnicodeString testLine = lineMat.group(1, status);
3150 if (testLine.length() == 0) {
3151 continue;
3152 }
3153
3154 //
3155 // Parse the test line. Skip blank and comment only lines.
3156 // Separate out the three main fields - pattern, flags, target.
3157 //
3158
3159 commentMat.reset(testLine);
3160 if (commentMat.lookingAt(status)) {
3161 // This line is a comment, or blank.
3162 continue;
3163 }
3164
3165 //
3166 // Pull out the pattern field, remove it from the test file line.
3167 //
3168 quotedStuffMat.reset(testLine);
3169 if (quotedStuffMat.lookingAt(status)) {
3170 testPattern = quotedStuffMat.group(2, status);
3171 testLine.remove(0, quotedStuffMat.end(0, status));
3172 } else {
3173 errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3174 continue;
3175 }
3176
3177
3178 //
3179 // Pull out the flags from the test file line.
3180 //
3181 flagsMat.reset(testLine);
3182 flagsMat.lookingAt(status); // Will always match, possibly an empty string.
3183 testFlags = flagsMat.group(1, status);
3184 if (flagsMat.group(2, status).length() > 0) {
3185 errln("Bad Match flag at line %d. Scanning %c\n",
3186 lineNum, flagsMat.group(2, status).charAt(0));
3187 continue;
3188 }
3189 testLine.remove(0, flagsMat.end(0, status));
3190
3191 //
3192 // Pull out the match string, as a whole.
3193 // We'll process the <tags> later.
3194 //
3195 quotedStuffMat.reset(testLine);
3196 if (quotedStuffMat.lookingAt(status)) {
3197 matchString = quotedStuffMat.group(2, status);
3198 testLine.remove(0, quotedStuffMat.end(0, status));
3199 } else {
3200 errln("Bad match string at test file line %d", lineNum);
3201 continue;
3202 }
3203
3204 //
3205 // The only thing left from the input line should be an optional trailing comment.
3206 //
3207 commentMat.reset(testLine);
3208 if (commentMat.lookingAt(status) == FALSE) {
3209 errln("Line %d: unexpected characters at end of test line.", lineNum);
3210 continue;
3211 }
3212
3213 //
3214 // Run the test
3215 //
3216 regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3217 }
3218
3219 delete [] testData;
3220
3221 }
3222
3223
3224
3225 //---------------------------------------------------------------------------
3226 //
3227 // regex_find(pattern, flags, inputString, lineNumber)
3228 //
3229 // Function to run a single test from the Extended (data driven) tests.
3230 // See file test/testdata/regextst.txt for a description of the
3231 // pattern and inputString fields, and the allowed flags.
3232 // lineNumber is the source line in regextst.txt of the test.
3233 //
3234 //---------------------------------------------------------------------------
3235
3236
3237 // Set a value into a UVector at position specified by a decimal number in
3238 // a UnicodeString. This is a utility function needed by the actual test function,
3239 // which follows.
set(UVector & vec,int32_t val,UnicodeString index)3240 static void set(UVector &vec, int32_t val, UnicodeString index) {
3241 UErrorCode status=U_ZERO_ERROR;
3242 int32_t idx = 0;
3243 for (int32_t i=0; i<index.length(); i++) {
3244 int32_t d=u_charDigitValue(index.charAt(i));
3245 if (d<0) {return;}
3246 idx = idx*10 + d;
3247 }
3248 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3249 vec.setElementAt(val, idx);
3250 }
3251
setInt(UVector & vec,int32_t val,int32_t idx)3252 static void setInt(UVector &vec, int32_t val, int32_t idx) {
3253 UErrorCode status=U_ZERO_ERROR;
3254 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3255 vec.setElementAt(val, idx);
3256 }
3257
utextOffsetToNative(UText * utext,int32_t unistrOffset,int32_t & nativeIndex)3258 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3259 {
3260 UBool couldFind = TRUE;
3261 UTEXT_SETNATIVEINDEX(utext, 0);
3262 int32_t i = 0;
3263 while (i < unistrOffset) {
3264 UChar32 c = UTEXT_NEXT32(utext);
3265 if (c != U_SENTINEL) {
3266 i += U16_LENGTH(c);
3267 } else {
3268 couldFind = FALSE;
3269 break;
3270 }
3271 }
3272 nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3273 return couldFind;
3274 }
3275
3276
regex_find(const UnicodeString & pattern,const UnicodeString & flags,const UnicodeString & inputString,const char * srcPath,int32_t line)3277 void RegexTest::regex_find(const UnicodeString &pattern,
3278 const UnicodeString &flags,
3279 const UnicodeString &inputString,
3280 const char *srcPath,
3281 int32_t line) {
3282 UnicodeString unEscapedInput;
3283 UnicodeString deTaggedInput;
3284
3285 int32_t patternUTF8Length, inputUTF8Length;
3286 char *patternChars = NULL, *inputChars = NULL;
3287 UText patternText = UTEXT_INITIALIZER;
3288 UText inputText = UTEXT_INITIALIZER;
3289 UConverter *UTF8Converter = NULL;
3290
3291 UErrorCode status = U_ZERO_ERROR;
3292 UParseError pe;
3293 RegexPattern *parsePat = NULL;
3294 RegexMatcher *parseMatcher = NULL;
3295 RegexPattern *callerPattern = NULL, *UTF8Pattern = NULL;
3296 RegexMatcher *matcher = NULL, *UTF8Matcher = NULL;
3297 UVector groupStarts(status);
3298 UVector groupEnds(status);
3299 UVector groupStartsUTF8(status);
3300 UVector groupEndsUTF8(status);
3301 UBool isMatch = FALSE, isUTF8Match = FALSE;
3302 UBool failed = FALSE;
3303 int32_t numFinds;
3304 int32_t i;
3305 UBool useMatchesFunc = FALSE;
3306 UBool useLookingAtFunc = FALSE;
3307 int32_t regionStart = -1;
3308 int32_t regionEnd = -1;
3309 int32_t regionStartUTF8 = -1;
3310 int32_t regionEndUTF8 = -1;
3311
3312
3313 //
3314 // Compile the caller's pattern
3315 //
3316 uint32_t bflags = 0;
3317 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag
3318 bflags |= UREGEX_CASE_INSENSITIVE;
3319 }
3320 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag
3321 bflags |= UREGEX_COMMENTS;
3322 }
3323 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag
3324 bflags |= UREGEX_DOTALL;
3325 }
3326 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag
3327 bflags |= UREGEX_MULTILINE;
3328 }
3329
3330 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3331 bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3332 }
3333 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3334 bflags |= UREGEX_UNIX_LINES;
3335 }
3336 if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3337 bflags |= UREGEX_LITERAL;
3338 }
3339
3340
3341 callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3342 if (status != U_ZERO_ERROR) {
3343 #if UCONFIG_NO_BREAK_ITERATION==1
3344 // 'v' test flag means that the test pattern should not compile if ICU was configured
3345 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3346 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3347 goto cleanupAndReturn;
3348 }
3349 #endif
3350 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3351 // Expected pattern compilation error.
3352 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3353 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3354 }
3355 goto cleanupAndReturn;
3356 } else {
3357 // Unexpected pattern compilation error.
3358 dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3359 goto cleanupAndReturn;
3360 }
3361 }
3362
3363 UTF8Converter = ucnv_open("UTF8", &status);
3364 ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3365
3366 patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3367 status = U_ZERO_ERROR; // buffer overflow
3368 patternChars = new char[patternUTF8Length+1];
3369 pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3370 utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3371
3372 if (status == U_ZERO_ERROR) {
3373 UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3374
3375 if (status != U_ZERO_ERROR) {
3376 #if UCONFIG_NO_BREAK_ITERATION==1
3377 // 'v' test flag means that the test pattern should not compile if ICU was configured
3378 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3379 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3380 goto cleanupAndReturn;
3381 }
3382 #endif
3383 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3384 // Expected pattern compilation error.
3385 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3386 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3387 }
3388 goto cleanupAndReturn;
3389 } else {
3390 // Unexpected pattern compilation error.
3391 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3392 goto cleanupAndReturn;
3393 }
3394 }
3395 }
3396
3397 if (UTF8Pattern == NULL) {
3398 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3399 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3400 status = U_ZERO_ERROR;
3401 }
3402
3403 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag
3404 RegexPatternDump(callerPattern);
3405 }
3406
3407 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag
3408 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3409 goto cleanupAndReturn;
3410 }
3411
3412
3413 //
3414 // Number of times find() should be called on the test string, default to 1
3415 //
3416 numFinds = 1;
3417 for (i=2; i<=9; i++) {
3418 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
3419 if (numFinds != 1) {
3420 errln("Line %d: more than one digit flag. Scanning %d.", line, i);
3421 goto cleanupAndReturn;
3422 }
3423 numFinds = i;
3424 }
3425 }
3426
3427 // 'M' flag. Use matches() instead of find()
3428 if (flags.indexOf((UChar)0x4d) >= 0) {
3429 useMatchesFunc = TRUE;
3430 }
3431 if (flags.indexOf((UChar)0x4c) >= 0) {
3432 useLookingAtFunc = TRUE;
3433 }
3434
3435 //
3436 // Find the tags in the input data, remove them, and record the group boundary
3437 // positions.
3438 //
3439 parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3440 REGEX_CHECK_STATUS_L(line);
3441
3442 unEscapedInput = inputString.unescape();
3443 parseMatcher = parsePat->matcher(unEscapedInput, status);
3444 REGEX_CHECK_STATUS_L(line);
3445 while(parseMatcher->find()) {
3446 parseMatcher->appendReplacement(deTaggedInput, "", status);
3447 REGEX_CHECK_STATUS;
3448 UnicodeString groupNum = parseMatcher->group(2, status);
3449 if (groupNum == "r") {
3450 // <r> or </r>, a region specification within the string
3451 if (parseMatcher->group(1, status) == "/") {
3452 regionEnd = deTaggedInput.length();
3453 } else {
3454 regionStart = deTaggedInput.length();
3455 }
3456 } else {
3457 // <digits> or </digits>, a group match boundary tag.
3458 if (parseMatcher->group(1, status) == "/") {
3459 set(groupEnds, deTaggedInput.length(), groupNum);
3460 } else {
3461 set(groupStarts, deTaggedInput.length(), groupNum);
3462 }
3463 }
3464 }
3465 parseMatcher->appendTail(deTaggedInput);
3466 REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3467 if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3468 errln("mismatched <r> tags");
3469 failed = TRUE;
3470 goto cleanupAndReturn;
3471 }
3472
3473 //
3474 // Configure the matcher according to the flags specified with this test.
3475 //
3476 matcher = callerPattern->matcher(deTaggedInput, status);
3477 REGEX_CHECK_STATUS_L(line);
3478 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
3479 matcher->setTrace(TRUE);
3480 }
3481
3482 if (UTF8Pattern != NULL) {
3483 inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3484 status = U_ZERO_ERROR; // buffer overflow
3485 inputChars = new char[inputUTF8Length+1];
3486 deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3487 utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3488
3489 if (status == U_ZERO_ERROR) {
3490 UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3491 REGEX_CHECK_STATUS_L(line);
3492 }
3493
3494 if (UTF8Matcher == NULL) {
3495 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3496 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3497 status = U_ZERO_ERROR;
3498 }
3499 }
3500
3501 //
3502 // Generate native indices for UTF8 versions of region and capture group info
3503 //
3504 if (UTF8Matcher != NULL) {
3505 if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3506 if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3507
3508 // Fill out the native index UVector info.
3509 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3510 for (i=0; i<groupStarts.size(); i++) {
3511 int32_t start = groupStarts.elementAti(i);
3512 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3513 if (start >= 0) {
3514 int32_t startUTF8;
3515 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3516 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line, i, start);
3517 failed = TRUE;
3518 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3519 }
3520 setInt(groupStartsUTF8, startUTF8, i);
3521 }
3522
3523 int32_t end = groupEnds.elementAti(i);
3524 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3525 if (end >= 0) {
3526 int32_t endUTF8;
3527 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3528 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line, i, end);
3529 failed = TRUE;
3530 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3531 }
3532 setInt(groupEndsUTF8, endUTF8, i);
3533 }
3534 }
3535 }
3536
3537 if (regionStart>=0) {
3538 matcher->region(regionStart, regionEnd, status);
3539 REGEX_CHECK_STATUS_L(line);
3540 if (UTF8Matcher != NULL) {
3541 UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3542 REGEX_CHECK_STATUS_L(line);
3543 }
3544 }
3545 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag
3546 matcher->useAnchoringBounds(FALSE);
3547 if (UTF8Matcher != NULL) {
3548 UTF8Matcher->useAnchoringBounds(FALSE);
3549 }
3550 }
3551 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag
3552 matcher->useTransparentBounds(TRUE);
3553 if (UTF8Matcher != NULL) {
3554 UTF8Matcher->useTransparentBounds(TRUE);
3555 }
3556 }
3557
3558
3559
3560 //
3561 // Do a find on the de-tagged input using the caller's pattern
3562 // TODO: error on count>1 and not find().
3563 // error on both matches() and lookingAt().
3564 //
3565 for (i=0; i<numFinds; i++) {
3566 if (useMatchesFunc) {
3567 isMatch = matcher->matches(status);
3568 if (UTF8Matcher != NULL) {
3569 isUTF8Match = UTF8Matcher->matches(status);
3570 }
3571 } else if (useLookingAtFunc) {
3572 isMatch = matcher->lookingAt(status);
3573 if (UTF8Matcher != NULL) {
3574 isUTF8Match = UTF8Matcher->lookingAt(status);
3575 }
3576 } else {
3577 isMatch = matcher->find();
3578 if (UTF8Matcher != NULL) {
3579 isUTF8Match = UTF8Matcher->find();
3580 }
3581 }
3582 }
3583 matcher->setTrace(FALSE);
3584 if (U_FAILURE(status)) {
3585 errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3586 }
3587
3588 //
3589 // Match up the groups from the find() with the groups from the tags
3590 //
3591
3592 // number of tags should match number of groups from find operation.
3593 // matcher->groupCount does not include group 0, the entire match, hence the +1.
3594 // G option in test means that capture group data is not available in the
3595 // expected results, so the check needs to be suppressed.
3596 if (isMatch == FALSE && groupStarts.size() != 0) {
3597 dataerrln("Error at line %d: Match expected, but none found.", line);
3598 failed = TRUE;
3599 goto cleanupAndReturn;
3600 } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3601 errln("Error at line %d: Match expected, but none found. (UTF8)", line);
3602 failed = TRUE;
3603 goto cleanupAndReturn;
3604 }
3605
3606 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3607 // Only check for match / no match. Don't check capture groups.
3608 if (isMatch && groupStarts.size() == 0) {
3609 errln("Error at line %d: No match expected, but one found.", line);
3610 failed = TRUE;
3611 } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
3612 errln("Error at line %d: No match expected, but one found. (UTF8)", line);
3613 failed = TRUE;
3614 }
3615 goto cleanupAndReturn;
3616 }
3617
3618 REGEX_CHECK_STATUS_L(line);
3619 for (i=0; i<=matcher->groupCount(); i++) {
3620 int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3621 int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3622 if (matcher->start(i, status) != expectedStart) {
3623 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
3624 line, i, expectedStart, matcher->start(i, status));
3625 failed = TRUE;
3626 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3627 } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3628 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
3629 line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3630 failed = TRUE;
3631 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3632 }
3633
3634 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3635 int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3636 if (matcher->end(i, status) != expectedEnd) {
3637 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
3638 line, i, expectedEnd, matcher->end(i, status));
3639 failed = TRUE;
3640 // Error on end position; keep going; real error is probably yet to come as group
3641 // end positions work from end of the input data towards the front.
3642 } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3643 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
3644 line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3645 failed = TRUE;
3646 // Error on end position; keep going; real error is probably yet to come as group
3647 // end positions work from end of the input data towards the front.
3648 }
3649 }
3650 if ( matcher->groupCount()+1 < groupStarts.size()) {
3651 errln("Error at line %d: Expected %d capture groups, found %d.",
3652 line, groupStarts.size()-1, matcher->groupCount());
3653 failed = TRUE;
3654 }
3655 else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3656 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3657 line, groupStarts.size()-1, UTF8Matcher->groupCount());
3658 failed = TRUE;
3659 }
3660
3661 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3662 matcher->requireEnd() == TRUE) {
3663 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line);
3664 failed = TRUE;
3665 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3666 UTF8Matcher->requireEnd() == TRUE) {
3667 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line);
3668 failed = TRUE;
3669 }
3670
3671 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3672 matcher->requireEnd() == FALSE) {
3673 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line);
3674 failed = TRUE;
3675 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false
3676 UTF8Matcher->requireEnd() == FALSE) {
3677 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line);
3678 failed = TRUE;
3679 }
3680
3681 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3682 matcher->hitEnd() == TRUE) {
3683 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line);
3684 failed = TRUE;
3685 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3686 UTF8Matcher->hitEnd() == TRUE) {
3687 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line);
3688 failed = TRUE;
3689 }
3690
3691 if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3692 matcher->hitEnd() == FALSE) {
3693 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line);
3694 failed = TRUE;
3695 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3696 UTF8Matcher->hitEnd() == FALSE) {
3697 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line);
3698 failed = TRUE;
3699 }
3700
3701
3702 cleanupAndReturn:
3703 if (failed) {
3704 infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" "
3705 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");
3706 // callerPattern->dump();
3707 }
3708 delete parseMatcher;
3709 delete parsePat;
3710 delete UTF8Matcher;
3711 delete UTF8Pattern;
3712 delete matcher;
3713 delete callerPattern;
3714
3715 utext_close(&inputText);
3716 delete[] inputChars;
3717 utext_close(&patternText);
3718 delete[] patternChars;
3719 ucnv_close(UTF8Converter);
3720 }
3721
3722
3723
3724
3725 //---------------------------------------------------------------------------
3726 //
3727 // Errors Check for error handling in patterns.
3728 //
3729 //---------------------------------------------------------------------------
Errors()3730 void RegexTest::Errors() {
3731 // \escape sequences that aren't implemented yet.
3732 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3733
3734 // Missing close parentheses
3735 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3736 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3737 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3738
3739 // Extra close paren
3740 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3741 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3742 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3743
3744 // Look-ahead, Look-behind
3745 // TODO: add tests for unbounded length look-behinds.
3746 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
3747
3748 // Attempt to use non-default flags
3749 {
3750 UParseError pe;
3751 UErrorCode status = U_ZERO_ERROR;
3752 int32_t flags = UREGEX_CANON_EQ |
3753 UREGEX_COMMENTS | UREGEX_DOTALL |
3754 UREGEX_MULTILINE;
3755 RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3756 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3757 delete pat1;
3758 }
3759
3760
3761 // Quantifiers are allowed only after something that can be quantified.
3762 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3763 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3764 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3765
3766 // Mal-formed {min,max} quantifiers
3767 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3768 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3769 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3770 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3771 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3772 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3773 REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan
3774 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format
3775 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3776
3777 // Ticket 5389
3778 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3779
3780 // Invalid Back Reference \0
3781 // For ICU 3.8 and earlier
3782 // For ICU versions newer than 3.8, \0 introduces an octal escape.
3783 //
3784 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3785
3786 }
3787
3788
3789 //-------------------------------------------------------------------------------
3790 //
3791 // Read a text data file, convert it to UChars, and return the data
3792 // in one big UChar * buffer, which the caller must delete.
3793 //
3794 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int32_t & ulen,const char * defEncoding,UErrorCode & status)3795 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3796 const char *defEncoding, UErrorCode &status) {
3797 UChar *retPtr = NULL;
3798 char *fileBuf = NULL;
3799 UConverter* conv = NULL;
3800 FILE *f = NULL;
3801
3802 ulen = 0;
3803 if (U_FAILURE(status)) {
3804 return retPtr;
3805 }
3806
3807 //
3808 // Open the file.
3809 //
3810 f = fopen(fileName, "rb");
3811 if (f == 0) {
3812 dataerrln("Error opening test data file %s\n", fileName);
3813 status = U_FILE_ACCESS_ERROR;
3814 return NULL;
3815 }
3816 //
3817 // Read it in
3818 //
3819 int32_t fileSize;
3820 int32_t amt_read;
3821
3822 fseek( f, 0, SEEK_END);
3823 fileSize = ftell(f);
3824 fileBuf = new char[fileSize];
3825 fseek(f, 0, SEEK_SET);
3826 amt_read = fread(fileBuf, 1, fileSize, f);
3827 if (amt_read != fileSize || fileSize <= 0) {
3828 errln("Error reading test data file.");
3829 goto cleanUpAndReturn;
3830 }
3831
3832 //
3833 // Look for a Unicode Signature (BOM) on the data just read
3834 //
3835 int32_t signatureLength;
3836 const char * fileBufC;
3837 const char* encoding;
3838
3839 fileBufC = fileBuf;
3840 encoding = ucnv_detectUnicodeSignature(
3841 fileBuf, fileSize, &signatureLength, &status);
3842 if(encoding!=NULL ){
3843 fileBufC += signatureLength;
3844 fileSize -= signatureLength;
3845 } else {
3846 encoding = defEncoding;
3847 if (strcmp(encoding, "utf-8") == 0) {
3848 errln("file %s is missing its BOM", fileName);
3849 }
3850 }
3851
3852 //
3853 // Open a converter to take the rule file to UTF-16
3854 //
3855 conv = ucnv_open(encoding, &status);
3856 if (U_FAILURE(status)) {
3857 goto cleanUpAndReturn;
3858 }
3859
3860 //
3861 // Convert the rules to UChar.
3862 // Preflight first to determine required buffer size.
3863 //
3864 ulen = ucnv_toUChars(conv,
3865 NULL, // dest,
3866 0, // destCapacity,
3867 fileBufC,
3868 fileSize,
3869 &status);
3870 if (status == U_BUFFER_OVERFLOW_ERROR) {
3871 // Buffer Overflow is expected from the preflight operation.
3872 status = U_ZERO_ERROR;
3873
3874 retPtr = new UChar[ulen+1];
3875 ucnv_toUChars(conv,
3876 retPtr, // dest,
3877 ulen+1,
3878 fileBufC,
3879 fileSize,
3880 &status);
3881 }
3882
3883 cleanUpAndReturn:
3884 fclose(f);
3885 delete[] fileBuf;
3886 ucnv_close(conv);
3887 if (U_FAILURE(status)) {
3888 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3889 delete []retPtr;
3890 retPtr = 0;
3891 ulen = 0;
3892 };
3893 return retPtr;
3894 }
3895
3896
3897 //-------------------------------------------------------------------------------
3898 //
3899 // PerlTests - Run Perl's regular expression tests
3900 // The input file for this test is re_tests, the standard regular
3901 // expression test data distributed with the Perl source code.
3902 //
3903 // Here is Perl's description of the test data file:
3904 //
3905 // # The tests are in a separate file 't/op/re_tests'.
3906 // # Each line in that file is a separate test.
3907 // # There are five columns, separated by tabs.
3908 // #
3909 // # Column 1 contains the pattern, optionally enclosed in C<''>.
3910 // # Modifiers can be put after the closing C<'>.
3911 // #
3912 // # Column 2 contains the string to be matched.
3913 // #
3914 // # Column 3 contains the expected result:
3915 // # y expect a match
3916 // # n expect no match
3917 // # c expect an error
3918 // # B test exposes a known bug in Perl, should be skipped
3919 // # b test exposes a known bug in Perl, should be skipped if noamp
3920 // #
3921 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3922 // #
3923 // # Column 4 contains a string, usually C<$&>.
3924 // #
3925 // # Column 5 contains the expected result of double-quote
3926 // # interpolating that string after the match, or start of error message.
3927 // #
3928 // # Column 6, if present, contains a reason why the test is skipped.
3929 // # This is printed with "skipped", for harness to pick up.
3930 // #
3931 // # \n in the tests are interpolated, as are variables of the form ${\w+}.
3932 // #
3933 // # If you want to add a regular expression test that can't be expressed
3934 // # in this format, don't add it here: put it in op/pat.t instead.
3935 //
3936 // For ICU, if field 3 contains an 'i', the test will be skipped.
// The test exposes some known incompatibility between ICU and Perl regexps.
3938 // (The i is in addition to whatever was there before.)
3939 //
3940 //-------------------------------------------------------------------------------
PerlTests()3941 void RegexTest::PerlTests() {
3942 char tdd[2048];
3943 const char *srcPath;
3944 UErrorCode status = U_ZERO_ERROR;
3945 UParseError pe;
3946
3947 //
3948 // Open and read the test data file.
3949 //
3950 srcPath=getPath(tdd, "re_tests.txt");
3951 if(srcPath==NULL) {
3952 return; /* something went wrong, error already output */
3953 }
3954
3955 int32_t len;
3956 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3957 if (U_FAILURE(status)) {
3958 return; /* something went wrong, error already output */
3959 }
3960
3961 //
3962 // Put the test data into a UnicodeString
3963 //
3964 UnicodeString testDataString(FALSE, testData, len);
3965
3966 //
3967 // Regex to break the input file into lines, and strip the new lines.
3968 // One line per match, capture group one is the desired data.
3969 //
3970 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
3971 if (U_FAILURE(status)) {
3972 dataerrln("RegexPattern::compile() error");
3973 return;
3974 }
3975 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
3976
3977 //
3978 // Regex to split a test file line into fields.
3979 // There are six fields, separated by tabs.
3980 //
3981 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
3982
3983 //
3984 // Regex to identify test patterns with flag settings, and to separate them.
3985 // Test patterns with flags look like 'pattern'i
3986 // Test patterns without flags are not quoted: pattern
3987 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
3988 //
3989 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
3990 RegexMatcher* flagMat = flagPat->matcher(status);
3991
3992 //
3993 // The Perl tests reference several perl-isms, which are evaluated/substituted
3994 // in the test data. Not being perl, this must be done explicitly. Here
3995 // are string constants and REs for these constructs.
3996 //
3997 UnicodeString nulnulSrc("${nulnul}");
3998 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
3999 nulnul = nulnul.unescape();
4000
4001 UnicodeString ffffSrc("${ffff}");
4002 UnicodeString ffff("\\uffff", -1, US_INV);
4003 ffff = ffff.unescape();
4004
4005 // regexp for $-[0], $+[2], etc.
4006 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4007 RegexMatcher *groupsMat = groupsPat->matcher(status);
4008
4009 // regexp for $0, $1, $2, etc.
4010 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4011 RegexMatcher *cgMat = cgPat->matcher(status);
4012
4013
4014 //
4015 // Main Loop for the Perl Tests, runs once per line from the
4016 // test data file.
4017 //
4018 int32_t lineNum = 0;
4019 int32_t skippedUnimplementedCount = 0;
4020 while (lineMat->find()) {
4021 lineNum++;
4022
4023 //
4024 // Get a line, break it into its fields, do the Perl
4025 // variable substitutions.
4026 //
4027 UnicodeString line = lineMat->group(1, status);
4028 UnicodeString fields[7];
4029 fieldPat->split(line, fields, 7, status);
4030
4031 flagMat->reset(fields[0]);
4032 flagMat->matches(status);
4033 UnicodeString pattern = flagMat->group(2, status);
4034 pattern.findAndReplace("${bang}", "!");
4035 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4036 pattern.findAndReplace(ffffSrc, ffff);
4037
4038 //
4039 // Identify patterns that include match flag settings,
4040 // split off the flags, remove the extra quotes.
4041 //
4042 UnicodeString flagStr = flagMat->group(3, status);
4043 if (U_FAILURE(status)) {
4044 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4045 return;
4046 }
4047 int32_t flags = 0;
4048 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4049 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4050 const UChar UChar_m = 0x6d;
4051 const UChar UChar_x = 0x78;
4052 const UChar UChar_y = 0x79;
4053 if (flagStr.indexOf(UChar_i) != -1) {
4054 flags |= UREGEX_CASE_INSENSITIVE;
4055 }
4056 if (flagStr.indexOf(UChar_m) != -1) {
4057 flags |= UREGEX_MULTILINE;
4058 }
4059 if (flagStr.indexOf(UChar_x) != -1) {
4060 flags |= UREGEX_COMMENTS;
4061 }
4062
4063 //
4064 // Compile the test pattern.
4065 //
4066 status = U_ZERO_ERROR;
4067 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4068 if (status == U_REGEX_UNIMPLEMENTED) {
4069 //
4070 // Test of a feature that is planned for ICU, but not yet implemented.
4071 // skip the test.
4072 skippedUnimplementedCount++;
4073 delete testPat;
4074 status = U_ZERO_ERROR;
4075 continue;
4076 }
4077
4078 if (U_FAILURE(status)) {
4079 // Some tests are supposed to generate errors.
4080 // Only report an error for tests that are supposed to succeed.
4081 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4082 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4083 {
4084 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4085 }
4086 status = U_ZERO_ERROR;
4087 delete testPat;
4088 continue;
4089 }
4090
4091 if (fields[2].indexOf(UChar_i) >= 0) {
4092 // ICU should skip this test.
4093 delete testPat;
4094 continue;
4095 }
4096
4097 if (fields[2].indexOf(UChar_c) >= 0) {
4098 // This pattern should have caused a compilation error, but didn't/
4099 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4100 delete testPat;
4101 continue;
4102 }
4103
4104 //
4105 // replace the Perl variables that appear in some of the
4106 // match data strings.
4107 //
4108 UnicodeString matchString = fields[1];
4109 matchString.findAndReplace(nulnulSrc, nulnul);
4110 matchString.findAndReplace(ffffSrc, ffff);
4111
4112 // Replace any \n in the match string with an actual new-line char.
4113 // Don't do full unescape, as this unescapes more than Perl does, which
4114 // causes other spurious failures in the tests.
4115 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4116
4117
4118
4119 //
4120 // Run the test, check for expected match/don't match result.
4121 //
4122 RegexMatcher *testMat = testPat->matcher(matchString, status);
4123 UBool found = testMat->find();
4124 UBool expected = FALSE;
4125 if (fields[2].indexOf(UChar_y) >=0) {
4126 expected = TRUE;
4127 }
4128 if (expected != found) {
4129 errln("line %d: Expected %smatch, got %smatch",
4130 lineNum, expected?"":"no ", found?"":"no " );
4131 continue;
4132 }
4133
4134 // Don't try to check expected results if there is no match.
4135 // (Some have stuff in the expected fields)
4136 if (!found) {
4137 delete testMat;
4138 delete testPat;
4139 continue;
4140 }
4141
4142 //
4143 // Interpret the Perl expression from the fourth field of the data file,
4144 // building up an ICU string from the results of the ICU match.
4145 // The Perl expression will contain references to the results of
4146 // a regex match, including the matched string, capture group strings,
4147 // group starting and ending indicies, etc.
4148 //
4149 UnicodeString resultString;
4150 UnicodeString perlExpr = fields[3];
4151 #if SUPPORT_MUTATING_INPUT_STRING
4152 groupsMat->reset(perlExpr);
4153 cgMat->reset(perlExpr);
4154 #endif
4155
4156 while (perlExpr.length() > 0) {
4157 #if !SUPPORT_MUTATING_INPUT_STRING
4158 // Perferred usage. Reset after any modification to input string.
4159 groupsMat->reset(perlExpr);
4160 cgMat->reset(perlExpr);
4161 #endif
4162
4163 if (perlExpr.startsWith("$&")) {
4164 resultString.append(testMat->group(status));
4165 perlExpr.remove(0, 2);
4166 }
4167
4168 else if (groupsMat->lookingAt(status)) {
4169 // $-[0] $+[2] etc.
4170 UnicodeString digitString = groupsMat->group(2, status);
4171 int32_t t = 0;
4172 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4173 UnicodeString plusOrMinus = groupsMat->group(1, status);
4174 int32_t matchPosition;
4175 if (plusOrMinus.compare("+") == 0) {
4176 matchPosition = testMat->end(groupNum, status);
4177 } else {
4178 matchPosition = testMat->start(groupNum, status);
4179 }
4180 if (matchPosition != -1) {
4181 ICU_Utility::appendNumber(resultString, matchPosition);
4182 }
4183 perlExpr.remove(0, groupsMat->end(status));
4184 }
4185
4186 else if (cgMat->lookingAt(status)) {
4187 // $1, $2, $3, etc.
4188 UnicodeString digitString = cgMat->group(1, status);
4189 int32_t t = 0;
4190 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4191 if (U_SUCCESS(status)) {
4192 resultString.append(testMat->group(groupNum, status));
4193 status = U_ZERO_ERROR;
4194 }
4195 perlExpr.remove(0, cgMat->end(status));
4196 }
4197
4198 else if (perlExpr.startsWith("@-")) {
4199 int32_t i;
4200 for (i=0; i<=testMat->groupCount(); i++) {
4201 if (i>0) {
4202 resultString.append(" ");
4203 }
4204 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4205 }
4206 perlExpr.remove(0, 2);
4207 }
4208
4209 else if (perlExpr.startsWith("@+")) {
4210 int32_t i;
4211 for (i=0; i<=testMat->groupCount(); i++) {
4212 if (i>0) {
4213 resultString.append(" ");
4214 }
4215 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4216 }
4217 perlExpr.remove(0, 2);
4218 }
4219
4220 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4221 // or as an escaped sequence (e.g. \n)
4222 if (perlExpr.length() > 1) {
4223 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4224 }
4225 UChar c = perlExpr.charAt(0);
4226 switch (c) {
4227 case 'n': c = '\n'; break;
4228 // add any other escape sequences that show up in the test expected results.
4229 }
4230 resultString.append(c);
4231 perlExpr.remove(0, 1);
4232 }
4233
4234 else {
4235 // Any characters from the perl expression that we don't explicitly
4236 // recognize before here are assumed to be literals and copied
4237 // as-is to the expected results.
4238 resultString.append(perlExpr.charAt(0));
4239 perlExpr.remove(0, 1);
4240 }
4241
4242 if (U_FAILURE(status)) {
4243 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4244 break;
4245 }
4246 }
4247
4248 //
4249 // Expected Results Compare
4250 //
4251 UnicodeString expectedS(fields[4]);
4252 expectedS.findAndReplace(nulnulSrc, nulnul);
4253 expectedS.findAndReplace(ffffSrc, ffff);
4254 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4255
4256
4257 if (expectedS.compare(resultString) != 0) {
4258 err("Line %d: Incorrect perl expression results.", lineNum);
4259 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4260 }
4261
4262 delete testMat;
4263 delete testPat;
4264 }
4265
4266 //
4267 // All done. Clean up allocated stuff.
4268 //
4269 delete cgMat;
4270 delete cgPat;
4271
4272 delete groupsMat;
4273 delete groupsPat;
4274
4275 delete flagMat;
4276 delete flagPat;
4277
4278 delete lineMat;
4279 delete linePat;
4280
4281 delete fieldPat;
4282 delete [] testData;
4283
4284
4285 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4286
4287 }
4288
4289
4290 //-------------------------------------------------------------------------------
4291 //
4292 // PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
4293 // (instead of using UnicodeStrings) to test the alternate engine.
4294 // The input file for this test is re_tests, the standard regular
4295 // expression test data distributed with the Perl source code.
4296 // See PerlTests() for more information.
4297 //
4298 //-------------------------------------------------------------------------------
PerlTestsUTF8()4299 void RegexTest::PerlTestsUTF8() {
4300 char tdd[2048];
4301 const char *srcPath;
4302 UErrorCode status = U_ZERO_ERROR;
4303 UParseError pe;
4304 LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4305 UText patternText = UTEXT_INITIALIZER;
4306 char *patternChars = NULL;
4307 int32_t patternLength;
4308 int32_t patternCapacity = 0;
4309 UText inputText = UTEXT_INITIALIZER;
4310 char *inputChars = NULL;
4311 int32_t inputLength;
4312 int32_t inputCapacity = 0;
4313
4314 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4315
4316 //
4317 // Open and read the test data file.
4318 //
4319 srcPath=getPath(tdd, "re_tests.txt");
4320 if(srcPath==NULL) {
4321 return; /* something went wrong, error already output */
4322 }
4323
4324 int32_t len;
4325 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4326 if (U_FAILURE(status)) {
4327 return; /* something went wrong, error already output */
4328 }
4329
4330 //
4331 // Put the test data into a UnicodeString
4332 //
4333 UnicodeString testDataString(FALSE, testData, len);
4334
4335 //
4336 // Regex to break the input file into lines, and strip the new lines.
4337 // One line per match, capture group one is the desired data.
4338 //
4339 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4340 if (U_FAILURE(status)) {
4341 dataerrln("RegexPattern::compile() error");
4342 return;
4343 }
4344 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4345
4346 //
4347 // Regex to split a test file line into fields.
4348 // There are six fields, separated by tabs.
4349 //
4350 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4351
4352 //
4353 // Regex to identify test patterns with flag settings, and to separate them.
4354 // Test patterns with flags look like 'pattern'i
4355 // Test patterns without flags are not quoted: pattern
4356 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4357 //
4358 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4359 RegexMatcher* flagMat = flagPat->matcher(status);
4360
4361 //
4362 // The Perl tests reference several perl-isms, which are evaluated/substituted
4363 // in the test data. Not being perl, this must be done explicitly. Here
4364 // are string constants and REs for these constructs.
4365 //
4366 UnicodeString nulnulSrc("${nulnul}");
4367 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4368 nulnul = nulnul.unescape();
4369
4370 UnicodeString ffffSrc("${ffff}");
4371 UnicodeString ffff("\\uffff", -1, US_INV);
4372 ffff = ffff.unescape();
4373
4374 // regexp for $-[0], $+[2], etc.
4375 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4376 RegexMatcher *groupsMat = groupsPat->matcher(status);
4377
4378 // regexp for $0, $1, $2, etc.
4379 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4380 RegexMatcher *cgMat = cgPat->matcher(status);
4381
4382
4383 //
4384 // Main Loop for the Perl Tests, runs once per line from the
4385 // test data file.
4386 //
4387 int32_t lineNum = 0;
4388 int32_t skippedUnimplementedCount = 0;
4389 while (lineMat->find()) {
4390 lineNum++;
4391
4392 //
4393 // Get a line, break it into its fields, do the Perl
4394 // variable substitutions.
4395 //
4396 UnicodeString line = lineMat->group(1, status);
4397 UnicodeString fields[7];
4398 fieldPat->split(line, fields, 7, status);
4399
4400 flagMat->reset(fields[0]);
4401 flagMat->matches(status);
4402 UnicodeString pattern = flagMat->group(2, status);
4403 pattern.findAndReplace("${bang}", "!");
4404 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4405 pattern.findAndReplace(ffffSrc, ffff);
4406
4407 //
4408 // Identify patterns that include match flag settings,
4409 // split off the flags, remove the extra quotes.
4410 //
4411 UnicodeString flagStr = flagMat->group(3, status);
4412 if (U_FAILURE(status)) {
4413 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4414 return;
4415 }
4416 int32_t flags = 0;
4417 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4418 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4419 const UChar UChar_m = 0x6d;
4420 const UChar UChar_x = 0x78;
4421 const UChar UChar_y = 0x79;
4422 if (flagStr.indexOf(UChar_i) != -1) {
4423 flags |= UREGEX_CASE_INSENSITIVE;
4424 }
4425 if (flagStr.indexOf(UChar_m) != -1) {
4426 flags |= UREGEX_MULTILINE;
4427 }
4428 if (flagStr.indexOf(UChar_x) != -1) {
4429 flags |= UREGEX_COMMENTS;
4430 }
4431
4432 //
4433 // Put the pattern in a UTF-8 UText
4434 //
4435 status = U_ZERO_ERROR;
4436 patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4437 if (status == U_BUFFER_OVERFLOW_ERROR) {
4438 status = U_ZERO_ERROR;
4439 delete[] patternChars;
4440 patternCapacity = patternLength + 1;
4441 patternChars = new char[patternCapacity];
4442 pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4443 }
4444 utext_openUTF8(&patternText, patternChars, patternLength, &status);
4445
4446 //
4447 // Compile the test pattern.
4448 //
4449 RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4450 if (status == U_REGEX_UNIMPLEMENTED) {
4451 //
4452 // Test of a feature that is planned for ICU, but not yet implemented.
4453 // skip the test.
4454 skippedUnimplementedCount++;
4455 delete testPat;
4456 status = U_ZERO_ERROR;
4457 continue;
4458 }
4459
4460 if (U_FAILURE(status)) {
4461 // Some tests are supposed to generate errors.
4462 // Only report an error for tests that are supposed to succeed.
4463 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4464 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4465 {
4466 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4467 }
4468 status = U_ZERO_ERROR;
4469 delete testPat;
4470 continue;
4471 }
4472
4473 if (fields[2].indexOf(UChar_i) >= 0) {
4474 // ICU should skip this test.
4475 delete testPat;
4476 continue;
4477 }
4478
4479 if (fields[2].indexOf(UChar_c) >= 0) {
4480 // This pattern should have caused a compilation error, but didn't/
4481 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4482 delete testPat;
4483 continue;
4484 }
4485
4486
4487 //
4488 // replace the Perl variables that appear in some of the
4489 // match data strings.
4490 //
4491 UnicodeString matchString = fields[1];
4492 matchString.findAndReplace(nulnulSrc, nulnul);
4493 matchString.findAndReplace(ffffSrc, ffff);
4494
4495 // Replace any \n in the match string with an actual new-line char.
4496 // Don't do full unescape, as this unescapes more than Perl does, which
4497 // causes other spurious failures in the tests.
4498 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4499
4500 //
4501 // Put the input in a UTF-8 UText
4502 //
4503 status = U_ZERO_ERROR;
4504 inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4505 if (status == U_BUFFER_OVERFLOW_ERROR) {
4506 status = U_ZERO_ERROR;
4507 delete[] inputChars;
4508 inputCapacity = inputLength + 1;
4509 inputChars = new char[inputCapacity];
4510 matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4511 }
4512 utext_openUTF8(&inputText, inputChars, inputLength, &status);
4513
4514 //
4515 // Run the test, check for expected match/don't match result.
4516 //
4517 RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4518 UBool found = testMat->find();
4519 UBool expected = FALSE;
4520 if (fields[2].indexOf(UChar_y) >=0) {
4521 expected = TRUE;
4522 }
4523 if (expected != found) {
4524 errln("line %d: Expected %smatch, got %smatch",
4525 lineNum, expected?"":"no ", found?"":"no " );
4526 continue;
4527 }
4528
4529 // Don't try to check expected results if there is no match.
4530 // (Some have stuff in the expected fields)
4531 if (!found) {
4532 delete testMat;
4533 delete testPat;
4534 continue;
4535 }
4536
4537 //
4538 // Interpret the Perl expression from the fourth field of the data file,
4539 // building up an ICU string from the results of the ICU match.
4540 // The Perl expression will contain references to the results of
4541 // a regex match, including the matched string, capture group strings,
4542 // group starting and ending indicies, etc.
4543 //
4544 UnicodeString resultString;
4545 UnicodeString perlExpr = fields[3];
4546
4547 while (perlExpr.length() > 0) {
4548 groupsMat->reset(perlExpr);
4549 cgMat->reset(perlExpr);
4550
4551 if (perlExpr.startsWith("$&")) {
4552 resultString.append(testMat->group(status));
4553 perlExpr.remove(0, 2);
4554 }
4555
4556 else if (groupsMat->lookingAt(status)) {
4557 // $-[0] $+[2] etc.
4558 UnicodeString digitString = groupsMat->group(2, status);
4559 int32_t t = 0;
4560 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4561 UnicodeString plusOrMinus = groupsMat->group(1, status);
4562 int32_t matchPosition;
4563 if (plusOrMinus.compare("+") == 0) {
4564 matchPosition = testMat->end(groupNum, status);
4565 } else {
4566 matchPosition = testMat->start(groupNum, status);
4567 }
4568 if (matchPosition != -1) {
4569 ICU_Utility::appendNumber(resultString, matchPosition);
4570 }
4571 perlExpr.remove(0, groupsMat->end(status));
4572 }
4573
4574 else if (cgMat->lookingAt(status)) {
4575 // $1, $2, $3, etc.
4576 UnicodeString digitString = cgMat->group(1, status);
4577 int32_t t = 0;
4578 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4579 if (U_SUCCESS(status)) {
4580 resultString.append(testMat->group(groupNum, status));
4581 status = U_ZERO_ERROR;
4582 }
4583 perlExpr.remove(0, cgMat->end(status));
4584 }
4585
4586 else if (perlExpr.startsWith("@-")) {
4587 int32_t i;
4588 for (i=0; i<=testMat->groupCount(); i++) {
4589 if (i>0) {
4590 resultString.append(" ");
4591 }
4592 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4593 }
4594 perlExpr.remove(0, 2);
4595 }
4596
4597 else if (perlExpr.startsWith("@+")) {
4598 int32_t i;
4599 for (i=0; i<=testMat->groupCount(); i++) {
4600 if (i>0) {
4601 resultString.append(" ");
4602 }
4603 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4604 }
4605 perlExpr.remove(0, 2);
4606 }
4607
4608 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4609 // or as an escaped sequence (e.g. \n)
4610 if (perlExpr.length() > 1) {
4611 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4612 }
4613 UChar c = perlExpr.charAt(0);
4614 switch (c) {
4615 case 'n': c = '\n'; break;
4616 // add any other escape sequences that show up in the test expected results.
4617 }
4618 resultString.append(c);
4619 perlExpr.remove(0, 1);
4620 }
4621
4622 else {
4623 // Any characters from the perl expression that we don't explicitly
4624 // recognize before here are assumed to be literals and copied
4625 // as-is to the expected results.
4626 resultString.append(perlExpr.charAt(0));
4627 perlExpr.remove(0, 1);
4628 }
4629
4630 if (U_FAILURE(status)) {
4631 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4632 break;
4633 }
4634 }
4635
4636 //
4637 // Expected Results Compare
4638 //
4639 UnicodeString expectedS(fields[4]);
4640 expectedS.findAndReplace(nulnulSrc, nulnul);
4641 expectedS.findAndReplace(ffffSrc, ffff);
4642 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4643
4644
4645 if (expectedS.compare(resultString) != 0) {
4646 err("Line %d: Incorrect perl expression results.", lineNum);
4647 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4648 }
4649
4650 delete testMat;
4651 delete testPat;
4652 }
4653
4654 //
4655 // All done. Clean up allocated stuff.
4656 //
4657 delete cgMat;
4658 delete cgPat;
4659
4660 delete groupsMat;
4661 delete groupsPat;
4662
4663 delete flagMat;
4664 delete flagPat;
4665
4666 delete lineMat;
4667 delete linePat;
4668
4669 delete fieldPat;
4670 delete [] testData;
4671
4672 utext_close(&patternText);
4673 utext_close(&inputText);
4674
4675 delete [] patternChars;
4676 delete [] inputChars;
4677
4678
4679 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4680
4681 }
4682
4683
4684 //--------------------------------------------------------------
4685 //
4686 // Bug6149 Verify limits to heap expansion for backtrack stack.
4687 // Use this pattern,
4688 // "(a?){1,8000000}"
//  Note: the repetition originally had an unbounded upper bound, but unbounded
//        loops now have loop-breaking enabled.
//        This test is likely to be fragile, as further optimizations stop
//        more cases of pointless looping in the match engine.
4692 //
4693 //---------------------------------------------------------------
Bug6149()4694 void RegexTest::Bug6149() {
4695 UnicodeString pattern("(a?){1,8000000}");
4696 UnicodeString s("xyz");
4697 uint32_t flags = 0;
4698 UErrorCode status = U_ZERO_ERROR;
4699
4700 RegexMatcher matcher(pattern, s, flags, status);
4701 UBool result = false;
4702 REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4703 REGEX_ASSERT(result == FALSE);
4704 }
4705
4706
4707 //
4708 // Callbacks() Test the callback function.
4709 // When set, callbacks occur periodically during matching operations,
4710 // giving the application code the ability to abort the operation
//              before its normal completion.
4712 //
4713
4714 struct callBackContext {
4715 RegexTest *test;
4716 int32_t maxCalls;
4717 int32_t numCalls;
4718 int32_t lastSteps;
resetcallBackContext4719 void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4720 };
4721
4722 U_CDECL_BEGIN
4723 static UBool U_CALLCONV
testCallBackFn(const void * context,int32_t steps)4724 testCallBackFn(const void *context, int32_t steps) {
4725 callBackContext *info = (callBackContext *)context;
4726 if (info->lastSteps+1 != steps) {
4727 info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps);
4728 }
4729 info->lastSteps = steps;
4730 info->numCalls++;
4731 return (info->numCalls < info->maxCalls);
4732 }
4733 U_CDECL_END
4734
// Exercises RegexMatcher::setMatchCallback() / getMatchCallback() using
// testCallBackFn and a callBackContext:
//   1. the getter hands back NULLs when nothing has been set;
//   2. the getter hands back exactly what the setter stored;
//   3. a short match finishes without any callback invocation;
//   4. a medium match invokes the callback but still completes normally;
//   5. a longer match is aborted by the callback (U_REGEX_STOPPED_BY_CALLER).
void RegexTest::Callbacks() {
    {
        // Getter returns NULLs if no callback has been set

        // The variables that the getter will fill in.
        // Init to non-null values so that the action of the getter can be seen.
        const void *returnedContext = &returnedContext;
        URegexMatchCallback *returnedFn = &testCallBackFn;

        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher("x", 0, status);
        REGEX_CHECK_STATUS;
        matcher.getMatchCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == NULL);
        REGEX_ASSERT(returnedContext == NULL);
    }

    {
        // Set and Get work
        callBackContext cbInfo = {this, 0, 0, 0};
        const void *returnedContext;
        URegexMatchCallback *returnedFn;
        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.
        REGEX_CHECK_STATUS;
        matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
        REGEX_CHECK_STATUS;
        matcher.getMatchCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == testCallBackFn);
        REGEX_ASSERT(returnedContext == &cbInfo);

        // A short-running match shouldn't invoke the callback
        status = U_ZERO_ERROR;
        cbInfo.reset(1);
        UnicodeString s = "xxx";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status));
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls == 0);

        // A medium-length match that runs long enough to invoke the
        // callback, but not so long that the callback aborts it.
        // With a limit of 4, testCallBackFn returns TRUE for its first three
        // invocations and FALSE on the fourth.
        status = U_ZERO_ERROR;
        cbInfo.reset(4);
        s = "aaaaaaaaaaaaaaaaaaab";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status)==FALSE);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls > 0);

        // A longer running match that the callback function will abort.
        // Same limit as above, but the longer input is expected to reach the
        // fourth invocation, at which point the callback returns FALSE.
        status = U_ZERO_ERROR;
        cbInfo.reset(4);
        s = "aaaaaaaaaaaaaaaaaaaaaaab";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status)==FALSE);
        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
        REGEX_ASSERT(cbInfo.numCalls == 4);
    }


}
4799
4800
4801 //
4802 // FindProgressCallbacks() Test the find "progress" callback function.
//      When set, the find progress callback will be invoked during find operations
//      after each return from a match attempt, giving the application the opportunity
//      to terminate a long-running find operation before its normal completion.
4806 //
4807
4808 struct progressCallBackContext {
4809 RegexTest *test;
4810 int64_t lastIndex;
4811 int32_t maxCalls;
4812 int32_t numCalls;
resetprogressCallBackContext4813 void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4814 };
4815
4816 U_CDECL_BEGIN
4817 static UBool U_CALLCONV
testProgressCallBackFn(const void * context,int64_t matchIndex)4818 testProgressCallBackFn(const void *context, int64_t matchIndex) {
4819 progressCallBackContext *info = (progressCallBackContext *)context;
4820 info->numCalls++;
4821 info->lastIndex = matchIndex;
4822 // info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4823 return (info->numCalls < info->maxCalls);
4824 }
4825 U_CDECL_END
4826
// Exercises RegexMatcher::setFindProgressCallback() / getFindProgressCallback()
// using testProgressCallBackFn and a progressCallBackContext:
//   1. the getter hands back NULLs when nothing has been set;
//   2. the getter hands back exactly what the setter stored;
//   3. a find() that succeeds quickly makes no callback invocation;
//   4. a failing find() over a medium input invokes the callback repeatedly;
//   5. a failing find() over a longer input is cut short by the callback.
// A further scenario (resuming after an interruption) is compiled out with #if 0.
void RegexTest::FindProgressCallbacks() {
    {
        // Getter returns NULLs if no callback has been set

        // The variables that the getter will fill in.
        // Init to non-null values so that the action of the getter can be seen.
        const void *returnedContext = &returnedContext;
        URegexFindProgressCallback *returnedFn = &testProgressCallBackFn;

        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher("x", 0, status);
        REGEX_CHECK_STATUS;
        matcher.getFindProgressCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == NULL);
        REGEX_ASSERT(returnedContext == NULL);
    }

    {
        // Set and Get work
        progressCallBackContext cbInfo = {this, 0, 0, 0};
        const void *returnedContext;
        URegexFindProgressCallback *returnedFn;
        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.
        REGEX_CHECK_STATUS;
        matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
        REGEX_CHECK_STATUS;
        matcher.getFindProgressCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == testProgressCallBackFn);
        REGEX_ASSERT(returnedContext == &cbInfo);

        // A short-running match should NOT invoke the callback.
        status = U_ZERO_ERROR;
        cbInfo.reset(100);
        UnicodeString s = "abxxx";
        matcher.reset(s);
#if 0
        matcher.setTrace(TRUE);
#endif
        REGEX_ASSERT(matcher.find(0, status));
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls == 0);

        // A medium running match that causes matcher.find() to invoke our callback for each index.
        status = U_ZERO_ERROR;
        s = "aaaaaaaaaaaaaaaaaaab";
        cbInfo.reset(s.length()); // Call limit is set to exactly the length of the input string.
        matcher.reset(s);
        REGEX_ASSERT(matcher.find(0, status)==FALSE);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);

        // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
        // The assertions below expect find() to return FALSE, the status to
        // remain a success code, and the callback count to equal the limit.
        status = U_ZERO_ERROR;
        UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
        cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string
        matcher.reset(s1);
        REGEX_ASSERT(matcher.find(0, status)==FALSE);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);

#if 0
        // Now a match that will succeed, but after an interruption
        status = U_ZERO_ERROR;
        UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
        cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string
        matcher.reset(s2);
        REGEX_ASSERT(matcher.find(0, status)==FALSE);
        REGEX_CHECK_STATUS;
        // Now retry the match from where left off
        cbInfo.maxCalls = 100; // No callback limit
        REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
        REGEX_CHECK_STATUS;
#endif
    }


}
4907
4908
4909 //---------------------------------------------------------------------------
4910 //
4911 // PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
4912 // UTexts. The pure-C implementation of UText
4913 // has no mutable backing stores, but we can
4914 // use UnicodeString here to test the functionality.
4915 //
4916 //---------------------------------------------------------------------------
PreAllocatedUTextCAPI()4917 void RegexTest::PreAllocatedUTextCAPI () {
4918 UErrorCode status = U_ZERO_ERROR;
4919 URegularExpression *re;
4920 UText patternText = UTEXT_INITIALIZER;
4921 UnicodeString buffer;
4922 UText bufferText = UTEXT_INITIALIZER;
4923
4924 utext_openUnicodeString(&bufferText, &buffer, &status);
4925
4926 /*
4927 * getText() and getUText()
4928 */
4929 {
4930 UText text1 = UTEXT_INITIALIZER;
4931 UText text2 = UTEXT_INITIALIZER;
4932 UChar text2Chars[20];
4933 UText *resultText;
4934
4935 status = U_ZERO_ERROR;
4936 regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
4937 regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
4938 u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
4939 utext_openUChars(&text2, text2Chars, -1, &status);
4940
4941 regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
4942 re = uregex_openUText(&patternText, 0, NULL, &status);
4943
4944 /* First set a UText */
4945 uregex_setUText(re, &text1, &status);
4946 resultText = uregex_getUText(re, &bufferText, &status);
4947 REGEX_CHECK_STATUS;
4948 REGEX_ASSERT(resultText == &bufferText);
4949 utext_setNativeIndex(resultText, 0);
4950 utext_setNativeIndex(&text1, 0);
4951 REGEX_ASSERT(testUTextEqual(resultText, &text1));
4952
4953 resultText = uregex_getUText(re, &bufferText, &status);
4954 REGEX_CHECK_STATUS;
4955 REGEX_ASSERT(resultText == &bufferText);
4956 utext_setNativeIndex(resultText, 0);
4957 utext_setNativeIndex(&text1, 0);
4958 REGEX_ASSERT(testUTextEqual(resultText, &text1));
4959
4960 /* Then set a UChar * */
4961 uregex_setText(re, text2Chars, 7, &status);
4962 resultText = uregex_getUText(re, &bufferText, &status);
4963 REGEX_CHECK_STATUS;
4964 REGEX_ASSERT(resultText == &bufferText);
4965 utext_setNativeIndex(resultText, 0);
4966 utext_setNativeIndex(&text2, 0);
4967 REGEX_ASSERT(testUTextEqual(resultText, &text2));
4968
4969 uregex_close(re);
4970 utext_close(&text1);
4971 utext_close(&text2);
4972 }
4973
4974 /*
4975 * group()
4976 */
4977 {
4978 UChar text1[80];
4979 UText *actual;
4980 UBool result;
4981 u_uastrncpy(text1, "noise abc interior def, and this is off the end", sizeof(text1)/2);
4982
4983 status = U_ZERO_ERROR;
4984 re = uregex_openC("abc(.*?)def", 0, NULL, &status);
4985 REGEX_CHECK_STATUS;
4986
4987 uregex_setText(re, text1, -1, &status);
4988 result = uregex_find(re, 0, &status);
4989 REGEX_ASSERT(result==TRUE);
4990
4991 /* Capture Group 0, the full match. Should succeed. */
4992 status = U_ZERO_ERROR;
4993 actual = uregex_groupUTextDeep(re, 0, &bufferText, &status);
4994 REGEX_CHECK_STATUS;
4995 REGEX_ASSERT(actual == &bufferText);
4996 REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual);
4997
4998 /* Capture group #1. Should succeed. */
4999 status = U_ZERO_ERROR;
5000 actual = uregex_groupUTextDeep(re, 1, &bufferText, &status);
5001 REGEX_CHECK_STATUS;
5002 REGEX_ASSERT(actual == &bufferText);
5003 REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual);
5004
5005 /* Capture group out of range. Error. */
5006 status = U_ZERO_ERROR;
5007 actual = uregex_groupUTextDeep(re, 2, &bufferText, &status);
5008 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5009 REGEX_ASSERT(actual == &bufferText);
5010
5011 uregex_close(re);
5012
5013 }
5014
5015 /*
5016 * replaceFirst()
5017 */
5018 {
5019 UChar text1[80];
5020 UChar text2[80];
5021 UText replText = UTEXT_INITIALIZER;
5022 UText *result;
5023
5024 status = U_ZERO_ERROR;
5025 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
5026 u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
5027 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5028
5029 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5030 REGEX_CHECK_STATUS;
5031
5032 /* Normal case, with match */
5033 uregex_setText(re, text1, -1, &status);
5034 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5035 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5036 REGEX_CHECK_STATUS;
5037 REGEX_ASSERT(result == &bufferText);
5038 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5039
5040 /* No match. Text should copy to output with no changes. */
5041 uregex_setText(re, text2, -1, &status);
5042 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5043 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5044 REGEX_CHECK_STATUS;
5045 REGEX_ASSERT(result == &bufferText);
5046 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5047
5048 /* Unicode escapes */
5049 uregex_setText(re, text1, -1, &status);
5050 regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status);
5051 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5052 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5053 REGEX_CHECK_STATUS;
5054 REGEX_ASSERT(result == &bufferText);
5055 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5056
5057 uregex_close(re);
5058 utext_close(&replText);
5059 }
5060
5061
5062 /*
5063 * replaceAll()
5064 */
5065 {
5066 UChar text1[80];
5067 UChar text2[80];
5068 UText replText = UTEXT_INITIALIZER;
5069 UText *result;
5070
5071 status = U_ZERO_ERROR;
5072 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
5073 u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
5074 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5075
5076 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5077 REGEX_CHECK_STATUS;
5078
5079 /* Normal case, with match */
5080 uregex_setText(re, text1, -1, &status);
5081 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5082 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5083 REGEX_CHECK_STATUS;
5084 REGEX_ASSERT(result == &bufferText);
5085 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5086
5087 /* No match. Text should copy to output with no changes. */
5088 uregex_setText(re, text2, -1, &status);
5089 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5090 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5091 REGEX_CHECK_STATUS;
5092 REGEX_ASSERT(result == &bufferText);
5093 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5094
5095 uregex_close(re);
5096 utext_close(&replText);
5097 }
5098
5099
5100 /*
5101 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5102 * so we don't need to test it here.
5103 */
5104
5105 utext_close(&bufferText);
5106 utext_close(&patternText);
5107 }
5108
5109 //--------------------------------------------------------------
5110 //
5111 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
5112 //
5113 //---------------------------------------------------------------
Bug7651()5114 void RegexTest::Bug7651() {
5115 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5116 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5117 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5118 UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5119 UnicodeString s("#ff @abcd This is test");
5120 RegexPattern *REPattern = NULL;
5121 RegexMatcher *REMatcher = NULL;
5122 UErrorCode status = U_ZERO_ERROR;
5123 UParseError pe;
5124
5125 REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5126 REGEX_CHECK_STATUS;
5127 REMatcher = REPattern->matcher(s, status);
5128 REGEX_CHECK_STATUS;
5129 REGEX_ASSERT(REMatcher->find());
5130 REGEX_ASSERT(REMatcher->start(status) == 0);
5131 delete REPattern;
5132 delete REMatcher;
5133 status = U_ZERO_ERROR;
5134
5135 REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5136 REGEX_CHECK_STATUS;
5137 REMatcher = REPattern->matcher(s, status);
5138 REGEX_CHECK_STATUS;
5139 REGEX_ASSERT(REMatcher->find());
5140 REGEX_ASSERT(REMatcher->start(status) == 0);
5141 delete REPattern;
5142 delete REMatcher;
5143 status = U_ZERO_ERROR;
5144 }
5145
Bug7740()5146 void RegexTest::Bug7740() {
5147 UErrorCode status = U_ZERO_ERROR;
5148 UnicodeString pattern = "(a)";
5149 UnicodeString text = "abcdef";
5150 RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5151 REGEX_CHECK_STATUS;
5152 REGEX_ASSERT(m->lookingAt(status));
5153 REGEX_CHECK_STATUS;
5154 status = U_ILLEGAL_ARGUMENT_ERROR;
5155 UnicodeString s = m->group(1, status); // Bug 7740: segfault here.
5156 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5157 REGEX_ASSERT(s == "");
5158 delete m;
5159 }
5160
// Bug 8479: was crashing with a bogus UnicodeString as input.
5162
Bug8479()5163 void RegexTest::Bug8479() {
5164 UErrorCode status = U_ZERO_ERROR;
5165
5166 RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5167 REGEX_CHECK_STATUS;
5168 if (U_SUCCESS(status))
5169 {
5170 UnicodeString str;
5171 str.setToBogus();
5172 pMatcher->reset(str);
5173 status = U_ZERO_ERROR;
5174 pMatcher->matches(status);
5175 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5176 delete pMatcher;
5177 }
5178 }
5179
5180
5181 // Bug 7029
Bug7029()5182 void RegexTest::Bug7029() {
5183 UErrorCode status = U_ZERO_ERROR;
5184
5185 RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5186 UnicodeString text = "abc.def";
5187 UnicodeString splits[10];
5188 REGEX_CHECK_STATUS;
5189 int32_t numFields = pMatcher->split(text, splits, 10, status);
5190 REGEX_CHECK_STATUS;
5191 REGEX_ASSERT(numFields == 8);
5192 delete pMatcher;
5193 }
5194
5195 // Bug 9283
//    This test is checking for the existence of any supplemental characters that case-fold
//    to a BMP character.
//
//    At the time of this writing there are none. If any should appear in a subsequent release
//    of Unicode, the code in regular expressions compilation that determines the longest
//    possible match for a literal string will need to be enhanced.
5202 //
5203 // See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5204 // for details on what to do in case of a failure of this test.
5205 //
Bug9283()5206 void RegexTest::Bug9283() {
5207 UErrorCode status = U_ZERO_ERROR;
5208 UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5209 REGEX_CHECK_STATUS;
5210 int32_t index;
5211 UChar32 c;
5212 for (index=0; ; index++) {
5213 c = supplementalsWithCaseFolding.charAt(index);
5214 if (c == -1) {
5215 break;
5216 }
5217 UnicodeString cf = UnicodeString(c).foldCase();
5218 REGEX_ASSERT(cf.length() >= 2);
5219 }
5220 }
5221
5222
CheckInvBufSize()5223 void RegexTest::CheckInvBufSize() {
5224 if(inv_next>=INV_BUFSIZ) {
5225 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5226 __FILE__, INV_BUFSIZ, inv_next);
5227 } else {
5228 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5229 }
5230 }
5231
5232 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
5233
5234