1 /********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 2002-2011, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6
7 //
8 // regextst.cpp
9 //
10 // ICU Regular Expressions test, part of intltest.
11 //
12
13 /*
14 NOTE!!
15
16 PLEASE be careful about ASCII assumptions in this test.
17 This test is one of the worst repeat offenders.
18 If you have questions, contact someone on the ICU PMC
19 who has access to an EBCDIC system.
20
21 */
22
23 #include "intltest.h"
24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
25
26 #include "unicode/regex.h"
27 #include "unicode/uchar.h"
28 #include "unicode/ucnv.h"
29 #include "unicode/ustring.h"
30 #include "regextst.h"
31 #include "uvector.h"
32 #include "util.h"
33 #include <stdlib.h>
34 #include <string.h>
35 #include <stdio.h>
36 #include "cstring.h"
37 #include "uinvchar.h"
38
39 #define SUPPORT_MUTATING_INPUT_STRING 0
40
41 //---------------------------------------------------------------------------
42 //
43 // Test class boilerplate
44 //
45 //---------------------------------------------------------------------------
RegexTest()46 RegexTest::RegexTest()
47 {
48 }
49
50
~RegexTest()51 RegexTest::~RegexTest()
52 {
53 }
54
55
56
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)57 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
58 {
59 if (exec) logln("TestSuite RegexTest: ");
60 switch (index) {
61
62 case 0: name = "Basic";
63 if (exec) Basic();
64 break;
65 case 1: name = "API_Match";
66 if (exec) API_Match();
67 break;
68 case 2: name = "API_Replace";
69 if (exec) API_Replace();
70 break;
71 case 3: name = "API_Pattern";
72 if (exec) API_Pattern();
73 break;
74 case 4:
75 #if !UCONFIG_NO_FILE_IO
76 name = "Extended";
77 if (exec) Extended();
78 #else
79 name = "skip";
80 #endif
81 break;
82 case 5: name = "Errors";
83 if (exec) Errors();
84 break;
85 case 6: name = "PerlTests";
86 if (exec) PerlTests();
87 break;
88 case 7: name = "Callbacks";
89 if (exec) Callbacks();
90 break;
91 case 8: name = "FindProgressCallbacks";
92 if (exec) FindProgressCallbacks();
93 break;
94 case 9: name = "Bug 6149";
95 if (exec) Bug6149();
96 break;
97 case 10: name = "UTextBasic";
98 if (exec) UTextBasic();
99 break;
100 case 11: name = "API_Match_UTF8";
101 if (exec) API_Match_UTF8();
102 break;
103 case 12: name = "API_Replace_UTF8";
104 if (exec) API_Replace_UTF8();
105 break;
106 case 13: name = "API_Pattern_UTF8";
107 if (exec) API_Pattern_UTF8();
108 break;
109 case 14: name = "PerlTestsUTF8";
110 if (exec) PerlTestsUTF8();
111 break;
112 case 15: name = "PreAllocatedUTextCAPI";
113 if (exec) PreAllocatedUTextCAPI();
114 break;
115 case 16: name = "Bug 7651";
116 if (exec) Bug7651();
117 break;
118 case 17: name = "Bug 7740";
119 if (exec) Bug7740();
120 break;
121 case 18: name = "Bug 8479";
122 if (exec) Bug8479();
123 break;
124 case 19: name = "Bug 7029";
125 if (exec) Bug7029();
126 break;
127 case 20: name = "CheckInvBufSize";
128 if (exec) CheckInvBufSize();
129 break;
130
131 default: name = "";
132 break; //needed to end loop
133 }
134 }
135
136
137
/**
 * Forward declaration; the definition appears further down in this file.
 *
 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
 * into ASCII.
 *
 * @param ut      UText passed through to utext_openUTF8
 * @param inv     invariant-character text in the compilation codepage
 * @param length  number of chars in inv, or -1 to have it measured with strlen()
 * @param status  ICU error code, passed through to utext_openUTF8
 * @return        the value returned by utext_openUTF8, or NULL if the conversion
 *                buffer is too small
 * @see utext_openUTF8
 */
static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
144
145 //---------------------------------------------------------------------------
146 //
147 // Error Checking / Reporting macros used in all of the tests.
148 //
149 //---------------------------------------------------------------------------
150
utextToPrintable(char * buf,int32_t bufLen,UText * text)151 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
152 int64_t oldIndex = utext_getNativeIndex(text);
153 utext_setNativeIndex(text, 0);
154 char *bufPtr = buf;
155 UChar32 c = utext_next32From(text, 0);
156 while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
157 if (0x000020<=c && c<0x00007e) {
158 *bufPtr = c;
159 } else {
160 #if 0
161 sprintf(bufPtr,"U+%04X", c);
162 bufPtr+= strlen(bufPtr)-1;
163 #else
164 *bufPtr = '%';
165 #endif
166 }
167 bufPtr++;
168 c = UTEXT_NEXT32(text);
169 }
170 *bufPtr = 0;
171 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
172 char *ebuf = (char*)malloc(bufLen);
173 uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
174 uprv_strncpy(buf, ebuf, bufLen);
175 free((void*)ebuf);
176 #endif
177 utext_setNativeIndex(text, oldIndex);
178 }
179
toHex(int32_t i)180 static inline UChar toHex(int32_t i) {
181 return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10)));
182 }
183
// Append a 7-bit-safe rendering of s to result and return result.
// Code units 0x0001..0x007F are appended unchanged; every other code unit
// (including 0x0000) is appended as a backslash, 'u' and four upper-case hex digits.
static UnicodeString& escape(const UnicodeString& s, UnicodeString& result) {
    int32_t pos = 0;
    while (pos < s.length()) {
        const UChar unit = s[pos];
        ++pos;
        if (unit == 0 || unit > (UChar)0x7F) {
            result += (UChar)0x5c;      // backslash
            result += (UChar)0x75;      // 'u'
            // four nibbles, most significant first
            for (int32_t shift = 12; shift >= 0; shift -= 4) {
                result += toHex((unit >> shift) & 0xF);
            }
        } else {
            result += unit;
        }
    }
    return result;
}
200
201 static char ASSERT_BUF[1024];
202
extractToAssertBuf(const UnicodeString & message)203 static const char* extractToAssertBuf(const UnicodeString& message) {
204 if(message.length()==0) {
205 strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
206 } else {
207 UnicodeString buf;
208 escape(message, buf);
209 if(buf.length()==0) {
210 strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
211 } else {
212 buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
213 if(ASSERT_BUF[0]==0) {
214 ASSERT_BUF[0]=0;
215 for(int32_t i=0;i<buf.length();i++) {
216 UChar ch = buf[i];
217 sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
218 }
219 }
220 }
221 }
222 ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
223 return ASSERT_BUF;
224 }
225
226
// Log a printable rendering (see utextToPrintable) of a UText, tagged with the
// file, line and the spelling of the argument expression.
#define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}

// If the variable 'status' (which must be in scope at the point of use) holds a
// failure code, report it with dataerrln() and RETURN from the enclosing
// function.  Because of the bare 'return;' this is usable only in void functions.
#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure. status=%s", \
    __FILE__, __LINE__, u_errorName(status)); return;}}

// Report an error with errln() if expr compares equal to FALSE.
// Does not return; the test continues after a failure.
#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}

// Evaluate expr and check that it sets 'status' to errcode.
// The macro declares its OWN local 'status', initialized to U_ZERO_ERROR, which
// hides any 'status' in the enclosing scope: expr writes to the local one, and
// the caller's variable is neither read nor modified.
#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
    if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \
    __LINE__, u_errorName(errcode), u_errorName(status));};}

// Like REGEX_CHECK_STATUS, but also reports a caller-supplied line number,
// prints the numeric status value, and does NOT return.
#define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
    "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }}

// Like REGEX_ASSERT, but also reports a caller-supplied line number and
// RETURNS from the enclosing (void) function on failure.
#define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
    errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}

// Compare ustr with inv using operator==; on mismatch report the rendering of
// ustr produced by extractToAssertBuf() together with inv.  inv is printed
// with %s, so it must be a char string.  Does not return.
#define REGEX_ASSERT_UNISTR(ustr,inv) {if (!(ustr==inv)) {errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s,%s) failed \n", __FILE__, __LINE__, extractToAssertBuf(ustr),inv);};}
245
246 /**
247 * @param expected expected text in UTF-8 (not platform) codepage
248 */
assertUText(const char * expected,UText * actual,const char * file,int line)249 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
250 UErrorCode status = U_ZERO_ERROR;
251 UText expectedText = UTEXT_INITIALIZER;
252 utext_openUTF8(&expectedText, expected, -1, &status);
253 if(U_FAILURE(status)) {
254 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
255 return;
256 }
257 if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
258 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
259 return;
260 }
261 utext_setNativeIndex(actual, 0);
262 if (utext_compare(&expectedText, -1, actual, -1) != 0) {
263 char buf[201 /*21*/];
264 char expectedBuf[201];
265 utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
266 utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
267 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
268 }
269 utext_close(&expectedText);
270 }
271 /**
272 * @param expected invariant (platform local text) input
273 */
274
assertUTextInvariant(const char * expected,UText * actual,const char * file,int line)275 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
276 UErrorCode status = U_ZERO_ERROR;
277 UText expectedText = UTEXT_INITIALIZER;
278 regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
279 if(U_FAILURE(status)) {
280 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
281 return;
282 }
283 utext_setNativeIndex(actual, 0);
284 if (utext_compare(&expectedText, -1, actual, -1) != 0) {
285 char buf[201 /*21*/];
286 char expectedBuf[201];
287 utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
288 utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
289 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
290 }
291 utext_close(&expectedText);
292 }
293
/**
 * Check that the UText 'actual' holds the text 'expected'; forwards to
 * assertUText() with the current file and line.
 * Assumes utf-8 input
 */
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
/**
 * Check that the UText 'actual' holds the text 'expected'; forwards to
 * assertUTextInvariant() with the current file and line.
 * Assumes Invariant input
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
302
303 /**
304 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
305 * passed into utext_openUTF8. An error will be given if
306 * INV_BUFSIZ is too small. It's only used on EBCDIC systems.
307 */
308
309 #define INV_BUFSIZ 2048 /* increase this if too small */
310
311 static int32_t inv_next=0;
312
313 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
314 static char inv_buf[INV_BUFSIZ];
315 #endif
316
regextst_openUTF8FromInvariant(UText * ut,const char * inv,int64_t length,UErrorCode * status)317 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
318 if(length==-1) length=strlen(inv);
319 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
320 inv_next+=length;
321 return utext_openUTF8(ut, inv, length, status);
322 #else
323 if(inv_next+length+1>INV_BUFSIZ) {
324 fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
325 __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
326 *status = U_MEMORY_ALLOCATION_ERROR;
327 return NULL;
328 }
329
330 unsigned char *buf = (unsigned char*)inv_buf+inv_next;
331 uprv_aestrncpy(buf, (const uint8_t*)inv, length);
332 inv_next+=length;
333
334 #if 0
335 fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
336 #endif
337
338 return utext_openUTF8(ut, (const char*)buf, length, status);
339 #endif
340 }
341
342
343 //---------------------------------------------------------------------------
344 //
345 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests
346 // for the LookingAt() and Match() functions.
347 //
348 // usage:
349 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
350 //
351 // The expected results are UBool - TRUE or FALSE.
352 // The input text is unescaped. The pattern is not.
353 //
354 //
355 //---------------------------------------------------------------------------
356
357 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
358
doRegexLMTest(const char * pat,const char * text,UBool looking,UBool match,int32_t line)359 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
360 const UnicodeString pattern(pat, -1, US_INV);
361 const UnicodeString inputText(text, -1, US_INV);
362 UErrorCode status = U_ZERO_ERROR;
363 UParseError pe;
364 RegexPattern *REPattern = NULL;
365 RegexMatcher *REMatcher = NULL;
366 UBool retVal = TRUE;
367
368 UnicodeString patString(pat, -1, US_INV);
369 REPattern = RegexPattern::compile(patString, 0, pe, status);
370 if (U_FAILURE(status)) {
371 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
372 line, u_errorName(status));
373 return FALSE;
374 }
375 if (line==376) { RegexPatternDump(REPattern);}
376
377 UnicodeString inputString(inputText);
378 UnicodeString unEscapedInput = inputString.unescape();
379 REMatcher = REPattern->matcher(unEscapedInput, status);
380 if (U_FAILURE(status)) {
381 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
382 line, u_errorName(status));
383 return FALSE;
384 }
385
386 UBool actualmatch;
387 actualmatch = REMatcher->lookingAt(status);
388 if (U_FAILURE(status)) {
389 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
390 line, u_errorName(status));
391 retVal = FALSE;
392 }
393 if (actualmatch != looking) {
394 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
395 retVal = FALSE;
396 }
397
398 status = U_ZERO_ERROR;
399 actualmatch = REMatcher->matches(status);
400 if (U_FAILURE(status)) {
401 errln("RegexTest failure in matches() at line %d. Status = %s\n",
402 line, u_errorName(status));
403 retVal = FALSE;
404 }
405 if (actualmatch != match) {
406 errln("RegexTest: wrong return from matches() at line %d.\n", line);
407 retVal = FALSE;
408 }
409
410 if (retVal == FALSE) {
411 RegexPatternDump(REPattern);
412 }
413
414 delete REPattern;
415 delete REMatcher;
416 return retVal;
417 }
418
419
doRegexLMTestUTF8(const char * pat,const char * text,UBool looking,UBool match,int32_t line)420 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
421 UText pattern = UTEXT_INITIALIZER;
422 int32_t inputUTF8Length;
423 char *textChars = NULL;
424 UText inputText = UTEXT_INITIALIZER;
425 UErrorCode status = U_ZERO_ERROR;
426 UParseError pe;
427 RegexPattern *REPattern = NULL;
428 RegexMatcher *REMatcher = NULL;
429 UBool retVal = TRUE;
430
431 regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
432 REPattern = RegexPattern::compile(&pattern, 0, pe, status);
433 if (U_FAILURE(status)) {
434 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
435 line, u_errorName(status));
436 return FALSE;
437 }
438
439 UnicodeString inputString(text, -1, US_INV);
440 UnicodeString unEscapedInput = inputString.unescape();
441 LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
442 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
443
444 inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
445 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
446 // UTF-8 does not allow unpaired surrogates, so this could actually happen
447 logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line, u_errorName(status));
448 return TRUE; // not a failure of the Regex engine
449 }
450 status = U_ZERO_ERROR; // buffer overflow
451 textChars = new char[inputUTF8Length+1];
452 unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
453 utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
454
455 REMatcher = &REPattern->matcher(status)->reset(&inputText);
456 if (U_FAILURE(status)) {
457 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
458 line, u_errorName(status));
459 return FALSE;
460 }
461
462 UBool actualmatch;
463 actualmatch = REMatcher->lookingAt(status);
464 if (U_FAILURE(status)) {
465 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
466 line, u_errorName(status));
467 retVal = FALSE;
468 }
469 if (actualmatch != looking) {
470 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
471 retVal = FALSE;
472 }
473
474 status = U_ZERO_ERROR;
475 actualmatch = REMatcher->matches(status);
476 if (U_FAILURE(status)) {
477 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
478 line, u_errorName(status));
479 retVal = FALSE;
480 }
481 if (actualmatch != match) {
482 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
483 retVal = FALSE;
484 }
485
486 if (retVal == FALSE) {
487 RegexPatternDump(REPattern);
488 }
489
490 delete REPattern;
491 delete REMatcher;
492 utext_close(&inputText);
493 utext_close(&pattern);
494 delete[] textChars;
495 return retVal;
496 }
497
498
499
500 //---------------------------------------------------------------------------
501 //
//  REGEX_ERR       Macro + invocation function to simplify writing
//                  regex tests for incorrect patterns
504 //
505 // usage:
506 // REGEX_ERR("pattern", expected error line, column, expected status);
507 //
508 //---------------------------------------------------------------------------
509 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
510
regex_err(const char * pat,int32_t errLine,int32_t errCol,UErrorCode expectedStatus,int32_t line)511 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
512 UErrorCode expectedStatus, int32_t line) {
513 UnicodeString pattern(pat);
514
515 UErrorCode status = U_ZERO_ERROR;
516 UParseError pe;
517 RegexPattern *callerPattern = NULL;
518
519 //
520 // Compile the caller's pattern
521 //
522 UnicodeString patString(pat);
523 callerPattern = RegexPattern::compile(patString, 0, pe, status);
524 if (status != expectedStatus) {
525 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
526 } else {
527 if (status != U_ZERO_ERROR) {
528 if (pe.line != errLine || pe.offset != errCol) {
529 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
530 line, errLine, errCol, pe.line, pe.offset);
531 }
532 }
533 }
534
535 delete callerPattern;
536
537 //
538 // Compile again, using a UTF-8-based UText
539 //
540 UText patternText = UTEXT_INITIALIZER;
541 regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
542 callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
543 if (status != expectedStatus) {
544 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
545 } else {
546 if (status != U_ZERO_ERROR) {
547 if (pe.line != errLine || pe.offset != errCol) {
548 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
549 line, errLine, errCol, pe.line, pe.offset);
550 }
551 }
552 }
553
554 delete callerPattern;
555 utext_close(&patternText);
556 }
557
558
559
560 //---------------------------------------------------------------------------
561 //
562 // Basic Check for basic functionality of regex pattern matching.
563 // Avoid the use of REGEX_FIND test macro, which has
564 // substantial dependencies on basic Regex functionality.
565 //
566 //---------------------------------------------------------------------------
void RegexTest::Basic() {
    // Each REGEX_TESTLM(pattern, input, lookingAtExpected, matchesExpected)
    // below runs the case through both doRegexLMTest() and doRegexLMTestUTF8()
    // and passes its own source line for use in failure messages.
    // The input string is unescaped before matching; the pattern is used as written.


//
// Debug - slide failing test cases early
//
#if 0
    {
        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
        UParseError pe;
        UErrorCode status = U_ZERO_ERROR;
        RegexPattern::compile("^(?:a?b?)*$", 0, pe, status);
        // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    }
    exit(1);
#endif


    //
    // Pattern with parentheses
    //
    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
    REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);

    //
    // Patterns with *
    //
    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);

    REGEX_TESTLM("a*", "", TRUE, TRUE);
    REGEX_TESTLM("a*", "b", TRUE, FALSE);


    //
    // Patterns with "."
    //
    REGEX_TESTLM(".", "abc", TRUE, FALSE);
    REGEX_TESTLM("...", "abc", TRUE, TRUE);
    REGEX_TESTLM("....", "abc", FALSE, FALSE);
    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);

    //
    // Patterns with * applied to chars at end of literal string
    //
    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);

    //
    // Supplemental chars match as single chars, not a pair of surrogates.
    //
    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);


    //
    // UnicodeSets in the pattern
    //
    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);

    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurrences.
    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);

    //
    // OR operator in patterns
    //
    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
    REGEX_TESTLM("a|b", "b", TRUE, TRUE);

    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);

    //
    // +
    //
    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
    REGEX_TESTLM("b+", "", FALSE, FALSE);
    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);

    //
    // ?
    //
    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);

    //
    // Escape sequences that become single literal chars, handled internally
    // by ICU's Unescape.
    //

    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);

    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input

    // Escape of special chars in patterns
    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
}
708
709
710 //---------------------------------------------------------------------------
711 //
712 // UTextBasic Check for quirks that are specific to the UText
713 // implementation.
714 //
715 //---------------------------------------------------------------------------
UTextBasic()716 void RegexTest::UTextBasic() {
717 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
718 UErrorCode status = U_ZERO_ERROR;
719 UText pattern = UTEXT_INITIALIZER;
720 utext_openUTF8(&pattern, str_abc, -1, &status);
721 RegexMatcher matcher(&pattern, 0, status);
722 REGEX_CHECK_STATUS;
723
724 UText input = UTEXT_INITIALIZER;
725 utext_openUTF8(&input, str_abc, -1, &status);
726 REGEX_CHECK_STATUS;
727 matcher.reset(&input);
728 REGEX_CHECK_STATUS;
729 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
730
731 matcher.reset(matcher.inputText());
732 REGEX_CHECK_STATUS;
733 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
734
735 utext_close(&pattern);
736 utext_close(&input);
737 }
738
739
740 //---------------------------------------------------------------------------
741 //
742 // API_Match Test that the API for class RegexMatcher
743 // is present and nominally working, but excluding functions
744 // implementing replace operations.
745 //
746 //---------------------------------------------------------------------------
API_Match()747 void RegexTest::API_Match() {
748 UParseError pe;
749 UErrorCode status=U_ZERO_ERROR;
750 int32_t flags = 0;
751
752 //
753 // Debug - slide failing test cases early
754 //
755 #if 0
756 {
757 }
758 return;
759 #endif
760
761 //
762 // Simple pattern compilation
763 //
764 {
765 UnicodeString re("abc");
766 RegexPattern *pat2;
767 pat2 = RegexPattern::compile(re, flags, pe, status);
768 REGEX_CHECK_STATUS;
769
770 UnicodeString inStr1 = "abcdef this is a test";
771 UnicodeString instr2 = "not abc";
772 UnicodeString empty = "";
773
774
775 //
776 // Matcher creation and reset.
777 //
778 RegexMatcher *m1 = pat2->matcher(inStr1, status);
779 REGEX_CHECK_STATUS;
780 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
781 REGEX_ASSERT(m1->input() == inStr1);
782 m1->reset(instr2);
783 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
784 REGEX_ASSERT(m1->input() == instr2);
785 m1->reset(inStr1);
786 REGEX_ASSERT(m1->input() == inStr1);
787 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
788 m1->reset(empty);
789 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
790 REGEX_ASSERT(m1->input() == empty);
791 REGEX_ASSERT(&m1->pattern() == pat2);
792
793 //
794 // reset(pos, status)
795 //
796 m1->reset(inStr1);
797 m1->reset(4, status);
798 REGEX_CHECK_STATUS;
799 REGEX_ASSERT(m1->input() == inStr1);
800 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
801
802 m1->reset(-1, status);
803 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
804 status = U_ZERO_ERROR;
805
806 m1->reset(0, status);
807 REGEX_CHECK_STATUS;
808 status = U_ZERO_ERROR;
809
810 int32_t len = m1->input().length();
811 m1->reset(len-1, status);
812 REGEX_CHECK_STATUS;
813 status = U_ZERO_ERROR;
814
815 m1->reset(len, status);
816 REGEX_CHECK_STATUS;
817 status = U_ZERO_ERROR;
818
819 m1->reset(len+1, status);
820 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
821 status = U_ZERO_ERROR;
822
823 //
824 // match(pos, status)
825 //
826 m1->reset(instr2);
827 REGEX_ASSERT(m1->matches(4, status) == TRUE);
828 m1->reset();
829 REGEX_ASSERT(m1->matches(3, status) == FALSE);
830 m1->reset();
831 REGEX_ASSERT(m1->matches(5, status) == FALSE);
832 REGEX_ASSERT(m1->matches(4, status) == TRUE);
833 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
834 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
835
836 // Match() at end of string should fail, but should not
837 // be an error.
838 status = U_ZERO_ERROR;
839 len = m1->input().length();
840 REGEX_ASSERT(m1->matches(len, status) == FALSE);
841 REGEX_CHECK_STATUS;
842
843 // Match beyond end of string should fail with an error.
844 status = U_ZERO_ERROR;
845 REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
846 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
847
848 // Successful match at end of string.
849 {
850 status = U_ZERO_ERROR;
851 RegexMatcher m("A?", 0, status); // will match zero length string.
852 REGEX_CHECK_STATUS;
853 m.reset(inStr1);
854 len = inStr1.length();
855 REGEX_ASSERT(m.matches(len, status) == TRUE);
856 REGEX_CHECK_STATUS;
857 m.reset(empty);
858 REGEX_ASSERT(m.matches(0, status) == TRUE);
859 REGEX_CHECK_STATUS;
860 }
861
862
863 //
864 // lookingAt(pos, status)
865 //
866 status = U_ZERO_ERROR;
867 m1->reset(instr2); // "not abc"
868 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
869 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
870 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
871 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
872 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
873 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
874 status = U_ZERO_ERROR;
875 len = m1->input().length();
876 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
877 REGEX_CHECK_STATUS;
878 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
879 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
880
881 delete m1;
882 delete pat2;
883 }
884
885
886 //
887 // Capture Group.
888 // RegexMatcher::start();
889 // RegexMatcher::end();
890 // RegexMatcher::groupCount();
891 //
892 {
893 int32_t flags=0;
894 UParseError pe;
895 UErrorCode status=U_ZERO_ERROR;
896
897 UnicodeString re("01(23(45)67)(.*)");
898 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
899 REGEX_CHECK_STATUS;
900 UnicodeString data = "0123456789";
901
902 RegexMatcher *matcher = pat->matcher(data, status);
903 REGEX_CHECK_STATUS;
904 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
905 static const int32_t matchStarts[] = {0, 2, 4, 8};
906 static const int32_t matchEnds[] = {10, 8, 6, 10};
907 int32_t i;
908 for (i=0; i<4; i++) {
909 int32_t actualStart = matcher->start(i, status);
910 REGEX_CHECK_STATUS;
911 if (actualStart != matchStarts[i]) {
912 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
913 __LINE__, i, matchStarts[i], actualStart);
914 }
915 int32_t actualEnd = matcher->end(i, status);
916 REGEX_CHECK_STATUS;
917 if (actualEnd != matchEnds[i]) {
918 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
919 __LINE__, i, matchEnds[i], actualEnd);
920 }
921 }
922
923 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
924 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
925
926 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
927 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
928 matcher->reset();
929 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
930
931 matcher->lookingAt(status);
932 REGEX_ASSERT(matcher->group(status) == "0123456789");
933 REGEX_ASSERT(matcher->group(0, status) == "0123456789");
934 REGEX_ASSERT(matcher->group(1, status) == "234567" );
935 REGEX_ASSERT(matcher->group(2, status) == "45" );
936 REGEX_ASSERT(matcher->group(3, status) == "89" );
937 REGEX_CHECK_STATUS;
938 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
939 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
940 matcher->reset();
941 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
942
943 delete matcher;
944 delete pat;
945
946 }
947
948 //
949 // find
950 //
951 {
952 int32_t flags=0;
953 UParseError pe;
954 UErrorCode status=U_ZERO_ERROR;
955
956 UnicodeString re("abc");
957 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
958 REGEX_CHECK_STATUS;
959 UnicodeString data = ".abc..abc...abc..";
960 // 012345678901234567
961
962 RegexMatcher *matcher = pat->matcher(data, status);
963 REGEX_CHECK_STATUS;
964 REGEX_ASSERT(matcher->find());
965 REGEX_ASSERT(matcher->start(status) == 1);
966 REGEX_ASSERT(matcher->find());
967 REGEX_ASSERT(matcher->start(status) == 6);
968 REGEX_ASSERT(matcher->find());
969 REGEX_ASSERT(matcher->start(status) == 12);
970 REGEX_ASSERT(matcher->find() == FALSE);
971 REGEX_ASSERT(matcher->find() == FALSE);
972
973 matcher->reset();
974 REGEX_ASSERT(matcher->find());
975 REGEX_ASSERT(matcher->start(status) == 1);
976
977 REGEX_ASSERT(matcher->find(0, status));
978 REGEX_ASSERT(matcher->start(status) == 1);
979 REGEX_ASSERT(matcher->find(1, status));
980 REGEX_ASSERT(matcher->start(status) == 1);
981 REGEX_ASSERT(matcher->find(2, status));
982 REGEX_ASSERT(matcher->start(status) == 6);
983 REGEX_ASSERT(matcher->find(12, status));
984 REGEX_ASSERT(matcher->start(status) == 12);
985 REGEX_ASSERT(matcher->find(13, status) == FALSE);
986 REGEX_ASSERT(matcher->find(16, status) == FALSE);
987 REGEX_ASSERT(matcher->find(17, status) == FALSE);
988 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
989
990 status = U_ZERO_ERROR;
991 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
992 status = U_ZERO_ERROR;
993 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
994
995 REGEX_ASSERT(matcher->groupCount() == 0);
996
997 delete matcher;
998 delete pat;
999 }
1000
1001
1002 //
1003 // find, with \G in pattern (true if at the end of a previous match).
1004 //
1005 {
1006 int32_t flags=0;
1007 UParseError pe;
1008 UErrorCode status=U_ZERO_ERROR;
1009
1010 UnicodeString re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
1011 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1012 REGEX_CHECK_STATUS;
1013 UnicodeString data = ".abcabc.abc..";
1014 // 012345678901234567
1015
1016 RegexMatcher *matcher = pat->matcher(data, status);
1017 REGEX_CHECK_STATUS;
1018 REGEX_ASSERT(matcher->find());
1019 REGEX_ASSERT(matcher->start(status) == 0);
1020 REGEX_ASSERT(matcher->start(1, status) == -1);
1021 REGEX_ASSERT(matcher->start(2, status) == 1);
1022
1023 REGEX_ASSERT(matcher->find());
1024 REGEX_ASSERT(matcher->start(status) == 4);
1025 REGEX_ASSERT(matcher->start(1, status) == 4);
1026 REGEX_ASSERT(matcher->start(2, status) == -1);
1027 REGEX_CHECK_STATUS;
1028
1029 delete matcher;
1030 delete pat;
1031 }
1032
1033 //
1034 // find with zero length matches, match position should bump ahead
1035 // to prevent loops.
1036 //
1037 {
1038 int32_t i;
1039 UErrorCode status=U_ZERO_ERROR;
1040 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
1041 // using an always-true look-ahead.
1042 REGEX_CHECK_STATUS;
1043 UnicodeString s(" ");
1044 m.reset(s);
1045 for (i=0; ; i++) {
1046 if (m.find() == FALSE) {
1047 break;
1048 }
1049 REGEX_ASSERT(m.start(status) == i);
1050 REGEX_ASSERT(m.end(status) == i);
1051 }
1052 REGEX_ASSERT(i==5);
1053
1054 // Check that the bump goes over surrogate pairs OK
1055 s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1056 s = s.unescape();
1057 m.reset(s);
1058 for (i=0; ; i+=2) {
1059 if (m.find() == FALSE) {
1060 break;
1061 }
1062 REGEX_ASSERT(m.start(status) == i);
1063 REGEX_ASSERT(m.end(status) == i);
1064 }
1065 REGEX_ASSERT(i==10);
1066 }
1067 {
1068 // find() loop breaking test.
1069 // with pattern of /.?/, should see a series of one char matches, then a single
1070 // match of zero length at the end of the input string.
1071 int32_t i;
1072 UErrorCode status=U_ZERO_ERROR;
1073 RegexMatcher m(".?", 0, status);
1074 REGEX_CHECK_STATUS;
1075 UnicodeString s(" ");
1076 m.reset(s);
1077 for (i=0; ; i++) {
1078 if (m.find() == FALSE) {
1079 break;
1080 }
1081 REGEX_ASSERT(m.start(status) == i);
1082 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1083 }
1084 REGEX_ASSERT(i==5);
1085 }
1086
1087
1088 //
1089 // Matchers with no input string behave as if they had an empty input string.
1090 //
1091
1092 {
1093 UErrorCode status = U_ZERO_ERROR;
1094 RegexMatcher m(".?", 0, status);
1095 REGEX_CHECK_STATUS;
1096 REGEX_ASSERT(m.find());
1097 REGEX_ASSERT(m.start(status) == 0);
1098 REGEX_ASSERT(m.input() == "");
1099 }
1100 {
1101 UErrorCode status = U_ZERO_ERROR;
1102 RegexPattern *p = RegexPattern::compile(".", 0, status);
1103 RegexMatcher *m = p->matcher(status);
1104 REGEX_CHECK_STATUS;
1105
1106 REGEX_ASSERT(m->find() == FALSE);
1107 REGEX_ASSERT(m->input() == "");
1108 delete m;
1109 delete p;
1110 }
1111
1112 //
1113 // Regions
1114 //
1115 {
1116 UErrorCode status = U_ZERO_ERROR;
1117 UnicodeString testString("This is test data");
1118 RegexMatcher m(".*", testString, 0, status);
1119 REGEX_CHECK_STATUS;
1120 REGEX_ASSERT(m.regionStart() == 0);
1121 REGEX_ASSERT(m.regionEnd() == testString.length());
1122 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1123 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1124
1125 m.region(2,4, status);
1126 REGEX_CHECK_STATUS;
1127 REGEX_ASSERT(m.matches(status));
1128 REGEX_ASSERT(m.start(status)==2);
1129 REGEX_ASSERT(m.end(status)==4);
1130 REGEX_CHECK_STATUS;
1131
1132 m.reset();
1133 REGEX_ASSERT(m.regionStart() == 0);
1134 REGEX_ASSERT(m.regionEnd() == testString.length());
1135
1136 UnicodeString shorterString("short");
1137 m.reset(shorterString);
1138 REGEX_ASSERT(m.regionStart() == 0);
1139 REGEX_ASSERT(m.regionEnd() == shorterString.length());
1140
1141 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1142 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1143 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1144 REGEX_ASSERT(&m == &m.reset());
1145 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1146
1147 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1148 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1149 REGEX_ASSERT(&m == &m.reset());
1150 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1151
1152 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1153 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1154 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1155 REGEX_ASSERT(&m == &m.reset());
1156 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1157
1158 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1159 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1160 REGEX_ASSERT(&m == &m.reset());
1161 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1162
1163 }
1164
1165 //
1166 // hitEnd() and requireEnd()
1167 //
1168 {
1169 UErrorCode status = U_ZERO_ERROR;
1170 UnicodeString testString("aabb");
1171 RegexMatcher m1(".*", testString, 0, status);
1172 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1173 REGEX_ASSERT(m1.hitEnd() == TRUE);
1174 REGEX_ASSERT(m1.requireEnd() == FALSE);
1175 REGEX_CHECK_STATUS;
1176
1177 status = U_ZERO_ERROR;
1178 RegexMatcher m2("a*", testString, 0, status);
1179 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1180 REGEX_ASSERT(m2.hitEnd() == FALSE);
1181 REGEX_ASSERT(m2.requireEnd() == FALSE);
1182 REGEX_CHECK_STATUS;
1183
1184 status = U_ZERO_ERROR;
1185 RegexMatcher m3(".*$", testString, 0, status);
1186 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1187 REGEX_ASSERT(m3.hitEnd() == TRUE);
1188 REGEX_ASSERT(m3.requireEnd() == TRUE);
1189 REGEX_CHECK_STATUS;
1190 }
1191
1192
1193 //
1194 // Compilation error on reset with UChar *
1195 // These were a hazard that people were stumbling over with runtime errors.
1196 // Changed them to compiler errors by adding private methods that more closely
1197 // matched the incorrect use of the functions.
1198 //
1199 #if 0
1200 {
1201 UErrorCode status = U_ZERO_ERROR;
1202 UChar ucharString[20];
1203 RegexMatcher m(".", 0, status);
1204 m.reset(ucharString); // should not compile.
1205
1206 RegexPattern *p = RegexPattern::compile(".", 0, status);
1207 RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile.
1208
1209 RegexMatcher m3(".", ucharString, 0, status); // Should not compile
1210 }
1211 #endif
1212
1213 //
1214 // Time Outs.
1215 // Note: These tests will need to be changed when the regexp engine is
1216 // able to detect and cut short the exponential time behavior on
1217 // this type of match.
1218 //
1219 {
1220 UErrorCode status = U_ZERO_ERROR;
1221 // Enough 'a's in the string to cause the match to time out.
1222 // (Each on additonal 'a' doubles the time)
1223 UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1224 RegexMatcher matcher("(a+)+b", testString, 0, status);
1225 REGEX_CHECK_STATUS;
1226 REGEX_ASSERT(matcher.getTimeLimit() == 0);
1227 matcher.setTimeLimit(100, status);
1228 REGEX_ASSERT(matcher.getTimeLimit() == 100);
1229 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1230 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1231 }
1232 {
1233 UErrorCode status = U_ZERO_ERROR;
1234 // Few enough 'a's to slip in under the time limit.
1235 UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1236 RegexMatcher matcher("(a+)+b", testString, 0, status);
1237 REGEX_CHECK_STATUS;
1238 matcher.setTimeLimit(100, status);
1239 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1240 REGEX_CHECK_STATUS;
1241 }
1242
1243 //
1244 // Stack Limits
1245 //
1246 {
1247 UErrorCode status = U_ZERO_ERROR;
1248 UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
1249
1250 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1251 // of the '+', and makes the stack frames larger.
1252 RegexMatcher matcher("(A)+A$", testString, 0, status);
1253
1254 // With the default stack, this match should fail to run
1255 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1256 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1257
1258 // With unlimited stack, it should run
1259 status = U_ZERO_ERROR;
1260 matcher.setStackLimit(0, status);
1261 REGEX_CHECK_STATUS;
1262 REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1263 REGEX_CHECK_STATUS;
1264 REGEX_ASSERT(matcher.getStackLimit() == 0);
1265
1266 // With a limited stack, it the match should fail
1267 status = U_ZERO_ERROR;
1268 matcher.setStackLimit(10000, status);
1269 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1270 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1271 REGEX_ASSERT(matcher.getStackLimit() == 10000);
1272 }
1273
1274 // A pattern that doesn't save state should work with
1275 // a minimal sized stack
1276 {
1277 UErrorCode status = U_ZERO_ERROR;
1278 UnicodeString testString = "abc";
1279 RegexMatcher matcher("abc", testString, 0, status);
1280 REGEX_CHECK_STATUS;
1281 matcher.setStackLimit(30, status);
1282 REGEX_CHECK_STATUS;
1283 REGEX_ASSERT(matcher.matches(status) == TRUE);
1284 REGEX_CHECK_STATUS;
1285 REGEX_ASSERT(matcher.getStackLimit() == 30);
1286
1287 // Negative stack sizes should fail
1288 status = U_ZERO_ERROR;
1289 matcher.setStackLimit(1000, status);
1290 REGEX_CHECK_STATUS;
1291 matcher.setStackLimit(-1, status);
1292 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1293 REGEX_ASSERT(matcher.getStackLimit() == 1000);
1294 }
1295
1296
1297 }
1298
1299
1300
1301
1302
1303
1304 //---------------------------------------------------------------------------
1305 //
1306 // API_Replace API test for class RegexMatcher, testing the
1307 // Replace family of functions.
1308 //
1309 //---------------------------------------------------------------------------
API_Replace()1310 void RegexTest::API_Replace() {
1311 //
1312 // Replace
1313 //
1314 int32_t flags=0;
1315 UParseError pe;
1316 UErrorCode status=U_ZERO_ERROR;
1317
1318 UnicodeString re("abc");
1319 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1320 REGEX_CHECK_STATUS;
1321 UnicodeString data = ".abc..abc...abc..";
1322 // 012345678901234567
1323 RegexMatcher *matcher = pat->matcher(data, status);
1324
1325 //
1326 // Plain vanilla matches.
1327 //
1328 UnicodeString dest;
1329 dest = matcher->replaceFirst("yz", status);
1330 REGEX_CHECK_STATUS;
1331 REGEX_ASSERT(dest == ".yz..abc...abc..");
1332
1333 dest = matcher->replaceAll("yz", status);
1334 REGEX_CHECK_STATUS;
1335 REGEX_ASSERT(dest == ".yz..yz...yz..");
1336
1337 //
1338 // Plain vanilla non-matches.
1339 //
1340 UnicodeString d2 = ".abx..abx...abx..";
1341 matcher->reset(d2);
1342 dest = matcher->replaceFirst("yz", status);
1343 REGEX_CHECK_STATUS;
1344 REGEX_ASSERT(dest == ".abx..abx...abx..");
1345
1346 dest = matcher->replaceAll("yz", status);
1347 REGEX_CHECK_STATUS;
1348 REGEX_ASSERT(dest == ".abx..abx...abx..");
1349
1350 //
1351 // Empty source string
1352 //
1353 UnicodeString d3 = "";
1354 matcher->reset(d3);
1355 dest = matcher->replaceFirst("yz", status);
1356 REGEX_CHECK_STATUS;
1357 REGEX_ASSERT(dest == "");
1358
1359 dest = matcher->replaceAll("yz", status);
1360 REGEX_CHECK_STATUS;
1361 REGEX_ASSERT(dest == "");
1362
1363 //
1364 // Empty substitution string
1365 //
1366 matcher->reset(data); // ".abc..abc...abc.."
1367 dest = matcher->replaceFirst("", status);
1368 REGEX_CHECK_STATUS;
1369 REGEX_ASSERT(dest == "...abc...abc..");
1370
1371 dest = matcher->replaceAll("", status);
1372 REGEX_CHECK_STATUS;
1373 REGEX_ASSERT(dest == "........");
1374
1375 //
1376 // match whole string
1377 //
1378 UnicodeString d4 = "abc";
1379 matcher->reset(d4);
1380 dest = matcher->replaceFirst("xyz", status);
1381 REGEX_CHECK_STATUS;
1382 REGEX_ASSERT(dest == "xyz");
1383
1384 dest = matcher->replaceAll("xyz", status);
1385 REGEX_CHECK_STATUS;
1386 REGEX_ASSERT(dest == "xyz");
1387
1388 //
1389 // Capture Group, simple case
1390 //
1391 UnicodeString re2("a(..)");
1392 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1393 REGEX_CHECK_STATUS;
1394 UnicodeString d5 = "abcdefg";
1395 RegexMatcher *matcher2 = pat2->matcher(d5, status);
1396 REGEX_CHECK_STATUS;
1397 dest = matcher2->replaceFirst("$1$1", status);
1398 REGEX_CHECK_STATUS;
1399 REGEX_ASSERT(dest == "bcbcdefg");
1400
1401 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1402 REGEX_CHECK_STATUS;
1403 REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1404
1405 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1406 REGEX_CHECK_STATUS;
1407 REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
1408
1409 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1410 replacement = replacement.unescape();
1411 dest = matcher2->replaceFirst(replacement, status);
1412 REGEX_CHECK_STATUS;
1413 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1414
1415 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1416
1417
1418 //
1419 // Replacement String with \u hex escapes
1420 //
1421 {
1422 UnicodeString src = "abc 1 abc 2 abc 3";
1423 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1424 matcher->reset(src);
1425 UnicodeString result = matcher->replaceAll(substitute, status);
1426 REGEX_CHECK_STATUS;
1427 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1428 }
1429 {
1430 UnicodeString src = "abc !";
1431 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1432 matcher->reset(src);
1433 UnicodeString result = matcher->replaceAll(substitute, status);
1434 REGEX_CHECK_STATUS;
1435 UnicodeString expected = UnicodeString("--");
1436 expected.append((UChar32)0x10000);
1437 expected.append("-- !");
1438 REGEX_ASSERT(result == expected);
1439 }
1440 // TODO: need more through testing of capture substitutions.
1441
1442 // Bug 4057
1443 //
1444 {
1445 status = U_ZERO_ERROR;
1446 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1447 RegexMatcher m("ss(.*?)ee", 0, status);
1448 REGEX_CHECK_STATUS;
1449 UnicodeString result;
1450
1451 // Multiple finds do NOT bump up the previous appendReplacement postion.
1452 m.reset(s);
1453 m.find();
1454 m.find();
1455 m.appendReplacement(result, "ooh", status);
1456 REGEX_CHECK_STATUS;
1457 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1458
1459 // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1460 status = U_ZERO_ERROR;
1461 result.truncate(0);
1462 m.reset(10, status);
1463 m.find();
1464 m.find();
1465 m.appendReplacement(result, "ooh", status);
1466 REGEX_CHECK_STATUS;
1467 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1468
1469 // find() at interior of string, appendReplacemnt still starts at beginning.
1470 status = U_ZERO_ERROR;
1471 result.truncate(0);
1472 m.reset();
1473 m.find(10, status);
1474 m.find();
1475 m.appendReplacement(result, "ooh", status);
1476 REGEX_CHECK_STATUS;
1477 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1478
1479 m.appendTail(result);
1480 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1481
1482 }
1483
1484 delete matcher2;
1485 delete pat2;
1486 delete matcher;
1487 delete pat;
1488 }
1489
1490
1491 //---------------------------------------------------------------------------
1492 //
1493 // API_Pattern Test that the API for class RegexPattern is
1494 // present and nominally working.
1495 //
1496 //---------------------------------------------------------------------------
API_Pattern()1497 void RegexTest::API_Pattern() {
1498 RegexPattern pata; // Test default constructor to not crash.
1499 RegexPattern patb;
1500
1501 REGEX_ASSERT(pata == patb);
1502 REGEX_ASSERT(pata == pata);
1503
1504 UnicodeString re1("abc[a-l][m-z]");
1505 UnicodeString re2("def");
1506 UErrorCode status = U_ZERO_ERROR;
1507 UParseError pe;
1508
1509 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status);
1510 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status);
1511 REGEX_CHECK_STATUS;
1512 REGEX_ASSERT(*pat1 == *pat1);
1513 REGEX_ASSERT(*pat1 != pata);
1514
1515 // Assign
1516 patb = *pat1;
1517 REGEX_ASSERT(patb == *pat1);
1518
1519 // Copy Construct
1520 RegexPattern patc(*pat1);
1521 REGEX_ASSERT(patc == *pat1);
1522 REGEX_ASSERT(patb == patc);
1523 REGEX_ASSERT(pat1 != pat2);
1524 patb = *pat2;
1525 REGEX_ASSERT(patb != patc);
1526 REGEX_ASSERT(patb == *pat2);
1527
1528 // Compile with no flags.
1529 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status);
1530 REGEX_ASSERT(*pat1a == *pat1);
1531
1532 REGEX_ASSERT(pat1a->flags() == 0);
1533
1534 // Compile with different flags should be not equal
1535 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1536 REGEX_CHECK_STATUS;
1537
1538 REGEX_ASSERT(*pat1b != *pat1a);
1539 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1540 REGEX_ASSERT(pat1a->flags() == 0);
1541 delete pat1b;
1542
1543 // clone
1544 RegexPattern *pat1c = pat1->clone();
1545 REGEX_ASSERT(*pat1c == *pat1);
1546 REGEX_ASSERT(*pat1c != *pat2);
1547
1548 delete pat1c;
1549 delete pat1a;
1550 delete pat1;
1551 delete pat2;
1552
1553
1554 //
1555 // Verify that a matcher created from a cloned pattern works.
1556 // (Jitterbug 3423)
1557 //
1558 {
1559 UErrorCode status = U_ZERO_ERROR;
1560 RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1561 RegexPattern *pClone = pSource->clone();
1562 delete pSource;
1563 RegexMatcher *mFromClone = pClone->matcher(status);
1564 REGEX_CHECK_STATUS;
1565 UnicodeString s = "Hello World";
1566 mFromClone->reset(s);
1567 REGEX_ASSERT(mFromClone->find() == TRUE);
1568 REGEX_ASSERT(mFromClone->group(status) == "Hello");
1569 REGEX_ASSERT(mFromClone->find() == TRUE);
1570 REGEX_ASSERT(mFromClone->group(status) == "World");
1571 REGEX_ASSERT(mFromClone->find() == FALSE);
1572 delete mFromClone;
1573 delete pClone;
1574 }
1575
1576 //
1577 // matches convenience API
1578 //
1579 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1580 REGEX_CHECK_STATUS;
1581 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1582 REGEX_CHECK_STATUS;
1583 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1584 REGEX_CHECK_STATUS;
1585 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1586 REGEX_CHECK_STATUS;
1587 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1588 REGEX_CHECK_STATUS;
1589 status = U_INDEX_OUTOFBOUNDS_ERROR;
1590 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1591 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1592
1593
1594 //
1595 // Split()
1596 //
1597 status = U_ZERO_ERROR;
1598 pat1 = RegexPattern::compile(" +", pe, status);
1599 REGEX_CHECK_STATUS;
1600 UnicodeString fields[10];
1601
1602 int32_t n;
1603 n = pat1->split("Now is the time", fields, 10, status);
1604 REGEX_CHECK_STATUS;
1605 REGEX_ASSERT(n==4);
1606 REGEX_ASSERT(fields[0]=="Now");
1607 REGEX_ASSERT(fields[1]=="is");
1608 REGEX_ASSERT(fields[2]=="the");
1609 REGEX_ASSERT(fields[3]=="time");
1610 REGEX_ASSERT(fields[4]=="");
1611
1612 n = pat1->split("Now is the time", fields, 2, status);
1613 REGEX_CHECK_STATUS;
1614 REGEX_ASSERT(n==2);
1615 REGEX_ASSERT(fields[0]=="Now");
1616 REGEX_ASSERT(fields[1]=="is the time");
1617 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
1618
1619 fields[1] = "*";
1620 status = U_ZERO_ERROR;
1621 n = pat1->split("Now is the time", fields, 1, status);
1622 REGEX_CHECK_STATUS;
1623 REGEX_ASSERT(n==1);
1624 REGEX_ASSERT(fields[0]=="Now is the time");
1625 REGEX_ASSERT(fields[1]=="*");
1626 status = U_ZERO_ERROR;
1627
1628 n = pat1->split(" Now is the time ", fields, 10, status);
1629 REGEX_CHECK_STATUS;
1630 REGEX_ASSERT(n==6);
1631 REGEX_ASSERT(fields[0]=="");
1632 REGEX_ASSERT(fields[1]=="Now");
1633 REGEX_ASSERT(fields[2]=="is");
1634 REGEX_ASSERT(fields[3]=="the");
1635 REGEX_ASSERT(fields[4]=="time");
1636 REGEX_ASSERT(fields[5]=="");
1637
1638 n = pat1->split(" ", fields, 10, status);
1639 REGEX_CHECK_STATUS;
1640 REGEX_ASSERT(n==2);
1641 REGEX_ASSERT(fields[0]=="");
1642 REGEX_ASSERT(fields[1]=="");
1643
1644 fields[0] = "foo";
1645 n = pat1->split("", fields, 10, status);
1646 REGEX_CHECK_STATUS;
1647 REGEX_ASSERT(n==0);
1648 REGEX_ASSERT(fields[0]=="foo");
1649
1650 delete pat1;
1651
1652 // split, with a pattern with (capture)
1653 pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status);
1654 REGEX_CHECK_STATUS;
1655
1656 status = U_ZERO_ERROR;
1657 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1658 REGEX_CHECK_STATUS;
1659 REGEX_ASSERT(n==7);
1660 REGEX_ASSERT(fields[0]=="");
1661 REGEX_ASSERT(fields[1]=="a");
1662 REGEX_ASSERT(fields[2]=="Now is ");
1663 REGEX_ASSERT(fields[3]=="b");
1664 REGEX_ASSERT(fields[4]=="the time");
1665 REGEX_ASSERT(fields[5]=="c");
1666 REGEX_ASSERT(fields[6]=="");
1667 REGEX_ASSERT(status==U_ZERO_ERROR);
1668
1669 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
1670 REGEX_CHECK_STATUS;
1671 REGEX_ASSERT(n==7);
1672 REGEX_ASSERT(fields[0]==" ");
1673 REGEX_ASSERT(fields[1]=="a");
1674 REGEX_ASSERT(fields[2]=="Now is ");
1675 REGEX_ASSERT(fields[3]=="b");
1676 REGEX_ASSERT(fields[4]=="the time");
1677 REGEX_ASSERT(fields[5]=="c");
1678 REGEX_ASSERT(fields[6]=="");
1679
1680 status = U_ZERO_ERROR;
1681 fields[6] = "foo";
1682 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
1683 REGEX_CHECK_STATUS;
1684 REGEX_ASSERT(n==6);
1685 REGEX_ASSERT(fields[0]==" ");
1686 REGEX_ASSERT(fields[1]=="a");
1687 REGEX_ASSERT(fields[2]=="Now is ");
1688 REGEX_ASSERT(fields[3]=="b");
1689 REGEX_ASSERT(fields[4]=="the time");
1690 REGEX_ASSERT(fields[5]==""); // All text following "<c>" field delimiter.
1691 REGEX_ASSERT(fields[6]=="foo");
1692
1693 status = U_ZERO_ERROR;
1694 fields[5] = "foo";
1695 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
1696 REGEX_CHECK_STATUS;
1697 REGEX_ASSERT(n==5);
1698 REGEX_ASSERT(fields[0]==" ");
1699 REGEX_ASSERT(fields[1]=="a");
1700 REGEX_ASSERT(fields[2]=="Now is ");
1701 REGEX_ASSERT(fields[3]=="b");
1702 REGEX_ASSERT(fields[4]=="the time<c>");
1703 REGEX_ASSERT(fields[5]=="foo");
1704
1705 status = U_ZERO_ERROR;
1706 fields[5] = "foo";
1707 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
1708 REGEX_CHECK_STATUS;
1709 REGEX_ASSERT(n==5);
1710 REGEX_ASSERT(fields[0]==" ");
1711 REGEX_ASSERT(fields[1]=="a");
1712 REGEX_ASSERT(fields[2]=="Now is ");
1713 REGEX_ASSERT(fields[3]=="b");
1714 REGEX_ASSERT(fields[4]=="the time");
1715 REGEX_ASSERT(fields[5]=="foo");
1716
1717 status = U_ZERO_ERROR;
1718 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
1719 REGEX_CHECK_STATUS;
1720 REGEX_ASSERT(n==4);
1721 REGEX_ASSERT(fields[0]==" ");
1722 REGEX_ASSERT(fields[1]=="a");
1723 REGEX_ASSERT(fields[2]=="Now is ");
1724 REGEX_ASSERT(fields[3]=="the time<c>");
1725 status = U_ZERO_ERROR;
1726 delete pat1;
1727
1728 pat1 = RegexPattern::compile("([-,])", pe, status);
1729 REGEX_CHECK_STATUS;
1730 n = pat1->split("1-10,20", fields, 10, status);
1731 REGEX_CHECK_STATUS;
1732 REGEX_ASSERT(n==5);
1733 REGEX_ASSERT(fields[0]=="1");
1734 REGEX_ASSERT(fields[1]=="-");
1735 REGEX_ASSERT(fields[2]=="10");
1736 REGEX_ASSERT(fields[3]==",");
1737 REGEX_ASSERT(fields[4]=="20");
1738 delete pat1;
1739
1740 // Test split of string with empty trailing fields
1741 pat1 = RegexPattern::compile(",", pe, status);
1742 REGEX_CHECK_STATUS;
1743 n = pat1->split("a,b,c,", fields, 10, status);
1744 REGEX_CHECK_STATUS;
1745 REGEX_ASSERT(n==4);
1746 REGEX_ASSERT(fields[0]=="a");
1747 REGEX_ASSERT(fields[1]=="b");
1748 REGEX_ASSERT(fields[2]=="c");
1749 REGEX_ASSERT(fields[3]=="");
1750
1751 n = pat1->split("a,,,", fields, 10, status);
1752 REGEX_CHECK_STATUS;
1753 REGEX_ASSERT(n==4);
1754 REGEX_ASSERT(fields[0]=="a");
1755 REGEX_ASSERT(fields[1]=="");
1756 REGEX_ASSERT(fields[2]=="");
1757 REGEX_ASSERT(fields[3]=="");
1758 delete pat1;
1759
1760 // Split Separator with zero length match.
1761 pat1 = RegexPattern::compile(":?", pe, status);
1762 REGEX_CHECK_STATUS;
1763 n = pat1->split("abc", fields, 10, status);
1764 REGEX_CHECK_STATUS;
1765 REGEX_ASSERT(n==5);
1766 REGEX_ASSERT(fields[0]=="");
1767 REGEX_ASSERT(fields[1]=="a");
1768 REGEX_ASSERT(fields[2]=="b");
1769 REGEX_ASSERT(fields[3]=="c");
1770 REGEX_ASSERT(fields[4]=="");
1771
1772 delete pat1;
1773
1774 //
1775 // RegexPattern::pattern()
1776 //
1777 pat1 = new RegexPattern();
1778 REGEX_ASSERT(pat1->pattern() == "");
1779 delete pat1;
1780
1781 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1782 REGEX_CHECK_STATUS;
1783 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1784 delete pat1;
1785
1786
1787 //
1788 // classID functions
1789 //
1790 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1791 REGEX_CHECK_STATUS;
1792 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1793 REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1794 UnicodeString Hello("Hello, world.");
1795 RegexMatcher *m = pat1->matcher(Hello, status);
1796 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1797 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1798 REGEX_ASSERT(m->getDynamicClassID() != NULL);
1799 delete m;
1800 delete pat1;
1801
1802 }
1803
1804 //---------------------------------------------------------------------------
1805 //
1806 // API_Match_UTF8 Test that the alternate engine for class RegexMatcher
1807 // is present and working, but excluding functions
1808 // implementing replace operations.
1809 //
1810 //---------------------------------------------------------------------------
API_Match_UTF8()1811 void RegexTest::API_Match_UTF8() {
1812 UParseError pe;
1813 UErrorCode status=U_ZERO_ERROR;
1814 int32_t flags = 0;
1815
1816 //
1817 // Debug - slide failing test cases early
1818 //
1819 #if 0
1820 {
1821 }
1822 return;
1823 #endif
1824
1825 //
1826 // Simple pattern compilation
1827 //
1828 {
1829 UText re = UTEXT_INITIALIZER;
1830 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1831 REGEX_VERBOSE_TEXT(&re);
1832 RegexPattern *pat2;
1833 pat2 = RegexPattern::compile(&re, flags, pe, status);
1834 REGEX_CHECK_STATUS;
1835
1836 UText input1 = UTEXT_INITIALIZER;
1837 UText input2 = UTEXT_INITIALIZER;
1838 UText empty = UTEXT_INITIALIZER;
1839 regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1840 REGEX_VERBOSE_TEXT(&input1);
1841 regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1842 REGEX_VERBOSE_TEXT(&input2);
1843 utext_openUChars(&empty, NULL, 0, &status);
1844
1845 int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1846 int32_t input2Len = strlen("not abc");
1847
1848
1849 //
1850 // Matcher creation and reset.
1851 //
1852 RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1853 REGEX_CHECK_STATUS;
1854 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1855 const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1856 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1857 m1->reset(&input2);
1858 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1859 const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1860 REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1861 m1->reset(&input1);
1862 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1863 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1864 m1->reset(&empty);
1865 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1866 REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1867
1868 //
1869 // reset(pos, status)
1870 //
1871 m1->reset(&input1);
1872 m1->reset(4, status);
1873 REGEX_CHECK_STATUS;
1874 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1875 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1876
1877 m1->reset(-1, status);
1878 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1879 status = U_ZERO_ERROR;
1880
1881 m1->reset(0, status);
1882 REGEX_CHECK_STATUS;
1883 status = U_ZERO_ERROR;
1884
1885 m1->reset(input1Len-1, status);
1886 REGEX_CHECK_STATUS;
1887 status = U_ZERO_ERROR;
1888
1889 m1->reset(input1Len, status);
1890 REGEX_CHECK_STATUS;
1891 status = U_ZERO_ERROR;
1892
1893 m1->reset(input1Len+1, status);
1894 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1895 status = U_ZERO_ERROR;
1896
1897 //
1898 // match(pos, status)
1899 //
1900 m1->reset(&input2);
1901 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1902 m1->reset();
1903 REGEX_ASSERT(m1->matches(3, status) == FALSE);
1904 m1->reset();
1905 REGEX_ASSERT(m1->matches(5, status) == FALSE);
1906 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1907 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1908 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1909
1910 // Match() at end of string should fail, but should not
1911 // be an error.
1912 status = U_ZERO_ERROR;
1913 REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1914 REGEX_CHECK_STATUS;
1915
1916 // Match beyond end of string should fail with an error.
1917 status = U_ZERO_ERROR;
1918 REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1919 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1920
1921 // Successful match at end of string.
1922 {
1923 status = U_ZERO_ERROR;
1924 RegexMatcher m("A?", 0, status); // will match zero length string.
1925 REGEX_CHECK_STATUS;
1926 m.reset(&input1);
1927 REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1928 REGEX_CHECK_STATUS;
1929 m.reset(&empty);
1930 REGEX_ASSERT(m.matches(0, status) == TRUE);
1931 REGEX_CHECK_STATUS;
1932 }
1933
1934
1935 //
1936 // lookingAt(pos, status)
1937 //
1938 status = U_ZERO_ERROR;
1939 m1->reset(&input2); // "not abc"
1940 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1941 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1942 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1943 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1944 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1945 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1946 status = U_ZERO_ERROR;
1947 REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1948 REGEX_CHECK_STATUS;
1949 REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1950 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1951
1952 delete m1;
1953 delete pat2;
1954
1955 utext_close(&re);
1956 utext_close(&input1);
1957 utext_close(&input2);
1958 utext_close(&empty);
1959 }
1960
1961
1962 //
1963 // Capture Group.
1964 // RegexMatcher::start();
1965 // RegexMatcher::end();
1966 // RegexMatcher::groupCount();
1967 //
1968 {
1969 int32_t flags=0;
1970 UParseError pe;
1971 UErrorCode status=U_ZERO_ERROR;
1972 UText re=UTEXT_INITIALIZER;
1973 const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1974 utext_openUTF8(&re, str_01234567_pat, -1, &status);
1975
1976 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1977 REGEX_CHECK_STATUS;
1978
1979 UText input = UTEXT_INITIALIZER;
1980 const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1981 utext_openUTF8(&input, str_0123456789, -1, &status);
1982
1983 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
1984 REGEX_CHECK_STATUS;
1985 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
1986 static const int32_t matchStarts[] = {0, 2, 4, 8};
1987 static const int32_t matchEnds[] = {10, 8, 6, 10};
1988 int32_t i;
1989 for (i=0; i<4; i++) {
1990 int32_t actualStart = matcher->start(i, status);
1991 REGEX_CHECK_STATUS;
1992 if (actualStart != matchStarts[i]) {
1993 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
1994 __FILE__, __LINE__, i, matchStarts[i], actualStart);
1995 }
1996 int32_t actualEnd = matcher->end(i, status);
1997 REGEX_CHECK_STATUS;
1998 if (actualEnd != matchEnds[i]) {
1999 errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
2000 __FILE__, __LINE__, i, matchEnds[i], actualEnd);
2001 }
2002 }
2003
2004 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
2005 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
2006
2007 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2008 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2009 matcher->reset();
2010 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
2011
2012 matcher->lookingAt(status);
2013
2014 UnicodeString dest;
2015 UText destText = UTEXT_INITIALIZER;
2016 utext_openUnicodeString(&destText, &dest, &status);
2017 UText *result;
2018 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2019 // Test shallow-clone API
2020 int64_t group_len;
2021 result = matcher->group((UText *)NULL, group_len, status);
2022 REGEX_CHECK_STATUS;
2023 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2024 utext_close(result);
2025 result = matcher->group(0, &destText, group_len, status);
2026 REGEX_CHECK_STATUS;
2027 REGEX_ASSERT(result == &destText);
2028 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2029 // destText is now immutable, reopen it
2030 utext_close(&destText);
2031 utext_openUnicodeString(&destText, &dest, &status);
2032
2033 result = matcher->group(0, NULL, status);
2034 REGEX_CHECK_STATUS;
2035 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2036 utext_close(result);
2037 result = matcher->group(0, &destText, status);
2038 REGEX_CHECK_STATUS;
2039 REGEX_ASSERT(result == &destText);
2040 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2041
2042 result = matcher->group(1, NULL, status);
2043 REGEX_CHECK_STATUS;
2044 const char str_234567[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */
2045 REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
2046 utext_close(result);
2047 result = matcher->group(1, &destText, status);
2048 REGEX_CHECK_STATUS;
2049 REGEX_ASSERT(result == &destText);
2050 REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
2051
2052 result = matcher->group(2, NULL, status);
2053 REGEX_CHECK_STATUS;
2054 const char str_45[] = { 0x34, 0x35, 0x00 }; /* 45 */
2055 REGEX_ASSERT_UTEXT_UTF8(str_45, result);
2056 utext_close(result);
2057 result = matcher->group(2, &destText, status);
2058 REGEX_CHECK_STATUS;
2059 REGEX_ASSERT(result == &destText);
2060 REGEX_ASSERT_UTEXT_UTF8(str_45, result);
2061
2062 result = matcher->group(3, NULL, status);
2063 REGEX_CHECK_STATUS;
2064 const char str_89[] = { 0x38, 0x39, 0x00 }; /* 89 */
2065 REGEX_ASSERT_UTEXT_UTF8(str_89, result);
2066 utext_close(result);
2067 result = matcher->group(3, &destText, status);
2068 REGEX_CHECK_STATUS;
2069 REGEX_ASSERT(result == &destText);
2070 REGEX_ASSERT_UTEXT_UTF8(str_89, result);
2071
2072 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2073 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2074 matcher->reset();
2075 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2076
2077 delete matcher;
2078 delete pat;
2079
2080 utext_close(&destText);
2081 utext_close(&input);
2082 utext_close(&re);
2083 }
2084
2085 //
2086 // find
2087 //
2088 {
2089 int32_t flags=0;
2090 UParseError pe;
2091 UErrorCode status=U_ZERO_ERROR;
2092 UText re=UTEXT_INITIALIZER;
2093 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2094 utext_openUTF8(&re, str_abc, -1, &status);
2095
2096 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2097 REGEX_CHECK_STATUS;
2098 UText input = UTEXT_INITIALIZER;
2099 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2100 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2101 // 012345678901234567
2102
2103 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2104 REGEX_CHECK_STATUS;
2105 REGEX_ASSERT(matcher->find());
2106 REGEX_ASSERT(matcher->start(status) == 1);
2107 REGEX_ASSERT(matcher->find());
2108 REGEX_ASSERT(matcher->start(status) == 6);
2109 REGEX_ASSERT(matcher->find());
2110 REGEX_ASSERT(matcher->start(status) == 12);
2111 REGEX_ASSERT(matcher->find() == FALSE);
2112 REGEX_ASSERT(matcher->find() == FALSE);
2113
2114 matcher->reset();
2115 REGEX_ASSERT(matcher->find());
2116 REGEX_ASSERT(matcher->start(status) == 1);
2117
2118 REGEX_ASSERT(matcher->find(0, status));
2119 REGEX_ASSERT(matcher->start(status) == 1);
2120 REGEX_ASSERT(matcher->find(1, status));
2121 REGEX_ASSERT(matcher->start(status) == 1);
2122 REGEX_ASSERT(matcher->find(2, status));
2123 REGEX_ASSERT(matcher->start(status) == 6);
2124 REGEX_ASSERT(matcher->find(12, status));
2125 REGEX_ASSERT(matcher->start(status) == 12);
2126 REGEX_ASSERT(matcher->find(13, status) == FALSE);
2127 REGEX_ASSERT(matcher->find(16, status) == FALSE);
2128 REGEX_ASSERT(matcher->find(17, status) == FALSE);
2129 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2130
2131 status = U_ZERO_ERROR;
2132 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2133 status = U_ZERO_ERROR;
2134 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2135
2136 REGEX_ASSERT(matcher->groupCount() == 0);
2137
2138 delete matcher;
2139 delete pat;
2140
2141 utext_close(&input);
2142 utext_close(&re);
2143 }
2144
2145
2146 //
2147 // find, with \G in pattern (true if at the end of a previous match).
2148 //
2149 {
2150 int32_t flags=0;
2151 UParseError pe;
2152 UErrorCode status=U_ZERO_ERROR;
2153 UText re=UTEXT_INITIALIZER;
2154 const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2155 utext_openUTF8(&re, str_Gabcabc, -1, &status);
2156
2157 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2158
2159 REGEX_CHECK_STATUS;
2160 UText input = UTEXT_INITIALIZER;
2161 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2162 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2163 // 012345678901234567
2164
2165 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2166 REGEX_CHECK_STATUS;
2167 REGEX_ASSERT(matcher->find());
2168 REGEX_ASSERT(matcher->start(status) == 0);
2169 REGEX_ASSERT(matcher->start(1, status) == -1);
2170 REGEX_ASSERT(matcher->start(2, status) == 1);
2171
2172 REGEX_ASSERT(matcher->find());
2173 REGEX_ASSERT(matcher->start(status) == 4);
2174 REGEX_ASSERT(matcher->start(1, status) == 4);
2175 REGEX_ASSERT(matcher->start(2, status) == -1);
2176 REGEX_CHECK_STATUS;
2177
2178 delete matcher;
2179 delete pat;
2180
2181 utext_close(&input);
2182 utext_close(&re);
2183 }
2184
2185 //
2186 // find with zero length matches, match position should bump ahead
2187 // to prevent loops.
2188 //
2189 {
2190 int32_t i;
2191 UErrorCode status=U_ZERO_ERROR;
2192 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
2193 // using an always-true look-ahead.
2194 REGEX_CHECK_STATUS;
2195 UText s = UTEXT_INITIALIZER;
2196 utext_openUTF8(&s, " ", -1, &status);
2197 m.reset(&s);
2198 for (i=0; ; i++) {
2199 if (m.find() == FALSE) {
2200 break;
2201 }
2202 REGEX_ASSERT(m.start(status) == i);
2203 REGEX_ASSERT(m.end(status) == i);
2204 }
2205 REGEX_ASSERT(i==5);
2206
2207 // Check that the bump goes over characters outside the BMP OK
2208 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2209 unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2210 utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2211 m.reset(&s);
2212 for (i=0; ; i+=4) {
2213 if (m.find() == FALSE) {
2214 break;
2215 }
2216 REGEX_ASSERT(m.start(status) == i);
2217 REGEX_ASSERT(m.end(status) == i);
2218 }
2219 REGEX_ASSERT(i==20);
2220
2221 utext_close(&s);
2222 }
2223 {
2224 // find() loop breaking test.
2225 // with pattern of /.?/, should see a series of one char matches, then a single
2226 // match of zero length at the end of the input string.
2227 int32_t i;
2228 UErrorCode status=U_ZERO_ERROR;
2229 RegexMatcher m(".?", 0, status);
2230 REGEX_CHECK_STATUS;
2231 UText s = UTEXT_INITIALIZER;
2232 utext_openUTF8(&s, " ", -1, &status);
2233 m.reset(&s);
2234 for (i=0; ; i++) {
2235 if (m.find() == FALSE) {
2236 break;
2237 }
2238 REGEX_ASSERT(m.start(status) == i);
2239 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2240 }
2241 REGEX_ASSERT(i==5);
2242
2243 utext_close(&s);
2244 }
2245
2246
2247 //
2248 // Matchers with no input string behave as if they had an empty input string.
2249 //
2250
2251 {
2252 UErrorCode status = U_ZERO_ERROR;
2253 RegexMatcher m(".?", 0, status);
2254 REGEX_CHECK_STATUS;
2255 REGEX_ASSERT(m.find());
2256 REGEX_ASSERT(m.start(status) == 0);
2257 REGEX_ASSERT(m.input() == "");
2258 }
2259 {
2260 UErrorCode status = U_ZERO_ERROR;
2261 RegexPattern *p = RegexPattern::compile(".", 0, status);
2262 RegexMatcher *m = p->matcher(status);
2263 REGEX_CHECK_STATUS;
2264
2265 REGEX_ASSERT(m->find() == FALSE);
2266 REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2267 delete m;
2268 delete p;
2269 }
2270
2271 //
2272 // Regions
2273 //
2274 {
2275 UErrorCode status = U_ZERO_ERROR;
2276 UText testPattern = UTEXT_INITIALIZER;
2277 UText testText = UTEXT_INITIALIZER;
2278 regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2279 REGEX_VERBOSE_TEXT(&testPattern);
2280 regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2281 REGEX_VERBOSE_TEXT(&testText);
2282
2283 RegexMatcher m(&testPattern, &testText, 0, status);
2284 REGEX_CHECK_STATUS;
2285 REGEX_ASSERT(m.regionStart() == 0);
2286 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2287 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2288 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2289
2290 m.region(2,4, status);
2291 REGEX_CHECK_STATUS;
2292 REGEX_ASSERT(m.matches(status));
2293 REGEX_ASSERT(m.start(status)==2);
2294 REGEX_ASSERT(m.end(status)==4);
2295 REGEX_CHECK_STATUS;
2296
2297 m.reset();
2298 REGEX_ASSERT(m.regionStart() == 0);
2299 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2300
2301 regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2302 REGEX_VERBOSE_TEXT(&testText);
2303 m.reset(&testText);
2304 REGEX_ASSERT(m.regionStart() == 0);
2305 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2306
2307 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2308 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2309 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2310 REGEX_ASSERT(&m == &m.reset());
2311 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2312
2313 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2314 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2315 REGEX_ASSERT(&m == &m.reset());
2316 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2317
2318 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2319 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2320 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2321 REGEX_ASSERT(&m == &m.reset());
2322 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2323
2324 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2325 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2326 REGEX_ASSERT(&m == &m.reset());
2327 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2328
2329 utext_close(&testText);
2330 utext_close(&testPattern);
2331 }
2332
2333 //
2334 // hitEnd() and requireEnd()
2335 //
2336 {
2337 UErrorCode status = U_ZERO_ERROR;
2338 UText testPattern = UTEXT_INITIALIZER;
2339 UText testText = UTEXT_INITIALIZER;
2340 const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2341 const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2342 utext_openUTF8(&testPattern, str_, -1, &status);
2343 utext_openUTF8(&testText, str_aabb, -1, &status);
2344
2345 RegexMatcher m1(&testPattern, &testText, 0, status);
2346 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2347 REGEX_ASSERT(m1.hitEnd() == TRUE);
2348 REGEX_ASSERT(m1.requireEnd() == FALSE);
2349 REGEX_CHECK_STATUS;
2350
2351 status = U_ZERO_ERROR;
2352 const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2353 utext_openUTF8(&testPattern, str_a, -1, &status);
2354 RegexMatcher m2(&testPattern, &testText, 0, status);
2355 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2356 REGEX_ASSERT(m2.hitEnd() == FALSE);
2357 REGEX_ASSERT(m2.requireEnd() == FALSE);
2358 REGEX_CHECK_STATUS;
2359
2360 status = U_ZERO_ERROR;
2361 const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2362 utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2363 RegexMatcher m3(&testPattern, &testText, 0, status);
2364 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2365 REGEX_ASSERT(m3.hitEnd() == TRUE);
2366 REGEX_ASSERT(m3.requireEnd() == TRUE);
2367 REGEX_CHECK_STATUS;
2368
2369 utext_close(&testText);
2370 utext_close(&testPattern);
2371 }
2372 }
2373
2374
2375 //---------------------------------------------------------------------------
2376 //
2377 // API_Replace_UTF8 API test for class RegexMatcher, testing the
2378 // Replace family of functions.
2379 //
2380 //---------------------------------------------------------------------------
API_Replace_UTF8()2381 void RegexTest::API_Replace_UTF8() {
2382 //
2383 // Replace
2384 //
2385 int32_t flags=0;
2386 UParseError pe;
2387 UErrorCode status=U_ZERO_ERROR;
2388
2389 UText re=UTEXT_INITIALIZER;
2390 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2391 REGEX_VERBOSE_TEXT(&re);
2392 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2393 REGEX_CHECK_STATUS;
2394
2395 char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2396 // 012345678901234567
2397 UText dataText = UTEXT_INITIALIZER;
2398 utext_openUTF8(&dataText, data, -1, &status);
2399 REGEX_CHECK_STATUS;
2400 REGEX_VERBOSE_TEXT(&dataText);
2401 RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2402
2403 //
2404 // Plain vanilla matches.
2405 //
2406 UnicodeString dest;
2407 UText destText = UTEXT_INITIALIZER;
2408 utext_openUnicodeString(&destText, &dest, &status);
2409 UText *result;
2410
2411 UText replText = UTEXT_INITIALIZER;
2412
2413 const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2414 utext_openUTF8(&replText, str_yz, -1, &status);
2415 REGEX_VERBOSE_TEXT(&replText);
2416 result = matcher->replaceFirst(&replText, NULL, status);
2417 REGEX_CHECK_STATUS;
2418 const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2419 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2420 utext_close(result);
2421 result = matcher->replaceFirst(&replText, &destText, status);
2422 REGEX_CHECK_STATUS;
2423 REGEX_ASSERT(result == &destText);
2424 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2425
2426 result = matcher->replaceAll(&replText, NULL, status);
2427 REGEX_CHECK_STATUS;
2428 const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2429 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2430 utext_close(result);
2431
2432 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2433 result = matcher->replaceAll(&replText, &destText, status);
2434 REGEX_CHECK_STATUS;
2435 REGEX_ASSERT(result == &destText);
2436 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2437
2438 //
2439 // Plain vanilla non-matches.
2440 //
2441 const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2442 utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2443 matcher->reset(&dataText);
2444
2445 result = matcher->replaceFirst(&replText, NULL, status);
2446 REGEX_CHECK_STATUS;
2447 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2448 utext_close(result);
2449 result = matcher->replaceFirst(&replText, &destText, status);
2450 REGEX_CHECK_STATUS;
2451 REGEX_ASSERT(result == &destText);
2452 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2453
2454 result = matcher->replaceAll(&replText, NULL, status);
2455 REGEX_CHECK_STATUS;
2456 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2457 utext_close(result);
2458 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2459 result = matcher->replaceAll(&replText, &destText, status);
2460 REGEX_CHECK_STATUS;
2461 REGEX_ASSERT(result == &destText);
2462 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2463
2464 //
2465 // Empty source string
2466 //
2467 utext_openUTF8(&dataText, NULL, 0, &status);
2468 matcher->reset(&dataText);
2469
2470 result = matcher->replaceFirst(&replText, NULL, status);
2471 REGEX_CHECK_STATUS;
2472 REGEX_ASSERT_UTEXT_UTF8("", result);
2473 utext_close(result);
2474 result = matcher->replaceFirst(&replText, &destText, status);
2475 REGEX_CHECK_STATUS;
2476 REGEX_ASSERT(result == &destText);
2477 REGEX_ASSERT_UTEXT_UTF8("", result);
2478
2479 result = matcher->replaceAll(&replText, NULL, status);
2480 REGEX_CHECK_STATUS;
2481 REGEX_ASSERT_UTEXT_UTF8("", result);
2482 utext_close(result);
2483 result = matcher->replaceAll(&replText, &destText, status);
2484 REGEX_CHECK_STATUS;
2485 REGEX_ASSERT(result == &destText);
2486 REGEX_ASSERT_UTEXT_UTF8("", result);
2487
2488 //
2489 // Empty substitution string
2490 //
2491 utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2492 matcher->reset(&dataText);
2493
2494 utext_openUTF8(&replText, NULL, 0, &status);
2495 result = matcher->replaceFirst(&replText, NULL, status);
2496 REGEX_CHECK_STATUS;
2497 const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2498 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2499 utext_close(result);
2500 result = matcher->replaceFirst(&replText, &destText, status);
2501 REGEX_CHECK_STATUS;
2502 REGEX_ASSERT(result == &destText);
2503 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2504
2505 result = matcher->replaceAll(&replText, NULL, status);
2506 REGEX_CHECK_STATUS;
2507 const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2508 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2509 utext_close(result);
2510 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2511 result = matcher->replaceAll(&replText, &destText, status);
2512 REGEX_CHECK_STATUS;
2513 REGEX_ASSERT(result == &destText);
2514 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2515
2516 //
2517 // match whole string
2518 //
2519 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2520 utext_openUTF8(&dataText, str_abc, -1, &status);
2521 matcher->reset(&dataText);
2522
2523 const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2524 utext_openUTF8(&replText, str_xyz, -1, &status);
2525 result = matcher->replaceFirst(&replText, NULL, status);
2526 REGEX_CHECK_STATUS;
2527 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2528 utext_close(result);
2529 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2530 result = matcher->replaceFirst(&replText, &destText, status);
2531 REGEX_CHECK_STATUS;
2532 REGEX_ASSERT(result == &destText);
2533 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2534
2535 result = matcher->replaceAll(&replText, NULL, status);
2536 REGEX_CHECK_STATUS;
2537 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2538 utext_close(result);
2539 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2540 result = matcher->replaceAll(&replText, &destText, status);
2541 REGEX_CHECK_STATUS;
2542 REGEX_ASSERT(result == &destText);
2543 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2544
2545 //
2546 // Capture Group, simple case
2547 //
2548 const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2549 utext_openUTF8(&re, str_add, -1, &status);
2550 RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2551 REGEX_CHECK_STATUS;
2552
2553 const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2554 utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2555 RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2556 REGEX_CHECK_STATUS;
2557
2558 const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2559 utext_openUTF8(&replText, str_11, -1, &status);
2560 result = matcher2->replaceFirst(&replText, NULL, status);
2561 REGEX_CHECK_STATUS;
2562 const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2563 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2564 utext_close(result);
2565 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2566 result = matcher2->replaceFirst(&replText, &destText, status);
2567 REGEX_CHECK_STATUS;
2568 REGEX_ASSERT(result == &destText);
2569 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2570
2571 const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2572 utext_openUTF8(&replText, str_v, -1, &status);
2573 REGEX_VERBOSE_TEXT(&replText);
2574 result = matcher2->replaceFirst(&replText, NULL, status);
2575 REGEX_CHECK_STATUS;
2576 const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2577 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2578 utext_close(result);
2579 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2580 result = matcher2->replaceFirst(&replText, &destText, status);
2581 REGEX_CHECK_STATUS;
2582 REGEX_ASSERT(result == &destText);
2583 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2584
2585 const char str_byitselfnogroupnumber[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x00 }; /* $ by itself, no group number $$$ */
2586 utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2587 result = matcher2->replaceFirst(&replText, NULL, status);
2588 REGEX_CHECK_STATUS;
2589 const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2590 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2591 utext_close(result);
2592 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2593 result = matcher2->replaceFirst(&replText, &destText, status);
2594 REGEX_CHECK_STATUS;
2595 REGEX_ASSERT(result == &destText);
2596 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2597
2598 unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2599 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2600 // 012345678901234567890123456
2601 supplDigitChars[22] = 0xF0;
2602 supplDigitChars[23] = 0x9D;
2603 supplDigitChars[24] = 0x9F;
2604 supplDigitChars[25] = 0x8F;
2605 utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2606
2607 result = matcher2->replaceFirst(&replText, NULL, status);
2608 REGEX_CHECK_STATUS;
2609 const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2610 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2611 utext_close(result);
2612 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2613 result = matcher2->replaceFirst(&replText, &destText, status);
2614 REGEX_CHECK_STATUS;
2615 REGEX_ASSERT(result == &destText);
2616 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2617 const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5..." */
2618 utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2619 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2620 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2621 utext_close(result);
2622 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2623 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2624 REGEX_ASSERT(result == &destText);
2625 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2626
2627 //
2628 // Replacement String with \u hex escapes
2629 //
2630 {
2631 const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2632 const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2633 utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2634 utext_openUTF8(&replText, str_u0043, -1, &status);
2635 matcher->reset(&dataText);
2636
2637 result = matcher->replaceAll(&replText, NULL, status);
2638 REGEX_CHECK_STATUS;
2639 const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2640 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2641 utext_close(result);
2642 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2643 result = matcher->replaceAll(&replText, &destText, status);
2644 REGEX_CHECK_STATUS;
2645 REGEX_ASSERT(result == &destText);
2646 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2647 }
2648 {
2649 const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2650 utext_openUTF8(&dataText, str_abc, -1, &status);
2651 const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2652 utext_openUTF8(&replText, str_U00010000, -1, &status);
2653 matcher->reset(&dataText);
2654
2655 unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2656 // 0123456789
2657 expected[2] = 0xF0;
2658 expected[3] = 0x90;
2659 expected[4] = 0x80;
2660 expected[5] = 0x80;
2661
2662 result = matcher->replaceAll(&replText, NULL, status);
2663 REGEX_CHECK_STATUS;
2664 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2665 utext_close(result);
2666 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2667 result = matcher->replaceAll(&replText, &destText, status);
2668 REGEX_CHECK_STATUS;
2669 REGEX_ASSERT(result == &destText);
2670 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2671 }
2672 // TODO: need more thorough testing of capture substitutions.
2673
2674 // Bug 4057
2675 //
2676 {
2677 status = U_ZERO_ERROR;
2678 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2679 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2680 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2681 utext_openUTF8(&re, str_ssee, -1, &status);
2682 utext_openUTF8(&dataText, str_blah, -1, &status);
2683 utext_openUTF8(&replText, str_ooh, -1, &status);
2684
2685 RegexMatcher m(&re, 0, status);
2686 REGEX_CHECK_STATUS;
2687
2688 UnicodeString result;
2689 UText resultText = UTEXT_INITIALIZER;
2690 utext_openUnicodeString(&resultText, &result, &status);
2691
2692 // Multiple finds do NOT bump up the previous appendReplacement position.
2693 m.reset(&dataText);
2694 m.find();
2695 m.find();
2696 m.appendReplacement(&resultText, &replText, status);
2697 REGEX_CHECK_STATUS;
2698 const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2699 REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2700
2701 // After a reset into the interior of a string, appendReplacement still starts at beginning.
2702 status = U_ZERO_ERROR;
2703 result.truncate(0);
2704 utext_openUnicodeString(&resultText, &result, &status);
2705 m.reset(10, status);
2706 m.find();
2707 m.find();
2708 m.appendReplacement(&resultText, &replText, status);
2709 REGEX_CHECK_STATUS;
2710 const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2711 REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2712
2713 // find() at interior of string, appendReplacement still starts at beginning.
2714 status = U_ZERO_ERROR;
2715 result.truncate(0);
2716 utext_openUnicodeString(&resultText, &result, &status);
2717 m.reset();
2718 m.find(10, status);
2719 m.find();
2720 m.appendReplacement(&resultText, &replText, status);
2721 REGEX_CHECK_STATUS;
2722 const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2723 REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2724
2725 m.appendTail(&resultText, status);
2726 const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2727 REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2728
2729 utext_close(&resultText);
2730 }
2731
2732 delete matcher2;
2733 delete pat2;
2734 delete matcher;
2735 delete pat;
2736
2737 utext_close(&dataText);
2738 utext_close(&replText);
2739 utext_close(&destText);
2740 utext_close(&re);
2741 }
2742
2743
2744 //---------------------------------------------------------------------------
2745 //
2746 // API_Pattern_UTF8 Test that the API for class RegexPattern is
2747 // present and nominally working.
2748 //
2749 //---------------------------------------------------------------------------
API_Pattern_UTF8()2750 void RegexTest::API_Pattern_UTF8() {
2751 RegexPattern pata; // Test default constructor to not crash.
2752 RegexPattern patb;
2753
2754 REGEX_ASSERT(pata == patb);
2755 REGEX_ASSERT(pata == pata);
2756
2757 UText re1 = UTEXT_INITIALIZER;
2758 UText re2 = UTEXT_INITIALIZER;
2759 UErrorCode status = U_ZERO_ERROR;
2760 UParseError pe;
2761
2762 const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2763 const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2764 utext_openUTF8(&re1, str_abcalmz, -1, &status);
2765 utext_openUTF8(&re2, str_def, -1, &status);
2766
2767 RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2768 RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2769 REGEX_CHECK_STATUS;
2770 REGEX_ASSERT(*pat1 == *pat1);
2771 REGEX_ASSERT(*pat1 != pata);
2772
2773 // Assign
2774 patb = *pat1;
2775 REGEX_ASSERT(patb == *pat1);
2776
2777 // Copy Construct
2778 RegexPattern patc(*pat1);
2779 REGEX_ASSERT(patc == *pat1);
2780 REGEX_ASSERT(patb == patc);
2781 REGEX_ASSERT(pat1 != pat2);
2782 patb = *pat2;
2783 REGEX_ASSERT(patb != patc);
2784 REGEX_ASSERT(patb == *pat2);
2785
2786 // Compile with no flags.
2787 RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status);
2788 REGEX_ASSERT(*pat1a == *pat1);
2789
2790 REGEX_ASSERT(pat1a->flags() == 0);
2791
2792 // Compile with different flags should be not equal
2793 RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2794 REGEX_CHECK_STATUS;
2795
2796 REGEX_ASSERT(*pat1b != *pat1a);
2797 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2798 REGEX_ASSERT(pat1a->flags() == 0);
2799 delete pat1b;
2800
2801 // clone
2802 RegexPattern *pat1c = pat1->clone();
2803 REGEX_ASSERT(*pat1c == *pat1);
2804 REGEX_ASSERT(*pat1c != *pat2);
2805
2806 delete pat1c;
2807 delete pat1a;
2808 delete pat1;
2809 delete pat2;
2810
2811 utext_close(&re1);
2812 utext_close(&re2);
2813
2814
2815 //
2816 // Verify that a matcher created from a cloned pattern works.
2817 // (Jitterbug 3423)
2818 //
2819 {
2820 UErrorCode status = U_ZERO_ERROR;
2821 UText pattern = UTEXT_INITIALIZER;
2822 const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2823 utext_openUTF8(&pattern, str_pL, -1, &status);
2824
2825 RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status);
2826 RegexPattern *pClone = pSource->clone();
2827 delete pSource;
2828 RegexMatcher *mFromClone = pClone->matcher(status);
2829 REGEX_CHECK_STATUS;
2830
2831 UText input = UTEXT_INITIALIZER;
2832 const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2833 utext_openUTF8(&input, str_HelloWorld, -1, &status);
2834 mFromClone->reset(&input);
2835 REGEX_ASSERT(mFromClone->find() == TRUE);
2836 REGEX_ASSERT(mFromClone->group(status) == "Hello");
2837 REGEX_ASSERT(mFromClone->find() == TRUE);
2838 REGEX_ASSERT(mFromClone->group(status) == "World");
2839 REGEX_ASSERT(mFromClone->find() == FALSE);
2840 delete mFromClone;
2841 delete pClone;
2842
2843 utext_close(&input);
2844 utext_close(&pattern);
2845 }
2846
2847 //
2848 // matches convenience API
2849 //
2850 {
2851 UErrorCode status = U_ZERO_ERROR;
2852 UText pattern = UTEXT_INITIALIZER;
2853 UText input = UTEXT_INITIALIZER;
2854
2855 const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2856 utext_openUTF8(&input, str_randominput, -1, &status);
2857
2858 const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2859 utext_openUTF8(&pattern, str_dotstar, -1, &status);
2860 REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2861 REGEX_CHECK_STATUS;
2862
2863 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2864 utext_openUTF8(&pattern, str_abc, -1, &status);
2865 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2866 REGEX_CHECK_STATUS;
2867
2868 const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2869 utext_openUTF8(&pattern, str_nput, -1, &status);
2870 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2871 REGEX_CHECK_STATUS;
2872
2873 utext_openUTF8(&pattern, str_randominput, -1, &status);
2874 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2875 REGEX_CHECK_STATUS;
2876
2877 const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2878 utext_openUTF8(&pattern, str_u, -1, &status);
2879 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2880 REGEX_CHECK_STATUS;
2881
2882 utext_openUTF8(&input, str_abc, -1, &status);
2883 utext_openUTF8(&pattern, str_abc, -1, &status);
2884 status = U_INDEX_OUTOFBOUNDS_ERROR;
2885 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2886 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2887
2888 utext_close(&input);
2889 utext_close(&pattern);
2890 }
2891
2892
2893 //
2894 // Split()
2895 //
2896 status = U_ZERO_ERROR;
2897 const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /* + */
2898 utext_openUTF8(&re1, str_spaceplus, -1, &status);
2899 pat1 = RegexPattern::compile(&re1, pe, status);
2900 REGEX_CHECK_STATUS;
2901 UnicodeString fields[10];
2902
2903 int32_t n;
2904 n = pat1->split("Now is the time", fields, 10, status);
2905 REGEX_CHECK_STATUS;
2906 REGEX_ASSERT(n==4);
2907 REGEX_ASSERT(fields[0]=="Now");
2908 REGEX_ASSERT(fields[1]=="is");
2909 REGEX_ASSERT(fields[2]=="the");
2910 REGEX_ASSERT(fields[3]=="time");
2911 REGEX_ASSERT(fields[4]=="");
2912
2913 n = pat1->split("Now is the time", fields, 2, status);
2914 REGEX_CHECK_STATUS;
2915 REGEX_ASSERT(n==2);
2916 REGEX_ASSERT(fields[0]=="Now");
2917 REGEX_ASSERT(fields[1]=="is the time");
2918 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
2919
2920 fields[1] = "*";
2921 status = U_ZERO_ERROR;
2922 n = pat1->split("Now is the time", fields, 1, status);
2923 REGEX_CHECK_STATUS;
2924 REGEX_ASSERT(n==1);
2925 REGEX_ASSERT(fields[0]=="Now is the time");
2926 REGEX_ASSERT(fields[1]=="*");
2927 status = U_ZERO_ERROR;
2928
2929 n = pat1->split(" Now is the time ", fields, 10, status);
2930 REGEX_CHECK_STATUS;
2931 REGEX_ASSERT(n==6);
2932 REGEX_ASSERT(fields[0]=="");
2933 REGEX_ASSERT(fields[1]=="Now");
2934 REGEX_ASSERT(fields[2]=="is");
2935 REGEX_ASSERT(fields[3]=="the");
2936 REGEX_ASSERT(fields[4]=="time");
2937 REGEX_ASSERT(fields[5]=="");
2938 REGEX_ASSERT(fields[6]=="");
2939
2940 fields[2] = "*";
2941 n = pat1->split(" ", fields, 10, status);
2942 REGEX_CHECK_STATUS;
2943 REGEX_ASSERT(n==2);
2944 REGEX_ASSERT(fields[0]=="");
2945 REGEX_ASSERT(fields[1]=="");
2946 REGEX_ASSERT(fields[2]=="*");
2947
2948 fields[0] = "foo";
2949 n = pat1->split("", fields, 10, status);
2950 REGEX_CHECK_STATUS;
2951 REGEX_ASSERT(n==0);
2952 REGEX_ASSERT(fields[0]=="foo");
2953
2954 delete pat1;
2955
2956 // split, with a pattern with (capture)
2957 regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
2958 pat1 = RegexPattern::compile(&re1, pe, status);
2959 REGEX_CHECK_STATUS;
2960
2961 status = U_ZERO_ERROR;
2962 fields[6] = fields[7] = "*";
2963 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
2964 REGEX_CHECK_STATUS;
2965 REGEX_ASSERT(n==7);
2966 REGEX_ASSERT(fields[0]=="");
2967 REGEX_ASSERT(fields[1]=="a");
2968 REGEX_ASSERT(fields[2]=="Now is ");
2969 REGEX_ASSERT(fields[3]=="b");
2970 REGEX_ASSERT(fields[4]=="the time");
2971 REGEX_ASSERT(fields[5]=="c");
2972 REGEX_ASSERT(fields[6]=="");
2973 REGEX_ASSERT(fields[7]=="*");
2974 REGEX_ASSERT(status==U_ZERO_ERROR);
2975
2976 fields[6] = fields[7] = "*";
2977 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
2978 REGEX_CHECK_STATUS;
2979 REGEX_ASSERT(n==7);
2980 REGEX_ASSERT(fields[0]==" ");
2981 REGEX_ASSERT(fields[1]=="a");
2982 REGEX_ASSERT(fields[2]=="Now is ");
2983 REGEX_ASSERT(fields[3]=="b");
2984 REGEX_ASSERT(fields[4]=="the time");
2985 REGEX_ASSERT(fields[5]=="c");
2986 REGEX_ASSERT(fields[6]=="");
2987 REGEX_ASSERT(fields[7]=="*");
2988
2989 status = U_ZERO_ERROR;
2990 fields[6] = "foo";
2991 n = pat1->split(" <a>Now is <b>the time<c> ", fields, 6, status);
2992 REGEX_CHECK_STATUS;
2993 REGEX_ASSERT(n==6);
2994 REGEX_ASSERT(fields[0]==" ");
2995 REGEX_ASSERT(fields[1]=="a");
2996 REGEX_ASSERT(fields[2]=="Now is ");
2997 REGEX_ASSERT(fields[3]=="b");
2998 REGEX_ASSERT(fields[4]=="the time");
2999 REGEX_ASSERT(fields[5]==" ");
3000 REGEX_ASSERT(fields[6]=="foo");
3001
3002 status = U_ZERO_ERROR;
3003 fields[5] = "foo";
3004 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
3005 REGEX_CHECK_STATUS;
3006 REGEX_ASSERT(n==5);
3007 REGEX_ASSERT(fields[0]==" ");
3008 REGEX_ASSERT(fields[1]=="a");
3009 REGEX_ASSERT(fields[2]=="Now is ");
3010 REGEX_ASSERT(fields[3]=="b");
3011 REGEX_ASSERT(fields[4]=="the time<c>");
3012 REGEX_ASSERT(fields[5]=="foo");
3013
3014 status = U_ZERO_ERROR;
3015 fields[5] = "foo";
3016 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
3017 REGEX_CHECK_STATUS;
3018 REGEX_ASSERT(n==5);
3019 REGEX_ASSERT(fields[0]==" ");
3020 REGEX_ASSERT(fields[1]=="a");
3021 REGEX_ASSERT(fields[2]=="Now is ");
3022 REGEX_ASSERT(fields[3]=="b");
3023 REGEX_ASSERT(fields[4]=="the time");
3024 REGEX_ASSERT(fields[5]=="foo");
3025
3026 status = U_ZERO_ERROR;
3027 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
3028 REGEX_CHECK_STATUS;
3029 REGEX_ASSERT(n==4);
3030 REGEX_ASSERT(fields[0]==" ");
3031 REGEX_ASSERT(fields[1]=="a");
3032 REGEX_ASSERT(fields[2]=="Now is ");
3033 REGEX_ASSERT(fields[3]=="the time<c>");
3034 status = U_ZERO_ERROR;
3035 delete pat1;
3036
3037 regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3038 pat1 = RegexPattern::compile(&re1, pe, status);
3039 REGEX_CHECK_STATUS;
3040 n = pat1->split("1-10,20", fields, 10, status);
3041 REGEX_CHECK_STATUS;
3042 REGEX_ASSERT(n==5);
3043 REGEX_ASSERT(fields[0]=="1");
3044 REGEX_ASSERT(fields[1]=="-");
3045 REGEX_ASSERT(fields[2]=="10");
3046 REGEX_ASSERT(fields[3]==",");
3047 REGEX_ASSERT(fields[4]=="20");
3048 delete pat1;
3049
3050
3051 //
3052 // RegexPattern::pattern() and patternText()
3053 //
3054 pat1 = new RegexPattern();
3055 REGEX_ASSERT(pat1->pattern() == "");
3056 REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3057 delete pat1;
3058 const char *helloWorldInvariant = "(Hello, world)*";
3059 regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3060 pat1 = RegexPattern::compile(&re1, pe, status);
3061 REGEX_CHECK_STATUS;
3062 REGEX_ASSERT_UNISTR(pat1->pattern(),"(Hello, world)*");
3063 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3064 delete pat1;
3065
3066 utext_close(&re1);
3067 }
3068
3069
3070 //---------------------------------------------------------------------------
3071 //
3072 // Extended A more thorough check for features of regex patterns
3073 // The test cases are in a separate data file,
3074 // source/tests/testdata/regextst.txt
3075 // A description of the test data format is included in that file.
3076 //
3077 //---------------------------------------------------------------------------
3078
3079 const char *
getPath(char buffer[2048],const char * filename)3080 RegexTest::getPath(char buffer[2048], const char *filename) {
3081 UErrorCode status=U_ZERO_ERROR;
3082 const char *testDataDirectory = IntlTest::getSourceTestData(status);
3083 if (U_FAILURE(status)) {
3084 errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3085 return NULL;
3086 }
3087
3088 strcpy(buffer, testDataDirectory);
3089 strcat(buffer, filename);
3090 return buffer;
3091 }
3092
Extended()3093 void RegexTest::Extended() {
3094 char tdd[2048];
3095 const char *srcPath;
3096 UErrorCode status = U_ZERO_ERROR;
3097 int32_t lineNum = 0;
3098
3099 //
3100 // Open and read the test data file.
3101 //
3102 srcPath=getPath(tdd, "regextst.txt");
3103 if(srcPath==NULL) {
3104 return; /* something went wrong, error already output */
3105 }
3106
3107 int32_t len;
3108 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3109 if (U_FAILURE(status)) {
3110 return; /* something went wrong, error already output */
3111 }
3112
3113 //
3114 // Put the test data into a UnicodeString
3115 //
3116 UnicodeString testString(FALSE, testData, len);
3117
3118 RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3119 RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3120 RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3121
3122 RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3123 UnicodeString testPattern; // The pattern for test from the test file.
3124 UnicodeString testFlags; // the flags for a test.
3125 UnicodeString matchString; // The marked up string to be used as input
3126
3127 if (U_FAILURE(status)){
3128 dataerrln("Construct RegexMatcher() error.");
3129 delete [] testData;
3130 return;
3131 }
3132
3133 //
3134 // Loop over the test data file, once per line.
3135 //
3136 while (lineMat.find()) {
3137 lineNum++;
3138 if (U_FAILURE(status)) {
3139 errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3140 }
3141
3142 status = U_ZERO_ERROR;
3143 UnicodeString testLine = lineMat.group(1, status);
3144 if (testLine.length() == 0) {
3145 continue;
3146 }
3147
3148 //
3149 // Parse the test line. Skip blank and comment only lines.
3150 // Separate out the three main fields - pattern, flags, target.
3151 //
3152
3153 commentMat.reset(testLine);
3154 if (commentMat.lookingAt(status)) {
3155 // This line is a comment, or blank.
3156 continue;
3157 }
3158
3159 //
3160 // Pull out the pattern field, remove it from the test file line.
3161 //
3162 quotedStuffMat.reset(testLine);
3163 if (quotedStuffMat.lookingAt(status)) {
3164 testPattern = quotedStuffMat.group(2, status);
3165 testLine.remove(0, quotedStuffMat.end(0, status));
3166 } else {
3167 errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3168 continue;
3169 }
3170
3171
3172 //
3173 // Pull out the flags from the test file line.
3174 //
3175 flagsMat.reset(testLine);
3176 flagsMat.lookingAt(status); // Will always match, possibly an empty string.
3177 testFlags = flagsMat.group(1, status);
3178 if (flagsMat.group(2, status).length() > 0) {
3179 errln("Bad Match flag at line %d. Scanning %c\n",
3180 lineNum, flagsMat.group(2, status).charAt(0));
3181 continue;
3182 }
3183 testLine.remove(0, flagsMat.end(0, status));
3184
3185 //
3186 // Pull out the match string, as a whole.
3187 // We'll process the <tags> later.
3188 //
3189 quotedStuffMat.reset(testLine);
3190 if (quotedStuffMat.lookingAt(status)) {
3191 matchString = quotedStuffMat.group(2, status);
3192 testLine.remove(0, quotedStuffMat.end(0, status));
3193 } else {
3194 errln("Bad match string at test file line %d", lineNum);
3195 continue;
3196 }
3197
3198 //
3199 // The only thing left from the input line should be an optional trailing comment.
3200 //
3201 commentMat.reset(testLine);
3202 if (commentMat.lookingAt(status) == FALSE) {
3203 errln("Line %d: unexpected characters at end of test line.", lineNum);
3204 continue;
3205 }
3206
3207 //
3208 // Run the test
3209 //
3210 regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3211 }
3212
3213 delete [] testData;
3214
3215 }
3216
3217
3218
3219 //---------------------------------------------------------------------------
3220 //
3221 // regex_find(pattern, flags, inputString, lineNumber)
3222 //
3223 // Function to run a single test from the Extended (data driven) tests.
3224 // See file test/testdata/regextst.txt for a description of the
3225 // pattern and inputString fields, and the allowed flags.
3226 // lineNumber is the source line in regextst.txt of the test.
3227 //
3228 //---------------------------------------------------------------------------
3229
3230
3231 // Set a value into a UVector at position specified by a decimal number in
3232 // a UnicodeString. This is a utility function needed by the actual test function,
3233 // which follows.
set(UVector & vec,int32_t val,UnicodeString index)3234 static void set(UVector &vec, int32_t val, UnicodeString index) {
3235 UErrorCode status=U_ZERO_ERROR;
3236 int32_t idx = 0;
3237 for (int32_t i=0; i<index.length(); i++) {
3238 int32_t d=u_charDigitValue(index.charAt(i));
3239 if (d<0) {return;}
3240 idx = idx*10 + d;
3241 }
3242 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3243 vec.setElementAt(val, idx);
3244 }
3245
setInt(UVector & vec,int32_t val,int32_t idx)3246 static void setInt(UVector &vec, int32_t val, int32_t idx) {
3247 UErrorCode status=U_ZERO_ERROR;
3248 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3249 vec.setElementAt(val, idx);
3250 }
3251
utextOffsetToNative(UText * utext,int32_t unistrOffset,int32_t & nativeIndex)3252 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3253 {
3254 UBool couldFind = TRUE;
3255 UTEXT_SETNATIVEINDEX(utext, 0);
3256 int32_t i = 0;
3257 while (i < unistrOffset) {
3258 UChar32 c = UTEXT_NEXT32(utext);
3259 if (c != U_SENTINEL) {
3260 i += U16_LENGTH(c);
3261 } else {
3262 couldFind = FALSE;
3263 break;
3264 }
3265 }
3266 nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3267 return couldFind;
3268 }
3269
3270
regex_find(const UnicodeString & pattern,const UnicodeString & flags,const UnicodeString & inputString,const char * srcPath,int32_t line)3271 void RegexTest::regex_find(const UnicodeString &pattern,
3272 const UnicodeString &flags,
3273 const UnicodeString &inputString,
3274 const char *srcPath,
3275 int32_t line) {
3276 UnicodeString unEscapedInput;
3277 UnicodeString deTaggedInput;
3278
3279 int32_t patternUTF8Length, inputUTF8Length;
3280 char *patternChars = NULL, *inputChars = NULL;
3281 UText patternText = UTEXT_INITIALIZER;
3282 UText inputText = UTEXT_INITIALIZER;
3283 UConverter *UTF8Converter = NULL;
3284
3285 UErrorCode status = U_ZERO_ERROR;
3286 UParseError pe;
3287 RegexPattern *parsePat = NULL;
3288 RegexMatcher *parseMatcher = NULL;
3289 RegexPattern *callerPattern = NULL, *UTF8Pattern = NULL;
3290 RegexMatcher *matcher = NULL, *UTF8Matcher = NULL;
3291 UVector groupStarts(status);
3292 UVector groupEnds(status);
3293 UVector groupStartsUTF8(status);
3294 UVector groupEndsUTF8(status);
3295 UBool isMatch = FALSE, isUTF8Match = FALSE;
3296 UBool failed = FALSE;
3297 int32_t numFinds;
3298 int32_t i;
3299 UBool useMatchesFunc = FALSE;
3300 UBool useLookingAtFunc = FALSE;
3301 int32_t regionStart = -1;
3302 int32_t regionEnd = -1;
3303 int32_t regionStartUTF8 = -1;
3304 int32_t regionEndUTF8 = -1;
3305
3306
3307 //
3308 // Compile the caller's pattern
3309 //
3310 uint32_t bflags = 0;
3311 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag
3312 bflags |= UREGEX_CASE_INSENSITIVE;
3313 }
3314 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag
3315 bflags |= UREGEX_COMMENTS;
3316 }
3317 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag
3318 bflags |= UREGEX_DOTALL;
3319 }
3320 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag
3321 bflags |= UREGEX_MULTILINE;
3322 }
3323
3324 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3325 bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3326 }
3327 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3328 bflags |= UREGEX_UNIX_LINES;
3329 }
3330
3331
3332 callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3333 if (status != U_ZERO_ERROR) {
3334 #if UCONFIG_NO_BREAK_ITERATION==1
3335 // 'v' test flag means that the test pattern should not compile if ICU was configured
3336 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3337 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3338 goto cleanupAndReturn;
3339 }
3340 #endif
3341 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3342 // Expected pattern compilation error.
3343 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3344 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3345 }
3346 goto cleanupAndReturn;
3347 } else {
3348 // Unexpected pattern compilation error.
3349 dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3350 goto cleanupAndReturn;
3351 }
3352 }
3353
3354 UTF8Converter = ucnv_open("UTF8", &status);
3355 ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3356
3357 patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3358 status = U_ZERO_ERROR; // buffer overflow
3359 patternChars = new char[patternUTF8Length+1];
3360 pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3361 utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3362
3363 if (status == U_ZERO_ERROR) {
3364 UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3365
3366 if (status != U_ZERO_ERROR) {
3367 #if UCONFIG_NO_BREAK_ITERATION==1
3368 // 'v' test flag means that the test pattern should not compile if ICU was configured
3369 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3370 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3371 goto cleanupAndReturn;
3372 }
3373 #endif
3374 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3375 // Expected pattern compilation error.
3376 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3377 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3378 }
3379 goto cleanupAndReturn;
3380 } else {
3381 // Unexpected pattern compilation error.
3382 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3383 goto cleanupAndReturn;
3384 }
3385 }
3386 }
3387
3388 if (UTF8Pattern == NULL) {
3389 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3390 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3391 status = U_ZERO_ERROR;
3392 }
3393
3394 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag
3395 RegexPatternDump(callerPattern);
3396 }
3397
3398 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag
3399 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3400 goto cleanupAndReturn;
3401 }
3402
3403
3404 //
3405 // Number of times find() should be called on the test string, default to 1
3406 //
3407 numFinds = 1;
3408 for (i=2; i<=9; i++) {
3409 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
3410 if (numFinds != 1) {
3411 errln("Line %d: more than one digit flag. Scanning %d.", line, i);
3412 goto cleanupAndReturn;
3413 }
3414 numFinds = i;
3415 }
3416 }
3417
3418 // 'M' flag. Use matches() instead of find()
3419 if (flags.indexOf((UChar)0x4d) >= 0) {
3420 useMatchesFunc = TRUE;
3421 }
3422 if (flags.indexOf((UChar)0x4c) >= 0) {
3423 useLookingAtFunc = TRUE;
3424 }
3425
3426 //
3427 // Find the tags in the input data, remove them, and record the group boundary
3428 // positions.
3429 //
3430 parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3431 REGEX_CHECK_STATUS_L(line);
3432
3433 unEscapedInput = inputString.unescape();
3434 parseMatcher = parsePat->matcher(unEscapedInput, status);
3435 REGEX_CHECK_STATUS_L(line);
3436 while(parseMatcher->find()) {
3437 parseMatcher->appendReplacement(deTaggedInput, "", status);
3438 REGEX_CHECK_STATUS;
3439 UnicodeString groupNum = parseMatcher->group(2, status);
3440 if (groupNum == "r") {
3441 // <r> or </r>, a region specification within the string
3442 if (parseMatcher->group(1, status) == "/") {
3443 regionEnd = deTaggedInput.length();
3444 } else {
3445 regionStart = deTaggedInput.length();
3446 }
3447 } else {
3448 // <digits> or </digits>, a group match boundary tag.
3449 if (parseMatcher->group(1, status) == "/") {
3450 set(groupEnds, deTaggedInput.length(), groupNum);
3451 } else {
3452 set(groupStarts, deTaggedInput.length(), groupNum);
3453 }
3454 }
3455 }
3456 parseMatcher->appendTail(deTaggedInput);
3457 REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3458 if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3459 errln("mismatched <r> tags");
3460 failed = TRUE;
3461 goto cleanupAndReturn;
3462 }
3463
3464 //
3465 // Configure the matcher according to the flags specified with this test.
3466 //
3467 matcher = callerPattern->matcher(deTaggedInput, status);
3468 REGEX_CHECK_STATUS_L(line);
3469 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
3470 matcher->setTrace(TRUE);
3471 }
3472
3473 if (UTF8Pattern != NULL) {
3474 inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3475 status = U_ZERO_ERROR; // buffer overflow
3476 inputChars = new char[inputUTF8Length+1];
3477 deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3478 utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3479
3480 if (status == U_ZERO_ERROR) {
3481 UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3482 REGEX_CHECK_STATUS_L(line);
3483 }
3484
3485 if (UTF8Matcher == NULL) {
3486 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3487 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3488 status = U_ZERO_ERROR;
3489 }
3490 }
3491
3492 //
3493 // Generate native indices for UTF8 versions of region and capture group info
3494 //
3495 if (UTF8Matcher != NULL) {
3496 if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3497 if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3498
3499 // Fill out the native index UVector info.
3500 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3501 for (i=0; i<groupStarts.size(); i++) {
3502 int32_t start = groupStarts.elementAti(i);
3503 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3504 if (start >= 0) {
3505 int32_t startUTF8;
3506 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3507 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line, i, start);
3508 failed = TRUE;
3509 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3510 }
3511 setInt(groupStartsUTF8, startUTF8, i);
3512 }
3513
3514 int32_t end = groupEnds.elementAti(i);
3515 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3516 if (end >= 0) {
3517 int32_t endUTF8;
3518 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3519 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line, i, end);
3520 failed = TRUE;
3521 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3522 }
3523 setInt(groupEndsUTF8, endUTF8, i);
3524 }
3525 }
3526 }
3527
3528 if (regionStart>=0) {
3529 matcher->region(regionStart, regionEnd, status);
3530 REGEX_CHECK_STATUS_L(line);
3531 if (UTF8Matcher != NULL) {
3532 UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3533 REGEX_CHECK_STATUS_L(line);
3534 }
3535 }
3536 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag
3537 matcher->useAnchoringBounds(FALSE);
3538 if (UTF8Matcher != NULL) {
3539 UTF8Matcher->useAnchoringBounds(FALSE);
3540 }
3541 }
3542 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag
3543 matcher->useTransparentBounds(TRUE);
3544 if (UTF8Matcher != NULL) {
3545 UTF8Matcher->useTransparentBounds(TRUE);
3546 }
3547 }
3548
3549
3550
3551 //
3552 // Do a find on the de-tagged input using the caller's pattern
3553 // TODO: error on count>1 and not find().
3554 // error on both matches() and lookingAt().
3555 //
3556 for (i=0; i<numFinds; i++) {
3557 if (useMatchesFunc) {
3558 isMatch = matcher->matches(status);
3559 if (UTF8Matcher != NULL) {
3560 isUTF8Match = UTF8Matcher->matches(status);
3561 }
3562 } else if (useLookingAtFunc) {
3563 isMatch = matcher->lookingAt(status);
3564 if (UTF8Matcher != NULL) {
3565 isUTF8Match = UTF8Matcher->lookingAt(status);
3566 }
3567 } else {
3568 isMatch = matcher->find();
3569 if (UTF8Matcher != NULL) {
3570 isUTF8Match = UTF8Matcher->find();
3571 }
3572 }
3573 }
3574 matcher->setTrace(FALSE);
3575
3576 //
3577 // Match up the groups from the find() with the groups from the tags
3578 //
3579
3580 // number of tags should match number of groups from find operation.
3581 // matcher->groupCount does not include group 0, the entire match, hence the +1.
3582 // G option in test means that capture group data is not available in the
3583 // expected results, so the check needs to be suppressed.
3584 if (isMatch == FALSE && groupStarts.size() != 0) {
3585 dataerrln("Error at line %d: Match expected, but none found.", line);
3586 failed = TRUE;
3587 goto cleanupAndReturn;
3588 } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3589 errln("Error at line %d: Match expected, but none found. (UTF8)", line);
3590 failed = TRUE;
3591 goto cleanupAndReturn;
3592 }
3593
3594 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3595 // Only check for match / no match. Don't check capture groups.
3596 if (isMatch && groupStarts.size() == 0) {
3597 errln("Error at line %d: No match expected, but one found.", line);
3598 failed = TRUE;
3599 } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
3600 errln("Error at line %d: No match expected, but one found. (UTF8)", line);
3601 failed = TRUE;
3602 }
3603 goto cleanupAndReturn;
3604 }
3605
3606 REGEX_CHECK_STATUS_L(line);
3607 for (i=0; i<=matcher->groupCount(); i++) {
3608 int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3609 int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3610 if (matcher->start(i, status) != expectedStart) {
3611 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
3612 line, i, expectedStart, matcher->start(i, status));
3613 failed = TRUE;
3614 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3615 } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3616 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
3617 line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3618 failed = TRUE;
3619 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3620 }
3621
3622 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3623 int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3624 if (matcher->end(i, status) != expectedEnd) {
3625 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
3626 line, i, expectedEnd, matcher->end(i, status));
3627 failed = TRUE;
3628 // Error on end position; keep going; real error is probably yet to come as group
3629 // end positions work from end of the input data towards the front.
3630 } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3631 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
3632 line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3633 failed = TRUE;
3634 // Error on end position; keep going; real error is probably yet to come as group
3635 // end positions work from end of the input data towards the front.
3636 }
3637 }
3638 if ( matcher->groupCount()+1 < groupStarts.size()) {
3639 errln("Error at line %d: Expected %d capture groups, found %d.",
3640 line, groupStarts.size()-1, matcher->groupCount());
3641 failed = TRUE;
3642 }
3643 else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3644 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3645 line, groupStarts.size()-1, UTF8Matcher->groupCount());
3646 failed = TRUE;
3647 }
3648
3649 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3650 matcher->requireEnd() == TRUE) {
3651 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line);
3652 failed = TRUE;
3653 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3654 UTF8Matcher->requireEnd() == TRUE) {
3655 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line);
3656 failed = TRUE;
3657 }
3658
3659 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3660 matcher->requireEnd() == FALSE) {
3661 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line);
3662 failed = TRUE;
3663 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false
3664 UTF8Matcher->requireEnd() == FALSE) {
3665 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line);
3666 failed = TRUE;
3667 }
3668
3669 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3670 matcher->hitEnd() == TRUE) {
3671 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line);
3672 failed = TRUE;
3673 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3674 UTF8Matcher->hitEnd() == TRUE) {
3675 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line);
3676 failed = TRUE;
3677 }
3678
3679 if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3680 matcher->hitEnd() == FALSE) {
3681 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line);
3682 failed = TRUE;
3683 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3684 UTF8Matcher->hitEnd() == FALSE) {
3685 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line);
3686 failed = TRUE;
3687 }
3688
3689
3690 cleanupAndReturn:
3691 if (failed) {
3692 infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" "
3693 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");
3694 // callerPattern->dump();
3695 }
3696 delete parseMatcher;
3697 delete parsePat;
3698 delete UTF8Matcher;
3699 delete UTF8Pattern;
3700 delete matcher;
3701 delete callerPattern;
3702
3703 utext_close(&inputText);
3704 delete[] inputChars;
3705 utext_close(&patternText);
3706 delete[] patternChars;
3707 ucnv_close(UTF8Converter);
3708 }
3709
3710
3711
3712
3713 //---------------------------------------------------------------------------
3714 //
3715 // Errors Check for error handling in patterns.
3716 //
3717 //---------------------------------------------------------------------------
void RegexTest::Errors() {
    // Feeds a series of malformed patterns to the regex compiler and checks how
    // each one is rejected.
    //
    // NOTE(review): REGEX_ERR is a macro defined outside this part of the file.
    // Judging from the call sites its arguments are
    //     (pattern, expected error line, expected error column, expected UErrorCode)
    // -- confirm against the macro definition before relying on this.

    // \escape sequences that aren't implemented yet.
    //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);

    // Missing close parentheses
    REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);

    // Extra close paren
    REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);

    // Look-ahead, Look-behind
    // TODO: add tests for unbounded length look-behinds.
    REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct

    // Attempt to use non-default flags
    {
        UParseError pe;
        UErrorCode status = U_ZERO_ERROR;
        // This particular flag combination is expected to be rejected by
        // compile() with U_REGEX_UNIMPLEMENTED (see the assertion below).
        int32_t flags = UREGEX_CANON_EQ |
                        UREGEX_COMMENTS | UREGEX_DOTALL |
                        UREGEX_MULTILINE;
        RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
        REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
        // Release whatever compile() handed back; delete on a NULL pointer is a no-op.
        delete pat1;
    }


    // Quantifiers are allowed only after something that can be quantified.
    REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
    // The pattern below contains an embedded new-line; the expected position
    // arguments (2, 5) refer to its second line.
    REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
    REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);

    // Mal-formed {min,max} quantifiers
    REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
    REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
    REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
    REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
    REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
    REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
    REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan
    REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format
    REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);

    // Ticket 5389
    REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);

    // Invalid Back Reference \0
    // For ICU 3.8 and earlier
    // For ICU versions newer than 3.8, \0 introduces an octal escape.
    //
    // Either way this test expects the pattern to be rejected with
    // U_REGEX_BAD_ESCAPE_SEQUENCE.
    REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);

}
3775
3776
3777 //-------------------------------------------------------------------------------
3778 //
3779 // Read a text data file, convert it to UChars, and return the data
3780 // in one big UChar * buffer, which the caller must delete.
3781 //
3782 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int32_t & ulen,const char * defEncoding,UErrorCode & status)3783 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3784 const char *defEncoding, UErrorCode &status) {
3785 UChar *retPtr = NULL;
3786 char *fileBuf = NULL;
3787 UConverter* conv = NULL;
3788 FILE *f = NULL;
3789
3790 ulen = 0;
3791 if (U_FAILURE(status)) {
3792 return retPtr;
3793 }
3794
3795 //
3796 // Open the file.
3797 //
3798 f = fopen(fileName, "rb");
3799 if (f == 0) {
3800 dataerrln("Error opening test data file %s\n", fileName);
3801 status = U_FILE_ACCESS_ERROR;
3802 return NULL;
3803 }
3804 //
3805 // Read it in
3806 //
3807 int32_t fileSize;
3808 int32_t amt_read;
3809
3810 fseek( f, 0, SEEK_END);
3811 fileSize = ftell(f);
3812 fileBuf = new char[fileSize];
3813 fseek(f, 0, SEEK_SET);
3814 amt_read = fread(fileBuf, 1, fileSize, f);
3815 if (amt_read != fileSize || fileSize <= 0) {
3816 errln("Error reading test data file.");
3817 goto cleanUpAndReturn;
3818 }
3819
3820 //
3821 // Look for a Unicode Signature (BOM) on the data just read
3822 //
3823 int32_t signatureLength;
3824 const char * fileBufC;
3825 const char* encoding;
3826
3827 fileBufC = fileBuf;
3828 encoding = ucnv_detectUnicodeSignature(
3829 fileBuf, fileSize, &signatureLength, &status);
3830 if(encoding!=NULL ){
3831 fileBufC += signatureLength;
3832 fileSize -= signatureLength;
3833 } else {
3834 encoding = defEncoding;
3835 if (strcmp(encoding, "utf-8") == 0) {
3836 errln("file %s is missing its BOM", fileName);
3837 }
3838 }
3839
3840 //
3841 // Open a converter to take the rule file to UTF-16
3842 //
3843 conv = ucnv_open(encoding, &status);
3844 if (U_FAILURE(status)) {
3845 goto cleanUpAndReturn;
3846 }
3847
3848 //
3849 // Convert the rules to UChar.
3850 // Preflight first to determine required buffer size.
3851 //
3852 ulen = ucnv_toUChars(conv,
3853 NULL, // dest,
3854 0, // destCapacity,
3855 fileBufC,
3856 fileSize,
3857 &status);
3858 if (status == U_BUFFER_OVERFLOW_ERROR) {
3859 // Buffer Overflow is expected from the preflight operation.
3860 status = U_ZERO_ERROR;
3861
3862 retPtr = new UChar[ulen+1];
3863 ucnv_toUChars(conv,
3864 retPtr, // dest,
3865 ulen+1,
3866 fileBufC,
3867 fileSize,
3868 &status);
3869 }
3870
3871 cleanUpAndReturn:
3872 fclose(f);
3873 delete[] fileBuf;
3874 ucnv_close(conv);
3875 if (U_FAILURE(status)) {
3876 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3877 delete []retPtr;
3878 retPtr = 0;
3879 ulen = 0;
3880 };
3881 return retPtr;
3882 }
3883
3884
3885 //-------------------------------------------------------------------------------
3886 //
3887 // PerlTests - Run Perl's regular expression tests
3888 // The input file for this test is re_tests, the standard regular
3889 // expression test data distributed with the Perl source code.
3890 //
3891 // Here is Perl's description of the test data file:
3892 //
3893 // # The tests are in a separate file 't/op/re_tests'.
3894 // # Each line in that file is a separate test.
3895 // # There are five columns, separated by tabs.
3896 // #
3897 // # Column 1 contains the pattern, optionally enclosed in C<''>.
3898 // # Modifiers can be put after the closing C<'>.
3899 // #
3900 // # Column 2 contains the string to be matched.
3901 // #
3902 // # Column 3 contains the expected result:
3903 // # y expect a match
3904 // # n expect no match
3905 // # c expect an error
3906 // # B test exposes a known bug in Perl, should be skipped
3907 // # b test exposes a known bug in Perl, should be skipped if noamp
3908 // #
3909 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3910 // #
3911 // # Column 4 contains a string, usually C<$&>.
3912 // #
3913 // # Column 5 contains the expected result of double-quote
3914 // # interpolating that string after the match, or start of error message.
3915 // #
3916 // # Column 6, if present, contains a reason why the test is skipped.
3917 // # This is printed with "skipped", for harness to pick up.
3918 // #
3919 // # \n in the tests are interpolated, as are variables of the form ${\w+}.
3920 // #
3921 // # If you want to add a regular expression test that can't be expressed
3922 // # in this format, don't add it here: put it in op/pat.t instead.
3923 //
3924 // For ICU, if field 3 contains an 'i', the test will be skipped.
//      The test exposes some known incompatibility between ICU and Perl regexps.
3926 // (The i is in addition to whatever was there before.)
3927 //
3928 //-------------------------------------------------------------------------------
PerlTests()3929 void RegexTest::PerlTests() {
3930 char tdd[2048];
3931 const char *srcPath;
3932 UErrorCode status = U_ZERO_ERROR;
3933 UParseError pe;
3934
3935 //
3936 // Open and read the test data file.
3937 //
3938 srcPath=getPath(tdd, "re_tests.txt");
3939 if(srcPath==NULL) {
3940 return; /* something went wrong, error already output */
3941 }
3942
3943 int32_t len;
3944 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3945 if (U_FAILURE(status)) {
3946 return; /* something went wrong, error already output */
3947 }
3948
3949 //
3950 // Put the test data into a UnicodeString
3951 //
3952 UnicodeString testDataString(FALSE, testData, len);
3953
3954 //
3955 // Regex to break the input file into lines, and strip the new lines.
3956 // One line per match, capture group one is the desired data.
3957 //
3958 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
3959 if (U_FAILURE(status)) {
3960 dataerrln("RegexPattern::compile() error");
3961 return;
3962 }
3963 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
3964
3965 //
3966 // Regex to split a test file line into fields.
3967 // There are six fields, separated by tabs.
3968 //
3969 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
3970
3971 //
3972 // Regex to identify test patterns with flag settings, and to separate them.
3973 // Test patterns with flags look like 'pattern'i
3974 // Test patterns without flags are not quoted: pattern
3975 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
3976 //
3977 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
3978 RegexMatcher* flagMat = flagPat->matcher(status);
3979
3980 //
3981 // The Perl tests reference several perl-isms, which are evaluated/substituted
3982 // in the test data. Not being perl, this must be done explicitly. Here
3983 // are string constants and REs for these constructs.
3984 //
3985 UnicodeString nulnulSrc("${nulnul}");
3986 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
3987 nulnul = nulnul.unescape();
3988
3989 UnicodeString ffffSrc("${ffff}");
3990 UnicodeString ffff("\\uffff", -1, US_INV);
3991 ffff = ffff.unescape();
3992
3993 // regexp for $-[0], $+[2], etc.
3994 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
3995 RegexMatcher *groupsMat = groupsPat->matcher(status);
3996
3997 // regexp for $0, $1, $2, etc.
3998 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
3999 RegexMatcher *cgMat = cgPat->matcher(status);
4000
4001
4002 //
4003 // Main Loop for the Perl Tests, runs once per line from the
4004 // test data file.
4005 //
4006 int32_t lineNum = 0;
4007 int32_t skippedUnimplementedCount = 0;
4008 while (lineMat->find()) {
4009 lineNum++;
4010
4011 //
4012 // Get a line, break it into its fields, do the Perl
4013 // variable substitutions.
4014 //
4015 UnicodeString line = lineMat->group(1, status);
4016 UnicodeString fields[7];
4017 fieldPat->split(line, fields, 7, status);
4018
4019 flagMat->reset(fields[0]);
4020 flagMat->matches(status);
4021 UnicodeString pattern = flagMat->group(2, status);
4022 pattern.findAndReplace("${bang}", "!");
4023 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4024 pattern.findAndReplace(ffffSrc, ffff);
4025
4026 //
4027 // Identify patterns that include match flag settings,
4028 // split off the flags, remove the extra quotes.
4029 //
4030 UnicodeString flagStr = flagMat->group(3, status);
4031 if (U_FAILURE(status)) {
4032 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4033 return;
4034 }
4035 int32_t flags = 0;
4036 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4037 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4038 const UChar UChar_m = 0x6d;
4039 const UChar UChar_x = 0x78;
4040 const UChar UChar_y = 0x79;
4041 if (flagStr.indexOf(UChar_i) != -1) {
4042 flags |= UREGEX_CASE_INSENSITIVE;
4043 }
4044 if (flagStr.indexOf(UChar_m) != -1) {
4045 flags |= UREGEX_MULTILINE;
4046 }
4047 if (flagStr.indexOf(UChar_x) != -1) {
4048 flags |= UREGEX_COMMENTS;
4049 }
4050
4051 //
4052 // Compile the test pattern.
4053 //
4054 status = U_ZERO_ERROR;
4055 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4056 if (status == U_REGEX_UNIMPLEMENTED) {
4057 //
4058 // Test of a feature that is planned for ICU, but not yet implemented.
4059 // skip the test.
4060 skippedUnimplementedCount++;
4061 delete testPat;
4062 status = U_ZERO_ERROR;
4063 continue;
4064 }
4065
4066 if (U_FAILURE(status)) {
4067 // Some tests are supposed to generate errors.
4068 // Only report an error for tests that are supposed to succeed.
4069 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4070 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4071 {
4072 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4073 }
4074 status = U_ZERO_ERROR;
4075 delete testPat;
4076 continue;
4077 }
4078
4079 if (fields[2].indexOf(UChar_i) >= 0) {
4080 // ICU should skip this test.
4081 delete testPat;
4082 continue;
4083 }
4084
4085 if (fields[2].indexOf(UChar_c) >= 0) {
4086 // This pattern should have caused a compilation error, but didn't/
4087 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4088 delete testPat;
4089 continue;
4090 }
4091
4092 //
4093 // replace the Perl variables that appear in some of the
4094 // match data strings.
4095 //
4096 UnicodeString matchString = fields[1];
4097 matchString.findAndReplace(nulnulSrc, nulnul);
4098 matchString.findAndReplace(ffffSrc, ffff);
4099
4100 // Replace any \n in the match string with an actual new-line char.
4101 // Don't do full unescape, as this unescapes more than Perl does, which
4102 // causes other spurious failures in the tests.
4103 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4104
4105
4106
4107 //
4108 // Run the test, check for expected match/don't match result.
4109 //
4110 RegexMatcher *testMat = testPat->matcher(matchString, status);
4111 UBool found = testMat->find();
4112 UBool expected = FALSE;
4113 if (fields[2].indexOf(UChar_y) >=0) {
4114 expected = TRUE;
4115 }
4116 if (expected != found) {
4117 errln("line %d: Expected %smatch, got %smatch",
4118 lineNum, expected?"":"no ", found?"":"no " );
4119 continue;
4120 }
4121
4122 // Don't try to check expected results if there is no match.
4123 // (Some have stuff in the expected fields)
4124 if (!found) {
4125 delete testMat;
4126 delete testPat;
4127 continue;
4128 }
4129
4130 //
4131 // Interpret the Perl expression from the fourth field of the data file,
4132 // building up an ICU string from the results of the ICU match.
4133 // The Perl expression will contain references to the results of
4134 // a regex match, including the matched string, capture group strings,
4135 // group starting and ending indicies, etc.
4136 //
4137 UnicodeString resultString;
4138 UnicodeString perlExpr = fields[3];
4139 #if SUPPORT_MUTATING_INPUT_STRING
4140 groupsMat->reset(perlExpr);
4141 cgMat->reset(perlExpr);
4142 #endif
4143
4144 while (perlExpr.length() > 0) {
4145 #if !SUPPORT_MUTATING_INPUT_STRING
4146 // Perferred usage. Reset after any modification to input string.
4147 groupsMat->reset(perlExpr);
4148 cgMat->reset(perlExpr);
4149 #endif
4150
4151 if (perlExpr.startsWith("$&")) {
4152 resultString.append(testMat->group(status));
4153 perlExpr.remove(0, 2);
4154 }
4155
4156 else if (groupsMat->lookingAt(status)) {
4157 // $-[0] $+[2] etc.
4158 UnicodeString digitString = groupsMat->group(2, status);
4159 int32_t t = 0;
4160 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4161 UnicodeString plusOrMinus = groupsMat->group(1, status);
4162 int32_t matchPosition;
4163 if (plusOrMinus.compare("+") == 0) {
4164 matchPosition = testMat->end(groupNum, status);
4165 } else {
4166 matchPosition = testMat->start(groupNum, status);
4167 }
4168 if (matchPosition != -1) {
4169 ICU_Utility::appendNumber(resultString, matchPosition);
4170 }
4171 perlExpr.remove(0, groupsMat->end(status));
4172 }
4173
4174 else if (cgMat->lookingAt(status)) {
4175 // $1, $2, $3, etc.
4176 UnicodeString digitString = cgMat->group(1, status);
4177 int32_t t = 0;
4178 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4179 if (U_SUCCESS(status)) {
4180 resultString.append(testMat->group(groupNum, status));
4181 status = U_ZERO_ERROR;
4182 }
4183 perlExpr.remove(0, cgMat->end(status));
4184 }
4185
4186 else if (perlExpr.startsWith("@-")) {
4187 int32_t i;
4188 for (i=0; i<=testMat->groupCount(); i++) {
4189 if (i>0) {
4190 resultString.append(" ");
4191 }
4192 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4193 }
4194 perlExpr.remove(0, 2);
4195 }
4196
4197 else if (perlExpr.startsWith("@+")) {
4198 int32_t i;
4199 for (i=0; i<=testMat->groupCount(); i++) {
4200 if (i>0) {
4201 resultString.append(" ");
4202 }
4203 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4204 }
4205 perlExpr.remove(0, 2);
4206 }
4207
4208 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4209 // or as an escaped sequence (e.g. \n)
4210 if (perlExpr.length() > 1) {
4211 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4212 }
4213 UChar c = perlExpr.charAt(0);
4214 switch (c) {
4215 case 'n': c = '\n'; break;
4216 // add any other escape sequences that show up in the test expected results.
4217 }
4218 resultString.append(c);
4219 perlExpr.remove(0, 1);
4220 }
4221
4222 else {
4223 // Any characters from the perl expression that we don't explicitly
4224 // recognize before here are assumed to be literals and copied
4225 // as-is to the expected results.
4226 resultString.append(perlExpr.charAt(0));
4227 perlExpr.remove(0, 1);
4228 }
4229
4230 if (U_FAILURE(status)) {
4231 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4232 break;
4233 }
4234 }
4235
4236 //
4237 // Expected Results Compare
4238 //
4239 UnicodeString expectedS(fields[4]);
4240 expectedS.findAndReplace(nulnulSrc, nulnul);
4241 expectedS.findAndReplace(ffffSrc, ffff);
4242 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4243
4244
4245 if (expectedS.compare(resultString) != 0) {
4246 err("Line %d: Incorrect perl expression results.", lineNum);
4247 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4248 }
4249
4250 delete testMat;
4251 delete testPat;
4252 }
4253
4254 //
4255 // All done. Clean up allocated stuff.
4256 //
4257 delete cgMat;
4258 delete cgPat;
4259
4260 delete groupsMat;
4261 delete groupsPat;
4262
4263 delete flagMat;
4264 delete flagPat;
4265
4266 delete lineMat;
4267 delete linePat;
4268
4269 delete fieldPat;
4270 delete [] testData;
4271
4272
4273 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4274
4275 }
4276
4277
4278 //-------------------------------------------------------------------------------
4279 //
4280 // PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
4281 // (instead of using UnicodeStrings) to test the alternate engine.
4282 // The input file for this test is re_tests, the standard regular
4283 // expression test data distributed with the Perl source code.
4284 // See PerlTests() for more information.
4285 //
4286 //-------------------------------------------------------------------------------
PerlTestsUTF8()4287 void RegexTest::PerlTestsUTF8() {
4288 char tdd[2048];
4289 const char *srcPath;
4290 UErrorCode status = U_ZERO_ERROR;
4291 UParseError pe;
4292 LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4293 UText patternText = UTEXT_INITIALIZER;
4294 char *patternChars = NULL;
4295 int32_t patternLength;
4296 int32_t patternCapacity = 0;
4297 UText inputText = UTEXT_INITIALIZER;
4298 char *inputChars = NULL;
4299 int32_t inputLength;
4300 int32_t inputCapacity = 0;
4301
4302 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4303
4304 //
4305 // Open and read the test data file.
4306 //
4307 srcPath=getPath(tdd, "re_tests.txt");
4308 if(srcPath==NULL) {
4309 return; /* something went wrong, error already output */
4310 }
4311
4312 int32_t len;
4313 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4314 if (U_FAILURE(status)) {
4315 return; /* something went wrong, error already output */
4316 }
4317
4318 //
4319 // Put the test data into a UnicodeString
4320 //
4321 UnicodeString testDataString(FALSE, testData, len);
4322
4323 //
4324 // Regex to break the input file into lines, and strip the new lines.
4325 // One line per match, capture group one is the desired data.
4326 //
4327 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4328 if (U_FAILURE(status)) {
4329 dataerrln("RegexPattern::compile() error");
4330 return;
4331 }
4332 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4333
4334 //
4335 // Regex to split a test file line into fields.
4336 // There are six fields, separated by tabs.
4337 //
4338 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4339
4340 //
4341 // Regex to identify test patterns with flag settings, and to separate them.
4342 // Test patterns with flags look like 'pattern'i
4343 // Test patterns without flags are not quoted: pattern
4344 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4345 //
4346 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4347 RegexMatcher* flagMat = flagPat->matcher(status);
4348
4349 //
4350 // The Perl tests reference several perl-isms, which are evaluated/substituted
4351 // in the test data. Not being perl, this must be done explicitly. Here
4352 // are string constants and REs for these constructs.
4353 //
4354 UnicodeString nulnulSrc("${nulnul}");
4355 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4356 nulnul = nulnul.unescape();
4357
4358 UnicodeString ffffSrc("${ffff}");
4359 UnicodeString ffff("\\uffff", -1, US_INV);
4360 ffff = ffff.unescape();
4361
4362 // regexp for $-[0], $+[2], etc.
4363 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4364 RegexMatcher *groupsMat = groupsPat->matcher(status);
4365
4366 // regexp for $0, $1, $2, etc.
4367 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4368 RegexMatcher *cgMat = cgPat->matcher(status);
4369
4370
4371 //
4372 // Main Loop for the Perl Tests, runs once per line from the
4373 // test data file.
4374 //
4375 int32_t lineNum = 0;
4376 int32_t skippedUnimplementedCount = 0;
4377 while (lineMat->find()) {
4378 lineNum++;
4379
4380 //
4381 // Get a line, break it into its fields, do the Perl
4382 // variable substitutions.
4383 //
4384 UnicodeString line = lineMat->group(1, status);
4385 UnicodeString fields[7];
4386 fieldPat->split(line, fields, 7, status);
4387
4388 flagMat->reset(fields[0]);
4389 flagMat->matches(status);
4390 UnicodeString pattern = flagMat->group(2, status);
4391 pattern.findAndReplace("${bang}", "!");
4392 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4393 pattern.findAndReplace(ffffSrc, ffff);
4394
4395 //
4396 // Identify patterns that include match flag settings,
4397 // split off the flags, remove the extra quotes.
4398 //
4399 UnicodeString flagStr = flagMat->group(3, status);
4400 if (U_FAILURE(status)) {
4401 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4402 return;
4403 }
4404 int32_t flags = 0;
4405 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4406 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4407 const UChar UChar_m = 0x6d;
4408 const UChar UChar_x = 0x78;
4409 const UChar UChar_y = 0x79;
4410 if (flagStr.indexOf(UChar_i) != -1) {
4411 flags |= UREGEX_CASE_INSENSITIVE;
4412 }
4413 if (flagStr.indexOf(UChar_m) != -1) {
4414 flags |= UREGEX_MULTILINE;
4415 }
4416 if (flagStr.indexOf(UChar_x) != -1) {
4417 flags |= UREGEX_COMMENTS;
4418 }
4419
4420 //
4421 // Put the pattern in a UTF-8 UText
4422 //
4423 status = U_ZERO_ERROR;
4424 patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4425 if (status == U_BUFFER_OVERFLOW_ERROR) {
4426 status = U_ZERO_ERROR;
4427 delete[] patternChars;
4428 patternCapacity = patternLength + 1;
4429 patternChars = new char[patternCapacity];
4430 pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4431 }
4432 utext_openUTF8(&patternText, patternChars, patternLength, &status);
4433
4434 //
4435 // Compile the test pattern.
4436 //
4437 RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4438 if (status == U_REGEX_UNIMPLEMENTED) {
4439 //
4440 // Test of a feature that is planned for ICU, but not yet implemented.
4441 // skip the test.
4442 skippedUnimplementedCount++;
4443 delete testPat;
4444 status = U_ZERO_ERROR;
4445 continue;
4446 }
4447
4448 if (U_FAILURE(status)) {
4449 // Some tests are supposed to generate errors.
4450 // Only report an error for tests that are supposed to succeed.
4451 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4452 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4453 {
4454 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4455 }
4456 status = U_ZERO_ERROR;
4457 delete testPat;
4458 continue;
4459 }
4460
4461 if (fields[2].indexOf(UChar_i) >= 0) {
4462 // ICU should skip this test.
4463 delete testPat;
4464 continue;
4465 }
4466
4467 if (fields[2].indexOf(UChar_c) >= 0) {
4468 // This pattern should have caused a compilation error, but didn't/
4469 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4470 delete testPat;
4471 continue;
4472 }
4473
4474
4475 //
4476 // replace the Perl variables that appear in some of the
4477 // match data strings.
4478 //
4479 UnicodeString matchString = fields[1];
4480 matchString.findAndReplace(nulnulSrc, nulnul);
4481 matchString.findAndReplace(ffffSrc, ffff);
4482
4483 // Replace any \n in the match string with an actual new-line char.
4484 // Don't do full unescape, as this unescapes more than Perl does, which
4485 // causes other spurious failures in the tests.
4486 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4487
4488 //
4489 // Put the input in a UTF-8 UText
4490 //
4491 status = U_ZERO_ERROR;
4492 inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4493 if (status == U_BUFFER_OVERFLOW_ERROR) {
4494 status = U_ZERO_ERROR;
4495 delete[] inputChars;
4496 inputCapacity = inputLength + 1;
4497 inputChars = new char[inputCapacity];
4498 matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4499 }
4500 utext_openUTF8(&inputText, inputChars, inputLength, &status);
4501
4502 //
4503 // Run the test, check for expected match/don't match result.
4504 //
4505 RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4506 UBool found = testMat->find();
4507 UBool expected = FALSE;
4508 if (fields[2].indexOf(UChar_y) >=0) {
4509 expected = TRUE;
4510 }
4511 if (expected != found) {
4512 errln("line %d: Expected %smatch, got %smatch",
4513 lineNum, expected?"":"no ", found?"":"no " );
4514 continue;
4515 }
4516
4517 // Don't try to check expected results if there is no match.
4518 // (Some have stuff in the expected fields)
4519 if (!found) {
4520 delete testMat;
4521 delete testPat;
4522 continue;
4523 }
4524
4525 //
4526 // Interpret the Perl expression from the fourth field of the data file,
4527 // building up an ICU string from the results of the ICU match.
4528 // The Perl expression will contain references to the results of
4529 // a regex match, including the matched string, capture group strings,
4530 // group starting and ending indicies, etc.
4531 //
4532 UnicodeString resultString;
4533 UnicodeString perlExpr = fields[3];
4534
4535 while (perlExpr.length() > 0) {
4536 groupsMat->reset(perlExpr);
4537 cgMat->reset(perlExpr);
4538
4539 if (perlExpr.startsWith("$&")) {
4540 resultString.append(testMat->group(status));
4541 perlExpr.remove(0, 2);
4542 }
4543
4544 else if (groupsMat->lookingAt(status)) {
4545 // $-[0] $+[2] etc.
4546 UnicodeString digitString = groupsMat->group(2, status);
4547 int32_t t = 0;
4548 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4549 UnicodeString plusOrMinus = groupsMat->group(1, status);
4550 int32_t matchPosition;
4551 if (plusOrMinus.compare("+") == 0) {
4552 matchPosition = testMat->end(groupNum, status);
4553 } else {
4554 matchPosition = testMat->start(groupNum, status);
4555 }
4556 if (matchPosition != -1) {
4557 ICU_Utility::appendNumber(resultString, matchPosition);
4558 }
4559 perlExpr.remove(0, groupsMat->end(status));
4560 }
4561
4562 else if (cgMat->lookingAt(status)) {
4563 // $1, $2, $3, etc.
4564 UnicodeString digitString = cgMat->group(1, status);
4565 int32_t t = 0;
4566 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4567 if (U_SUCCESS(status)) {
4568 resultString.append(testMat->group(groupNum, status));
4569 status = U_ZERO_ERROR;
4570 }
4571 perlExpr.remove(0, cgMat->end(status));
4572 }
4573
4574 else if (perlExpr.startsWith("@-")) {
4575 int32_t i;
4576 for (i=0; i<=testMat->groupCount(); i++) {
4577 if (i>0) {
4578 resultString.append(" ");
4579 }
4580 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4581 }
4582 perlExpr.remove(0, 2);
4583 }
4584
4585 else if (perlExpr.startsWith("@+")) {
4586 int32_t i;
4587 for (i=0; i<=testMat->groupCount(); i++) {
4588 if (i>0) {
4589 resultString.append(" ");
4590 }
4591 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4592 }
4593 perlExpr.remove(0, 2);
4594 }
4595
4596 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4597 // or as an escaped sequence (e.g. \n)
4598 if (perlExpr.length() > 1) {
4599 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4600 }
4601 UChar c = perlExpr.charAt(0);
4602 switch (c) {
4603 case 'n': c = '\n'; break;
4604 // add any other escape sequences that show up in the test expected results.
4605 }
4606 resultString.append(c);
4607 perlExpr.remove(0, 1);
4608 }
4609
4610 else {
4611 // Any characters from the perl expression that we don't explicitly
4612 // recognize before here are assumed to be literals and copied
4613 // as-is to the expected results.
4614 resultString.append(perlExpr.charAt(0));
4615 perlExpr.remove(0, 1);
4616 }
4617
4618 if (U_FAILURE(status)) {
4619 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4620 break;
4621 }
4622 }
4623
4624 //
4625 // Expected Results Compare
4626 //
4627 UnicodeString expectedS(fields[4]);
4628 expectedS.findAndReplace(nulnulSrc, nulnul);
4629 expectedS.findAndReplace(ffffSrc, ffff);
4630 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4631
4632
4633 if (expectedS.compare(resultString) != 0) {
4634 err("Line %d: Incorrect perl expression results.", lineNum);
4635 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4636 }
4637
4638 delete testMat;
4639 delete testPat;
4640 }
4641
4642 //
4643 // All done. Clean up allocated stuff.
4644 //
4645 delete cgMat;
4646 delete cgPat;
4647
4648 delete groupsMat;
4649 delete groupsPat;
4650
4651 delete flagMat;
4652 delete flagPat;
4653
4654 delete lineMat;
4655 delete linePat;
4656
4657 delete fieldPat;
4658 delete [] testData;
4659
4660 utext_close(&patternText);
4661 utext_close(&inputText);
4662
4663 delete [] patternChars;
4664 delete [] inputChars;
4665
4666
4667 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4668
4669 }
4670
4671
4672 //--------------------------------------------------------------
4673 //
4674 // Bug6149 Verify limits to heap expansion for backtrack stack.
4675 // Use this pattern,
4676 // "(a?){1,}"
4677 // The zero-length match will repeat forever.
4678 // (That this goes into a loop is another bug)
4679 //
4680 //---------------------------------------------------------------
Bug6149()4681 void RegexTest::Bug6149() {
4682 UnicodeString pattern("(a?){1,}");
4683 UnicodeString s("xyz");
4684 uint32_t flags = 0;
4685 UErrorCode status = U_ZERO_ERROR;
4686
4687 RegexMatcher matcher(pattern, s, flags, status);
4688 UBool result = false;
4689 REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4690 REGEX_ASSERT(result == FALSE);
4691 }
4692
4693
//
// Callbacks() Test the callback function.
// When set, callbacks occur periodically during matching operations,
// giving the application code the ability to abort the operation
// before its normal completion.
//
4700
4701 struct callBackContext {
4702 RegexTest *test;
4703 int32_t maxCalls;
4704 int32_t numCalls;
4705 int32_t lastSteps;
resetcallBackContext4706 void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4707 };
4708
4709 U_CDECL_BEGIN
4710 static UBool U_CALLCONV
testCallBackFn(const void * context,int32_t steps)4711 testCallBackFn(const void *context, int32_t steps) {
4712 callBackContext *info = (callBackContext *)context;
4713 if (info->lastSteps+1 != steps) {
4714 info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps);
4715 }
4716 info->lastSteps = steps;
4717 info->numCalls++;
4718 return (info->numCalls < info->maxCalls);
4719 }
4720 U_CDECL_END
4721
Callbacks()4722 void RegexTest::Callbacks() {
4723 {
4724 // Getter returns NULLs if no callback has been set
4725
4726 // The variables that the getter will fill in.
4727 // Init to non-null values so that the action of the getter can be seen.
4728 const void *returnedContext = &returnedContext;
4729 URegexMatchCallback *returnedFn = &testCallBackFn;
4730
4731 UErrorCode status = U_ZERO_ERROR;
4732 RegexMatcher matcher("x", 0, status);
4733 REGEX_CHECK_STATUS;
4734 matcher.getMatchCallback(returnedFn, returnedContext, status);
4735 REGEX_CHECK_STATUS;
4736 REGEX_ASSERT(returnedFn == NULL);
4737 REGEX_ASSERT(returnedContext == NULL);
4738 }
4739
4740 {
4741 // Set and Get work
4742 callBackContext cbInfo = {this, 0, 0, 0};
4743 const void *returnedContext;
4744 URegexMatchCallback *returnedFn;
4745 UErrorCode status = U_ZERO_ERROR;
4746 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.
4747 REGEX_CHECK_STATUS;
4748 matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4749 REGEX_CHECK_STATUS;
4750 matcher.getMatchCallback(returnedFn, returnedContext, status);
4751 REGEX_CHECK_STATUS;
4752 REGEX_ASSERT(returnedFn == testCallBackFn);
4753 REGEX_ASSERT(returnedContext == &cbInfo);
4754
4755 // A short-running match shouldn't invoke the callback
4756 status = U_ZERO_ERROR;
4757 cbInfo.reset(1);
4758 UnicodeString s = "xxx";
4759 matcher.reset(s);
4760 REGEX_ASSERT(matcher.matches(status));
4761 REGEX_CHECK_STATUS;
4762 REGEX_ASSERT(cbInfo.numCalls == 0);
4763
4764 // A medium-length match that runs long enough to invoke the
4765 // callback, but not so long that the callback aborts it.
4766 status = U_ZERO_ERROR;
4767 cbInfo.reset(4);
4768 s = "aaaaaaaaaaaaaaaaaaab";
4769 matcher.reset(s);
4770 REGEX_ASSERT(matcher.matches(status)==FALSE);
4771 REGEX_CHECK_STATUS;
4772 REGEX_ASSERT(cbInfo.numCalls > 0);
4773
4774 // A longer running match that the callback function will abort.
4775 status = U_ZERO_ERROR;
4776 cbInfo.reset(4);
4777 s = "aaaaaaaaaaaaaaaaaaaaaaab";
4778 matcher.reset(s);
4779 REGEX_ASSERT(matcher.matches(status)==FALSE);
4780 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4781 REGEX_ASSERT(cbInfo.numCalls == 4);
4782 }
4783
4784
4785 }
4786
4787
//
// FindProgressCallbacks() Test the find "progress" callback function.
// When set, the find progress callback will be invoked during find operations
// after each return from a match attempt, giving the application the opportunity
// to terminate a long-running find operation before its normal completion.
//
4794
4795 struct progressCallBackContext {
4796 RegexTest *test;
4797 int64_t lastIndex;
4798 int32_t maxCalls;
4799 int32_t numCalls;
resetprogressCallBackContext4800 void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4801 };
4802
4803 U_CDECL_BEGIN
4804 static UBool U_CALLCONV
testProgressCallBackFn(const void * context,int64_t matchIndex)4805 testProgressCallBackFn(const void *context, int64_t matchIndex) {
4806 progressCallBackContext *info = (progressCallBackContext *)context;
4807 info->numCalls++;
4808 info->lastIndex = matchIndex;
4809 // info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4810 return (info->numCalls < info->maxCalls);
4811 }
4812 U_CDECL_END
4813
FindProgressCallbacks()4814 void RegexTest::FindProgressCallbacks() {
4815 {
4816 // Getter returns NULLs if no callback has been set
4817
4818 // The variables that the getter will fill in.
4819 // Init to non-null values so that the action of the getter can be seen.
4820 const void *returnedContext = &returnedContext;
4821 URegexFindProgressCallback *returnedFn = &testProgressCallBackFn;
4822
4823 UErrorCode status = U_ZERO_ERROR;
4824 RegexMatcher matcher("x", 0, status);
4825 REGEX_CHECK_STATUS;
4826 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4827 REGEX_CHECK_STATUS;
4828 REGEX_ASSERT(returnedFn == NULL);
4829 REGEX_ASSERT(returnedContext == NULL);
4830 }
4831
4832 {
4833 // Set and Get work
4834 progressCallBackContext cbInfo = {this, 0, 0, 0};
4835 const void *returnedContext;
4836 URegexFindProgressCallback *returnedFn;
4837 UErrorCode status = U_ZERO_ERROR;
4838 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.
4839 REGEX_CHECK_STATUS;
4840 matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4841 REGEX_CHECK_STATUS;
4842 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4843 REGEX_CHECK_STATUS;
4844 REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4845 REGEX_ASSERT(returnedContext == &cbInfo);
4846
4847 // A short-running match should NOT invoke the callback.
4848 status = U_ZERO_ERROR;
4849 cbInfo.reset(100);
4850 UnicodeString s = "abxxx";
4851 matcher.reset(s);
4852 #if 0
4853 matcher.setTrace(TRUE);
4854 #endif
4855 REGEX_ASSERT(matcher.find(0, status));
4856 REGEX_CHECK_STATUS;
4857 REGEX_ASSERT(cbInfo.numCalls == 0);
4858
4859 // A medium running match that causes matcher.find() to invoke our callback for each index.
4860 status = U_ZERO_ERROR;
4861 s = "aaaaaaaaaaaaaaaaaaab";
4862 cbInfo.reset(s.length()); // Some upper limit for number of calls that is greater than size of our input string
4863 matcher.reset(s);
4864 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4865 REGEX_CHECK_STATUS;
4866 REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4867
4868 // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4869 status = U_ZERO_ERROR;
4870 UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4871 cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string
4872 matcher.reset(s1);
4873 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4874 REGEX_CHECK_STATUS;
4875 REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4876
4877 #if 0
4878 // Now a match that will succeed, but after an interruption
4879 status = U_ZERO_ERROR;
4880 UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4881 cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string
4882 matcher.reset(s2);
4883 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4884 REGEX_CHECK_STATUS;
4885 // Now retry the match from where left off
4886 cbInfo.maxCalls = 100; // No callback limit
4887 REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
4888 REGEX_CHECK_STATUS;
4889 #endif
4890 }
4891
4892
4893 }
4894
4895
4896 //---------------------------------------------------------------------------
4897 //
4898 // PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
4899 // UTexts. The pure-C implementation of UText
4900 // has no mutable backing stores, but we can
4901 // use UnicodeString here to test the functionality.
4902 //
4903 //---------------------------------------------------------------------------
PreAllocatedUTextCAPI()4904 void RegexTest::PreAllocatedUTextCAPI () {
4905 UErrorCode status = U_ZERO_ERROR;
4906 URegularExpression *re;
4907 UText patternText = UTEXT_INITIALIZER;
4908 UnicodeString buffer;
4909 UText bufferText = UTEXT_INITIALIZER;
4910
4911 utext_openUnicodeString(&bufferText, &buffer, &status);
4912
4913 /*
4914 * getText() and getUText()
4915 */
4916 {
4917 UText text1 = UTEXT_INITIALIZER;
4918 UText text2 = UTEXT_INITIALIZER;
4919 UChar text2Chars[20];
4920 UText *resultText;
4921
4922 status = U_ZERO_ERROR;
4923 regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
4924 regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
4925 u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
4926 utext_openUChars(&text2, text2Chars, -1, &status);
4927
4928 regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
4929 re = uregex_openUText(&patternText, 0, NULL, &status);
4930
4931 /* First set a UText */
4932 uregex_setUText(re, &text1, &status);
4933 resultText = uregex_getUText(re, &bufferText, &status);
4934 REGEX_CHECK_STATUS;
4935 REGEX_ASSERT(resultText == &bufferText);
4936 utext_setNativeIndex(resultText, 0);
4937 utext_setNativeIndex(&text1, 0);
4938 REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
4939
4940 resultText = uregex_getUText(re, &bufferText, &status);
4941 REGEX_CHECK_STATUS;
4942 REGEX_ASSERT(resultText == &bufferText);
4943 utext_setNativeIndex(resultText, 0);
4944 utext_setNativeIndex(&text1, 0);
4945 REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
4946
4947 /* Then set a UChar * */
4948 uregex_setText(re, text2Chars, 7, &status);
4949 resultText = uregex_getUText(re, &bufferText, &status);
4950 REGEX_CHECK_STATUS;
4951 REGEX_ASSERT(resultText == &bufferText);
4952 utext_setNativeIndex(resultText, 0);
4953 utext_setNativeIndex(&text2, 0);
4954 REGEX_ASSERT(utext_compare(resultText, -1, &text2, -1) == 0);
4955
4956 uregex_close(re);
4957 utext_close(&text1);
4958 utext_close(&text2);
4959 }
4960
4961 /*
4962 * group()
4963 */
4964 {
4965 UChar text1[80];
4966 UText *actual;
4967 UBool result;
4968 u_uastrncpy(text1, "noise abc interior def, and this is off the end", sizeof(text1)/2);
4969
4970 status = U_ZERO_ERROR;
4971 re = uregex_openC("abc(.*?)def", 0, NULL, &status);
4972 REGEX_CHECK_STATUS;
4973
4974 uregex_setText(re, text1, -1, &status);
4975 result = uregex_find(re, 0, &status);
4976 REGEX_ASSERT(result==TRUE);
4977
4978 /* Capture Group 0, the full match. Should succeed. */
4979 status = U_ZERO_ERROR;
4980 actual = uregex_groupUTextDeep(re, 0, &bufferText, &status);
4981 REGEX_CHECK_STATUS;
4982 REGEX_ASSERT(actual == &bufferText);
4983 REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual);
4984
4985 /* Capture group #1. Should succeed. */
4986 status = U_ZERO_ERROR;
4987 actual = uregex_groupUTextDeep(re, 1, &bufferText, &status);
4988 REGEX_CHECK_STATUS;
4989 REGEX_ASSERT(actual == &bufferText);
4990 REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual);
4991
4992 /* Capture group out of range. Error. */
4993 status = U_ZERO_ERROR;
4994 actual = uregex_groupUTextDeep(re, 2, &bufferText, &status);
4995 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
4996 REGEX_ASSERT(actual == &bufferText);
4997
4998 uregex_close(re);
4999
5000 }
5001
5002 /*
5003 * replaceFirst()
5004 */
5005 {
5006 UChar text1[80];
5007 UChar text2[80];
5008 UText replText = UTEXT_INITIALIZER;
5009 UText *result;
5010
5011 status = U_ZERO_ERROR;
5012 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
5013 u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
5014 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5015
5016 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5017 REGEX_CHECK_STATUS;
5018
5019 /* Normal case, with match */
5020 uregex_setText(re, text1, -1, &status);
5021 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5022 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5023 REGEX_CHECK_STATUS;
5024 REGEX_ASSERT(result == &bufferText);
5025 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5026
5027 /* No match. Text should copy to output with no changes. */
5028 uregex_setText(re, text2, -1, &status);
5029 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5030 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5031 REGEX_CHECK_STATUS;
5032 REGEX_ASSERT(result == &bufferText);
5033 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5034
5035 /* Unicode escapes */
5036 uregex_setText(re, text1, -1, &status);
5037 regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status);
5038 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5039 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5040 REGEX_CHECK_STATUS;
5041 REGEX_ASSERT(result == &bufferText);
5042 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5043
5044 uregex_close(re);
5045 utext_close(&replText);
5046 }
5047
5048
5049 /*
5050 * replaceAll()
5051 */
5052 {
5053 UChar text1[80];
5054 UChar text2[80];
5055 UText replText = UTEXT_INITIALIZER;
5056 UText *result;
5057
5058 status = U_ZERO_ERROR;
5059 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
5060 u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
5061 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5062
5063 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5064 REGEX_CHECK_STATUS;
5065
5066 /* Normal case, with match */
5067 uregex_setText(re, text1, -1, &status);
5068 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5069 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5070 REGEX_CHECK_STATUS;
5071 REGEX_ASSERT(result == &bufferText);
5072 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5073
5074 /* No match. Text should copy to output with no changes. */
5075 uregex_setText(re, text2, -1, &status);
5076 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5077 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5078 REGEX_CHECK_STATUS;
5079 REGEX_ASSERT(result == &bufferText);
5080 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5081
5082 uregex_close(re);
5083 utext_close(&replText);
5084 }
5085
5086
5087 /*
5088 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5089 * so we don't need to test it here.
5090 */
5091
5092 utext_close(&bufferText);
5093 utext_close(&patternText);
5094 }
5095
5096 //--------------------------------------------------------------
5097 //
5098 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
5099 //
5100 //---------------------------------------------------------------
Bug7651()5101 void RegexTest::Bug7651() {
5102 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5103 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5104 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5105 UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5106 UnicodeString s("#ff @abcd This is test");
5107 RegexPattern *REPattern = NULL;
5108 RegexMatcher *REMatcher = NULL;
5109 UErrorCode status = U_ZERO_ERROR;
5110 UParseError pe;
5111
5112 REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5113 REGEX_CHECK_STATUS;
5114 REMatcher = REPattern->matcher(s, status);
5115 REGEX_CHECK_STATUS;
5116 REGEX_ASSERT(REMatcher->find());
5117 REGEX_ASSERT(REMatcher->start(status) == 0);
5118 delete REPattern;
5119 delete REMatcher;
5120 status = U_ZERO_ERROR;
5121
5122 REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5123 REGEX_CHECK_STATUS;
5124 REMatcher = REPattern->matcher(s, status);
5125 REGEX_CHECK_STATUS;
5126 REGEX_ASSERT(REMatcher->find());
5127 REGEX_ASSERT(REMatcher->start(status) == 0);
5128 delete REPattern;
5129 delete REMatcher;
5130 status = U_ZERO_ERROR;
5131 }
5132
Bug7740()5133 void RegexTest::Bug7740() {
5134 UErrorCode status = U_ZERO_ERROR;
5135 UnicodeString pattern = "(a)";
5136 UnicodeString text = "abcdef";
5137 RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5138 REGEX_CHECK_STATUS;
5139 REGEX_ASSERT(m->lookingAt(status));
5140 REGEX_CHECK_STATUS;
5141 status = U_ILLEGAL_ARGUMENT_ERROR;
5142 UnicodeString s = m->group(1, status); // Bug 7740: segfault here.
5143 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5144 REGEX_ASSERT(s == "");
5145 delete m;
5146 }
5147
// Bug 8479: was crashing with a Bogus UnicodeString as input.
5149
Bug8479()5150 void RegexTest::Bug8479() {
5151 UErrorCode status = U_ZERO_ERROR;
5152
5153 RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5154 REGEX_CHECK_STATUS;
5155 if (U_SUCCESS(status))
5156 {
5157 UnicodeString str;
5158 str.setToBogus();
5159 pMatcher->reset(str);
5160 status = U_ZERO_ERROR;
5161 pMatcher->matches(status);
5162 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5163 delete pMatcher;
5164 }
5165 }
5166
5167
5168 // Bug 7029
Bug7029()5169 void RegexTest::Bug7029() {
5170 UErrorCode status = U_ZERO_ERROR;
5171
5172 RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5173 UnicodeString text = "abc.def";
5174 UnicodeString splits[10];
5175 REGEX_CHECK_STATUS;
5176 int32_t numFields = pMatcher->split(text, splits, 10, status);
5177 REGEX_CHECK_STATUS;
5178 REGEX_ASSERT(numFields == 8);
5179 delete pMatcher;
5180 }
5181
CheckInvBufSize()5182 void RegexTest::CheckInvBufSize() {
5183 if(inv_next>=INV_BUFSIZ) {
5184 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5185 __FILE__, INV_BUFSIZ, inv_next);
5186 } else {
5187 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5188 }
5189 }
5190
5191 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
5192
5193