1 /********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 2002-2010, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6
7 //
8 // regextst.cpp
9 //
10 // ICU Regular Expressions test, part of intltest.
11 //
12
13 #include "intltest.h"
14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
15
16 #include "unicode/regex.h"
17 #include "unicode/uchar.h"
18 #include "unicode/ucnv.h"
19 #include "unicode/ustring.h"
20 #include "regextst.h"
21 #include "uvector.h"
22 #include "util.h"
23 #include <stdlib.h>
24 #include <string.h>
25 #include <stdio.h>
26
27 #define SUPPORT_MUTATING_INPUT_STRING 0
28
29
30 //---------------------------------------------------------------------------
31 //
32 // Test class boilerplate
33 //
34 //---------------------------------------------------------------------------
RegexTest()35 RegexTest::RegexTest()
36 {
37 }
38
39
~RegexTest()40 RegexTest::~RegexTest()
41 {
42 }
43
44
45
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)46 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
47 {
48 if (exec) logln("TestSuite RegexTest: ");
49 switch (index) {
50
51 case 0: name = "Basic";
52 if (exec) Basic();
53 break;
54 case 1: name = "API_Match";
55 if (exec) API_Match();
56 break;
57 case 2: name = "API_Replace";
58 if (exec) API_Replace();
59 break;
60 case 3: name = "API_Pattern";
61 if (exec) API_Pattern();
62 break;
63 case 4:
64 #if !UCONFIG_NO_FILE_IO
65 name = "Extended";
66 if (exec) Extended();
67 #else
68 name = "skip";
69 #endif
70 break;
71 case 5: name = "Errors";
72 if (exec) Errors();
73 break;
74 case 6: name = "PerlTests";
75 if (exec) PerlTests();
76 break;
77 case 7: name = "Callbacks";
78 if (exec) Callbacks();
79 break;
80 case 8: name = "Bug 6149";
81 if (exec) Bug6149();
82 break;
83 case 9: name = "UTextBasic";
84 if (exec) UTextBasic();
85 break;
86 case 10: name = "API_Match_UTF8";
87 if (exec) API_Match_UTF8();
88 break;
89 case 11: name = "API_Replace_UTF8";
90 if (exec) API_Replace_UTF8();
91 break;
92 case 12: name = "API_Pattern_UTF8";
93 if (exec) API_Pattern_UTF8();
94 break;
95 case 13: name = "PerlTestsUTF8";
96 if (exec) PerlTestsUTF8();
97 break;
98 case 14: name = "PreAllocatedUTextCAPI";
99 if (exec) PreAllocatedUTextCAPI();
100 break;
101 case 15: name = "Bug 7651";
102 if (exec) Bug7651();
103 break;
104
105 default: name = "";
106 break; //needed to end loop
107 }
108 }
109
110
111 //---------------------------------------------------------------------------
112 //
113 // Error Checking / Reporting macros used in all of the tests.
114 //
115 //---------------------------------------------------------------------------
// REGEX_CHECK_STATUS
//     If the UErrorCode named `status` at the point of use holds a failure,
//     report it through dataerrln() with the current line number, then return
//     from the enclosing function.
#define REGEX_CHECK_STATUS { \
    if (U_FAILURE(status)) { \
        dataerrln("RegexTest failure at line %d. status=%s", __LINE__, u_errorName(status)); \
        return; \
    } \
}

// REGEX_ASSERT(expr)
//     Report a failure through errln() when `expr` compares equal to FALSE.
//     Execution continues after the report; there is no return.
#define REGEX_ASSERT(expr) { \
    if ((expr)==FALSE) { \
        errln("RegexTest failure at line %d.\n", __LINE__); \
    } \
}

// REGEX_ASSERT_FAIL(expr, errcode)
//     Evaluate `expr` and verify that it leaves exactly `errcode` in `status`.
//     The macro declares its own local `status`, initialized to U_ZERO_ERROR,
//     so a `status` named inside `expr` refers to that fresh variable and not
//     to one in the surrounding scope.
#define REGEX_ASSERT_FAIL(expr, errcode) { \
    UErrorCode status=U_ZERO_ERROR; \
    (expr); \
    if (status!=errcode) { \
        dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \
                  __LINE__, u_errorName(errcode), u_errorName(status)); \
    } \
}

// REGEX_CHECK_STATUS_L(line)
//     Like REGEX_CHECK_STATUS, but also reports the caller-supplied `line`,
//     prints the numeric status value, uses errln(), and does NOT return.
#define REGEX_CHECK_STATUS_L(line) { \
    if (U_FAILURE(status)) { \
        errln("RegexTest failure at line %d, from %d. status=%d\n", __LINE__, (line), status); \
    } \
}

// REGEX_ASSERT_L(expr, line)
//     Like REGEX_ASSERT, but also reports the caller-supplied `line` and
//     returns from the enclosing function after reporting.
#define REGEX_ASSERT_L(expr, line) { \
    if ((expr)==FALSE) { \
        errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
        return; \
    } \
}
130
assertUText(const char * expected,UText * actual,const char * file,int line)131 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
132 UErrorCode status = U_ZERO_ERROR;
133 UText expectedText = UTEXT_INITIALIZER;
134 utext_openUTF8(&expectedText, expected, -1, &status);
135 utext_setNativeIndex(actual, 0);
136 if (utext_compare(&expectedText, -1, actual, -1) != 0) {
137 char buf[201 /*21*/];
138 char *bufPtr = buf;
139 UChar32 c = utext_next32From(actual, 0);
140 while (c != U_SENTINEL && bufPtr < buf+200/*20*/) {
141 if (0x20<c && c<0x7e) {
142 *bufPtr = c;
143 } else {
144 *bufPtr = '.';
145 }
146 bufPtr++;
147 c = UTEXT_NEXT32(actual);
148 }
149 *bufPtr = 0;
150
151 errln("Failure at file %s, line %d, expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expected, utext_nativeLength(&expectedText), buf, utext_nativeLength(actual));
152 }
153 utext_close(&expectedText);
154 }
155
156 #define REGEX_ASSERT_UTEXT(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
157
158
159 //---------------------------------------------------------------------------
160 //
161 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests
162 // for the LookingAt() and Match() functions.
163 //
164 // usage:
165 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
166 //
167 // The expected results are UBool - TRUE or FALSE.
168 // The input text is unescaped. The pattern is not.
169 //
170 //
171 //---------------------------------------------------------------------------
172
173 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
174
doRegexLMTest(const char * pat,const char * text,UBool looking,UBool match,int32_t line)175 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
176 const UnicodeString pattern(pat, -1, US_INV);
177 const UnicodeString inputText(text, -1, US_INV);
178 UErrorCode status = U_ZERO_ERROR;
179 UParseError pe;
180 RegexPattern *REPattern = NULL;
181 RegexMatcher *REMatcher = NULL;
182 UBool retVal = TRUE;
183
184 UnicodeString patString(pat, -1, US_INV);
185 REPattern = RegexPattern::compile(patString, 0, pe, status);
186 if (U_FAILURE(status)) {
187 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
188 line, u_errorName(status));
189 return FALSE;
190 }
191 if (line==376) { RegexPatternDump(REPattern);}
192
193 UnicodeString inputString(inputText);
194 UnicodeString unEscapedInput = inputString.unescape();
195 REMatcher = REPattern->matcher(unEscapedInput, status);
196 if (U_FAILURE(status)) {
197 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
198 line, u_errorName(status));
199 return FALSE;
200 }
201
202 UBool actualmatch;
203 actualmatch = REMatcher->lookingAt(status);
204 if (U_FAILURE(status)) {
205 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
206 line, u_errorName(status));
207 retVal = FALSE;
208 }
209 if (actualmatch != looking) {
210 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
211 retVal = FALSE;
212 }
213
214 status = U_ZERO_ERROR;
215 actualmatch = REMatcher->matches(status);
216 if (U_FAILURE(status)) {
217 errln("RegexTest failure in matches() at line %d. Status = %s\n",
218 line, u_errorName(status));
219 retVal = FALSE;
220 }
221 if (actualmatch != match) {
222 errln("RegexTest: wrong return from matches() at line %d.\n", line);
223 retVal = FALSE;
224 }
225
226 if (retVal == FALSE) {
227 RegexPatternDump(REPattern);
228 }
229
230 delete REPattern;
231 delete REMatcher;
232 return retVal;
233 }
234
235
doRegexLMTestUTF8(const char * pat,const char * text,UBool looking,UBool match,int32_t line)236 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
237 UText pattern = UTEXT_INITIALIZER;
238 int32_t inputUTF8Length;
239 char *textChars = NULL;
240 UText inputText = UTEXT_INITIALIZER;
241 UErrorCode status = U_ZERO_ERROR;
242 UParseError pe;
243 RegexPattern *REPattern = NULL;
244 RegexMatcher *REMatcher = NULL;
245 UBool retVal = TRUE;
246
247 utext_openUTF8(&pattern, pat, -1, &status);
248 REPattern = RegexPattern::compile(&pattern, 0, pe, status);
249 if (U_FAILURE(status)) {
250 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
251 line, u_errorName(status));
252 return FALSE;
253 }
254
255 UnicodeString inputString(text, -1, US_INV);
256 UnicodeString unEscapedInput = inputString.unescape();
257 LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
258 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
259
260 inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
261 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
262 // UTF-8 does not allow unpaired surrogates, so this could actually happen
263 logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line, u_errorName(status));
264 return TRUE; // not a failure of the Regex engine
265 }
266 status = U_ZERO_ERROR; // buffer overflow
267 textChars = new char[inputUTF8Length+1];
268 unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
269 utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
270
271 REMatcher = REPattern->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status);
272 if (U_FAILURE(status)) {
273 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
274 line, u_errorName(status));
275 return FALSE;
276 }
277
278 UBool actualmatch;
279 actualmatch = REMatcher->lookingAt(status);
280 if (U_FAILURE(status)) {
281 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
282 line, u_errorName(status));
283 retVal = FALSE;
284 }
285 if (actualmatch != looking) {
286 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
287 retVal = FALSE;
288 }
289
290 status = U_ZERO_ERROR;
291 actualmatch = REMatcher->matches(status);
292 if (U_FAILURE(status)) {
293 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
294 line, u_errorName(status));
295 retVal = FALSE;
296 }
297 if (actualmatch != match) {
298 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
299 retVal = FALSE;
300 }
301
302 if (retVal == FALSE) {
303 RegexPatternDump(REPattern);
304 }
305
306 delete REPattern;
307 delete REMatcher;
308 utext_close(&inputText);
309 utext_close(&pattern);
310 delete[] textChars;
311 return retVal;
312 }
313
314
315
316 //---------------------------------------------------------------------------
317 //
//   REGEX_ERR       Macro + invocation function to simplify writing
//                   regex tests for incorrect patterns
320 //
321 // usage:
322 // REGEX_ERR("pattern", expected error line, column, expected status);
323 //
324 //---------------------------------------------------------------------------
325 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
326
regex_err(const char * pat,int32_t errLine,int32_t errCol,UErrorCode expectedStatus,int32_t line)327 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
328 UErrorCode expectedStatus, int32_t line) {
329 UnicodeString pattern(pat);
330
331 UErrorCode status = U_ZERO_ERROR;
332 UParseError pe;
333 RegexPattern *callerPattern = NULL;
334
335 //
336 // Compile the caller's pattern
337 //
338 UnicodeString patString(pat);
339 callerPattern = RegexPattern::compile(patString, 0, pe, status);
340 if (status != expectedStatus) {
341 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
342 } else {
343 if (status != U_ZERO_ERROR) {
344 if (pe.line != errLine || pe.offset != errCol) {
345 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
346 line, errLine, errCol, pe.line, pe.offset);
347 }
348 }
349 }
350
351 delete callerPattern;
352
353 //
354 // Compile again, using a UTF-8-based UText
355 //
356 UText patternText = UTEXT_INITIALIZER;
357 utext_openUTF8(&patternText, pat, -1, &status);
358 callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
359 if (status != expectedStatus) {
360 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
361 } else {
362 if (status != U_ZERO_ERROR) {
363 if (pe.line != errLine || pe.offset != errCol) {
364 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
365 line, errLine, errCol, pe.line, pe.offset);
366 }
367 }
368 }
369
370 delete callerPattern;
371 utext_close(&patternText);
372 }
373
374
375
376 //---------------------------------------------------------------------------
377 //
378 // Basic Check for basic functionality of regex pattern matching.
379 // Avoid the use of REGEX_FIND test macro, which has
380 // substantial dependencies on basic Regex functionality.
381 //
382 //---------------------------------------------------------------------------
Basic()383 void RegexTest::Basic() {
384
385
386 //
387 // Debug - slide failing test cases early
388 //
389 #if 0
390 {
391 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
392 UParseError pe;
393 UErrorCode status = U_ZERO_ERROR;
394 RegexPattern::compile("^(?:a?b?)*$", 0, pe, status);
395 // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
396 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
397 }
398 exit(1);
399 #endif
400
401
402 //
403 // Pattern with parentheses
404 //
405 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
406 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
407 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);
408
409 //
410 // Patterns with *
411 //
412 REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
413 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
414 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
415 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
416 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
417
418 REGEX_TESTLM("a*", "", TRUE, TRUE);
419 REGEX_TESTLM("a*", "b", TRUE, FALSE);
420
421
422 //
423 // Patterns with "."
424 //
425 REGEX_TESTLM(".", "abc", TRUE, FALSE);
426 REGEX_TESTLM("...", "abc", TRUE, TRUE);
427 REGEX_TESTLM("....", "abc", FALSE, FALSE);
428 REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
429 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
430 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
431 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
432 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
433
434 //
435 // Patterns with * applied to chars at end of literal string
436 //
437 REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
438 REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
439
440 //
441 // Supplemental chars match as single chars, not a pair of surrogates.
442 //
443 REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
444 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
445 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
446
447
448 //
449 // UnicodeSets in the pattern
450 //
451 REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
452 REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
453 REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
454 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
455 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
456 REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
457
458 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
459 REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
460 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
461 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurences.
462 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);
463
464 //
465 // OR operator in patterns
466 //
467 REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
468 REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
469 REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
470 REGEX_TESTLM("a|b", "b", TRUE, TRUE);
471
472 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
473 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
474 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
475 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
476 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
477 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
478
479 //
480 // +
481 //
482 REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
483 REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
484 REGEX_TESTLM("b+", "", FALSE, FALSE);
485 REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
486 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
487 REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
488
489 //
490 // ?
491 //
492 REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
493 REGEX_TESTLM("ab?", "a", TRUE, TRUE);
494 REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
495 REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
496 REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
497 REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
498 REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
499 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
500 REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
501
502 //
503 // Escape sequences that become single literal chars, handled internally
504 // by ICU's Unescape.
505 //
506
507 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
508 REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
509 REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
510 REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
511 REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
512 REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
513 REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
514 REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
515 REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
516 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
517
518 REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
519 REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input
520
521 // Escape of special chars in patterns
522 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
523 }
524
525
526 //---------------------------------------------------------------------------
527 //
528 // UTextBasic Check for quirks that are specific to the UText
529 // implementation.
530 //
531 //---------------------------------------------------------------------------
UTextBasic()532 void RegexTest::UTextBasic() {
533 UErrorCode status = U_ZERO_ERROR;
534 UText pattern = UTEXT_INITIALIZER;
535 utext_openUTF8(&pattern, "abc", -1, &status);
536 RegexMatcher matcher(&pattern, 0, status);
537 REGEX_CHECK_STATUS;
538
539 UText input = UTEXT_INITIALIZER;
540 utext_openUTF8(&input, "abc", -1, &status);
541 REGEX_CHECK_STATUS;
542 matcher.reset(&input);
543 REGEX_CHECK_STATUS;
544 REGEX_ASSERT_UTEXT("abc", matcher.inputText());
545
546 matcher.reset(matcher.inputText());
547 REGEX_CHECK_STATUS;
548 REGEX_ASSERT_UTEXT("abc", matcher.inputText());
549
550 utext_close(&pattern);
551 utext_close(&input);
552 }
553
554
555 //---------------------------------------------------------------------------
556 //
557 // API_Match Test that the API for class RegexMatcher
558 // is present and nominally working, but excluding functions
559 // implementing replace operations.
560 //
561 //---------------------------------------------------------------------------
API_Match()562 void RegexTest::API_Match() {
563 UParseError pe;
564 UErrorCode status=U_ZERO_ERROR;
565 int32_t flags = 0;
566
567 //
568 // Debug - slide failing test cases early
569 //
570 #if 0
571 {
572 }
573 return;
574 #endif
575
576 //
577 // Simple pattern compilation
578 //
579 {
580 UnicodeString re("abc");
581 RegexPattern *pat2;
582 pat2 = RegexPattern::compile(re, flags, pe, status);
583 REGEX_CHECK_STATUS;
584
585 UnicodeString inStr1 = "abcdef this is a test";
586 UnicodeString instr2 = "not abc";
587 UnicodeString empty = "";
588
589
590 //
591 // Matcher creation and reset.
592 //
593 RegexMatcher *m1 = pat2->matcher(inStr1, status);
594 REGEX_CHECK_STATUS;
595 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
596 REGEX_ASSERT(m1->input() == inStr1);
597 m1->reset(instr2);
598 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
599 REGEX_ASSERT(m1->input() == instr2);
600 m1->reset(inStr1);
601 REGEX_ASSERT(m1->input() == inStr1);
602 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
603 m1->reset(empty);
604 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
605 REGEX_ASSERT(m1->input() == empty);
606 REGEX_ASSERT(&m1->pattern() == pat2);
607
608 //
609 // reset(pos, status)
610 //
611 m1->reset(inStr1);
612 m1->reset(4, status);
613 REGEX_CHECK_STATUS;
614 REGEX_ASSERT(m1->input() == inStr1);
615 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
616
617 m1->reset(-1, status);
618 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
619 status = U_ZERO_ERROR;
620
621 m1->reset(0, status);
622 REGEX_CHECK_STATUS;
623 status = U_ZERO_ERROR;
624
625 int32_t len = m1->input().length();
626 m1->reset(len-1, status);
627 REGEX_CHECK_STATUS;
628 status = U_ZERO_ERROR;
629
630 m1->reset(len, status);
631 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
632 status = U_ZERO_ERROR;
633
634 //
635 // match(pos, status)
636 //
637 m1->reset(instr2);
638 REGEX_ASSERT(m1->matches(4, status) == TRUE);
639 m1->reset();
640 REGEX_ASSERT(m1->matches(3, status) == FALSE);
641 m1->reset();
642 REGEX_ASSERT(m1->matches(5, status) == FALSE);
643 REGEX_ASSERT(m1->matches(4, status) == TRUE);
644 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
645 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
646
647 // Match() at end of string should fail, but should not
648 // be an error.
649 status = U_ZERO_ERROR;
650 len = m1->input().length();
651 REGEX_ASSERT(m1->matches(len, status) == FALSE);
652 REGEX_CHECK_STATUS;
653
654 // Match beyond end of string should fail with an error.
655 status = U_ZERO_ERROR;
656 REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
657 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
658
659 // Successful match at end of string.
660 {
661 status = U_ZERO_ERROR;
662 RegexMatcher m("A?", 0, status); // will match zero length string.
663 REGEX_CHECK_STATUS;
664 m.reset(inStr1);
665 len = inStr1.length();
666 REGEX_ASSERT(m.matches(len, status) == TRUE);
667 REGEX_CHECK_STATUS;
668 m.reset(empty);
669 REGEX_ASSERT(m.matches(0, status) == TRUE);
670 REGEX_CHECK_STATUS;
671 }
672
673
674 //
675 // lookingAt(pos, status)
676 //
677 status = U_ZERO_ERROR;
678 m1->reset(instr2); // "not abc"
679 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
680 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
681 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
682 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
683 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
684 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
685 status = U_ZERO_ERROR;
686 len = m1->input().length();
687 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
688 REGEX_CHECK_STATUS;
689 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
690 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
691
692 delete m1;
693 delete pat2;
694 }
695
696
697 //
698 // Capture Group.
699 // RegexMatcher::start();
700 // RegexMatcher::end();
701 // RegexMatcher::groupCount();
702 //
703 {
704 int32_t flags=0;
705 UParseError pe;
706 UErrorCode status=U_ZERO_ERROR;
707
708 UnicodeString re("01(23(45)67)(.*)");
709 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
710 REGEX_CHECK_STATUS;
711 UnicodeString data = "0123456789";
712
713 RegexMatcher *matcher = pat->matcher(data, status);
714 REGEX_CHECK_STATUS;
715 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
716 static const int32_t matchStarts[] = {0, 2, 4, 8};
717 static const int32_t matchEnds[] = {10, 8, 6, 10};
718 int32_t i;
719 for (i=0; i<4; i++) {
720 int32_t actualStart = matcher->start(i, status);
721 REGEX_CHECK_STATUS;
722 if (actualStart != matchStarts[i]) {
723 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
724 __LINE__, i, matchStarts[i], actualStart);
725 }
726 int32_t actualEnd = matcher->end(i, status);
727 REGEX_CHECK_STATUS;
728 if (actualEnd != matchEnds[i]) {
729 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
730 __LINE__, i, matchEnds[i], actualEnd);
731 }
732 }
733
734 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
735 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
736
737 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
738 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
739 matcher->reset();
740 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
741
742 matcher->lookingAt(status);
743 REGEX_ASSERT(matcher->group(status) == "0123456789");
744 REGEX_ASSERT(matcher->group(0, status) == "0123456789");
745 REGEX_ASSERT(matcher->group(1, status) == "234567" );
746 REGEX_ASSERT(matcher->group(2, status) == "45" );
747 REGEX_ASSERT(matcher->group(3, status) == "89" );
748 REGEX_CHECK_STATUS;
749 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
750 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
751 matcher->reset();
752 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
753
754 delete matcher;
755 delete pat;
756
757 }
758
759 //
760 // find
761 //
762 {
763 int32_t flags=0;
764 UParseError pe;
765 UErrorCode status=U_ZERO_ERROR;
766
767 UnicodeString re("abc");
768 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
769 REGEX_CHECK_STATUS;
770 UnicodeString data = ".abc..abc...abc..";
771 // 012345678901234567
772
773 RegexMatcher *matcher = pat->matcher(data, status);
774 REGEX_CHECK_STATUS;
775 REGEX_ASSERT(matcher->find());
776 REGEX_ASSERT(matcher->start(status) == 1);
777 REGEX_ASSERT(matcher->find());
778 REGEX_ASSERT(matcher->start(status) == 6);
779 REGEX_ASSERT(matcher->find());
780 REGEX_ASSERT(matcher->start(status) == 12);
781 REGEX_ASSERT(matcher->find() == FALSE);
782 REGEX_ASSERT(matcher->find() == FALSE);
783
784 matcher->reset();
785 REGEX_ASSERT(matcher->find());
786 REGEX_ASSERT(matcher->start(status) == 1);
787
788 REGEX_ASSERT(matcher->find(0, status));
789 REGEX_ASSERT(matcher->start(status) == 1);
790 REGEX_ASSERT(matcher->find(1, status));
791 REGEX_ASSERT(matcher->start(status) == 1);
792 REGEX_ASSERT(matcher->find(2, status));
793 REGEX_ASSERT(matcher->start(status) == 6);
794 REGEX_ASSERT(matcher->find(12, status));
795 REGEX_ASSERT(matcher->start(status) == 12);
796 REGEX_ASSERT(matcher->find(13, status) == FALSE);
797 REGEX_ASSERT(matcher->find(16, status) == FALSE);
798 REGEX_ASSERT(matcher->find(17, status) == FALSE);
799 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
800
801 status = U_ZERO_ERROR;
802 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
803 status = U_ZERO_ERROR;
804 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
805
806 REGEX_ASSERT(matcher->groupCount() == 0);
807
808 delete matcher;
809 delete pat;
810 }
811
812
813 //
814 // find, with \G in pattern (true if at the end of a previous match).
815 //
816 {
817 int32_t flags=0;
818 UParseError pe;
819 UErrorCode status=U_ZERO_ERROR;
820
821 UnicodeString re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
822 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
823 REGEX_CHECK_STATUS;
824 UnicodeString data = ".abcabc.abc..";
825 // 012345678901234567
826
827 RegexMatcher *matcher = pat->matcher(data, status);
828 REGEX_CHECK_STATUS;
829 REGEX_ASSERT(matcher->find());
830 REGEX_ASSERT(matcher->start(status) == 0);
831 REGEX_ASSERT(matcher->start(1, status) == -1);
832 REGEX_ASSERT(matcher->start(2, status) == 1);
833
834 REGEX_ASSERT(matcher->find());
835 REGEX_ASSERT(matcher->start(status) == 4);
836 REGEX_ASSERT(matcher->start(1, status) == 4);
837 REGEX_ASSERT(matcher->start(2, status) == -1);
838 REGEX_CHECK_STATUS;
839
840 delete matcher;
841 delete pat;
842 }
843
844 //
845 // find with zero length matches, match position should bump ahead
846 // to prevent loops.
847 //
848 {
849 int32_t i;
850 UErrorCode status=U_ZERO_ERROR;
851 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
852 // using an always-true look-ahead.
853 REGEX_CHECK_STATUS;
854 UnicodeString s(" ");
855 m.reset(s);
856 for (i=0; ; i++) {
857 if (m.find() == FALSE) {
858 break;
859 }
860 REGEX_ASSERT(m.start(status) == i);
861 REGEX_ASSERT(m.end(status) == i);
862 }
863 REGEX_ASSERT(i==5);
864
865 // Check that the bump goes over surrogate pairs OK
866 s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
867 s = s.unescape();
868 m.reset(s);
869 for (i=0; ; i+=2) {
870 if (m.find() == FALSE) {
871 break;
872 }
873 REGEX_ASSERT(m.start(status) == i);
874 REGEX_ASSERT(m.end(status) == i);
875 }
876 REGEX_ASSERT(i==10);
877 }
878 {
879 // find() loop breaking test.
880 // with pattern of /.?/, should see a series of one char matches, then a single
881 // match of zero length at the end of the input string.
882 int32_t i;
883 UErrorCode status=U_ZERO_ERROR;
884 RegexMatcher m(".?", 0, status);
885 REGEX_CHECK_STATUS;
886 UnicodeString s(" ");
887 m.reset(s);
888 for (i=0; ; i++) {
889 if (m.find() == FALSE) {
890 break;
891 }
892 REGEX_ASSERT(m.start(status) == i);
893 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
894 }
895 REGEX_ASSERT(i==5);
896 }
897
898
899 //
900 // Matchers with no input string behave as if they had an empty input string.
901 //
902
903 {
904 UErrorCode status = U_ZERO_ERROR;
905 RegexMatcher m(".?", 0, status);
906 REGEX_CHECK_STATUS;
907 REGEX_ASSERT(m.find());
908 REGEX_ASSERT(m.start(status) == 0);
909 REGEX_ASSERT(m.input() == "");
910 }
911 {
912 UErrorCode status = U_ZERO_ERROR;
913 RegexPattern *p = RegexPattern::compile(".", 0, status);
914 RegexMatcher *m = p->matcher(status);
915 REGEX_CHECK_STATUS;
916
917 REGEX_ASSERT(m->find() == FALSE);
918 REGEX_ASSERT(m->input() == "");
919 delete m;
920 delete p;
921 }
922
923 //
924 // Regions
925 //
926 {
927 UErrorCode status = U_ZERO_ERROR;
928 UnicodeString testString("This is test data");
929 RegexMatcher m(".*", testString, 0, status);
930 REGEX_CHECK_STATUS;
931 REGEX_ASSERT(m.regionStart() == 0);
932 REGEX_ASSERT(m.regionEnd() == testString.length());
933 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
934 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
935
936 m.region(2,4, status);
937 REGEX_CHECK_STATUS;
938 REGEX_ASSERT(m.matches(status));
939 REGEX_ASSERT(m.start(status)==2);
940 REGEX_ASSERT(m.end(status)==4);
941 REGEX_CHECK_STATUS;
942
943 m.reset();
944 REGEX_ASSERT(m.regionStart() == 0);
945 REGEX_ASSERT(m.regionEnd() == testString.length());
946
947 UnicodeString shorterString("short");
948 m.reset(shorterString);
949 REGEX_ASSERT(m.regionStart() == 0);
950 REGEX_ASSERT(m.regionEnd() == shorterString.length());
951
952 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
953 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
954 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
955 REGEX_ASSERT(&m == &m.reset());
956 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
957
958 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
959 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
960 REGEX_ASSERT(&m == &m.reset());
961 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
962
963 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
964 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
965 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
966 REGEX_ASSERT(&m == &m.reset());
967 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
968
969 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
970 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
971 REGEX_ASSERT(&m == &m.reset());
972 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
973
974 }
975
976 //
977 // hitEnd() and requireEnd()
978 //
979 {
980 UErrorCode status = U_ZERO_ERROR;
981 UnicodeString testString("aabb");
982 RegexMatcher m1(".*", testString, 0, status);
983 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
984 REGEX_ASSERT(m1.hitEnd() == TRUE);
985 REGEX_ASSERT(m1.requireEnd() == FALSE);
986 REGEX_CHECK_STATUS;
987
988 status = U_ZERO_ERROR;
989 RegexMatcher m2("a*", testString, 0, status);
990 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
991 REGEX_ASSERT(m2.hitEnd() == FALSE);
992 REGEX_ASSERT(m2.requireEnd() == FALSE);
993 REGEX_CHECK_STATUS;
994
995 status = U_ZERO_ERROR;
996 RegexMatcher m3(".*$", testString, 0, status);
997 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
998 REGEX_ASSERT(m3.hitEnd() == TRUE);
999 REGEX_ASSERT(m3.requireEnd() == TRUE);
1000 REGEX_CHECK_STATUS;
1001 }
1002
1003
1004 //
1005 // Compilation error on reset with UChar *
1006 // These were a hazard that people were stumbling over with runtime errors.
1007 // Changed them to compiler errors by adding private methods that more closely
1008 // matched the incorrect use of the functions.
1009 //
1010 #if 0
1011 {
1012 UErrorCode status = U_ZERO_ERROR;
1013 UChar ucharString[20];
1014 RegexMatcher m(".", 0, status);
1015 m.reset(ucharString); // should not compile.
1016
1017 RegexPattern *p = RegexPattern::compile(".", 0, status);
1018 RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile.
1019
1020 RegexMatcher m3(".", ucharString, 0, status); // Should not compile
1021 }
1022 #endif
1023
1024 //
1025 // Time Outs.
1026 // Note: These tests will need to be changed when the regexp engine is
1027 // able to detect and cut short the exponential time behavior on
1028 // this type of match.
1029 //
1030 {
1031 UErrorCode status = U_ZERO_ERROR;
1032 // Enough 'a's in the string to cause the match to time out.
1033 // (Each on additonal 'a' doubles the time)
1034 UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1035 RegexMatcher matcher("(a+)+b", testString, 0, status);
1036 REGEX_CHECK_STATUS;
1037 REGEX_ASSERT(matcher.getTimeLimit() == 0);
1038 matcher.setTimeLimit(100, status);
1039 REGEX_ASSERT(matcher.getTimeLimit() == 100);
1040 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1041 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1042 }
1043 {
1044 UErrorCode status = U_ZERO_ERROR;
1045 // Few enough 'a's to slip in under the time limit.
1046 UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1047 RegexMatcher matcher("(a+)+b", testString, 0, status);
1048 REGEX_CHECK_STATUS;
1049 matcher.setTimeLimit(100, status);
1050 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1051 REGEX_CHECK_STATUS;
1052 }
1053
1054 //
1055 // Stack Limits
1056 //
1057 {
1058 UErrorCode status = U_ZERO_ERROR;
1059 UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
1060
1061 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1062 // of the '+', and makes the stack frames larger.
1063 RegexMatcher matcher("(A)+A$", testString, 0, status);
1064
1065 // With the default stack, this match should fail to run
1066 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1067 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1068
1069 // With unlimited stack, it should run
1070 status = U_ZERO_ERROR;
1071 matcher.setStackLimit(0, status);
1072 REGEX_CHECK_STATUS;
1073 REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1074 REGEX_CHECK_STATUS;
1075 REGEX_ASSERT(matcher.getStackLimit() == 0);
1076
1077 // With a limited stack, it the match should fail
1078 status = U_ZERO_ERROR;
1079 matcher.setStackLimit(10000, status);
1080 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1081 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1082 REGEX_ASSERT(matcher.getStackLimit() == 10000);
1083 }
1084
1085 // A pattern that doesn't save state should work with
1086 // a minimal sized stack
1087 {
1088 UErrorCode status = U_ZERO_ERROR;
1089 UnicodeString testString = "abc";
1090 RegexMatcher matcher("abc", testString, 0, status);
1091 REGEX_CHECK_STATUS;
1092 matcher.setStackLimit(30, status);
1093 REGEX_CHECK_STATUS;
1094 REGEX_ASSERT(matcher.matches(status) == TRUE);
1095 REGEX_CHECK_STATUS;
1096 REGEX_ASSERT(matcher.getStackLimit() == 30);
1097
1098 // Negative stack sizes should fail
1099 status = U_ZERO_ERROR;
1100 matcher.setStackLimit(1000, status);
1101 REGEX_CHECK_STATUS;
1102 matcher.setStackLimit(-1, status);
1103 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1104 REGEX_ASSERT(matcher.getStackLimit() == 1000);
1105 }
1106
1107
1108 }
1109
1110
1111
1112
1113
1114
1115 //---------------------------------------------------------------------------
1116 //
1117 // API_Replace API test for class RegexMatcher, testing the
1118 // Replace family of functions.
1119 //
1120 //---------------------------------------------------------------------------
API_Replace()1121 void RegexTest::API_Replace() {
1122 //
1123 // Replace
1124 //
1125 int32_t flags=0;
1126 UParseError pe;
1127 UErrorCode status=U_ZERO_ERROR;
1128
1129 UnicodeString re("abc");
1130 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1131 REGEX_CHECK_STATUS;
1132 UnicodeString data = ".abc..abc...abc..";
1133 // 012345678901234567
1134 RegexMatcher *matcher = pat->matcher(data, status);
1135
1136 //
1137 // Plain vanilla matches.
1138 //
1139 UnicodeString dest;
1140 dest = matcher->replaceFirst("yz", status);
1141 REGEX_CHECK_STATUS;
1142 REGEX_ASSERT(dest == ".yz..abc...abc..");
1143
1144 dest = matcher->replaceAll("yz", status);
1145 REGEX_CHECK_STATUS;
1146 REGEX_ASSERT(dest == ".yz..yz...yz..");
1147
1148 //
1149 // Plain vanilla non-matches.
1150 //
1151 UnicodeString d2 = ".abx..abx...abx..";
1152 matcher->reset(d2);
1153 dest = matcher->replaceFirst("yz", status);
1154 REGEX_CHECK_STATUS;
1155 REGEX_ASSERT(dest == ".abx..abx...abx..");
1156
1157 dest = matcher->replaceAll("yz", status);
1158 REGEX_CHECK_STATUS;
1159 REGEX_ASSERT(dest == ".abx..abx...abx..");
1160
1161 //
1162 // Empty source string
1163 //
1164 UnicodeString d3 = "";
1165 matcher->reset(d3);
1166 dest = matcher->replaceFirst("yz", status);
1167 REGEX_CHECK_STATUS;
1168 REGEX_ASSERT(dest == "");
1169
1170 dest = matcher->replaceAll("yz", status);
1171 REGEX_CHECK_STATUS;
1172 REGEX_ASSERT(dest == "");
1173
1174 //
1175 // Empty substitution string
1176 //
1177 matcher->reset(data); // ".abc..abc...abc.."
1178 dest = matcher->replaceFirst("", status);
1179 REGEX_CHECK_STATUS;
1180 REGEX_ASSERT(dest == "...abc...abc..");
1181
1182 dest = matcher->replaceAll("", status);
1183 REGEX_CHECK_STATUS;
1184 REGEX_ASSERT(dest == "........");
1185
1186 //
1187 // match whole string
1188 //
1189 UnicodeString d4 = "abc";
1190 matcher->reset(d4);
1191 dest = matcher->replaceFirst("xyz", status);
1192 REGEX_CHECK_STATUS;
1193 REGEX_ASSERT(dest == "xyz");
1194
1195 dest = matcher->replaceAll("xyz", status);
1196 REGEX_CHECK_STATUS;
1197 REGEX_ASSERT(dest == "xyz");
1198
1199 //
1200 // Capture Group, simple case
1201 //
1202 UnicodeString re2("a(..)");
1203 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1204 REGEX_CHECK_STATUS;
1205 UnicodeString d5 = "abcdefg";
1206 RegexMatcher *matcher2 = pat2->matcher(d5, status);
1207 REGEX_CHECK_STATUS;
1208 dest = matcher2->replaceFirst("$1$1", status);
1209 REGEX_CHECK_STATUS;
1210 REGEX_ASSERT(dest == "bcbcdefg");
1211
1212 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1213 REGEX_CHECK_STATUS;
1214 REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1215
1216 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1217 REGEX_CHECK_STATUS;
1218 REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
1219
1220 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1221 replacement = replacement.unescape();
1222 dest = matcher2->replaceFirst(replacement, status);
1223 REGEX_CHECK_STATUS;
1224 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1225
1226 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1227
1228
1229 //
1230 // Replacement String with \u hex escapes
1231 //
1232 {
1233 UnicodeString src = "abc 1 abc 2 abc 3";
1234 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1235 matcher->reset(src);
1236 UnicodeString result = matcher->replaceAll(substitute, status);
1237 REGEX_CHECK_STATUS;
1238 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1239 }
1240 {
1241 UnicodeString src = "abc !";
1242 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1243 matcher->reset(src);
1244 UnicodeString result = matcher->replaceAll(substitute, status);
1245 REGEX_CHECK_STATUS;
1246 UnicodeString expected = UnicodeString("--");
1247 expected.append((UChar32)0x10000);
1248 expected.append("-- !");
1249 REGEX_ASSERT(result == expected);
1250 }
1251 // TODO: need more through testing of capture substitutions.
1252
1253 // Bug 4057
1254 //
1255 {
1256 status = U_ZERO_ERROR;
1257 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1258 RegexMatcher m("ss(.*?)ee", 0, status);
1259 REGEX_CHECK_STATUS;
1260 UnicodeString result;
1261
1262 // Multiple finds do NOT bump up the previous appendReplacement postion.
1263 m.reset(s);
1264 m.find();
1265 m.find();
1266 m.appendReplacement(result, "ooh", status);
1267 REGEX_CHECK_STATUS;
1268 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1269
1270 // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1271 status = U_ZERO_ERROR;
1272 result.truncate(0);
1273 m.reset(10, status);
1274 m.find();
1275 m.find();
1276 m.appendReplacement(result, "ooh", status);
1277 REGEX_CHECK_STATUS;
1278 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1279
1280 // find() at interior of string, appendReplacemnt still starts at beginning.
1281 status = U_ZERO_ERROR;
1282 result.truncate(0);
1283 m.reset();
1284 m.find(10, status);
1285 m.find();
1286 m.appendReplacement(result, "ooh", status);
1287 REGEX_CHECK_STATUS;
1288 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1289
1290 m.appendTail(result);
1291 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1292
1293 }
1294
1295 delete matcher2;
1296 delete pat2;
1297 delete matcher;
1298 delete pat;
1299 }
1300
1301
1302 //---------------------------------------------------------------------------
1303 //
1304 // API_Pattern Test that the API for class RegexPattern is
1305 // present and nominally working.
1306 //
1307 //---------------------------------------------------------------------------
API_Pattern()1308 void RegexTest::API_Pattern() {
1309 RegexPattern pata; // Test default constructor to not crash.
1310 RegexPattern patb;
1311
1312 REGEX_ASSERT(pata == patb);
1313 REGEX_ASSERT(pata == pata);
1314
1315 UnicodeString re1("abc[a-l][m-z]");
1316 UnicodeString re2("def");
1317 UErrorCode status = U_ZERO_ERROR;
1318 UParseError pe;
1319
1320 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status);
1321 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status);
1322 REGEX_CHECK_STATUS;
1323 REGEX_ASSERT(*pat1 == *pat1);
1324 REGEX_ASSERT(*pat1 != pata);
1325
1326 // Assign
1327 patb = *pat1;
1328 REGEX_ASSERT(patb == *pat1);
1329
1330 // Copy Construct
1331 RegexPattern patc(*pat1);
1332 REGEX_ASSERT(patc == *pat1);
1333 REGEX_ASSERT(patb == patc);
1334 REGEX_ASSERT(pat1 != pat2);
1335 patb = *pat2;
1336 REGEX_ASSERT(patb != patc);
1337 REGEX_ASSERT(patb == *pat2);
1338
1339 // Compile with no flags.
1340 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status);
1341 REGEX_ASSERT(*pat1a == *pat1);
1342
1343 REGEX_ASSERT(pat1a->flags() == 0);
1344
1345 // Compile with different flags should be not equal
1346 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1347 REGEX_CHECK_STATUS;
1348
1349 REGEX_ASSERT(*pat1b != *pat1a);
1350 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1351 REGEX_ASSERT(pat1a->flags() == 0);
1352 delete pat1b;
1353
1354 // clone
1355 RegexPattern *pat1c = pat1->clone();
1356 REGEX_ASSERT(*pat1c == *pat1);
1357 REGEX_ASSERT(*pat1c != *pat2);
1358
1359 delete pat1c;
1360 delete pat1a;
1361 delete pat1;
1362 delete pat2;
1363
1364
1365 //
1366 // Verify that a matcher created from a cloned pattern works.
1367 // (Jitterbug 3423)
1368 //
1369 {
1370 UErrorCode status = U_ZERO_ERROR;
1371 RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1372 RegexPattern *pClone = pSource->clone();
1373 delete pSource;
1374 RegexMatcher *mFromClone = pClone->matcher(status);
1375 REGEX_CHECK_STATUS;
1376 UnicodeString s = "Hello World";
1377 mFromClone->reset(s);
1378 REGEX_ASSERT(mFromClone->find() == TRUE);
1379 REGEX_ASSERT(mFromClone->group(status) == "Hello");
1380 REGEX_ASSERT(mFromClone->find() == TRUE);
1381 REGEX_ASSERT(mFromClone->group(status) == "World");
1382 REGEX_ASSERT(mFromClone->find() == FALSE);
1383 delete mFromClone;
1384 delete pClone;
1385 }
1386
1387 //
1388 // matches convenience API
1389 //
1390 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1391 REGEX_CHECK_STATUS;
1392 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1393 REGEX_CHECK_STATUS;
1394 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1395 REGEX_CHECK_STATUS;
1396 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1397 REGEX_CHECK_STATUS;
1398 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1399 REGEX_CHECK_STATUS;
1400 status = U_INDEX_OUTOFBOUNDS_ERROR;
1401 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1402 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1403
1404
1405 //
1406 // Split()
1407 //
1408 status = U_ZERO_ERROR;
1409 pat1 = RegexPattern::compile(" +", pe, status);
1410 REGEX_CHECK_STATUS;
1411 UnicodeString fields[10];
1412
1413 int32_t n;
1414 n = pat1->split("Now is the time", fields, 10, status);
1415 REGEX_CHECK_STATUS;
1416 REGEX_ASSERT(n==4);
1417 REGEX_ASSERT(fields[0]=="Now");
1418 REGEX_ASSERT(fields[1]=="is");
1419 REGEX_ASSERT(fields[2]=="the");
1420 REGEX_ASSERT(fields[3]=="time");
1421 REGEX_ASSERT(fields[4]=="");
1422
1423 n = pat1->split("Now is the time", fields, 2, status);
1424 REGEX_CHECK_STATUS;
1425 REGEX_ASSERT(n==2);
1426 REGEX_ASSERT(fields[0]=="Now");
1427 REGEX_ASSERT(fields[1]=="is the time");
1428 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
1429
1430 fields[1] = "*";
1431 status = U_ZERO_ERROR;
1432 n = pat1->split("Now is the time", fields, 1, status);
1433 REGEX_CHECK_STATUS;
1434 REGEX_ASSERT(n==1);
1435 REGEX_ASSERT(fields[0]=="Now is the time");
1436 REGEX_ASSERT(fields[1]=="*");
1437 status = U_ZERO_ERROR;
1438
1439 n = pat1->split(" Now is the time ", fields, 10, status);
1440 REGEX_CHECK_STATUS;
1441 REGEX_ASSERT(n==5);
1442 REGEX_ASSERT(fields[0]=="");
1443 REGEX_ASSERT(fields[1]=="Now");
1444 REGEX_ASSERT(fields[2]=="is");
1445 REGEX_ASSERT(fields[3]=="the");
1446 REGEX_ASSERT(fields[4]=="time");
1447 REGEX_ASSERT(fields[5]=="");
1448
1449 n = pat1->split(" ", fields, 10, status);
1450 REGEX_CHECK_STATUS;
1451 REGEX_ASSERT(n==1);
1452 REGEX_ASSERT(fields[0]=="");
1453
1454 fields[0] = "foo";
1455 n = pat1->split("", fields, 10, status);
1456 REGEX_CHECK_STATUS;
1457 REGEX_ASSERT(n==0);
1458 REGEX_ASSERT(fields[0]=="foo");
1459
1460 delete pat1;
1461
1462 // split, with a pattern with (capture)
1463 pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status);
1464 REGEX_CHECK_STATUS;
1465
1466 status = U_ZERO_ERROR;
1467 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1468 REGEX_CHECK_STATUS;
1469 REGEX_ASSERT(n==6);
1470 REGEX_ASSERT(fields[0]=="");
1471 REGEX_ASSERT(fields[1]=="a");
1472 REGEX_ASSERT(fields[2]=="Now is ");
1473 REGEX_ASSERT(fields[3]=="b");
1474 REGEX_ASSERT(fields[4]=="the time");
1475 REGEX_ASSERT(fields[5]=="c");
1476 REGEX_ASSERT(fields[6]=="");
1477 REGEX_ASSERT(status==U_ZERO_ERROR);
1478
1479 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
1480 REGEX_CHECK_STATUS;
1481 REGEX_ASSERT(n==6);
1482 REGEX_ASSERT(fields[0]==" ");
1483 REGEX_ASSERT(fields[1]=="a");
1484 REGEX_ASSERT(fields[2]=="Now is ");
1485 REGEX_ASSERT(fields[3]=="b");
1486 REGEX_ASSERT(fields[4]=="the time");
1487 REGEX_ASSERT(fields[5]=="c");
1488 REGEX_ASSERT(fields[6]=="");
1489
1490 status = U_ZERO_ERROR;
1491 fields[6] = "foo";
1492 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
1493 REGEX_CHECK_STATUS;
1494 REGEX_ASSERT(n==6);
1495 REGEX_ASSERT(fields[0]==" ");
1496 REGEX_ASSERT(fields[1]=="a");
1497 REGEX_ASSERT(fields[2]=="Now is ");
1498 REGEX_ASSERT(fields[3]=="b");
1499 REGEX_ASSERT(fields[4]=="the time");
1500 REGEX_ASSERT(fields[5]=="c");
1501 REGEX_ASSERT(fields[6]=="foo");
1502
1503 status = U_ZERO_ERROR;
1504 fields[5] = "foo";
1505 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
1506 REGEX_CHECK_STATUS;
1507 REGEX_ASSERT(n==5);
1508 REGEX_ASSERT(fields[0]==" ");
1509 REGEX_ASSERT(fields[1]=="a");
1510 REGEX_ASSERT(fields[2]=="Now is ");
1511 REGEX_ASSERT(fields[3]=="b");
1512 REGEX_ASSERT(fields[4]=="the time<c>");
1513 REGEX_ASSERT(fields[5]=="foo");
1514
1515 status = U_ZERO_ERROR;
1516 fields[5] = "foo";
1517 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
1518 REGEX_CHECK_STATUS;
1519 REGEX_ASSERT(n==5);
1520 REGEX_ASSERT(fields[0]==" ");
1521 REGEX_ASSERT(fields[1]=="a");
1522 REGEX_ASSERT(fields[2]=="Now is ");
1523 REGEX_ASSERT(fields[3]=="b");
1524 REGEX_ASSERT(fields[4]=="the time");
1525 REGEX_ASSERT(fields[5]=="foo");
1526
1527 status = U_ZERO_ERROR;
1528 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
1529 REGEX_CHECK_STATUS;
1530 REGEX_ASSERT(n==4);
1531 REGEX_ASSERT(fields[0]==" ");
1532 REGEX_ASSERT(fields[1]=="a");
1533 REGEX_ASSERT(fields[2]=="Now is ");
1534 REGEX_ASSERT(fields[3]=="the time<c>");
1535 status = U_ZERO_ERROR;
1536 delete pat1;
1537
1538 pat1 = RegexPattern::compile("([-,])", pe, status);
1539 REGEX_CHECK_STATUS;
1540 n = pat1->split("1-10,20", fields, 10, status);
1541 REGEX_CHECK_STATUS;
1542 REGEX_ASSERT(n==5);
1543 REGEX_ASSERT(fields[0]=="1");
1544 REGEX_ASSERT(fields[1]=="-");
1545 REGEX_ASSERT(fields[2]=="10");
1546 REGEX_ASSERT(fields[3]==",");
1547 REGEX_ASSERT(fields[4]=="20");
1548 delete pat1;
1549
1550
1551 //
1552 // RegexPattern::pattern()
1553 //
1554 pat1 = new RegexPattern();
1555 REGEX_ASSERT(pat1->pattern() == "");
1556 delete pat1;
1557
1558 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1559 REGEX_CHECK_STATUS;
1560 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1561 delete pat1;
1562
1563
1564 //
1565 // classID functions
1566 //
1567 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1568 REGEX_CHECK_STATUS;
1569 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1570 REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1571 UnicodeString Hello("Hello, world.");
1572 RegexMatcher *m = pat1->matcher(Hello, status);
1573 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1574 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1575 REGEX_ASSERT(m->getDynamicClassID() != NULL);
1576 delete m;
1577 delete pat1;
1578
1579 }
1580
1581 //---------------------------------------------------------------------------
1582 //
1583 // API_Match_UTF8 Test that the alternate engine for class RegexMatcher
1584 // is present and working, but excluding functions
1585 // implementing replace operations.
1586 //
1587 //---------------------------------------------------------------------------
API_Match_UTF8()1588 void RegexTest::API_Match_UTF8() {
1589 UParseError pe;
1590 UErrorCode status=U_ZERO_ERROR;
1591 int32_t flags = 0;
1592
1593 //
1594 // Debug - slide failing test cases early
1595 //
1596 #if 0
1597 {
1598 }
1599 return;
1600 #endif
1601
1602 //
1603 // Simple pattern compilation
1604 //
1605 {
1606 UText re = UTEXT_INITIALIZER;
1607 utext_openUTF8(&re, "abc", -1, &status);
1608 RegexPattern *pat2;
1609 pat2 = RegexPattern::compile(&re, flags, pe, status);
1610 REGEX_CHECK_STATUS;
1611
1612 UText input1 = UTEXT_INITIALIZER;
1613 UText input2 = UTEXT_INITIALIZER;
1614 UText empty = UTEXT_INITIALIZER;
1615 utext_openUTF8(&input1, "abcdef this is a test", -1, &status);
1616 utext_openUTF8(&input2, "not abc", -1, &status);
1617 utext_openUChars(&empty, NULL, 0, &status);
1618
1619 int32_t input1Len = strlen("abcdef this is a test");
1620 int32_t input2Len = strlen("not abc");
1621
1622
1623 //
1624 // Matcher creation and reset.
1625 //
1626 RegexMatcher *m1 = pat2->matcher(&input1, RegexPattern::PATTERN_IS_UTEXT, status);
1627 REGEX_CHECK_STATUS;
1628 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1629 REGEX_ASSERT_UTEXT("abcdef this is a test", m1->inputText());
1630 m1->reset(&input2);
1631 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1632 REGEX_ASSERT_UTEXT("not abc", m1->inputText());
1633 m1->reset(&input1);
1634 REGEX_ASSERT_UTEXT("abcdef this is a test", m1->inputText());
1635 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1636 m1->reset(&empty);
1637 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1638 REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1639
1640 //
1641 // reset(pos, status)
1642 //
1643 m1->reset(&input1);
1644 m1->reset(4, status);
1645 REGEX_CHECK_STATUS;
1646 REGEX_ASSERT_UTEXT("abcdef this is a test", m1->inputText());
1647 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1648
1649 m1->reset(-1, status);
1650 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1651 status = U_ZERO_ERROR;
1652
1653 m1->reset(0, status);
1654 REGEX_CHECK_STATUS;
1655 status = U_ZERO_ERROR;
1656
1657 m1->reset(input1Len-1, status);
1658 REGEX_CHECK_STATUS;
1659 status = U_ZERO_ERROR;
1660
1661 m1->reset(input1Len, status);
1662 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1663 status = U_ZERO_ERROR;
1664
1665 //
1666 // match(pos, status)
1667 //
1668 m1->reset(&input2);
1669 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1670 m1->reset();
1671 REGEX_ASSERT(m1->matches(3, status) == FALSE);
1672 m1->reset();
1673 REGEX_ASSERT(m1->matches(5, status) == FALSE);
1674 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1675 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1676 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1677
1678 // Match() at end of string should fail, but should not
1679 // be an error.
1680 status = U_ZERO_ERROR;
1681 REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1682 REGEX_CHECK_STATUS;
1683
1684 // Match beyond end of string should fail with an error.
1685 status = U_ZERO_ERROR;
1686 REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1687 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1688
1689 // Successful match at end of string.
1690 {
1691 status = U_ZERO_ERROR;
1692 RegexMatcher m("A?", 0, status); // will match zero length string.
1693 REGEX_CHECK_STATUS;
1694 m.reset(&input1);
1695 REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1696 REGEX_CHECK_STATUS;
1697 m.reset(&empty);
1698 REGEX_ASSERT(m.matches(0, status) == TRUE);
1699 REGEX_CHECK_STATUS;
1700 }
1701
1702
1703 //
1704 // lookingAt(pos, status)
1705 //
1706 status = U_ZERO_ERROR;
1707 m1->reset(&input2); // "not abc"
1708 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1709 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1710 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1711 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1712 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1713 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1714 status = U_ZERO_ERROR;
1715 REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1716 REGEX_CHECK_STATUS;
1717 REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1718 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1719
1720 delete m1;
1721 delete pat2;
1722
1723 utext_close(&re);
1724 utext_close(&input1);
1725 utext_close(&input2);
1726 utext_close(&empty);
1727 }
1728
1729
1730 //
1731 // Capture Group.
1732 // RegexMatcher::start();
1733 // RegexMatcher::end();
1734 // RegexMatcher::groupCount();
1735 //
1736 {
1737 int32_t flags=0;
1738 UParseError pe;
1739 UErrorCode status=U_ZERO_ERROR;
1740 UText re=UTEXT_INITIALIZER;
1741 utext_openUTF8(&re, "01(23(45)67)(.*)", -1, &status);
1742
1743 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1744 REGEX_CHECK_STATUS;
1745
1746 UText input = UTEXT_INITIALIZER;
1747 utext_openUTF8(&input, "0123456789", -1, &status);
1748
1749 RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status);
1750 REGEX_CHECK_STATUS;
1751 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
1752 static const int32_t matchStarts[] = {0, 2, 4, 8};
1753 static const int32_t matchEnds[] = {10, 8, 6, 10};
1754 int32_t i;
1755 for (i=0; i<4; i++) {
1756 int32_t actualStart = matcher->start(i, status);
1757 REGEX_CHECK_STATUS;
1758 if (actualStart != matchStarts[i]) {
1759 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
1760 __LINE__, i, matchStarts[i], actualStart);
1761 }
1762 int32_t actualEnd = matcher->end(i, status);
1763 REGEX_CHECK_STATUS;
1764 if (actualEnd != matchEnds[i]) {
1765 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
1766 __LINE__, i, matchEnds[i], actualEnd);
1767 }
1768 }
1769
1770 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
1771 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
1772
1773 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1774 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
1775 matcher->reset();
1776 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
1777
1778 matcher->lookingAt(status);
1779
1780 UnicodeString dest;
1781 UText destText = UTEXT_INITIALIZER;
1782 utext_openUnicodeString(&destText, &dest, &status);
1783 UText *result;
1784
1785 result = matcher->group((UText *)NULL, RegexMatcher::MATCHER_DEST_IS_UTEXT, status);
1786 REGEX_CHECK_STATUS;
1787 REGEX_ASSERT_UTEXT("0123456789", result);
1788 utext_close(result);
1789 result = matcher->group(&destText, RegexMatcher::MATCHER_DEST_IS_UTEXT, status);
1790 REGEX_CHECK_STATUS;
1791 REGEX_ASSERT(result == &destText);
1792 REGEX_ASSERT_UTEXT("0123456789", result);
1793
1794 result = matcher->group(0, NULL, status);
1795 REGEX_CHECK_STATUS;
1796 REGEX_ASSERT_UTEXT("0123456789", result);
1797 utext_close(result);
1798 result = matcher->group(0, &destText, status);
1799 REGEX_CHECK_STATUS;
1800 REGEX_ASSERT(result == &destText);
1801 REGEX_ASSERT_UTEXT("0123456789", result);
1802
1803 result = matcher->group(1, NULL, status);
1804 REGEX_CHECK_STATUS;
1805 REGEX_ASSERT_UTEXT("234567", result);
1806 utext_close(result);
1807 result = matcher->group(1, &destText, status);
1808 REGEX_CHECK_STATUS;
1809 REGEX_ASSERT(result == &destText);
1810 REGEX_ASSERT_UTEXT("234567", result);
1811
1812 result = matcher->group(2, NULL, status);
1813 REGEX_CHECK_STATUS;
1814 REGEX_ASSERT_UTEXT("45", result);
1815 utext_close(result);
1816 result = matcher->group(2, &destText, status);
1817 REGEX_CHECK_STATUS;
1818 REGEX_ASSERT(result == &destText);
1819 REGEX_ASSERT_UTEXT("45", result);
1820
1821 result = matcher->group(3, NULL, status);
1822 REGEX_CHECK_STATUS;
1823 REGEX_ASSERT_UTEXT("89", result);
1824 utext_close(result);
1825 result = matcher->group(3, &destText, status);
1826 REGEX_CHECK_STATUS;
1827 REGEX_ASSERT(result == &destText);
1828 REGEX_ASSERT_UTEXT("89", result);
1829
1830 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1831 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
1832 matcher->reset();
1833 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
1834
1835 delete matcher;
1836 delete pat;
1837
1838 utext_close(&destText);
1839 utext_close(&input);
1840 utext_close(&re);
1841 }
1842
1843 //
1844 // find
1845 //
1846 {
1847 int32_t flags=0;
1848 UParseError pe;
1849 UErrorCode status=U_ZERO_ERROR;
1850 UText re=UTEXT_INITIALIZER;
1851 utext_openUTF8(&re, "abc", -1, &status);
1852
1853 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1854 REGEX_CHECK_STATUS;
1855 UText input = UTEXT_INITIALIZER;
1856 utext_openUTF8(&input, ".abc..abc...abc..", -1, &status);
1857 // 012345678901234567
1858
1859 RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status);
1860 REGEX_CHECK_STATUS;
1861 REGEX_ASSERT(matcher->find());
1862 REGEX_ASSERT(matcher->start(status) == 1);
1863 REGEX_ASSERT(matcher->find());
1864 REGEX_ASSERT(matcher->start(status) == 6);
1865 REGEX_ASSERT(matcher->find());
1866 REGEX_ASSERT(matcher->start(status) == 12);
1867 REGEX_ASSERT(matcher->find() == FALSE);
1868 REGEX_ASSERT(matcher->find() == FALSE);
1869
1870 matcher->reset();
1871 REGEX_ASSERT(matcher->find());
1872 REGEX_ASSERT(matcher->start(status) == 1);
1873
1874 REGEX_ASSERT(matcher->find(0, status));
1875 REGEX_ASSERT(matcher->start(status) == 1);
1876 REGEX_ASSERT(matcher->find(1, status));
1877 REGEX_ASSERT(matcher->start(status) == 1);
1878 REGEX_ASSERT(matcher->find(2, status));
1879 REGEX_ASSERT(matcher->start(status) == 6);
1880 REGEX_ASSERT(matcher->find(12, status));
1881 REGEX_ASSERT(matcher->start(status) == 12);
1882 REGEX_ASSERT(matcher->find(13, status) == FALSE);
1883 REGEX_ASSERT(matcher->find(16, status) == FALSE);
1884 REGEX_ASSERT(matcher->find(17, status) == FALSE);
1885 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
1886
1887 status = U_ZERO_ERROR;
1888 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1889 status = U_ZERO_ERROR;
1890 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
1891
1892 REGEX_ASSERT(matcher->groupCount() == 0);
1893
1894 delete matcher;
1895 delete pat;
1896
1897 utext_close(&input);
1898 utext_close(&re);
1899 }
1900
1901
1902 //
1903 // find, with \G in pattern (true if at the end of a previous match).
1904 //
1905 {
1906 int32_t flags=0;
1907 UParseError pe;
1908 UErrorCode status=U_ZERO_ERROR;
1909 UText re=UTEXT_INITIALIZER;
1910 utext_openUTF8(&re, ".*?(?:(\\Gabc)|(abc))", -1, &status);
1911
1912 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1913
1914 REGEX_CHECK_STATUS;
1915 UText input = UTEXT_INITIALIZER;
1916 utext_openUTF8(&input, ".abcabc.abc..", -1, &status);
1917 // 012345678901234567
1918
1919 RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status);
1920 REGEX_CHECK_STATUS;
1921 REGEX_ASSERT(matcher->find());
1922 REGEX_ASSERT(matcher->start(status) == 0);
1923 REGEX_ASSERT(matcher->start(1, status) == -1);
1924 REGEX_ASSERT(matcher->start(2, status) == 1);
1925
1926 REGEX_ASSERT(matcher->find());
1927 REGEX_ASSERT(matcher->start(status) == 4);
1928 REGEX_ASSERT(matcher->start(1, status) == 4);
1929 REGEX_ASSERT(matcher->start(2, status) == -1);
1930 REGEX_CHECK_STATUS;
1931
1932 delete matcher;
1933 delete pat;
1934
1935 utext_close(&input);
1936 utext_close(&re);
1937 }
1938
1939 //
1940 // find with zero length matches, match position should bump ahead
1941 // to prevent loops.
1942 //
1943 {
1944 int32_t i;
1945 UErrorCode status=U_ZERO_ERROR;
1946 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
1947 // using an always-true look-ahead.
1948 REGEX_CHECK_STATUS;
1949 UText s = UTEXT_INITIALIZER;
1950 utext_openUTF8(&s, " ", -1, &status);
1951 m.reset(&s);
1952 for (i=0; ; i++) {
1953 if (m.find() == FALSE) {
1954 break;
1955 }
1956 REGEX_ASSERT(m.start(status) == i);
1957 REGEX_ASSERT(m.end(status) == i);
1958 }
1959 REGEX_ASSERT(i==5);
1960
1961 // Check that the bump goes over characters outside the BMP OK
1962 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
1963 unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
1964 utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
1965 m.reset(&s);
1966 for (i=0; ; i+=2) {
1967 if (m.find() == FALSE) {
1968 break;
1969 }
1970 REGEX_ASSERT(m.start(status) == i);
1971 REGEX_ASSERT(m.end(status) == i);
1972 }
1973 REGEX_ASSERT(i==10);
1974
1975 utext_close(&s);
1976 }
1977 {
1978 // find() loop breaking test.
1979 // with pattern of /.?/, should see a series of one char matches, then a single
1980 // match of zero length at the end of the input string.
1981 int32_t i;
1982 UErrorCode status=U_ZERO_ERROR;
1983 RegexMatcher m(".?", 0, status);
1984 REGEX_CHECK_STATUS;
1985 UText s = UTEXT_INITIALIZER;
1986 utext_openUTF8(&s, " ", -1, &status);
1987 m.reset(&s);
1988 for (i=0; ; i++) {
1989 if (m.find() == FALSE) {
1990 break;
1991 }
1992 REGEX_ASSERT(m.start(status) == i);
1993 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1994 }
1995 REGEX_ASSERT(i==5);
1996
1997 utext_close(&s);
1998 }
1999
2000
2001 //
2002 // Matchers with no input string behave as if they had an empty input string.
2003 //
2004
2005 {
2006 UErrorCode status = U_ZERO_ERROR;
2007 RegexMatcher m(".?", 0, status);
2008 REGEX_CHECK_STATUS;
2009 REGEX_ASSERT(m.find());
2010 REGEX_ASSERT(m.start(status) == 0);
2011 REGEX_ASSERT(m.input() == "");
2012 }
2013 {
2014 UErrorCode status = U_ZERO_ERROR;
2015 RegexPattern *p = RegexPattern::compile(".", 0, status);
2016 RegexMatcher *m = p->matcher(status);
2017 REGEX_CHECK_STATUS;
2018
2019 REGEX_ASSERT(m->find() == FALSE);
2020 REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2021 delete m;
2022 delete p;
2023 }
2024
2025 //
2026 // Regions
2027 //
2028 {
2029 UErrorCode status = U_ZERO_ERROR;
2030 UText testPattern = UTEXT_INITIALIZER;
2031 UText testText = UTEXT_INITIALIZER;
2032 utext_openUTF8(&testPattern, ".*", -1, &status);
2033 utext_openUTF8(&testText, "This is test data", -1, &status);
2034
2035 RegexMatcher m(&testPattern, &testText, 0, status);
2036 REGEX_CHECK_STATUS;
2037 REGEX_ASSERT(m.regionStart() == 0);
2038 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2039 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2040 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2041
2042 m.region(2,4, status);
2043 REGEX_CHECK_STATUS;
2044 REGEX_ASSERT(m.matches(status));
2045 REGEX_ASSERT(m.start(status)==2);
2046 REGEX_ASSERT(m.end(status)==4);
2047 REGEX_CHECK_STATUS;
2048
2049 m.reset();
2050 REGEX_ASSERT(m.regionStart() == 0);
2051 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2052
2053 utext_openUTF8(&testText, "short", -1, &status);
2054 m.reset(&testText);
2055 REGEX_ASSERT(m.regionStart() == 0);
2056 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2057
2058 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2059 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2060 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2061 REGEX_ASSERT(&m == &m.reset());
2062 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2063
2064 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2065 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2066 REGEX_ASSERT(&m == &m.reset());
2067 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2068
2069 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2070 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2071 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2072 REGEX_ASSERT(&m == &m.reset());
2073 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2074
2075 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2076 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2077 REGEX_ASSERT(&m == &m.reset());
2078 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2079
2080 utext_close(&testText);
2081 utext_close(&testPattern);
2082 }
2083
2084 //
2085 // hitEnd() and requireEnd()
2086 //
2087 {
2088 UErrorCode status = U_ZERO_ERROR;
2089 UText testPattern = UTEXT_INITIALIZER;
2090 UText testText = UTEXT_INITIALIZER;
2091 utext_openUTF8(&testPattern, ".*", -1, &status);
2092 utext_openUTF8(&testText, "aabb", -1, &status);
2093
2094 RegexMatcher m1(&testPattern, &testText, 0, status);
2095 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2096 REGEX_ASSERT(m1.hitEnd() == TRUE);
2097 REGEX_ASSERT(m1.requireEnd() == FALSE);
2098 REGEX_CHECK_STATUS;
2099
2100 status = U_ZERO_ERROR;
2101 utext_openUTF8(&testPattern, "a*", -1, &status);
2102 RegexMatcher m2(&testPattern, &testText, 0, status);
2103 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2104 REGEX_ASSERT(m2.hitEnd() == FALSE);
2105 REGEX_ASSERT(m2.requireEnd() == FALSE);
2106 REGEX_CHECK_STATUS;
2107
2108 status = U_ZERO_ERROR;
2109 utext_openUTF8(&testPattern, ".*$", -1, &status);
2110 RegexMatcher m3(&testPattern, &testText, 0, status);
2111 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2112 REGEX_ASSERT(m3.hitEnd() == TRUE);
2113 REGEX_ASSERT(m3.requireEnd() == TRUE);
2114 REGEX_CHECK_STATUS;
2115
2116 utext_close(&testText);
2117 utext_close(&testPattern);
2118 }
2119 }
2120
2121
2122 //---------------------------------------------------------------------------
2123 //
2124 // API_Replace_UTF8 API test for class RegexMatcher, testing the
2125 // Replace family of functions.
2126 //
2127 //---------------------------------------------------------------------------
API_Replace_UTF8()2128 void RegexTest::API_Replace_UTF8() {
2129 //
2130 // Replace
2131 //
2132 int32_t flags=0;
2133 UParseError pe;
2134 UErrorCode status=U_ZERO_ERROR;
2135
2136 UText re=UTEXT_INITIALIZER;
2137 utext_openUTF8(&re, "abc", -1, &status);
2138 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2139 REGEX_CHECK_STATUS;
2140
2141 char data[] = ".abc..abc...abc..";
2142 // 012345678901234567
2143 UText dataText = UTEXT_INITIALIZER;
2144 utext_openUTF8(&dataText, data, -1, &status);
2145 RegexMatcher *matcher = pat->matcher(&dataText, RegexPattern::PATTERN_IS_UTEXT, status);
2146
2147 //
2148 // Plain vanilla matches.
2149 //
2150 UnicodeString dest;
2151 UText destText = UTEXT_INITIALIZER;
2152 utext_openUnicodeString(&destText, &dest, &status);
2153 UText *result;
2154
2155 UText replText = UTEXT_INITIALIZER;
2156
2157 utext_openUTF8(&replText, "yz", -1, &status);
2158 result = matcher->replaceFirst(&replText, NULL, status);
2159 REGEX_CHECK_STATUS;
2160 REGEX_ASSERT_UTEXT(".yz..abc...abc..", result);
2161 utext_close(result);
2162 result = matcher->replaceFirst(&replText, &destText, status);
2163 REGEX_CHECK_STATUS;
2164 REGEX_ASSERT(result == &destText);
2165 REGEX_ASSERT_UTEXT(".yz..abc...abc..", result);
2166
2167 result = matcher->replaceAll(&replText, NULL, status);
2168 REGEX_CHECK_STATUS;
2169 REGEX_ASSERT_UTEXT(".yz..yz...yz..", result);
2170 utext_close(result);
2171
2172 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2173 result = matcher->replaceAll(&replText, &destText, status);
2174 REGEX_CHECK_STATUS;
2175 REGEX_ASSERT(result == &destText);
2176 REGEX_ASSERT_UTEXT(".yz..yz...yz..", result);
2177
2178 //
2179 // Plain vanilla non-matches.
2180 //
2181 utext_openUTF8(&dataText, ".abx..abx...abx..", -1, &status);
2182 matcher->reset(&dataText);
2183
2184 result = matcher->replaceFirst(&replText, NULL, status);
2185 REGEX_CHECK_STATUS;
2186 REGEX_ASSERT_UTEXT(".abx..abx...abx..", result);
2187 utext_close(result);
2188 result = matcher->replaceFirst(&replText, &destText, status);
2189 REGEX_CHECK_STATUS;
2190 REGEX_ASSERT(result == &destText);
2191 REGEX_ASSERT_UTEXT(".abx..abx...abx..", result);
2192
2193 result = matcher->replaceAll(&replText, NULL, status);
2194 REGEX_CHECK_STATUS;
2195 REGEX_ASSERT_UTEXT(".abx..abx...abx..", result);
2196 utext_close(result);
2197 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2198 result = matcher->replaceAll(&replText, &destText, status);
2199 REGEX_CHECK_STATUS;
2200 REGEX_ASSERT(result == &destText);
2201 REGEX_ASSERT_UTEXT(".abx..abx...abx..", result);
2202
2203 //
2204 // Empty source string
2205 //
2206 utext_openUTF8(&dataText, NULL, 0, &status);
2207 matcher->reset(&dataText);
2208
2209 result = matcher->replaceFirst(&replText, NULL, status);
2210 REGEX_CHECK_STATUS;
2211 REGEX_ASSERT_UTEXT("", result);
2212 utext_close(result);
2213 result = matcher->replaceFirst(&replText, &destText, status);
2214 REGEX_CHECK_STATUS;
2215 REGEX_ASSERT(result == &destText);
2216 REGEX_ASSERT_UTEXT("", result);
2217
2218 result = matcher->replaceAll(&replText, NULL, status);
2219 REGEX_CHECK_STATUS;
2220 REGEX_ASSERT_UTEXT("", result);
2221 utext_close(result);
2222 result = matcher->replaceAll(&replText, &destText, status);
2223 REGEX_CHECK_STATUS;
2224 REGEX_ASSERT(result == &destText);
2225 REGEX_ASSERT_UTEXT("", result);
2226
2227 //
2228 // Empty substitution string
2229 //
2230 utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2231 matcher->reset(&dataText);
2232
2233 utext_openUTF8(&replText, NULL, 0, &status);
2234 result = matcher->replaceFirst(&replText, NULL, status);
2235 REGEX_CHECK_STATUS;
2236 REGEX_ASSERT_UTEXT("...abc...abc..", result);
2237 utext_close(result);
2238 result = matcher->replaceFirst(&replText, &destText, status);
2239 REGEX_CHECK_STATUS;
2240 REGEX_ASSERT(result == &destText);
2241 REGEX_ASSERT_UTEXT("...abc...abc..", result);
2242
2243 result = matcher->replaceAll(&replText, NULL, status);
2244 REGEX_CHECK_STATUS;
2245 REGEX_ASSERT_UTEXT("........", result);
2246 utext_close(result);
2247 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2248 result = matcher->replaceAll(&replText, &destText, status);
2249 REGEX_CHECK_STATUS;
2250 REGEX_ASSERT(result == &destText);
2251 REGEX_ASSERT_UTEXT("........", result);
2252
2253 //
2254 // match whole string
2255 //
2256 utext_openUTF8(&dataText, "abc", -1, &status);
2257 matcher->reset(&dataText);
2258
2259 utext_openUTF8(&replText, "xyz", -1, &status);
2260 result = matcher->replaceFirst(&replText, NULL, status);
2261 REGEX_CHECK_STATUS;
2262 REGEX_ASSERT_UTEXT("xyz", result);
2263 utext_close(result);
2264 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2265 result = matcher->replaceFirst(&replText, &destText, status);
2266 REGEX_CHECK_STATUS;
2267 REGEX_ASSERT(result == &destText);
2268 REGEX_ASSERT_UTEXT("xyz", result);
2269
2270 result = matcher->replaceAll(&replText, NULL, status);
2271 REGEX_CHECK_STATUS;
2272 REGEX_ASSERT_UTEXT("xyz", result);
2273 utext_close(result);
2274 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2275 result = matcher->replaceAll(&replText, &destText, status);
2276 REGEX_CHECK_STATUS;
2277 REGEX_ASSERT(result == &destText);
2278 REGEX_ASSERT_UTEXT("xyz", result);
2279
2280 //
2281 // Capture Group, simple case
2282 //
2283 utext_openUTF8(&re, "a(..)", -1, &status);
2284 RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2285 REGEX_CHECK_STATUS;
2286
2287 utext_openUTF8(&dataText, "abcdefg", -1, &status);
2288 RegexMatcher *matcher2 = pat2->matcher(&dataText, RegexPattern::PATTERN_IS_UTEXT, status);
2289 REGEX_CHECK_STATUS;
2290
2291 utext_openUTF8(&replText, "$1$1", -1, &status);
2292 result = matcher2->replaceFirst(&replText, NULL, status);
2293 REGEX_CHECK_STATUS;
2294 REGEX_ASSERT_UTEXT("bcbcdefg", result);
2295 utext_close(result);
2296 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2297 result = matcher2->replaceFirst(&replText, &destText, status);
2298 REGEX_CHECK_STATUS;
2299 REGEX_ASSERT(result == &destText);
2300 REGEX_ASSERT_UTEXT("bcbcdefg", result);
2301
2302 utext_openUTF8(&replText, "The value of \\$1 is $1.", -1, &status);
2303 result = matcher2->replaceFirst(&replText, NULL, status);
2304 REGEX_CHECK_STATUS;
2305 REGEX_ASSERT_UTEXT("The value of $1 is bc.defg", result);
2306 utext_close(result);
2307 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2308 result = matcher2->replaceFirst(&replText, &destText, status);
2309 REGEX_CHECK_STATUS;
2310 REGEX_ASSERT(result == &destText);
2311 REGEX_ASSERT_UTEXT("The value of $1 is bc.defg", result);
2312
2313 utext_openUTF8(&replText, "$ by itself, no group number $$$", -1, &status);
2314 result = matcher2->replaceFirst(&replText, NULL, status);
2315 REGEX_CHECK_STATUS;
2316 REGEX_ASSERT_UTEXT("$ by itself, no group number $$$defg", result);
2317 utext_close(result);
2318 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2319 result = matcher2->replaceFirst(&replText, &destText, status);
2320 REGEX_CHECK_STATUS;
2321 REGEX_ASSERT(result == &destText);
2322 REGEX_ASSERT_UTEXT("$ by itself, no group number $$$defg", result);
2323
2324 unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2325 // 012345678901234567890123456
2326 supplDigitChars[22] = 0xF0;
2327 supplDigitChars[23] = 0x9D;
2328 supplDigitChars[24] = 0x9F;
2329 supplDigitChars[25] = 0x8F;
2330 utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2331
2332 result = matcher2->replaceFirst(&replText, NULL, status);
2333 REGEX_CHECK_STATUS;
2334 REGEX_ASSERT_UTEXT("Supplemental Digit 1 bc.defg", result);
2335 utext_close(result);
2336 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2337 result = matcher2->replaceFirst(&replText, &destText, status);
2338 REGEX_CHECK_STATUS;
2339 REGEX_ASSERT(result == &destText);
2340 REGEX_ASSERT_UTEXT("Supplemental Digit 1 bc.defg", result);
2341
2342 utext_openUTF8(&replText, "bad capture group number $5...", -1, &status);
2343 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2344 // REGEX_ASSERT_UTEXT("abcdefg", result);
2345 utext_close(result);
2346 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2347 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2348 REGEX_ASSERT(result == &destText);
2349 // REGEX_ASSERT_UTEXT("abcdefg", result);
2350
2351 //
2352 // Replacement String with \u hex escapes
2353 //
2354 {
2355 utext_openUTF8(&dataText, "abc 1 abc 2 abc 3", -1, &status);
2356 utext_openUTF8(&replText, "--\\u0043--", -1, &status);
2357 matcher->reset(&dataText);
2358
2359 result = matcher->replaceAll(&replText, NULL, status);
2360 REGEX_CHECK_STATUS;
2361 REGEX_ASSERT_UTEXT("--C-- 1 --C-- 2 --C-- 3", result);
2362 utext_close(result);
2363 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2364 result = matcher->replaceAll(&replText, &destText, status);
2365 REGEX_CHECK_STATUS;
2366 REGEX_ASSERT(result == &destText);
2367 REGEX_ASSERT_UTEXT("--C-- 1 --C-- 2 --C-- 3", result);
2368 }
2369 {
2370 utext_openUTF8(&dataText, "abc !", -1, &status);
2371 utext_openUTF8(&replText, "--\\U00010000--", -1, &status);
2372 matcher->reset(&dataText);
2373
2374 unsigned char expected[] = "--xxxx-- !"; // \U00010000, "LINEAR B SYLLABLE B008 A"
2375 // 0123456789
2376 expected[2] = 0xF0;
2377 expected[3] = 0x90;
2378 expected[4] = 0x80;
2379 expected[5] = 0x80;
2380
2381 result = matcher->replaceAll(&replText, NULL, status);
2382 REGEX_CHECK_STATUS;
2383 REGEX_ASSERT_UTEXT((char *)expected, result);
2384 utext_close(result);
2385 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2386 result = matcher->replaceAll(&replText, &destText, status);
2387 REGEX_CHECK_STATUS;
2388 REGEX_ASSERT(result == &destText);
2389 REGEX_ASSERT_UTEXT((char *)expected, result);
2390 }
2391 // TODO: need more through testing of capture substitutions.
2392
2393 // Bug 4057
2394 //
2395 {
2396 status = U_ZERO_ERROR;
2397 utext_openUTF8(&re, "ss(.*?)ee", -1, &status);
2398 utext_openUTF8(&dataText, "The matches start with ss and end with ee ss stuff ee fin", -1, &status);
2399 utext_openUTF8(&replText, "ooh", -1, &status);
2400
2401 RegexMatcher m(&re, 0, status);
2402 REGEX_CHECK_STATUS;
2403
2404 UnicodeString result;
2405 UText resultText = UTEXT_INITIALIZER;
2406 utext_openUnicodeString(&resultText, &result, &status);
2407
2408 // Multiple finds do NOT bump up the previous appendReplacement postion.
2409 m.reset(&dataText);
2410 m.find();
2411 m.find();
2412 m.appendReplacement(&resultText, &replText, status);
2413 REGEX_CHECK_STATUS;
2414 REGEX_ASSERT_UTEXT("The matches start with ss and end with ee ooh", &resultText);
2415
2416 // After a reset into the interior of a string, appendReplacement still starts at beginning.
2417 status = U_ZERO_ERROR;
2418 result.truncate(0);
2419 utext_openUnicodeString(&resultText, &result, &status);
2420 m.reset(10, status);
2421 m.find();
2422 m.find();
2423 m.appendReplacement(&resultText, &replText, status);
2424 REGEX_CHECK_STATUS;
2425 REGEX_ASSERT_UTEXT("The matches start with ss and end with ee ooh", &resultText);
2426
2427 // find() at interior of string, appendReplacement still starts at beginning.
2428 status = U_ZERO_ERROR;
2429 result.truncate(0);
2430 utext_openUnicodeString(&resultText, &result, &status);
2431 m.reset();
2432 m.find(10, status);
2433 m.find();
2434 m.appendReplacement(&resultText, &replText, status);
2435 REGEX_CHECK_STATUS;
2436 REGEX_ASSERT_UTEXT("The matches start with ss and end with ee ooh", &resultText);
2437
2438 m.appendTail(&resultText);
2439 REGEX_ASSERT_UTEXT("The matches start with ss and end with ee ooh fin", &resultText);
2440
2441 utext_close(&resultText);
2442 }
2443
2444 delete matcher2;
2445 delete pat2;
2446 delete matcher;
2447 delete pat;
2448
2449 utext_close(&dataText);
2450 utext_close(&replText);
2451 utext_close(&destText);
2452 utext_close(&re);
2453 }
2454
2455
2456 //---------------------------------------------------------------------------
2457 //
2458 // API_Pattern_UTF8 Test that the API for class RegexPattern is
2459 // present and nominally working.
2460 //
2461 //---------------------------------------------------------------------------
API_Pattern_UTF8()2462 void RegexTest::API_Pattern_UTF8() {
2463 RegexPattern pata; // Test default constructor to not crash.
2464 RegexPattern patb;
2465
2466 REGEX_ASSERT(pata == patb);
2467 REGEX_ASSERT(pata == pata);
2468
2469 UText re1 = UTEXT_INITIALIZER;
2470 UText re2 = UTEXT_INITIALIZER;
2471 UErrorCode status = U_ZERO_ERROR;
2472 UParseError pe;
2473
2474 utext_openUTF8(&re1, "abc[a-l][m-z]", -1, &status);
2475 utext_openUTF8(&re2, "def", -1, &status);
2476
2477 RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2478 RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2479 REGEX_CHECK_STATUS;
2480 REGEX_ASSERT(*pat1 == *pat1);
2481 REGEX_ASSERT(*pat1 != pata);
2482
2483 // Assign
2484 patb = *pat1;
2485 REGEX_ASSERT(patb == *pat1);
2486
2487 // Copy Construct
2488 RegexPattern patc(*pat1);
2489 REGEX_ASSERT(patc == *pat1);
2490 REGEX_ASSERT(patb == patc);
2491 REGEX_ASSERT(pat1 != pat2);
2492 patb = *pat2;
2493 REGEX_ASSERT(patb != patc);
2494 REGEX_ASSERT(patb == *pat2);
2495
2496 // Compile with no flags.
2497 RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status);
2498 REGEX_ASSERT(*pat1a == *pat1);
2499
2500 REGEX_ASSERT(pat1a->flags() == 0);
2501
2502 // Compile with different flags should be not equal
2503 RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2504 REGEX_CHECK_STATUS;
2505
2506 REGEX_ASSERT(*pat1b != *pat1a);
2507 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2508 REGEX_ASSERT(pat1a->flags() == 0);
2509 delete pat1b;
2510
2511 // clone
2512 RegexPattern *pat1c = pat1->clone();
2513 REGEX_ASSERT(*pat1c == *pat1);
2514 REGEX_ASSERT(*pat1c != *pat2);
2515
2516 delete pat1c;
2517 delete pat1a;
2518 delete pat1;
2519 delete pat2;
2520
2521 utext_close(&re1);
2522 utext_close(&re2);
2523
2524
2525 //
2526 // Verify that a matcher created from a cloned pattern works.
2527 // (Jitterbug 3423)
2528 //
2529 {
2530 UErrorCode status = U_ZERO_ERROR;
2531 UText pattern = UTEXT_INITIALIZER;
2532 utext_openUTF8(&pattern, "\\p{L}+", -1, &status);
2533
2534 RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status);
2535 RegexPattern *pClone = pSource->clone();
2536 delete pSource;
2537 RegexMatcher *mFromClone = pClone->matcher(status);
2538 REGEX_CHECK_STATUS;
2539
2540 UText input = UTEXT_INITIALIZER;
2541 utext_openUTF8(&input, "Hello World", -1, &status);
2542 mFromClone->reset(&input);
2543 REGEX_ASSERT(mFromClone->find() == TRUE);
2544 REGEX_ASSERT(mFromClone->group(status) == "Hello");
2545 REGEX_ASSERT(mFromClone->find() == TRUE);
2546 REGEX_ASSERT(mFromClone->group(status) == "World");
2547 REGEX_ASSERT(mFromClone->find() == FALSE);
2548 delete mFromClone;
2549 delete pClone;
2550
2551 utext_close(&input);
2552 utext_close(&pattern);
2553 }
2554
2555 //
2556 // matches convenience API
2557 //
2558 {
2559 UErrorCode status = U_ZERO_ERROR;
2560 UText pattern = UTEXT_INITIALIZER;
2561 UText input = UTEXT_INITIALIZER;
2562
2563 utext_openUTF8(&input, "random input", -1, &status);
2564
2565 utext_openUTF8(&pattern, ".*", -1, &status);
2566 REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2567 REGEX_CHECK_STATUS;
2568
2569 utext_openUTF8(&pattern, "abc", -1, &status);
2570 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2571 REGEX_CHECK_STATUS;
2572
2573 utext_openUTF8(&pattern, ".*nput", -1, &status);
2574 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2575 REGEX_CHECK_STATUS;
2576
2577 utext_openUTF8(&pattern, "random input", -1, &status);
2578 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2579 REGEX_CHECK_STATUS;
2580
2581 utext_openUTF8(&pattern, ".*u", -1, &status);
2582 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2583 REGEX_CHECK_STATUS;
2584
2585 utext_openUTF8(&input, "abc", -1, &status);
2586 utext_openUTF8(&pattern, "abc", -1, &status);
2587 status = U_INDEX_OUTOFBOUNDS_ERROR;
2588 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2589 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2590
2591 utext_close(&input);
2592 utext_close(&pattern);
2593 }
2594
2595
2596 //
2597 // Split()
2598 //
2599 status = U_ZERO_ERROR;
2600 utext_openUTF8(&re1, " +", -1, &status);
2601 pat1 = RegexPattern::compile(&re1, pe, status);
2602 REGEX_CHECK_STATUS;
2603 UnicodeString fields[10];
2604
2605 int32_t n;
2606 n = pat1->split("Now is the time", fields, 10, status);
2607 REGEX_CHECK_STATUS;
2608 REGEX_ASSERT(n==4);
2609 REGEX_ASSERT(fields[0]=="Now");
2610 REGEX_ASSERT(fields[1]=="is");
2611 REGEX_ASSERT(fields[2]=="the");
2612 REGEX_ASSERT(fields[3]=="time");
2613 REGEX_ASSERT(fields[4]=="");
2614
2615 n = pat1->split("Now is the time", fields, 2, status);
2616 REGEX_CHECK_STATUS;
2617 REGEX_ASSERT(n==2);
2618 REGEX_ASSERT(fields[0]=="Now");
2619 REGEX_ASSERT(fields[1]=="is the time");
2620 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
2621
2622 fields[1] = "*";
2623 status = U_ZERO_ERROR;
2624 n = pat1->split("Now is the time", fields, 1, status);
2625 REGEX_CHECK_STATUS;
2626 REGEX_ASSERT(n==1);
2627 REGEX_ASSERT(fields[0]=="Now is the time");
2628 REGEX_ASSERT(fields[1]=="*");
2629 status = U_ZERO_ERROR;
2630
2631 n = pat1->split(" Now is the time ", fields, 10, status);
2632 REGEX_CHECK_STATUS;
2633 REGEX_ASSERT(n==5);
2634 REGEX_ASSERT(fields[0]=="");
2635 REGEX_ASSERT(fields[1]=="Now");
2636 REGEX_ASSERT(fields[2]=="is");
2637 REGEX_ASSERT(fields[3]=="the");
2638 REGEX_ASSERT(fields[4]=="time");
2639 REGEX_ASSERT(fields[5]=="");
2640
2641 n = pat1->split(" ", fields, 10, status);
2642 REGEX_CHECK_STATUS;
2643 REGEX_ASSERT(n==1);
2644 REGEX_ASSERT(fields[0]=="");
2645
2646 fields[0] = "foo";
2647 n = pat1->split("", fields, 10, status);
2648 REGEX_CHECK_STATUS;
2649 REGEX_ASSERT(n==0);
2650 REGEX_ASSERT(fields[0]=="foo");
2651
2652 delete pat1;
2653
2654 // split, with a pattern with (capture)
2655 utext_openUTF8(&re1, "<(\\w*)>", -1, &status);
2656 pat1 = RegexPattern::compile(&re1, pe, status);
2657 REGEX_CHECK_STATUS;
2658
2659 status = U_ZERO_ERROR;
2660 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
2661 REGEX_CHECK_STATUS;
2662 REGEX_ASSERT(n==6);
2663 REGEX_ASSERT(fields[0]=="");
2664 REGEX_ASSERT(fields[1]=="a");
2665 REGEX_ASSERT(fields[2]=="Now is ");
2666 REGEX_ASSERT(fields[3]=="b");
2667 REGEX_ASSERT(fields[4]=="the time");
2668 REGEX_ASSERT(fields[5]=="c");
2669 REGEX_ASSERT(fields[6]=="");
2670 REGEX_ASSERT(status==U_ZERO_ERROR);
2671
2672 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
2673 REGEX_CHECK_STATUS;
2674 REGEX_ASSERT(n==6);
2675 REGEX_ASSERT(fields[0]==" ");
2676 REGEX_ASSERT(fields[1]=="a");
2677 REGEX_ASSERT(fields[2]=="Now is ");
2678 REGEX_ASSERT(fields[3]=="b");
2679 REGEX_ASSERT(fields[4]=="the time");
2680 REGEX_ASSERT(fields[5]=="c");
2681 REGEX_ASSERT(fields[6]=="");
2682
2683 status = U_ZERO_ERROR;
2684 fields[6] = "foo";
2685 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
2686 REGEX_CHECK_STATUS;
2687 REGEX_ASSERT(n==6);
2688 REGEX_ASSERT(fields[0]==" ");
2689 REGEX_ASSERT(fields[1]=="a");
2690 REGEX_ASSERT(fields[2]=="Now is ");
2691 REGEX_ASSERT(fields[3]=="b");
2692 REGEX_ASSERT(fields[4]=="the time");
2693 REGEX_ASSERT(fields[5]=="c");
2694 REGEX_ASSERT(fields[6]=="foo");
2695
2696 status = U_ZERO_ERROR;
2697 fields[5] = "foo";
2698 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
2699 REGEX_CHECK_STATUS;
2700 REGEX_ASSERT(n==5);
2701 REGEX_ASSERT(fields[0]==" ");
2702 REGEX_ASSERT(fields[1]=="a");
2703 REGEX_ASSERT(fields[2]=="Now is ");
2704 REGEX_ASSERT(fields[3]=="b");
2705 REGEX_ASSERT(fields[4]=="the time<c>");
2706 REGEX_ASSERT(fields[5]=="foo");
2707
2708 status = U_ZERO_ERROR;
2709 fields[5] = "foo";
2710 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
2711 REGEX_CHECK_STATUS;
2712 REGEX_ASSERT(n==5);
2713 REGEX_ASSERT(fields[0]==" ");
2714 REGEX_ASSERT(fields[1]=="a");
2715 REGEX_ASSERT(fields[2]=="Now is ");
2716 REGEX_ASSERT(fields[3]=="b");
2717 REGEX_ASSERT(fields[4]=="the time");
2718 REGEX_ASSERT(fields[5]=="foo");
2719
2720 status = U_ZERO_ERROR;
2721 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
2722 REGEX_CHECK_STATUS;
2723 REGEX_ASSERT(n==4);
2724 REGEX_ASSERT(fields[0]==" ");
2725 REGEX_ASSERT(fields[1]=="a");
2726 REGEX_ASSERT(fields[2]=="Now is ");
2727 REGEX_ASSERT(fields[3]=="the time<c>");
2728 status = U_ZERO_ERROR;
2729 delete pat1;
2730
2731 utext_openUTF8(&re1, "([-,])", -1, &status);
2732 pat1 = RegexPattern::compile(&re1, pe, status);
2733 REGEX_CHECK_STATUS;
2734 n = pat1->split("1-10,20", fields, 10, status);
2735 REGEX_CHECK_STATUS;
2736 REGEX_ASSERT(n==5);
2737 REGEX_ASSERT(fields[0]=="1");
2738 REGEX_ASSERT(fields[1]=="-");
2739 REGEX_ASSERT(fields[2]=="10");
2740 REGEX_ASSERT(fields[3]==",");
2741 REGEX_ASSERT(fields[4]=="20");
2742 delete pat1;
2743
2744
2745 //
2746 // RegexPattern::pattern() and patternText()
2747 //
2748 pat1 = new RegexPattern();
2749 REGEX_ASSERT(pat1->pattern() == "");
2750 REGEX_ASSERT_UTEXT("", pat1->patternText());
2751 delete pat1;
2752
2753 utext_openUTF8(&re1, "(Hello, world)*", -1, &status);
2754 pat1 = RegexPattern::compile(&re1, pe, status);
2755 REGEX_CHECK_STATUS;
2756 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
2757 REGEX_ASSERT_UTEXT("(Hello, world)*", pat1->patternText());
2758 delete pat1;
2759
2760 utext_close(&re1);
2761 }
2762
2763
2764 //---------------------------------------------------------------------------
2765 //
2766 // Extended A more thorough check for features of regex patterns
2767 // The test cases are in a separate data file,
//                   source/test/testdata/regextst.txt
2769 // A description of the test data format is included in that file.
2770 //
2771 //---------------------------------------------------------------------------
2772
2773 const char *
getPath(char buffer[2048],const char * filename)2774 RegexTest::getPath(char buffer[2048], const char *filename) {
2775 UErrorCode status=U_ZERO_ERROR;
2776 const char *testDataDirectory = IntlTest::getSourceTestData(status);
2777 if (U_FAILURE(status)) {
2778 errln("ERROR: loadTestData() failed - %s", u_errorName(status));
2779 return NULL;
2780 }
2781
2782 strcpy(buffer, testDataDirectory);
2783 strcat(buffer, filename);
2784 return buffer;
2785 }
2786
Extended()2787 void RegexTest::Extended() {
2788 char tdd[2048];
2789 const char *srcPath;
2790 UErrorCode status = U_ZERO_ERROR;
2791 int32_t lineNum = 0;
2792
2793 //
2794 // Open and read the test data file.
2795 //
2796 srcPath=getPath(tdd, "regextst.txt");
2797 if(srcPath==NULL) {
2798 return; /* something went wrong, error already output */
2799 }
2800
2801 int32_t len;
2802 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
2803 if (U_FAILURE(status)) {
2804 return; /* something went wrong, error already output */
2805 }
2806
2807 //
2808 // Put the test data into a UnicodeString
2809 //
2810 UnicodeString testString(FALSE, testData, len);
2811
2812 RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
2813 RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
2814 RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
2815
2816 RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
2817 UnicodeString testPattern; // The pattern for test from the test file.
2818 UnicodeString testFlags; // the flags for a test.
2819 UnicodeString matchString; // The marked up string to be used as input
2820
2821 if (U_FAILURE(status)){
2822 dataerrln("Construct RegexMatcher() error.");
2823 delete [] testData;
2824 return;
2825 }
2826
2827 //
2828 // Loop over the test data file, once per line.
2829 //
2830 while (lineMat.find()) {
2831 lineNum++;
2832 if (U_FAILURE(status)) {
2833 errln("line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
2834 }
2835
2836 status = U_ZERO_ERROR;
2837 UnicodeString testLine = lineMat.group(1, status);
2838 if (testLine.length() == 0) {
2839 continue;
2840 }
2841
2842 //
2843 // Parse the test line. Skip blank and comment only lines.
2844 // Separate out the three main fields - pattern, flags, target.
2845 //
2846
2847 commentMat.reset(testLine);
2848 if (commentMat.lookingAt(status)) {
2849 // This line is a comment, or blank.
2850 continue;
2851 }
2852
2853 //
2854 // Pull out the pattern field, remove it from the test file line.
2855 //
2856 quotedStuffMat.reset(testLine);
2857 if (quotedStuffMat.lookingAt(status)) {
2858 testPattern = quotedStuffMat.group(2, status);
2859 testLine.remove(0, quotedStuffMat.end(0, status));
2860 } else {
2861 errln("Bad pattern (missing quotes?) at test file line %d", lineNum);
2862 continue;
2863 }
2864
2865
2866 //
2867 // Pull out the flags from the test file line.
2868 //
2869 flagsMat.reset(testLine);
2870 flagsMat.lookingAt(status); // Will always match, possibly an empty string.
2871 testFlags = flagsMat.group(1, status);
2872 if (flagsMat.group(2, status).length() > 0) {
2873 errln("Bad Match flag at line %d. Scanning %c\n",
2874 lineNum, flagsMat.group(2, status).charAt(0));
2875 continue;
2876 }
2877 testLine.remove(0, flagsMat.end(0, status));
2878
2879 //
2880 // Pull out the match string, as a whole.
2881 // We'll process the <tags> later.
2882 //
2883 quotedStuffMat.reset(testLine);
2884 if (quotedStuffMat.lookingAt(status)) {
2885 matchString = quotedStuffMat.group(2, status);
2886 testLine.remove(0, quotedStuffMat.end(0, status));
2887 } else {
2888 errln("Bad match string at test file line %d", lineNum);
2889 continue;
2890 }
2891
2892 //
2893 // The only thing left from the input line should be an optional trailing comment.
2894 //
2895 commentMat.reset(testLine);
2896 if (commentMat.lookingAt(status) == FALSE) {
2897 errln("Line %d: unexpected characters at end of test line.", lineNum);
2898 continue;
2899 }
2900
2901 //
2902 // Run the test
2903 //
2904 regex_find(testPattern, testFlags, matchString, lineNum);
2905 }
2906
2907 delete [] testData;
2908
2909 }
2910
2911
2912
2913 //---------------------------------------------------------------------------
2914 //
2915 // regex_find(pattern, flags, inputString, lineNumber)
2916 //
2917 // Function to run a single test from the Extended (data driven) tests.
2918 // See file test/testdata/regextst.txt for a description of the
2919 // pattern and inputString fields, and the allowed flags.
2920 // lineNumber is the source line in regextst.txt of the test.
2921 //
2922 //---------------------------------------------------------------------------
2923
2924
2925 // Set a value into a UVector at position specified by a decimal number in
2926 // a UnicodeString. This is a utility function needed by the actual test function,
2927 // which follows.
set(UVector & vec,int32_t val,UnicodeString index)2928 static void set(UVector &vec, int32_t val, UnicodeString index) {
2929 UErrorCode status=U_ZERO_ERROR;
2930 int32_t idx = 0;
2931 for (int32_t i=0; i<index.length(); i++) {
2932 int32_t d=u_charDigitValue(index.charAt(i));
2933 if (d<0) {return;}
2934 idx = idx*10 + d;
2935 }
2936 while (vec.size()<idx+1) {vec.addElement(-1, status);}
2937 vec.setElementAt(val, idx);
2938 }
2939
regex_find(const UnicodeString & pattern,const UnicodeString & flags,const UnicodeString & inputString,int32_t line)2940 void RegexTest::regex_find(const UnicodeString &pattern,
2941 const UnicodeString &flags,
2942 const UnicodeString &inputString,
2943 int32_t line) {
2944 UnicodeString unEscapedInput;
2945 UnicodeString deTaggedInput;
2946
2947 int32_t patternUTF8Length, inputUTF8Length;
2948 char *patternChars = NULL, *inputChars = NULL;
2949 UText patternText = UTEXT_INITIALIZER;
2950 UText inputText = UTEXT_INITIALIZER;
2951 UConverter *UTF8Converter = NULL;
2952
2953 UErrorCode status = U_ZERO_ERROR;
2954 UParseError pe;
2955 RegexPattern *parsePat = NULL;
2956 RegexMatcher *parseMatcher = NULL;
2957 RegexPattern *callerPattern = NULL, *UTF8Pattern = NULL;
2958 RegexMatcher *matcher = NULL, *UTF8Matcher = NULL;
2959 UVector groupStarts(status);
2960 UVector groupEnds(status);
2961 UBool isMatch = FALSE, isUTF8Match = FALSE;
2962 UBool failed = FALSE;
2963 int32_t numFinds;
2964 int32_t i;
2965 UBool useMatchesFunc = FALSE;
2966 UBool useLookingAtFunc = FALSE;
2967 int32_t regionStart = -1;
2968 int32_t regionEnd = -1;
2969
2970 //
2971 // Compile the caller's pattern
2972 //
2973 uint32_t bflags = 0;
2974 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag
2975 bflags |= UREGEX_CASE_INSENSITIVE;
2976 }
2977 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag
2978 bflags |= UREGEX_COMMENTS;
2979 }
2980 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag
2981 bflags |= UREGEX_DOTALL;
2982 }
2983 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag
2984 bflags |= UREGEX_MULTILINE;
2985 }
2986
2987 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
2988 bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
2989 }
2990 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
2991 bflags |= UREGEX_UNIX_LINES;
2992 }
2993
2994
2995 callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
2996 if (status != U_ZERO_ERROR) {
2997 #if UCONFIG_NO_BREAK_ITERATION==1
2998 // 'v' test flag means that the test pattern should not compile if ICU was configured
2999 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3000 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3001 goto cleanupAndReturn;
3002 }
3003 #endif
3004 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3005 // Expected pattern compilation error.
3006 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3007 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3008 }
3009 goto cleanupAndReturn;
3010 } else {
3011 // Unexpected pattern compilation error.
3012 errln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3013 goto cleanupAndReturn;
3014 }
3015 }
3016
3017 UTF8Converter = ucnv_open("UTF8", &status);
3018 ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3019
3020 patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3021 status = U_ZERO_ERROR; // buffer overflow
3022 patternChars = new char[patternUTF8Length+1];
3023 pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3024 utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3025
3026 if (status == U_ZERO_ERROR) {
3027 UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3028
3029 if (status != U_ZERO_ERROR) {
3030 #if UCONFIG_NO_BREAK_ITERATION==1
3031 // 'v' test flag means that the test pattern should not compile if ICU was configured
3032 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3033 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3034 goto cleanupAndReturn;
3035 }
3036 #endif
3037 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3038 // Expected pattern compilation error.
3039 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3040 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3041 }
3042 goto cleanupAndReturn;
3043 } else {
3044 // Unexpected pattern compilation error.
3045 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3046 goto cleanupAndReturn;
3047 }
3048 }
3049 }
3050
3051 if (UTF8Pattern == NULL) {
3052 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3053 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for line %d", line);
3054 status = U_ZERO_ERROR;
3055 }
3056
3057 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag
3058 RegexPatternDump(callerPattern);
3059 }
3060
3061 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag
3062 errln("Expected, but did not get, a pattern compilation error.");
3063 goto cleanupAndReturn;
3064 }
3065
3066
3067 //
3068 // Number of times find() should be called on the test string, default to 1
3069 //
3070 numFinds = 1;
3071 for (i=2; i<=9; i++) {
3072 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
3073 if (numFinds != 1) {
3074 errln("Line %d: more than one digit flag. Scanning %d.", line, i);
3075 goto cleanupAndReturn;
3076 }
3077 numFinds = i;
3078 }
3079 }
3080
3081 // 'M' flag. Use matches() instead of find()
3082 if (flags.indexOf((UChar)0x4d) >= 0) {
3083 useMatchesFunc = TRUE;
3084 }
3085 if (flags.indexOf((UChar)0x4c) >= 0) {
3086 useLookingAtFunc = TRUE;
3087 }
3088
3089 //
3090 // Find the tags in the input data, remove them, and record the group boundary
3091 // positions.
3092 //
3093 parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3094 REGEX_CHECK_STATUS_L(line);
3095
3096 unEscapedInput = inputString.unescape();
3097 parseMatcher = parsePat->matcher(unEscapedInput, status);
3098 REGEX_CHECK_STATUS_L(line);
3099 while(parseMatcher->find()) {
3100 parseMatcher->appendReplacement(deTaggedInput, "", status);
3101 REGEX_CHECK_STATUS;
3102 UnicodeString groupNum = parseMatcher->group(2, status);
3103 if (groupNum == "r") {
3104 // <r> or </r>, a region specification within the string
3105 if (parseMatcher->group(1, status) == "/") {
3106 regionEnd = deTaggedInput.length();
3107 } else {
3108 regionStart = deTaggedInput.length();
3109 }
3110 } else {
3111 // <digits> or </digits>, a group match boundary tag.
3112 if (parseMatcher->group(1, status) == "/") {
3113 set(groupEnds, deTaggedInput.length(), groupNum);
3114 } else {
3115 set(groupStarts, deTaggedInput.length(), groupNum);
3116 }
3117 }
3118 }
3119 parseMatcher->appendTail(deTaggedInput);
3120 REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3121 if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3122 errln("mismatched <r> tags");
3123 failed = TRUE;
3124 goto cleanupAndReturn;
3125 }
3126
3127
3128 //
3129 // Configure the matcher according to the flags specified with this test.
3130 //
3131 matcher = callerPattern->matcher(deTaggedInput, status);
3132 REGEX_CHECK_STATUS_L(line);
3133 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
3134 matcher->setTrace(TRUE);
3135 }
3136
3137 if (UTF8Pattern != NULL) {
3138 inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3139 status = U_ZERO_ERROR; // buffer overflow
3140 inputChars = new char[inputUTF8Length+1];
3141 deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3142 utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3143
3144 if (status == U_ZERO_ERROR) {
3145 UTF8Matcher = UTF8Pattern->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status);
3146 REGEX_CHECK_STATUS_L(line);
3147 }
3148
3149 if (UTF8Matcher == NULL) {
3150 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3151 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for line %d", line);
3152 status = U_ZERO_ERROR;
3153 }
3154 }
3155
3156 if (regionStart>=0) {
3157 matcher->region(regionStart, regionEnd, status);
3158 REGEX_CHECK_STATUS_L(line);
3159 if (UTF8Matcher != NULL) {
3160 UTF8Matcher->region(regionStart, regionEnd, status);
3161 REGEX_CHECK_STATUS_L(line);
3162 }
3163 }
3164 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag
3165 matcher->useAnchoringBounds(FALSE);
3166 if (UTF8Matcher != NULL) {
3167 UTF8Matcher->useAnchoringBounds(FALSE);
3168 }
3169 }
3170 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag
3171 matcher->useTransparentBounds(TRUE);
3172 if (UTF8Matcher != NULL) {
3173 UTF8Matcher->useTransparentBounds(TRUE);
3174 }
3175 }
3176
3177
3178
3179 //
3180 // Do a find on the de-tagged input using the caller's pattern
3181 // TODO: error on count>1 and not find().
3182 // error on both matches() and lookingAt().
3183 //
3184 for (i=0; i<numFinds; i++) {
3185 if (useMatchesFunc) {
3186 isMatch = matcher->matches(status);
3187 if (UTF8Matcher != NULL) {
3188 isUTF8Match = UTF8Matcher->matches(status);
3189 }
3190 } else if (useLookingAtFunc) {
3191 isMatch = matcher->lookingAt(status);
3192 if (UTF8Matcher != NULL) {
3193 isUTF8Match = UTF8Matcher->lookingAt(status);
3194 }
3195 } else {
3196 isMatch = matcher->find();
3197 if (UTF8Matcher != NULL) {
3198 isUTF8Match = UTF8Matcher->find();
3199 }
3200 }
3201 }
3202 matcher->setTrace(FALSE);
3203
3204 //
3205 // Match up the groups from the find() with the groups from the tags
3206 //
3207
3208 // number of tags should match number of groups from find operation.
3209 // matcher->groupCount does not include group 0, the entire match, hence the +1.
3210 // G option in test means that capture group data is not available in the
3211 // expected results, so the check needs to be suppressed.
3212 if (isMatch == FALSE && groupStarts.size() != 0) {
3213 errln("Error at line %d: Match expected, but none found.", line);
3214 failed = TRUE;
3215 goto cleanupAndReturn;
3216 } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3217 errln("Error at line %d: Match expected, but none found. (UTF8)", line);
3218 failed = TRUE;
3219 goto cleanupAndReturn;
3220 }
3221
3222 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3223 // Only check for match / no match. Don't check capture groups.
3224 if (isMatch && groupStarts.size() == 0) {
3225 errln("Error at line %d: No match expected, but one found.", line);
3226 failed = TRUE;
3227 } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
3228 errln("Error at line %d: No match expected, but one found. (UTF8)", line);
3229 failed = TRUE;
3230 }
3231 goto cleanupAndReturn;
3232 }
3233
3234 REGEX_CHECK_STATUS_L(line);
3235 for (i=0; i<=matcher->groupCount(); i++) {
3236 int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3237 if (matcher->start(i, status) != expectedStart) {
3238 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
3239 line, i, expectedStart, matcher->start(i, status));
3240 failed = TRUE;
3241 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3242 } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStart) {
3243 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
3244 line, i, expectedStart, UTF8Matcher->start(i, status));
3245 failed = TRUE;
3246 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3247 }
3248
3249 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3250 if (matcher->end(i, status) != expectedEnd) {
3251 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
3252 line, i, expectedEnd, matcher->end(i, status));
3253 failed = TRUE;
3254 // Error on end position; keep going; real error is probably yet to come as group
3255 // end positions work from end of the input data towards the front.
3256 } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEnd) {
3257 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
3258 line, i, expectedEnd, UTF8Matcher->end(i, status));
3259 failed = TRUE;
3260 // Error on end position; keep going; real error is probably yet to come as group
3261 // end positions work from end of the input data towards the front.
3262 }
3263 }
3264 if ( matcher->groupCount()+1 < groupStarts.size()) {
3265 errln("Error at line %d: Expected %d capture groups, found %d.",
3266 line, groupStarts.size()-1, matcher->groupCount());
3267 failed = TRUE;
3268 }
3269 else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3270 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3271 line, groupStarts.size()-1, UTF8Matcher->groupCount());
3272 failed = TRUE;
3273 }
3274
3275 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3276 matcher->requireEnd() == TRUE) {
3277 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line);
3278 failed = TRUE;
3279 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3280 UTF8Matcher->requireEnd() == TRUE) {
3281 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line);
3282 failed = TRUE;
3283 }
3284
3285 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3286 matcher->requireEnd() == FALSE) {
3287 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line);
3288 failed = TRUE;
3289 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false
3290 UTF8Matcher->requireEnd() == FALSE) {
3291 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line);
3292 failed = TRUE;
3293 }
3294
3295 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3296 matcher->hitEnd() == TRUE) {
3297 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line);
3298 failed = TRUE;
3299 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3300 UTF8Matcher->hitEnd() == TRUE) {
3301 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line);
3302 failed = TRUE;
3303 }
3304
3305 if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3306 matcher->hitEnd() == FALSE) {
3307 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line);
3308 failed = TRUE;
3309 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3310 UTF8Matcher->hitEnd() == FALSE) {
3311 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line);
3312 failed = TRUE;
3313 }
3314
3315
3316 cleanupAndReturn:
3317 if (failed) {
3318 infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" "
3319 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");
3320 // callerPattern->dump();
3321 }
3322 delete parseMatcher;
3323 delete parsePat;
3324 delete UTF8Matcher;
3325 delete UTF8Pattern;
3326 delete matcher;
3327 delete callerPattern;
3328
3329 utext_close(&inputText);
3330 delete[] inputChars;
3331 utext_close(&patternText);
3332 delete[] patternChars;
3333 ucnv_close(UTF8Converter);
3334 }
3335
3336
3337
3338
3339 //---------------------------------------------------------------------------
3340 //
3341 // Errors Check for error handling in patterns.
3342 //
3343 //---------------------------------------------------------------------------
Errors()3344 void RegexTest::Errors() {
3345 // \escape sequences that aren't implemented yet.
3346 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3347
3348 // Missing close parentheses
3349 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3350 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3351 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3352
3353 // Extra close paren
3354 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3355 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3356 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3357
3358 // Look-ahead, Look-behind
3359 // TODO: add tests for unbounded length look-behinds.
3360 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
3361
3362 // Attempt to use non-default flags
3363 {
3364 UParseError pe;
3365 UErrorCode status = U_ZERO_ERROR;
3366 int32_t flags = UREGEX_CANON_EQ |
3367 UREGEX_COMMENTS | UREGEX_DOTALL |
3368 UREGEX_MULTILINE;
3369 RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3370 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3371 delete pat1;
3372 }
3373
3374
3375 // Quantifiers are allowed only after something that can be quantified.
3376 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3377 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3378 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3379
3380 // Mal-formed {min,max} quantifiers
3381 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3382 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3383 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3384 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3385 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3386 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3387 REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan
3388 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format
3389 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3390
3391 // Ticket 5389
3392 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3393
3394 // Invalid Back Reference \0
3395 // For ICU 3.8 and earlier
3396 // For ICU versions newer than 3.8, \0 introduces an octal escape.
3397 //
3398 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3399
3400 }
3401
3402
3403 //-------------------------------------------------------------------------------
3404 //
3405 // Read a text data file, convert it to UChars, and return the data
3406 // in one big UChar * buffer, which the caller must delete.
3407 //
3408 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int32_t & ulen,const char * defEncoding,UErrorCode & status)3409 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3410 const char *defEncoding, UErrorCode &status) {
3411 UChar *retPtr = NULL;
3412 char *fileBuf = NULL;
3413 UConverter* conv = NULL;
3414 FILE *f = NULL;
3415
3416 ulen = 0;
3417 if (U_FAILURE(status)) {
3418 return retPtr;
3419 }
3420
3421 //
3422 // Open the file.
3423 //
3424 f = fopen(fileName, "rb");
3425 if (f == 0) {
3426 dataerrln("Error opening test data file %s\n", fileName);
3427 status = U_FILE_ACCESS_ERROR;
3428 return NULL;
3429 }
3430 //
3431 // Read it in
3432 //
3433 int32_t fileSize;
3434 int32_t amt_read;
3435
3436 fseek( f, 0, SEEK_END);
3437 fileSize = ftell(f);
3438 fileBuf = new char[fileSize];
3439 fseek(f, 0, SEEK_SET);
3440 amt_read = fread(fileBuf, 1, fileSize, f);
3441 if (amt_read != fileSize || fileSize <= 0) {
3442 errln("Error reading test data file.");
3443 goto cleanUpAndReturn;
3444 }
3445
3446 //
3447 // Look for a Unicode Signature (BOM) on the data just read
3448 //
3449 int32_t signatureLength;
3450 const char * fileBufC;
3451 const char* encoding;
3452
3453 fileBufC = fileBuf;
3454 encoding = ucnv_detectUnicodeSignature(
3455 fileBuf, fileSize, &signatureLength, &status);
3456 if(encoding!=NULL ){
3457 fileBufC += signatureLength;
3458 fileSize -= signatureLength;
3459 } else {
3460 encoding = defEncoding;
3461 if (strcmp(encoding, "utf-8") == 0) {
3462 errln("file %s is missing its BOM", fileName);
3463 }
3464 }
3465
3466 //
3467 // Open a converter to take the rule file to UTF-16
3468 //
3469 conv = ucnv_open(encoding, &status);
3470 if (U_FAILURE(status)) {
3471 goto cleanUpAndReturn;
3472 }
3473
3474 //
3475 // Convert the rules to UChar.
3476 // Preflight first to determine required buffer size.
3477 //
3478 ulen = ucnv_toUChars(conv,
3479 NULL, // dest,
3480 0, // destCapacity,
3481 fileBufC,
3482 fileSize,
3483 &status);
3484 if (status == U_BUFFER_OVERFLOW_ERROR) {
3485 // Buffer Overflow is expected from the preflight operation.
3486 status = U_ZERO_ERROR;
3487
3488 retPtr = new UChar[ulen+1];
3489 ucnv_toUChars(conv,
3490 retPtr, // dest,
3491 ulen+1,
3492 fileBufC,
3493 fileSize,
3494 &status);
3495 }
3496
3497 cleanUpAndReturn:
3498 fclose(f);
3499 delete[] fileBuf;
3500 ucnv_close(conv);
3501 if (U_FAILURE(status)) {
3502 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3503 delete retPtr;
3504 retPtr = 0;
3505 ulen = 0;
3506 };
3507 return retPtr;
3508 }
3509
3510
3511 //-------------------------------------------------------------------------------
3512 //
3513 // PerlTests - Run Perl's regular expression tests
3514 // The input file for this test is re_tests, the standard regular
3515 // expression test data distributed with the Perl source code.
3516 //
3517 // Here is Perl's description of the test data file:
3518 //
3519 // # The tests are in a separate file 't/op/re_tests'.
3520 // # Each line in that file is a separate test.
3521 // # There are five columns, separated by tabs.
3522 // #
3523 // # Column 1 contains the pattern, optionally enclosed in C<''>.
3524 // # Modifiers can be put after the closing C<'>.
3525 // #
3526 // # Column 2 contains the string to be matched.
3527 // #
3528 // # Column 3 contains the expected result:
3529 // # y expect a match
3530 // # n expect no match
3531 // # c expect an error
3532 // # B test exposes a known bug in Perl, should be skipped
3533 // # b test exposes a known bug in Perl, should be skipped if noamp
3534 // #
3535 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3536 // #
3537 // # Column 4 contains a string, usually C<$&>.
3538 // #
3539 // # Column 5 contains the expected result of double-quote
3540 // # interpolating that string after the match, or start of error message.
3541 // #
3542 // # Column 6, if present, contains a reason why the test is skipped.
3543 // # This is printed with "skipped", for harness to pick up.
3544 // #
3545 // # \n in the tests are interpolated, as are variables of the form ${\w+}.
3546 // #
3547 // # If you want to add a regular expression test that can't be expressed
3548 // # in this format, don't add it here: put it in op/pat.t instead.
3549 //
3550 // For ICU, if field 3 contains an 'i', the test will be skipped.
//   The test exposes some known incompatibility between ICU and Perl regexps.
3552 // (The i is in addition to whatever was there before.)
3553 //
3554 //-------------------------------------------------------------------------------
PerlTests()3555 void RegexTest::PerlTests() {
3556 char tdd[2048];
3557 const char *srcPath;
3558 UErrorCode status = U_ZERO_ERROR;
3559 UParseError pe;
3560
3561 //
3562 // Open and read the test data file.
3563 //
3564 srcPath=getPath(tdd, "re_tests.txt");
3565 if(srcPath==NULL) {
3566 return; /* something went wrong, error already output */
3567 }
3568
3569 int32_t len;
3570 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3571 if (U_FAILURE(status)) {
3572 return; /* something went wrong, error already output */
3573 }
3574
3575 //
3576 // Put the test data into a UnicodeString
3577 //
3578 UnicodeString testDataString(FALSE, testData, len);
3579
3580 //
3581 // Regex to break the input file into lines, and strip the new lines.
3582 // One line per match, capture group one is the desired data.
3583 //
3584 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
3585 if (U_FAILURE(status)) {
3586 dataerrln("RegexPattern::compile() error");
3587 return;
3588 }
3589 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
3590
3591 //
3592 // Regex to split a test file line into fields.
3593 // There are six fields, separated by tabs.
3594 //
3595 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
3596
3597 //
3598 // Regex to identify test patterns with flag settings, and to separate them.
3599 // Test patterns with flags look like 'pattern'i
3600 // Test patterns without flags are not quoted: pattern
3601 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
3602 //
3603 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
3604 RegexMatcher* flagMat = flagPat->matcher(status);
3605
3606 //
3607 // The Perl tests reference several perl-isms, which are evaluated/substituted
3608 // in the test data. Not being perl, this must be done explicitly. Here
3609 // are string constants and REs for these constructs.
3610 //
3611 UnicodeString nulnulSrc("${nulnul}");
3612 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
3613 nulnul = nulnul.unescape();
3614
3615 UnicodeString ffffSrc("${ffff}");
3616 UnicodeString ffff("\\uffff", -1, US_INV);
3617 ffff = ffff.unescape();
3618
3619 // regexp for $-[0], $+[2], etc.
3620 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
3621 RegexMatcher *groupsMat = groupsPat->matcher(status);
3622
3623 // regexp for $0, $1, $2, etc.
3624 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
3625 RegexMatcher *cgMat = cgPat->matcher(status);
3626
3627
3628 //
3629 // Main Loop for the Perl Tests, runs once per line from the
3630 // test data file.
3631 //
3632 int32_t lineNum = 0;
3633 int32_t skippedUnimplementedCount = 0;
3634 while (lineMat->find()) {
3635 lineNum++;
3636
3637 //
3638 // Get a line, break it into its fields, do the Perl
3639 // variable substitutions.
3640 //
3641 UnicodeString line = lineMat->group(1, status);
3642 UnicodeString fields[7];
3643 fieldPat->split(line, fields, 7, status);
3644
3645 flagMat->reset(fields[0]);
3646 flagMat->matches(status);
3647 UnicodeString pattern = flagMat->group(2, status);
3648 pattern.findAndReplace("${bang}", "!");
3649 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
3650 pattern.findAndReplace(ffffSrc, ffff);
3651
3652 //
3653 // Identify patterns that include match flag settings,
3654 // split off the flags, remove the extra quotes.
3655 //
3656 UnicodeString flagStr = flagMat->group(3, status);
3657 if (U_FAILURE(status)) {
3658 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3659 return;
3660 }
3661 int32_t flags = 0;
3662 const UChar UChar_c = 0x63; // Char constants for the flag letters.
3663 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
3664 const UChar UChar_m = 0x6d;
3665 const UChar UChar_x = 0x78;
3666 const UChar UChar_y = 0x79;
3667 if (flagStr.indexOf(UChar_i) != -1) {
3668 flags |= UREGEX_CASE_INSENSITIVE;
3669 }
3670 if (flagStr.indexOf(UChar_m) != -1) {
3671 flags |= UREGEX_MULTILINE;
3672 }
3673 if (flagStr.indexOf(UChar_x) != -1) {
3674 flags |= UREGEX_COMMENTS;
3675 }
3676
3677 //
3678 // Compile the test pattern.
3679 //
3680 status = U_ZERO_ERROR;
3681 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
3682 if (status == U_REGEX_UNIMPLEMENTED) {
3683 //
3684 // Test of a feature that is planned for ICU, but not yet implemented.
3685 // skip the test.
3686 skippedUnimplementedCount++;
3687 delete testPat;
3688 status = U_ZERO_ERROR;
3689 continue;
3690 }
3691
3692 if (U_FAILURE(status)) {
3693 // Some tests are supposed to generate errors.
3694 // Only report an error for tests that are supposed to succeed.
3695 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
3696 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
3697 {
3698 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
3699 }
3700 status = U_ZERO_ERROR;
3701 delete testPat;
3702 continue;
3703 }
3704
3705 if (fields[2].indexOf(UChar_i) >= 0) {
3706 // ICU should skip this test.
3707 delete testPat;
3708 continue;
3709 }
3710
3711 if (fields[2].indexOf(UChar_c) >= 0) {
3712 // This pattern should have caused a compilation error, but didn't/
3713 errln("line %d: Expected a pattern compile error, got success.", lineNum);
3714 delete testPat;
3715 continue;
3716 }
3717
3718 //
3719 // replace the Perl variables that appear in some of the
3720 // match data strings.
3721 //
3722 UnicodeString matchString = fields[1];
3723 matchString.findAndReplace(nulnulSrc, nulnul);
3724 matchString.findAndReplace(ffffSrc, ffff);
3725
3726 // Replace any \n in the match string with an actual new-line char.
3727 // Don't do full unescape, as this unescapes more than Perl does, which
3728 // causes other spurious failures in the tests.
3729 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
3730
3731
3732
3733 //
3734 // Run the test, check for expected match/don't match result.
3735 //
3736 RegexMatcher *testMat = testPat->matcher(matchString, status);
3737 UBool found = testMat->find();
3738 UBool expected = FALSE;
3739 if (fields[2].indexOf(UChar_y) >=0) {
3740 expected = TRUE;
3741 }
3742 if (expected != found) {
3743 errln("line %d: Expected %smatch, got %smatch",
3744 lineNum, expected?"":"no ", found?"":"no " );
3745 continue;
3746 }
3747
3748 // Don't try to check expected results if there is no match.
3749 // (Some have stuff in the expected fields)
3750 if (!found) {
3751 delete testMat;
3752 delete testPat;
3753 continue;
3754 }
3755
3756 //
3757 // Interpret the Perl expression from the fourth field of the data file,
3758 // building up an ICU string from the results of the ICU match.
3759 // The Perl expression will contain references to the results of
3760 // a regex match, including the matched string, capture group strings,
3761 // group starting and ending indicies, etc.
3762 //
3763 UnicodeString resultString;
3764 UnicodeString perlExpr = fields[3];
3765 #if SUPPORT_MUTATING_INPUT_STRING
3766 groupsMat->reset(perlExpr);
3767 cgMat->reset(perlExpr);
3768 #endif
3769
3770 while (perlExpr.length() > 0) {
3771 #if !SUPPORT_MUTATING_INPUT_STRING
3772 // Perferred usage. Reset after any modification to input string.
3773 groupsMat->reset(perlExpr);
3774 cgMat->reset(perlExpr);
3775 #endif
3776
3777 if (perlExpr.startsWith("$&")) {
3778 resultString.append(testMat->group(status));
3779 perlExpr.remove(0, 2);
3780 }
3781
3782 else if (groupsMat->lookingAt(status)) {
3783 // $-[0] $+[2] etc.
3784 UnicodeString digitString = groupsMat->group(2, status);
3785 int32_t t = 0;
3786 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
3787 UnicodeString plusOrMinus = groupsMat->group(1, status);
3788 int32_t matchPosition;
3789 if (plusOrMinus.compare("+") == 0) {
3790 matchPosition = testMat->end(groupNum, status);
3791 } else {
3792 matchPosition = testMat->start(groupNum, status);
3793 }
3794 if (matchPosition != -1) {
3795 ICU_Utility::appendNumber(resultString, matchPosition);
3796 }
3797 perlExpr.remove(0, groupsMat->end(status));
3798 }
3799
3800 else if (cgMat->lookingAt(status)) {
3801 // $1, $2, $3, etc.
3802 UnicodeString digitString = cgMat->group(1, status);
3803 int32_t t = 0;
3804 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
3805 if (U_SUCCESS(status)) {
3806 resultString.append(testMat->group(groupNum, status));
3807 status = U_ZERO_ERROR;
3808 }
3809 perlExpr.remove(0, cgMat->end(status));
3810 }
3811
3812 else if (perlExpr.startsWith("@-")) {
3813 int32_t i;
3814 for (i=0; i<=testMat->groupCount(); i++) {
3815 if (i>0) {
3816 resultString.append(" ");
3817 }
3818 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
3819 }
3820 perlExpr.remove(0, 2);
3821 }
3822
3823 else if (perlExpr.startsWith("@+")) {
3824 int32_t i;
3825 for (i=0; i<=testMat->groupCount(); i++) {
3826 if (i>0) {
3827 resultString.append(" ");
3828 }
3829 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
3830 }
3831 perlExpr.remove(0, 2);
3832 }
3833
3834 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
3835 // or as an escaped sequence (e.g. \n)
3836 if (perlExpr.length() > 1) {
3837 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
3838 }
3839 UChar c = perlExpr.charAt(0);
3840 switch (c) {
3841 case 'n': c = '\n'; break;
3842 // add any other escape sequences that show up in the test expected results.
3843 }
3844 resultString.append(c);
3845 perlExpr.remove(0, 1);
3846 }
3847
3848 else {
3849 // Any characters from the perl expression that we don't explicitly
3850 // recognize before here are assumed to be literals and copied
3851 // as-is to the expected results.
3852 resultString.append(perlExpr.charAt(0));
3853 perlExpr.remove(0, 1);
3854 }
3855
3856 if (U_FAILURE(status)) {
3857 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
3858 break;
3859 }
3860 }
3861
3862 //
3863 // Expected Results Compare
3864 //
3865 UnicodeString expectedS(fields[4]);
3866 expectedS.findAndReplace(nulnulSrc, nulnul);
3867 expectedS.findAndReplace(ffffSrc, ffff);
3868 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
3869
3870
3871 if (expectedS.compare(resultString) != 0) {
3872 err("Line %d: Incorrect perl expression results.", lineNum);
3873 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
3874 }
3875
3876 delete testMat;
3877 delete testPat;
3878 }
3879
3880 //
3881 // All done. Clean up allocated stuff.
3882 //
3883 delete cgMat;
3884 delete cgPat;
3885
3886 delete groupsMat;
3887 delete groupsPat;
3888
3889 delete flagMat;
3890 delete flagPat;
3891
3892 delete lineMat;
3893 delete linePat;
3894
3895 delete fieldPat;
3896 delete [] testData;
3897
3898
3899 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
3900
3901 }
3902
3903
3904 //-------------------------------------------------------------------------------
3905 //
3906 // PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
3907 // (instead of using UnicodeStrings) to test the alternate engine.
3908 // The input file for this test is re_tests, the standard regular
3909 // expression test data distributed with the Perl source code.
3910 // See PerlTests() for more information.
3911 //
3912 //-------------------------------------------------------------------------------
PerlTestsUTF8()3913 void RegexTest::PerlTestsUTF8() {
3914 char tdd[2048];
3915 const char *srcPath;
3916 UErrorCode status = U_ZERO_ERROR;
3917 UParseError pe;
3918 LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
3919 UText patternText = UTEXT_INITIALIZER;
3920 char *patternChars = NULL;
3921 int32_t patternLength;
3922 int32_t patternCapacity = 0;
3923 UText inputText = UTEXT_INITIALIZER;
3924 char *inputChars = NULL;
3925 int32_t inputLength;
3926 int32_t inputCapacity = 0;
3927
3928 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3929
3930 //
3931 // Open and read the test data file.
3932 //
3933 srcPath=getPath(tdd, "re_tests.txt");
3934 if(srcPath==NULL) {
3935 return; /* something went wrong, error already output */
3936 }
3937
3938 int32_t len;
3939 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3940 if (U_FAILURE(status)) {
3941 return; /* something went wrong, error already output */
3942 }
3943
3944 //
3945 // Put the test data into a UnicodeString
3946 //
3947 UnicodeString testDataString(FALSE, testData, len);
3948
3949 //
3950 // Regex to break the input file into lines, and strip the new lines.
3951 // One line per match, capture group one is the desired data.
3952 //
3953 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
3954 if (U_FAILURE(status)) {
3955 dataerrln("RegexPattern::compile() error");
3956 return;
3957 }
3958 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
3959
3960 //
3961 // Regex to split a test file line into fields.
3962 // There are six fields, separated by tabs.
3963 //
3964 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
3965
3966 //
3967 // Regex to identify test patterns with flag settings, and to separate them.
3968 // Test patterns with flags look like 'pattern'i
3969 // Test patterns without flags are not quoted: pattern
3970 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
3971 //
3972 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
3973 RegexMatcher* flagMat = flagPat->matcher(status);
3974
3975 //
3976 // The Perl tests reference several perl-isms, which are evaluated/substituted
3977 // in the test data. Not being perl, this must be done explicitly. Here
3978 // are string constants and REs for these constructs.
3979 //
3980 UnicodeString nulnulSrc("${nulnul}");
3981 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
3982 nulnul = nulnul.unescape();
3983
3984 UnicodeString ffffSrc("${ffff}");
3985 UnicodeString ffff("\\uffff", -1, US_INV);
3986 ffff = ffff.unescape();
3987
3988 // regexp for $-[0], $+[2], etc.
3989 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
3990 RegexMatcher *groupsMat = groupsPat->matcher(status);
3991
3992 // regexp for $0, $1, $2, etc.
3993 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
3994 RegexMatcher *cgMat = cgPat->matcher(status);
3995
3996
3997 //
3998 // Main Loop for the Perl Tests, runs once per line from the
3999 // test data file.
4000 //
4001 int32_t lineNum = 0;
4002 int32_t skippedUnimplementedCount = 0;
4003 while (lineMat->find()) {
4004 lineNum++;
4005
4006 //
4007 // Get a line, break it into its fields, do the Perl
4008 // variable substitutions.
4009 //
4010 UnicodeString line = lineMat->group(1, status);
4011 UnicodeString fields[7];
4012 fieldPat->split(line, fields, 7, status);
4013
4014 flagMat->reset(fields[0]);
4015 flagMat->matches(status);
4016 UnicodeString pattern = flagMat->group(2, status);
4017 pattern.findAndReplace("${bang}", "!");
4018 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4019 pattern.findAndReplace(ffffSrc, ffff);
4020
4021 //
4022 // Identify patterns that include match flag settings,
4023 // split off the flags, remove the extra quotes.
4024 //
4025 UnicodeString flagStr = flagMat->group(3, status);
4026 if (U_FAILURE(status)) {
4027 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4028 return;
4029 }
4030 int32_t flags = 0;
4031 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4032 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4033 const UChar UChar_m = 0x6d;
4034 const UChar UChar_x = 0x78;
4035 const UChar UChar_y = 0x79;
4036 if (flagStr.indexOf(UChar_i) != -1) {
4037 flags |= UREGEX_CASE_INSENSITIVE;
4038 }
4039 if (flagStr.indexOf(UChar_m) != -1) {
4040 flags |= UREGEX_MULTILINE;
4041 }
4042 if (flagStr.indexOf(UChar_x) != -1) {
4043 flags |= UREGEX_COMMENTS;
4044 }
4045
4046 //
4047 // Put the pattern in a UTF-8 UText
4048 //
4049 status = U_ZERO_ERROR;
4050 patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4051 if (status == U_BUFFER_OVERFLOW_ERROR) {
4052 status = U_ZERO_ERROR;
4053 delete[] patternChars;
4054 patternCapacity = patternLength + 1;
4055 patternChars = new char[patternCapacity];
4056 pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4057 }
4058 utext_openUTF8(&patternText, patternChars, patternLength, &status);
4059
4060 //
4061 // Compile the test pattern.
4062 //
4063 RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4064 if (status == U_REGEX_UNIMPLEMENTED) {
4065 //
4066 // Test of a feature that is planned for ICU, but not yet implemented.
4067 // skip the test.
4068 skippedUnimplementedCount++;
4069 delete testPat;
4070 status = U_ZERO_ERROR;
4071 continue;
4072 }
4073
4074 if (U_FAILURE(status)) {
4075 // Some tests are supposed to generate errors.
4076 // Only report an error for tests that are supposed to succeed.
4077 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4078 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4079 {
4080 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4081 }
4082 status = U_ZERO_ERROR;
4083 delete testPat;
4084 continue;
4085 }
4086
4087 if (fields[2].indexOf(UChar_i) >= 0) {
4088 // ICU should skip this test.
4089 delete testPat;
4090 continue;
4091 }
4092
4093 if (fields[2].indexOf(UChar_c) >= 0) {
4094 // This pattern should have caused a compilation error, but didn't/
4095 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4096 delete testPat;
4097 continue;
4098 }
4099
4100
4101 //
4102 // replace the Perl variables that appear in some of the
4103 // match data strings.
4104 //
4105 UnicodeString matchString = fields[1];
4106 matchString.findAndReplace(nulnulSrc, nulnul);
4107 matchString.findAndReplace(ffffSrc, ffff);
4108
4109 // Replace any \n in the match string with an actual new-line char.
4110 // Don't do full unescape, as this unescapes more than Perl does, which
4111 // causes other spurious failures in the tests.
4112 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4113
4114 //
4115 // Put the input in a UTF-8 UText
4116 //
4117 status = U_ZERO_ERROR;
4118 inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4119 if (status == U_BUFFER_OVERFLOW_ERROR) {
4120 status = U_ZERO_ERROR;
4121 delete[] inputChars;
4122 inputCapacity = inputLength + 1;
4123 inputChars = new char[inputCapacity];
4124 matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4125 }
4126 utext_openUTF8(&inputText, inputChars, inputLength, &status);
4127
4128 //
4129 // Run the test, check for expected match/don't match result.
4130 //
4131 RegexMatcher *testMat = testPat->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status);
4132 UBool found = testMat->find();
4133 UBool expected = FALSE;
4134 if (fields[2].indexOf(UChar_y) >=0) {
4135 expected = TRUE;
4136 }
4137 if (expected != found) {
4138 errln("line %d: Expected %smatch, got %smatch",
4139 lineNum, expected?"":"no ", found?"":"no " );
4140 continue;
4141 }
4142
4143 // Don't try to check expected results if there is no match.
4144 // (Some have stuff in the expected fields)
4145 if (!found) {
4146 delete testMat;
4147 delete testPat;
4148 continue;
4149 }
4150
4151 //
4152 // Interpret the Perl expression from the fourth field of the data file,
4153 // building up an ICU string from the results of the ICU match.
4154 // The Perl expression will contain references to the results of
4155 // a regex match, including the matched string, capture group strings,
4156 // group starting and ending indicies, etc.
4157 //
4158 UnicodeString resultString;
4159 UnicodeString perlExpr = fields[3];
4160
4161 while (perlExpr.length() > 0) {
4162 groupsMat->reset(perlExpr);
4163 cgMat->reset(perlExpr);
4164
4165 if (perlExpr.startsWith("$&")) {
4166 resultString.append(testMat->group(status));
4167 perlExpr.remove(0, 2);
4168 }
4169
4170 else if (groupsMat->lookingAt(status)) {
4171 // $-[0] $+[2] etc.
4172 UnicodeString digitString = groupsMat->group(2, status);
4173 int32_t t = 0;
4174 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4175 UnicodeString plusOrMinus = groupsMat->group(1, status);
4176 int32_t matchPosition;
4177 if (plusOrMinus.compare("+") == 0) {
4178 matchPosition = testMat->end(groupNum, status);
4179 } else {
4180 matchPosition = testMat->start(groupNum, status);
4181 }
4182 if (matchPosition != -1) {
4183 ICU_Utility::appendNumber(resultString, matchPosition);
4184 }
4185 perlExpr.remove(0, groupsMat->end(status));
4186 }
4187
4188 else if (cgMat->lookingAt(status)) {
4189 // $1, $2, $3, etc.
4190 UnicodeString digitString = cgMat->group(1, status);
4191 int32_t t = 0;
4192 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4193 if (U_SUCCESS(status)) {
4194 resultString.append(testMat->group(groupNum, status));
4195 status = U_ZERO_ERROR;
4196 }
4197 perlExpr.remove(0, cgMat->end(status));
4198 }
4199
4200 else if (perlExpr.startsWith("@-")) {
4201 int32_t i;
4202 for (i=0; i<=testMat->groupCount(); i++) {
4203 if (i>0) {
4204 resultString.append(" ");
4205 }
4206 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4207 }
4208 perlExpr.remove(0, 2);
4209 }
4210
4211 else if (perlExpr.startsWith("@+")) {
4212 int32_t i;
4213 for (i=0; i<=testMat->groupCount(); i++) {
4214 if (i>0) {
4215 resultString.append(" ");
4216 }
4217 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4218 }
4219 perlExpr.remove(0, 2);
4220 }
4221
4222 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4223 // or as an escaped sequence (e.g. \n)
4224 if (perlExpr.length() > 1) {
4225 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4226 }
4227 UChar c = perlExpr.charAt(0);
4228 switch (c) {
4229 case 'n': c = '\n'; break;
4230 // add any other escape sequences that show up in the test expected results.
4231 }
4232 resultString.append(c);
4233 perlExpr.remove(0, 1);
4234 }
4235
4236 else {
4237 // Any characters from the perl expression that we don't explicitly
4238 // recognize before here are assumed to be literals and copied
4239 // as-is to the expected results.
4240 resultString.append(perlExpr.charAt(0));
4241 perlExpr.remove(0, 1);
4242 }
4243
4244 if (U_FAILURE(status)) {
4245 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4246 break;
4247 }
4248 }
4249
4250 //
4251 // Expected Results Compare
4252 //
4253 UnicodeString expectedS(fields[4]);
4254 expectedS.findAndReplace(nulnulSrc, nulnul);
4255 expectedS.findAndReplace(ffffSrc, ffff);
4256 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4257
4258
4259 if (expectedS.compare(resultString) != 0) {
4260 err("Line %d: Incorrect perl expression results.", lineNum);
4261 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4262 }
4263
4264 delete testMat;
4265 delete testPat;
4266 }
4267
4268 //
4269 // All done. Clean up allocated stuff.
4270 //
4271 delete cgMat;
4272 delete cgPat;
4273
4274 delete groupsMat;
4275 delete groupsPat;
4276
4277 delete flagMat;
4278 delete flagPat;
4279
4280 delete lineMat;
4281 delete linePat;
4282
4283 delete fieldPat;
4284 delete [] testData;
4285
4286 utext_close(&patternText);
4287 utext_close(&inputText);
4288
4289 delete [] patternChars;
4290 delete [] inputChars;
4291
4292
4293 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4294
4295 }
4296
4297
4298 //--------------------------------------------------------------
4299 //
4300 // Bug6149 Verify limits to heap expansion for backtrack stack.
4301 // Use this pattern,
4302 // "(a?){1,}"
4303 // The zero-length match will repeat forever.
4304 // (That this goes into a loop is another bug)
4305 //
4306 //---------------------------------------------------------------
Bug6149()4307 void RegexTest::Bug6149() {
4308 UnicodeString pattern("(a?){1,}");
4309 UnicodeString s("xyz");
4310 uint32_t flags = 0;
4311 UErrorCode status = U_ZERO_ERROR;
4312
4313 RegexMatcher matcher(pattern, s, flags, status);
4314 UBool result = false;
4315 REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4316 REGEX_ASSERT(result == FALSE);
4317 }
4318
4319
4320 //
4321 // Callbacks() Test the callback function.
4322 // When set, callbacks occur periodically during matching operations,
4323 // giving the application code the ability to abort the operation
//            before its normal completion.
4325 //
4326
4327 struct callBackContext {
4328 RegexTest *test;
4329 int32_t maxCalls;
4330 int32_t numCalls;
4331 int32_t lastSteps;
resetcallBackContext4332 void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4333 };
4334
4335 U_CDECL_BEGIN
4336 static UBool U_CALLCONV
testCallBackFn(const void * context,int32_t steps)4337 testCallBackFn(const void *context, int32_t steps) {
4338 callBackContext *info = (callBackContext *)context;
4339 if (info->lastSteps+1 != steps) {
4340 info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps);
4341 }
4342 info->lastSteps = steps;
4343 info->numCalls++;
4344 return (info->numCalls < info->maxCalls);
4345 }
4346 U_CDECL_END
4347
Callbacks()4348 void RegexTest::Callbacks() {
4349 {
4350 // Getter returns NULLs if no callback has been set
4351
4352 // The variables that the getter will fill in.
4353 // Init to non-null values so that the action of the getter can be seen.
4354 const void *returnedContext = &returnedContext;
4355 URegexMatchCallback *returnedFn = &testCallBackFn;
4356
4357 UErrorCode status = U_ZERO_ERROR;
4358 RegexMatcher matcher("x", 0, status);
4359 REGEX_CHECK_STATUS;
4360 matcher.getMatchCallback(returnedFn, returnedContext, status);
4361 REGEX_CHECK_STATUS;
4362 REGEX_ASSERT(returnedFn == NULL);
4363 REGEX_ASSERT(returnedContext == NULL);
4364 }
4365
4366 {
4367 // Set and Get work
4368 callBackContext cbInfo = {this, 0, 0, 0};
4369 const void *returnedContext;
4370 URegexMatchCallback *returnedFn;
4371 UErrorCode status = U_ZERO_ERROR;
4372 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.
4373 REGEX_CHECK_STATUS;
4374 matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4375 REGEX_CHECK_STATUS;
4376 matcher.getMatchCallback(returnedFn, returnedContext, status);
4377 REGEX_CHECK_STATUS;
4378 REGEX_ASSERT(returnedFn == testCallBackFn);
4379 REGEX_ASSERT(returnedContext == &cbInfo);
4380
4381 // A short-running match shouldn't invoke the callback
4382 status = U_ZERO_ERROR;
4383 cbInfo.reset(1);
4384 UnicodeString s = "xxx";
4385 matcher.reset(s);
4386 REGEX_ASSERT(matcher.matches(status));
4387 REGEX_CHECK_STATUS;
4388 REGEX_ASSERT(cbInfo.numCalls == 0);
4389
4390 // A medium-length match that runs long enough to invoke the
4391 // callback, but not so long that the callback aborts it.
4392 status = U_ZERO_ERROR;
4393 cbInfo.reset(4);
4394 s = "aaaaaaaaaaaaaaaaaaab";
4395 matcher.reset(s);
4396 REGEX_ASSERT(matcher.matches(status)==FALSE);
4397 REGEX_CHECK_STATUS;
4398 REGEX_ASSERT(cbInfo.numCalls > 0);
4399
4400 // A longer running match that the callback function will abort.
4401 status = U_ZERO_ERROR;
4402 cbInfo.reset(4);
4403 s = "aaaaaaaaaaaaaaaaaaaaaaab";
4404 matcher.reset(s);
4405 REGEX_ASSERT(matcher.matches(status)==FALSE);
4406 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4407 REGEX_ASSERT(cbInfo.numCalls == 4);
4408 }
4409
4410
4411 }
4412
4413
4414 //---------------------------------------------------------------------------
4415 //
4416 // PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
4417 // UTexts. The pure-C implementation of UText
4418 // has no mutable backing stores, but we can
4419 // use UnicodeString here to test the functionality.
4420 //
4421 //---------------------------------------------------------------------------
PreAllocatedUTextCAPI()4422 void RegexTest::PreAllocatedUTextCAPI () {
4423 UErrorCode status = U_ZERO_ERROR;
4424 URegularExpression *re;
4425 UText patternText = UTEXT_INITIALIZER;
4426 UnicodeString buffer;
4427 UText bufferText = UTEXT_INITIALIZER;
4428
4429 utext_openUnicodeString(&bufferText, &buffer, &status);
4430
4431 /*
4432 * getText() and getUText()
4433 */
4434 {
4435 UText text1 = UTEXT_INITIALIZER;
4436 UText text2 = UTEXT_INITIALIZER;
4437 UChar text2Chars[20];
4438 UText *resultText;
4439
4440 status = U_ZERO_ERROR;
4441 utext_openUTF8(&text1, "abcccd", -1, &status);
4442 utext_openUTF8(&text2, "abcccxd", -1, &status);
4443 u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
4444 utext_openUChars(&text2, text2Chars, -1, &status);
4445
4446 utext_openUTF8(&patternText, "abc*d", -1, &status);
4447 re = uregex_openUText(&patternText, 0, NULL, &status);
4448
4449 /* First set a UText */
4450 uregex_setUText(re, &text1, &status);
4451 resultText = uregex_getUText(re, &bufferText, &status);
4452 REGEX_CHECK_STATUS;
4453 REGEX_ASSERT(resultText == &bufferText);
4454 utext_setNativeIndex(resultText, 0);
4455 utext_setNativeIndex(&text1, 0);
4456 REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
4457
4458 resultText = uregex_getUText(re, &bufferText, &status);
4459 REGEX_CHECK_STATUS;
4460 REGEX_ASSERT(resultText == &bufferText);
4461 utext_setNativeIndex(resultText, 0);
4462 utext_setNativeIndex(&text1, 0);
4463 REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
4464
4465 /* Then set a UChar * */
4466 uregex_setText(re, text2Chars, 7, &status);
4467 resultText = uregex_getUText(re, &bufferText, &status);
4468 REGEX_CHECK_STATUS;
4469 REGEX_ASSERT(resultText == &bufferText);
4470 utext_setNativeIndex(resultText, 0);
4471 utext_setNativeIndex(&text2, 0);
4472 REGEX_ASSERT(utext_compare(resultText, -1, &text2, -1) == 0);
4473
4474 uregex_close(re);
4475 utext_close(&text1);
4476 utext_close(&text2);
4477 }
4478
4479 /*
4480 * group()
4481 */
4482 {
4483 UChar text1[80];
4484 UText *actual;
4485 UBool result;
4486 u_uastrncpy(text1, "noise abc interior def, and this is off the end", sizeof(text1)/2);
4487
4488 status = U_ZERO_ERROR;
4489 re = uregex_openC("abc(.*?)def", 0, NULL, &status);
4490 REGEX_CHECK_STATUS;
4491
4492 uregex_setText(re, text1, -1, &status);
4493 result = uregex_find(re, 0, &status);
4494 REGEX_ASSERT(result==TRUE);
4495
4496 /* Capture Group 0, the full match. Should succeed. */
4497 status = U_ZERO_ERROR;
4498 actual = uregex_groupUText(re, 0, &bufferText, &status);
4499 REGEX_CHECK_STATUS;
4500 REGEX_ASSERT(actual == &bufferText);
4501 REGEX_ASSERT_UTEXT("abc interior def", actual);
4502
4503 /* Capture group #1. Should succeed. */
4504 status = U_ZERO_ERROR;
4505 actual = uregex_groupUText(re, 1, &bufferText, &status);
4506 REGEX_CHECK_STATUS;
4507 REGEX_ASSERT(actual == &bufferText);
4508 REGEX_ASSERT_UTEXT(" interior ", actual);
4509
4510 /* Capture group out of range. Error. */
4511 status = U_ZERO_ERROR;
4512 actual = uregex_groupUText(re, 2, &bufferText, &status);
4513 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
4514 REGEX_ASSERT(actual == &bufferText);
4515
4516 uregex_close(re);
4517
4518 }
4519
4520 /*
4521 * replaceFirst()
4522 */
4523 {
4524 UChar text1[80];
4525 UChar text2[80];
4526 UText replText = UTEXT_INITIALIZER;
4527 UText *result;
4528
4529 status = U_ZERO_ERROR;
4530 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
4531 u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
4532 utext_openUTF8(&replText, "<$1>", -1, &status);
4533
4534 re = uregex_openC("x(.*?)x", 0, NULL, &status);
4535 REGEX_CHECK_STATUS;
4536
4537 /* Normal case, with match */
4538 uregex_setText(re, text1, -1, &status);
4539 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4540 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
4541 REGEX_CHECK_STATUS;
4542 REGEX_ASSERT(result == &bufferText);
4543 REGEX_ASSERT_UTEXT("Replace <aa> x1x x...x.", result);
4544
4545 /* No match. Text should copy to output with no changes. */
4546 uregex_setText(re, text2, -1, &status);
4547 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4548 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
4549 REGEX_CHECK_STATUS;
4550 REGEX_ASSERT(result == &bufferText);
4551 REGEX_ASSERT_UTEXT("No match here.", result);
4552
4553 /* Unicode escapes */
4554 uregex_setText(re, text1, -1, &status);
4555 utext_openUTF8(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status);
4556 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4557 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
4558 REGEX_CHECK_STATUS;
4559 REGEX_ASSERT(result == &bufferText);
4560 REGEX_ASSERT_UTEXT("Replace \\AaaB$a x1x x...x.", result);
4561
4562 uregex_close(re);
4563 utext_close(&replText);
4564 }
4565
4566
4567 /*
4568 * replaceAll()
4569 */
4570 {
4571 UChar text1[80];
4572 UChar text2[80];
4573 UText replText = UTEXT_INITIALIZER;
4574 UText *result;
4575
4576 status = U_ZERO_ERROR;
4577 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
4578 u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
4579 utext_openUTF8(&replText, "<$1>", -1, &status);
4580
4581 re = uregex_openC("x(.*?)x", 0, NULL, &status);
4582 REGEX_CHECK_STATUS;
4583
4584 /* Normal case, with match */
4585 uregex_setText(re, text1, -1, &status);
4586 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4587 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
4588 REGEX_CHECK_STATUS;
4589 REGEX_ASSERT(result == &bufferText);
4590 REGEX_ASSERT_UTEXT("Replace <aa> <1> <...>.", result);
4591
4592 /* No match. Text should copy to output with no changes. */
4593 uregex_setText(re, text2, -1, &status);
4594 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4595 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
4596 REGEX_CHECK_STATUS;
4597 REGEX_ASSERT(result == &bufferText);
4598 REGEX_ASSERT_UTEXT("No match here.", result);
4599
4600 uregex_close(re);
4601 utext_close(&replText);
4602 }
4603
4604
4605 /*
4606 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
4607 * so we don't need to test it here.
4608 */
4609
4610 utext_close(&bufferText);
4611 utext_close(&patternText);
4612 }
4613
4614 //--------------------------------------------------------------
4615 //
4616 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
4617 //
4618 //---------------------------------------------------------------
Bug7651()4619 void RegexTest::Bug7651() {
4620 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
4621 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
4622 UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
4623 UnicodeString s("#ff @abcd This is test");
4624 RegexPattern *REPattern = NULL;
4625 RegexMatcher *REMatcher = NULL;
4626 UErrorCode status = U_ZERO_ERROR;
4627 UParseError pe;
4628
4629 REPattern = RegexPattern::compile(pattern1, 0, pe, status);
4630 REGEX_CHECK_STATUS;
4631 REMatcher = REPattern->matcher(s, status);
4632 REGEX_CHECK_STATUS;
4633 REGEX_ASSERT(REMatcher->find());
4634 REGEX_ASSERT(REMatcher->start(status) == 0);
4635 delete REPattern;
4636 delete REMatcher;
4637 status = U_ZERO_ERROR;
4638
4639 REPattern = RegexPattern::compile(pattern2, 0, pe, status);
4640 REGEX_CHECK_STATUS;
4641 REMatcher = REPattern->matcher(s, status);
4642 REGEX_CHECK_STATUS;
4643 REGEX_ASSERT(REMatcher->find());
4644 REGEX_ASSERT(REMatcher->start(status) == 0);
4645 delete REPattern;
4646 delete REMatcher;
4647 status = U_ZERO_ERROR;
4648 }
4649
4650 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
4651
4652