1 /********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 2002-2007, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6
7 //
8 // regextst.cpp
9 //
10 // ICU Regular Expressions test, part of intltest.
11 //
12
13 #include "intltest.h"
14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
15
16 #include "unicode/regex.h"
17 #include "unicode/uchar.h"
18 #include "unicode/ucnv.h"
19 #include "regextst.h"
20 #include "uvector.h"
21 #include "util.h"
22 #include <stdlib.h>
23 #include <string.h>
24 #include <stdio.h>
25
26
27 //---------------------------------------------------------------------------
28 //
29 // Test class boilerplate
30 //
31 //---------------------------------------------------------------------------
RegexTest()32 RegexTest::RegexTest()
33 {
34 }
35
36
~RegexTest()37 RegexTest::~RegexTest()
38 {
39 }
40
41
42
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)43 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
44 {
45 if (exec) logln("TestSuite RegexTest: ");
46 switch (index) {
47
48 case 0: name = "Basic";
49 if (exec) Basic();
50 break;
51 case 1: name = "API_Match";
52 if (exec) API_Match();
53 break;
54 case 2: name = "API_Replace";
55 if (exec) API_Replace();
56 break;
57 case 3: name = "API_Pattern";
58 if (exec) API_Pattern();
59 break;
60 case 4: name = "Extended";
61 if (exec) Extended();
62 break;
63 case 5: name = "Errors";
64 if (exec) Errors();
65 break;
66 case 6: name = "PerlTests";
67 if (exec) PerlTests();
68 break;
69
70
71 default: name = "";
72 break; //needed to end loop
73 }
74 }
75
76
77 //---------------------------------------------------------------------------
78 //
79 // Error Checking / Reporting macros used in all of the tests.
80 //
81 //---------------------------------------------------------------------------
// All of these macros expect to be expanded inside a RegexTest member function
// (they call errln()). __LINE__ is the line of the macro invocation.

// REGEX_CHECK_STATUS
//    Uses the variable 'status' from the invoking scope. If it holds a failure
//    code, reports it by name and returns from the enclosing function.
#define REGEX_CHECK_STATUS { \
    if (U_FAILURE(status)) { \
        errln("RegexTest failure at line %d. status=%s\n", __LINE__, u_errorName(status)); \
        return; \
    } \
}

// REGEX_ASSERT(expr)
//    Reports a failure when 'expr' compares equal to FALSE. Does not return;
//    the test keeps running after a failed assertion.
#define REGEX_ASSERT(expr) { \
    if ((expr)==FALSE) { \
        errln("RegexTest failure at line %d.\n", __LINE__); \
    } \
}

// REGEX_ASSERT_FAIL(expr, errcode)
//    Evaluates 'expr' with a fresh, macro-local 'status' (initially
//    U_ZERO_ERROR) that hides any 'status' of the invoking scope, then reports
//    a failure unless that local status ended up equal to 'errcode'.
//    The caller's own 'status' variable is left untouched.
#define REGEX_ASSERT_FAIL(expr, errcode) { \
    UErrorCode status=U_ZERO_ERROR; \
    (expr); \
    if (status!=errcode) { \
        errln("RegexTest failure at line %d. Expected status=%s, got %s\n", \
              __LINE__, u_errorName(errcode), u_errorName(status)); \
    } \
}

// REGEX_CHECK_STATUS_L(line)
//    Like REGEX_CHECK_STATUS, but also prints the caller-supplied 'line',
//    prints the status as a number, and does NOT return.
#define REGEX_CHECK_STATUS_L(line) { \
    if (U_FAILURE(status)) { \
        errln("RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); \
    } \
}

// REGEX_ASSERT_L(expr, line)
//    Like REGEX_ASSERT, but also prints the caller-supplied 'line' and DOES
//    return from the enclosing function on failure.
#define REGEX_ASSERT_L(expr, line) { \
    if ((expr)==FALSE) { \
        errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
        return; \
    } \
}
96
97
98
99 //---------------------------------------------------------------------------
100 //
101 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests
102 // for the LookingAt() and Match() functions.
103 //
104 // usage:
105 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
106 //
107 // The expected results are UBool - TRUE or FALSE.
108 // The input text is unescaped. The pattern is not.
109 //
110 //
111 //---------------------------------------------------------------------------
112
113 #define REGEX_TESTLM(pat, text, looking, match) doRegexLMTest(pat, text, looking, match, __LINE__);
114
doRegexLMTest(const char * pat,const char * text,UBool looking,UBool match,int32_t line)115 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
116 const UnicodeString pattern(pat);
117 const UnicodeString inputText(text);
118 UErrorCode status = U_ZERO_ERROR;
119 UParseError pe;
120 RegexPattern *REPattern = NULL;
121 RegexMatcher *REMatcher = NULL;
122 UBool retVal = TRUE;
123
124 UnicodeString patString(pat);
125 REPattern = RegexPattern::compile(patString, 0, pe, status);
126 if (U_FAILURE(status)) {
127 errln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s\n",
128 line, u_errorName(status));
129 return FALSE;
130 }
131 if (line==376) { RegexPatternDump(REPattern);}
132
133 UnicodeString inputString(inputText);
134 UnicodeString unEscapedInput = inputString.unescape();
135 REMatcher = REPattern->matcher(unEscapedInput, status);
136 if (U_FAILURE(status)) {
137 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
138 line, u_errorName(status));
139 return FALSE;
140 }
141
142 UBool actualmatch;
143 actualmatch = REMatcher->lookingAt(status);
144 if (U_FAILURE(status)) {
145 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
146 line, u_errorName(status));
147 retVal = FALSE;
148 }
149 if (actualmatch != looking) {
150 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
151 retVal = FALSE;
152 }
153
154 status = U_ZERO_ERROR;
155 actualmatch = REMatcher->matches(status);
156 if (U_FAILURE(status)) {
157 errln("RegexTest failure in matches() at line %d. Status = %s\n",
158 line, u_errorName(status));
159 retVal = FALSE;
160 }
161 if (actualmatch != match) {
162 errln("RegexTest: wrong return from matches() at line %d.\n", line);
163 retVal = FALSE;
164 }
165
166 if (retVal == FALSE) {
167 RegexPatternDump(REPattern);
168 }
169
170 delete REPattern;
171 delete REMatcher;
172 return retVal;
173 }
174
175
176
177
178
179 //---------------------------------------------------------------------------
180 //
//  REGEX_ERR   Macro + invocation function to simplify writing
//              regex tests for incorrect patterns
183 //
184 // usage:
185 // REGEX_ERR("pattern", expected error line, column, expected status);
186 //
187 //---------------------------------------------------------------------------
188 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
189
regex_err(const char * pat,int32_t errLine,int32_t errCol,UErrorCode expectedStatus,int32_t line)190 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
191 UErrorCode expectedStatus, int32_t line) {
192 UnicodeString pattern(pat);
193
194 UErrorCode status = U_ZERO_ERROR;
195 UParseError pe;
196 RegexPattern *callerPattern = NULL;
197
198 //
199 // Compile the caller's pattern
200 //
201 UnicodeString patString(pat);
202 callerPattern = RegexPattern::compile(patString, 0, pe, status);
203 if (status != expectedStatus) {
204 errln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
205 } else {
206 if (status != U_ZERO_ERROR) {
207 if (pe.line != errLine || pe.offset != errCol) {
208 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
209 line, errLine, errCol, pe.line, pe.offset);
210 }
211 }
212 }
213
214 delete callerPattern;
215 }
216
217
218
219 //---------------------------------------------------------------------------
220 //
221 // Basic Check for basic functionality of regex pattern matching.
222 // Avoid the use of REGEX_FIND test macro, which has
223 // substantial dependencies on basic Regex functionality.
224 //
225 //---------------------------------------------------------------------------
Basic()226 void RegexTest::Basic() {
227
228
229 //
230 // Debug - slide failing test cases early
231 //
232 #if 0
233 {
234 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
235 UParseError pe;
236 UErrorCode status = U_ZERO_ERROR;
237 RegexPattern::compile("^(?:a?b?)*$", 0, pe, status);
238 // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
239 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
240 }
241 exit(1);
242 #endif
243
244
245 //
246 // Pattern with parentheses
247 //
248 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
249 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
250 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);
251
252 //
253 // Patterns with *
254 //
255 REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
256 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
257 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
258 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
259 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
260
261 REGEX_TESTLM("a*", "", TRUE, TRUE);
262 REGEX_TESTLM("a*", "b", TRUE, FALSE);
263
264
265 //
266 // Patterns with "."
267 //
268 REGEX_TESTLM(".", "abc", TRUE, FALSE);
269 REGEX_TESTLM("...", "abc", TRUE, TRUE);
270 REGEX_TESTLM("....", "abc", FALSE, FALSE);
271 REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
272 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
273 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
274 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
275 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
276
277 //
278 // Patterns with * applied to chars at end of literal string
279 //
280 REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
281 REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
282
283 //
284 // Supplemental chars match as single chars, not a pair of surrogates.
285 //
286 REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
287 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
288 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
289
290
291 //
292 // UnicodeSets in the pattern
293 //
294 REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
295 REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
296 REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
297 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
298 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
299 REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
300
301 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
302 REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
303 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
304 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurences.
305 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);
306
307 //
308 // OR operator in patterns
309 //
310 REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
311 REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
312 REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
313 REGEX_TESTLM("a|b", "b", TRUE, TRUE);
314
315 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
316 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
317 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
318 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
319 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
320 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
321
322 //
323 // +
324 //
325 REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
326 REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
327 REGEX_TESTLM("b+", "", FALSE, FALSE);
328 REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
329 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
330 REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
331
332 //
333 // ?
334 //
335 REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
336 REGEX_TESTLM("ab?", "a", TRUE, TRUE);
337 REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
338 REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
339 REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
340 REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
341 REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
342 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
343 REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
344
345 //
346 // Escape sequences that become single literal chars, handled internally
347 // by ICU's Unescape.
348 //
349
350 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
351 REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
352 REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
353 REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
354 REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
355 REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
356 REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
357 REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
358 REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
359 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
360
361 REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
362 REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input
363
364 // Escape of special chars in patterns
365 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
366
367
368 }
369
370
371 //---------------------------------------------------------------------------
372 //
373 // API_Match Test that the API for class RegexMatcher
374 // is present and nominally working, but excluding functions
375 // implementing replace operations.
376 //
377 //---------------------------------------------------------------------------
API_Match()378 void RegexTest::API_Match() {
379 UParseError pe;
380 UErrorCode status=U_ZERO_ERROR;
381 int32_t flags = 0;
382
383 //
384 // Debug - slide failing test cases early
385 //
386 #if 0
387 {
388 }
389 return;
390 #endif
391
392 //
393 // Simple pattern compilation
394 //
395 {
396 UnicodeString re("abc");
397 RegexPattern *pat2;
398 pat2 = RegexPattern::compile(re, flags, pe, status);
399 REGEX_CHECK_STATUS;
400
401 UnicodeString inStr1 = "abcdef this is a test";
402 UnicodeString instr2 = "not abc";
403 UnicodeString empty = "";
404
405
406 //
407 // Matcher creation and reset.
408 //
409 RegexMatcher *m1 = pat2->matcher(inStr1, status);
410 REGEX_CHECK_STATUS;
411 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
412 REGEX_ASSERT(m1->input() == inStr1);
413 m1->reset(instr2);
414 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
415 REGEX_ASSERT(m1->input() == instr2);
416 m1->reset(inStr1);
417 REGEX_ASSERT(m1->input() == inStr1);
418 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
419 m1->reset(empty);
420 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
421 REGEX_ASSERT(m1->input() == empty);
422 REGEX_ASSERT(&m1->pattern() == pat2);
423
424 //
425 // reset(pos, status)
426 //
427 m1->reset(inStr1);
428 m1->reset(4, status);
429 REGEX_CHECK_STATUS;
430 REGEX_ASSERT(m1->input() == inStr1);
431 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
432
433 m1->reset(-1, status);
434 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
435 status = U_ZERO_ERROR;
436
437 m1->reset(0, status);
438 REGEX_CHECK_STATUS;
439 status = U_ZERO_ERROR;
440
441 int32_t len = m1->input().length();
442 m1->reset(len-1, status);
443 REGEX_CHECK_STATUS;
444 status = U_ZERO_ERROR;
445
446 m1->reset(len, status);
447 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
448 status = U_ZERO_ERROR;
449
450 //
451 // match(pos, status)
452 //
453 m1->reset(instr2);
454 REGEX_ASSERT(m1->matches(4, status) == TRUE);
455 m1->reset();
456 REGEX_ASSERT(m1->matches(3, status) == FALSE);
457 m1->reset();
458 REGEX_ASSERT(m1->matches(5, status) == FALSE);
459 REGEX_ASSERT(m1->matches(4, status) == TRUE);
460 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
461 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
462
463 // Match() at end of string should fail, but should not
464 // be an error.
465 status = U_ZERO_ERROR;
466 len = m1->input().length();
467 REGEX_ASSERT(m1->matches(len, status) == FALSE);
468 REGEX_CHECK_STATUS;
469
470 // Match beyond end of string should fail with an error.
471 status = U_ZERO_ERROR;
472 REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
473 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
474
475 // Successful match at end of string.
476 {
477 status = U_ZERO_ERROR;
478 RegexMatcher m("A?", 0, status); // will match zero length string.
479 REGEX_CHECK_STATUS;
480 m.reset(inStr1);
481 len = inStr1.length();
482 REGEX_ASSERT(m.matches(len, status) == TRUE);
483 REGEX_CHECK_STATUS;
484 m.reset(empty);
485 REGEX_ASSERT(m.matches(0, status) == TRUE);
486 REGEX_CHECK_STATUS;
487 }
488
489
490 //
491 // lookingAt(pos, status)
492 //
493 status = U_ZERO_ERROR;
494 m1->reset(instr2); // "not abc"
495 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
496 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
497 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
498 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
499 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
500 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
501 status = U_ZERO_ERROR;
502 len = m1->input().length();
503 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
504 REGEX_CHECK_STATUS;
505 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
506 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
507
508 delete m1;
509 delete pat2;
510 }
511
512
513 //
514 // Capture Group.
515 // RegexMatcher::start();
516 // RegexMatcher::end();
517 // RegexMatcher::groupCount();
518 //
519 {
520 int32_t flags=0;
521 UParseError pe;
522 UErrorCode status=U_ZERO_ERROR;
523
524 UnicodeString re("01(23(45)67)(.*)");
525 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
526 REGEX_CHECK_STATUS;
527 UnicodeString data = "0123456789";
528
529 RegexMatcher *matcher = pat->matcher(data, status);
530 REGEX_CHECK_STATUS;
531 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
532 static const int32_t matchStarts[] = {0, 2, 4, 8};
533 static const int32_t matchEnds[] = {10, 8, 6, 10};
534 int32_t i;
535 for (i=0; i<4; i++) {
536 int32_t actualStart = matcher->start(i, status);
537 REGEX_CHECK_STATUS;
538 if (actualStart != matchStarts[i]) {
539 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
540 __LINE__, i, matchStarts[i], actualStart);
541 }
542 int32_t actualEnd = matcher->end(i, status);
543 REGEX_CHECK_STATUS;
544 if (actualEnd != matchEnds[i]) {
545 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
546 __LINE__, i, matchEnds[i], actualEnd);
547 }
548 }
549
550 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
551 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
552
553 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
554 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
555 matcher->reset();
556 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
557
558 matcher->lookingAt(status);
559 REGEX_ASSERT(matcher->group(status) == "0123456789");
560 REGEX_ASSERT(matcher->group(0, status) == "0123456789");
561 REGEX_ASSERT(matcher->group(1, status) == "234567" );
562 REGEX_ASSERT(matcher->group(2, status) == "45" );
563 REGEX_ASSERT(matcher->group(3, status) == "89" );
564 REGEX_CHECK_STATUS;
565 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
566 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
567 matcher->reset();
568 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
569
570 delete matcher;
571 delete pat;
572
573 }
574
575 //
576 // find
577 //
578 {
579 int32_t flags=0;
580 UParseError pe;
581 UErrorCode status=U_ZERO_ERROR;
582
583 UnicodeString re("abc");
584 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
585 REGEX_CHECK_STATUS;
586 UnicodeString data = ".abc..abc...abc..";
587 // 012345678901234567
588
589 RegexMatcher *matcher = pat->matcher(data, status);
590 REGEX_CHECK_STATUS;
591 REGEX_ASSERT(matcher->find());
592 REGEX_ASSERT(matcher->start(status) == 1);
593 REGEX_ASSERT(matcher->find());
594 REGEX_ASSERT(matcher->start(status) == 6);
595 REGEX_ASSERT(matcher->find());
596 REGEX_ASSERT(matcher->start(status) == 12);
597 REGEX_ASSERT(matcher->find() == FALSE);
598 REGEX_ASSERT(matcher->find() == FALSE);
599
600 matcher->reset();
601 REGEX_ASSERT(matcher->find());
602 REGEX_ASSERT(matcher->start(status) == 1);
603
604 REGEX_ASSERT(matcher->find(0, status));
605 REGEX_ASSERT(matcher->start(status) == 1);
606 REGEX_ASSERT(matcher->find(1, status));
607 REGEX_ASSERT(matcher->start(status) == 1);
608 REGEX_ASSERT(matcher->find(2, status));
609 REGEX_ASSERT(matcher->start(status) == 6);
610 REGEX_ASSERT(matcher->find(12, status));
611 REGEX_ASSERT(matcher->start(status) == 12);
612 REGEX_ASSERT(matcher->find(13, status) == FALSE);
613 REGEX_ASSERT(matcher->find(16, status) == FALSE);
614 REGEX_ASSERT(matcher->find(17, status) == FALSE);
615 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
616
617 status = U_ZERO_ERROR;
618 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
619 status = U_ZERO_ERROR;
620 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
621
622 REGEX_ASSERT(matcher->groupCount() == 0);
623
624 delete matcher;
625 delete pat;
626 }
627
628
629 //
630 // find, with \G in pattern (true if at the end of a previous match).
631 //
632 {
633 int32_t flags=0;
634 UParseError pe;
635 UErrorCode status=U_ZERO_ERROR;
636
637 UnicodeString re(".*?(?:(\\Gabc)|(abc))");
638 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
639 REGEX_CHECK_STATUS;
640 UnicodeString data = ".abcabc.abc..";
641 // 012345678901234567
642
643 RegexMatcher *matcher = pat->matcher(data, status);
644 REGEX_CHECK_STATUS;
645 REGEX_ASSERT(matcher->find());
646 REGEX_ASSERT(matcher->start(status) == 0);
647 REGEX_ASSERT(matcher->start(1, status) == -1);
648 REGEX_ASSERT(matcher->start(2, status) == 1);
649
650 REGEX_ASSERT(matcher->find());
651 REGEX_ASSERT(matcher->start(status) == 4);
652 REGEX_ASSERT(matcher->start(1, status) == 4);
653 REGEX_ASSERT(matcher->start(2, status) == -1);
654 REGEX_CHECK_STATUS;
655
656 delete matcher;
657 delete pat;
658 }
659
660 //
661 // find with zero length matches, match position should bump ahead
662 // to prevent loops.
663 //
664 {
665 int32_t i;
666 UErrorCode status=U_ZERO_ERROR;
667 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
668 // using an always-true look-ahead.
669 REGEX_CHECK_STATUS;
670 UnicodeString s(" ");
671 m.reset(s);
672 for (i=0; ; i++) {
673 if (m.find() == FALSE) {
674 break;
675 }
676 REGEX_ASSERT(m.start(status) == i);
677 REGEX_ASSERT(m.end(status) == i);
678 }
679 REGEX_ASSERT(i==5);
680
681 // Check that the bump goes over surrogate pairs OK
682 s = "\\U00010001\\U00010002\\U00010003\\U00010004";
683 s = s.unescape();
684 m.reset(s);
685 for (i=0; ; i+=2) {
686 if (m.find() == FALSE) {
687 break;
688 }
689 REGEX_ASSERT(m.start(status) == i);
690 REGEX_ASSERT(m.end(status) == i);
691 }
692 REGEX_ASSERT(i==10);
693 }
694 {
695 // find() loop breaking test.
696 // with pattern of /.?/, should see a series of one char matches, then a single
697 // match of zero length at the end of the input string.
698 int32_t i;
699 UErrorCode status=U_ZERO_ERROR;
700 RegexMatcher m(".?", 0, status);
701 REGEX_CHECK_STATUS;
702 UnicodeString s(" ");
703 m.reset(s);
704 for (i=0; ; i++) {
705 if (m.find() == FALSE) {
706 break;
707 }
708 REGEX_ASSERT(m.start(status) == i);
709 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
710 }
711 REGEX_ASSERT(i==5);
712 }
713
714
715 //
716 // Matchers with no input string behave as if they had an empty input string.
717 //
718
719 {
720 UErrorCode status = U_ZERO_ERROR;
721 RegexMatcher m(".?", 0, status);
722 REGEX_CHECK_STATUS;
723 REGEX_ASSERT(m.find());
724 REGEX_ASSERT(m.start(status) == 0);
725 REGEX_ASSERT(m.input() == "");
726 }
727 {
728 UErrorCode status = U_ZERO_ERROR;
729 RegexPattern *p = RegexPattern::compile(".", 0, status);
730 RegexMatcher *m = p->matcher(status);
731 REGEX_CHECK_STATUS;
732
733 REGEX_ASSERT(m->find() == FALSE);
734 REGEX_ASSERT(m->input() == "");
735 delete m;
736 delete p;
737 }
738
739 //
740 // Regions
741 //
742 {
743 UErrorCode status = U_ZERO_ERROR;
744 UnicodeString testString("This is test data");
745 RegexMatcher m(".*", testString, 0, status);
746 REGEX_CHECK_STATUS;
747 REGEX_ASSERT(m.regionStart() == 0);
748 REGEX_ASSERT(m.regionEnd() == testString.length());
749 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
750 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
751
752 m.region(2,4, status);
753 REGEX_CHECK_STATUS;
754 REGEX_ASSERT(m.matches(status));
755 REGEX_ASSERT(m.start(status)==2);
756 REGEX_ASSERT(m.end(status)==4);
757 REGEX_CHECK_STATUS;
758
759 m.reset();
760 REGEX_ASSERT(m.regionStart() == 0);
761 REGEX_ASSERT(m.regionEnd() == testString.length());
762
763 UnicodeString shorterString("short");
764 m.reset(shorterString);
765 REGEX_ASSERT(m.regionStart() == 0);
766 REGEX_ASSERT(m.regionEnd() == shorterString.length());
767
768 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
769 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
770 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
771 REGEX_ASSERT(&m == &m.reset());
772 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
773
774 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
775 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
776 REGEX_ASSERT(&m == &m.reset());
777 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
778
779 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
780 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
781 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
782 REGEX_ASSERT(&m == &m.reset());
783 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
784
785 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
786 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
787 REGEX_ASSERT(&m == &m.reset());
788 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
789
790 }
791
792 //
793 // hitEnd() and requireEnd()
794 //
795 {
796 UErrorCode status = U_ZERO_ERROR;
797 UnicodeString testString("aabb");
798 RegexMatcher m1(".*", testString, 0, status);
799 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
800 REGEX_ASSERT(m1.hitEnd() == TRUE);
801 REGEX_ASSERT(m1.requireEnd() == FALSE);
802 REGEX_CHECK_STATUS;
803
804 status = U_ZERO_ERROR;
805 RegexMatcher m2("a*", testString, 0, status);
806 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
807 REGEX_ASSERT(m2.hitEnd() == FALSE);
808 REGEX_ASSERT(m2.requireEnd() == FALSE);
809 REGEX_CHECK_STATUS;
810
811 status = U_ZERO_ERROR;
812 RegexMatcher m3(".*$", testString, 0, status);
813 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
814 REGEX_ASSERT(m3.hitEnd() == TRUE);
815 REGEX_ASSERT(m3.requireEnd() == TRUE);
816 REGEX_CHECK_STATUS;
817 }
818
819
820 //
821 // Compilation error on reset with UChar *
822 // These were a hazard that people were stumbling over with runtime errors.
823 // Changed them to compiler errors by adding private methods that more closely
824 // matched the incorrect use of the functions.
825 //
826 #if 0
827 {
828 UErrorCode status = U_ZERO_ERROR;
829 UChar ucharString[20];
830 RegexMatcher m(".", 0, status);
831 m.reset(ucharString); // should not compile.
832
833 RegexPattern *p = RegexPattern::compile(".", 0, status);
834 RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile.
835
836 RegexMatcher m3(".", ucharString, 0, status); // Should not compile
837 }
838 #endif
839
840 }
841
842
843
844
845
846
847 //---------------------------------------------------------------------------
848 //
849 // API_Replace API test for class RegexMatcher, testing the
850 // Replace family of functions.
851 //
852 //---------------------------------------------------------------------------
API_Replace()853 void RegexTest::API_Replace() {
854 //
855 // Replace
856 //
857 int32_t flags=0;
858 UParseError pe;
859 UErrorCode status=U_ZERO_ERROR;
860
861 UnicodeString re("abc");
862 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
863 REGEX_CHECK_STATUS;
864 UnicodeString data = ".abc..abc...abc..";
865 // 012345678901234567
866 RegexMatcher *matcher = pat->matcher(data, status);
867
868 //
869 // Plain vanilla matches.
870 //
871 UnicodeString dest;
872 dest = matcher->replaceFirst("yz", status);
873 REGEX_CHECK_STATUS;
874 REGEX_ASSERT(dest == ".yz..abc...abc..");
875
876 dest = matcher->replaceAll("yz", status);
877 REGEX_CHECK_STATUS;
878 REGEX_ASSERT(dest == ".yz..yz...yz..");
879
880 //
881 // Plain vanilla non-matches.
882 //
883 UnicodeString d2 = ".abx..abx...abx..";
884 matcher->reset(d2);
885 dest = matcher->replaceFirst("yz", status);
886 REGEX_CHECK_STATUS;
887 REGEX_ASSERT(dest == ".abx..abx...abx..");
888
889 dest = matcher->replaceAll("yz", status);
890 REGEX_CHECK_STATUS;
891 REGEX_ASSERT(dest == ".abx..abx...abx..");
892
893 //
894 // Empty source string
895 //
896 UnicodeString d3 = "";
897 matcher->reset(d3);
898 dest = matcher->replaceFirst("yz", status);
899 REGEX_CHECK_STATUS;
900 REGEX_ASSERT(dest == "");
901
902 dest = matcher->replaceAll("yz", status);
903 REGEX_CHECK_STATUS;
904 REGEX_ASSERT(dest == "");
905
906 //
907 // Empty substitution string
908 //
909 matcher->reset(data); // ".abc..abc...abc.."
910 dest = matcher->replaceFirst("", status);
911 REGEX_CHECK_STATUS;
912 REGEX_ASSERT(dest == "...abc...abc..");
913
914 dest = matcher->replaceAll("", status);
915 REGEX_CHECK_STATUS;
916 REGEX_ASSERT(dest == "........");
917
918 //
919 // match whole string
920 //
921 UnicodeString d4 = "abc";
922 matcher->reset(d4);
923 dest = matcher->replaceFirst("xyz", status);
924 REGEX_CHECK_STATUS;
925 REGEX_ASSERT(dest == "xyz");
926
927 dest = matcher->replaceAll("xyz", status);
928 REGEX_CHECK_STATUS;
929 REGEX_ASSERT(dest == "xyz");
930
931 //
932 // Capture Group, simple case
933 //
934 UnicodeString re2("a(..)");
935 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
936 REGEX_CHECK_STATUS;
937 UnicodeString d5 = "abcdefg";
938 RegexMatcher *matcher2 = pat2->matcher(d5, status);
939 REGEX_CHECK_STATUS;
940 dest = matcher2->replaceFirst("$1$1", status);
941 REGEX_CHECK_STATUS;
942 REGEX_ASSERT(dest == "bcbcdefg");
943
944 dest = matcher2->replaceFirst("The value of \\$1 is $1.", status);
945 REGEX_CHECK_STATUS;
946 REGEX_ASSERT(dest == "The value of $1 is bc.defg");
947
948 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
949 REGEX_CHECK_STATUS;
950 REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
951
952 UnicodeString replacement = "Supplemental Digit 1 $\\U0001D7CF.";
953 replacement = replacement.unescape();
954 dest = matcher2->replaceFirst(replacement, status);
955 REGEX_CHECK_STATUS;
956 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
957
958 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
959
960
961 //
962 // Replacement String with \u hex escapes
963 //
964 {
965 UnicodeString src = "abc 1 abc 2 abc 3";
966 UnicodeString substitute = "--\\u0043--";
967 matcher->reset(src);
968 UnicodeString result = matcher->replaceAll(substitute, status);
969 REGEX_CHECK_STATUS;
970 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
971 }
972 {
973 UnicodeString src = "abc !";
974 UnicodeString substitute = "--\\U00010000--";
975 matcher->reset(src);
976 UnicodeString result = matcher->replaceAll(substitute, status);
977 REGEX_CHECK_STATUS;
978 UnicodeString expected = UnicodeString("--");
979 expected.append((UChar32)0x10000);
980 expected.append("-- !");
981 REGEX_ASSERT(result == expected);
982 }
983 // TODO: need more through testing of capture substitutions.
984
985 // Bug 4057
986 //
987 {
988 status = U_ZERO_ERROR;
989 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
990 RegexMatcher m("ss(.*?)ee", 0, status);
991 REGEX_CHECK_STATUS;
992 UnicodeString result;
993
994 // Multiple finds do NOT bump up the previous appendReplacement postion.
995 m.reset(s);
996 m.find();
997 m.find();
998 m.appendReplacement(result, "ooh", status);
999 REGEX_CHECK_STATUS;
1000 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1001
1002 // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1003 status = U_ZERO_ERROR;
1004 result.truncate(0);
1005 m.reset(10, status);
1006 m.find();
1007 m.find();
1008 m.appendReplacement(result, "ooh", status);
1009 REGEX_CHECK_STATUS;
1010 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1011
1012 // find() at interior of string, appendReplacemnt still starts at beginning.
1013 status = U_ZERO_ERROR;
1014 result.truncate(0);
1015 m.reset();
1016 m.find(10, status);
1017 m.find();
1018 m.appendReplacement(result, "ooh", status);
1019 REGEX_CHECK_STATUS;
1020 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1021
1022 m.appendTail(result);
1023 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1024
1025 }
1026
1027 delete matcher2;
1028 delete pat2;
1029 delete matcher;
1030 delete pat;
1031 }
1032
1033
1034 //---------------------------------------------------------------------------
1035 //
1036 // API_Pattern Test that the API for class RegexPattern is
1037 // present and nominally working.
1038 //
1039 //---------------------------------------------------------------------------
API_Pattern()1040 void RegexTest::API_Pattern() {
1041 RegexPattern pata; // Test default constructor to not crash.
1042 RegexPattern patb;
1043
1044 REGEX_ASSERT(pata == patb);
1045 REGEX_ASSERT(pata == pata);
1046
1047 UnicodeString re1("abc[a-l][m-z]");
1048 UnicodeString re2("def");
1049 UErrorCode status = U_ZERO_ERROR;
1050 UParseError pe;
1051
1052 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status);
1053 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status);
1054 REGEX_CHECK_STATUS;
1055 REGEX_ASSERT(*pat1 == *pat1);
1056 REGEX_ASSERT(*pat1 != pata);
1057
1058 // Assign
1059 patb = *pat1;
1060 REGEX_ASSERT(patb == *pat1);
1061
1062 // Copy Construct
1063 RegexPattern patc(*pat1);
1064 REGEX_ASSERT(patc == *pat1);
1065 REGEX_ASSERT(patb == patc);
1066 REGEX_ASSERT(pat1 != pat2);
1067 patb = *pat2;
1068 REGEX_ASSERT(patb != patc);
1069 REGEX_ASSERT(patb == *pat2);
1070
1071 // Compile with no flags.
1072 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status);
1073 REGEX_ASSERT(*pat1a == *pat1);
1074
1075 REGEX_ASSERT(pat1a->flags() == 0);
1076
1077 // Compile with different flags should be not equal
1078 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1079 REGEX_CHECK_STATUS;
1080
1081 REGEX_ASSERT(*pat1b != *pat1a);
1082 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1083 REGEX_ASSERT(pat1a->flags() == 0);
1084 delete pat1b;
1085
1086 // clone
1087 RegexPattern *pat1c = pat1->clone();
1088 REGEX_ASSERT(*pat1c == *pat1);
1089 REGEX_ASSERT(*pat1c != *pat2);
1090
1091 delete pat1c;
1092 delete pat1a;
1093 delete pat1;
1094 delete pat2;
1095
1096
1097 //
1098 // Verify that a matcher created from a cloned pattern works.
1099 // (Jitterbug 3423)
1100 //
1101 {
1102 UErrorCode status = U_ZERO_ERROR;
1103 RegexPattern *pSource = RegexPattern::compile("\\p{L}+", 0, status);
1104 RegexPattern *pClone = pSource->clone();
1105 delete pSource;
1106 RegexMatcher *mFromClone = pClone->matcher(status);
1107 REGEX_CHECK_STATUS;
1108 UnicodeString s = "Hello World";
1109 mFromClone->reset(s);
1110 REGEX_ASSERT(mFromClone->find() == TRUE);
1111 REGEX_ASSERT(mFromClone->group(status) == "Hello");
1112 REGEX_ASSERT(mFromClone->find() == TRUE);
1113 REGEX_ASSERT(mFromClone->group(status) == "World");
1114 REGEX_ASSERT(mFromClone->find() == FALSE);
1115 delete mFromClone;
1116 delete pClone;
1117 }
1118
1119 //
1120 // matches convenience API
1121 //
1122 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1123 REGEX_CHECK_STATUS;
1124 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1125 REGEX_CHECK_STATUS;
1126 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1127 REGEX_CHECK_STATUS;
1128 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1129 REGEX_CHECK_STATUS;
1130 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1131 REGEX_CHECK_STATUS;
1132 status = U_INDEX_OUTOFBOUNDS_ERROR;
1133 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1134 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1135
1136
1137 //
1138 // Split()
1139 //
1140 status = U_ZERO_ERROR;
1141 pat1 = RegexPattern::compile(" +", pe, status);
1142 REGEX_CHECK_STATUS;
1143 UnicodeString fields[10];
1144
1145 int32_t n;
1146 n = pat1->split("Now is the time", fields, 10, status);
1147 REGEX_CHECK_STATUS;
1148 REGEX_ASSERT(n==4);
1149 REGEX_ASSERT(fields[0]=="Now");
1150 REGEX_ASSERT(fields[1]=="is");
1151 REGEX_ASSERT(fields[2]=="the");
1152 REGEX_ASSERT(fields[3]=="time");
1153 REGEX_ASSERT(fields[4]=="");
1154
1155 n = pat1->split("Now is the time", fields, 2, status);
1156 REGEX_CHECK_STATUS;
1157 REGEX_ASSERT(n==2);
1158 REGEX_ASSERT(fields[0]=="Now");
1159 REGEX_ASSERT(fields[1]=="is the time");
1160 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
1161
1162 fields[1] = "*";
1163 status = U_ZERO_ERROR;
1164 n = pat1->split("Now is the time", fields, 1, status);
1165 REGEX_CHECK_STATUS;
1166 REGEX_ASSERT(n==1);
1167 REGEX_ASSERT(fields[0]=="Now is the time");
1168 REGEX_ASSERT(fields[1]=="*");
1169 status = U_ZERO_ERROR;
1170
1171 n = pat1->split(" Now is the time ", fields, 10, status);
1172 REGEX_CHECK_STATUS;
1173 REGEX_ASSERT(n==5);
1174 REGEX_ASSERT(fields[0]=="");
1175 REGEX_ASSERT(fields[1]=="Now");
1176 REGEX_ASSERT(fields[2]=="is");
1177 REGEX_ASSERT(fields[3]=="the");
1178 REGEX_ASSERT(fields[4]=="time");
1179 REGEX_ASSERT(fields[5]=="");
1180
1181 n = pat1->split(" ", fields, 10, status);
1182 REGEX_CHECK_STATUS;
1183 REGEX_ASSERT(n==1);
1184 REGEX_ASSERT(fields[0]=="");
1185
1186 fields[0] = "foo";
1187 n = pat1->split("", fields, 10, status);
1188 REGEX_CHECK_STATUS;
1189 REGEX_ASSERT(n==0);
1190 REGEX_ASSERT(fields[0]=="foo");
1191
1192 delete pat1;
1193
1194 // split, with a pattern with (capture)
1195 pat1 = RegexPattern::compile("<(\\w*)>", pe, status);
1196 REGEX_CHECK_STATUS;
1197
1198 status = U_ZERO_ERROR;
1199 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1200 REGEX_CHECK_STATUS;
1201 REGEX_ASSERT(n==6);
1202 REGEX_ASSERT(fields[0]=="");
1203 REGEX_ASSERT(fields[1]=="a");
1204 REGEX_ASSERT(fields[2]=="Now is ");
1205 REGEX_ASSERT(fields[3]=="b");
1206 REGEX_ASSERT(fields[4]=="the time");
1207 REGEX_ASSERT(fields[5]=="c");
1208 REGEX_ASSERT(fields[6]=="");
1209 REGEX_ASSERT(status==U_ZERO_ERROR);
1210
1211 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
1212 REGEX_CHECK_STATUS;
1213 REGEX_ASSERT(n==6);
1214 REGEX_ASSERT(fields[0]==" ");
1215 REGEX_ASSERT(fields[1]=="a");
1216 REGEX_ASSERT(fields[2]=="Now is ");
1217 REGEX_ASSERT(fields[3]=="b");
1218 REGEX_ASSERT(fields[4]=="the time");
1219 REGEX_ASSERT(fields[5]=="c");
1220 REGEX_ASSERT(fields[6]=="");
1221
1222 status = U_ZERO_ERROR;
1223 fields[6] = "foo";
1224 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
1225 REGEX_CHECK_STATUS;
1226 REGEX_ASSERT(n==6);
1227 REGEX_ASSERT(fields[0]==" ");
1228 REGEX_ASSERT(fields[1]=="a");
1229 REGEX_ASSERT(fields[2]=="Now is ");
1230 REGEX_ASSERT(fields[3]=="b");
1231 REGEX_ASSERT(fields[4]=="the time");
1232 REGEX_ASSERT(fields[5]=="c");
1233 REGEX_ASSERT(fields[6]=="foo");
1234
1235 status = U_ZERO_ERROR;
1236 fields[5] = "foo";
1237 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
1238 REGEX_CHECK_STATUS;
1239 REGEX_ASSERT(n==5);
1240 REGEX_ASSERT(fields[0]==" ");
1241 REGEX_ASSERT(fields[1]=="a");
1242 REGEX_ASSERT(fields[2]=="Now is ");
1243 REGEX_ASSERT(fields[3]=="b");
1244 REGEX_ASSERT(fields[4]=="the time<c>");
1245 REGEX_ASSERT(fields[5]=="foo");
1246
1247 status = U_ZERO_ERROR;
1248 fields[5] = "foo";
1249 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
1250 REGEX_CHECK_STATUS;
1251 REGEX_ASSERT(n==5);
1252 REGEX_ASSERT(fields[0]==" ");
1253 REGEX_ASSERT(fields[1]=="a");
1254 REGEX_ASSERT(fields[2]=="Now is ");
1255 REGEX_ASSERT(fields[3]=="b");
1256 REGEX_ASSERT(fields[4]=="the time");
1257 REGEX_ASSERT(fields[5]=="foo");
1258
1259 status = U_ZERO_ERROR;
1260 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
1261 REGEX_CHECK_STATUS;
1262 REGEX_ASSERT(n==4);
1263 REGEX_ASSERT(fields[0]==" ");
1264 REGEX_ASSERT(fields[1]=="a");
1265 REGEX_ASSERT(fields[2]=="Now is ");
1266 REGEX_ASSERT(fields[3]=="the time<c>");
1267 status = U_ZERO_ERROR;
1268 delete pat1;
1269
1270 pat1 = RegexPattern::compile("([-,])", pe, status);
1271 REGEX_CHECK_STATUS;
1272 n = pat1->split("1-10,20", fields, 10, status);
1273 REGEX_CHECK_STATUS;
1274 REGEX_ASSERT(n==5);
1275 REGEX_ASSERT(fields[0]=="1");
1276 REGEX_ASSERT(fields[1]=="-");
1277 REGEX_ASSERT(fields[2]=="10");
1278 REGEX_ASSERT(fields[3]==",");
1279 REGEX_ASSERT(fields[4]=="20");
1280 delete pat1;
1281
1282
1283 //
1284 // RegexPattern::pattern()
1285 //
1286 pat1 = new RegexPattern();
1287 REGEX_ASSERT(pat1->pattern() == "");
1288 delete pat1;
1289
1290 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1291 REGEX_CHECK_STATUS;
1292 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1293 delete pat1;
1294
1295
1296 //
1297 // classID functions
1298 //
1299 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1300 REGEX_CHECK_STATUS;
1301 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1302 REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1303 UnicodeString Hello("Hello, world.");
1304 RegexMatcher *m = pat1->matcher(Hello, status);
1305 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1306 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1307 REGEX_ASSERT(m->getDynamicClassID() != NULL);
1308 delete m;
1309 delete pat1;
1310
1311 }
1312
1313 //---------------------------------------------------------------------------
1314 //
1315 // Extended A more thorough check for features of regex patterns
1316 // The test cases are in a separate data file,
1317 // source/tests/testdata/regextst.txt
1318 // A description of the test data format is included in that file.
1319 //
1320 //---------------------------------------------------------------------------
1321
1322 const char *
getPath(char buffer[2048],const char * filename)1323 RegexTest::getPath(char buffer[2048], const char *filename) {
1324 UErrorCode status=U_ZERO_ERROR;
1325 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1326 if (U_FAILURE(status)) {
1327 errln("ERROR: loadTestData() failed - %s", u_errorName(status));
1328 return NULL;
1329 }
1330
1331 strcpy(buffer, testDataDirectory);
1332 strcat(buffer, filename);
1333 return buffer;
1334 }
1335
Extended()1336 void RegexTest::Extended() {
1337 char tdd[2048];
1338 const char *srcPath;
1339 UErrorCode status = U_ZERO_ERROR;
1340 int32_t lineNum = 0;
1341
1342 //
1343 // Open and read the test data file.
1344 //
1345 srcPath=getPath(tdd, "regextst.txt");
1346 if(srcPath==NULL) {
1347 return; /* something went wrong, error already output */
1348 }
1349
1350 int32_t len;
1351 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
1352 if (U_FAILURE(status)) {
1353 return; /* something went wrong, error already output */
1354 }
1355
1356 //
1357 // Put the test data into a UnicodeString
1358 //
1359 UnicodeString testString(FALSE, testData, len);
1360
1361 RegexMatcher quotedStuffMat("\\s*([\\'\\\"/])(.*?)\\1", 0, status);
1362 RegexMatcher commentMat ("\\s*(#.*)?$", 0, status);
1363 RegexMatcher flagsMat ("\\s*([ixsmdteDEGLMvabtyYzZ2-9]*)([:letter:]*)", 0, status);
1364
1365 RegexMatcher lineMat("(.*?)\\r?\\n", testString, 0, status);
1366 UnicodeString testPattern; // The pattern for test from the test file.
1367 UnicodeString testFlags; // the flags for a test.
1368 UnicodeString matchString; // The marked up string to be used as input
1369
1370 if (U_FAILURE(status)){
1371 dataerrln("Construct RegexMatcher() error.");
1372 delete [] testData;
1373 return;
1374 }
1375
1376 //
1377 // Loop over the test data file, once per line.
1378 //
1379 while (lineMat.find()) {
1380 lineNum++;
1381 if (U_FAILURE(status)) {
1382 errln("line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
1383 }
1384
1385 status = U_ZERO_ERROR;
1386 UnicodeString testLine = lineMat.group(1, status);
1387 if (testLine.length() == 0) {
1388 continue;
1389 }
1390
1391 //
1392 // Parse the test line. Skip blank and comment only lines.
1393 // Separate out the three main fields - pattern, flags, target.
1394 //
1395
1396 commentMat.reset(testLine);
1397 if (commentMat.lookingAt(status)) {
1398 // This line is a comment, or blank.
1399 continue;
1400 }
1401
1402 //
1403 // Pull out the pattern field, remove it from the test file line.
1404 //
1405 quotedStuffMat.reset(testLine);
1406 if (quotedStuffMat.lookingAt(status)) {
1407 testPattern = quotedStuffMat.group(2, status);
1408 testLine.remove(0, quotedStuffMat.end(0, status));
1409 } else {
1410 errln("Bad pattern (missing quotes?) at test file line %d", lineNum);
1411 continue;
1412 }
1413
1414
1415 //
1416 // Pull out the flags from the test file line.
1417 //
1418 flagsMat.reset(testLine);
1419 flagsMat.lookingAt(status); // Will always match, possibly an empty string.
1420 testFlags = flagsMat.group(1, status);
1421 if (flagsMat.group(2, status).length() > 0) {
1422 errln("Bad Match flag at line %d. Scanning %c\n",
1423 lineNum, flagsMat.group(2, status).charAt(0));
1424 continue;
1425 }
1426 testLine.remove(0, flagsMat.end(0, status));
1427
1428 //
1429 // Pull out the match string, as a whole.
1430 // We'll process the <tags> later.
1431 //
1432 quotedStuffMat.reset(testLine);
1433 if (quotedStuffMat.lookingAt(status)) {
1434 matchString = quotedStuffMat.group(2, status);
1435 testLine.remove(0, quotedStuffMat.end(0, status));
1436 } else {
1437 errln("Bad match string at test file line %d", lineNum);
1438 continue;
1439 }
1440
1441 //
1442 // The only thing left from the input line should be an optional trailing comment.
1443 //
1444 commentMat.reset(testLine);
1445 if (commentMat.lookingAt(status) == FALSE) {
1446 errln("Line %d: unexpected characters at end of test line.", lineNum);
1447 continue;
1448 }
1449
1450 //
1451 // Run the test
1452 //
1453 regex_find(testPattern, testFlags, matchString, lineNum);
1454 }
1455
1456 delete [] testData;
1457
1458 }
1459
1460
1461
1462 //---------------------------------------------------------------------------
1463 //
1464 // regex_find(pattern, flags, inputString, lineNumber)
1465 //
1466 // Function to run a single test from the Extended (data driven) tests.
1467 // See file test/testdata/regextst.txt for a description of the
1468 // pattern and inputString fields, and the allowed flags.
1469 // lineNumber is the source line in regextst.txt of the test.
1470 //
1471 //---------------------------------------------------------------------------
1472
1473
1474 // Set a value into a UVector at position specified by a decimal number in
1475 // a UnicodeString. This is a utility function needed by the actual test function,
1476 // which follows.
set(UVector & vec,int32_t val,UnicodeString index)1477 static void set(UVector &vec, int32_t val, UnicodeString index) {
1478 UErrorCode status=U_ZERO_ERROR;
1479 int32_t idx = 0;
1480 for (int32_t i=0; i<index.length(); i++) {
1481 int32_t d=u_charDigitValue(index.charAt(i));
1482 if (d<0) {return;}
1483 idx = idx*10 + d;
1484 }
1485 while (vec.size()<idx+1) {vec.addElement(-1, status);}
1486 vec.setElementAt(val, idx);
1487 }
1488
regex_find(const UnicodeString & pattern,const UnicodeString & flags,const UnicodeString & inputString,int32_t line)1489 void RegexTest::regex_find(const UnicodeString &pattern,
1490 const UnicodeString &flags,
1491 const UnicodeString &inputString,
1492 int32_t line) {
1493 UnicodeString unEscapedInput;
1494 UnicodeString deTaggedInput;
1495
1496 UErrorCode status = U_ZERO_ERROR;
1497 UParseError pe;
1498 RegexPattern *parsePat = NULL;
1499 RegexMatcher *parseMatcher = NULL;
1500 RegexPattern *callerPattern = NULL;
1501 RegexMatcher *matcher = NULL;
1502 UVector groupStarts(status);
1503 UVector groupEnds(status);
1504 UBool isMatch = FALSE;
1505 UBool failed = FALSE;
1506 int32_t numFinds;
1507 int32_t i;
1508 UBool useMatchesFunc = FALSE;
1509 UBool useLookingAtFunc = FALSE;
1510 int32_t regionStart = -1;
1511 int32_t regionEnd = -1;
1512
1513 //
1514 // Compile the caller's pattern
1515 //
1516 uint32_t bflags = 0;
1517 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag
1518 bflags |= UREGEX_CASE_INSENSITIVE;
1519 }
1520 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag
1521 bflags |= UREGEX_COMMENTS;
1522 }
1523 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag
1524 bflags |= UREGEX_DOTALL;
1525 }
1526 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag
1527 bflags |= UREGEX_MULTILINE;
1528 }
1529
1530 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
1531 bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
1532 }
1533 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
1534 bflags |= UREGEX_UNIX_LINES;
1535 }
1536
1537
1538 callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
1539 if (status != U_ZERO_ERROR) {
1540 #if UCONFIG_NO_BREAK_ITERATION==1
1541 // 'v' test flag means that the test pattern should not compile if ICU was configured
1542 // to not include break iteration. RBBI is needed for Unicode word boundaries.
1543 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
1544 goto cleanupAndReturn;
1545 }
1546 #endif
1547 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
1548 // Expected pattern compilation error.
1549 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
1550 logln("Pattern Compile returns \"%s\"", u_errorName(status));
1551 }
1552 goto cleanupAndReturn;
1553 } else {
1554 // Unexpected pattern compilation error.
1555 errln("Line %d: error %s compiling pattern.", line, u_errorName(status));
1556 goto cleanupAndReturn;
1557 }
1558 }
1559
1560 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag
1561 RegexPatternDump(callerPattern);
1562 }
1563
1564 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag
1565 errln("Expected, but did not get, a pattern compilation error.");
1566 goto cleanupAndReturn;
1567 }
1568
1569
1570 //
1571 // Number of times find() should be called on the test string, default to 1
1572 //
1573 numFinds = 1;
1574 for (i=2; i<=9; i++) {
1575 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
1576 if (numFinds != 1) {
1577 errln("Line %d: more than one digit flag. Scanning %d.", line, i);
1578 goto cleanupAndReturn;
1579 }
1580 numFinds = i;
1581 }
1582 }
1583
1584 // 'M' flag. Use matches() instead of find()
1585 if (flags.indexOf((UChar)0x4d) >= 0) {
1586 useMatchesFunc = TRUE;
1587 }
1588 if (flags.indexOf((UChar)0x4c) >= 0) {
1589 useLookingAtFunc = TRUE;
1590 }
1591
1592 //
1593 // Find the tags in the input data, remove them, and record the group boundary
1594 // positions.
1595 //
1596 parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
1597 REGEX_CHECK_STATUS_L(line);
1598
1599 unEscapedInput = inputString.unescape();
1600 parseMatcher = parsePat->matcher(unEscapedInput, status);
1601 REGEX_CHECK_STATUS_L(line);
1602 while(parseMatcher->find()) {
1603 parseMatcher->appendReplacement(deTaggedInput, "", status);
1604 REGEX_CHECK_STATUS;
1605 UnicodeString groupNum = parseMatcher->group(2, status);
1606 if (groupNum == "r") {
1607 // <r> or </r>, a region specification within the string
1608 if (parseMatcher->group(1, status) == "/") {
1609 regionEnd = deTaggedInput.length();
1610 } else {
1611 regionStart = deTaggedInput.length();
1612 }
1613 } else {
1614 // <digits> or </digits>, a group match boundary tag.
1615 if (parseMatcher->group(1, status) == "/") {
1616 set(groupEnds, deTaggedInput.length(), groupNum);
1617 } else {
1618 set(groupStarts, deTaggedInput.length(), groupNum);
1619 }
1620 }
1621 }
1622 parseMatcher->appendTail(deTaggedInput);
1623 REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
1624 if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
1625 errln("mismatched <r> tags");
1626 failed = TRUE;
1627 goto cleanupAndReturn;
1628 }
1629
1630
1631 //
1632 // Configure the matcher according to the flags specified with this test.
1633 //
1634 matcher = callerPattern->matcher(deTaggedInput, status);
1635 REGEX_CHECK_STATUS_L(line);
1636 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
1637 matcher->setTrace(TRUE);
1638 }
1639 if (regionStart>=0) {
1640 matcher->region(regionStart, regionEnd, status);
1641 REGEX_CHECK_STATUS_L(line);
1642 }
1643 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag
1644 matcher->useAnchoringBounds(FALSE);
1645 }
1646 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag
1647 matcher->useTransparentBounds(TRUE);
1648 }
1649
1650
1651
1652 //
1653 // Do a find on the de-tagged input using the caller's pattern
1654 // TODO: error on count>1 and not find().
1655 // error on both matches() and lookingAt().
1656 //
1657 for (i=0; i<numFinds; i++) {
1658 if (useMatchesFunc) {
1659 isMatch = matcher->matches(status);
1660 } else if (useLookingAtFunc) {
1661 isMatch = matcher->lookingAt(status);
1662 } else {
1663 isMatch = matcher->find();
1664 }
1665 }
1666 matcher->setTrace(FALSE);
1667
1668 //
1669 // Match up the groups from the find() with the groups from the tags
1670 //
1671
1672 // number of tags should match number of groups from find operation.
1673 // matcher->groupCount does not include group 0, the entire match, hence the +1.
1674 // G option in test means that capture group data is not available in the
1675 // expected results, so the check needs to be suppressed.
1676 if (isMatch == FALSE && groupStarts.size() != 0) {
1677 errln("Error at line %d: Match expected, but none found.\n", line);
1678 failed = TRUE;
1679 goto cleanupAndReturn;
1680 }
1681
1682 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
1683 // Only check for match / no match. Don't check capture groups.
1684 if (isMatch && groupStarts.size() == 0) {
1685 errln("Error at line %d: No match expected, but one found.\n", line);
1686 failed = TRUE;
1687 }
1688 goto cleanupAndReturn;
1689 }
1690
1691 for (i=0; i<=matcher->groupCount(); i++) {
1692 int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
1693 if (matcher->start(i, status) != expectedStart) {
1694 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
1695 line, i, expectedStart, matcher->start(i, status));
1696 failed = TRUE;
1697 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
1698 }
1699 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
1700 if (matcher->end(i, status) != expectedEnd) {
1701 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
1702 line, i, expectedEnd, matcher->end(i, status));
1703 failed = TRUE;
1704 // Error on end position; keep going; real error is probably yet to come as group
1705 // end positions work from end of the input data towards the front.
1706 }
1707 }
1708 if ( matcher->groupCount()+1 < groupStarts.size()) {
1709 errln("Error at line %d: Expected %d capture groups, found %d.",
1710 line, groupStarts.size()-1, matcher->groupCount());
1711 failed = TRUE;
1712 }
1713
1714 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
1715 matcher->requireEnd() == TRUE) {
1716 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line);
1717 failed = TRUE;
1718 }
1719 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true
1720 matcher->requireEnd() == FALSE) {
1721 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line);
1722 failed = TRUE;
1723 }
1724 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
1725 matcher->hitEnd() == TRUE) {
1726 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line);
1727 failed = TRUE;
1728 }
1729 if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
1730 matcher->hitEnd() == FALSE) {
1731 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line);
1732 failed = TRUE;
1733 }
1734
1735
1736 cleanupAndReturn:
1737 if (failed) {
1738 errln((UnicodeString)"\""+pattern+(UnicodeString)"\" "
1739 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");
1740 // callerPattern->dump();
1741 }
1742 delete parseMatcher;
1743 delete parsePat;
1744 delete matcher;
1745 delete callerPattern;
1746 }
1747
1748
1749
1750
1751 //---------------------------------------------------------------------------
1752 //
1753 // Errors Check for error handling in patterns.
1754 //
1755 //---------------------------------------------------------------------------
Errors()1756 void RegexTest::Errors() {
1757 // \escape sequences that aren't implemented yet.
1758 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
1759
1760 // Missing close parentheses
1761 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
1762 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
1763 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
1764
1765 // Extra close paren
1766 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
1767 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
1768 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
1769
1770 // Look-ahead, Look-behind
1771 // TODO: add tests for unbounded length look-behinds.
1772 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
1773
1774 // Attempt to use non-default flags
1775 {
1776 UParseError pe;
1777 UErrorCode status = U_ZERO_ERROR;
1778 int32_t flags = UREGEX_CANON_EQ |
1779 UREGEX_COMMENTS | UREGEX_DOTALL |
1780 UREGEX_MULTILINE;
1781 RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
1782 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
1783 delete pat1;
1784 }
1785
1786
1787 // Quantifiers are allowed only after something that can be quantified.
1788 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
1789 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
1790 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
1791
1792 // Mal-formed {min,max} quantifiers
1793 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
1794 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
1795 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
1796 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
1797 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
1798 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
1799 REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan
1800 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format
1801 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
1802
1803 // Ticket 5389
1804 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
1805
1806 }
1807
1808
1809 //-------------------------------------------------------------------------------
1810 //
1811 // Read a text data file, convert it to UChars, and return the data
1812 // in one big UChar * buffer, which the caller must delete.
1813 //
1814 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int32_t & ulen,const char * defEncoding,UErrorCode & status)1815 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
1816 const char *defEncoding, UErrorCode &status) {
1817 UChar *retPtr = NULL;
1818 char *fileBuf = NULL;
1819 UConverter* conv = NULL;
1820 FILE *f = NULL;
1821
1822 ulen = 0;
1823 if (U_FAILURE(status)) {
1824 return retPtr;
1825 }
1826
1827 //
1828 // Open the file.
1829 //
1830 f = fopen(fileName, "rb");
1831 if (f == 0) {
1832 errln("Error opening test data file %s\n", fileName);
1833 status = U_FILE_ACCESS_ERROR;
1834 return NULL;
1835 }
1836 //
1837 // Read it in
1838 //
1839 int32_t fileSize;
1840 int32_t amt_read;
1841
1842 fseek( f, 0, SEEK_END);
1843 fileSize = ftell(f);
1844 fileBuf = new char[fileSize];
1845 fseek(f, 0, SEEK_SET);
1846 amt_read = fread(fileBuf, 1, fileSize, f);
1847 if (amt_read != fileSize || fileSize <= 0) {
1848 errln("Error reading test data file.");
1849 goto cleanUpAndReturn;
1850 }
1851
1852 //
1853 // Look for a Unicode Signature (BOM) on the data just read
1854 //
1855 int32_t signatureLength;
1856 const char * fileBufC;
1857 const char* encoding;
1858
1859 fileBufC = fileBuf;
1860 encoding = ucnv_detectUnicodeSignature(
1861 fileBuf, fileSize, &signatureLength, &status);
1862 if(encoding!=NULL ){
1863 fileBufC += signatureLength;
1864 fileSize -= signatureLength;
1865 } else {
1866 encoding = defEncoding;
1867 if (strcmp(encoding, "utf-8") == 0) {
1868 errln("file %s is missing its BOM", fileName);
1869 }
1870 }
1871
1872 //
1873 // Open a converter to take the rule file to UTF-16
1874 //
1875 conv = ucnv_open(encoding, &status);
1876 if (U_FAILURE(status)) {
1877 goto cleanUpAndReturn;
1878 }
1879
1880 //
1881 // Convert the rules to UChar.
1882 // Preflight first to determine required buffer size.
1883 //
1884 ulen = ucnv_toUChars(conv,
1885 NULL, // dest,
1886 0, // destCapacity,
1887 fileBufC,
1888 fileSize,
1889 &status);
1890 if (status == U_BUFFER_OVERFLOW_ERROR) {
1891 // Buffer Overflow is expected from the preflight operation.
1892 status = U_ZERO_ERROR;
1893
1894 retPtr = new UChar[ulen+1];
1895 ucnv_toUChars(conv,
1896 retPtr, // dest,
1897 ulen+1,
1898 fileBufC,
1899 fileSize,
1900 &status);
1901 }
1902
1903 cleanUpAndReturn:
1904 fclose(f);
1905 delete[] fileBuf;
1906 ucnv_close(conv);
1907 if (U_FAILURE(status)) {
1908 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1909 delete retPtr;
1910 retPtr = 0;
1911 ulen = 0;
1912 };
1913 return retPtr;
1914 }
1915
1916
1917 //-------------------------------------------------------------------------------
1918 //
1919 // PerlTests - Run Perl's regular expression tests
1920 // The input file for this test is re_tests, the standard regular
1921 // expression test data distributed with the Perl source code.
1922 //
1923 // Here is Perl's description of the test data file:
1924 //
1925 // # The tests are in a separate file 't/op/re_tests'.
1926 // # Each line in that file is a separate test.
1927 // # There are five columns, separated by tabs.
1928 // #
1929 // # Column 1 contains the pattern, optionally enclosed in C<''>.
1930 // # Modifiers can be put after the closing C<'>.
1931 // #
1932 // # Column 2 contains the string to be matched.
1933 // #
1934 // # Column 3 contains the expected result:
1935 // # y expect a match
1936 // # n expect no match
1937 // # c expect an error
1938 // # B test exposes a known bug in Perl, should be skipped
1939 // # b test exposes a known bug in Perl, should be skipped if noamp
1940 // #
1941 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
1942 // #
1943 // # Column 4 contains a string, usually C<$&>.
1944 // #
1945 // # Column 5 contains the expected result of double-quote
1946 // # interpolating that string after the match, or start of error message.
1947 // #
1948 // # Column 6, if present, contains a reason why the test is skipped.
1949 // # This is printed with "skipped", for harness to pick up.
1950 // #
1951 // # \n in the tests are interpolated, as are variables of the form ${\w+}.
1952 // #
1953 // # If you want to add a regular expression test that can't be expressed
1954 // # in this format, don't add it here: put it in op/pat.t instead.
1955 //
1956 // For ICU, if field 3 contains an 'i', the test will be skipped.
1957 // The test exposes some known incompatibility between ICU and Perl regexps.
1958 // (The i is in addition to whatever was there before.)
1959 //
1960 //-------------------------------------------------------------------------------
PerlTests()1961 void RegexTest::PerlTests() {
1962 char tdd[2048];
1963 const char *srcPath;
1964 UErrorCode status = U_ZERO_ERROR;
1965 UParseError pe;
1966
1967 //
1968 // Open and read the test data file.
1969 //
1970 srcPath=getPath(tdd, "re_tests.txt");
1971 if(srcPath==NULL) {
1972 return; /* something went wrong, error already output */
1973 }
1974
1975 int32_t len;
1976 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
1977 if (U_FAILURE(status)) {
1978 return; /* something went wrong, error already output */
1979 }
1980
1981 //
1982 // Put the test data into a UnicodeString
1983 //
1984 UnicodeString testDataString(FALSE, testData, len);
1985
1986 //
1987 // Regex to break the input file into lines, and strip the new lines.
1988 // One line per match, capture group one is the desired data.
1989 //
1990 RegexPattern* linePat = RegexPattern::compile("(.+?)[\\r\\n]+", 0, pe, status);
1991 if (U_FAILURE(status)) {
1992 dataerrln("RegexPattern::compile() error");
1993 return;
1994 }
1995 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
1996
1997 //
1998 // Regex to split a test file line into fields.
1999 // There are six fields, separated by tabs.
2000 //
2001 RegexPattern* fieldPat = RegexPattern::compile("\\t", 0, pe, status);
2002
2003 //
2004 // Regex to identify test patterns with flag settings, and to separate them.
2005 // Test patterns with flags look like 'pattern'i
2006 // Test patterns without flags are not quoted: pattern
2007 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
2008 //
2009 RegexPattern *flagPat = RegexPattern::compile("('?)(.*)\\1(.*)", 0, pe, status);
2010 RegexMatcher* flagMat = flagPat->matcher(status);
2011
2012 //
2013 // The Perl tests reference several perl-isms, which are evaluated/substituted
2014 // in the test data. Not being perl, this must be done explicitly. Here
2015 // are string constants and REs for these constructs.
2016 //
2017 UnicodeString nulnulSrc("${nulnul}");
2018 UnicodeString nulnul("\\u0000\\u0000");
2019 nulnul = nulnul.unescape();
2020
2021 UnicodeString ffffSrc("${ffff}");
2022 UnicodeString ffff("\\uffff");
2023 ffff = ffff.unescape();
2024
2025 // regexp for $-[0], $+[2], etc.
2026 RegexPattern *groupsPat = RegexPattern::compile("\\$([+\\-])\\[(\\d+)\\]", 0, pe, status);
2027 RegexMatcher *groupsMat = groupsPat->matcher(status);
2028
2029 // regexp for $0, $1, $2, etc.
2030 RegexPattern *cgPat = RegexPattern::compile("\\$(\\d+)", 0, pe, status);
2031 RegexMatcher *cgMat = cgPat->matcher(status);
2032
2033
2034 //
2035 // Main Loop for the Perl Tests, runs once per line from the
2036 // test data file.
2037 //
2038 int32_t lineNum = 0;
2039 int32_t skippedUnimplementedCount = 0;
2040 while (lineMat->find()) {
2041 lineNum++;
2042
2043 //
2044 // Get a line, break it into its fields, do the Perl
2045 // variable substitutions.
2046 //
2047 UnicodeString line = lineMat->group(1, status);
2048 UnicodeString fields[7];
2049 fieldPat->split(line, fields, 7, status);
2050
2051 flagMat->reset(fields[0]);
2052 flagMat->matches(status);
2053 UnicodeString pattern = flagMat->group(2, status);
2054 pattern.findAndReplace("${bang}", "!");
2055 pattern.findAndReplace(nulnulSrc, "\\u0000\\u0000");
2056 pattern.findAndReplace(ffffSrc, ffff);
2057
2058 //
2059 // Identify patterns that include match flag settings,
2060 // split off the flags, remove the extra quotes.
2061 //
2062 UnicodeString flagStr = flagMat->group(3, status);
2063 if (U_FAILURE(status)) {
2064 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
2065 return;
2066 }
2067 int32_t flags = 0;
2068 const UChar UChar_c = 0x63; // Char constants for the flag letters.
2069 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
2070 const UChar UChar_m = 0x6d;
2071 const UChar UChar_x = 0x78;
2072 const UChar UChar_y = 0x79;
2073 if (flagStr.indexOf(UChar_i) != -1) {
2074 flags |= UREGEX_CASE_INSENSITIVE;
2075 }
2076 if (flagStr.indexOf(UChar_m) != -1) {
2077 flags |= UREGEX_MULTILINE;
2078 }
2079 if (flagStr.indexOf(UChar_x) != -1) {
2080 flags |= UREGEX_COMMENTS;
2081 }
2082
2083 //
2084 // Compile the test pattern.
2085 //
2086 status = U_ZERO_ERROR;
2087 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
2088 if (status == U_REGEX_UNIMPLEMENTED) {
2089 //
2090 // Test of a feature that is planned for ICU, but not yet implemented.
2091 // skip the test.
2092 skippedUnimplementedCount++;
2093 delete testPat;
2094 status = U_ZERO_ERROR;
2095 continue;
2096 }
2097
2098 if (U_FAILURE(status)) {
2099 // Some tests are supposed to generate errors.
2100 // Only report an error for tests that are supposed to succeed.
2101 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
2102 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
2103 {
2104 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
2105 }
2106 status = U_ZERO_ERROR;
2107 delete testPat;
2108 continue;
2109 }
2110
2111 if (fields[2].indexOf(UChar_i) >= 0) {
2112 // ICU should skip this test.
2113 delete testPat;
2114 continue;
2115 }
2116
2117 if (fields[2].indexOf(UChar_c) >= 0) {
2118 // This pattern should have caused a compilation error, but didn't/
2119 errln("line %d: Expected a pattern compile error, got success.", lineNum);
2120 delete testPat;
2121 continue;
2122 }
2123
2124 //
2125 // replace the Perl variables that appear in some of the
2126 // match data strings.
2127 //
2128 UnicodeString matchString = fields[1];
2129 matchString.findAndReplace(nulnulSrc, nulnul);
2130 matchString.findAndReplace(ffffSrc, ffff);
2131
2132 // Replace any \n in the match string with an actual new-line char.
2133 // Don't do full unescape, as this unescapes more than Perl does, which
2134 // causes other spurious failures in the tests.
2135 matchString.findAndReplace("\\n", "\n");
2136
2137
2138
2139 //
2140 // Run the test, check for expected match/don't match result.
2141 //
2142 RegexMatcher *testMat = testPat->matcher(matchString, status);
2143 UBool found = testMat->find();
2144 UBool expected = FALSE;
2145 if (fields[2].indexOf(UChar_y) >=0) {
2146 expected = TRUE;
2147 }
2148 if (expected != found) {
2149 errln("line %d: Expected %smatch, got %smatch",
2150 lineNum, expected?"":"no ", found?"":"no " );
2151 continue;
2152 }
2153
2154 // Don't try to check expected results if there is no match.
2155 // (Some have stuff in the expected fields)
2156 if (!found) {
2157 delete testMat;
2158 delete testPat;
2159 continue;
2160 }
2161
2162 //
2163 // Interpret the Perl expression from the fourth field of the data file,
2164 // building up an ICU string from the results of the ICU match.
2165 // The Perl expression will contain references to the results of
2166 // a regex match, including the matched string, capture group strings,
2167 // group starting and ending indicies, etc.
2168 //
2169 UnicodeString resultString;
2170 UnicodeString perlExpr = fields[3];
2171 groupsMat->reset(perlExpr);
2172 cgMat->reset(perlExpr);
2173
2174 while (perlExpr.length() > 0) {
2175 if (perlExpr.startsWith("$&")) {
2176 resultString.append(testMat->group(status));
2177 perlExpr.remove(0, 2);
2178 }
2179
2180 else if (groupsMat->lookingAt(status)) {
2181 // $-[0] $+[2] etc.
2182 UnicodeString digitString = groupsMat->group(2, status);
2183 int32_t t = 0;
2184 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
2185 UnicodeString plusOrMinus = groupsMat->group(1, status);
2186 int32_t matchPosition;
2187 if (plusOrMinus.compare("+") == 0) {
2188 matchPosition = testMat->end(groupNum, status);
2189 } else {
2190 matchPosition = testMat->start(groupNum, status);
2191 }
2192 if (matchPosition != -1) {
2193 ICU_Utility::appendNumber(resultString, matchPosition);
2194 }
2195 perlExpr.remove(0, groupsMat->end(status));
2196 }
2197
2198 else if (cgMat->lookingAt(status)) {
2199 // $1, $2, $3, etc.
2200 UnicodeString digitString = cgMat->group(1, status);
2201 int32_t t = 0;
2202 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
2203 if (U_SUCCESS(status)) {
2204 resultString.append(testMat->group(groupNum, status));
2205 status = U_ZERO_ERROR;
2206 }
2207 perlExpr.remove(0, cgMat->end(status));
2208 }
2209
2210 else if (perlExpr.startsWith("@-")) {
2211 int32_t i;
2212 for (i=0; i<=testMat->groupCount(); i++) {
2213 if (i>0) {
2214 resultString.append(" ");
2215 }
2216 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
2217 }
2218 perlExpr.remove(0, 2);
2219 }
2220
2221 else if (perlExpr.startsWith("@+")) {
2222 int32_t i;
2223 for (i=0; i<=testMat->groupCount(); i++) {
2224 if (i>0) {
2225 resultString.append(" ");
2226 }
2227 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
2228 }
2229 perlExpr.remove(0, 2);
2230 }
2231
2232 else if (perlExpr.startsWith("\\")) { // \Escape. Take following char as a literal.
2233 // or as an escaped sequence (e.g. \n)
2234 if (perlExpr.length() > 1) {
2235 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
2236 }
2237 UChar c = perlExpr.charAt(0);
2238 switch (c) {
2239 case 'n': c = '\n'; break;
2240 // add any other escape sequences that show up in the test expected results.
2241 }
2242 resultString.append(c);
2243 perlExpr.remove(0, 1);
2244 }
2245
2246 else {
2247 // Any characters from the perl expression that we don't explicitly
2248 // recognize before here are assumed to be literals and copied
2249 // as-is to the expected results.
2250 resultString.append(perlExpr.charAt(0));
2251 perlExpr.remove(0, 1);
2252 }
2253
2254 if (U_FAILURE(status)) {
2255 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
2256 break;
2257 }
2258 }
2259
2260 //
2261 // Expected Results Compare
2262 //
2263 UnicodeString expectedS(fields[4]);
2264 expectedS.findAndReplace(nulnulSrc, nulnul);
2265 expectedS.findAndReplace(ffffSrc, ffff);
2266 expectedS.findAndReplace("\\n", "\n");
2267
2268
2269 if (expectedS.compare(resultString) != 0) {
2270 err("Line %d: Incorrect perl expression results.", lineNum);
2271 errln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
2272 }
2273
2274 delete testMat;
2275 delete testPat;
2276 }
2277
2278 //
2279 // All done. Clean up allocated stuff.
2280 //
2281 delete cgMat;
2282 delete cgPat;
2283
2284 delete groupsMat;
2285 delete groupsPat;
2286
2287 delete flagMat;
2288 delete flagPat;
2289
2290 delete lineMat;
2291 delete linePat;
2292
2293 delete fieldPat;
2294 delete [] testData;
2295
2296
2297 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
2298
2299 }
2300
2301
2302
2303 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
2304
2305