1 /*
2 *******************************************************************************
3 * Copyright (C) 2004-2010, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: regex.cpp
7 */
8
9 #include "unicode/utypes.h"
10
11 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
12
13 #include "unicode/regex.h"
14 #include "unicode/uregex.h"
15 #include "unicode/unistr.h"
16 #include "unicode/ustring.h"
17 #include "unicode/uchar.h"
18 #include "unicode/uobject.h"
19 #include "umutex.h"
20 #include "uassert.h"
21 #include "cmemory.h"
22
23 #include "regextxt.h"
24
25 #include <stdio.h>
26
27 U_NAMESPACE_BEGIN
28
// Space left in a buffer of `len` units once `idx` units are in use.
// Evaluates to 0 when idx has reached or gone past len.
// Both arguments are evaluated more than once, so pass side-effect-free expressions.
#define REMAINING_CAPACITY(idx,len) ( (((len)-(idx)) > 0) ? ((len)-(idx)) : 0 )
30
31 struct RegularExpression: public UMemory {
32 public:
33 RegularExpression();
34 ~RegularExpression();
35 int32_t fMagic;
36 RegexPattern *fPat;
37 int32_t *fPatRefCount;
38 UChar *fPatString;
39 int32_t fPatStringLen;
40 RegexMatcher *fMatcher;
41 const UChar *fText; // Text from setText()
42 int32_t fTextLength; // Length provided by user with setText(), which
43 // may be -1.
44 UBool fOwnsText;
45 };
46
// Value kept in RegularExpression::fMagic for a live object.
// The four bytes spell "rexp" in ASCII.
static const int32_t REXP_MAGIC = 0x72657870;
48
RegularExpression()49 RegularExpression::RegularExpression() {
50 fMagic = REXP_MAGIC;
51 fPat = NULL;
52 fPatRefCount = NULL;
53 fPatString = NULL;
54 fPatStringLen = 0;
55 fMatcher = NULL;
56 fText = NULL;
57 fTextLength = 0;
58 fOwnsText = FALSE;
59 }
60
~RegularExpression()61 RegularExpression::~RegularExpression() {
62 delete fMatcher;
63 fMatcher = NULL;
64 if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) {
65 delete fPat;
66 uprv_free(fPatString);
67 uprv_free(fPatRefCount);
68 }
69 if (fOwnsText && fText!=NULL) {
70 uprv_free((void *)fText);
71 }
72 fMagic = 0;
73 }
74
75 U_NAMESPACE_END
76
77 U_NAMESPACE_USE
78
79 //----------------------------------------------------------------------------------------
80 //
81 // validateRE Do boilerplate style checks on API function parameters.
82 // Return TRUE if they look OK.
83 //----------------------------------------------------------------------------------------
validateRE(const RegularExpression * re,UErrorCode * status,UBool requiresText=TRUE)84 static UBool validateRE(const RegularExpression *re, UErrorCode *status, UBool requiresText = TRUE) {
85 if (U_FAILURE(*status)) {
86 return FALSE;
87 }
88 if (re == NULL || re->fMagic != REXP_MAGIC) {
89 *status = U_ILLEGAL_ARGUMENT_ERROR;
90 return FALSE;
91 }
92 // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway
93 if (requiresText && re->fText == NULL && !re->fOwnsText) {
94 *status = U_REGEX_INVALID_STATE;
95 return FALSE;
96 }
97 return TRUE;
98 }
99
100 //----------------------------------------------------------------------------------------
101 //
102 // uregex_open
103 //
104 //----------------------------------------------------------------------------------------
105 U_CAPI URegularExpression * U_EXPORT2
uregex_open(const UChar * pattern,int32_t patternLength,uint32_t flags,UParseError * pe,UErrorCode * status)106 uregex_open( const UChar *pattern,
107 int32_t patternLength,
108 uint32_t flags,
109 UParseError *pe,
110 UErrorCode *status) {
111
112 if (U_FAILURE(*status)) {
113 return NULL;
114 }
115 if (pattern == NULL || patternLength < -1 || patternLength == 0) {
116 *status = U_ILLEGAL_ARGUMENT_ERROR;
117 return NULL;
118 }
119 int32_t actualPatLen = patternLength;
120 if (actualPatLen == -1) {
121 actualPatLen = u_strlen(pattern);
122 }
123
124 RegularExpression *re = new RegularExpression;
125 int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t));
126 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1));
127 if (re == NULL || refC == NULL || patBuf == NULL) {
128 *status = U_MEMORY_ALLOCATION_ERROR;
129 delete re;
130 uprv_free(refC);
131 uprv_free(patBuf);
132 return NULL;
133 }
134 re->fPatRefCount = refC;
135 *re->fPatRefCount = 1;
136
137 //
138 // Make a copy of the pattern string, so we can return it later if asked.
139 // For compiling the pattern, we will use a UText wrapper around
140 // this local copy, to avoid making even more copies.
141 //
142 re->fPatString = patBuf;
143 re->fPatStringLen = patternLength;
144 u_memcpy(patBuf, pattern, actualPatLen);
145 patBuf[actualPatLen] = 0;
146
147 UText patText = UTEXT_INITIALIZER;
148 utext_openUChars(&patText, patBuf, patternLength, status);
149
150 //
151 // Compile the pattern
152 //
153 if (pe != NULL) {
154 re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
155 } else {
156 re->fPat = RegexPattern::compile(&patText, flags, *status);
157 }
158 utext_close(&patText);
159
160 if (U_FAILURE(*status)) {
161 goto ErrorExit;
162 }
163
164 //
165 // Create the matcher object
166 //
167 re->fMatcher = re->fPat->matcher(*status);
168 if (U_SUCCESS(*status)) {
169 return (URegularExpression*)re;
170 }
171
172 ErrorExit:
173 delete re;
174 return NULL;
175
176 }
177
178 //----------------------------------------------------------------------------------------
179 //
180 // uregex_openUText
181 //
182 //----------------------------------------------------------------------------------------
183 U_CAPI URegularExpression * U_EXPORT2
uregex_openUText(UText * pattern,uint32_t flags,UParseError * pe,UErrorCode * status)184 uregex_openUText(UText *pattern,
185 uint32_t flags,
186 UParseError *pe,
187 UErrorCode *status) {
188
189 if (U_FAILURE(*status)) {
190 return NULL;
191 }
192 if (pattern == NULL) {
193 *status = U_ILLEGAL_ARGUMENT_ERROR;
194 return NULL;
195 }
196
197 int64_t patternNativeLength = utext_nativeLength(pattern);
198
199 if (patternNativeLength == 0) {
200 *status = U_ILLEGAL_ARGUMENT_ERROR;
201 return NULL;
202 }
203
204 RegularExpression *re = new RegularExpression;
205
206 UErrorCode lengthStatus = U_ZERO_ERROR;
207 int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus);
208
209 int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t));
210 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1));
211 if (re == NULL || refC == NULL || patBuf == NULL) {
212 *status = U_MEMORY_ALLOCATION_ERROR;
213 delete re;
214 uprv_free(refC);
215 uprv_free(patBuf);
216 return NULL;
217 }
218 re->fPatRefCount = refC;
219 *re->fPatRefCount = 1;
220
221 //
222 // Make a copy of the pattern string, so we can return it later if asked.
223 // For compiling the pattern, we will use a read-only UText wrapper
224 // around this local copy, to avoid making even more copies.
225 //
226 re->fPatString = patBuf;
227 re->fPatStringLen = pattern16Length;
228 utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status);
229
230 UText patText = UTEXT_INITIALIZER;
231 utext_openUChars(&patText, patBuf, pattern16Length, status);
232
233 //
234 // Compile the pattern
235 //
236 if (pe != NULL) {
237 re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
238 } else {
239 re->fPat = RegexPattern::compile(&patText, flags, *status);
240 }
241 utext_close(&patText);
242
243 if (U_FAILURE(*status)) {
244 goto ErrorExit;
245 }
246
247 //
248 // Create the matcher object
249 //
250 re->fMatcher = re->fPat->matcher(*status);
251 if (U_SUCCESS(*status)) {
252 return (URegularExpression*)re;
253 }
254
255 ErrorExit:
256 delete re;
257 return NULL;
258
259 }
260
261 //----------------------------------------------------------------------------------------
262 //
263 // uregex_close
264 //
265 //----------------------------------------------------------------------------------------
266 U_CAPI void U_EXPORT2
uregex_close(URegularExpression * re2)267 uregex_close(URegularExpression *re2) {
268 RegularExpression *re = (RegularExpression*)re2;
269 UErrorCode status = U_ZERO_ERROR;
270 if (validateRE(re, &status, FALSE) == FALSE) {
271 return;
272 }
273 delete re;
274 }
275
276
277 //----------------------------------------------------------------------------------------
278 //
279 // uregex_clone
280 //
281 //----------------------------------------------------------------------------------------
282 U_CAPI URegularExpression * U_EXPORT2
uregex_clone(const URegularExpression * source2,UErrorCode * status)283 uregex_clone(const URegularExpression *source2, UErrorCode *status) {
284 RegularExpression *source = (RegularExpression*)source2;
285 if (validateRE(source, status, FALSE) == FALSE) {
286 return NULL;
287 }
288
289 RegularExpression *clone = new RegularExpression;
290 if (clone == NULL) {
291 *status = U_MEMORY_ALLOCATION_ERROR;
292 return NULL;
293 }
294
295 clone->fMatcher = source->fPat->matcher(*status);
296 if (U_FAILURE(*status)) {
297 delete clone;
298 return NULL;
299 }
300
301 clone->fPat = source->fPat;
302 clone->fPatRefCount = source->fPatRefCount;
303 clone->fPatString = source->fPatString;
304 clone->fPatStringLen = source->fPatStringLen;
305 umtx_atomic_inc(source->fPatRefCount);
306 // Note: fText is not cloned.
307
308 return (URegularExpression*)clone;
309 }
310
311
312
313
314 //------------------------------------------------------------------------------
315 //
316 // uregex_pattern
317 //
318 //------------------------------------------------------------------------------
319 U_CAPI const UChar * U_EXPORT2
uregex_pattern(const URegularExpression * regexp2,int32_t * patLength,UErrorCode * status)320 uregex_pattern(const URegularExpression *regexp2,
321 int32_t *patLength,
322 UErrorCode *status) {
323 RegularExpression *regexp = (RegularExpression*)regexp2;
324
325 if (validateRE(regexp, status, FALSE) == FALSE) {
326 return NULL;
327 }
328 if (patLength != NULL) {
329 *patLength = regexp->fPatStringLen;
330 }
331 return regexp->fPatString;
332 }
333
334
335 //------------------------------------------------------------------------------
336 //
337 // uregex_patternUText
338 //
339 //------------------------------------------------------------------------------
340 U_CAPI UText * U_EXPORT2
uregex_patternUText(const URegularExpression * regexp2,UErrorCode * status)341 uregex_patternUText(const URegularExpression *regexp2,
342 UErrorCode *status) {
343 RegularExpression *regexp = (RegularExpression*)regexp2;
344 return regexp->fPat->patternText(*status);
345 }
346
347
348 //------------------------------------------------------------------------------
349 //
350 // uregex_flags
351 //
352 //------------------------------------------------------------------------------
353 U_CAPI int32_t U_EXPORT2
uregex_flags(const URegularExpression * regexp2,UErrorCode * status)354 uregex_flags(const URegularExpression *regexp2, UErrorCode *status) {
355 RegularExpression *regexp = (RegularExpression*)regexp2;
356 if (validateRE(regexp, status, FALSE) == FALSE) {
357 return 0;
358 }
359 int32_t flags = regexp->fPat->flags();
360 return flags;
361 }
362
363
364 //------------------------------------------------------------------------------
365 //
366 // uregex_setText
367 //
368 //------------------------------------------------------------------------------
369 U_CAPI void U_EXPORT2
uregex_setText(URegularExpression * regexp2,const UChar * text,int32_t textLength,UErrorCode * status)370 uregex_setText(URegularExpression *regexp2,
371 const UChar *text,
372 int32_t textLength,
373 UErrorCode *status) {
374 RegularExpression *regexp = (RegularExpression*)regexp2;
375 if (validateRE(regexp, status, FALSE) == FALSE) {
376 return;
377 }
378 if (text == NULL || textLength < -1) {
379 *status = U_ILLEGAL_ARGUMENT_ERROR;
380 return;
381 }
382
383 if (regexp->fOwnsText && regexp->fText != NULL) {
384 uprv_free((void *)regexp->fText);
385 }
386
387 regexp->fText = text;
388 regexp->fTextLength = textLength;
389 regexp->fOwnsText = FALSE;
390
391 UText input = UTEXT_INITIALIZER;
392 utext_openUChars(&input, text, textLength, status);
393 regexp->fMatcher->reset(&input);
394 utext_close(&input); // reset() made a shallow clone, so we don't need this copy
395 }
396
397
398 //------------------------------------------------------------------------------
399 //
400 // uregex_setUText
401 //
402 //------------------------------------------------------------------------------
403 U_CAPI void U_EXPORT2
uregex_setUText(URegularExpression * regexp2,UText * text,UErrorCode * status)404 uregex_setUText(URegularExpression *regexp2,
405 UText *text,
406 UErrorCode *status) {
407 RegularExpression *regexp = (RegularExpression*)regexp2;
408 if (validateRE(regexp, status, FALSE) == FALSE) {
409 return;
410 }
411 if (text == NULL) {
412 *status = U_ILLEGAL_ARGUMENT_ERROR;
413 return;
414 }
415
416 if (regexp->fOwnsText && regexp->fText != NULL) {
417 uprv_free((void *)regexp->fText);
418 }
419
420 regexp->fText = NULL; // only fill it in on request
421 regexp->fTextLength = -1;
422 regexp->fOwnsText = TRUE;
423 regexp->fMatcher->reset(text);
424 }
425
426
427
428 //------------------------------------------------------------------------------
429 //
430 // uregex_getText
431 //
432 //------------------------------------------------------------------------------
433 U_CAPI const UChar * U_EXPORT2
uregex_getText(URegularExpression * regexp2,int32_t * textLength,UErrorCode * status)434 uregex_getText(URegularExpression *regexp2,
435 int32_t *textLength,
436 UErrorCode *status) {
437 RegularExpression *regexp = (RegularExpression*)regexp2;
438 if (validateRE(regexp, status, FALSE) == FALSE) {
439 return NULL;
440 }
441
442 if (regexp->fText == NULL) {
443 // need to fill in the text
444 UText *inputText = regexp->fMatcher->inputText();
445 int64_t inputNativeLength = utext_nativeLength(inputText);
446 if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) {
447 regexp->fText = inputText->chunkContents;
448 regexp->fTextLength = (int32_t)inputNativeLength;
449 regexp->fOwnsText = FALSE; // because the UText owns it
450 } else {
451 UErrorCode lengthStatus = U_ZERO_ERROR;
452 regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus); // buffer overflow error
453 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(regexp->fTextLength+1));
454
455 utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status);
456 regexp->fText = inputChars;
457 regexp->fOwnsText = TRUE; // should already be set but just in case
458 }
459 }
460
461 if (textLength != NULL) {
462 *textLength = regexp->fTextLength;
463 }
464 return regexp->fText;
465 }
466
467
468 //------------------------------------------------------------------------------
469 //
470 // uregex_getUText
471 //
472 //------------------------------------------------------------------------------
473 U_CAPI UText * U_EXPORT2
uregex_getUText(URegularExpression * regexp2,UText * dest,UErrorCode * status)474 uregex_getUText(URegularExpression *regexp2,
475 UText *dest,
476 UErrorCode *status) {
477 RegularExpression *regexp = (RegularExpression*)regexp2;
478 if (validateRE(regexp, status, FALSE) == FALSE) {
479 return dest;
480 }
481 return regexp->fMatcher->getInput(dest, *status);
482 }
483
484 // BEGIN android-added
485 // Removed this function after Android upgrade to ICU4.8.
486 //------------------------------------------------------------------------------
487 //
488 // uregex_refreshUText
489 //
490 //------------------------------------------------------------------------------
491 U_CAPI void U_EXPORT2
uregex_refreshUText(URegularExpression * regexp2,UText * text,UErrorCode * status)492 uregex_refreshUText(URegularExpression *regexp2,
493 UText *text,
494 UErrorCode *status) {
495 RegularExpression *regexp = (RegularExpression*)regexp2;
496 if (validateRE(regexp, status, FALSE) == FALSE) {
497 return;
498 }
499 regexp->fMatcher->refreshInputText(text, *status);
500 }
501 // END android-added
502
503 //------------------------------------------------------------------------------
504 //
505 // uregex_matches
506 //
507 //------------------------------------------------------------------------------
508 U_CAPI UBool U_EXPORT2
uregex_matches(URegularExpression * regexp2,int32_t startIndex,UErrorCode * status)509 uregex_matches(URegularExpression *regexp2,
510 int32_t startIndex,
511 UErrorCode *status) {
512 return uregex_matches64( regexp2, (int64_t)startIndex, status);
513 }
514
515 U_CAPI UBool U_EXPORT2
uregex_matches64(URegularExpression * regexp2,int64_t startIndex,UErrorCode * status)516 uregex_matches64(URegularExpression *regexp2,
517 int64_t startIndex,
518 UErrorCode *status) {
519 RegularExpression *regexp = (RegularExpression*)regexp2;
520 UBool result = FALSE;
521 if (validateRE(regexp, status) == FALSE) {
522 return result;
523 }
524 if (startIndex == -1) {
525 result = regexp->fMatcher->matches(*status);
526 } else {
527 result = regexp->fMatcher->matches(startIndex, *status);
528 }
529 return result;
530 }
531
532
533 //------------------------------------------------------------------------------
534 //
535 // uregex_lookingAt
536 //
537 //------------------------------------------------------------------------------
538 U_CAPI UBool U_EXPORT2
uregex_lookingAt(URegularExpression * regexp2,int32_t startIndex,UErrorCode * status)539 uregex_lookingAt(URegularExpression *regexp2,
540 int32_t startIndex,
541 UErrorCode *status) {
542 return uregex_lookingAt64( regexp2, (int64_t)startIndex, status);
543 }
544
545 U_CAPI UBool U_EXPORT2
uregex_lookingAt64(URegularExpression * regexp2,int64_t startIndex,UErrorCode * status)546 uregex_lookingAt64(URegularExpression *regexp2,
547 int64_t startIndex,
548 UErrorCode *status) {
549 RegularExpression *regexp = (RegularExpression*)regexp2;
550 UBool result = FALSE;
551 if (validateRE(regexp, status) == FALSE) {
552 return result;
553 }
554 if (startIndex == -1) {
555 result = regexp->fMatcher->lookingAt(*status);
556 } else {
557 result = regexp->fMatcher->lookingAt(startIndex, *status);
558 }
559 return result;
560 }
561
562
563
564 //------------------------------------------------------------------------------
565 //
566 // uregex_find
567 //
568 //------------------------------------------------------------------------------
569 U_CAPI UBool U_EXPORT2
uregex_find(URegularExpression * regexp2,int32_t startIndex,UErrorCode * status)570 uregex_find(URegularExpression *regexp2,
571 int32_t startIndex,
572 UErrorCode *status) {
573 return uregex_find64( regexp2, (int64_t)startIndex, status);
574 }
575
576 U_CAPI UBool U_EXPORT2
uregex_find64(URegularExpression * regexp2,int64_t startIndex,UErrorCode * status)577 uregex_find64(URegularExpression *regexp2,
578 int64_t startIndex,
579 UErrorCode *status) {
580 RegularExpression *regexp = (RegularExpression*)regexp2;
581 UBool result = FALSE;
582 if (validateRE(regexp, status) == FALSE) {
583 return result;
584 }
585 if (startIndex == -1) {
586 regexp->fMatcher->resetPreserveRegion();
587 result = regexp->fMatcher->find();
588 } else {
589 result = regexp->fMatcher->find(startIndex, *status);
590 }
591 return result;
592 }
593
594
595 //------------------------------------------------------------------------------
596 //
597 // uregex_findNext
598 //
599 //------------------------------------------------------------------------------
600 U_CAPI UBool U_EXPORT2
uregex_findNext(URegularExpression * regexp2,UErrorCode * status)601 uregex_findNext(URegularExpression *regexp2,
602 UErrorCode *status) {
603 RegularExpression *regexp = (RegularExpression*)regexp2;
604 if (validateRE(regexp, status) == FALSE) {
605 return FALSE;
606 }
607 UBool result = regexp->fMatcher->find();
608 return result;
609 }
610
611 //------------------------------------------------------------------------------
612 //
613 // uregex_groupCount
614 //
615 //------------------------------------------------------------------------------
616 U_CAPI int32_t U_EXPORT2
uregex_groupCount(URegularExpression * regexp2,UErrorCode * status)617 uregex_groupCount(URegularExpression *regexp2,
618 UErrorCode *status) {
619 RegularExpression *regexp = (RegularExpression*)regexp2;
620 if (validateRE(regexp, status, FALSE) == FALSE) {
621 return 0;
622 }
623 int32_t result = regexp->fMatcher->groupCount();
624 return result;
625 }
626
627
628 //------------------------------------------------------------------------------
629 //
630 // uregex_group
631 //
632 //------------------------------------------------------------------------------
633 U_CAPI int32_t U_EXPORT2
uregex_group(URegularExpression * regexp2,int32_t groupNum,UChar * dest,int32_t destCapacity,UErrorCode * status)634 uregex_group(URegularExpression *regexp2,
635 int32_t groupNum,
636 UChar *dest,
637 int32_t destCapacity,
638 UErrorCode *status) {
639 RegularExpression *regexp = (RegularExpression*)regexp2;
640 if (validateRE(regexp, status) == FALSE) {
641 return 0;
642 }
643 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
644 *status = U_ILLEGAL_ARGUMENT_ERROR;
645 return 0;
646 }
647
648 if (destCapacity == 0 || regexp->fText != NULL) {
649 // If preflighting or if we already have the text as UChars,
650 // this is a little cheaper than going through uregex_groupUTextDeep()
651
652 //
653 // Pick up the range of characters from the matcher
654 //
655 int32_t startIx = regexp->fMatcher->start(groupNum, *status);
656 int32_t endIx = regexp->fMatcher->end (groupNum, *status);
657 if (U_FAILURE(*status)) {
658 return 0;
659 }
660
661 //
662 // Trim length based on buffer capacity
663 //
664 int32_t fullLength = endIx - startIx;
665 int32_t copyLength = fullLength;
666 if (copyLength < destCapacity) {
667 dest[copyLength] = 0;
668 } else if (copyLength == destCapacity) {
669 *status = U_STRING_NOT_TERMINATED_WARNING;
670 } else {
671 copyLength = destCapacity;
672 *status = U_BUFFER_OVERFLOW_ERROR;
673 }
674
675 //
676 // Copy capture group to user's buffer
677 //
678 if (copyLength > 0) {
679 u_memcpy(dest, ®exp->fText[startIx], copyLength);
680 }
681 return fullLength;
682 } else {
683 UText *groupText = uregex_groupUTextDeep(regexp2, groupNum, NULL, status);
684 int32_t result = utext_extract(groupText, 0, utext_nativeLength(groupText), dest, destCapacity, status);
685 utext_close(groupText);
686 return result;
687 }
688 }
689
690
691 //------------------------------------------------------------------------------
692 //
693 // uregex_groupUText
694 //
695 //------------------------------------------------------------------------------
696 U_CAPI UText * U_EXPORT2
uregex_groupUText(URegularExpression * regexp2,int32_t groupNum,UText * dest,int64_t * groupLength,UErrorCode * status)697 uregex_groupUText(URegularExpression *regexp2,
698 int32_t groupNum,
699 UText *dest,
700 int64_t *groupLength,
701 UErrorCode *status) {
702 RegularExpression *regexp = (RegularExpression*)regexp2;
703 if (validateRE(regexp, status) == FALSE) {
704 UErrorCode emptyTextStatus = U_ZERO_ERROR;
705 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
706 }
707
708 return regexp->fMatcher->group(groupNum, dest, *groupLength, *status);
709 }
710
711 //------------------------------------------------------------------------------
712 //
713 // uregex_groupUTextDeep
714 //
715 //------------------------------------------------------------------------------
716 U_CAPI UText * U_EXPORT2
uregex_groupUTextDeep(URegularExpression * regexp2,int32_t groupNum,UText * dest,UErrorCode * status)717 uregex_groupUTextDeep(URegularExpression *regexp2,
718 int32_t groupNum,
719 UText *dest,
720 UErrorCode *status) {
721 RegularExpression *regexp = (RegularExpression*)regexp2;
722 if (validateRE(regexp, status) == FALSE) {
723 UErrorCode emptyTextStatus = U_ZERO_ERROR;
724 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
725 }
726
727 if (regexp->fText != NULL) {
728 //
729 // Pick up the range of characters from the matcher
730 // and use our already-extracted characters
731 //
732 int32_t startIx = regexp->fMatcher->start(groupNum, *status);
733 int32_t endIx = regexp->fMatcher->end (groupNum, *status);
734 if (U_FAILURE(*status)) {
735 UErrorCode emptyTextStatus = U_ZERO_ERROR;
736 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
737 }
738
739 if (dest) {
740 utext_replace(dest, 0, utext_nativeLength(dest), ®exp->fText[startIx], endIx - startIx, status);
741 } else {
742 UText groupText = UTEXT_INITIALIZER;
743 utext_openUChars(&groupText, ®exp->fText[startIx], endIx - startIx, status);
744 dest = utext_clone(NULL, &groupText, TRUE, FALSE, status);
745 utext_close(&groupText);
746 }
747
748 return dest;
749 } else {
750 return regexp->fMatcher->group(groupNum, dest, *status);
751 }
752 }
753
754 //------------------------------------------------------------------------------
755 //
756 // uregex_start
757 //
758 //------------------------------------------------------------------------------
759 U_CAPI int32_t U_EXPORT2
uregex_start(URegularExpression * regexp2,int32_t groupNum,UErrorCode * status)760 uregex_start(URegularExpression *regexp2,
761 int32_t groupNum,
762 UErrorCode *status) {
763 return (int32_t)uregex_start64( regexp2, groupNum, status);
764 }
765
766 U_CAPI int64_t U_EXPORT2
uregex_start64(URegularExpression * regexp2,int32_t groupNum,UErrorCode * status)767 uregex_start64(URegularExpression *regexp2,
768 int32_t groupNum,
769 UErrorCode *status) {
770 RegularExpression *regexp = (RegularExpression*)regexp2;
771 if (validateRE(regexp, status) == FALSE) {
772 return 0;
773 }
774 int32_t result = regexp->fMatcher->start(groupNum, *status);
775 return result;
776 }
777
778 //------------------------------------------------------------------------------
779 //
780 // uregex_end
781 //
782 //------------------------------------------------------------------------------
783 U_CAPI int32_t U_EXPORT2
uregex_end(URegularExpression * regexp2,int32_t groupNum,UErrorCode * status)784 uregex_end(URegularExpression *regexp2,
785 int32_t groupNum,
786 UErrorCode *status) {
787 return (int32_t)uregex_end64( regexp2, groupNum, status);
788 }
789
790 U_CAPI int64_t U_EXPORT2
uregex_end64(URegularExpression * regexp2,int32_t groupNum,UErrorCode * status)791 uregex_end64(URegularExpression *regexp2,
792 int32_t groupNum,
793 UErrorCode *status) {
794 RegularExpression *regexp = (RegularExpression*)regexp2;
795 if (validateRE(regexp, status) == FALSE) {
796 return 0;
797 }
798 int32_t result = regexp->fMatcher->end(groupNum, *status);
799 return result;
800 }
801
802 //------------------------------------------------------------------------------
803 //
804 // uregex_reset
805 //
806 //------------------------------------------------------------------------------
807 U_CAPI void U_EXPORT2
uregex_reset(URegularExpression * regexp2,int32_t index,UErrorCode * status)808 uregex_reset(URegularExpression *regexp2,
809 int32_t index,
810 UErrorCode *status) {
811 uregex_reset64( regexp2, (int64_t)index, status);
812 }
813
814 U_CAPI void U_EXPORT2
uregex_reset64(URegularExpression * regexp2,int64_t index,UErrorCode * status)815 uregex_reset64(URegularExpression *regexp2,
816 int64_t index,
817 UErrorCode *status) {
818 RegularExpression *regexp = (RegularExpression*)regexp2;
819 if (validateRE(regexp, status) == FALSE) {
820 return;
821 }
822 regexp->fMatcher->reset(index, *status);
823 }
824
825
826 //------------------------------------------------------------------------------
827 //
828 // uregex_setRegion
829 //
830 //------------------------------------------------------------------------------
831 U_CAPI void U_EXPORT2
uregex_setRegion(URegularExpression * regexp2,int32_t regionStart,int32_t regionLimit,UErrorCode * status)832 uregex_setRegion(URegularExpression *regexp2,
833 int32_t regionStart,
834 int32_t regionLimit,
835 UErrorCode *status) {
836 uregex_setRegion64( regexp2, (int64_t)regionStart, (int64_t)regionLimit, status);
837 }
838
839 U_CAPI void U_EXPORT2
uregex_setRegion64(URegularExpression * regexp2,int64_t regionStart,int64_t regionLimit,UErrorCode * status)840 uregex_setRegion64(URegularExpression *regexp2,
841 int64_t regionStart,
842 int64_t regionLimit,
843 UErrorCode *status) {
844 RegularExpression *regexp = (RegularExpression*)regexp2;
845 if (validateRE(regexp, status) == FALSE) {
846 return;
847 }
848 regexp->fMatcher->region(regionStart, regionLimit, *status);
849 }
850
851
852 //------------------------------------------------------------------------------
853 //
854 // uregex_setRegionAndStart
855 //
856 //------------------------------------------------------------------------------
857 U_DRAFT void U_EXPORT2
uregex_setRegionAndStart(URegularExpression * regexp2,int64_t regionStart,int64_t regionLimit,int64_t startIndex,UErrorCode * status)858 uregex_setRegionAndStart(URegularExpression *regexp2,
859 int64_t regionStart,
860 int64_t regionLimit,
861 int64_t startIndex,
862 UErrorCode *status) {
863 RegularExpression *regexp = (RegularExpression*)regexp2;
864 if (validateRE(regexp, status) == FALSE) {
865 return;
866 }
867 regexp->fMatcher->region(regionStart, regionLimit, startIndex, *status);
868 }
869
870 //------------------------------------------------------------------------------
871 //
872 // uregex_regionStart
873 //
874 //------------------------------------------------------------------------------
875 U_CAPI int32_t U_EXPORT2
uregex_regionStart(const URegularExpression * regexp2,UErrorCode * status)876 uregex_regionStart(const URegularExpression *regexp2,
877 UErrorCode *status) {
878 return (int32_t)uregex_regionStart64(regexp2, status);
879 }
880
881 U_CAPI int64_t U_EXPORT2
uregex_regionStart64(const URegularExpression * regexp2,UErrorCode * status)882 uregex_regionStart64(const URegularExpression *regexp2,
883 UErrorCode *status) {
884 RegularExpression *regexp = (RegularExpression*)regexp2;
885 if (validateRE(regexp, status) == FALSE) {
886 return 0;
887 }
888 return regexp->fMatcher->regionStart();
889 }
890
891
892 //------------------------------------------------------------------------------
893 //
894 // uregex_regionEnd
895 //
896 //------------------------------------------------------------------------------
897 U_CAPI int32_t U_EXPORT2
uregex_regionEnd(const URegularExpression * regexp2,UErrorCode * status)898 uregex_regionEnd(const URegularExpression *regexp2,
899 UErrorCode *status) {
900 return (int32_t)uregex_regionEnd64(regexp2, status);
901 }
902
903 U_CAPI int64_t U_EXPORT2
uregex_regionEnd64(const URegularExpression * regexp2,UErrorCode * status)904 uregex_regionEnd64(const URegularExpression *regexp2,
905 UErrorCode *status) {
906 RegularExpression *regexp = (RegularExpression*)regexp2;
907 if (validateRE(regexp, status) == FALSE) {
908 return 0;
909 }
910 return regexp->fMatcher->regionEnd();
911 }
912
913
914 //------------------------------------------------------------------------------
915 //
916 // uregex_hasTransparentBounds
917 //
918 //------------------------------------------------------------------------------
919 U_CAPI UBool U_EXPORT2
uregex_hasTransparentBounds(const URegularExpression * regexp2,UErrorCode * status)920 uregex_hasTransparentBounds(const URegularExpression *regexp2,
921 UErrorCode *status) {
922 RegularExpression *regexp = (RegularExpression*)regexp2;
923 if (validateRE(regexp, status) == FALSE) {
924 return FALSE;
925 }
926 return regexp->fMatcher->hasTransparentBounds();
927 }
928
929
930 //------------------------------------------------------------------------------
931 //
932 // uregex_useTransparentBounds
933 //
934 //------------------------------------------------------------------------------
935 U_CAPI void U_EXPORT2
uregex_useTransparentBounds(URegularExpression * regexp2,UBool b,UErrorCode * status)936 uregex_useTransparentBounds(URegularExpression *regexp2,
937 UBool b,
938 UErrorCode *status) {
939 RegularExpression *regexp = (RegularExpression*)regexp2;
940 if (validateRE(regexp, status) == FALSE) {
941 return;
942 }
943 regexp->fMatcher->useTransparentBounds(b);
944 }
945
946
947 //------------------------------------------------------------------------------
948 //
949 // uregex_hasAnchoringBounds
950 //
951 //------------------------------------------------------------------------------
952 U_CAPI UBool U_EXPORT2
uregex_hasAnchoringBounds(const URegularExpression * regexp2,UErrorCode * status)953 uregex_hasAnchoringBounds(const URegularExpression *regexp2,
954 UErrorCode *status) {
955 RegularExpression *regexp = (RegularExpression*)regexp2;
956 if (validateRE(regexp, status) == FALSE) {
957 return FALSE;
958 }
959 return regexp->fMatcher->hasAnchoringBounds();
960 }
961
962
963 //------------------------------------------------------------------------------
964 //
965 // uregex_useAnchoringBounds
966 //
967 //------------------------------------------------------------------------------
968 U_CAPI void U_EXPORT2
uregex_useAnchoringBounds(URegularExpression * regexp2,UBool b,UErrorCode * status)969 uregex_useAnchoringBounds(URegularExpression *regexp2,
970 UBool b,
971 UErrorCode *status) {
972 RegularExpression *regexp = (RegularExpression*)regexp2;
973 if (validateRE(regexp, status) == FALSE) {
974 return;
975 }
976 regexp->fMatcher->useAnchoringBounds(b);
977 }
978
979
980 //------------------------------------------------------------------------------
981 //
982 // uregex_hitEnd
983 //
984 //------------------------------------------------------------------------------
985 U_CAPI UBool U_EXPORT2
uregex_hitEnd(const URegularExpression * regexp2,UErrorCode * status)986 uregex_hitEnd(const URegularExpression *regexp2,
987 UErrorCode *status) {
988 RegularExpression *regexp = (RegularExpression*)regexp2;
989 if (validateRE(regexp, status) == FALSE) {
990 return FALSE;
991 }
992 return regexp->fMatcher->hitEnd();
993 }
994
995
996 //------------------------------------------------------------------------------
997 //
998 // uregex_requireEnd
999 //
1000 //------------------------------------------------------------------------------
1001 U_CAPI UBool U_EXPORT2
uregex_requireEnd(const URegularExpression * regexp2,UErrorCode * status)1002 uregex_requireEnd(const URegularExpression *regexp2,
1003 UErrorCode *status) {
1004 RegularExpression *regexp = (RegularExpression*)regexp2;
1005 if (validateRE(regexp, status) == FALSE) {
1006 return FALSE;
1007 }
1008 return regexp->fMatcher->requireEnd();
1009 }
1010
1011
1012 //------------------------------------------------------------------------------
1013 //
1014 // uregex_setTimeLimit
1015 //
1016 //------------------------------------------------------------------------------
1017 U_CAPI void U_EXPORT2
uregex_setTimeLimit(URegularExpression * regexp2,int32_t limit,UErrorCode * status)1018 uregex_setTimeLimit(URegularExpression *regexp2,
1019 int32_t limit,
1020 UErrorCode *status) {
1021 RegularExpression *regexp = (RegularExpression*)regexp2;
1022 if (validateRE(regexp, status)) {
1023 regexp->fMatcher->setTimeLimit(limit, *status);
1024 }
1025 }
1026
1027
1028
1029 //------------------------------------------------------------------------------
1030 //
1031 // uregex_getTimeLimit
1032 //
1033 //------------------------------------------------------------------------------
1034 U_CAPI int32_t U_EXPORT2
uregex_getTimeLimit(const URegularExpression * regexp2,UErrorCode * status)1035 uregex_getTimeLimit(const URegularExpression *regexp2,
1036 UErrorCode *status) {
1037 int32_t retVal = 0;
1038 RegularExpression *regexp = (RegularExpression*)regexp2;
1039 if (validateRE(regexp, status)) {
1040 retVal = regexp->fMatcher->getTimeLimit();
1041 }
1042 return retVal;
1043 }
1044
1045
1046
1047 //------------------------------------------------------------------------------
1048 //
1049 // uregex_setStackLimit
1050 //
1051 //------------------------------------------------------------------------------
1052 U_CAPI void U_EXPORT2
uregex_setStackLimit(URegularExpression * regexp2,int32_t limit,UErrorCode * status)1053 uregex_setStackLimit(URegularExpression *regexp2,
1054 int32_t limit,
1055 UErrorCode *status) {
1056 RegularExpression *regexp = (RegularExpression*)regexp2;
1057 if (validateRE(regexp, status)) {
1058 regexp->fMatcher->setStackLimit(limit, *status);
1059 }
1060 }
1061
1062
1063
1064 //------------------------------------------------------------------------------
1065 //
1066 // uregex_getStackLimit
1067 //
1068 //------------------------------------------------------------------------------
1069 U_CAPI int32_t U_EXPORT2
uregex_getStackLimit(const URegularExpression * regexp2,UErrorCode * status)1070 uregex_getStackLimit(const URegularExpression *regexp2,
1071 UErrorCode *status) {
1072 int32_t retVal = 0;
1073 RegularExpression *regexp = (RegularExpression*)regexp2;
1074 if (validateRE(regexp, status)) {
1075 retVal = regexp->fMatcher->getStackLimit();
1076 }
1077 return retVal;
1078 }
1079
1080
1081 //------------------------------------------------------------------------------
1082 //
1083 // uregex_setMatchCallback
1084 //
1085 //------------------------------------------------------------------------------
1086 U_CAPI void U_EXPORT2
uregex_setMatchCallback(URegularExpression * regexp2,URegexMatchCallback * callback,const void * context,UErrorCode * status)1087 uregex_setMatchCallback(URegularExpression *regexp2,
1088 URegexMatchCallback *callback,
1089 const void *context,
1090 UErrorCode *status) {
1091 RegularExpression *regexp = (RegularExpression*)regexp2;
1092 if (validateRE(regexp, status)) {
1093 regexp->fMatcher->setMatchCallback(callback, context, *status);
1094 }
1095 }
1096
1097
1098 //------------------------------------------------------------------------------
1099 //
1100 // uregex_getMatchCallback
1101 //
1102 //------------------------------------------------------------------------------
1103 U_CAPI void U_EXPORT2
uregex_getMatchCallback(const URegularExpression * regexp2,URegexMatchCallback ** callback,const void ** context,UErrorCode * status)1104 uregex_getMatchCallback(const URegularExpression *regexp2,
1105 URegexMatchCallback **callback,
1106 const void **context,
1107 UErrorCode *status) {
1108 RegularExpression *regexp = (RegularExpression*)regexp2;
1109 if (validateRE(regexp, status)) {
1110 regexp->fMatcher->getMatchCallback(*callback, *context, *status);
1111 }
1112 }
1113
1114
1115 //------------------------------------------------------------------------------
1116 //
//    uregex_setFindProgressCallback
1118 //
1119 //------------------------------------------------------------------------------
1120 U_CAPI void U_EXPORT2
uregex_setFindProgressCallback(URegularExpression * regexp2,URegexFindProgressCallback * callback,const void * context,UErrorCode * status)1121 uregex_setFindProgressCallback(URegularExpression *regexp2,
1122 URegexFindProgressCallback *callback,
1123 const void *context,
1124 UErrorCode *status) {
1125 RegularExpression *regexp = (RegularExpression*)regexp2;
1126 if (validateRE(regexp, status)) {
1127 regexp->fMatcher->setFindProgressCallback(callback, context, *status);
1128 }
1129 }
1130
1131
1132 //------------------------------------------------------------------------------
1133 //
//    uregex_getFindProgressCallback
1135 //
1136 //------------------------------------------------------------------------------
1137 U_CAPI void U_EXPORT2
uregex_getFindProgressCallback(const URegularExpression * regexp2,URegexFindProgressCallback ** callback,const void ** context,UErrorCode * status)1138 uregex_getFindProgressCallback(const URegularExpression *regexp2,
1139 URegexFindProgressCallback **callback,
1140 const void **context,
1141 UErrorCode *status) {
1142 RegularExpression *regexp = (RegularExpression*)regexp2;
1143 if (validateRE(regexp, status)) {
1144 regexp->fMatcher->getFindProgressCallback(*callback, *context, *status);
1145 }
1146 }
1147
1148
1149 //------------------------------------------------------------------------------
1150 //
1151 // uregex_replaceAll
1152 //
1153 //------------------------------------------------------------------------------
1154 U_CAPI int32_t U_EXPORT2
uregex_replaceAll(URegularExpression * regexp2,const UChar * replacementText,int32_t replacementLength,UChar * destBuf,int32_t destCapacity,UErrorCode * status)1155 uregex_replaceAll(URegularExpression *regexp2,
1156 const UChar *replacementText,
1157 int32_t replacementLength,
1158 UChar *destBuf,
1159 int32_t destCapacity,
1160 UErrorCode *status) {
1161 RegularExpression *regexp = (RegularExpression*)regexp2;
1162 if (validateRE(regexp, status) == FALSE) {
1163 return 0;
1164 }
1165 if (replacementText == NULL || replacementLength < -1 ||
1166 (destBuf == NULL && destCapacity > 0) ||
1167 destCapacity < 0) {
1168 *status = U_ILLEGAL_ARGUMENT_ERROR;
1169 return 0;
1170 }
1171
1172 int32_t len = 0;
1173
1174 uregex_reset(regexp2, 0, status);
1175
1176 // Note: Seperate error code variables for findNext() and appendReplacement()
1177 // are used so that destination buffer overflow errors
1178 // in appendReplacement won't stop findNext() from working.
1179 // appendReplacement() and appendTail() special case incoming buffer
1180 // overflow errors, continuing to return the correct length.
1181 UErrorCode findStatus = *status;
1182 while (uregex_findNext(regexp2, &findStatus)) {
1183 len += uregex_appendReplacement(regexp2, replacementText, replacementLength,
1184 &destBuf, &destCapacity, status);
1185 }
1186 len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
1187
1188 if (U_FAILURE(findStatus)) {
1189 // If anything went wrong with the findNext(), make that error trump
1190 // whatever may have happened with the append() operations.
1191 // Errors in findNext() are not expected.
1192 *status = findStatus;
1193 }
1194
1195 return len;
1196 }
1197
1198
1199 //------------------------------------------------------------------------------
1200 //
1201 // uregex_replaceAllUText
1202 //
1203 //------------------------------------------------------------------------------
1204 U_CAPI UText * U_EXPORT2
uregex_replaceAllUText(URegularExpression * regexp2,UText * replacementText,UText * dest,UErrorCode * status)1205 uregex_replaceAllUText(URegularExpression *regexp2,
1206 UText *replacementText,
1207 UText *dest,
1208 UErrorCode *status) {
1209 RegularExpression *regexp = (RegularExpression*)regexp2;
1210 if (validateRE(regexp, status) == FALSE) {
1211 return 0;
1212 }
1213 if (replacementText == NULL) {
1214 *status = U_ILLEGAL_ARGUMENT_ERROR;
1215 return 0;
1216 }
1217
1218 dest = regexp->fMatcher->replaceAll(replacementText, dest, *status);
1219 return dest;
1220 }
1221
1222
1223 //------------------------------------------------------------------------------
1224 //
1225 // uregex_replaceFirst
1226 //
1227 //------------------------------------------------------------------------------
1228 U_CAPI int32_t U_EXPORT2
uregex_replaceFirst(URegularExpression * regexp2,const UChar * replacementText,int32_t replacementLength,UChar * destBuf,int32_t destCapacity,UErrorCode * status)1229 uregex_replaceFirst(URegularExpression *regexp2,
1230 const UChar *replacementText,
1231 int32_t replacementLength,
1232 UChar *destBuf,
1233 int32_t destCapacity,
1234 UErrorCode *status) {
1235 RegularExpression *regexp = (RegularExpression*)regexp2;
1236 if (validateRE(regexp, status) == FALSE) {
1237 return 0;
1238 }
1239 if (replacementText == NULL || replacementLength < -1 ||
1240 (destBuf == NULL && destCapacity > 0) ||
1241 destCapacity < 0) {
1242 *status = U_ILLEGAL_ARGUMENT_ERROR;
1243 return 0;
1244 }
1245
1246 int32_t len = 0;
1247 UBool findSucceeded;
1248 uregex_reset(regexp2, 0, status);
1249 findSucceeded = uregex_find(regexp2, 0, status);
1250 if (findSucceeded) {
1251 len = uregex_appendReplacement(regexp2, replacementText, replacementLength,
1252 &destBuf, &destCapacity, status);
1253 }
1254 len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
1255
1256 return len;
1257 }
1258
1259
1260 //------------------------------------------------------------------------------
1261 //
1262 // uregex_replaceFirstUText
1263 //
1264 //------------------------------------------------------------------------------
1265 U_CAPI UText * U_EXPORT2
uregex_replaceFirstUText(URegularExpression * regexp2,UText * replacementText,UText * dest,UErrorCode * status)1266 uregex_replaceFirstUText(URegularExpression *regexp2,
1267 UText *replacementText,
1268 UText *dest,
1269 UErrorCode *status) {
1270 RegularExpression *regexp = (RegularExpression*)regexp2;
1271 if (validateRE(regexp, status) == FALSE) {
1272 return 0;
1273 }
1274 if (replacementText == NULL) {
1275 *status = U_ILLEGAL_ARGUMENT_ERROR;
1276 return 0;
1277 }
1278
1279 dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status);
1280 return dest;
1281 }
1282
1283
1284 //------------------------------------------------------------------------------
1285 //
1286 // uregex_appendReplacement
1287 //
1288 //------------------------------------------------------------------------------
1289
1290 U_NAMESPACE_BEGIN
1291 //
1292 // Dummy class, because these functions need to be friends of class RegexMatcher,
1293 // and stand-alone C functions don't work as friends
1294 //
1295 class RegexCImpl {
1296 public:
1297 inline static int32_t appendReplacement(RegularExpression *regexp,
1298 const UChar *replacementText,
1299 int32_t replacementLength,
1300 UChar **destBuf,
1301 int32_t *destCapacity,
1302 UErrorCode *status);
1303
1304 inline static int32_t appendTail(RegularExpression *regexp,
1305 UChar **destBuf,
1306 int32_t *destCapacity,
1307 UErrorCode *status);
1308
1309 inline static int32_t split(RegularExpression *regexp,
1310 UChar *destBuf,
1311 int32_t destCapacity,
1312 int32_t *requiredCapacity,
1313 UChar *destFields[],
1314 int32_t destFieldsCapacity,
1315 UErrorCode *status);
1316 };
1317
1318 U_NAMESPACE_END
1319
1320
1321
1322 static const UChar BACKSLASH = 0x5c;
1323 static const UChar DOLLARSIGN = 0x24;
1324
1325 //
1326 // Move a character to an output buffer, with bounds checking on the index.
1327 // Index advances even if capacity is exceeded, for preflight size computations.
1328 // This little sequence is used a LOT.
1329 //
appendToBuf(UChar c,int32_t * idx,UChar * buf,int32_t bufCapacity)1330 static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) {
1331 if (*idx < bufCapacity) {
1332 buf[*idx] = c;
1333 }
1334 (*idx)++;
1335 }
1336
1337
1338 //
1339 // appendReplacement, the actual implementation.
1340 //
appendReplacement(RegularExpression * regexp,const UChar * replacementText,int32_t replacementLength,UChar ** destBuf,int32_t * destCapacity,UErrorCode * status)1341 int32_t RegexCImpl::appendReplacement(RegularExpression *regexp,
1342 const UChar *replacementText,
1343 int32_t replacementLength,
1344 UChar **destBuf,
1345 int32_t *destCapacity,
1346 UErrorCode *status) {
1347
1348 // If we come in with a buffer overflow error, don't suppress the operation.
1349 // A series of appendReplacements, appendTail need to correctly preflight
1350 // the buffer size when an overflow happens somewhere in the middle.
1351 UBool pendingBufferOverflow = FALSE;
1352 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
1353 pendingBufferOverflow = TRUE;
1354 *status = U_ZERO_ERROR;
1355 }
1356
1357 //
1358 // Validate all paramters
1359 //
1360 if (validateRE(regexp, status) == FALSE) {
1361 return 0;
1362 }
1363 if (replacementText == NULL || replacementLength < -1 ||
1364 destCapacity == NULL || destBuf == NULL ||
1365 (*destBuf == NULL && *destCapacity > 0) ||
1366 *destCapacity < 0) {
1367 *status = U_ILLEGAL_ARGUMENT_ERROR;
1368 return 0;
1369 }
1370
1371 RegexMatcher *m = regexp->fMatcher;
1372 if (m->fMatch == FALSE) {
1373 *status = U_REGEX_INVALID_STATE;
1374 return 0;
1375 }
1376
1377 UChar *dest = *destBuf;
1378 int32_t capacity = *destCapacity;
1379 int32_t destIdx = 0;
1380 int32_t i;
1381
1382 // If it wasn't supplied by the caller, get the length of the replacement text.
1383 // TODO: slightly smarter logic in the copy loop could watch for the NUL on
1384 // the fly and avoid this step.
1385 if (replacementLength == -1) {
1386 replacementLength = u_strlen(replacementText);
1387 }
1388
1389 // Copy input string from the end of previous match to start of current match
1390 if (regexp->fText != NULL) {
1391 int32_t matchStart;
1392 int32_t lastMatchEnd;
1393 if (UTEXT_USES_U16(m->fInputText)) {
1394 lastMatchEnd = (int32_t)m->fLastMatchEnd;
1395 matchStart = (int32_t)m->fMatchStart;
1396 } else {
1397 // !!!: Would like a better way to do this!
1398 UErrorCode status = U_ZERO_ERROR;
1399 lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &status);
1400 status = U_ZERO_ERROR;
1401 matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &status);
1402 }
1403 for (i=lastMatchEnd; i<matchStart; i++) {
1404 appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
1405 }
1406 } else {
1407 UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore
1408 destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart,
1409 &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), &possibleOverflowError);
1410 }
1411
1412
1413 // scan the replacement text, looking for substitutions ($n) and \escapes.
1414 int32_t replIdx = 0;
1415 while (replIdx < replacementLength) {
1416 UChar c = replacementText[replIdx];
1417 replIdx++;
1418 if (c != DOLLARSIGN && c != BACKSLASH) {
1419 // Common case, no substitution, no escaping,
1420 // just copy the char to the dest buf.
1421 appendToBuf(c, &destIdx, dest, capacity);
1422 continue;
1423 }
1424
1425 if (c == BACKSLASH) {
1426 // Backslash Escape. Copy the following char out without further checks.
1427 // Note: Surrogate pairs don't need any special handling
1428 // The second half wont be a '$' or a '\', and
1429 // will move to the dest normally on the next
1430 // loop iteration.
1431 if (replIdx >= replacementLength) {
1432 break;
1433 }
1434 c = replacementText[replIdx];
1435
1436 if (c==0x55/*U*/ || c==0x75/*u*/) {
1437 // We have a \udddd or \Udddddddd escape sequence.
1438 UChar32 escapedChar =
1439 u_unescapeAt(uregex_ucstr_unescape_charAt,
1440 &replIdx, // Index is updated by unescapeAt
1441 replacementLength, // Length of replacement text
1442 (void *)replacementText);
1443
1444 if (escapedChar != (UChar32)0xFFFFFFFF) {
1445 if (escapedChar <= 0xffff) {
1446 appendToBuf((UChar)escapedChar, &destIdx, dest, capacity);
1447 } else {
1448 appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity);
1449 appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity);
1450 }
1451 continue;
1452 }
1453 // Note: if the \u escape was invalid, just fall through and
1454 // treat it as a plain \<anything> escape.
1455 }
1456
1457 // Plain backslash escape. Just put out the escaped character.
1458 appendToBuf(c, &destIdx, dest, capacity);
1459
1460 replIdx++;
1461 continue;
1462 }
1463
1464
1465
1466 // We've got a $. Pick up a capture group number if one follows.
1467 // Consume at most the number of digits necessary for the largest capture
1468 // number that is valid for this pattern.
1469
1470 int32_t numDigits = 0;
1471 int32_t groupNum = 0;
1472 UChar32 digitC;
1473 for (;;) {
1474 if (replIdx >= replacementLength) {
1475 break;
1476 }
1477 U16_GET(replacementText, 0, replIdx, replacementLength, digitC);
1478 if (u_isdigit(digitC) == FALSE) {
1479 break;
1480 }
1481
1482 U16_FWD_1(replacementText, replIdx, replacementLength);
1483 groupNum=groupNum*10 + u_charDigitValue(digitC);
1484 numDigits++;
1485 if (numDigits >= m->fPattern->fMaxCaptureDigits) {
1486 break;
1487 }
1488 }
1489
1490
1491 if (numDigits == 0) {
1492 // The $ didn't introduce a group number at all.
1493 // Treat it as just part of the substitution text.
1494 appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
1495 continue;
1496 }
1497
1498 // Finally, append the capture group data to the destination.
1499 destIdx += uregex_group((URegularExpression*)regexp, groupNum, &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
1500 if (*status == U_BUFFER_OVERFLOW_ERROR) {
1501 // Ignore buffer overflow when extracting the group. We need to
1502 // continue on to get full size of the untruncated result. We will
1503 // raise our own buffer overflow error at the end.
1504 *status = U_ZERO_ERROR;
1505 }
1506
1507 if (U_FAILURE(*status)) {
1508 // Can fail if group number is out of range.
1509 break;
1510 }
1511
1512 }
1513
1514 //
1515 // Nul Terminate the dest buffer if possible.
1516 // Set the appropriate buffer overflow or not terminated error, if needed.
1517 //
1518 if (destIdx < capacity) {
1519 dest[destIdx] = 0;
1520 } else if (destIdx == *destCapacity) {
1521 *status = U_STRING_NOT_TERMINATED_WARNING;
1522 } else {
1523 *status = U_BUFFER_OVERFLOW_ERROR;
1524 }
1525
1526 //
1527 // Return an updated dest buffer and capacity to the caller.
1528 //
1529 if (destIdx > 0 && *destCapacity > 0) {
1530 if (destIdx < capacity) {
1531 *destBuf += destIdx;
1532 *destCapacity -= destIdx;
1533 } else {
1534 *destBuf += capacity;
1535 *destCapacity = 0;
1536 }
1537 }
1538
1539 // If we came in with a buffer overflow, make sure we go out with one also.
1540 // (A zero length match right at the end of the previous match could
1541 // make this function succeed even though a previous call had overflowed the buf)
1542 if (pendingBufferOverflow && U_SUCCESS(*status)) {
1543 *status = U_BUFFER_OVERFLOW_ERROR;
1544 }
1545
1546 return destIdx;
1547 }
1548
1549 //
1550 // appendReplacement the actual API function,
1551 //
1552 U_CAPI int32_t U_EXPORT2
uregex_appendReplacement(URegularExpression * regexp2,const UChar * replacementText,int32_t replacementLength,UChar ** destBuf,int32_t * destCapacity,UErrorCode * status)1553 uregex_appendReplacement(URegularExpression *regexp2,
1554 const UChar *replacementText,
1555 int32_t replacementLength,
1556 UChar **destBuf,
1557 int32_t *destCapacity,
1558 UErrorCode *status) {
1559
1560 RegularExpression *regexp = (RegularExpression*)regexp2;
1561 return RegexCImpl::appendReplacement(
1562 regexp, replacementText, replacementLength,destBuf, destCapacity, status);
1563 }
1564
1565 //
1566 // uregex_appendReplacementUText...can just use the normal C++ method
1567 //
1568 U_CAPI void U_EXPORT2
uregex_appendReplacementUText(URegularExpression * regexp2,UText * replText,UText * dest,UErrorCode * status)1569 uregex_appendReplacementUText(URegularExpression *regexp2,
1570 UText *replText,
1571 UText *dest,
1572 UErrorCode *status) {
1573 RegularExpression *regexp = (RegularExpression*)regexp2;
1574 regexp->fMatcher->appendReplacement(dest, replText, *status);
1575 }
1576
1577
1578 //------------------------------------------------------------------------------
1579 //
1580 // uregex_appendTail
1581 //
1582 //------------------------------------------------------------------------------
appendTail(RegularExpression * regexp,UChar ** destBuf,int32_t * destCapacity,UErrorCode * status)1583 int32_t RegexCImpl::appendTail(RegularExpression *regexp,
1584 UChar **destBuf,
1585 int32_t *destCapacity,
1586 UErrorCode *status)
1587 {
1588
1589 // If we come in with a buffer overflow error, don't suppress the operation.
1590 // A series of appendReplacements, appendTail need to correctly preflight
1591 // the buffer size when an overflow happens somewhere in the middle.
1592 UBool pendingBufferOverflow = FALSE;
1593 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
1594 pendingBufferOverflow = TRUE;
1595 *status = U_ZERO_ERROR;
1596 }
1597
1598 if (validateRE(regexp, status) == FALSE) {
1599 return 0;
1600 }
1601
1602 if (destCapacity == NULL || destBuf == NULL ||
1603 (*destBuf == NULL && *destCapacity > 0) ||
1604 *destCapacity < 0)
1605 {
1606 *status = U_ILLEGAL_ARGUMENT_ERROR;
1607 return 0;
1608 }
1609
1610 RegexMatcher *m = regexp->fMatcher;
1611
1612 int32_t destIdx = 0;
1613 int32_t destCap = *destCapacity;
1614 UChar *dest = *destBuf;
1615
1616 if (regexp->fText != NULL) {
1617 int32_t srcIdx;
1618 int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd);
1619 if (nativeIdx == -1) {
1620 srcIdx = 0;
1621 } else if (UTEXT_USES_U16(m->fInputText)) {
1622 srcIdx = (int32_t)nativeIdx;
1623 } else {
1624 UErrorCode status = U_ZERO_ERROR;
1625 srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &status);
1626 }
1627
1628 for (;;) {
1629 if (srcIdx == regexp->fTextLength) {
1630 break;
1631 }
1632 UChar c = regexp->fText[srcIdx];
1633 if (c == 0 && regexp->fTextLength == -1) {
1634 regexp->fTextLength = srcIdx;
1635 break;
1636 }
1637 if (destIdx < destCap) {
1638 dest[destIdx] = c;
1639 } else {
1640 // We've overflowed the dest buffer.
1641 // If the total input string length is known, we can
1642 // compute the total buffer size needed without scanning through the string.
1643 if (regexp->fTextLength > 0) {
1644 destIdx += (regexp->fTextLength - srcIdx);
1645 break;
1646 }
1647 }
1648 srcIdx++;
1649 destIdx++;
1650 }
1651 } else {
1652 int64_t srcIdx;
1653 if (m->fMatch) {
1654 // The most recent call to find() succeeded.
1655 srcIdx = m->fMatchEnd;
1656 } else {
1657 // The last call to find() on this matcher failed().
1658 // Look back to the end of the last find() that succeeded for src index.
1659 srcIdx = m->fLastMatchEnd;
1660 if (srcIdx == -1) {
1661 // There has been no successful match with this matcher.
1662 // We want to copy the whole string.
1663 srcIdx = 0;
1664 }
1665 }
1666
1667 destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, destCap, status);
1668 }
1669
1670 //
1671 // NUL terminate the output string, if possible, otherwise issue the
1672 // appropriate error or warning.
1673 //
1674 if (destIdx < destCap) {
1675 dest[destIdx] = 0;
1676 } else if (destIdx == destCap) {
1677 *status = U_STRING_NOT_TERMINATED_WARNING;
1678 } else {
1679 *status = U_BUFFER_OVERFLOW_ERROR;
1680 }
1681
1682 //
1683 // Update the user's buffer ptr and capacity vars to reflect the
1684 // amount used.
1685 //
1686 if (destIdx < destCap) {
1687 *destBuf += destIdx;
1688 *destCapacity -= destIdx;
1689 } else {
1690 *destBuf += destCap;
1691 *destCapacity = 0;
1692 }
1693
1694 if (pendingBufferOverflow && U_SUCCESS(*status)) {
1695 *status = U_BUFFER_OVERFLOW_ERROR;
1696 }
1697
1698 return destIdx;
1699 }
1700
1701
1702 //
1703 // appendTail the actual API function
1704 //
1705 U_CAPI int32_t U_EXPORT2
uregex_appendTail(URegularExpression * regexp2,UChar ** destBuf,int32_t * destCapacity,UErrorCode * status)1706 uregex_appendTail(URegularExpression *regexp2,
1707 UChar **destBuf,
1708 int32_t *destCapacity,
1709 UErrorCode *status) {
1710 RegularExpression *regexp = (RegularExpression*)regexp2;
1711 return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status);
1712 }
1713
1714
1715 //
1716 // uregex_appendTailUText...can just use the normal C++ method
1717 //
1718 U_CAPI UText * U_EXPORT2
uregex_appendTailUText(URegularExpression * regexp2,UText * dest,UErrorCode * status)1719 uregex_appendTailUText(URegularExpression *regexp2,
1720 UText *dest,
1721 UErrorCode *status) {
1722 RegularExpression *regexp = (RegularExpression*)regexp2;
1723 return regexp->fMatcher->appendTail(dest, *status);
1724 }
1725
1726
1727 //------------------------------------------------------------------------------
1728 //
1729 // copyString Internal utility to copy a string to an output buffer,
1730 // while managing buffer overflow and preflight size
1731 // computation. NUL termination is added to destination,
1732 // and the NUL is counted in the output size.
1733 //
1734 //------------------------------------------------------------------------------
1735 #if 0
1736 static void copyString(UChar *destBuffer, // Destination buffer.
1737 int32_t destCapacity, // Total capacity of dest buffer
1738 int32_t *destIndex, // Index into dest buffer. Updated on return.
1739 // Update not clipped to destCapacity.
1740 const UChar *srcPtr, // Pointer to source string
1741 int32_t srcLen) // Source string len.
1742 {
1743 int32_t si;
1744 int32_t di = *destIndex;
1745 UChar c;
1746
1747 for (si=0; si<srcLen; si++) {
1748 c = srcPtr[si];
1749 if (di < destCapacity) {
1750 destBuffer[di] = c;
1751 di++;
1752 } else {
1753 di += srcLen - si;
1754 break;
1755 }
1756 }
1757 if (di<destCapacity) {
1758 destBuffer[di] = 0;
1759 }
1760 di++;
1761 *destIndex = di;
1762 }
1763 #endif
1764
1765 //------------------------------------------------------------------------------
1766 //
1767 // uregex_split
1768 //
1769 //------------------------------------------------------------------------------
split(RegularExpression * regexp,UChar * destBuf,int32_t destCapacity,int32_t * requiredCapacity,UChar * destFields[],int32_t destFieldsCapacity,UErrorCode * status)1770 int32_t RegexCImpl::split(RegularExpression *regexp,
1771 UChar *destBuf,
1772 int32_t destCapacity,
1773 int32_t *requiredCapacity,
1774 UChar *destFields[],
1775 int32_t destFieldsCapacity,
1776 UErrorCode *status) {
1777 //
1778 // Reset for the input text
1779 //
1780 regexp->fMatcher->reset();
1781 UText *inputText = regexp->fMatcher->fInputText;
1782 int64_t nextOutputStringStart = 0;
1783 int64_t inputLen = regexp->fMatcher->fInputLength;
1784 if (inputLen == 0) {
1785 return 0;
1786 }
1787
1788 //
1789 // Loop through the input text, searching for the delimiter pattern
1790 //
1791 int32_t i; // Index of the field being processed.
1792 int32_t destIdx = 0; // Next available position in destBuf;
1793 int32_t numCaptureGroups = regexp->fMatcher->groupCount();
1794 UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow errors so that the strings are still counted
1795 for (i=0; ; i++) {
1796 if (i>=destFieldsCapacity-1) {
1797 // There are one or zero output strings left.
1798 // Fill the last output string with whatever is left from the input, then exit the loop.
1799 // ( i will be == destFieldsCapacity if we filled the output array while processing
1800 // capture groups of the delimiter expression, in which case we will discard the
1801 // last capture group saved in favor of the unprocessed remainder of the
1802 // input string.)
1803 if (inputLen > nextOutputStringStart) {
1804 if (i != destFieldsCapacity-1) {
1805 // No fields are left. Recycle the last one for holding the trailing part of
1806 // the input string.
1807 i = destFieldsCapacity-1;
1808 destIdx = (int32_t)(destFields[i] - destFields[0]);
1809 }
1810
1811 destFields[i] = &destBuf[destIdx];
1812 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
1813 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
1814 }
1815 break;
1816 }
1817
1818 if (regexp->fMatcher->find()) {
1819 // We found another delimiter. Move everything from where we started looking
1820 // up until the start of the delimiter into the next output string.
1821 destFields[i] = &destBuf[destIdx];
1822
1823 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart,
1824 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus);
1825 if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
1826 tStatus = U_ZERO_ERROR;
1827 } else {
1828 *status = tStatus;
1829 }
1830 nextOutputStringStart = regexp->fMatcher->fMatchEnd;
1831
1832 // If the delimiter pattern has capturing parentheses, the captured
1833 // text goes out into the next n destination strings.
1834 int32_t groupNum;
1835 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
1836 // If we've run out of output string slots, bail out.
1837 if (i==destFieldsCapacity-1) {
1838 break;
1839 }
1840 i++;
1841
1842 // Set up to extract the capture group contents into the dest buffer.
1843 destFields[i] = &destBuf[destIdx];
1844 tStatus = U_ZERO_ERROR;
1845 int32_t t = uregex_group((URegularExpression*)regexp, groupNum, destFields[i], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus);
1846 destIdx += t + 1; // Record the space used in the output string buffer.
1847 // +1 for the NUL that terminates the string.
1848 if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
1849 tStatus = U_ZERO_ERROR;
1850 } else {
1851 *status = tStatus;
1852 }
1853 }
1854
1855 if (nextOutputStringStart == inputLen) {
1856 // The delimiter was at the end of the string. We're done.
1857 break;
1858 }
1859
1860 }
1861 else
1862 {
1863 // We ran off the end of the input while looking for the next delimiter.
1864 // All the remaining text goes into the current output string.
1865 destFields[i] = &destBuf[destIdx];
1866 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
1867 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
1868 break;
1869 }
1870 }
1871
1872 // Zero out any unused portion of the destFields array
1873 int j;
1874 for (j=i+1; j<destFieldsCapacity; j++) {
1875 destFields[j] = NULL;
1876 }
1877
1878 if (requiredCapacity != NULL) {
1879 *requiredCapacity = destIdx;
1880 }
1881 if (destIdx > destCapacity) {
1882 *status = U_BUFFER_OVERFLOW_ERROR;
1883 }
1884 return i+1;
1885 }
1886
1887 //
1888 // uregex_split The actual API function
1889 //
1890 U_CAPI int32_t U_EXPORT2
uregex_split(URegularExpression * regexp2,UChar * destBuf,int32_t destCapacity,int32_t * requiredCapacity,UChar * destFields[],int32_t destFieldsCapacity,UErrorCode * status)1891 uregex_split(URegularExpression *regexp2,
1892 UChar *destBuf,
1893 int32_t destCapacity,
1894 int32_t *requiredCapacity,
1895 UChar *destFields[],
1896 int32_t destFieldsCapacity,
1897 UErrorCode *status) {
1898 RegularExpression *regexp = (RegularExpression*)regexp2;
1899 if (validateRE(regexp, status) == FALSE) {
1900 return 0;
1901 }
1902 if ((destBuf == NULL && destCapacity > 0) ||
1903 destCapacity < 0 ||
1904 destFields == NULL ||
1905 destFieldsCapacity < 1 ) {
1906 *status = U_ILLEGAL_ARGUMENT_ERROR;
1907 return 0;
1908 }
1909
1910 return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status);
1911 }
1912
1913
1914 //
1915 // uregex_splitUText...can just use the normal C++ method
1916 //
1917 U_CAPI int32_t U_EXPORT2
uregex_splitUText(URegularExpression * regexp2,UText * destFields[],int32_t destFieldsCapacity,UErrorCode * status)1918 uregex_splitUText(URegularExpression *regexp2,
1919 UText *destFields[],
1920 int32_t destFieldsCapacity,
1921 UErrorCode *status) {
1922 RegularExpression *regexp = (RegularExpression*)regexp2;
1923 return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status);
1924 }
1925
1926
1927 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1928
1929