1 /*
2 *******************************************************************************
3 * Copyright (C) 2004-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: regex.cpp
7 */
8
9 #include "unicode/utypes.h"
10
11 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
12
13 #include "unicode/regex.h"
14 #include "unicode/uregex.h"
15 #include "unicode/unistr.h"
16 #include "unicode/ustring.h"
17 #include "unicode/uchar.h"
18 #include "unicode/uobject.h"
19 #include "umutex.h"
20 #include "uassert.h"
21 #include "cmemory.h"
22
23 #include "regextxt.h"
24
25 #include <stdio.h>
26
27 U_NAMESPACE_BEGIN
28
// Evaluates to (len - idx) when that difference is positive, and to 0 otherwise.
// Both arguments are evaluated more than once, so they must be free of side effects.
#define REMAINING_CAPACITY(idx, len) ((((len) - (idx)) > 0) ? ((len) - (idx)) : 0)
30
31 struct RegularExpression: public UMemory {
32 public:
33 RegularExpression();
34 ~RegularExpression();
35 int32_t fMagic;
36 RegexPattern *fPat;
37 int32_t *fPatRefCount;
38 UChar *fPatString;
39 int32_t fPatStringLen;
40 RegexMatcher *fMatcher;
41 const UChar *fText; // Text from setText()
42 int32_t fTextLength; // Length provided by user with setText(), which
43 // may be -1.
44 UBool fOwnsText;
45 };
46
// Signature stored in RegularExpression::fMagic and checked by validateRE().
// The four bytes spell "rexp" in ASCII.
static const int32_t REXP_MAGIC = 0x72657870;
48
RegularExpression()49 RegularExpression::RegularExpression() {
50 fMagic = REXP_MAGIC;
51 fPat = NULL;
52 fPatRefCount = NULL;
53 fPatString = NULL;
54 fPatStringLen = 0;
55 fMatcher = NULL;
56 fText = NULL;
57 fTextLength = 0;
58 fOwnsText = FALSE;
59 }
60
~RegularExpression()61 RegularExpression::~RegularExpression() {
62 delete fMatcher;
63 fMatcher = NULL;
64 if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) {
65 delete fPat;
66 uprv_free(fPatString);
67 uprv_free(fPatRefCount);
68 }
69 if (fOwnsText && fText!=NULL) {
70 uprv_free((void *)fText);
71 }
72 fMagic = 0;
73 }
74
75 U_NAMESPACE_END
76
77 U_NAMESPACE_USE
78
79 //----------------------------------------------------------------------------------------
80 //
81 // validateRE Do boilerplate style checks on API function parameters.
82 // Return TRUE if they look OK.
83 //----------------------------------------------------------------------------------------
validateRE(const RegularExpression * re,UBool requiresText,UErrorCode * status)84 static UBool validateRE(const RegularExpression *re, UBool requiresText, UErrorCode *status) {
85 if (U_FAILURE(*status)) {
86 return FALSE;
87 }
88 if (re == NULL || re->fMagic != REXP_MAGIC) {
89 *status = U_ILLEGAL_ARGUMENT_ERROR;
90 return FALSE;
91 }
92 // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway
93 if (requiresText && re->fText == NULL && !re->fOwnsText) {
94 *status = U_REGEX_INVALID_STATE;
95 return FALSE;
96 }
97 return TRUE;
98 }
99
100 //----------------------------------------------------------------------------------------
101 //
102 // uregex_open
103 //
104 //----------------------------------------------------------------------------------------
105 U_CAPI URegularExpression * U_EXPORT2
uregex_open(const UChar * pattern,int32_t patternLength,uint32_t flags,UParseError * pe,UErrorCode * status)106 uregex_open( const UChar *pattern,
107 int32_t patternLength,
108 uint32_t flags,
109 UParseError *pe,
110 UErrorCode *status) {
111
112 if (U_FAILURE(*status)) {
113 return NULL;
114 }
115 if (pattern == NULL || patternLength < -1 || patternLength == 0) {
116 *status = U_ILLEGAL_ARGUMENT_ERROR;
117 return NULL;
118 }
119 int32_t actualPatLen = patternLength;
120 if (actualPatLen == -1) {
121 actualPatLen = u_strlen(pattern);
122 }
123
124 RegularExpression *re = new RegularExpression;
125 int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t));
126 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1));
127 if (re == NULL || refC == NULL || patBuf == NULL) {
128 *status = U_MEMORY_ALLOCATION_ERROR;
129 delete re;
130 uprv_free(refC);
131 uprv_free(patBuf);
132 return NULL;
133 }
134 re->fPatRefCount = refC;
135 *re->fPatRefCount = 1;
136
137 //
138 // Make a copy of the pattern string, so we can return it later if asked.
139 // For compiling the pattern, we will use a UText wrapper around
140 // this local copy, to avoid making even more copies.
141 //
142 re->fPatString = patBuf;
143 re->fPatStringLen = patternLength;
144 u_memcpy(patBuf, pattern, actualPatLen);
145 patBuf[actualPatLen] = 0;
146
147 UText patText = UTEXT_INITIALIZER;
148 utext_openUChars(&patText, patBuf, patternLength, status);
149
150 //
151 // Compile the pattern
152 //
153 if (pe != NULL) {
154 re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
155 } else {
156 re->fPat = RegexPattern::compile(&patText, flags, *status);
157 }
158 utext_close(&patText);
159
160 if (U_FAILURE(*status)) {
161 goto ErrorExit;
162 }
163
164 //
165 // Create the matcher object
166 //
167 re->fMatcher = re->fPat->matcher(*status);
168 if (U_SUCCESS(*status)) {
169 return (URegularExpression*)re;
170 }
171
172 ErrorExit:
173 delete re;
174 return NULL;
175
176 }
177
178 //----------------------------------------------------------------------------------------
179 //
180 // uregex_openUText
181 //
182 //----------------------------------------------------------------------------------------
183 U_CAPI URegularExpression * U_EXPORT2
uregex_openUText(UText * pattern,uint32_t flags,UParseError * pe,UErrorCode * status)184 uregex_openUText(UText *pattern,
185 uint32_t flags,
186 UParseError *pe,
187 UErrorCode *status) {
188
189 if (U_FAILURE(*status)) {
190 return NULL;
191 }
192 if (pattern == NULL) {
193 *status = U_ILLEGAL_ARGUMENT_ERROR;
194 return NULL;
195 }
196
197 int64_t patternNativeLength = utext_nativeLength(pattern);
198
199 if (patternNativeLength == 0) {
200 *status = U_ILLEGAL_ARGUMENT_ERROR;
201 return NULL;
202 }
203
204 RegularExpression *re = new RegularExpression;
205
206 UErrorCode lengthStatus = U_ZERO_ERROR;
207 int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus);
208
209 int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t));
210 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1));
211 if (re == NULL || refC == NULL || patBuf == NULL) {
212 *status = U_MEMORY_ALLOCATION_ERROR;
213 delete re;
214 uprv_free(refC);
215 uprv_free(patBuf);
216 return NULL;
217 }
218 re->fPatRefCount = refC;
219 *re->fPatRefCount = 1;
220
221 //
222 // Make a copy of the pattern string, so we can return it later if asked.
223 // For compiling the pattern, we will use a read-only UText wrapper
224 // around this local copy, to avoid making even more copies.
225 //
226 re->fPatString = patBuf;
227 re->fPatStringLen = pattern16Length;
228 utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status);
229
230 UText patText = UTEXT_INITIALIZER;
231 utext_openUChars(&patText, patBuf, pattern16Length, status);
232
233 //
234 // Compile the pattern
235 //
236 if (pe != NULL) {
237 re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
238 } else {
239 re->fPat = RegexPattern::compile(&patText, flags, *status);
240 }
241 utext_close(&patText);
242
243 if (U_FAILURE(*status)) {
244 goto ErrorExit;
245 }
246
247 //
248 // Create the matcher object
249 //
250 re->fMatcher = re->fPat->matcher(*status);
251 if (U_SUCCESS(*status)) {
252 return (URegularExpression*)re;
253 }
254
255 ErrorExit:
256 delete re;
257 return NULL;
258
259 }
260
261 //----------------------------------------------------------------------------------------
262 //
263 // uregex_close
264 //
265 //----------------------------------------------------------------------------------------
266 U_CAPI void U_EXPORT2
uregex_close(URegularExpression * re2)267 uregex_close(URegularExpression *re2) {
268 RegularExpression *re = (RegularExpression*)re2;
269 UErrorCode status = U_ZERO_ERROR;
270 if (validateRE(re, FALSE, &status) == FALSE) {
271 return;
272 }
273 delete re;
274 }
275
276
277 //----------------------------------------------------------------------------------------
278 //
279 // uregex_clone
280 //
281 //----------------------------------------------------------------------------------------
282 U_CAPI URegularExpression * U_EXPORT2
uregex_clone(const URegularExpression * source2,UErrorCode * status)283 uregex_clone(const URegularExpression *source2, UErrorCode *status) {
284 RegularExpression *source = (RegularExpression*)source2;
285 if (validateRE(source, FALSE, status) == FALSE) {
286 return NULL;
287 }
288
289 RegularExpression *clone = new RegularExpression;
290 if (clone == NULL) {
291 *status = U_MEMORY_ALLOCATION_ERROR;
292 return NULL;
293 }
294
295 clone->fMatcher = source->fPat->matcher(*status);
296 if (U_FAILURE(*status)) {
297 delete clone;
298 return NULL;
299 }
300
301 clone->fPat = source->fPat;
302 clone->fPatRefCount = source->fPatRefCount;
303 clone->fPatString = source->fPatString;
304 clone->fPatStringLen = source->fPatStringLen;
305 umtx_atomic_inc(source->fPatRefCount);
306 // Note: fText is not cloned.
307
308 return (URegularExpression*)clone;
309 }
310
311
312
313
314 //------------------------------------------------------------------------------
315 //
316 // uregex_pattern
317 //
318 //------------------------------------------------------------------------------
319 U_CAPI const UChar * U_EXPORT2
uregex_pattern(const URegularExpression * regexp2,int32_t * patLength,UErrorCode * status)320 uregex_pattern(const URegularExpression *regexp2,
321 int32_t *patLength,
322 UErrorCode *status) {
323 RegularExpression *regexp = (RegularExpression*)regexp2;
324
325 if (validateRE(regexp, FALSE, status) == FALSE) {
326 return NULL;
327 }
328 if (patLength != NULL) {
329 *patLength = regexp->fPatStringLen;
330 }
331 return regexp->fPatString;
332 }
333
334
335 //------------------------------------------------------------------------------
336 //
337 // uregex_patternUText
338 //
339 //------------------------------------------------------------------------------
340 U_CAPI UText * U_EXPORT2
uregex_patternUText(const URegularExpression * regexp2,UErrorCode * status)341 uregex_patternUText(const URegularExpression *regexp2,
342 UErrorCode *status) {
343 RegularExpression *regexp = (RegularExpression*)regexp2;
344 return regexp->fPat->patternText(*status);
345 }
346
347
348 //------------------------------------------------------------------------------
349 //
350 // uregex_flags
351 //
352 //------------------------------------------------------------------------------
353 U_CAPI int32_t U_EXPORT2
uregex_flags(const URegularExpression * regexp2,UErrorCode * status)354 uregex_flags(const URegularExpression *regexp2, UErrorCode *status) {
355 RegularExpression *regexp = (RegularExpression*)regexp2;
356 if (validateRE(regexp, FALSE, status) == FALSE) {
357 return 0;
358 }
359 int32_t flags = regexp->fPat->flags();
360 return flags;
361 }
362
363
364 //------------------------------------------------------------------------------
365 //
366 // uregex_setText
367 //
368 //------------------------------------------------------------------------------
369 U_CAPI void U_EXPORT2
uregex_setText(URegularExpression * regexp2,const UChar * text,int32_t textLength,UErrorCode * status)370 uregex_setText(URegularExpression *regexp2,
371 const UChar *text,
372 int32_t textLength,
373 UErrorCode *status) {
374 RegularExpression *regexp = (RegularExpression*)regexp2;
375 if (validateRE(regexp, FALSE, status) == FALSE) {
376 return;
377 }
378 if (text == NULL || textLength < -1) {
379 *status = U_ILLEGAL_ARGUMENT_ERROR;
380 return;
381 }
382
383 if (regexp->fOwnsText && regexp->fText != NULL) {
384 uprv_free((void *)regexp->fText);
385 }
386
387 regexp->fText = text;
388 regexp->fTextLength = textLength;
389 regexp->fOwnsText = FALSE;
390
391 UText input = UTEXT_INITIALIZER;
392 utext_openUChars(&input, text, textLength, status);
393 regexp->fMatcher->reset(&input);
394 utext_close(&input); // reset() made a shallow clone, so we don't need this copy
395 }
396
397
398 //------------------------------------------------------------------------------
399 //
400 // uregex_setUText
401 //
402 //------------------------------------------------------------------------------
403 U_CAPI void U_EXPORT2
uregex_setUText(URegularExpression * regexp2,UText * text,UErrorCode * status)404 uregex_setUText(URegularExpression *regexp2,
405 UText *text,
406 UErrorCode *status) {
407 RegularExpression *regexp = (RegularExpression*)regexp2;
408 if (validateRE(regexp, FALSE, status) == FALSE) {
409 return;
410 }
411 if (text == NULL) {
412 *status = U_ILLEGAL_ARGUMENT_ERROR;
413 return;
414 }
415
416 if (regexp->fOwnsText && regexp->fText != NULL) {
417 uprv_free((void *)regexp->fText);
418 }
419
420 regexp->fText = NULL; // only fill it in on request
421 regexp->fTextLength = -1;
422 regexp->fOwnsText = TRUE;
423 regexp->fMatcher->reset(text);
424 }
425
426
427
428 //------------------------------------------------------------------------------
429 //
430 // uregex_getText
431 //
432 //------------------------------------------------------------------------------
433 U_CAPI const UChar * U_EXPORT2
uregex_getText(URegularExpression * regexp2,int32_t * textLength,UErrorCode * status)434 uregex_getText(URegularExpression *regexp2,
435 int32_t *textLength,
436 UErrorCode *status) {
437 RegularExpression *regexp = (RegularExpression*)regexp2;
438 if (validateRE(regexp, FALSE, status) == FALSE) {
439 return NULL;
440 }
441
442 if (regexp->fText == NULL) {
443 // need to fill in the text
444 UText *inputText = regexp->fMatcher->inputText();
445 int64_t inputNativeLength = utext_nativeLength(inputText);
446 if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) {
447 regexp->fText = inputText->chunkContents;
448 regexp->fTextLength = (int32_t)inputNativeLength;
449 regexp->fOwnsText = FALSE; // because the UText owns it
450 } else {
451 UErrorCode lengthStatus = U_ZERO_ERROR;
452 regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus); // buffer overflow error
453 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(regexp->fTextLength+1));
454
455 utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status);
456 regexp->fText = inputChars;
457 regexp->fOwnsText = TRUE; // should already be set but just in case
458 }
459 }
460
461 if (textLength != NULL) {
462 *textLength = regexp->fTextLength;
463 }
464 return regexp->fText;
465 }
466
467
468 //------------------------------------------------------------------------------
469 //
470 // uregex_getUText
471 //
472 //------------------------------------------------------------------------------
473 U_CAPI UText * U_EXPORT2
uregex_getUText(URegularExpression * regexp2,UText * dest,UErrorCode * status)474 uregex_getUText(URegularExpression *regexp2,
475 UText *dest,
476 UErrorCode *status) {
477 RegularExpression *regexp = (RegularExpression*)regexp2;
478 if (validateRE(regexp, FALSE, status) == FALSE) {
479 return dest;
480 }
481 return regexp->fMatcher->getInput(dest, *status);
482 }
483
484
485 //------------------------------------------------------------------------------
486 //
487 // uregex_refreshUText
488 //
489 //------------------------------------------------------------------------------
490 U_CAPI void U_EXPORT2
uregex_refreshUText(URegularExpression * regexp2,UText * text,UErrorCode * status)491 uregex_refreshUText(URegularExpression *regexp2,
492 UText *text,
493 UErrorCode *status) {
494 RegularExpression *regexp = (RegularExpression*)regexp2;
495 if (validateRE(regexp, FALSE, status) == FALSE) {
496 return;
497 }
498 regexp->fMatcher->refreshInputText(text, *status);
499 }
500
501
502 //------------------------------------------------------------------------------
503 //
504 // uregex_matches
505 //
506 //------------------------------------------------------------------------------
507 U_CAPI UBool U_EXPORT2
uregex_matches(URegularExpression * regexp2,int32_t startIndex,UErrorCode * status)508 uregex_matches(URegularExpression *regexp2,
509 int32_t startIndex,
510 UErrorCode *status) {
511 return uregex_matches64( regexp2, (int64_t)startIndex, status);
512 }
513
514 U_CAPI UBool U_EXPORT2
uregex_matches64(URegularExpression * regexp2,int64_t startIndex,UErrorCode * status)515 uregex_matches64(URegularExpression *regexp2,
516 int64_t startIndex,
517 UErrorCode *status) {
518 RegularExpression *regexp = (RegularExpression*)regexp2;
519 UBool result = FALSE;
520 if (validateRE(regexp, TRUE, status) == FALSE) {
521 return result;
522 }
523 if (startIndex == -1) {
524 result = regexp->fMatcher->matches(*status);
525 } else {
526 result = regexp->fMatcher->matches(startIndex, *status);
527 }
528 return result;
529 }
530
531
532 //------------------------------------------------------------------------------
533 //
534 // uregex_lookingAt
535 //
536 //------------------------------------------------------------------------------
537 U_CAPI UBool U_EXPORT2
uregex_lookingAt(URegularExpression * regexp2,int32_t startIndex,UErrorCode * status)538 uregex_lookingAt(URegularExpression *regexp2,
539 int32_t startIndex,
540 UErrorCode *status) {
541 return uregex_lookingAt64( regexp2, (int64_t)startIndex, status);
542 }
543
544 U_CAPI UBool U_EXPORT2
uregex_lookingAt64(URegularExpression * regexp2,int64_t startIndex,UErrorCode * status)545 uregex_lookingAt64(URegularExpression *regexp2,
546 int64_t startIndex,
547 UErrorCode *status) {
548 RegularExpression *regexp = (RegularExpression*)regexp2;
549 UBool result = FALSE;
550 if (validateRE(regexp, TRUE, status) == FALSE) {
551 return result;
552 }
553 if (startIndex == -1) {
554 result = regexp->fMatcher->lookingAt(*status);
555 } else {
556 result = regexp->fMatcher->lookingAt(startIndex, *status);
557 }
558 return result;
559 }
560
561
562
563 //------------------------------------------------------------------------------
564 //
565 // uregex_find
566 //
567 //------------------------------------------------------------------------------
568 U_CAPI UBool U_EXPORT2
uregex_find(URegularExpression * regexp2,int32_t startIndex,UErrorCode * status)569 uregex_find(URegularExpression *regexp2,
570 int32_t startIndex,
571 UErrorCode *status) {
572 return uregex_find64( regexp2, (int64_t)startIndex, status);
573 }
574
575 U_CAPI UBool U_EXPORT2
uregex_find64(URegularExpression * regexp2,int64_t startIndex,UErrorCode * status)576 uregex_find64(URegularExpression *regexp2,
577 int64_t startIndex,
578 UErrorCode *status) {
579 RegularExpression *regexp = (RegularExpression*)regexp2;
580 UBool result = FALSE;
581 if (validateRE(regexp, TRUE, status) == FALSE) {
582 return result;
583 }
584 if (startIndex == -1) {
585 regexp->fMatcher->resetPreserveRegion();
586 result = regexp->fMatcher->find();
587 } else {
588 result = regexp->fMatcher->find(startIndex, *status);
589 }
590 return result;
591 }
592
593
594 //------------------------------------------------------------------------------
595 //
596 // uregex_findNext
597 //
598 //------------------------------------------------------------------------------
599 U_CAPI UBool U_EXPORT2
uregex_findNext(URegularExpression * regexp2,UErrorCode * status)600 uregex_findNext(URegularExpression *regexp2,
601 UErrorCode *status) {
602 RegularExpression *regexp = (RegularExpression*)regexp2;
603 if (validateRE(regexp, TRUE, status) == FALSE) {
604 return FALSE;
605 }
606 UBool result = regexp->fMatcher->find();
607 return result;
608 }
609
610 //------------------------------------------------------------------------------
611 //
612 // uregex_groupCount
613 //
614 //------------------------------------------------------------------------------
615 U_CAPI int32_t U_EXPORT2
uregex_groupCount(URegularExpression * regexp2,UErrorCode * status)616 uregex_groupCount(URegularExpression *regexp2,
617 UErrorCode *status) {
618 RegularExpression *regexp = (RegularExpression*)regexp2;
619 if (validateRE(regexp, FALSE, status) == FALSE) {
620 return 0;
621 }
622 int32_t result = regexp->fMatcher->groupCount();
623 return result;
624 }
625
626
627 //------------------------------------------------------------------------------
628 //
629 // uregex_group
630 //
631 //------------------------------------------------------------------------------
632 U_CAPI int32_t U_EXPORT2
uregex_group(URegularExpression * regexp2,int32_t groupNum,UChar * dest,int32_t destCapacity,UErrorCode * status)633 uregex_group(URegularExpression *regexp2,
634 int32_t groupNum,
635 UChar *dest,
636 int32_t destCapacity,
637 UErrorCode *status) {
638 RegularExpression *regexp = (RegularExpression*)regexp2;
639 if (validateRE(regexp, TRUE, status) == FALSE) {
640 return 0;
641 }
642 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
643 *status = U_ILLEGAL_ARGUMENT_ERROR;
644 return 0;
645 }
646
647 if (destCapacity == 0 || regexp->fText != NULL) {
648 // If preflighting or if we already have the text as UChars,
649 // this is a little cheaper than going through uregex_groupUTextDeep()
650
651 //
652 // Pick up the range of characters from the matcher
653 //
654 int32_t startIx = regexp->fMatcher->start(groupNum, *status);
655 int32_t endIx = regexp->fMatcher->end (groupNum, *status);
656 if (U_FAILURE(*status)) {
657 return 0;
658 }
659
660 //
661 // Trim length based on buffer capacity
662 //
663 int32_t fullLength = endIx - startIx;
664 int32_t copyLength = fullLength;
665 if (copyLength < destCapacity) {
666 dest[copyLength] = 0;
667 } else if (copyLength == destCapacity) {
668 *status = U_STRING_NOT_TERMINATED_WARNING;
669 } else {
670 copyLength = destCapacity;
671 *status = U_BUFFER_OVERFLOW_ERROR;
672 }
673
674 //
675 // Copy capture group to user's buffer
676 //
677 if (copyLength > 0) {
678 u_memcpy(dest, ®exp->fText[startIx], copyLength);
679 }
680 return fullLength;
681 } else {
682 UText *groupText = uregex_groupUTextDeep(regexp2, groupNum, NULL, status);
683 int32_t result = utext_extract(groupText, 0, utext_nativeLength(groupText), dest, destCapacity, status);
684 utext_close(groupText);
685 return result;
686 }
687 }
688
689
690 //------------------------------------------------------------------------------
691 //
692 // uregex_groupUText
693 //
694 //------------------------------------------------------------------------------
695 U_CAPI UText * U_EXPORT2
uregex_groupUText(URegularExpression * regexp2,int32_t groupNum,UText * dest,int64_t * groupLength,UErrorCode * status)696 uregex_groupUText(URegularExpression *regexp2,
697 int32_t groupNum,
698 UText *dest,
699 int64_t *groupLength,
700 UErrorCode *status) {
701 RegularExpression *regexp = (RegularExpression*)regexp2;
702 if (validateRE(regexp, TRUE, status) == FALSE) {
703 UErrorCode emptyTextStatus = U_ZERO_ERROR;
704 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
705 }
706
707 return regexp->fMatcher->group(groupNum, dest, *groupLength, *status);
708 }
709
710 //------------------------------------------------------------------------------
711 //
712 // uregex_groupUTextDeep
713 //
714 //------------------------------------------------------------------------------
715 U_CAPI UText * U_EXPORT2
uregex_groupUTextDeep(URegularExpression * regexp2,int32_t groupNum,UText * dest,UErrorCode * status)716 uregex_groupUTextDeep(URegularExpression *regexp2,
717 int32_t groupNum,
718 UText *dest,
719 UErrorCode *status) {
720 RegularExpression *regexp = (RegularExpression*)regexp2;
721 if (validateRE(regexp, TRUE, status) == FALSE) {
722 UErrorCode emptyTextStatus = U_ZERO_ERROR;
723 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
724 }
725
726 if (regexp->fText != NULL) {
727 //
728 // Pick up the range of characters from the matcher
729 // and use our already-extracted characters
730 //
731 int32_t startIx = regexp->fMatcher->start(groupNum, *status);
732 int32_t endIx = regexp->fMatcher->end (groupNum, *status);
733 if (U_FAILURE(*status)) {
734 UErrorCode emptyTextStatus = U_ZERO_ERROR;
735 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
736 }
737
738 if (dest) {
739 utext_replace(dest, 0, utext_nativeLength(dest), ®exp->fText[startIx], endIx - startIx, status);
740 } else {
741 UText groupText = UTEXT_INITIALIZER;
742 utext_openUChars(&groupText, ®exp->fText[startIx], endIx - startIx, status);
743 dest = utext_clone(NULL, &groupText, TRUE, FALSE, status);
744 utext_close(&groupText);
745 }
746
747 return dest;
748 } else {
749 return regexp->fMatcher->group(groupNum, dest, *status);
750 }
751 }
752
753 //------------------------------------------------------------------------------
754 //
755 // uregex_start
756 //
757 //------------------------------------------------------------------------------
758 U_CAPI int32_t U_EXPORT2
uregex_start(URegularExpression * regexp2,int32_t groupNum,UErrorCode * status)759 uregex_start(URegularExpression *regexp2,
760 int32_t groupNum,
761 UErrorCode *status) {
762 return (int32_t)uregex_start64( regexp2, groupNum, status);
763 }
764
765 U_CAPI int64_t U_EXPORT2
uregex_start64(URegularExpression * regexp2,int32_t groupNum,UErrorCode * status)766 uregex_start64(URegularExpression *regexp2,
767 int32_t groupNum,
768 UErrorCode *status) {
769 RegularExpression *regexp = (RegularExpression*)regexp2;
770 if (validateRE(regexp, TRUE, status) == FALSE) {
771 return 0;
772 }
773 int32_t result = regexp->fMatcher->start(groupNum, *status);
774 return result;
775 }
776
777 //------------------------------------------------------------------------------
778 //
779 // uregex_end
780 //
781 //------------------------------------------------------------------------------
782 U_CAPI int32_t U_EXPORT2
uregex_end(URegularExpression * regexp2,int32_t groupNum,UErrorCode * status)783 uregex_end(URegularExpression *regexp2,
784 int32_t groupNum,
785 UErrorCode *status) {
786 return (int32_t)uregex_end64( regexp2, groupNum, status);
787 }
788
789 U_CAPI int64_t U_EXPORT2
uregex_end64(URegularExpression * regexp2,int32_t groupNum,UErrorCode * status)790 uregex_end64(URegularExpression *regexp2,
791 int32_t groupNum,
792 UErrorCode *status) {
793 RegularExpression *regexp = (RegularExpression*)regexp2;
794 if (validateRE(regexp, TRUE, status) == FALSE) {
795 return 0;
796 }
797 int32_t result = regexp->fMatcher->end(groupNum, *status);
798 return result;
799 }
800
801 //------------------------------------------------------------------------------
802 //
803 // uregex_reset
804 //
805 //------------------------------------------------------------------------------
806 U_CAPI void U_EXPORT2
uregex_reset(URegularExpression * regexp2,int32_t index,UErrorCode * status)807 uregex_reset(URegularExpression *regexp2,
808 int32_t index,
809 UErrorCode *status) {
810 uregex_reset64( regexp2, (int64_t)index, status);
811 }
812
813 U_CAPI void U_EXPORT2
uregex_reset64(URegularExpression * regexp2,int64_t index,UErrorCode * status)814 uregex_reset64(URegularExpression *regexp2,
815 int64_t index,
816 UErrorCode *status) {
817 RegularExpression *regexp = (RegularExpression*)regexp2;
818 if (validateRE(regexp, TRUE, status) == FALSE) {
819 return;
820 }
821 regexp->fMatcher->reset(index, *status);
822 }
823
824
825 //------------------------------------------------------------------------------
826 //
827 // uregex_setRegion
828 //
829 //------------------------------------------------------------------------------
830 U_CAPI void U_EXPORT2
uregex_setRegion(URegularExpression * regexp2,int32_t regionStart,int32_t regionLimit,UErrorCode * status)831 uregex_setRegion(URegularExpression *regexp2,
832 int32_t regionStart,
833 int32_t regionLimit,
834 UErrorCode *status) {
835 uregex_setRegion64( regexp2, (int64_t)regionStart, (int64_t)regionLimit, status);
836 }
837
838 U_CAPI void U_EXPORT2
uregex_setRegion64(URegularExpression * regexp2,int64_t regionStart,int64_t regionLimit,UErrorCode * status)839 uregex_setRegion64(URegularExpression *regexp2,
840 int64_t regionStart,
841 int64_t regionLimit,
842 UErrorCode *status) {
843 RegularExpression *regexp = (RegularExpression*)regexp2;
844 if (validateRE(regexp, TRUE, status) == FALSE) {
845 return;
846 }
847 regexp->fMatcher->region(regionStart, regionLimit, *status);
848 }
849
850
851 //------------------------------------------------------------------------------
852 //
853 // uregex_setRegionAndStart
854 //
855 //------------------------------------------------------------------------------
856 U_DRAFT void U_EXPORT2
uregex_setRegionAndStart(URegularExpression * regexp2,int64_t regionStart,int64_t regionLimit,int64_t startIndex,UErrorCode * status)857 uregex_setRegionAndStart(URegularExpression *regexp2,
858 int64_t regionStart,
859 int64_t regionLimit,
860 int64_t startIndex,
861 UErrorCode *status) {
862 RegularExpression *regexp = (RegularExpression*)regexp2;
863 if (validateRE(regexp, TRUE, status) == FALSE) {
864 return;
865 }
866 regexp->fMatcher->region(regionStart, regionLimit, startIndex, *status);
867 }
868
869 //------------------------------------------------------------------------------
870 //
871 // uregex_regionStart
872 //
873 //------------------------------------------------------------------------------
874 U_CAPI int32_t U_EXPORT2
uregex_regionStart(const URegularExpression * regexp2,UErrorCode * status)875 uregex_regionStart(const URegularExpression *regexp2,
876 UErrorCode *status) {
877 return (int32_t)uregex_regionStart64(regexp2, status);
878 }
879
880 U_CAPI int64_t U_EXPORT2
uregex_regionStart64(const URegularExpression * regexp2,UErrorCode * status)881 uregex_regionStart64(const URegularExpression *regexp2,
882 UErrorCode *status) {
883 RegularExpression *regexp = (RegularExpression*)regexp2;
884 if (validateRE(regexp, TRUE, status) == FALSE) {
885 return 0;
886 }
887 return regexp->fMatcher->regionStart();
888 }
889
890
891 //------------------------------------------------------------------------------
892 //
893 // uregex_regionEnd
894 //
895 //------------------------------------------------------------------------------
896 U_CAPI int32_t U_EXPORT2
uregex_regionEnd(const URegularExpression * regexp2,UErrorCode * status)897 uregex_regionEnd(const URegularExpression *regexp2,
898 UErrorCode *status) {
899 return (int32_t)uregex_regionEnd64(regexp2, status);
900 }
901
902 U_CAPI int64_t U_EXPORT2
uregex_regionEnd64(const URegularExpression * regexp2,UErrorCode * status)903 uregex_regionEnd64(const URegularExpression *regexp2,
904 UErrorCode *status) {
905 RegularExpression *regexp = (RegularExpression*)regexp2;
906 if (validateRE(regexp, TRUE, status) == FALSE) {
907 return 0;
908 }
909 return regexp->fMatcher->regionEnd();
910 }
911
912
913 //------------------------------------------------------------------------------
914 //
915 // uregex_hasTransparentBounds
916 //
917 //------------------------------------------------------------------------------
918 U_CAPI UBool U_EXPORT2
uregex_hasTransparentBounds(const URegularExpression * regexp2,UErrorCode * status)919 uregex_hasTransparentBounds(const URegularExpression *regexp2,
920 UErrorCode *status) {
921 RegularExpression *regexp = (RegularExpression*)regexp2;
922 if (validateRE(regexp, FALSE, status) == FALSE) {
923 return FALSE;
924 }
925 return regexp->fMatcher->hasTransparentBounds();
926 }
927
928
929 //------------------------------------------------------------------------------
930 //
931 // uregex_useTransparentBounds
932 //
933 //------------------------------------------------------------------------------
934 U_CAPI void U_EXPORT2
uregex_useTransparentBounds(URegularExpression * regexp2,UBool b,UErrorCode * status)935 uregex_useTransparentBounds(URegularExpression *regexp2,
936 UBool b,
937 UErrorCode *status) {
938 RegularExpression *regexp = (RegularExpression*)regexp2;
939 if (validateRE(regexp, FALSE, status) == FALSE) {
940 return;
941 }
942 regexp->fMatcher->useTransparentBounds(b);
943 }
944
945
946 //------------------------------------------------------------------------------
947 //
948 // uregex_hasAnchoringBounds
949 //
950 //------------------------------------------------------------------------------
951 U_CAPI UBool U_EXPORT2
uregex_hasAnchoringBounds(const URegularExpression * regexp2,UErrorCode * status)952 uregex_hasAnchoringBounds(const URegularExpression *regexp2,
953 UErrorCode *status) {
954 RegularExpression *regexp = (RegularExpression*)regexp2;
955 if (validateRE(regexp, FALSE, status) == FALSE) {
956 return FALSE;
957 }
958 return regexp->fMatcher->hasAnchoringBounds();
959 }
960
961
962 //------------------------------------------------------------------------------
963 //
964 // uregex_useAnchoringBounds
965 //
966 //------------------------------------------------------------------------------
967 U_CAPI void U_EXPORT2
uregex_useAnchoringBounds(URegularExpression * regexp2,UBool b,UErrorCode * status)968 uregex_useAnchoringBounds(URegularExpression *regexp2,
969 UBool b,
970 UErrorCode *status) {
971 RegularExpression *regexp = (RegularExpression*)regexp2;
972 if (validateRE(regexp, FALSE, status) == FALSE) {
973 return;
974 }
975 regexp->fMatcher->useAnchoringBounds(b);
976 }
977
978
979 //------------------------------------------------------------------------------
980 //
981 // uregex_hitEnd
982 //
983 //------------------------------------------------------------------------------
984 U_CAPI UBool U_EXPORT2
uregex_hitEnd(const URegularExpression * regexp2,UErrorCode * status)985 uregex_hitEnd(const URegularExpression *regexp2,
986 UErrorCode *status) {
987 RegularExpression *regexp = (RegularExpression*)regexp2;
988 if (validateRE(regexp, TRUE, status) == FALSE) {
989 return FALSE;
990 }
991 return regexp->fMatcher->hitEnd();
992 }
993
994
995 //------------------------------------------------------------------------------
996 //
997 // uregex_requireEnd
998 //
999 //------------------------------------------------------------------------------
1000 U_CAPI UBool U_EXPORT2
uregex_requireEnd(const URegularExpression * regexp2,UErrorCode * status)1001 uregex_requireEnd(const URegularExpression *regexp2,
1002 UErrorCode *status) {
1003 RegularExpression *regexp = (RegularExpression*)regexp2;
1004 if (validateRE(regexp, TRUE, status) == FALSE) {
1005 return FALSE;
1006 }
1007 return regexp->fMatcher->requireEnd();
1008 }
1009
1010
1011 //------------------------------------------------------------------------------
1012 //
1013 // uregex_setTimeLimit
1014 //
1015 //------------------------------------------------------------------------------
1016 U_CAPI void U_EXPORT2
uregex_setTimeLimit(URegularExpression * regexp2,int32_t limit,UErrorCode * status)1017 uregex_setTimeLimit(URegularExpression *regexp2,
1018 int32_t limit,
1019 UErrorCode *status) {
1020 RegularExpression *regexp = (RegularExpression*)regexp2;
1021 if (validateRE(regexp, FALSE, status)) {
1022 regexp->fMatcher->setTimeLimit(limit, *status);
1023 }
1024 }
1025
1026
1027
1028 //------------------------------------------------------------------------------
1029 //
1030 // uregex_getTimeLimit
1031 //
1032 //------------------------------------------------------------------------------
1033 U_CAPI int32_t U_EXPORT2
uregex_getTimeLimit(const URegularExpression * regexp2,UErrorCode * status)1034 uregex_getTimeLimit(const URegularExpression *regexp2,
1035 UErrorCode *status) {
1036 int32_t retVal = 0;
1037 RegularExpression *regexp = (RegularExpression*)regexp2;
1038 if (validateRE(regexp, FALSE, status)) {
1039 retVal = regexp->fMatcher->getTimeLimit();
1040 }
1041 return retVal;
1042 }
1043
1044
1045
1046 //------------------------------------------------------------------------------
1047 //
1048 // uregex_setStackLimit
1049 //
1050 //------------------------------------------------------------------------------
1051 U_CAPI void U_EXPORT2
uregex_setStackLimit(URegularExpression * regexp2,int32_t limit,UErrorCode * status)1052 uregex_setStackLimit(URegularExpression *regexp2,
1053 int32_t limit,
1054 UErrorCode *status) {
1055 RegularExpression *regexp = (RegularExpression*)regexp2;
1056 if (validateRE(regexp, FALSE, status)) {
1057 regexp->fMatcher->setStackLimit(limit, *status);
1058 }
1059 }
1060
1061
1062
1063 //------------------------------------------------------------------------------
1064 //
1065 // uregex_getStackLimit
1066 //
1067 //------------------------------------------------------------------------------
1068 U_CAPI int32_t U_EXPORT2
uregex_getStackLimit(const URegularExpression * regexp2,UErrorCode * status)1069 uregex_getStackLimit(const URegularExpression *regexp2,
1070 UErrorCode *status) {
1071 int32_t retVal = 0;
1072 RegularExpression *regexp = (RegularExpression*)regexp2;
1073 if (validateRE(regexp, FALSE, status)) {
1074 retVal = regexp->fMatcher->getStackLimit();
1075 }
1076 return retVal;
1077 }
1078
1079
1080 //------------------------------------------------------------------------------
1081 //
1082 // uregex_setMatchCallback
1083 //
1084 //------------------------------------------------------------------------------
1085 U_CAPI void U_EXPORT2
uregex_setMatchCallback(URegularExpression * regexp2,URegexMatchCallback * callback,const void * context,UErrorCode * status)1086 uregex_setMatchCallback(URegularExpression *regexp2,
1087 URegexMatchCallback *callback,
1088 const void *context,
1089 UErrorCode *status) {
1090 RegularExpression *regexp = (RegularExpression*)regexp2;
1091 if (validateRE(regexp, FALSE, status)) {
1092 regexp->fMatcher->setMatchCallback(callback, context, *status);
1093 }
1094 }
1095
1096
1097 //------------------------------------------------------------------------------
1098 //
1099 // uregex_getMatchCallback
1100 //
1101 //------------------------------------------------------------------------------
1102 U_CAPI void U_EXPORT2
uregex_getMatchCallback(const URegularExpression * regexp2,URegexMatchCallback ** callback,const void ** context,UErrorCode * status)1103 uregex_getMatchCallback(const URegularExpression *regexp2,
1104 URegexMatchCallback **callback,
1105 const void **context,
1106 UErrorCode *status) {
1107 RegularExpression *regexp = (RegularExpression*)regexp2;
1108 if (validateRE(regexp, FALSE, status)) {
1109 regexp->fMatcher->getMatchCallback(*callback, *context, *status);
1110 }
1111 }
1112
1113
1114 //------------------------------------------------------------------------------
1115 //
//    uregex_setFindProgressCallback
1117 //
1118 //------------------------------------------------------------------------------
1119 U_CAPI void U_EXPORT2
uregex_setFindProgressCallback(URegularExpression * regexp2,URegexFindProgressCallback * callback,const void * context,UErrorCode * status)1120 uregex_setFindProgressCallback(URegularExpression *regexp2,
1121 URegexFindProgressCallback *callback,
1122 const void *context,
1123 UErrorCode *status) {
1124 RegularExpression *regexp = (RegularExpression*)regexp2;
1125 if (validateRE(regexp, FALSE, status)) {
1126 regexp->fMatcher->setFindProgressCallback(callback, context, *status);
1127 }
1128 }
1129
1130
1131 //------------------------------------------------------------------------------
1132 //
//    uregex_getFindProgressCallback
1134 //
1135 //------------------------------------------------------------------------------
1136 U_CAPI void U_EXPORT2
uregex_getFindProgressCallback(const URegularExpression * regexp2,URegexFindProgressCallback ** callback,const void ** context,UErrorCode * status)1137 uregex_getFindProgressCallback(const URegularExpression *regexp2,
1138 URegexFindProgressCallback **callback,
1139 const void **context,
1140 UErrorCode *status) {
1141 RegularExpression *regexp = (RegularExpression*)regexp2;
1142 if (validateRE(regexp, FALSE, status)) {
1143 regexp->fMatcher->getFindProgressCallback(*callback, *context, *status);
1144 }
1145 }
1146
1147
1148 //------------------------------------------------------------------------------
1149 //
1150 // uregex_replaceAll
1151 //
1152 //------------------------------------------------------------------------------
1153 U_CAPI int32_t U_EXPORT2
uregex_replaceAll(URegularExpression * regexp2,const UChar * replacementText,int32_t replacementLength,UChar * destBuf,int32_t destCapacity,UErrorCode * status)1154 uregex_replaceAll(URegularExpression *regexp2,
1155 const UChar *replacementText,
1156 int32_t replacementLength,
1157 UChar *destBuf,
1158 int32_t destCapacity,
1159 UErrorCode *status) {
1160 RegularExpression *regexp = (RegularExpression*)regexp2;
1161 if (validateRE(regexp, TRUE, status) == FALSE) {
1162 return 0;
1163 }
1164 if (replacementText == NULL || replacementLength < -1 ||
1165 (destBuf == NULL && destCapacity > 0) ||
1166 destCapacity < 0) {
1167 *status = U_ILLEGAL_ARGUMENT_ERROR;
1168 return 0;
1169 }
1170
1171 int32_t len = 0;
1172
1173 uregex_reset(regexp2, 0, status);
1174
1175 // Note: Seperate error code variables for findNext() and appendReplacement()
1176 // are used so that destination buffer overflow errors
1177 // in appendReplacement won't stop findNext() from working.
1178 // appendReplacement() and appendTail() special case incoming buffer
1179 // overflow errors, continuing to return the correct length.
1180 UErrorCode findStatus = *status;
1181 while (uregex_findNext(regexp2, &findStatus)) {
1182 len += uregex_appendReplacement(regexp2, replacementText, replacementLength,
1183 &destBuf, &destCapacity, status);
1184 }
1185 len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
1186
1187 if (U_FAILURE(findStatus)) {
1188 // If anything went wrong with the findNext(), make that error trump
1189 // whatever may have happened with the append() operations.
1190 // Errors in findNext() are not expected.
1191 *status = findStatus;
1192 }
1193
1194 return len;
1195 }
1196
1197
1198 //------------------------------------------------------------------------------
1199 //
1200 // uregex_replaceAllUText
1201 //
1202 //------------------------------------------------------------------------------
1203 U_CAPI UText * U_EXPORT2
uregex_replaceAllUText(URegularExpression * regexp2,UText * replacementText,UText * dest,UErrorCode * status)1204 uregex_replaceAllUText(URegularExpression *regexp2,
1205 UText *replacementText,
1206 UText *dest,
1207 UErrorCode *status) {
1208 RegularExpression *regexp = (RegularExpression*)regexp2;
1209 if (validateRE(regexp, TRUE, status) == FALSE) {
1210 return 0;
1211 }
1212 if (replacementText == NULL) {
1213 *status = U_ILLEGAL_ARGUMENT_ERROR;
1214 return 0;
1215 }
1216
1217 dest = regexp->fMatcher->replaceAll(replacementText, dest, *status);
1218 return dest;
1219 }
1220
1221
1222 //------------------------------------------------------------------------------
1223 //
1224 // uregex_replaceFirst
1225 //
1226 //------------------------------------------------------------------------------
1227 U_CAPI int32_t U_EXPORT2
uregex_replaceFirst(URegularExpression * regexp2,const UChar * replacementText,int32_t replacementLength,UChar * destBuf,int32_t destCapacity,UErrorCode * status)1228 uregex_replaceFirst(URegularExpression *regexp2,
1229 const UChar *replacementText,
1230 int32_t replacementLength,
1231 UChar *destBuf,
1232 int32_t destCapacity,
1233 UErrorCode *status) {
1234 RegularExpression *regexp = (RegularExpression*)regexp2;
1235 if (validateRE(regexp, TRUE, status) == FALSE) {
1236 return 0;
1237 }
1238 if (replacementText == NULL || replacementLength < -1 ||
1239 (destBuf == NULL && destCapacity > 0) ||
1240 destCapacity < 0) {
1241 *status = U_ILLEGAL_ARGUMENT_ERROR;
1242 return 0;
1243 }
1244
1245 int32_t len = 0;
1246 UBool findSucceeded;
1247 uregex_reset(regexp2, 0, status);
1248 findSucceeded = uregex_find(regexp2, 0, status);
1249 if (findSucceeded) {
1250 len = uregex_appendReplacement(regexp2, replacementText, replacementLength,
1251 &destBuf, &destCapacity, status);
1252 }
1253 len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
1254
1255 return len;
1256 }
1257
1258
1259 //------------------------------------------------------------------------------
1260 //
1261 // uregex_replaceFirstUText
1262 //
1263 //------------------------------------------------------------------------------
1264 U_CAPI UText * U_EXPORT2
uregex_replaceFirstUText(URegularExpression * regexp2,UText * replacementText,UText * dest,UErrorCode * status)1265 uregex_replaceFirstUText(URegularExpression *regexp2,
1266 UText *replacementText,
1267 UText *dest,
1268 UErrorCode *status) {
1269 RegularExpression *regexp = (RegularExpression*)regexp2;
1270 if (validateRE(regexp, TRUE, status) == FALSE) {
1271 return 0;
1272 }
1273 if (replacementText == NULL) {
1274 *status = U_ILLEGAL_ARGUMENT_ERROR;
1275 return 0;
1276 }
1277
1278 dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status);
1279 return dest;
1280 }
1281
1282
1283 //------------------------------------------------------------------------------
1284 //
1285 // uregex_appendReplacement
1286 //
1287 //------------------------------------------------------------------------------
1288
1289 U_NAMESPACE_BEGIN
1290 //
1291 // Dummy class, because these functions need to be friends of class RegexMatcher,
1292 // and stand-alone C functions don't work as friends
1293 //
1294 class RegexCImpl {
1295 public:
1296 inline static int32_t appendReplacement(RegularExpression *regexp,
1297 const UChar *replacementText,
1298 int32_t replacementLength,
1299 UChar **destBuf,
1300 int32_t *destCapacity,
1301 UErrorCode *status);
1302
1303 inline static int32_t appendTail(RegularExpression *regexp,
1304 UChar **destBuf,
1305 int32_t *destCapacity,
1306 UErrorCode *status);
1307
1308 inline static int32_t split(RegularExpression *regexp,
1309 UChar *destBuf,
1310 int32_t destCapacity,
1311 int32_t *requiredCapacity,
1312 UChar *destFields[],
1313 int32_t destFieldsCapacity,
1314 UErrorCode *status);
1315 };
1316
1317 U_NAMESPACE_END
1318
1319
1320
1321 static const UChar BACKSLASH = 0x5c;
1322 static const UChar DOLLARSIGN = 0x24;
1323
1324 //
1325 // Move a character to an output buffer, with bounds checking on the index.
1326 // Index advances even if capacity is exceeded, for preflight size computations.
1327 // This little sequence is used a LOT.
1328 //
appendToBuf(UChar c,int32_t * idx,UChar * buf,int32_t bufCapacity)1329 static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) {
1330 if (*idx < bufCapacity) {
1331 buf[*idx] = c;
1332 }
1333 (*idx)++;
1334 }
1335
1336
1337 //
1338 // appendReplacement, the actual implementation.
1339 //
appendReplacement(RegularExpression * regexp,const UChar * replacementText,int32_t replacementLength,UChar ** destBuf,int32_t * destCapacity,UErrorCode * status)1340 int32_t RegexCImpl::appendReplacement(RegularExpression *regexp,
1341 const UChar *replacementText,
1342 int32_t replacementLength,
1343 UChar **destBuf,
1344 int32_t *destCapacity,
1345 UErrorCode *status) {
1346
1347 // If we come in with a buffer overflow error, don't suppress the operation.
1348 // A series of appendReplacements, appendTail need to correctly preflight
1349 // the buffer size when an overflow happens somewhere in the middle.
1350 UBool pendingBufferOverflow = FALSE;
1351 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
1352 pendingBufferOverflow = TRUE;
1353 *status = U_ZERO_ERROR;
1354 }
1355
1356 //
1357 // Validate all paramters
1358 //
1359 if (validateRE(regexp, TRUE, status) == FALSE) {
1360 return 0;
1361 }
1362 if (replacementText == NULL || replacementLength < -1 ||
1363 destCapacity == NULL || destBuf == NULL ||
1364 (*destBuf == NULL && *destCapacity > 0) ||
1365 *destCapacity < 0) {
1366 *status = U_ILLEGAL_ARGUMENT_ERROR;
1367 return 0;
1368 }
1369
1370 RegexMatcher *m = regexp->fMatcher;
1371 if (m->fMatch == FALSE) {
1372 *status = U_REGEX_INVALID_STATE;
1373 return 0;
1374 }
1375
1376 UChar *dest = *destBuf;
1377 int32_t capacity = *destCapacity;
1378 int32_t destIdx = 0;
1379 int32_t i;
1380
1381 // If it wasn't supplied by the caller, get the length of the replacement text.
1382 // TODO: slightly smarter logic in the copy loop could watch for the NUL on
1383 // the fly and avoid this step.
1384 if (replacementLength == -1) {
1385 replacementLength = u_strlen(replacementText);
1386 }
1387
1388 // Copy input string from the end of previous match to start of current match
1389 if (regexp->fText != NULL) {
1390 int32_t matchStart;
1391 int32_t lastMatchEnd;
1392 if (UTEXT_USES_U16(m->fInputText)) {
1393 lastMatchEnd = (int32_t)m->fLastMatchEnd;
1394 matchStart = (int32_t)m->fMatchStart;
1395 } else {
1396 // !!!: Would like a better way to do this!
1397 UErrorCode status = U_ZERO_ERROR;
1398 lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &status);
1399 status = U_ZERO_ERROR;
1400 matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &status);
1401 }
1402 for (i=lastMatchEnd; i<matchStart; i++) {
1403 appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
1404 }
1405 } else {
1406 UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore
1407 destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart,
1408 &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), &possibleOverflowError);
1409 }
1410
1411
1412 // scan the replacement text, looking for substitutions ($n) and \escapes.
1413 int32_t replIdx = 0;
1414 while (replIdx < replacementLength) {
1415 UChar c = replacementText[replIdx];
1416 replIdx++;
1417 if (c != DOLLARSIGN && c != BACKSLASH) {
1418 // Common case, no substitution, no escaping,
1419 // just copy the char to the dest buf.
1420 appendToBuf(c, &destIdx, dest, capacity);
1421 continue;
1422 }
1423
1424 if (c == BACKSLASH) {
1425 // Backslash Escape. Copy the following char out without further checks.
1426 // Note: Surrogate pairs don't need any special handling
1427 // The second half wont be a '$' or a '\', and
1428 // will move to the dest normally on the next
1429 // loop iteration.
1430 if (replIdx >= replacementLength) {
1431 break;
1432 }
1433 c = replacementText[replIdx];
1434
1435 if (c==0x55/*U*/ || c==0x75/*u*/) {
1436 // We have a \udddd or \Udddddddd escape sequence.
1437 UChar32 escapedChar =
1438 u_unescapeAt(uregex_ucstr_unescape_charAt,
1439 &replIdx, // Index is updated by unescapeAt
1440 replacementLength, // Length of replacement text
1441 (void *)replacementText);
1442
1443 if (escapedChar != (UChar32)0xFFFFFFFF) {
1444 if (escapedChar <= 0xffff) {
1445 appendToBuf((UChar)escapedChar, &destIdx, dest, capacity);
1446 } else {
1447 appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity);
1448 appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity);
1449 }
1450 continue;
1451 }
1452 // Note: if the \u escape was invalid, just fall through and
1453 // treat it as a plain \<anything> escape.
1454 }
1455
1456 // Plain backslash escape. Just put out the escaped character.
1457 appendToBuf(c, &destIdx, dest, capacity);
1458
1459 replIdx++;
1460 continue;
1461 }
1462
1463
1464
1465 // We've got a $. Pick up a capture group number if one follows.
1466 // Consume at most the number of digits necessary for the largest capture
1467 // number that is valid for this pattern.
1468
1469 int32_t numDigits = 0;
1470 int32_t groupNum = 0;
1471 UChar32 digitC;
1472 for (;;) {
1473 if (replIdx >= replacementLength) {
1474 break;
1475 }
1476 U16_GET(replacementText, 0, replIdx, replacementLength, digitC);
1477 if (u_isdigit(digitC) == FALSE) {
1478 break;
1479 }
1480
1481 U16_FWD_1(replacementText, replIdx, replacementLength);
1482 groupNum=groupNum*10 + u_charDigitValue(digitC);
1483 numDigits++;
1484 if (numDigits >= m->fPattern->fMaxCaptureDigits) {
1485 break;
1486 }
1487 }
1488
1489
1490 if (numDigits == 0) {
1491 // The $ didn't introduce a group number at all.
1492 // Treat it as just part of the substitution text.
1493 appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
1494 continue;
1495 }
1496
1497 // Finally, append the capture group data to the destination.
1498 destIdx += uregex_group((URegularExpression*)regexp, groupNum, &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
1499 if (*status == U_BUFFER_OVERFLOW_ERROR) {
1500 // Ignore buffer overflow when extracting the group. We need to
1501 // continue on to get full size of the untruncated result. We will
1502 // raise our own buffer overflow error at the end.
1503 *status = U_ZERO_ERROR;
1504 }
1505
1506 if (U_FAILURE(*status)) {
1507 // Can fail if group number is out of range.
1508 break;
1509 }
1510
1511 }
1512
1513 //
1514 // Nul Terminate the dest buffer if possible.
1515 // Set the appropriate buffer overflow or not terminated error, if needed.
1516 //
1517 if (destIdx < capacity) {
1518 dest[destIdx] = 0;
1519 } else if (destIdx == *destCapacity) {
1520 *status = U_STRING_NOT_TERMINATED_WARNING;
1521 } else {
1522 *status = U_BUFFER_OVERFLOW_ERROR;
1523 }
1524
1525 //
1526 // Return an updated dest buffer and capacity to the caller.
1527 //
1528 if (destIdx > 0 && *destCapacity > 0) {
1529 if (destIdx < capacity) {
1530 *destBuf += destIdx;
1531 *destCapacity -= destIdx;
1532 } else {
1533 *destBuf += capacity;
1534 *destCapacity = 0;
1535 }
1536 }
1537
1538 // If we came in with a buffer overflow, make sure we go out with one also.
1539 // (A zero length match right at the end of the previous match could
1540 // make this function succeed even though a previous call had overflowed the buf)
1541 if (pendingBufferOverflow && U_SUCCESS(*status)) {
1542 *status = U_BUFFER_OVERFLOW_ERROR;
1543 }
1544
1545 return destIdx;
1546 }
1547
1548 //
1549 // appendReplacement the actual API function,
1550 //
1551 U_CAPI int32_t U_EXPORT2
uregex_appendReplacement(URegularExpression * regexp2,const UChar * replacementText,int32_t replacementLength,UChar ** destBuf,int32_t * destCapacity,UErrorCode * status)1552 uregex_appendReplacement(URegularExpression *regexp2,
1553 const UChar *replacementText,
1554 int32_t replacementLength,
1555 UChar **destBuf,
1556 int32_t *destCapacity,
1557 UErrorCode *status) {
1558
1559 RegularExpression *regexp = (RegularExpression*)regexp2;
1560 return RegexCImpl::appendReplacement(
1561 regexp, replacementText, replacementLength,destBuf, destCapacity, status);
1562 }
1563
1564 //
1565 // uregex_appendReplacementUText...can just use the normal C++ method
1566 //
1567 U_CAPI void U_EXPORT2
uregex_appendReplacementUText(URegularExpression * regexp2,UText * replText,UText * dest,UErrorCode * status)1568 uregex_appendReplacementUText(URegularExpression *regexp2,
1569 UText *replText,
1570 UText *dest,
1571 UErrorCode *status) {
1572 RegularExpression *regexp = (RegularExpression*)regexp2;
1573 regexp->fMatcher->appendReplacement(dest, replText, *status);
1574 }
1575
1576
1577 //------------------------------------------------------------------------------
1578 //
1579 // uregex_appendTail
1580 //
1581 //------------------------------------------------------------------------------
appendTail(RegularExpression * regexp,UChar ** destBuf,int32_t * destCapacity,UErrorCode * status)1582 int32_t RegexCImpl::appendTail(RegularExpression *regexp,
1583 UChar **destBuf,
1584 int32_t *destCapacity,
1585 UErrorCode *status)
1586 {
1587
1588 // If we come in with a buffer overflow error, don't suppress the operation.
1589 // A series of appendReplacements, appendTail need to correctly preflight
1590 // the buffer size when an overflow happens somewhere in the middle.
1591 UBool pendingBufferOverflow = FALSE;
1592 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
1593 pendingBufferOverflow = TRUE;
1594 *status = U_ZERO_ERROR;
1595 }
1596
1597 if (validateRE(regexp, TRUE, status) == FALSE) {
1598 return 0;
1599 }
1600
1601 if (destCapacity == NULL || destBuf == NULL ||
1602 (*destBuf == NULL && *destCapacity > 0) ||
1603 *destCapacity < 0)
1604 {
1605 *status = U_ILLEGAL_ARGUMENT_ERROR;
1606 return 0;
1607 }
1608
1609 RegexMatcher *m = regexp->fMatcher;
1610
1611 int32_t destIdx = 0;
1612 int32_t destCap = *destCapacity;
1613 UChar *dest = *destBuf;
1614
1615 if (regexp->fText != NULL) {
1616 int32_t srcIdx;
1617 int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd);
1618 if (nativeIdx == -1) {
1619 srcIdx = 0;
1620 } else if (UTEXT_USES_U16(m->fInputText)) {
1621 srcIdx = (int32_t)nativeIdx;
1622 } else {
1623 UErrorCode status = U_ZERO_ERROR;
1624 srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &status);
1625 }
1626
1627 for (;;) {
1628 if (srcIdx == regexp->fTextLength) {
1629 break;
1630 }
1631 UChar c = regexp->fText[srcIdx];
1632 if (c == 0 && regexp->fTextLength == -1) {
1633 regexp->fTextLength = srcIdx;
1634 break;
1635 }
1636 if (destIdx < destCap) {
1637 dest[destIdx] = c;
1638 } else {
1639 // We've overflowed the dest buffer.
1640 // If the total input string length is known, we can
1641 // compute the total buffer size needed without scanning through the string.
1642 if (regexp->fTextLength > 0) {
1643 destIdx += (regexp->fTextLength - srcIdx);
1644 break;
1645 }
1646 }
1647 srcIdx++;
1648 destIdx++;
1649 }
1650 } else {
1651 int64_t srcIdx;
1652 if (m->fMatch) {
1653 // The most recent call to find() succeeded.
1654 srcIdx = m->fMatchEnd;
1655 } else {
1656 // The last call to find() on this matcher failed().
1657 // Look back to the end of the last find() that succeeded for src index.
1658 srcIdx = m->fLastMatchEnd;
1659 if (srcIdx == -1) {
1660 // There has been no successful match with this matcher.
1661 // We want to copy the whole string.
1662 srcIdx = 0;
1663 }
1664 }
1665
1666 destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, destCap, status);
1667 }
1668
1669 //
1670 // NUL terminate the output string, if possible, otherwise issue the
1671 // appropriate error or warning.
1672 //
1673 if (destIdx < destCap) {
1674 dest[destIdx] = 0;
1675 } else if (destIdx == destCap) {
1676 *status = U_STRING_NOT_TERMINATED_WARNING;
1677 } else {
1678 *status = U_BUFFER_OVERFLOW_ERROR;
1679 }
1680
1681 //
1682 // Update the user's buffer ptr and capacity vars to reflect the
1683 // amount used.
1684 //
1685 if (destIdx < destCap) {
1686 *destBuf += destIdx;
1687 *destCapacity -= destIdx;
1688 } else {
1689 *destBuf += destCap;
1690 *destCapacity = 0;
1691 }
1692
1693 if (pendingBufferOverflow && U_SUCCESS(*status)) {
1694 *status = U_BUFFER_OVERFLOW_ERROR;
1695 }
1696
1697 return destIdx;
1698 }
1699
1700
1701 //
1702 // appendTail the actual API function
1703 //
1704 U_CAPI int32_t U_EXPORT2
uregex_appendTail(URegularExpression * regexp2,UChar ** destBuf,int32_t * destCapacity,UErrorCode * status)1705 uregex_appendTail(URegularExpression *regexp2,
1706 UChar **destBuf,
1707 int32_t *destCapacity,
1708 UErrorCode *status) {
1709 RegularExpression *regexp = (RegularExpression*)regexp2;
1710 return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status);
1711 }
1712
1713
1714 //
1715 // uregex_appendTailUText...can just use the normal C++ method
1716 //
1717 U_CAPI UText * U_EXPORT2
uregex_appendTailUText(URegularExpression * regexp2,UText * dest,UErrorCode * status)1718 uregex_appendTailUText(URegularExpression *regexp2,
1719 UText *dest,
1720 UErrorCode *status) {
1721 RegularExpression *regexp = (RegularExpression*)regexp2;
1722 return regexp->fMatcher->appendTail(dest, *status);
1723 }
1724
1725
1726 //------------------------------------------------------------------------------
1727 //
1728 // copyString Internal utility to copy a string to an output buffer,
1729 // while managing buffer overflow and preflight size
1730 // computation. NUL termination is added to destination,
1731 // and the NUL is counted in the output size.
1732 //
1733 //------------------------------------------------------------------------------
1734 #if 0
1735 static void copyString(UChar *destBuffer, // Destination buffer.
1736 int32_t destCapacity, // Total capacity of dest buffer
1737 int32_t *destIndex, // Index into dest buffer. Updated on return.
1738 // Update not clipped to destCapacity.
1739 const UChar *srcPtr, // Pointer to source string
1740 int32_t srcLen) // Source string len.
1741 {
1742 int32_t si;
1743 int32_t di = *destIndex;
1744 UChar c;
1745
1746 for (si=0; si<srcLen; si++) {
1747 c = srcPtr[si];
1748 if (di < destCapacity) {
1749 destBuffer[di] = c;
1750 di++;
1751 } else {
1752 di += srcLen - si;
1753 break;
1754 }
1755 }
1756 if (di<destCapacity) {
1757 destBuffer[di] = 0;
1758 }
1759 di++;
1760 *destIndex = di;
1761 }
1762 #endif
1763
1764 //------------------------------------------------------------------------------
1765 //
1766 // uregex_split
1767 //
1768 //------------------------------------------------------------------------------
split(RegularExpression * regexp,UChar * destBuf,int32_t destCapacity,int32_t * requiredCapacity,UChar * destFields[],int32_t destFieldsCapacity,UErrorCode * status)1769 int32_t RegexCImpl::split(RegularExpression *regexp,
1770 UChar *destBuf,
1771 int32_t destCapacity,
1772 int32_t *requiredCapacity,
1773 UChar *destFields[],
1774 int32_t destFieldsCapacity,
1775 UErrorCode *status) {
1776 //
1777 // Reset for the input text
1778 //
1779 regexp->fMatcher->reset();
1780 UText *inputText = regexp->fMatcher->fInputText;
1781 int64_t nextOutputStringStart = 0;
1782 int64_t inputLen = regexp->fMatcher->fInputLength;
1783 if (inputLen == 0) {
1784 return 0;
1785 }
1786
1787 //
1788 // Loop through the input text, searching for the delimiter pattern
1789 //
1790 int32_t i; // Index of the field being processed.
1791 int32_t destIdx = 0; // Next available position in destBuf;
1792 int32_t numCaptureGroups = regexp->fMatcher->groupCount();
1793 UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow errors so that the strings are still counted
1794 for (i=0; ; i++) {
1795 if (i>=destFieldsCapacity-1) {
1796 // There are one or zero output strings left.
1797 // Fill the last output string with whatever is left from the input, then exit the loop.
1798 // ( i will be == destFieldsCapacity if we filled the output array while processing
1799 // capture groups of the delimiter expression, in which case we will discard the
1800 // last capture group saved in favor of the unprocessed remainder of the
1801 // input string.)
1802 if (inputLen > nextOutputStringStart) {
1803 if (i != destFieldsCapacity-1) {
1804 // No fields are left. Recycle the last one for holding the trailing part of
1805 // the input string.
1806 i = destFieldsCapacity-1;
1807 destIdx = (int32_t)(destFields[i] - destFields[0]);
1808 }
1809
1810 destFields[i] = &destBuf[destIdx];
1811 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
1812 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
1813 }
1814 break;
1815 }
1816
1817 if (regexp->fMatcher->find()) {
1818 // We found another delimiter. Move everything from where we started looking
1819 // up until the start of the delimiter into the next output string.
1820 destFields[i] = &destBuf[destIdx];
1821
1822 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart,
1823 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus);
1824 if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
1825 tStatus = U_ZERO_ERROR;
1826 } else {
1827 *status = tStatus;
1828 }
1829 nextOutputStringStart = regexp->fMatcher->fMatchEnd;
1830
1831 // If the delimiter pattern has capturing parentheses, the captured
1832 // text goes out into the next n destination strings.
1833 int32_t groupNum;
1834 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
1835 // If we've run out of output string slots, bail out.
1836 if (i==destFieldsCapacity-1) {
1837 break;
1838 }
1839 i++;
1840
1841 // Set up to extract the capture group contents into the dest buffer.
1842 destFields[i] = &destBuf[destIdx];
1843 tStatus = U_ZERO_ERROR;
1844 int32_t t = uregex_group((URegularExpression*)regexp,
1845 groupNum,
1846 destFields[i],
1847 REMAINING_CAPACITY(destIdx, destCapacity),
1848 &tStatus);
1849 destIdx += t + 1; // Record the space used in the output string buffer.
1850 // +1 for the NUL that terminates the string.
1851 if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
1852 tStatus = U_ZERO_ERROR;
1853 } else {
1854 *status = tStatus;
1855 }
1856 }
1857
1858 if (nextOutputStringStart == inputLen) {
1859 // The delimiter was at the end of the string.
1860 // Output an empty string, and then we are done.
1861 if (destIdx < destCapacity) {
1862 destBuf[destIdx] = 0;
1863 }
1864 if (i < destFieldsCapacity-1) {
1865 ++i;
1866 }
1867 if (destIdx < destCapacity) {
1868 destFields[i] = destBuf + destIdx;
1869 }
1870 ++destIdx;
1871 break;
1872 }
1873
1874 }
1875 else
1876 {
1877 // We ran off the end of the input while looking for the next delimiter.
1878 // All the remaining text goes into the current output string.
1879 destFields[i] = &destBuf[destIdx];
1880 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
1881 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
1882 break;
1883 }
1884 }
1885
1886 // Zero out any unused portion of the destFields array
1887 int j;
1888 for (j=i+1; j<destFieldsCapacity; j++) {
1889 destFields[j] = NULL;
1890 }
1891
1892 if (requiredCapacity != NULL) {
1893 *requiredCapacity = destIdx;
1894 }
1895 if (destIdx > destCapacity) {
1896 *status = U_BUFFER_OVERFLOW_ERROR;
1897 }
1898 return i+1;
1899 }
1900
1901 //
1902 // uregex_split The actual API function
1903 //
1904 U_CAPI int32_t U_EXPORT2
uregex_split(URegularExpression * regexp2,UChar * destBuf,int32_t destCapacity,int32_t * requiredCapacity,UChar * destFields[],int32_t destFieldsCapacity,UErrorCode * status)1905 uregex_split(URegularExpression *regexp2,
1906 UChar *destBuf,
1907 int32_t destCapacity,
1908 int32_t *requiredCapacity,
1909 UChar *destFields[],
1910 int32_t destFieldsCapacity,
1911 UErrorCode *status) {
1912 RegularExpression *regexp = (RegularExpression*)regexp2;
1913 if (validateRE(regexp, TRUE, status) == FALSE) {
1914 return 0;
1915 }
1916 if ((destBuf == NULL && destCapacity > 0) ||
1917 destCapacity < 0 ||
1918 destFields == NULL ||
1919 destFieldsCapacity < 1 ) {
1920 *status = U_ILLEGAL_ARGUMENT_ERROR;
1921 return 0;
1922 }
1923
1924 return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status);
1925 }
1926
1927
1928 //
1929 // uregex_splitUText...can just use the normal C++ method
1930 //
1931 U_CAPI int32_t U_EXPORT2
uregex_splitUText(URegularExpression * regexp2,UText * destFields[],int32_t destFieldsCapacity,UErrorCode * status)1932 uregex_splitUText(URegularExpression *regexp2,
1933 UText *destFields[],
1934 int32_t destFieldsCapacity,
1935 UErrorCode *status) {
1936 RegularExpression *regexp = (RegularExpression*)regexp2;
1937 return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status);
1938 }
1939
1940
1941 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1942
1943