1 /*
2 *******************************************************************************
3 * Copyright (C) 2004-2007, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: regex.cpp
7 */
8
9 #include "unicode/utypes.h"
10
11 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
12
13 #include "unicode/regex.h"
14 #include "unicode/uregex.h"
15 #include "unicode/unistr.h"
16 #include "unicode/ustring.h"
17 #include "unicode/uchar.h"
18 #include "unicode/uobject.h"
19 #include "umutex.h"
20 #include "uassert.h"
21 #include "cmemory.h"
22
23 U_NAMESPACE_USE
24
25 struct URegularExpression: public UMemory {
26 public:
27 URegularExpression();
28 ~URegularExpression();
29 int32_t fMagic;
30 RegexPattern *fPat;
31 int32_t *fPatRefCount;
32 UChar *fPatString;
33 int32_t fPatStringLen;
34 RegexMatcher *fMatcher;
35 const UChar *fText; // Text from setText()
36 int32_t fTextLength; // Length provided by user with setText(), which
37 // may be -1.
38
39 UnicodeString fTextString; // The setText(text) is wrapped into a UnicodeString.
40 // TODO: regexp engine should not depend on UnicodeString.
41 };
42
43 static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII
44
URegularExpression()45 URegularExpression::URegularExpression() {
46 fMagic = REXP_MAGIC;
47 fPat = NULL;
48 fPatRefCount = NULL;
49 fPatString = NULL;
50 fPatStringLen = 0;
51 fMatcher = NULL;
52 fText = NULL;
53 fTextLength = 0;
54 }
55
~URegularExpression()56 URegularExpression::~URegularExpression() {
57 delete fMatcher;
58 fMatcher = NULL;
59 if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) {
60 delete fPat;
61 uprv_free(fPatString);
62 uprv_free(fPatRefCount);
63 }
64 fMagic = 0;
65 }
66
67 //----------------------------------------------------------------------------------------
68 //
69 // validateRE Do boilerplate style checks on API function parameters.
70 // Return TRUE if they look OK.
71 //----------------------------------------------------------------------------------------
validateRE(const URegularExpression * re,UErrorCode * status,UBool requiresText=TRUE)72 static UBool validateRE(const URegularExpression *re, UErrorCode *status, UBool requiresText = TRUE) {
73 if (U_FAILURE(*status)) {
74 return FALSE;
75 }
76 if (re == NULL || re->fMagic != REXP_MAGIC) {
77 // U_ASSERT(FALSE);
78 *status = U_ILLEGAL_ARGUMENT_ERROR;
79 return FALSE;
80 }
81 if (requiresText && re->fText == NULL) {
82 *status = U_REGEX_INVALID_STATE;
83 return FALSE;
84 }
85 return TRUE;
86 }
87
88 //----------------------------------------------------------------------------------------
89 //
90 // uregex_open
91 //
92 //----------------------------------------------------------------------------------------
93 U_CAPI URegularExpression * U_EXPORT2
uregex_open(const UChar * pattern,int32_t patternLength,uint32_t flags,UParseError * pe,UErrorCode * status)94 uregex_open( const UChar *pattern,
95 int32_t patternLength,
96 uint32_t flags,
97 UParseError *pe,
98 UErrorCode *status) {
99
100 if (U_FAILURE(*status)) {
101 return NULL;
102 }
103 if (pattern == NULL || patternLength < -1 || patternLength == 0) {
104 *status = U_ILLEGAL_ARGUMENT_ERROR;
105 return NULL;
106 }
107 int32_t actualPatLen = patternLength;
108 if (actualPatLen == -1) {
109 actualPatLen = u_strlen(pattern);
110 }
111
112 URegularExpression *re = new URegularExpression;
113 int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t));
114 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1));
115 if (re == NULL || refC == NULL || patBuf == NULL) {
116 *status = U_MEMORY_ALLOCATION_ERROR;
117 delete re;
118 uprv_free(refC);
119 uprv_free(patBuf);
120 return NULL;
121 }
122 re->fPatRefCount = refC;
123 *re->fPatRefCount = 1;
124
125 //
126 // Make a copy of the pattern string, so we can return it later if asked.
127 // For compiling the pattern, we will use a read-only-aliased UnicodeString
128 // of this local copy, to avoid making even more copies.
129 //
130 re->fPatString = patBuf;
131 re->fPatStringLen = patternLength;
132 u_memcpy(patBuf, pattern, actualPatLen);
133 patBuf[actualPatLen] = 0;
134 UnicodeString patString(patternLength==-1, patBuf, patternLength);
135
136 //
137 // Compile the pattern
138 //
139 if (pe != NULL) {
140 re->fPat = RegexPattern::compile(patString, flags, *pe, *status);
141 } else {
142 re->fPat = RegexPattern::compile(patString, flags, *status);
143 }
144 if (U_FAILURE(*status)) {
145 goto ErrorExit;
146 }
147
148 //
149 // Create the matcher object
150 //
151 re->fMatcher = re->fPat->matcher(*status);
152 if (U_SUCCESS(*status)) {
153 return re;
154 }
155
156 ErrorExit:
157 delete re;
158 return NULL;
159
160 }
161
162 //----------------------------------------------------------------------------------------
163 //
164 // uregex_close
165 //
166 //----------------------------------------------------------------------------------------
167 U_CAPI void U_EXPORT2
uregex_close(URegularExpression * re)168 uregex_close(URegularExpression *re) {
169 UErrorCode status = U_ZERO_ERROR;
170 if (validateRE(re, &status, FALSE) == FALSE) {
171 return;
172 }
173 delete re;
174 }
175
176
177 //----------------------------------------------------------------------------------------
178 //
179 // uregex_clone
180 //
181 //----------------------------------------------------------------------------------------
182 U_CAPI URegularExpression * U_EXPORT2
uregex_clone(const URegularExpression * source,UErrorCode * status)183 uregex_clone(const URegularExpression *source, UErrorCode *status) {
184 if (validateRE(source, status, FALSE) == FALSE) {
185 return NULL;
186 }
187
188 URegularExpression *clone = new URegularExpression;
189 if (clone == NULL) {
190 *status = U_MEMORY_ALLOCATION_ERROR;
191 return NULL;
192 }
193
194 clone->fMatcher = source->fPat->matcher(*status);
195 if (U_FAILURE(*status)) {
196 delete clone;
197 return NULL;
198 }
199 if (clone == NULL) {
200 *status = U_MEMORY_ALLOCATION_ERROR;
201 return NULL;
202 }
203
204 clone->fPat = source->fPat;
205 clone->fPatRefCount = source->fPatRefCount;
206 clone->fPatString = source->fPatString;
207 clone->fPatStringLen = source->fPatStringLen;
208 umtx_atomic_inc(source->fPatRefCount);
209 // Note: fText is not cloned.
210
211 return clone;
212 }
213
214
215
216
217 //------------------------------------------------------------------------------
218 //
219 // uregex_pattern
220 //
221 //------------------------------------------------------------------------------
222 U_CAPI const UChar * U_EXPORT2
uregex_pattern(const URegularExpression * regexp,int32_t * patLength,UErrorCode * status)223 uregex_pattern(const URegularExpression *regexp,
224 int32_t *patLength,
225 UErrorCode *status) {
226
227 if (validateRE(regexp, status, FALSE) == FALSE) {
228 return NULL;
229 }
230 if (patLength != NULL) {
231 *patLength = regexp->fPatStringLen;
232 }
233 return regexp->fPatString;
234 }
235
236
237 //------------------------------------------------------------------------------
238 //
239 // uregex_flags
240 //
241 //------------------------------------------------------------------------------
242 U_CAPI int32_t U_EXPORT2
uregex_flags(const URegularExpression * regexp,UErrorCode * status)243 uregex_flags(const URegularExpression *regexp, UErrorCode *status) {
244 if (validateRE(regexp, status, FALSE) == FALSE) {
245 return 0;
246 }
247 int32_t flags = regexp->fPat->flags();
248 return flags;
249 }
250
251
252 //------------------------------------------------------------------------------
253 //
254 // uregex_setText
255 //
256 //------------------------------------------------------------------------------
257 U_CAPI void U_EXPORT2
uregex_setText(URegularExpression * regexp,const UChar * text,int32_t textLength,UErrorCode * status)258 uregex_setText(URegularExpression *regexp,
259 const UChar *text,
260 int32_t textLength,
261 UErrorCode *status) {
262 if (validateRE(regexp, status, FALSE) == FALSE) {
263 return;
264 }
265 if (text == NULL || textLength < -1) {
266 *status = U_ILLEGAL_ARGUMENT_ERROR;
267 return;
268 }
269 regexp->fText = text;
270 regexp->fTextLength = textLength;
271 UBool isTerminated = (textLength == -1);
272
273 regexp->fTextString.setTo(isTerminated, text, textLength);
274 regexp->fMatcher->reset(regexp->fTextString);
275 }
276
277
278
279 //------------------------------------------------------------------------------
280 //
281 // uregex_getText
282 //
283 //------------------------------------------------------------------------------
284 U_CAPI const UChar * U_EXPORT2
uregex_getText(URegularExpression * regexp,int32_t * textLength,UErrorCode * status)285 uregex_getText(URegularExpression *regexp,
286 int32_t *textLength,
287 UErrorCode *status) {
288 if (validateRE(regexp, status, FALSE) == FALSE) {
289 return NULL;
290 }
291 if (textLength != NULL) {
292 *textLength = regexp->fTextLength;
293 }
294 return regexp->fText;
295 }
296
297
298 //------------------------------------------------------------------------------
299 //
300 // uregex_matches
301 //
302 //------------------------------------------------------------------------------
303 U_CAPI UBool U_EXPORT2
uregex_matches(URegularExpression * regexp,int32_t startIndex,UErrorCode * status)304 uregex_matches(URegularExpression *regexp,
305 int32_t startIndex,
306 UErrorCode *status) {
307 UBool result = FALSE;
308 if (validateRE(regexp, status) == FALSE) {
309 return result;
310 }
311 if (startIndex == -1) {
312 result = regexp->fMatcher->matches(*status);
313 } else {
314 result = regexp->fMatcher->matches(startIndex, *status);
315 }
316 return result;
317 }
318
319
320
321 //------------------------------------------------------------------------------
322 //
323 // uregex_lookingAt
324 //
325 //------------------------------------------------------------------------------
326 U_CAPI UBool U_EXPORT2
uregex_lookingAt(URegularExpression * regexp,int32_t startIndex,UErrorCode * status)327 uregex_lookingAt(URegularExpression *regexp,
328 int32_t startIndex,
329 UErrorCode *status) {
330 UBool result = FALSE;
331 if (validateRE(regexp, status) == FALSE) {
332 return result;
333 }
334 if (startIndex == -1) {
335 result = regexp->fMatcher->lookingAt(*status);
336 } else {
337 result = regexp->fMatcher->lookingAt(startIndex, *status);
338 }
339 return result;
340 }
341
342
343
344 //------------------------------------------------------------------------------
345 //
346 // uregex_find
347 //
348 //------------------------------------------------------------------------------
349 U_CAPI UBool U_EXPORT2
uregex_find(URegularExpression * regexp,int32_t startIndex,UErrorCode * status)350 uregex_find(URegularExpression *regexp,
351 int32_t startIndex,
352 UErrorCode *status) {
353 UBool result = FALSE;
354 if (validateRE(regexp, status) == FALSE) {
355 return result;
356 }
357 if (startIndex == -1) {
358 regexp->fMatcher->resetPreserveRegion();
359 result = regexp->fMatcher->find();
360 } else {
361 result = regexp->fMatcher->find(startIndex, *status);
362 }
363 return result;
364 }
365
366 //------------------------------------------------------------------------------
367 //
368 // uregex_findNext
369 //
370 //------------------------------------------------------------------------------
371 U_CAPI UBool U_EXPORT2
uregex_findNext(URegularExpression * regexp,UErrorCode * status)372 uregex_findNext(URegularExpression *regexp,
373 UErrorCode *status) {
374 if (validateRE(regexp, status) == FALSE) {
375 return FALSE;
376 }
377 UBool result = regexp->fMatcher->find();
378 return result;
379 }
380
381 //------------------------------------------------------------------------------
382 //
383 // uregex_groupCount
384 //
385 //------------------------------------------------------------------------------
386 U_CAPI int32_t U_EXPORT2
uregex_groupCount(URegularExpression * regexp,UErrorCode * status)387 uregex_groupCount(URegularExpression *regexp,
388 UErrorCode *status) {
389 if (validateRE(regexp, status, FALSE) == FALSE) {
390 return 0;
391 }
392 int32_t result = regexp->fMatcher->groupCount();
393 return result;
394 }
395
396
397 //------------------------------------------------------------------------------
398 //
399 // uregex_group
400 //
401 //------------------------------------------------------------------------------
402 U_CAPI int32_t U_EXPORT2
uregex_group(URegularExpression * regexp,int32_t groupNum,UChar * dest,int32_t destCapacity,UErrorCode * status)403 uregex_group(URegularExpression *regexp,
404 int32_t groupNum,
405 UChar *dest,
406 int32_t destCapacity,
407 UErrorCode *status) {
408 if (validateRE(regexp, status) == FALSE) {
409 return 0;
410 }
411 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
412 *status = U_ILLEGAL_ARGUMENT_ERROR;
413 return 0;
414 }
415
416 //
417 // Pick up the range of characters from the matcher
418 //
419 int32_t startIx = regexp->fMatcher->start(groupNum, *status);
420 int32_t endIx = regexp->fMatcher->end (groupNum, *status);
421 if (U_FAILURE(*status)) {
422 return 0;
423 }
424
425 //
426 // Trim length based on buffer capacity
427 //
428 int32_t fullLength = endIx - startIx;
429 int32_t copyLength = fullLength;
430 if (copyLength < destCapacity) {
431 dest[copyLength] = 0;
432 } else if (copyLength == destCapacity) {
433 *status = U_STRING_NOT_TERMINATED_WARNING;
434 } else {
435 copyLength = destCapacity;
436 *status = U_BUFFER_OVERFLOW_ERROR;
437 }
438
439 //
440 // Copy capture group to user's buffer
441 //
442 if (copyLength > 0) {
443 u_memcpy(dest, ®exp->fText[startIx], copyLength);
444 }
445 return fullLength;
446 }
447
448
449 //------------------------------------------------------------------------------
450 //
451 // uregex_start
452 //
453 //------------------------------------------------------------------------------
454 U_CAPI int32_t U_EXPORT2
uregex_start(URegularExpression * regexp,int32_t groupNum,UErrorCode * status)455 uregex_start(URegularExpression *regexp,
456 int32_t groupNum,
457 UErrorCode *status) {
458 if (validateRE(regexp, status) == FALSE) {
459 return 0;
460 }
461 int32_t result = regexp->fMatcher->start(groupNum, *status);
462 return result;
463 }
464
465
466 //------------------------------------------------------------------------------
467 //
468 // uregex_end
469 //
470 //------------------------------------------------------------------------------
471 U_CAPI int32_t U_EXPORT2
uregex_end(URegularExpression * regexp,int32_t groupNum,UErrorCode * status)472 uregex_end(URegularExpression *regexp,
473 int32_t groupNum,
474 UErrorCode *status) {
475 if (validateRE(regexp, status) == FALSE) {
476 return 0;
477 }
478 int32_t result = regexp->fMatcher->end(groupNum, *status);
479 return result;
480 }
481
482 //------------------------------------------------------------------------------
483 //
484 // uregex_reset
485 //
486 //------------------------------------------------------------------------------
487 U_CAPI void U_EXPORT2
uregex_reset(URegularExpression * regexp,int32_t index,UErrorCode * status)488 uregex_reset(URegularExpression *regexp,
489 int32_t index,
490 UErrorCode *status) {
491 if (validateRE(regexp, status) == FALSE) {
492 return;
493 }
494 regexp->fMatcher->reset(index, *status);
495 }
496
497
498 //------------------------------------------------------------------------------
499 //
500 // uregex_setRegion
501 //
502 //------------------------------------------------------------------------------
503 U_CAPI void U_EXPORT2
uregex_setRegion(URegularExpression * regexp,int32_t regionStart,int32_t regionLimit,UErrorCode * status)504 uregex_setRegion(URegularExpression *regexp,
505 int32_t regionStart,
506 int32_t regionLimit,
507 UErrorCode *status) {
508 if (validateRE(regexp, status) == FALSE) {
509 return;
510 }
511 regexp->fMatcher->region(regionStart, regionLimit, *status);
512 }
513
514
515 //------------------------------------------------------------------------------
516 //
517 // uregex_regionStart
518 //
519 //------------------------------------------------------------------------------
520 U_CAPI int32_t U_EXPORT2
uregex_regionStart(const URegularExpression * regexp,UErrorCode * status)521 uregex_regionStart(const URegularExpression *regexp,
522 UErrorCode *status) {
523 if (validateRE(regexp, status) == FALSE) {
524 return 0;
525 }
526 return regexp->fMatcher->regionStart();
527 }
528
529
530 //------------------------------------------------------------------------------
531 //
532 // uregex_regionEnd
533 //
534 //------------------------------------------------------------------------------
535 U_CAPI int32_t U_EXPORT2
uregex_regionEnd(const URegularExpression * regexp,UErrorCode * status)536 uregex_regionEnd(const URegularExpression *regexp,
537 UErrorCode *status) {
538 if (validateRE(regexp, status) == FALSE) {
539 return 0;
540 }
541 return regexp->fMatcher->regionEnd();
542 }
543
544
545 //------------------------------------------------------------------------------
546 //
547 // uregex_hasTransparentBounds
548 //
549 //------------------------------------------------------------------------------
550 U_CAPI UBool U_EXPORT2
uregex_hasTransparentBounds(const URegularExpression * regexp,UErrorCode * status)551 uregex_hasTransparentBounds(const URegularExpression *regexp,
552 UErrorCode *status) {
553 if (validateRE(regexp, status) == FALSE) {
554 return FALSE;
555 }
556 return regexp->fMatcher->hasTransparentBounds();
557 }
558
559
560 //------------------------------------------------------------------------------
561 //
562 // uregex_useTransparentBounds
563 //
564 //------------------------------------------------------------------------------
565 U_CAPI void U_EXPORT2
uregex_useTransparentBounds(URegularExpression * regexp,UBool b,UErrorCode * status)566 uregex_useTransparentBounds(URegularExpression *regexp,
567 UBool b,
568 UErrorCode *status) {
569 if (validateRE(regexp, status) == FALSE) {
570 return;
571 }
572 regexp->fMatcher->useTransparentBounds(b);
573 }
574
575
576 //------------------------------------------------------------------------------
577 //
578 // uregex_hasAnchoringBounds
579 //
580 //------------------------------------------------------------------------------
581 U_CAPI UBool U_EXPORT2
uregex_hasAnchoringBounds(const URegularExpression * regexp,UErrorCode * status)582 uregex_hasAnchoringBounds(const URegularExpression *regexp,
583 UErrorCode *status) {
584 if (validateRE(regexp, status) == FALSE) {
585 return FALSE;
586 }
587 return regexp->fMatcher->hasAnchoringBounds();
588 }
589
590
591 //------------------------------------------------------------------------------
592 //
593 // uregex_useAnchoringBounds
594 //
595 //------------------------------------------------------------------------------
596 U_CAPI void U_EXPORT2
uregex_useAnchoringBounds(URegularExpression * regexp,UBool b,UErrorCode * status)597 uregex_useAnchoringBounds(URegularExpression *regexp,
598 UBool b,
599 UErrorCode *status) {
600 if (validateRE(regexp, status) == FALSE) {
601 return;
602 }
603 regexp->fMatcher->useAnchoringBounds(b);
604 }
605
606
607 //------------------------------------------------------------------------------
608 //
609 // uregex_hitEnd
610 //
611 //------------------------------------------------------------------------------
612 U_CAPI UBool U_EXPORT2
uregex_hitEnd(const URegularExpression * regexp,UErrorCode * status)613 uregex_hitEnd(const URegularExpression *regexp,
614 UErrorCode *status) {
615 if (validateRE(regexp, status) == FALSE) {
616 return FALSE;
617 }
618 return regexp->fMatcher->hitEnd();
619 }
620
621
622 //------------------------------------------------------------------------------
623 //
624 // uregex_requireEnd
625 //
626 //------------------------------------------------------------------------------
627 U_CAPI UBool U_EXPORT2
uregex_requireEnd(const URegularExpression * regexp,UErrorCode * status)628 uregex_requireEnd(const URegularExpression *regexp,
629 UErrorCode *status) {
630 if (validateRE(regexp, status) == FALSE) {
631 return FALSE;
632 }
633 return regexp->fMatcher->requireEnd();
634 }
635
636
637 //------------------------------------------------------------------------------
638 //
639 // uregex_replaceAll
640 //
641 //------------------------------------------------------------------------------
642 U_CAPI int32_t U_EXPORT2
uregex_replaceAll(URegularExpression * regexp,const UChar * replacementText,int32_t replacementLength,UChar * destBuf,int32_t destCapacity,UErrorCode * status)643 uregex_replaceAll(URegularExpression *regexp,
644 const UChar *replacementText,
645 int32_t replacementLength,
646 UChar *destBuf,
647 int32_t destCapacity,
648 UErrorCode *status) {
649 if (validateRE(regexp, status) == FALSE) {
650 return 0;
651 }
652 if (replacementText == NULL || replacementLength < -1 ||
653 destBuf == NULL && destCapacity > 0 ||
654 destCapacity < 0) {
655 *status = U_ILLEGAL_ARGUMENT_ERROR;
656 return 0;
657 }
658
659 int32_t len = 0;
660 uregex_reset(regexp, 0, status);
661 while (uregex_findNext(regexp, status)) {
662 len += uregex_appendReplacement(regexp, replacementText, replacementLength,
663 &destBuf, &destCapacity, status);
664 }
665 len += uregex_appendTail(regexp, &destBuf, &destCapacity, status);
666
667 return len;
668 }
669
670
671 //------------------------------------------------------------------------------
672 //
673 // uregex_replaceFirst
674 //
675 //------------------------------------------------------------------------------
676 U_CAPI int32_t U_EXPORT2
uregex_replaceFirst(URegularExpression * regexp,const UChar * replacementText,int32_t replacementLength,UChar * destBuf,int32_t destCapacity,UErrorCode * status)677 uregex_replaceFirst(URegularExpression *regexp,
678 const UChar *replacementText,
679 int32_t replacementLength,
680 UChar *destBuf,
681 int32_t destCapacity,
682 UErrorCode *status) {
683 if (validateRE(regexp, status) == FALSE) {
684 return 0;
685 }
686 if (replacementText == NULL || replacementLength < -1 ||
687 destBuf == NULL && destCapacity > 0 ||
688 destCapacity < 0) {
689 *status = U_ILLEGAL_ARGUMENT_ERROR;
690 return 0;
691 }
692
693 int32_t len = 0;
694 UBool findSucceeded;
695 uregex_reset(regexp, 0, status);
696 findSucceeded = uregex_find(regexp, 0, status);
697 if (findSucceeded) {
698 len = uregex_appendReplacement(regexp, replacementText, replacementLength,
699 &destBuf, &destCapacity, status);
700 }
701 len += uregex_appendTail(regexp, &destBuf, &destCapacity, status);
702
703 return len;
704 }
705
706
707 //------------------------------------------------------------------------------
708 //
709 // uregex_appendReplacement
710 //
711 //------------------------------------------------------------------------------
712
713
714 //
715 // Dummy class, because these functions need to be friends of class RegexMatcher,
716 // and stand-alone C functions don't work as friends
717 //
718 U_NAMESPACE_BEGIN
719 class RegexCImpl {
720 public:
721 inline static int32_t appendReplacement(URegularExpression *regexp,
722 const UChar *replacementText,
723 int32_t replacementLength,
724 UChar **destBuf,
725 int32_t *destCapacity,
726 UErrorCode *status);
727
728 inline static int32_t appendTail(URegularExpression *regexp,
729 UChar **destBuf,
730 int32_t *destCapacity,
731 UErrorCode *status);
732 };
733 U_NAMESPACE_END
734
735
736 //
737 // Call-back function for u_unescapeAt(), used when we encounter
738 // \uxxxx or \Uxxxxxxxxx escapes in the replacement text.
739 //
740 U_CDECL_BEGIN
741 static UChar U_CALLCONV
unescape_charAt(int32_t offset,void * context)742 unescape_charAt(int32_t offset, void *context) {
743 UChar c16 = ((UChar *)context)[offset];
744 return c16;
745 }
746 U_CDECL_END
747
748
749 static const UChar BACKSLASH = 0x5c;
750 static const UChar DOLLARSIGN = 0x24;
751
752 //
753 // Move a character to an output buffer, with bounds checking on the index.
754 // Index advances even if capacity is exceeded, for preflight size computations.
755 // This little sequence is used a LOT.
756 //
appendToBuf(UChar c,int32_t * idx,UChar * buf,int32_t bufCapacity)757 static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) {
758 if (*idx < bufCapacity) {
759 buf[*idx] = c;
760 }
761 (*idx)++;
762 }
763
764
765 //
766 // appendReplacement, the actual implementation.
767 //
appendReplacement(URegularExpression * regexp,const UChar * replacementText,int32_t replacementLength,UChar ** destBuf,int32_t * destCapacity,UErrorCode * status)768 int32_t RegexCImpl::appendReplacement(URegularExpression *regexp,
769 const UChar *replacementText,
770 int32_t replacementLength,
771 UChar **destBuf,
772 int32_t *destCapacity,
773 UErrorCode *status) {
774
775 // If we come in with a buffer overflow error, don't suppress the operation.
776 // A series of appendReplacements, appendTail need to correctly preflight
777 // the buffer size when an overflow happens somewhere in the middle.
778 UBool pendingBufferOverflow = FALSE;
779 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity == 0) {
780 pendingBufferOverflow = TRUE;
781 *status = U_ZERO_ERROR;
782 }
783
784 //
785 // Validate all paramters
786 //
787 if (validateRE(regexp, status) == FALSE) {
788 return 0;
789 }
790 if (replacementText == NULL || replacementLength < -1 ||
791 destCapacity == NULL || destBuf == NULL ||
792 *destBuf == NULL && *destCapacity > 0 ||
793 *destCapacity < 0) {
794 *status = U_ILLEGAL_ARGUMENT_ERROR;
795 return 0;
796 }
797
798 RegexMatcher *m = regexp->fMatcher;
799 if (m->fMatch == FALSE) {
800 *status = U_REGEX_INVALID_STATE;
801 return 0;
802 }
803
804 UChar *dest = *destBuf;
805 int32_t capacity = *destCapacity;
806 int32_t destIdx = 0;
807 int32_t i;
808
809 // If it wasn't supplied by the caller, get the length of the replacement text.
810 // TODO: slightly smarter logic in the copy loop could watch for the NUL on
811 // the fly and avoid this step.
812 if (replacementLength == -1) {
813 replacementLength = u_strlen(replacementText);
814 }
815
816 // Copy input string from the end of previous match to start of current match
817 for (i=m->fLastMatchEnd; i<m->fMatchStart; i++) {
818 appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
819 }
820
821
822
823 // scan the replacement text, looking for substitutions ($n) and \escapes.
824 int32_t replIdx = 0;
825 while (replIdx < replacementLength) {
826 UChar c = replacementText[replIdx];
827 replIdx++;
828 if (c != DOLLARSIGN && c != BACKSLASH) {
829 // Common case, no substitution, no escaping,
830 // just copy the char to the dest buf.
831 appendToBuf(c, &destIdx, dest, capacity);
832 continue;
833 }
834
835 if (c == BACKSLASH) {
836 // Backslash Escape. Copy the following char out without further checks.
837 // Note: Surrogate pairs don't need any special handling
838 // The second half wont be a '$' or a '\', and
839 // will move to the dest normally on the next
840 // loop iteration.
841 if (replIdx >= replacementLength) {
842 break;
843 }
844 c = replacementText[replIdx];
845
846 if (c==0x55/*U*/ || c==0x75/*u*/) {
847 // We have a \udddd or \Udddddddd escape sequence.
848 UChar32 escapedChar =
849 u_unescapeAt(unescape_charAt,
850 &replIdx, // Index is updated by unescapeAt
851 replacementLength, // Length of replacement text
852 (void *)replacementText);
853
854 if (escapedChar != (UChar32)0xFFFFFFFF) {
855 if (escapedChar <= 0xffff) {
856 appendToBuf((UChar)escapedChar, &destIdx, dest, capacity);
857 } else {
858 appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity);
859 appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity);
860 }
861 continue;
862 }
863 // Note: if the \u escape was invalid, just fall through and
864 // treat it as a plain \<anything> escape.
865 }
866
867 // Plain backslash escape. Just put out the escaped character.
868 appendToBuf(c, &destIdx, dest, capacity);
869
870 replIdx++;
871 continue;
872 }
873
874
875
876 // We've got a $. Pick up a capture group number if one follows.
877 // Consume at most the number of digits necessary for the largest capture
878 // number that is valid for this pattern.
879
880 int32_t numDigits = 0;
881 int32_t groupNum = 0;
882 UChar32 digitC;
883 for (;;) {
884 if (replIdx >= replacementLength) {
885 break;
886 }
887 U16_GET(replacementText, 0, replIdx, replacementLength, digitC);
888 if (u_isdigit(digitC) == FALSE) {
889 break;
890 }
891
892 U16_FWD_1(replacementText, replIdx, replacementLength);
893 groupNum=groupNum*10 + u_charDigitValue(digitC);
894 numDigits++;
895 if (numDigits >= m->fPattern->fMaxCaptureDigits) {
896 break;
897 }
898 }
899
900
901 if (numDigits == 0) {
902 // The $ didn't introduce a group number at all.
903 // Treat it as just part of the substitution text.
904 appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
905 continue;
906 }
907
908 // Finally, append the capture group data to the destination.
909 int32_t capacityRemaining = capacity - destIdx;
910 if (capacityRemaining < 0) {
911 capacityRemaining = 0;
912 }
913 destIdx += uregex_group(regexp, groupNum, dest+destIdx, capacityRemaining, status);
914 if (*status == U_BUFFER_OVERFLOW_ERROR) {
915 // Ignore buffer overflow when extracting the group. We need to
916 // continue on to get full size of the untruncated result. We will
917 // raise our own buffer overflow error at the end.
918 *status = U_ZERO_ERROR;
919 }
920
921 if (U_FAILURE(*status)) {
922 // Can fail if group number is out of range.
923 break;
924 }
925
926 }
927
928 //
929 // Nul Terminate the dest buffer if possible.
930 // Set the appropriate buffer overflow or not terminated error, if needed.
931 //
932 if (destIdx < capacity) {
933 dest[destIdx] = 0;
934 } else if (destIdx == *destCapacity) {
935 *status = U_STRING_NOT_TERMINATED_WARNING;
936 } else {
937 *status = U_BUFFER_OVERFLOW_ERROR;
938 }
939
940 //
941 // Return an updated dest buffer and capacity to the caller.
942 //
943 if (destIdx > 0 && *destCapacity > 0) {
944 if (destIdx < capacity) {
945 *destBuf += destIdx;
946 *destCapacity -= destIdx;
947 } else {
948 *destBuf += capacity;
949 *destCapacity = 0;
950 }
951 }
952
953 // If we came in with a buffer overflow, make sure we go out with one also.
954 // (A zero length match right at the end of the previous match could
955 // make this function succeed even though a previous call had overflowed the buf)
956 if (pendingBufferOverflow && U_SUCCESS(*status)) {
957 *status = U_BUFFER_OVERFLOW_ERROR;
958 }
959
960 return destIdx;
961 }
962
//
//   appendReplacement   the actual API function,
//
966 U_CAPI int32_t U_EXPORT2
uregex_appendReplacement(URegularExpression * regexp,const UChar * replacementText,int32_t replacementLength,UChar ** destBuf,int32_t * destCapacity,UErrorCode * status)967 uregex_appendReplacement(URegularExpression *regexp,
968 const UChar *replacementText,
969 int32_t replacementLength,
970 UChar **destBuf,
971 int32_t *destCapacity,
972 UErrorCode *status) {
973 return RegexCImpl::appendReplacement(
974 regexp, replacementText, replacementLength,destBuf, destCapacity, status);
975 }
976
977
978 //------------------------------------------------------------------------------
979 //
980 // uregex_appendTail
981 //
982 //------------------------------------------------------------------------------
appendTail(URegularExpression * regexp,UChar ** destBuf,int32_t * destCapacity,UErrorCode * status)983 int32_t RegexCImpl::appendTail(URegularExpression *regexp,
984 UChar **destBuf,
985 int32_t *destCapacity,
986 UErrorCode *status) {
987
988 // If we come in with a buffer overflow error, don't suppress the operation.
989 // A series of appendReplacements, appendTail need to correctly preflight
990 // the buffer size when an overflow happens somewhere in the middle.
991 UBool pendingBufferOverflow = FALSE;
992 if (*status == U_BUFFER_OVERFLOW_ERROR && *destCapacity == 0) {
993 pendingBufferOverflow = TRUE;
994 *status = U_ZERO_ERROR;
995 }
996
997 if (validateRE(regexp, status) == FALSE) {
998 return 0;
999 }
1000 if (destCapacity == NULL || destBuf == NULL ||
1001 *destBuf == NULL && *destCapacity > 0 ||
1002 *destCapacity < 0) {
1003 *status = U_ILLEGAL_ARGUMENT_ERROR;
1004 return 0;
1005 }
1006
1007 RegexMatcher *m = regexp->fMatcher;
1008
1009 int32_t srcIdx;
1010 if (m->fMatch) {
1011 // The most recent call to find() succeeded.
1012 srcIdx = m->fMatchEnd;
1013 } else {
1014 // The last call to find() on this matcher failed().
1015 // Look back to the end of the last find() that succeeded for src index.
1016 srcIdx = m->fLastMatchEnd;
1017 if (srcIdx == -1) {
1018 // There has been no successful match with this matcher.
1019 // We want to copy the whole string.
1020 srcIdx = 0;
1021 }
1022 }
1023
1024 int32_t destIdx = 0;
1025 int32_t destCap = *destCapacity;
1026 UChar *dest = *destBuf;
1027
1028 for (;;) {
1029 if (srcIdx == regexp->fTextLength) {
1030 break;
1031 }
1032 UChar c = regexp->fText[srcIdx];
1033 if (c == 0 && regexp->fTextLength == -1) {
1034 break;
1035 }
1036 if (destIdx < destCap) {
1037 dest[destIdx] = c;
1038 } else {
1039 // We've overflowed the dest buffer.
1040 // If the total input string length is known, we can
1041 // compute the total buffer size needed without scanning through the string.
1042 if (regexp->fTextLength > 0) {
1043 destIdx += (regexp->fTextLength - srcIdx);
1044 break;
1045 }
1046 }
1047 srcIdx++;
1048 destIdx++;
1049 }
1050
1051 //
1052 // NUL terminate the output string, if possible, otherwise issue the
1053 // appropriate error or warning.
1054 //
1055 if (destIdx < destCap) {
1056 dest[destIdx] = 0;
1057 } else if (destIdx == destCap) {
1058 *status = U_STRING_NOT_TERMINATED_WARNING;
1059 } else {
1060 *status = U_BUFFER_OVERFLOW_ERROR;
1061 }
1062
1063 //
1064 // Update the user's buffer ptr and capacity vars to reflect the
1065 // amount used.
1066 //
1067 if (destIdx < destCap) {
1068 *destBuf += destIdx;
1069 *destCapacity -= destIdx;
1070 } else {
1071 *destBuf += destCap;
1072 *destCapacity = 0;
1073 }
1074
1075 if (pendingBufferOverflow && U_SUCCESS(*status)) {
1076 *status = U_BUFFER_OVERFLOW_ERROR;
1077 }
1078
1079 return destIdx;
1080 }
1081
1082
1083 U_CAPI int32_t U_EXPORT2
uregex_appendTail(URegularExpression * regexp,UChar ** destBuf,int32_t * destCapacity,UErrorCode * status)1084 uregex_appendTail(URegularExpression *regexp,
1085 UChar **destBuf,
1086 int32_t *destCapacity,
1087 UErrorCode *status) {
1088 return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status);
1089 }
1090
1091
1092 //------------------------------------------------------------------------------
1093 //
1094 // copyString Internal utility to copy a string to an output buffer,
1095 // while managing buffer overflow and preflight size
1096 // computation. NUL termination is added to destination,
1097 // and the NUL is counted in the output size.
1098 //
1099 //------------------------------------------------------------------------------
static void copyString(UChar        *destBuffer,    //  Destination buffer.
                       int32_t       destCapacity,  //  Total capacity of dest buffer
                       int32_t      *destIndex,     //  Index into dest buffer.  Updated on return.
                                                    //    Update not clipped to destCapacity.
                       const UChar  *srcPtr,        //  Pointer to source string
                       int32_t       srcLen)        //  Source string len.
{
    int32_t       outPos    = *destIndex;   // Where the next UChar would be stored.
    int32_t       remaining = srcLen;       // Source UChars not yet accounted for.
    const UChar  *next      = srcPtr;       // Next source UChar to copy.

    // Store characters for as long as there is both input and room for it.
    while (remaining > 0 && outPos < destCapacity) {
        destBuffer[outPos] = *next;
        ++next;
        ++outPos;
        --remaining;
    }

    // Whatever did not fit is counted but not stored, so that the caller
    //   ends up with the total size that would have been needed.
    if (remaining > 0) {
        outPos += remaining;
    }

    // Terminate the string when there is room for the NUL.  The NUL is counted
    //   in the reported index in either case.
    if (outPos < destCapacity) {
        destBuffer[outPos] = 0;
    }
    *destIndex = outPos + 1;
}
1127
1128
1129 //------------------------------------------------------------------------------
1130 //
1131 // uregex_split
1132 //
1133 //------------------------------------------------------------------------------
1134 U_CAPI int32_t U_EXPORT2
uregex_split(URegularExpression * regexp,UChar * destBuf,int32_t destCapacity,int32_t * requiredCapacity,UChar * destFields[],int32_t destFieldsCapacity,UErrorCode * status)1135 uregex_split( URegularExpression *regexp,
1136 UChar *destBuf,
1137 int32_t destCapacity,
1138 int32_t *requiredCapacity,
1139 UChar *destFields[],
1140 int32_t destFieldsCapacity,
1141 UErrorCode *status) {
1142 if (validateRE(regexp, status) == FALSE) {
1143 return 0;
1144 }
1145 if (destBuf == NULL && destCapacity > 0 ||
1146 destCapacity < 0 ||
1147 destFields == NULL ||
1148 destFieldsCapacity < 1 ) {
1149 *status = U_ILLEGAL_ARGUMENT_ERROR;
1150 return 0;
1151 }
1152
1153 //
1154 // Reset for the input text
1155 //
1156 regexp->fMatcher->reset();
1157 int32_t inputLen = regexp->fTextString.length();
1158 int32_t nextOutputStringStart = 0;
1159 if (inputLen == 0) {
1160 return 0;
1161 }
1162
1163
1164 //
1165 // Loop through the input text, searching for the delimiter pattern
1166 //
1167 int32_t i; // Index of the field being processed.
1168 int32_t destIdx = 0; // Next available position in destBuf;
1169 int32_t numCaptureGroups = regexp->fMatcher->groupCount();
1170 for (i=0; ; i++) {
1171 if (i>=destFieldsCapacity-1) {
1172 // There are one or zero output string left.
1173 // Fill the last output string with whatever is left from the input, then exit the loop.
1174 // ( i will be == destFieldsCapacity if we filled the output array while processing
1175 // capture groups of the delimiter expression, in which case we will discard the
1176 // last capture group saved in favor of the unprocessed remainder of the
1177 // input string.)
1178 int32_t remainingLength = inputLen-nextOutputStringStart;
1179 if (remainingLength > 0) {
1180 }
1181 if (i >= destFieldsCapacity) {
1182 // No fields are left. Recycle the last one for holding the trailing part of
1183 // the input string.
1184 i = destFieldsCapacity-1;
1185 destIdx = (int32_t)(destFields[i] - destFields[0]);
1186 }
1187
1188 destFields[i] = &destBuf[destIdx];
1189 copyString(destBuf, destCapacity, &destIdx,
1190 ®exp->fText[nextOutputStringStart], remainingLength);
1191 break;
1192 }
1193
1194 if (regexp->fMatcher->find()) {
1195 // We found another delimiter. Move everything from where we started looking
1196 // up until the start of the delimiter into the next output string.
1197 int32_t fieldLen = regexp->fMatcher->start(*status) - nextOutputStringStart;
1198 destFields[i] = &destBuf[destIdx];
1199 copyString(destBuf, destCapacity, &destIdx,
1200 ®exp->fText[nextOutputStringStart], fieldLen);
1201 nextOutputStringStart = regexp->fMatcher->end(*status);
1202
1203 // If the delimiter pattern has capturing parentheses, the captured
1204 // text goes out into the next n destination strings.
1205 int32_t groupNum;
1206 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
1207 // If we've run out of output string slots, bail out.
1208 if (i==destFieldsCapacity-1) {
1209 break;
1210 }
1211 i++;
1212
1213 // Set up to extract the capture group contents into the dest buffer.
1214 UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow
1215 // error while extracting this group.
1216 int32_t remainingCapacity = destCapacity - destIdx;
1217 if (remainingCapacity < 0) {
1218 remainingCapacity = 0;
1219 }
1220 destFields[i] = &destBuf[destIdx];
1221 int32_t t = uregex_group(regexp, groupNum, destFields[i], remainingCapacity, &tStatus);
1222 destIdx += t + 1; // Record the space used in the output string buffer.
1223 // +1 for the NUL that terminates the string.
1224 }
1225
1226 if (nextOutputStringStart == inputLen) {
1227 // The delimiter was at the end of the string. We're done.
1228 break;
1229 }
1230
1231 }
1232 else
1233 {
1234 // We ran off the end of the input while looking for the next delimiter.
1235 // All the remaining text goes into the current output string.
1236 destFields[i] = &destBuf[destIdx];
1237 copyString(destBuf, destCapacity, &destIdx,
1238 ®exp->fText[nextOutputStringStart], inputLen-nextOutputStringStart);
1239 break;
1240 }
1241 }
1242
1243 // Zero out any unused portion of the destFields array
1244 int j;
1245 for (j=i+1; j<destFieldsCapacity; j++) {
1246 destFields[j] = NULL;
1247 }
1248
1249 if (requiredCapacity != NULL) {
1250 *requiredCapacity = destIdx;
1251 }
1252 if (destIdx > destCapacity) {
1253 *status = U_BUFFER_OVERFLOW_ERROR;
1254 }
1255 return i+1;
1256 }
1257
1258
1259 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1260
1261