• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *   Copyright (C) 2004-2007, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 *   file name:  regex.cpp
7 */
8 
9 #include "unicode/utypes.h"
10 
11 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
12 
13 #include "unicode/regex.h"
14 #include "unicode/uregex.h"
15 #include "unicode/unistr.h"
16 #include "unicode/ustring.h"
17 #include "unicode/uchar.h"
18 #include "unicode/uobject.h"
19 #include "umutex.h"
20 #include "uassert.h"
21 #include "cmemory.h"
22 
23 U_NAMESPACE_USE
24 
25 struct URegularExpression: public UMemory {
26 public:
27     URegularExpression();
28     ~URegularExpression();
29     int32_t           fMagic;
30     RegexPattern     *fPat;
31     int32_t          *fPatRefCount;
32     UChar            *fPatString;
33     int32_t           fPatStringLen;
34     RegexMatcher     *fMatcher;
35     const UChar      *fText;         // Text from setText()
36     int32_t           fTextLength;   // Length provided by user with setText(), which
37                                      //  may be -1.
38 
39     UnicodeString     fTextString;   // The setText(text) is wrapped into a UnicodeString.
40                                      // TODO: regexp engine should not depend on UnicodeString.
41 };
42 
// Value kept in URegularExpression::fMagic; the four bytes spell "rexp" in ASCII.
static const int32_t REXP_MAGIC = 0x72657870;
44 
URegularExpression()45 URegularExpression::URegularExpression() {
46     fMagic        = REXP_MAGIC;
47     fPat          = NULL;
48     fPatRefCount  = NULL;
49     fPatString    = NULL;
50     fPatStringLen = 0;
51     fMatcher      = NULL;
52     fText         = NULL;
53     fTextLength   = 0;
54 }
55 
~URegularExpression()56 URegularExpression::~URegularExpression() {
57     delete fMatcher;
58     fMatcher = NULL;
59     if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) {
60         delete fPat;
61         uprv_free(fPatString);
62         uprv_free(fPatRefCount);
63     }
64     fMagic = 0;
65 }
66 
67 //----------------------------------------------------------------------------------------
68 //
69 //   validateRE    Do boilerplate style checks on API function parameters.
70 //                 Return TRUE if they look OK.
71 //----------------------------------------------------------------------------------------
validateRE(const URegularExpression * re,UErrorCode * status,UBool requiresText=TRUE)72 static UBool validateRE(const URegularExpression *re, UErrorCode *status, UBool requiresText = TRUE) {
73     if (U_FAILURE(*status)) {
74         return FALSE;
75     }
76     if (re == NULL || re->fMagic != REXP_MAGIC) {
77         // U_ASSERT(FALSE);
78         *status = U_ILLEGAL_ARGUMENT_ERROR;
79         return FALSE;
80     }
81     if (requiresText && re->fText == NULL) {
82         *status = U_REGEX_INVALID_STATE;
83         return FALSE;
84     }
85     return TRUE;
86 }
87 
88 //----------------------------------------------------------------------------------------
89 //
90 //    uregex_open
91 //
92 //----------------------------------------------------------------------------------------
93 U_CAPI URegularExpression *  U_EXPORT2
uregex_open(const UChar * pattern,int32_t patternLength,uint32_t flags,UParseError * pe,UErrorCode * status)94 uregex_open( const  UChar          *pattern,
95                     int32_t         patternLength,
96                     uint32_t        flags,
97                     UParseError    *pe,
98                     UErrorCode     *status) {
99 
100     if (U_FAILURE(*status)) {
101         return NULL;
102     }
103     if (pattern == NULL || patternLength < -1 || patternLength == 0) {
104         *status = U_ILLEGAL_ARGUMENT_ERROR;
105         return NULL;
106     }
107     int32_t actualPatLen = patternLength;
108     if (actualPatLen == -1) {
109         actualPatLen = u_strlen(pattern);
110     }
111 
112     URegularExpression *re     = new URegularExpression;
113     int32_t            *refC   = (int32_t *)uprv_malloc(sizeof(int32_t));
114     UChar              *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1));
115     if (re == NULL || refC == NULL || patBuf == NULL) {
116         *status = U_MEMORY_ALLOCATION_ERROR;
117         delete re;
118         uprv_free(refC);
119         uprv_free(patBuf);
120         return NULL;
121     }
122     re->fPatRefCount = refC;
123     *re->fPatRefCount = 1;
124 
125     //
126     // Make a copy of the pattern string, so we can return it later if asked.
127     //    For compiling the pattern, we will use a read-only-aliased UnicodeString
128     //    of this local copy, to avoid making even more copies.
129     //
130     re->fPatString    = patBuf;
131     re->fPatStringLen = patternLength;
132     u_memcpy(patBuf, pattern, actualPatLen);
133     patBuf[actualPatLen] = 0;
134     UnicodeString  patString(patternLength==-1, patBuf, patternLength);
135 
136     //
137     // Compile the pattern
138     //
139     if (pe != NULL) {
140         re->fPat = RegexPattern::compile(patString, flags, *pe, *status);
141     } else {
142         re->fPat = RegexPattern::compile(patString, flags, *status);
143     }
144     if (U_FAILURE(*status)) {
145         goto ErrorExit;
146     }
147 
148     //
149     // Create the matcher object
150     //
151     re->fMatcher = re->fPat->matcher(*status);
152     if (U_SUCCESS(*status)) {
153         return re;
154     }
155 
156 ErrorExit:
157     delete re;
158     return NULL;
159 
160 }
161 
162 //----------------------------------------------------------------------------------------
163 //
164 //    uregex_close
165 //
166 //----------------------------------------------------------------------------------------
167 U_CAPI void  U_EXPORT2
uregex_close(URegularExpression * re)168 uregex_close(URegularExpression  *re) {
169     UErrorCode  status = U_ZERO_ERROR;
170     if (validateRE(re, &status, FALSE) == FALSE) {
171         return;
172     }
173     delete re;
174 }
175 
176 
177 //----------------------------------------------------------------------------------------
178 //
179 //    uregex_clone
180 //
181 //----------------------------------------------------------------------------------------
182 U_CAPI URegularExpression * U_EXPORT2
uregex_clone(const URegularExpression * source,UErrorCode * status)183 uregex_clone(const URegularExpression *source, UErrorCode *status)  {
184     if (validateRE(source, status, FALSE) == FALSE) {
185         return NULL;
186     }
187 
188     URegularExpression *clone = new URegularExpression;
189     if (clone == NULL) {
190         *status = U_MEMORY_ALLOCATION_ERROR;
191         return NULL;
192     }
193 
194     clone->fMatcher = source->fPat->matcher(*status);
195     if (U_FAILURE(*status)) {
196         delete clone;
197         return NULL;
198     }
199     if (clone == NULL) {
200         *status = U_MEMORY_ALLOCATION_ERROR;
201         return NULL;
202     }
203 
204     clone->fPat          = source->fPat;
205     clone->fPatRefCount  = source->fPatRefCount;
206     clone->fPatString    = source->fPatString;
207     clone->fPatStringLen = source->fPatStringLen;
208     umtx_atomic_inc(source->fPatRefCount);
209     // Note:  fText is not cloned.
210 
211     return clone;
212 }
213 
214 
215 
216 
217 //------------------------------------------------------------------------------
218 //
219 //    uregex_pattern
220 //
221 //------------------------------------------------------------------------------
222 U_CAPI const UChar * U_EXPORT2
uregex_pattern(const URegularExpression * regexp,int32_t * patLength,UErrorCode * status)223 uregex_pattern(const  URegularExpression *regexp,
224                int32_t            *patLength,
225                UErrorCode         *status)  {
226 
227     if (validateRE(regexp, status, FALSE) == FALSE) {
228         return NULL;
229     }
230     if (patLength != NULL) {
231         *patLength = regexp->fPatStringLen;
232     }
233     return regexp->fPatString;
234 }
235 
236 
237 //------------------------------------------------------------------------------
238 //
239 //    uregex_flags
240 //
241 //------------------------------------------------------------------------------
242 U_CAPI int32_t U_EXPORT2
uregex_flags(const URegularExpression * regexp,UErrorCode * status)243 uregex_flags(const URegularExpression *regexp, UErrorCode *status)  {
244     if (validateRE(regexp, status, FALSE) == FALSE) {
245         return 0;
246     }
247     int32_t flags = regexp->fPat->flags();
248     return flags;
249 }
250 
251 
252 //------------------------------------------------------------------------------
253 //
254 //    uregex_setText
255 //
256 //------------------------------------------------------------------------------
257 U_CAPI void U_EXPORT2
uregex_setText(URegularExpression * regexp,const UChar * text,int32_t textLength,UErrorCode * status)258 uregex_setText(URegularExpression *regexp,
259                const UChar        *text,
260                int32_t             textLength,
261                UErrorCode         *status)  {
262     if (validateRE(regexp, status, FALSE) == FALSE) {
263         return;
264     }
265     if (text == NULL || textLength < -1) {
266         *status = U_ILLEGAL_ARGUMENT_ERROR;
267         return;
268     }
269     regexp->fText       = text;
270     regexp->fTextLength = textLength;
271     UBool isTerminated  = (textLength == -1);
272 
273     regexp->fTextString.setTo(isTerminated, text, textLength);
274     regexp->fMatcher->reset(regexp->fTextString);
275 }
276 
277 
278 
279 //------------------------------------------------------------------------------
280 //
281 //    uregex_getText
282 //
283 //------------------------------------------------------------------------------
284 U_CAPI const UChar * U_EXPORT2
uregex_getText(URegularExpression * regexp,int32_t * textLength,UErrorCode * status)285 uregex_getText(URegularExpression *regexp,
286                int32_t            *textLength,
287                UErrorCode         *status)  {
288     if (validateRE(regexp, status, FALSE) == FALSE) {
289         return NULL;
290     }
291     if (textLength != NULL) {
292         *textLength = regexp->fTextLength;
293     }
294     return regexp->fText;
295 }
296 
297 
298 //------------------------------------------------------------------------------
299 //
300 //    uregex_matches
301 //
302 //------------------------------------------------------------------------------
303 U_CAPI UBool U_EXPORT2
uregex_matches(URegularExpression * regexp,int32_t startIndex,UErrorCode * status)304 uregex_matches(URegularExpression *regexp,
305                 int32_t            startIndex,
306                 UErrorCode        *status)  {
307     UBool result = FALSE;
308     if (validateRE(regexp, status) == FALSE) {
309         return result;
310     }
311     if (startIndex == -1) {
312         result = regexp->fMatcher->matches(*status);
313     } else {
314         result = regexp->fMatcher->matches(startIndex, *status);
315     }
316     return result;
317 }
318 
319 
320 
321 //------------------------------------------------------------------------------
322 //
323 //    uregex_lookingAt
324 //
325 //------------------------------------------------------------------------------
326 U_CAPI UBool U_EXPORT2
uregex_lookingAt(URegularExpression * regexp,int32_t startIndex,UErrorCode * status)327 uregex_lookingAt(URegularExpression *regexp,
328                  int32_t             startIndex,
329                  UErrorCode         *status)  {
330     UBool result = FALSE;
331     if (validateRE(regexp, status) == FALSE) {
332         return result;
333     }
334     if (startIndex == -1) {
335         result = regexp->fMatcher->lookingAt(*status);
336     } else {
337         result = regexp->fMatcher->lookingAt(startIndex, *status);
338     }
339     return result;
340 }
341 
342 
343 
344 //------------------------------------------------------------------------------
345 //
346 //    uregex_find
347 //
348 //------------------------------------------------------------------------------
349 U_CAPI UBool U_EXPORT2
uregex_find(URegularExpression * regexp,int32_t startIndex,UErrorCode * status)350 uregex_find(URegularExpression *regexp,
351             int32_t             startIndex,
352             UErrorCode         *status)  {
353     UBool result = FALSE;
354     if (validateRE(regexp, status) == FALSE) {
355         return result;
356     }
357     if (startIndex == -1) {
358         regexp->fMatcher->resetPreserveRegion();
359         result = regexp->fMatcher->find();
360     } else {
361         result = regexp->fMatcher->find(startIndex, *status);
362     }
363     return result;
364 }
365 
366 //------------------------------------------------------------------------------
367 //
368 //    uregex_findNext
369 //
370 //------------------------------------------------------------------------------
371 U_CAPI UBool U_EXPORT2
uregex_findNext(URegularExpression * regexp,UErrorCode * status)372 uregex_findNext(URegularExpression *regexp,
373                 UErrorCode         *status)  {
374     if (validateRE(regexp, status) == FALSE) {
375         return FALSE;
376     }
377     UBool result = regexp->fMatcher->find();
378     return result;
379 }
380 
381 //------------------------------------------------------------------------------
382 //
383 //    uregex_groupCount
384 //
385 //------------------------------------------------------------------------------
386 U_CAPI int32_t U_EXPORT2
uregex_groupCount(URegularExpression * regexp,UErrorCode * status)387 uregex_groupCount(URegularExpression *regexp,
388                   UErrorCode         *status)  {
389     if (validateRE(regexp, status, FALSE) == FALSE) {
390         return 0;
391     }
392     int32_t  result = regexp->fMatcher->groupCount();
393     return result;
394 }
395 
396 
397 //------------------------------------------------------------------------------
398 //
399 //    uregex_group
400 //
401 //------------------------------------------------------------------------------
402 U_CAPI int32_t U_EXPORT2
uregex_group(URegularExpression * regexp,int32_t groupNum,UChar * dest,int32_t destCapacity,UErrorCode * status)403 uregex_group(URegularExpression *regexp,
404              int32_t             groupNum,
405              UChar              *dest,
406              int32_t             destCapacity,
407              UErrorCode          *status)  {
408     if (validateRE(regexp, status) == FALSE) {
409         return 0;
410     }
411     if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
412         *status = U_ILLEGAL_ARGUMENT_ERROR;
413         return 0;
414     }
415 
416     //
417     // Pick up the range of characters from the matcher
418     //
419     int32_t  startIx = regexp->fMatcher->start(groupNum, *status);
420     int32_t  endIx   = regexp->fMatcher->end  (groupNum, *status);
421     if (U_FAILURE(*status)) {
422         return 0;
423     }
424 
425     //
426     // Trim length based on buffer capacity
427     //
428     int32_t fullLength = endIx - startIx;
429     int32_t copyLength = fullLength;
430     if (copyLength < destCapacity) {
431         dest[copyLength] = 0;
432     } else  if (copyLength == destCapacity) {
433         *status = U_STRING_NOT_TERMINATED_WARNING;
434     } else {
435         copyLength = destCapacity;
436         *status = U_BUFFER_OVERFLOW_ERROR;
437     }
438 
439     //
440     // Copy capture group to user's buffer
441     //
442     if (copyLength > 0) {
443         u_memcpy(dest, &regexp->fText[startIx], copyLength);
444     }
445     return fullLength;
446 }
447 
448 
449 //------------------------------------------------------------------------------
450 //
451 //    uregex_start
452 //
453 //------------------------------------------------------------------------------
454 U_CAPI int32_t U_EXPORT2
uregex_start(URegularExpression * regexp,int32_t groupNum,UErrorCode * status)455 uregex_start(URegularExpression *regexp,
456              int32_t             groupNum,
457              UErrorCode          *status)  {
458     if (validateRE(regexp, status) == FALSE) {
459         return 0;
460     }
461     int32_t result = regexp->fMatcher->start(groupNum, *status);
462     return result;
463 }
464 
465 
466 //------------------------------------------------------------------------------
467 //
468 //    uregex_end
469 //
470 //------------------------------------------------------------------------------
471 U_CAPI int32_t U_EXPORT2
uregex_end(URegularExpression * regexp,int32_t groupNum,UErrorCode * status)472 uregex_end(URegularExpression   *regexp,
473            int32_t               groupNum,
474            UErrorCode           *status)  {
475     if (validateRE(regexp, status) == FALSE) {
476         return 0;
477     }
478     int32_t result = regexp->fMatcher->end(groupNum, *status);
479     return result;
480 }
481 
482 //------------------------------------------------------------------------------
483 //
484 //    uregex_reset
485 //
486 //------------------------------------------------------------------------------
487 U_CAPI void U_EXPORT2
uregex_reset(URegularExpression * regexp,int32_t index,UErrorCode * status)488 uregex_reset(URegularExpression    *regexp,
489              int32_t               index,
490              UErrorCode            *status)  {
491     if (validateRE(regexp, status) == FALSE) {
492         return;
493     }
494     regexp->fMatcher->reset(index, *status);
495 }
496 
497 
498 //------------------------------------------------------------------------------
499 //
500 //    uregex_setRegion
501 //
502 //------------------------------------------------------------------------------
503 U_CAPI void U_EXPORT2
uregex_setRegion(URegularExpression * regexp,int32_t regionStart,int32_t regionLimit,UErrorCode * status)504 uregex_setRegion(URegularExpression   *regexp,
505                  int32_t               regionStart,
506                  int32_t               regionLimit,
507                  UErrorCode           *status)  {
508     if (validateRE(regexp, status) == FALSE) {
509         return;
510     }
511     regexp->fMatcher->region(regionStart, regionLimit, *status);
512 }
513 
514 
515 //------------------------------------------------------------------------------
516 //
517 //    uregex_regionStart
518 //
519 //------------------------------------------------------------------------------
520 U_CAPI int32_t U_EXPORT2
uregex_regionStart(const URegularExpression * regexp,UErrorCode * status)521 uregex_regionStart(const  URegularExpression   *regexp,
522                           UErrorCode           *status)  {
523     if (validateRE(regexp, status) == FALSE) {
524         return 0;
525     }
526     return regexp->fMatcher->regionStart();
527 }
528 
529 
530 //------------------------------------------------------------------------------
531 //
532 //    uregex_regionEnd
533 //
534 //------------------------------------------------------------------------------
535 U_CAPI int32_t U_EXPORT2
uregex_regionEnd(const URegularExpression * regexp,UErrorCode * status)536 uregex_regionEnd(const  URegularExpression   *regexp,
537                         UErrorCode           *status)  {
538     if (validateRE(regexp, status) == FALSE) {
539         return 0;
540     }
541     return regexp->fMatcher->regionEnd();
542 }
543 
544 
545 //------------------------------------------------------------------------------
546 //
547 //    uregex_hasTransparentBounds
548 //
549 //------------------------------------------------------------------------------
550 U_CAPI UBool U_EXPORT2
uregex_hasTransparentBounds(const URegularExpression * regexp,UErrorCode * status)551 uregex_hasTransparentBounds(const  URegularExpression   *regexp,
552                                    UErrorCode           *status)  {
553     if (validateRE(regexp, status) == FALSE) {
554         return FALSE;
555     }
556     return regexp->fMatcher->hasTransparentBounds();
557 }
558 
559 
560 //------------------------------------------------------------------------------
561 //
562 //    uregex_useTransparentBounds
563 //
564 //------------------------------------------------------------------------------
565 U_CAPI void U_EXPORT2
uregex_useTransparentBounds(URegularExpression * regexp,UBool b,UErrorCode * status)566 uregex_useTransparentBounds(URegularExpression    *regexp,
567              UBool                 b,
568              UErrorCode            *status)  {
569     if (validateRE(regexp, status) == FALSE) {
570         return;
571     }
572     regexp->fMatcher->useTransparentBounds(b);
573 }
574 
575 
576 //------------------------------------------------------------------------------
577 //
578 //    uregex_hasAnchoringBounds
579 //
580 //------------------------------------------------------------------------------
581 U_CAPI UBool U_EXPORT2
uregex_hasAnchoringBounds(const URegularExpression * regexp,UErrorCode * status)582 uregex_hasAnchoringBounds(const  URegularExpression   *regexp,
583                                    UErrorCode           *status)  {
584     if (validateRE(regexp, status) == FALSE) {
585         return FALSE;
586     }
587     return regexp->fMatcher->hasAnchoringBounds();
588 }
589 
590 
591 //------------------------------------------------------------------------------
592 //
593 //    uregex_useAnchoringBounds
594 //
595 //------------------------------------------------------------------------------
596 U_CAPI void U_EXPORT2
uregex_useAnchoringBounds(URegularExpression * regexp,UBool b,UErrorCode * status)597 uregex_useAnchoringBounds(URegularExpression    *regexp,
598              UBool                 b,
599              UErrorCode            *status)  {
600     if (validateRE(regexp, status) == FALSE) {
601         return;
602     }
603     regexp->fMatcher->useAnchoringBounds(b);
604 }
605 
606 
607 //------------------------------------------------------------------------------
608 //
609 //    uregex_hitEnd
610 //
611 //------------------------------------------------------------------------------
612 U_CAPI UBool U_EXPORT2
uregex_hitEnd(const URegularExpression * regexp,UErrorCode * status)613 uregex_hitEnd(const  URegularExpression   *regexp,
614                      UErrorCode           *status)  {
615     if (validateRE(regexp, status) == FALSE) {
616         return FALSE;
617     }
618     return regexp->fMatcher->hitEnd();
619 }
620 
621 
622 //------------------------------------------------------------------------------
623 //
624 //    uregex_requireEnd
625 //
626 //------------------------------------------------------------------------------
627 U_CAPI UBool U_EXPORT2
uregex_requireEnd(const URegularExpression * regexp,UErrorCode * status)628 uregex_requireEnd(const  URegularExpression   *regexp,
629                          UErrorCode           *status)  {
630     if (validateRE(regexp, status) == FALSE) {
631         return FALSE;
632     }
633     return regexp->fMatcher->requireEnd();
634 }
635 
636 
637 //------------------------------------------------------------------------------
638 //
639 //    uregex_replaceAll
640 //
641 //------------------------------------------------------------------------------
642 U_CAPI int32_t U_EXPORT2
uregex_replaceAll(URegularExpression * regexp,const UChar * replacementText,int32_t replacementLength,UChar * destBuf,int32_t destCapacity,UErrorCode * status)643 uregex_replaceAll(URegularExpression    *regexp,
644                   const UChar           *replacementText,
645                   int32_t                replacementLength,
646                   UChar                 *destBuf,
647                   int32_t                destCapacity,
648                   UErrorCode            *status)  {
649     if (validateRE(regexp, status) == FALSE) {
650         return 0;
651     }
652     if (replacementText == NULL || replacementLength < -1 ||
653         destBuf == NULL && destCapacity > 0 ||
654         destCapacity < 0) {
655         *status = U_ILLEGAL_ARGUMENT_ERROR;
656         return 0;
657     }
658 
659     int32_t   len = 0;
660     uregex_reset(regexp, 0, status);
661     while (uregex_findNext(regexp, status)) {
662         len += uregex_appendReplacement(regexp, replacementText, replacementLength,
663                                         &destBuf, &destCapacity, status);
664     }
665     len += uregex_appendTail(regexp, &destBuf, &destCapacity, status);
666 
667     return len;
668 }
669 
670 
671 //------------------------------------------------------------------------------
672 //
673 //    uregex_replaceFirst
674 //
675 //------------------------------------------------------------------------------
676 U_CAPI int32_t U_EXPORT2
uregex_replaceFirst(URegularExpression * regexp,const UChar * replacementText,int32_t replacementLength,UChar * destBuf,int32_t destCapacity,UErrorCode * status)677 uregex_replaceFirst(URegularExpression  *regexp,
678                     const UChar         *replacementText,
679                     int32_t              replacementLength,
680                     UChar               *destBuf,
681                     int32_t              destCapacity,
682                     UErrorCode          *status)  {
683     if (validateRE(regexp, status) == FALSE) {
684         return 0;
685     }
686     if (replacementText == NULL || replacementLength < -1 ||
687         destBuf == NULL && destCapacity > 0 ||
688         destCapacity < 0) {
689         *status = U_ILLEGAL_ARGUMENT_ERROR;
690         return 0;
691     }
692 
693     int32_t   len = 0;
694     UBool     findSucceeded;
695     uregex_reset(regexp, 0, status);
696     findSucceeded = uregex_find(regexp, 0, status);
697     if (findSucceeded) {
698         len = uregex_appendReplacement(regexp, replacementText, replacementLength,
699                                        &destBuf, &destCapacity, status);
700     }
701     len += uregex_appendTail(regexp, &destBuf, &destCapacity, status);
702 
703     return len;
704 }
705 
706 
707 //------------------------------------------------------------------------------
708 //
709 //    uregex_appendReplacement
710 //
711 //------------------------------------------------------------------------------
712 
713 
714 //
715 //  Dummy class, because these functions need to be friends of class RegexMatcher,
716 //               and stand-alone C functions don't work as friends
717 //
718 U_NAMESPACE_BEGIN
719 class RegexCImpl {
720  public:
721    inline static  int32_t appendReplacement(URegularExpression    *regexp,
722                       const UChar           *replacementText,
723                       int32_t                replacementLength,
724                       UChar                **destBuf,
725                       int32_t               *destCapacity,
726                       UErrorCode            *status);
727 
728    inline static int32_t appendTail(URegularExpression    *regexp,
729                   UChar                **destBuf,
730                   int32_t               *destCapacity,
731                   UErrorCode            *status);
732 };
733 U_NAMESPACE_END
734 
735 
736 //
737 //  Call-back function for u_unescapeAt(), used when we encounter
738 //    \uxxxx or \Uxxxxxxxxx escapes in the replacement text.
739 //
740 U_CDECL_BEGIN
741 static UChar U_CALLCONV
unescape_charAt(int32_t offset,void * context)742 unescape_charAt(int32_t offset, void *context) {
743     UChar c16 = ((UChar *)context)[offset];
744     return c16;
745 }
746 U_CDECL_END
747 
748 
749 static const UChar BACKSLASH  = 0x5c;
750 static const UChar DOLLARSIGN = 0x24;
751 
752 //
753 //  Move a character to an output buffer, with bounds checking on the index.
754 //      Index advances even if capacity is exceeded, for preflight size computations.
755 //      This little sequence is used a LOT.
756 //
appendToBuf(UChar c,int32_t * idx,UChar * buf,int32_t bufCapacity)757 static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) {
758     if (*idx < bufCapacity) {
759         buf[*idx] = c;
760     }
761     (*idx)++;
762 }
763 
764 
765 //
766 //  appendReplacement, the actual implementation.
767 //
appendReplacement(URegularExpression * regexp,const UChar * replacementText,int32_t replacementLength,UChar ** destBuf,int32_t * destCapacity,UErrorCode * status)768 int32_t RegexCImpl::appendReplacement(URegularExpression    *regexp,
769                   const UChar           *replacementText,
770                   int32_t                replacementLength,
771                   UChar                **destBuf,
772                   int32_t               *destCapacity,
773                   UErrorCode            *status)  {
774 
775     // If we come in with a buffer overflow error, don't suppress the operation.
776     //  A series of appendReplacements, appendTail need to correctly preflight
777     //  the buffer size when an overflow happens somewhere in the middle.
778     UBool pendingBufferOverflow = FALSE;
779     if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity == 0) {
780         pendingBufferOverflow = TRUE;
781         *status = U_ZERO_ERROR;
782     }
783 
784     //
785     // Validate all paramters
786     //
787     if (validateRE(regexp, status) == FALSE) {
788         return 0;
789     }
790     if (replacementText == NULL || replacementLength < -1 ||
791         destCapacity == NULL || destBuf == NULL ||
792         *destBuf == NULL && *destCapacity > 0 ||
793         *destCapacity < 0) {
794         *status = U_ILLEGAL_ARGUMENT_ERROR;
795         return 0;
796     }
797 
798     RegexMatcher *m = regexp->fMatcher;
799     if (m->fMatch == FALSE) {
800         *status = U_REGEX_INVALID_STATE;
801         return 0;
802     }
803 
804     UChar    *dest             = *destBuf;
805     int32_t   capacity         = *destCapacity;
806     int32_t   destIdx          =  0;
807     int32_t   i;
808 
809     // If it wasn't supplied by the caller,  get the length of the replacement text.
810     //   TODO:  slightly smarter logic in the copy loop could watch for the NUL on
811     //          the fly and avoid this step.
812     if (replacementLength == -1) {
813         replacementLength = u_strlen(replacementText);
814     }
815 
816     // Copy input string from the end of previous match to start of current match
817     for (i=m->fLastMatchEnd; i<m->fMatchStart; i++) {
818         appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
819     }
820 
821 
822 
823     // scan the replacement text, looking for substitutions ($n) and \escapes.
824     int32_t  replIdx = 0;
825     while (replIdx < replacementLength) {
826         UChar  c = replacementText[replIdx];
827         replIdx++;
828         if (c != DOLLARSIGN && c != BACKSLASH) {
829             // Common case, no substitution, no escaping,
830             //  just copy the char to the dest buf.
831             appendToBuf(c, &destIdx, dest, capacity);
832             continue;
833         }
834 
835         if (c == BACKSLASH) {
836             // Backslash Escape.  Copy the following char out without further checks.
837             //                    Note:  Surrogate pairs don't need any special handling
838             //                           The second half wont be a '$' or a '\', and
839             //                           will move to the dest normally on the next
840             //                           loop iteration.
841             if (replIdx >= replacementLength) {
842                 break;
843             }
844             c = replacementText[replIdx];
845 
846             if (c==0x55/*U*/ || c==0x75/*u*/) {
847                 // We have a \udddd or \Udddddddd escape sequence.
848                 UChar32 escapedChar =
849                     u_unescapeAt(unescape_charAt,
850                        &replIdx,                   // Index is updated by unescapeAt
851                        replacementLength,          // Length of replacement text
852                        (void *)replacementText);
853 
854                 if (escapedChar != (UChar32)0xFFFFFFFF) {
855                     if (escapedChar <= 0xffff) {
856                         appendToBuf((UChar)escapedChar, &destIdx, dest, capacity);
857                     } else {
858                         appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity);
859                         appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity);
860                     }
861                     continue;
862                 }
863                 // Note:  if the \u escape was invalid, just fall through and
864                 //        treat it as a plain \<anything> escape.
865             }
866 
867             // Plain backslash escape.  Just put out the escaped character.
868             appendToBuf(c, &destIdx, dest, capacity);
869 
870             replIdx++;
871             continue;
872         }
873 
874 
875 
876         // We've got a $.  Pick up a capture group number if one follows.
877         // Consume at most the number of digits necessary for the largest capture
878         // number that is valid for this pattern.
879 
880         int32_t numDigits = 0;
881         int32_t groupNum  = 0;
882         UChar32 digitC;
883         for (;;) {
884             if (replIdx >= replacementLength) {
885                 break;
886             }
887             U16_GET(replacementText, 0, replIdx, replacementLength, digitC);
888             if (u_isdigit(digitC) == FALSE) {
889                 break;
890             }
891 
892             U16_FWD_1(replacementText, replIdx, replacementLength);
893             groupNum=groupNum*10 + u_charDigitValue(digitC);
894             numDigits++;
895             if (numDigits >= m->fPattern->fMaxCaptureDigits) {
896                 break;
897             }
898         }
899 
900 
901         if (numDigits == 0) {
902             // The $ didn't introduce a group number at all.
903             // Treat it as just part of the substitution text.
904             appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
905             continue;
906         }
907 
908         // Finally, append the capture group data to the destination.
909         int32_t  capacityRemaining = capacity - destIdx;
910         if (capacityRemaining < 0) {
911             capacityRemaining = 0;
912         }
913         destIdx += uregex_group(regexp, groupNum, dest+destIdx, capacityRemaining, status);
914         if (*status == U_BUFFER_OVERFLOW_ERROR) {
915             // Ignore buffer overflow when extracting the group.  We need to
916             //   continue on to get full size of the untruncated result.  We will
917             //   raise our own buffer overflow error at the end.
918             *status = U_ZERO_ERROR;
919         }
920 
921         if (U_FAILURE(*status)) {
922             // Can fail if group number is out of range.
923             break;
924         }
925 
926     }
927 
928     //
929     //  Nul Terminate the dest buffer if possible.
930     //  Set the appropriate buffer overflow or not terminated error, if needed.
931     //
932     if (destIdx < capacity) {
933         dest[destIdx] = 0;
934     } else if (destIdx == *destCapacity) {
935         *status = U_STRING_NOT_TERMINATED_WARNING;
936     } else {
937         *status = U_BUFFER_OVERFLOW_ERROR;
938     }
939 
940     //
941     // Return an updated dest buffer and capacity to the caller.
942     //
943     if (destIdx > 0 &&  *destCapacity > 0) {
944         if (destIdx < capacity) {
945             *destBuf      += destIdx;
946             *destCapacity -= destIdx;
947         } else {
948             *destBuf      += capacity;
949             *destCapacity =  0;
950         }
951     }
952 
953     // If we came in with a buffer overflow, make sure we go out with one also.
954     //   (A zero length match right at the end of the previous match could
955     //    make this function succeed even though a previous call had overflowed the buf)
956     if (pendingBufferOverflow && U_SUCCESS(*status)) {
957         *status = U_BUFFER_OVERFLOW_ERROR;
958     }
959 
960     return destIdx;
961 }
962 
963 //
964 //   appendReplacement   the acutal API function,
965 //
966 U_CAPI int32_t U_EXPORT2
uregex_appendReplacement(URegularExpression * regexp,const UChar * replacementText,int32_t replacementLength,UChar ** destBuf,int32_t * destCapacity,UErrorCode * status)967 uregex_appendReplacement(URegularExpression    *regexp,
968                   const UChar           *replacementText,
969                   int32_t                replacementLength,
970                   UChar                **destBuf,
971                   int32_t               *destCapacity,
972                   UErrorCode            *status) {
973     return RegexCImpl::appendReplacement(
974         regexp, replacementText, replacementLength,destBuf, destCapacity, status);
975 }
976 
977 
978 //------------------------------------------------------------------------------
979 //
980 //    uregex_appendTail
981 //
982 //------------------------------------------------------------------------------
appendTail(URegularExpression * regexp,UChar ** destBuf,int32_t * destCapacity,UErrorCode * status)983 int32_t RegexCImpl::appendTail(URegularExpression    *regexp,
984                   UChar                **destBuf,
985                   int32_t               *destCapacity,
986                   UErrorCode            *status)  {
987 
988     // If we come in with a buffer overflow error, don't suppress the operation.
989     //  A series of appendReplacements, appendTail need to correctly preflight
990     //  the buffer size when an overflow happens somewhere in the middle.
991     UBool pendingBufferOverflow = FALSE;
992     if (*status == U_BUFFER_OVERFLOW_ERROR && *destCapacity == 0) {
993         pendingBufferOverflow = TRUE;
994         *status = U_ZERO_ERROR;
995     }
996 
997     if (validateRE(regexp, status) == FALSE) {
998         return 0;
999     }
1000     if (destCapacity == NULL || destBuf == NULL ||
1001         *destBuf == NULL && *destCapacity > 0 ||
1002         *destCapacity < 0) {
1003         *status = U_ILLEGAL_ARGUMENT_ERROR;
1004         return 0;
1005     }
1006 
1007     RegexMatcher *m = regexp->fMatcher;
1008 
1009     int32_t  srcIdx;
1010     if (m->fMatch) {
1011         // The most recent call to find() succeeded.
1012         srcIdx = m->fMatchEnd;
1013     } else {
1014         // The last call to find() on this matcher failed().
1015         //   Look back to the end of the last find() that succeeded for src index.
1016         srcIdx = m->fLastMatchEnd;
1017         if (srcIdx == -1)  {
1018             // There has been no successful match with this matcher.
1019             //   We want to copy the whole string.
1020             srcIdx = 0;
1021         }
1022     }
1023 
1024     int32_t  destIdx     = 0;
1025     int32_t  destCap     = *destCapacity;
1026     UChar    *dest       = *destBuf;
1027 
1028     for (;;) {
1029         if (srcIdx == regexp->fTextLength) {
1030             break;
1031         }
1032         UChar c = regexp->fText[srcIdx];
1033         if (c == 0 && regexp->fTextLength == -1) {
1034             break;
1035         }
1036         if (destIdx < destCap) {
1037             dest[destIdx] = c;
1038         } else {
1039             // We've overflowed the dest buffer.
1040             //  If the total input string length is known, we can
1041             //    compute the total buffer size needed without scanning through the string.
1042             if (regexp->fTextLength > 0) {
1043                 destIdx += (regexp->fTextLength - srcIdx);
1044                 break;
1045             }
1046         }
1047         srcIdx++;
1048         destIdx++;
1049     }
1050 
1051     //
1052     //  NUL terminate the output string, if possible, otherwise issue the
1053     //   appropriate error or warning.
1054     //
1055     if (destIdx < destCap) {
1056         dest[destIdx] = 0;
1057     } else  if (destIdx == destCap) {
1058         *status = U_STRING_NOT_TERMINATED_WARNING;
1059     } else {
1060         *status = U_BUFFER_OVERFLOW_ERROR;
1061     }
1062 
1063     //
1064     // Update the user's buffer ptr and capacity vars to reflect the
1065     //   amount used.
1066     //
1067     if (destIdx < destCap) {
1068         *destBuf      += destIdx;
1069         *destCapacity -= destIdx;
1070     } else {
1071         *destBuf      += destCap;
1072         *destCapacity  = 0;
1073     }
1074 
1075     if (pendingBufferOverflow && U_SUCCESS(*status)) {
1076         *status = U_BUFFER_OVERFLOW_ERROR;
1077     }
1078 
1079     return destIdx;
1080 }
1081 
1082 
1083 U_CAPI int32_t U_EXPORT2
uregex_appendTail(URegularExpression * regexp,UChar ** destBuf,int32_t * destCapacity,UErrorCode * status)1084 uregex_appendTail(URegularExpression    *regexp,
1085                   UChar                **destBuf,
1086                   int32_t               *destCapacity,
1087                   UErrorCode            *status)  {
1088     return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status);
1089 }
1090 
1091 
//------------------------------------------------------------------------------
//
//    copyString     Internal utility that appends a string to an output
//                   buffer.  Code units that do not fit are not stored but
//                   are still counted, so the updated index can be used as
//                   a preflight size.  A terminating NUL is stored when
//                   there is room for it, and is always counted in the
//                   updated index.
//
//------------------------------------------------------------------------------
static void copyString(UChar        *destBuffer,    //  Destination buffer.
                       int32_t       destCapacity,  //  Total capacity of dest buffer
                       int32_t      *destIndex,     //  Index into dest buffer.  Updated on return.
                                                    //    Update not clipped to destCapacity.
                       const UChar  *srcPtr,        //  Pointer to source string
                       int32_t       srcLen)        //  Source string len.
{
    int32_t  outPos = *destIndex;
    int32_t  srcPos = 0;

    // Store code units for as long as both source and room remain.
    while (srcPos < srcLen && outPos < destCapacity) {
        destBuffer[outPos++] = srcPtr[srcPos++];
    }

    // Anything left in the source did not fit; count it without storing.
    if (srcPos < srcLen) {
        outPos += srcLen - srcPos;
    }

    // Terminate only when the NUL itself fits, but always account for it.
    if (outPos < destCapacity) {
        destBuffer[outPos] = 0;
    }
    *destIndex = outPos + 1;
}
1127 
1128 
1129 //------------------------------------------------------------------------------
1130 //
1131 //    uregex_split
1132 //
1133 //------------------------------------------------------------------------------
1134 U_CAPI int32_t U_EXPORT2
uregex_split(URegularExpression * regexp,UChar * destBuf,int32_t destCapacity,int32_t * requiredCapacity,UChar * destFields[],int32_t destFieldsCapacity,UErrorCode * status)1135 uregex_split(   URegularExpression      *regexp,
1136                   UChar                 *destBuf,
1137                   int32_t                destCapacity,
1138                   int32_t               *requiredCapacity,
1139                   UChar                 *destFields[],
1140                   int32_t                destFieldsCapacity,
1141                   UErrorCode            *status) {
1142     if (validateRE(regexp, status) == FALSE) {
1143         return 0;
1144     }
1145     if (destBuf == NULL && destCapacity > 0 ||
1146         destCapacity < 0 ||
1147         destFields == NULL ||
1148         destFieldsCapacity < 1 ) {
1149         *status = U_ILLEGAL_ARGUMENT_ERROR;
1150         return 0;
1151     }
1152 
1153     //
1154     // Reset for the input text
1155     //
1156     regexp->fMatcher->reset();
1157     int32_t   inputLen = regexp->fTextString.length();
1158     int32_t   nextOutputStringStart = 0;
1159     if (inputLen == 0) {
1160         return 0;
1161     }
1162 
1163 
1164     //
1165     // Loop through the input text, searching for the delimiter pattern
1166     //
1167     int32_t   i;             // Index of the field being processed.
1168     int32_t   destIdx = 0;   // Next available position in destBuf;
1169     int32_t   numCaptureGroups = regexp->fMatcher->groupCount();
1170     for (i=0; ; i++) {
1171         if (i>=destFieldsCapacity-1) {
1172             // There are one or zero output string left.
1173             // Fill the last output string with whatever is left from the input, then exit the loop.
1174             //  ( i will be == destFieldsCapacity if we filled the output array while processing
1175             //    capture groups of the delimiter expression, in which case we will discard the
1176             //    last capture group saved in favor of the unprocessed remainder of the
1177             //    input string.)
1178             int32_t remainingLength = inputLen-nextOutputStringStart;
1179             if (remainingLength > 0) {
1180             }
1181             if (i >= destFieldsCapacity) {
1182                 // No fields are left.  Recycle the last one for holding the trailing part of
1183                 //   the input string.
1184                 i = destFieldsCapacity-1;
1185                 destIdx = (int32_t)(destFields[i] - destFields[0]);
1186             }
1187 
1188             destFields[i] = &destBuf[destIdx];
1189             copyString(destBuf, destCapacity, &destIdx,
1190                 &regexp->fText[nextOutputStringStart], remainingLength);
1191             break;
1192         }
1193 
1194         if (regexp->fMatcher->find()) {
1195             // We found another delimiter.  Move everything from where we started looking
1196             //  up until the start of the delimiter into the next output string.
1197             int32_t fieldLen = regexp->fMatcher->start(*status) - nextOutputStringStart;
1198             destFields[i] = &destBuf[destIdx];
1199             copyString(destBuf, destCapacity, &destIdx,
1200                 &regexp->fText[nextOutputStringStart], fieldLen);
1201             nextOutputStringStart =  regexp->fMatcher->end(*status);
1202 
1203             // If the delimiter pattern has capturing parentheses, the captured
1204             //  text goes out into the next n destination strings.
1205             int32_t groupNum;
1206             for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
1207                 // If we've run out of output string slots, bail out.
1208                 if (i==destFieldsCapacity-1) {
1209                     break;
1210                 }
1211                 i++;
1212 
1213                 // Set up to extract the capture group contents into the dest buffer.
1214                 UErrorCode  tStatus = U_ZERO_ERROR;   // Want to ignore any buffer overflow
1215                                                       //  error while extracting this group.
1216                 int32_t remainingCapacity = destCapacity - destIdx;
1217                 if (remainingCapacity < 0) {
1218                     remainingCapacity = 0;
1219                 }
1220                 destFields[i] = &destBuf[destIdx];
1221                 int32_t t = uregex_group(regexp, groupNum, destFields[i], remainingCapacity, &tStatus);
1222                 destIdx += t + 1;    // Record the space used in the output string buffer.
1223                                      //  +1 for the NUL that terminates the string.
1224             }
1225 
1226             if (nextOutputStringStart == inputLen) {
1227                 // The delimiter was at the end of the string.  We're done.
1228                 break;
1229             }
1230 
1231         }
1232         else
1233         {
1234             // We ran off the end of the input while looking for the next delimiter.
1235             // All the remaining text goes into the current output string.
1236             destFields[i] = &destBuf[destIdx];
1237             copyString(destBuf, destCapacity, &destIdx,
1238                          &regexp->fText[nextOutputStringStart], inputLen-nextOutputStringStart);
1239             break;
1240         }
1241     }
1242 
1243     // Zero out any unused portion of the destFields array
1244     int j;
1245     for (j=i+1; j<destFieldsCapacity; j++) {
1246         destFields[j] = NULL;
1247     }
1248 
1249     if (requiredCapacity != NULL) {
1250         *requiredCapacity = destIdx;
1251     }
1252     if (destIdx > destCapacity) {
1253         *status = U_BUFFER_OVERFLOW_ERROR;
1254     }
1255     return i+1;
1256 }
1257 
1258 
1259 #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1260 
1261