1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2018 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45
46 #include "pcre2_internal.h"
47
/* Capacity, in pointers, of the stack that pcre2_substitute() uses to remember
where to resume in the replacement string while a nested ${name:+...} or
${name:-...} text is being processed. Two pointers are pushed per nesting
level, so this allows ten levels. */

#define PTR_STACK_SIZE 20

/* All the option bits that are specific to pcre2_substitute(). They are saved
separately and stripped from the options word before it is handed on to
pcre2_match(). */

#define SUBSTITUTE_OPTIONS \
  (PCRE2_SUBSTITUTE_GLOBAL|PCRE2_SUBSTITUTE_EXTENDED| \
   PCRE2_SUBSTITUTE_UNSET_EMPTY|PCRE2_SUBSTITUTE_UNKNOWN_UNSET| \
   PCRE2_SUBSTITUTE_OVERFLOW_LENGTH)
54
55
56
57 /*************************************************
58 * Find end of substitute text *
59 *************************************************/
60
61 /* In extended mode, we recognize ${name:+set text:unset text} and similar
62 constructions. This requires the identification of unescaped : and }
63 characters. This function scans for such. It must deal with nested ${
64 constructions. The pointer to the text is updated, either to the required end
65 character, or to where an error was detected.
66
67 Arguments:
68 code points to the compiled expression (for options)
69 ptrptr points to the pointer to the start of the text (updated)
70 ptrend end of the whole string
71 last TRUE if the last expected string (only } recognized)
72
73 Returns: 0 on success
74 negative error code on failure
75 */
76
77 static int
find_text_end(const pcre2_code * code,PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,BOOL last)78 find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend,
79 BOOL last)
80 {
81 int rc = 0;
82 uint32_t nestlevel = 0;
83 BOOL literal = FALSE;
84 PCRE2_SPTR ptr = *ptrptr;
85
86 for (; ptr < ptrend; ptr++)
87 {
88 if (literal)
89 {
90 if (ptr[0] == CHAR_BACKSLASH && ptr < ptrend - 1 && ptr[1] == CHAR_E)
91 {
92 literal = FALSE;
93 ptr += 1;
94 }
95 }
96
97 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
98 {
99 if (nestlevel == 0) goto EXIT;
100 nestlevel--;
101 }
102
103 else if (*ptr == CHAR_COLON && !last && nestlevel == 0) goto EXIT;
104
105 else if (*ptr == CHAR_DOLLAR_SIGN)
106 {
107 if (ptr < ptrend - 1 && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
108 {
109 nestlevel++;
110 ptr += 1;
111 }
112 }
113
114 else if (*ptr == CHAR_BACKSLASH)
115 {
116 int erc;
117 int errorcode;
118 uint32_t ch;
119
120 if (ptr < ptrend - 1) switch (ptr[1])
121 {
122 case CHAR_L:
123 case CHAR_l:
124 case CHAR_U:
125 case CHAR_u:
126 ptr += 1;
127 continue;
128 }
129
130 ptr += 1; /* Must point after \ */
131 erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode,
132 code->overall_options, FALSE, NULL);
133 ptr -= 1; /* Back to last code unit of escape */
134 if (errorcode != 0)
135 {
136 rc = errorcode;
137 goto EXIT;
138 }
139
140 switch(erc)
141 {
142 case 0: /* Data character */
143 case ESC_E: /* Isolated \E is ignored */
144 break;
145
146 case ESC_Q:
147 literal = TRUE;
148 break;
149
150 default:
151 rc = PCRE2_ERROR_BADREPESCAPE;
152 goto EXIT;
153 }
154 }
155 }
156
157 rc = PCRE2_ERROR_REPMISSINGBRACE; /* Terminator not found */
158
159 EXIT:
160 *ptrptr = ptr;
161 return rc;
162 }
163
164
165
166 /*************************************************
167 * Match and substitute *
168 *************************************************/
169
170 /* This function applies a compiled re to a subject string and creates a new
171 string with substitutions. The first 7 arguments are the same as for
172 pcre2_match(). Either string length may be PCRE2_ZERO_TERMINATED.
173
174 Arguments:
175 code points to the compiled expression
176 subject points to the subject string
177 length length of subject string (may contain binary zeros)
178 start_offset where to start in the subject string
179 options option bits
180 match_data points to a match_data block, or is NULL
181 mcontext points to a PCRE2 match context, or is NULL
182 replacement points to the replacement string
183 rlength length of replacement string
184 buffer where to put the substituted string
185 blength points to length of buffer; updated to length of string
186
187 Returns: >= 0 number of substitutions made
188 < 0 an error code
189 PCRE2_ERROR_BADREPLACEMENT means invalid use of $
190 */
191
/* This macro checks for space in the buffer before copying into it. On
overflow, either give an error immediately (jump to the NOROOM label), or, if
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set, keep on, accumulating in extra_needed
the number of additional code units that would have been required.

The macro relies on these names being in scope at the point of use: buffer,
buff_offset, lengthleft, overflowed, extra_needed, suboptions, and the label
NOROOM. The "length" argument is evaluated more than once, so it must not have
side effects. The body is wrapped in do { } while (0) so that an invocation
followed by a semicolon is a single statement, which is safe as the body of an
unbraced if/else. */

#define CHECKMEMCPY(from,length) \
  do \
    { \
    if (!overflowed && lengthleft < (length)) \
      { \
      if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
      overflowed = TRUE; \
      extra_needed = (length) - lengthleft; \
      } \
    else if (overflowed) \
      { \
      extra_needed += (length); \
      } \
    else \
      { \
      memcpy(buffer + buff_offset, (from), CU2BYTES(length)); \
      buff_offset += (length); \
      lengthleft -= (length); \
      } \
    } \
  while (0)
213
214 /* Here's the function */
215
216 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substitute(const pcre2_code * code,PCRE2_SPTR subject,PCRE2_SIZE length,PCRE2_SIZE start_offset,uint32_t options,pcre2_match_data * match_data,pcre2_match_context * mcontext,PCRE2_SPTR replacement,PCRE2_SIZE rlength,PCRE2_UCHAR * buffer,PCRE2_SIZE * blength)217 pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
218 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
219 pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength,
220 PCRE2_UCHAR *buffer, PCRE2_SIZE *blength)
221 {
222 int rc;
223 int subs;
224 int forcecase = 0;
225 int forcecasereset = 0;
226 uint32_t ovector_count;
227 uint32_t goptions = 0;
228 uint32_t suboptions;
229 BOOL match_data_created = FALSE;
230 BOOL literal = FALSE;
231 BOOL overflowed = FALSE;
232 #ifdef SUPPORT_UNICODE
233 BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
234 #endif
235 PCRE2_UCHAR temp[6];
236 PCRE2_SPTR ptr;
237 PCRE2_SPTR repend;
238 PCRE2_SIZE extra_needed = 0;
239 PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength;
240 PCRE2_SIZE *ovector;
241 PCRE2_SIZE ovecsave[3];
242
243 buff_offset = 0;
244 lengthleft = buff_length = *blength;
245 *blength = PCRE2_UNSET;
246 ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET;
247
248 /* Partial matching is not valid. */
249
250 if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0)
251 return PCRE2_ERROR_BADOPTION;
252
253 /* If no match data block is provided, create one. */
254
255 if (match_data == NULL)
256 {
257 pcre2_general_context *gcontext = (mcontext == NULL)?
258 (pcre2_general_context *)code :
259 (pcre2_general_context *)mcontext;
260 match_data = pcre2_match_data_create_from_pattern(code, gcontext);
261 if (match_data == NULL) return PCRE2_ERROR_NOMEMORY;
262 match_data_created = TRUE;
263 }
264 ovector = pcre2_get_ovector_pointer(match_data);
265 ovector_count = pcre2_get_ovector_count(match_data);
266
267 /* Find lengths of zero-terminated strings and the end of the replacement. */
268
269 if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject);
270 if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement);
271 repend = replacement + rlength;
272
273 /* Check UTF replacement string if necessary. */
274
275 #ifdef SUPPORT_UNICODE
276 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
277 {
278 rc = PRIV(valid_utf)(replacement, rlength, &(match_data->rightchar));
279 if (rc != 0)
280 {
281 match_data->leftchar = 0;
282 goto EXIT;
283 }
284 }
285 #endif /* SUPPORT_UNICODE */
286
287 /* Save the substitute options and remove them from the match options. */
288
289 suboptions = options & SUBSTITUTE_OPTIONS;
290 options &= ~SUBSTITUTE_OPTIONS;
291
292 /* Copy up to the start offset */
293
294 if (start_offset > length)
295 {
296 match_data->leftchar = 0;
297 rc = PCRE2_ERROR_BADOFFSET;
298 goto EXIT;
299 }
300 CHECKMEMCPY(subject, start_offset);
301
302 /* Loop for global substituting. */
303
304 subs = 0;
305 do
306 {
307 PCRE2_SPTR ptrstack[PTR_STACK_SIZE];
308 uint32_t ptrstackptr = 0;
309
310 rc = pcre2_match(code, subject, length, start_offset, options|goptions,
311 match_data, mcontext);
312
313 #ifdef SUPPORT_UNICODE
314 if (utf) options |= PCRE2_NO_UTF_CHECK; /* Only need to check once */
315 #endif
316
317 /* Any error other than no match returns the error code. No match when not
318 doing the special after-empty-match global rematch, or when at the end of the
319 subject, breaks the global loop. Otherwise, advance the starting point by one
320 character, copying it to the output, and try again. */
321
322 if (rc < 0)
323 {
324 PCRE2_SIZE save_start;
325
326 if (rc != PCRE2_ERROR_NOMATCH) goto EXIT;
327 if (goptions == 0 || start_offset >= length) break;
328
329 /* Advance by one code point. Then, if CRLF is a valid newline sequence and
330 we have advanced into the middle of it, advance one more code point. In
331 other words, do not start in the middle of CRLF, even if CR and LF on their
332 own are valid newlines. */
333
334 save_start = start_offset++;
335 if (subject[start_offset-1] == CHAR_CR &&
336 code->newline_convention != PCRE2_NEWLINE_CR &&
337 code->newline_convention != PCRE2_NEWLINE_LF &&
338 start_offset < length &&
339 subject[start_offset] == CHAR_LF)
340 start_offset++;
341
342 /* Otherwise, in UTF mode, advance past any secondary code points. */
343
344 else if ((code->overall_options & PCRE2_UTF) != 0)
345 {
346 #if PCRE2_CODE_UNIT_WIDTH == 8
347 while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80)
348 start_offset++;
349 #elif PCRE2_CODE_UNIT_WIDTH == 16
350 while (start_offset < length &&
351 (subject[start_offset] & 0xfc00) == 0xdc00)
352 start_offset++;
353 #endif
354 }
355
356 /* Copy what we have advanced past, reset the special global options, and
357 continue to the next match. */
358
359 fraglength = start_offset - save_start;
360 CHECKMEMCPY(subject + save_start, fraglength);
361 goptions = 0;
362 continue;
363 }
364
365 /* Handle a successful match. Matches that use \K to end before they start
366 or start before the current point in the subject are not supported. */
367
368 if (ovector[1] < ovector[0] || ovector[0] < start_offset)
369 {
370 rc = PCRE2_ERROR_BADSUBSPATTERN;
371 goto EXIT;
372 }
373
374 /* Check for the same match as previous. This is legitimate after matching an
375 empty string that starts after the initial match offset. We have tried again
376 at the match point in case the pattern is one like /(?<=\G.)/ which can never
377 match at its starting point, so running the match achieves the bumpalong. If
378 we do get the same (null) match at the original match point, it isn't such a
379 pattern, so we now do the empty string magic. In all other cases, a repeat
380 match should never occur. */
381
382 if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1])
383 {
384 if (ovector[0] == ovector[1] && ovecsave[2] != start_offset)
385 {
386 goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
387 ovecsave[2] = start_offset;
388 continue; /* Back to the top of the loop */
389 }
390 rc = PCRE2_ERROR_INTERNAL_DUPMATCH;
391 goto EXIT;
392 }
393
394 /* Count substitutions with a paranoid check for integer overflow; surely no
395 real call to this function would ever hit this! */
396
397 if (subs == INT_MAX)
398 {
399 rc = PCRE2_ERROR_TOOMANYREPLACE;
400 goto EXIT;
401 }
402 subs++;
403
404 /* Copy the text leading up to the match. */
405
406 if (rc == 0) rc = ovector_count;
407 fraglength = ovector[0] - start_offset;
408 CHECKMEMCPY(subject + start_offset, fraglength);
409
410 /* Process the replacement string. Literal mode is set by \Q, but only in
411 extended mode when backslashes are being interpreted. In extended mode we
412 must handle nested substrings that are to be reprocessed. */
413
414 ptr = replacement;
415 for (;;)
416 {
417 uint32_t ch;
418 unsigned int chlen;
419
420 /* If at the end of a nested substring, pop the stack. */
421
422 if (ptr >= repend)
423 {
424 if (ptrstackptr <= 0) break; /* End of replacement string */
425 repend = ptrstack[--ptrstackptr];
426 ptr = ptrstack[--ptrstackptr];
427 continue;
428 }
429
430 /* Handle the next character */
431
432 if (literal)
433 {
434 if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E)
435 {
436 literal = FALSE;
437 ptr += 2;
438 continue;
439 }
440 goto LOADLITERAL;
441 }
442
443 /* Not in literal mode. */
444
445 if (*ptr == CHAR_DOLLAR_SIGN)
446 {
447 int group, n;
448 uint32_t special = 0;
449 BOOL inparens;
450 BOOL star;
451 PCRE2_SIZE sublength;
452 PCRE2_SPTR text1_start = NULL;
453 PCRE2_SPTR text1_end = NULL;
454 PCRE2_SPTR text2_start = NULL;
455 PCRE2_SPTR text2_end = NULL;
456 PCRE2_UCHAR next;
457 PCRE2_UCHAR name[33];
458
459 if (++ptr >= repend) goto BAD;
460 if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL;
461
462 group = -1;
463 n = 0;
464 inparens = FALSE;
465 star = FALSE;
466
467 if (next == CHAR_LEFT_CURLY_BRACKET)
468 {
469 if (++ptr >= repend) goto BAD;
470 next = *ptr;
471 inparens = TRUE;
472 }
473
474 if (next == CHAR_ASTERISK)
475 {
476 if (++ptr >= repend) goto BAD;
477 next = *ptr;
478 star = TRUE;
479 }
480
481 if (!star && next >= CHAR_0 && next <= CHAR_9)
482 {
483 group = next - CHAR_0;
484 while (++ptr < repend)
485 {
486 next = *ptr;
487 if (next < CHAR_0 || next > CHAR_9) break;
488 group = group * 10 + next - CHAR_0;
489
490 /* A check for a number greater than the hightest captured group
491 is sufficient here; no need for a separate overflow check. If unknown
492 groups are to be treated as unset, just skip over any remaining
493 digits and carry on. */
494
495 if (group > code->top_bracket)
496 {
497 if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
498 {
499 while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9);
500 break;
501 }
502 else
503 {
504 rc = PCRE2_ERROR_NOSUBSTRING;
505 goto PTREXIT;
506 }
507 }
508 }
509 }
510 else
511 {
512 const uint8_t *ctypes = code->tables + ctypes_offset;
513 while (MAX_255(next) && (ctypes[next] & ctype_word) != 0)
514 {
515 name[n++] = next;
516 if (n > 32) goto BAD;
517 if (++ptr >= repend) break;
518 next = *ptr;
519 }
520 if (n == 0) goto BAD;
521 name[n] = 0;
522 }
523
524 /* In extended mode we recognize ${name:+set text:unset text} and
525 ${name:-default text}. */
526
527 if (inparens)
528 {
529 if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
530 !star && ptr < repend - 2 && next == CHAR_COLON)
531 {
532 special = *(++ptr);
533 if (special != CHAR_PLUS && special != CHAR_MINUS)
534 {
535 rc = PCRE2_ERROR_BADSUBSTITUTION;
536 goto PTREXIT;
537 }
538
539 text1_start = ++ptr;
540 rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS);
541 if (rc != 0) goto PTREXIT;
542 text1_end = ptr;
543
544 if (special == CHAR_PLUS && *ptr == CHAR_COLON)
545 {
546 text2_start = ++ptr;
547 rc = find_text_end(code, &ptr, repend, TRUE);
548 if (rc != 0) goto PTREXIT;
549 text2_end = ptr;
550 }
551 }
552
553 else
554 {
555 if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET)
556 {
557 rc = PCRE2_ERROR_REPMISSINGBRACE;
558 goto PTREXIT;
559 }
560 }
561
562 ptr++;
563 }
564
565 /* Have found a syntactically correct group number or name, or *name.
566 Only *MARK is currently recognized. */
567
568 if (star)
569 {
570 if (PRIV(strcmp_c8)(name, STRING_MARK) == 0)
571 {
572 PCRE2_SPTR mark = pcre2_get_mark(match_data);
573 if (mark != NULL)
574 {
575 PCRE2_SPTR mark_start = mark;
576 while (*mark != 0) mark++;
577 fraglength = mark - mark_start;
578 CHECKMEMCPY(mark_start, fraglength);
579 }
580 }
581 else goto BAD;
582 }
583
584 /* Substitute the contents of a group. We don't use substring_copy
585 functions any more, in order to support case forcing. */
586
587 else
588 {
589 PCRE2_SPTR subptr, subptrend;
590
591 /* Find a number for a named group. In case there are duplicate names,
592 search for the first one that is set. If the name is not found when
593 PCRE2_SUBSTITUTE_UNKNOWN_EMPTY is set, set the group number to a
594 non-existent group. */
595
596 if (group < 0)
597 {
598 PCRE2_SPTR first, last, entry;
599 rc = pcre2_substring_nametable_scan(code, name, &first, &last);
600 if (rc == PCRE2_ERROR_NOSUBSTRING &&
601 (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
602 {
603 group = code->top_bracket + 1;
604 }
605 else
606 {
607 if (rc < 0) goto PTREXIT;
608 for (entry = first; entry <= last; entry += rc)
609 {
610 uint32_t ng = GET2(entry, 0);
611 if (ng < ovector_count)
612 {
613 if (group < 0) group = ng; /* First in ovector */
614 if (ovector[ng*2] != PCRE2_UNSET)
615 {
616 group = ng; /* First that is set */
617 break;
618 }
619 }
620 }
621
622 /* If group is still negative, it means we did not find a group
623 that is in the ovector. Just set the first group. */
624
625 if (group < 0) group = GET2(first, 0);
626 }
627 }
628
629 /* We now have a group that is identified by number. Find the length of
630 the captured string. If a group in a non-special substitution is unset
631 when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. */
632
633 rc = pcre2_substring_length_bynumber(match_data, group, &sublength);
634 if (rc < 0)
635 {
636 if (rc == PCRE2_ERROR_NOSUBSTRING &&
637 (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
638 {
639 rc = PCRE2_ERROR_UNSET;
640 }
641 if (rc != PCRE2_ERROR_UNSET) goto PTREXIT; /* Non-unset errors */
642 if (special == 0) /* Plain substitution */
643 {
644 if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue;
645 goto PTREXIT; /* Else error */
646 }
647 }
648
649 /* If special is '+' we have a 'set' and possibly an 'unset' text,
650 both of which are reprocessed when used. If special is '-' we have a
651 default text for when the group is unset; it must be reprocessed. */
652
653 if (special != 0)
654 {
655 if (special == CHAR_MINUS)
656 {
657 if (rc == 0) goto LITERAL_SUBSTITUTE;
658 text2_start = text1_start;
659 text2_end = text1_end;
660 }
661
662 if (ptrstackptr >= PTR_STACK_SIZE) goto BAD;
663 ptrstack[ptrstackptr++] = ptr;
664 ptrstack[ptrstackptr++] = repend;
665
666 if (rc == 0)
667 {
668 ptr = text1_start;
669 repend = text1_end;
670 }
671 else
672 {
673 ptr = text2_start;
674 repend = text2_end;
675 }
676 continue;
677 }
678
679 /* Otherwise we have a literal substitution of a group's contents. */
680
681 LITERAL_SUBSTITUTE:
682 subptr = subject + ovector[group*2];
683 subptrend = subject + ovector[group*2 + 1];
684
685 /* Substitute a literal string, possibly forcing alphabetic case. */
686
687 while (subptr < subptrend)
688 {
689 GETCHARINCTEST(ch, subptr);
690 if (forcecase != 0)
691 {
692 #ifdef SUPPORT_UNICODE
693 if (utf)
694 {
695 uint32_t type = UCD_CHARTYPE(ch);
696 if (PRIV(ucp_gentype)[type] == ucp_L &&
697 type != ((forcecase > 0)? ucp_Lu : ucp_Ll))
698 ch = UCD_OTHERCASE(ch);
699 }
700 else
701 #endif
702 {
703 if (((code->tables + cbits_offset +
704 ((forcecase > 0)? cbit_upper:cbit_lower)
705 )[ch/8] & (1 << (ch%8))) == 0)
706 ch = (code->tables + fcc_offset)[ch];
707 }
708 forcecase = forcecasereset;
709 }
710
711 #ifdef SUPPORT_UNICODE
712 if (utf) chlen = PRIV(ord2utf)(ch, temp); else
713 #endif
714 {
715 temp[0] = ch;
716 chlen = 1;
717 }
718 CHECKMEMCPY(temp, chlen);
719 }
720 }
721 }
722
723 /* Handle an escape sequence in extended mode. We can use check_escape()
724 to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but
725 the case-forcing escapes are not supported in pcre2_compile() so must be
726 recognized here. */
727
728 else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
729 *ptr == CHAR_BACKSLASH)
730 {
731 int errorcode;
732
733 if (ptr < repend - 1) switch (ptr[1])
734 {
735 case CHAR_L:
736 forcecase = forcecasereset = -1;
737 ptr += 2;
738 continue;
739
740 case CHAR_l:
741 forcecase = -1;
742 forcecasereset = 0;
743 ptr += 2;
744 continue;
745
746 case CHAR_U:
747 forcecase = forcecasereset = 1;
748 ptr += 2;
749 continue;
750
751 case CHAR_u:
752 forcecase = 1;
753 forcecasereset = 0;
754 ptr += 2;
755 continue;
756
757 default:
758 break;
759 }
760
761 ptr++; /* Point after \ */
762 rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode,
763 code->overall_options, FALSE, NULL);
764 if (errorcode != 0) goto BADESCAPE;
765
766 switch(rc)
767 {
768 case ESC_E:
769 forcecase = forcecasereset = 0;
770 continue;
771
772 case ESC_Q:
773 literal = TRUE;
774 continue;
775
776 case 0: /* Data character */
777 goto LITERAL;
778
779 default:
780 goto BADESCAPE;
781 }
782 }
783
784 /* Handle a literal code unit */
785
786 else
787 {
788 LOADLITERAL:
789 GETCHARINCTEST(ch, ptr); /* Get character value, increment pointer */
790
791 LITERAL:
792 if (forcecase != 0)
793 {
794 #ifdef SUPPORT_UNICODE
795 if (utf)
796 {
797 uint32_t type = UCD_CHARTYPE(ch);
798 if (PRIV(ucp_gentype)[type] == ucp_L &&
799 type != ((forcecase > 0)? ucp_Lu : ucp_Ll))
800 ch = UCD_OTHERCASE(ch);
801 }
802 else
803 #endif
804 {
805 if (((code->tables + cbits_offset +
806 ((forcecase > 0)? cbit_upper:cbit_lower)
807 )[ch/8] & (1 << (ch%8))) == 0)
808 ch = (code->tables + fcc_offset)[ch];
809 }
810 forcecase = forcecasereset;
811 }
812
813 #ifdef SUPPORT_UNICODE
814 if (utf) chlen = PRIV(ord2utf)(ch, temp); else
815 #endif
816 {
817 temp[0] = ch;
818 chlen = 1;
819 }
820 CHECKMEMCPY(temp, chlen);
821 } /* End handling a literal code unit */
822 } /* End of loop for scanning the replacement. */
823
824 /* The replacement has been copied to the output. Save the details of this
825 match. See above for how this data is used. If we matched an empty string, do
826 the magic for global matches. Finally, update the start offset to point to
827 the rest of the subject string. */
828
829 ovecsave[0] = ovector[0];
830 ovecsave[1] = ovector[1];
831 ovecsave[2] = start_offset;
832
833 goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 :
834 PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART;
835 start_offset = ovector[1];
836 } while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0); /* Repeat "do" loop */
837
838 /* Copy the rest of the subject. */
839
840 fraglength = length - start_offset;
841 CHECKMEMCPY(subject + start_offset, fraglength);
842 temp[0] = 0;
843 CHECKMEMCPY(temp , 1);
844
845 /* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set,
846 and matching has carried on after a full buffer, in order to compute the length
847 needed. Otherwise, an overflow generates an immediate error return. */
848
849 if (overflowed)
850 {
851 rc = PCRE2_ERROR_NOMEMORY;
852 *blength = buff_length + extra_needed;
853 }
854
855 /* After a successful execution, return the number of substitutions and set the
856 length of buffer used, excluding the trailing zero. */
857
858 else
859 {
860 rc = subs;
861 *blength = buff_offset - 1;
862 }
863
864 EXIT:
865 if (match_data_created) pcre2_match_data_free(match_data);
866 else match_data->rc = rc;
867 return rc;
868
869 NOROOM:
870 rc = PCRE2_ERROR_NOMEMORY;
871 goto EXIT;
872
873 BAD:
874 rc = PCRE2_ERROR_BADREPLACEMENT;
875 goto PTREXIT;
876
877 BADESCAPE:
878 rc = PCRE2_ERROR_BADREPESCAPE;
879
880 PTREXIT:
881 *blength = (PCRE2_SIZE)(ptr - replacement);
882 goto EXIT;
883 }
884
885 /* End of pcre2_substitute.c */
886