1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2019 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45
46 #include "pcre2_internal.h"
47
/* Capacity of the stack of saved replacement-string pointers that is used
while processing nested ${...} texts. Each level of nesting pushes two
entries: the current pointer and the current end pointer. */

#define PTR_STACK_SIZE 20

/* All the option bits that belong to pcre2_substitute() itself. They are
saved separately and removed from the options before pcre2_match() is
called. */

#define SUBSTITUTE_OPTIONS \
  (PCRE2_SUBSTITUTE_GLOBAL | \
   PCRE2_SUBSTITUTE_EXTENDED | \
   PCRE2_SUBSTITUTE_UNSET_EMPTY | \
   PCRE2_SUBSTITUTE_UNKNOWN_UNSET | \
   PCRE2_SUBSTITUTE_OVERFLOW_LENGTH)
54
55
56
57 /*************************************************
58 * Find end of substitute text *
59 *************************************************/
60
61 /* In extended mode, we recognize ${name:+set text:unset text} and similar
62 constructions. This requires the identification of unescaped : and }
63 characters. This function scans for such. It must deal with nested ${
64 constructions. The pointer to the text is updated, either to the required end
65 character, or to where an error was detected.
66
67 Arguments:
68 code points to the compiled expression (for options)
69 ptrptr points to the pointer to the start of the text (updated)
70 ptrend end of the whole string
71 last TRUE if the last expected string (only } recognized)
72
73 Returns: 0 on success
74 negative error code on failure
75 */
76
77 static int
find_text_end(const pcre2_code * code,PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,BOOL last)78 find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend,
79 BOOL last)
80 {
81 int rc = 0;
82 uint32_t nestlevel = 0;
83 BOOL literal = FALSE;
84 PCRE2_SPTR ptr = *ptrptr;
85
86 for (; ptr < ptrend; ptr++)
87 {
88 if (literal)
89 {
90 if (ptr[0] == CHAR_BACKSLASH && ptr < ptrend - 1 && ptr[1] == CHAR_E)
91 {
92 literal = FALSE;
93 ptr += 1;
94 }
95 }
96
97 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
98 {
99 if (nestlevel == 0) goto EXIT;
100 nestlevel--;
101 }
102
103 else if (*ptr == CHAR_COLON && !last && nestlevel == 0) goto EXIT;
104
105 else if (*ptr == CHAR_DOLLAR_SIGN)
106 {
107 if (ptr < ptrend - 1 && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
108 {
109 nestlevel++;
110 ptr += 1;
111 }
112 }
113
114 else if (*ptr == CHAR_BACKSLASH)
115 {
116 int erc;
117 int errorcode;
118 uint32_t ch;
119
120 if (ptr < ptrend - 1) switch (ptr[1])
121 {
122 case CHAR_L:
123 case CHAR_l:
124 case CHAR_U:
125 case CHAR_u:
126 ptr += 1;
127 continue;
128 }
129
130 ptr += 1; /* Must point after \ */
131 erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode,
132 code->overall_options, code->extra_options, FALSE, NULL);
133 ptr -= 1; /* Back to last code unit of escape */
134 if (errorcode != 0)
135 {
136 rc = errorcode;
137 goto EXIT;
138 }
139
140 switch(erc)
141 {
142 case 0: /* Data character */
143 case ESC_E: /* Isolated \E is ignored */
144 break;
145
146 case ESC_Q:
147 literal = TRUE;
148 break;
149
150 default:
151 rc = PCRE2_ERROR_BADREPESCAPE;
152 goto EXIT;
153 }
154 }
155 }
156
157 rc = PCRE2_ERROR_REPMISSINGBRACE; /* Terminator not found */
158
159 EXIT:
160 *ptrptr = ptr;
161 return rc;
162 }
163
164
165
166 /*************************************************
167 * Match and substitute *
168 *************************************************/
169
/* This function applies a compiled re to a subject string and creates a new
string with substitutions. The first 7 arguments are the same as for
pcre2_match(). Either string length may be PCRE2_ZERO_TERMINATED.

Arguments:
  code          points to the compiled expression
  subject       points to the subject string
  length        length of subject string (may contain binary zeros)
  start_offset  where to start in the subject string
  options       option bits
  match_data    points to a match_data block, or is NULL
  mcontext      points to a PCRE2 match context, or is NULL
  replacement   points to the replacement string
  rlength       length of replacement string
  buffer        where to put the substituted string
  blength       points to length of buffer; updated to length of string
                  (on a replacement syntax error it is set to the offset in
                  the replacement; on overflow with
                  PCRE2_SUBSTITUTE_OVERFLOW_LENGTH it is set to the length
                  that is needed)

Returns:        >= 0 number of substitutions made
                < 0 an error code
                PCRE2_ERROR_BADREPLACEMENT means invalid use of $
*/
191
/* This macro checks for space in the buffer before copying into it. On
overflow, either give an error immediately (by jumping to the NOROOM label),
or keep on, accumulating the extra length that is needed.

It relies on these names being in scope at the point of use: buffer,
buff_offset, lengthleft, extra_needed, overflowed, suboptions, and the label
NOROOM. The body is wrapped in do { } while (0) so that an invocation followed
by a semicolon is exactly one statement, and the arguments are parenthesized so
that compound expressions are treated as single values. Note that the length
argument is evaluated more than once, so it must have no side effects. */

#define CHECKMEMCPY(from,length) \
  do \
    { \
    if (!overflowed && lengthleft < (length)) \
      { \
      if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
      overflowed = TRUE; \
      extra_needed = (length) - lengthleft; \
      } \
    else if (overflowed) \
      { \
      extra_needed += (length); \
      } \
    else \
      { \
      memcpy(buffer + buff_offset, (from), CU2BYTES(length)); \
      buff_offset += (length); \
      lengthleft -= (length); \
      } \
    } \
  while (0)
213
214 /* Here's the function */
215
216 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substitute(const pcre2_code * code,PCRE2_SPTR subject,PCRE2_SIZE length,PCRE2_SIZE start_offset,uint32_t options,pcre2_match_data * match_data,pcre2_match_context * mcontext,PCRE2_SPTR replacement,PCRE2_SIZE rlength,PCRE2_UCHAR * buffer,PCRE2_SIZE * blength)217 pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
218 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
219 pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength,
220 PCRE2_UCHAR *buffer, PCRE2_SIZE *blength)
221 {
222 int rc;
223 int subs;
224 int forcecase = 0;
225 int forcecasereset = 0;
226 uint32_t ovector_count;
227 uint32_t goptions = 0;
228 uint32_t suboptions;
229 BOOL match_data_created = FALSE;
230 BOOL literal = FALSE;
231 BOOL overflowed = FALSE;
232 #ifdef SUPPORT_UNICODE
233 BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
234 #endif
235 PCRE2_UCHAR temp[6];
236 PCRE2_SPTR ptr;
237 PCRE2_SPTR repend;
238 PCRE2_SIZE extra_needed = 0;
239 PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength;
240 PCRE2_SIZE *ovector;
241 PCRE2_SIZE ovecsave[3];
242 pcre2_substitute_callout_block scb;
243
244 /* General initialization */
245
246 buff_offset = 0;
247 lengthleft = buff_length = *blength;
248 *blength = PCRE2_UNSET;
249 ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET;
250
251 /* Partial matching is not valid. This must come after setting *blength to
252 PCRE2_UNSET, so as not to imply an offset in the replacement. */
253
254 if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0)
255 return PCRE2_ERROR_BADOPTION;
256
257 /* If no match data block is provided, create one. */
258
259 if (match_data == NULL)
260 {
261 pcre2_general_context *gcontext = (mcontext == NULL)?
262 (pcre2_general_context *)code :
263 (pcre2_general_context *)mcontext;
264 match_data = pcre2_match_data_create_from_pattern(code, gcontext);
265 if (match_data == NULL) return PCRE2_ERROR_NOMEMORY;
266 match_data_created = TRUE;
267 }
268 ovector = pcre2_get_ovector_pointer(match_data);
269 ovector_count = pcre2_get_ovector_count(match_data);
270
271 /* Fixed things in the callout block */
272
273 scb.version = 0;
274 scb.input = subject;
275 scb.output = (PCRE2_SPTR)buffer;
276 scb.ovector = ovector;
277
278 /* Find lengths of zero-terminated strings and the end of the replacement. */
279
280 if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject);
281 if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement);
282 repend = replacement + rlength;
283
284 /* Check UTF replacement string if necessary. */
285
286 #ifdef SUPPORT_UNICODE
287 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
288 {
289 rc = PRIV(valid_utf)(replacement, rlength, &(match_data->rightchar));
290 if (rc != 0)
291 {
292 match_data->leftchar = 0;
293 goto EXIT;
294 }
295 }
296 #endif /* SUPPORT_UNICODE */
297
298 /* Save the substitute options and remove them from the match options. */
299
300 suboptions = options & SUBSTITUTE_OPTIONS;
301 options &= ~SUBSTITUTE_OPTIONS;
302
303 /* Copy up to the start offset */
304
305 if (start_offset > length)
306 {
307 match_data->leftchar = 0;
308 rc = PCRE2_ERROR_BADOFFSET;
309 goto EXIT;
310 }
311 CHECKMEMCPY(subject, start_offset);
312
313 /* Loop for global substituting. */
314
315 subs = 0;
316 do
317 {
318 PCRE2_SPTR ptrstack[PTR_STACK_SIZE];
319 uint32_t ptrstackptr = 0;
320
321 rc = pcre2_match(code, subject, length, start_offset, options|goptions,
322 match_data, mcontext);
323
324 #ifdef SUPPORT_UNICODE
325 if (utf) options |= PCRE2_NO_UTF_CHECK; /* Only need to check once */
326 #endif
327
328 /* Any error other than no match returns the error code. No match when not
329 doing the special after-empty-match global rematch, or when at the end of the
330 subject, breaks the global loop. Otherwise, advance the starting point by one
331 character, copying it to the output, and try again. */
332
333 if (rc < 0)
334 {
335 PCRE2_SIZE save_start;
336
337 if (rc != PCRE2_ERROR_NOMATCH) goto EXIT;
338 if (goptions == 0 || start_offset >= length) break;
339
340 /* Advance by one code point. Then, if CRLF is a valid newline sequence and
341 we have advanced into the middle of it, advance one more code point. In
342 other words, do not start in the middle of CRLF, even if CR and LF on their
343 own are valid newlines. */
344
345 save_start = start_offset++;
346 if (subject[start_offset-1] == CHAR_CR &&
347 code->newline_convention != PCRE2_NEWLINE_CR &&
348 code->newline_convention != PCRE2_NEWLINE_LF &&
349 start_offset < length &&
350 subject[start_offset] == CHAR_LF)
351 start_offset++;
352
353 /* Otherwise, in UTF mode, advance past any secondary code points. */
354
355 else if ((code->overall_options & PCRE2_UTF) != 0)
356 {
357 #if PCRE2_CODE_UNIT_WIDTH == 8
358 while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80)
359 start_offset++;
360 #elif PCRE2_CODE_UNIT_WIDTH == 16
361 while (start_offset < length &&
362 (subject[start_offset] & 0xfc00) == 0xdc00)
363 start_offset++;
364 #endif
365 }
366
367 /* Copy what we have advanced past, reset the special global options, and
368 continue to the next match. */
369
370 fraglength = start_offset - save_start;
371 CHECKMEMCPY(subject + save_start, fraglength);
372 goptions = 0;
373 continue;
374 }
375
376 /* Handle a successful match. Matches that use \K to end before they start
377 or start before the current point in the subject are not supported. */
378
379 if (ovector[1] < ovector[0] || ovector[0] < start_offset)
380 {
381 rc = PCRE2_ERROR_BADSUBSPATTERN;
382 goto EXIT;
383 }
384
385 /* Check for the same match as previous. This is legitimate after matching an
386 empty string that starts after the initial match offset. We have tried again
387 at the match point in case the pattern is one like /(?<=\G.)/ which can never
388 match at its starting point, so running the match achieves the bumpalong. If
389 we do get the same (null) match at the original match point, it isn't such a
390 pattern, so we now do the empty string magic. In all other cases, a repeat
391 match should never occur. */
392
393 if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1])
394 {
395 if (ovector[0] == ovector[1] && ovecsave[2] != start_offset)
396 {
397 goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
398 ovecsave[2] = start_offset;
399 continue; /* Back to the top of the loop */
400 }
401 rc = PCRE2_ERROR_INTERNAL_DUPMATCH;
402 goto EXIT;
403 }
404
405 /* Count substitutions with a paranoid check for integer overflow; surely no
406 real call to this function would ever hit this! */
407
408 if (subs == INT_MAX)
409 {
410 rc = PCRE2_ERROR_TOOMANYREPLACE;
411 goto EXIT;
412 }
413 subs++;
414
415 /* Copy the text leading up to the match, and remember where the insert
416 begins and how many ovector pairs are set. */
417
418 if (rc == 0) rc = ovector_count;
419 fraglength = ovector[0] - start_offset;
420 CHECKMEMCPY(subject + start_offset, fraglength);
421 scb.output_offsets[0] = buff_offset;
422 scb.oveccount = rc;
423
424 /* Process the replacement string. Literal mode is set by \Q, but only in
425 extended mode when backslashes are being interpreted. In extended mode we
426 must handle nested substrings that are to be reprocessed. */
427
428 ptr = replacement;
429 for (;;)
430 {
431 uint32_t ch;
432 unsigned int chlen;
433
434 /* If at the end of a nested substring, pop the stack. */
435
436 if (ptr >= repend)
437 {
438 if (ptrstackptr == 0) break; /* End of replacement string */
439 repend = ptrstack[--ptrstackptr];
440 ptr = ptrstack[--ptrstackptr];
441 continue;
442 }
443
444 /* Handle the next character */
445
446 if (literal)
447 {
448 if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E)
449 {
450 literal = FALSE;
451 ptr += 2;
452 continue;
453 }
454 goto LOADLITERAL;
455 }
456
457 /* Not in literal mode. */
458
459 if (*ptr == CHAR_DOLLAR_SIGN)
460 {
461 int group, n;
462 uint32_t special = 0;
463 BOOL inparens;
464 BOOL star;
465 PCRE2_SIZE sublength;
466 PCRE2_SPTR text1_start = NULL;
467 PCRE2_SPTR text1_end = NULL;
468 PCRE2_SPTR text2_start = NULL;
469 PCRE2_SPTR text2_end = NULL;
470 PCRE2_UCHAR next;
471 PCRE2_UCHAR name[33];
472
473 if (++ptr >= repend) goto BAD;
474 if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL;
475
476 group = -1;
477 n = 0;
478 inparens = FALSE;
479 star = FALSE;
480
481 if (next == CHAR_LEFT_CURLY_BRACKET)
482 {
483 if (++ptr >= repend) goto BAD;
484 next = *ptr;
485 inparens = TRUE;
486 }
487
488 if (next == CHAR_ASTERISK)
489 {
490 if (++ptr >= repend) goto BAD;
491 next = *ptr;
492 star = TRUE;
493 }
494
495 if (!star && next >= CHAR_0 && next <= CHAR_9)
496 {
497 group = next - CHAR_0;
498 while (++ptr < repend)
499 {
500 next = *ptr;
501 if (next < CHAR_0 || next > CHAR_9) break;
502 group = group * 10 + next - CHAR_0;
503
504 /* A check for a number greater than the hightest captured group
505 is sufficient here; no need for a separate overflow check. If unknown
506 groups are to be treated as unset, just skip over any remaining
507 digits and carry on. */
508
509 if (group > code->top_bracket)
510 {
511 if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
512 {
513 while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9);
514 break;
515 }
516 else
517 {
518 rc = PCRE2_ERROR_NOSUBSTRING;
519 goto PTREXIT;
520 }
521 }
522 }
523 }
524 else
525 {
526 const uint8_t *ctypes = code->tables + ctypes_offset;
527 while (MAX_255(next) && (ctypes[next] & ctype_word) != 0)
528 {
529 name[n++] = next;
530 if (n > 32) goto BAD;
531 if (++ptr >= repend) break;
532 next = *ptr;
533 }
534 if (n == 0) goto BAD;
535 name[n] = 0;
536 }
537
538 /* In extended mode we recognize ${name:+set text:unset text} and
539 ${name:-default text}. */
540
541 if (inparens)
542 {
543 if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
544 !star && ptr < repend - 2 && next == CHAR_COLON)
545 {
546 special = *(++ptr);
547 if (special != CHAR_PLUS && special != CHAR_MINUS)
548 {
549 rc = PCRE2_ERROR_BADSUBSTITUTION;
550 goto PTREXIT;
551 }
552
553 text1_start = ++ptr;
554 rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS);
555 if (rc != 0) goto PTREXIT;
556 text1_end = ptr;
557
558 if (special == CHAR_PLUS && *ptr == CHAR_COLON)
559 {
560 text2_start = ++ptr;
561 rc = find_text_end(code, &ptr, repend, TRUE);
562 if (rc != 0) goto PTREXIT;
563 text2_end = ptr;
564 }
565 }
566
567 else
568 {
569 if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET)
570 {
571 rc = PCRE2_ERROR_REPMISSINGBRACE;
572 goto PTREXIT;
573 }
574 }
575
576 ptr++;
577 }
578
579 /* Have found a syntactically correct group number or name, or *name.
580 Only *MARK is currently recognized. */
581
582 if (star)
583 {
584 if (PRIV(strcmp_c8)(name, STRING_MARK) == 0)
585 {
586 PCRE2_SPTR mark = pcre2_get_mark(match_data);
587 if (mark != NULL)
588 {
589 PCRE2_SPTR mark_start = mark;
590 while (*mark != 0) mark++;
591 fraglength = mark - mark_start;
592 CHECKMEMCPY(mark_start, fraglength);
593 }
594 }
595 else goto BAD;
596 }
597
598 /* Substitute the contents of a group. We don't use substring_copy
599 functions any more, in order to support case forcing. */
600
601 else
602 {
603 PCRE2_SPTR subptr, subptrend;
604
605 /* Find a number for a named group. In case there are duplicate names,
606 search for the first one that is set. If the name is not found when
607 PCRE2_SUBSTITUTE_UNKNOWN_EMPTY is set, set the group number to a
608 non-existent group. */
609
610 if (group < 0)
611 {
612 PCRE2_SPTR first, last, entry;
613 rc = pcre2_substring_nametable_scan(code, name, &first, &last);
614 if (rc == PCRE2_ERROR_NOSUBSTRING &&
615 (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
616 {
617 group = code->top_bracket + 1;
618 }
619 else
620 {
621 if (rc < 0) goto PTREXIT;
622 for (entry = first; entry <= last; entry += rc)
623 {
624 uint32_t ng = GET2(entry, 0);
625 if (ng < ovector_count)
626 {
627 if (group < 0) group = ng; /* First in ovector */
628 if (ovector[ng*2] != PCRE2_UNSET)
629 {
630 group = ng; /* First that is set */
631 break;
632 }
633 }
634 }
635
636 /* If group is still negative, it means we did not find a group
637 that is in the ovector. Just set the first group. */
638
639 if (group < 0) group = GET2(first, 0);
640 }
641 }
642
643 /* We now have a group that is identified by number. Find the length of
644 the captured string. If a group in a non-special substitution is unset
645 when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. */
646
647 rc = pcre2_substring_length_bynumber(match_data, group, &sublength);
648 if (rc < 0)
649 {
650 if (rc == PCRE2_ERROR_NOSUBSTRING &&
651 (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
652 {
653 rc = PCRE2_ERROR_UNSET;
654 }
655 if (rc != PCRE2_ERROR_UNSET) goto PTREXIT; /* Non-unset errors */
656 if (special == 0) /* Plain substitution */
657 {
658 if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue;
659 goto PTREXIT; /* Else error */
660 }
661 }
662
663 /* If special is '+' we have a 'set' and possibly an 'unset' text,
664 both of which are reprocessed when used. If special is '-' we have a
665 default text for when the group is unset; it must be reprocessed. */
666
667 if (special != 0)
668 {
669 if (special == CHAR_MINUS)
670 {
671 if (rc == 0) goto LITERAL_SUBSTITUTE;
672 text2_start = text1_start;
673 text2_end = text1_end;
674 }
675
676 if (ptrstackptr >= PTR_STACK_SIZE) goto BAD;
677 ptrstack[ptrstackptr++] = ptr;
678 ptrstack[ptrstackptr++] = repend;
679
680 if (rc == 0)
681 {
682 ptr = text1_start;
683 repend = text1_end;
684 }
685 else
686 {
687 ptr = text2_start;
688 repend = text2_end;
689 }
690 continue;
691 }
692
693 /* Otherwise we have a literal substitution of a group's contents. */
694
695 LITERAL_SUBSTITUTE:
696 subptr = subject + ovector[group*2];
697 subptrend = subject + ovector[group*2 + 1];
698
699 /* Substitute a literal string, possibly forcing alphabetic case. */
700
701 while (subptr < subptrend)
702 {
703 GETCHARINCTEST(ch, subptr);
704 if (forcecase != 0)
705 {
706 #ifdef SUPPORT_UNICODE
707 if (utf)
708 {
709 uint32_t type = UCD_CHARTYPE(ch);
710 if (PRIV(ucp_gentype)[type] == ucp_L &&
711 type != ((forcecase > 0)? ucp_Lu : ucp_Ll))
712 ch = UCD_OTHERCASE(ch);
713 }
714 else
715 #endif
716 {
717 if (((code->tables + cbits_offset +
718 ((forcecase > 0)? cbit_upper:cbit_lower)
719 )[ch/8] & (1u << (ch%8))) == 0)
720 ch = (code->tables + fcc_offset)[ch];
721 }
722 forcecase = forcecasereset;
723 }
724
725 #ifdef SUPPORT_UNICODE
726 if (utf) chlen = PRIV(ord2utf)(ch, temp); else
727 #endif
728 {
729 temp[0] = ch;
730 chlen = 1;
731 }
732 CHECKMEMCPY(temp, chlen);
733 }
734 }
735 }
736
737 /* Handle an escape sequence in extended mode. We can use check_escape()
738 to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but
739 the case-forcing escapes are not supported in pcre2_compile() so must be
740 recognized here. */
741
742 else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
743 *ptr == CHAR_BACKSLASH)
744 {
745 int errorcode;
746
747 if (ptr < repend - 1) switch (ptr[1])
748 {
749 case CHAR_L:
750 forcecase = forcecasereset = -1;
751 ptr += 2;
752 continue;
753
754 case CHAR_l:
755 forcecase = -1;
756 forcecasereset = 0;
757 ptr += 2;
758 continue;
759
760 case CHAR_U:
761 forcecase = forcecasereset = 1;
762 ptr += 2;
763 continue;
764
765 case CHAR_u:
766 forcecase = 1;
767 forcecasereset = 0;
768 ptr += 2;
769 continue;
770
771 default:
772 break;
773 }
774
775 ptr++; /* Point after \ */
776 rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode,
777 code->overall_options, code->extra_options, FALSE, NULL);
778 if (errorcode != 0) goto BADESCAPE;
779
780 switch(rc)
781 {
782 case ESC_E:
783 forcecase = forcecasereset = 0;
784 continue;
785
786 case ESC_Q:
787 literal = TRUE;
788 continue;
789
790 case 0: /* Data character */
791 goto LITERAL;
792
793 default:
794 goto BADESCAPE;
795 }
796 }
797
798 /* Handle a literal code unit */
799
800 else
801 {
802 LOADLITERAL:
803 GETCHARINCTEST(ch, ptr); /* Get character value, increment pointer */
804
805 LITERAL:
806 if (forcecase != 0)
807 {
808 #ifdef SUPPORT_UNICODE
809 if (utf)
810 {
811 uint32_t type = UCD_CHARTYPE(ch);
812 if (PRIV(ucp_gentype)[type] == ucp_L &&
813 type != ((forcecase > 0)? ucp_Lu : ucp_Ll))
814 ch = UCD_OTHERCASE(ch);
815 }
816 else
817 #endif
818 {
819 if (((code->tables + cbits_offset +
820 ((forcecase > 0)? cbit_upper:cbit_lower)
821 )[ch/8] & (1u << (ch%8))) == 0)
822 ch = (code->tables + fcc_offset)[ch];
823 }
824 forcecase = forcecasereset;
825 }
826
827 #ifdef SUPPORT_UNICODE
828 if (utf) chlen = PRIV(ord2utf)(ch, temp); else
829 #endif
830 {
831 temp[0] = ch;
832 chlen = 1;
833 }
834 CHECKMEMCPY(temp, chlen);
835 } /* End handling a literal code unit */
836 } /* End of loop for scanning the replacement. */
837
838 /* The replacement has been copied to the output, or its size has been
839 remembered. Do the callout if there is one and we have done an actual
840 replacement. */
841
842 if (!overflowed && mcontext != NULL && mcontext->substitute_callout != NULL)
843 {
844 scb.subscount = subs;
845 scb.output_offsets[1] = buff_offset;
846 rc = mcontext->substitute_callout(&scb, mcontext->substitute_callout_data);
847
848 /* A non-zero return means cancel this substitution. Instead, copy the
849 matched string fragment. */
850
851 if (rc != 0)
852 {
853 PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0];
854 PCRE2_SIZE oldlength = ovector[1] - ovector[0];
855
856 buff_offset -= newlength;
857 lengthleft += newlength;
858 CHECKMEMCPY(subject + ovector[0], oldlength);
859
860 /* A negative return means do not do any more. */
861
862 if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL);
863 }
864 }
865
866 /* Save the details of this match. See above for how this data is used. If we
867 matched an empty string, do the magic for global matches. Finally, update the
868 start offset to point to the rest of the subject string. */
869
870 ovecsave[0] = ovector[0];
871 ovecsave[1] = ovector[1];
872 ovecsave[2] = start_offset;
873
874 goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 :
875 PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART;
876 start_offset = ovector[1];
877 } while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0); /* Repeat "do" loop */
878
879 /* Copy the rest of the subject. */
880
881 fraglength = length - start_offset;
882 CHECKMEMCPY(subject + start_offset, fraglength);
883 temp[0] = 0;
884 CHECKMEMCPY(temp , 1);
885
886 /* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set,
887 and matching has carried on after a full buffer, in order to compute the length
888 needed. Otherwise, an overflow generates an immediate error return. */
889
890 if (overflowed)
891 {
892 rc = PCRE2_ERROR_NOMEMORY;
893 *blength = buff_length + extra_needed;
894 }
895
896 /* After a successful execution, return the number of substitutions and set the
897 length of buffer used, excluding the trailing zero. */
898
899 else
900 {
901 rc = subs;
902 *blength = buff_offset - 1;
903 }
904
905 EXIT:
906 if (match_data_created) pcre2_match_data_free(match_data);
907 else match_data->rc = rc;
908 return rc;
909
910 NOROOM:
911 rc = PCRE2_ERROR_NOMEMORY;
912 goto EXIT;
913
914 BAD:
915 rc = PCRE2_ERROR_BADREPLACEMENT;
916 goto PTREXIT;
917
918 BADESCAPE:
919 rc = PCRE2_ERROR_BADREPESCAPE;
920
921 PTREXIT:
922 *blength = (PCRE2_SIZE)(ptr - replacement);
923 goto EXIT;
924 }
925
926 /* End of pcre2_substitute.c */
927