• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10           New API code Copyright (c) 2016-2020 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45 
46 #include "pcre2_internal.h"
47 
/* Number of slots in the stack that pcre2_substitute() uses to remember where
to resume after processing a nested ${name:+...} or ${name:-...} text. Each
nesting level pushes two pointers (the resume position and the saved end of the
replacement), so the usable nesting depth is PTR_STACK_SIZE/2 and the value
must be even. */

#define PTR_STACK_SIZE 20

#if (PTR_STACK_SIZE % 2) != 0
#error "PTR_STACK_SIZE must be even: entries are pushed and popped in pairs"
#endif

/* All the option bits that are specific to pcre2_substitute(). They are
separated from the caller's options and removed before the remaining bits are
passed on to pcre2_match(). */

#define SUBSTITUTE_OPTIONS \
  (PCRE2_SUBSTITUTE_EXTENDED|PCRE2_SUBSTITUTE_GLOBAL| \
   PCRE2_SUBSTITUTE_LITERAL|PCRE2_SUBSTITUTE_MATCHED| \
   PCRE2_SUBSTITUTE_OVERFLOW_LENGTH|PCRE2_SUBSTITUTE_REPLACEMENT_ONLY| \
   PCRE2_SUBSTITUTE_UNKNOWN_UNSET|PCRE2_SUBSTITUTE_UNSET_EMPTY)
55 
56 
57 
58 /*************************************************
59 *           Find end of substitute text          *
60 *************************************************/
61 
62 /* In extended mode, we recognize ${name:+set text:unset text} and similar
63 constructions. This requires the identification of unescaped : and }
64 characters. This function scans for such. It must deal with nested ${
65 constructions. The pointer to the text is updated, either to the required end
66 character, or to where an error was detected.
67 
68 Arguments:
69   code      points to the compiled expression (for options)
70   ptrptr    points to the pointer to the start of the text (updated)
71   ptrend    end of the whole string
72   last      TRUE if the last expected string (only } recognized)
73 
74 Returns:    0 on success
75             negative error code on failure
76 */
77 
78 static int
find_text_end(const pcre2_code * code,PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,BOOL last)79 find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend,
80   BOOL last)
81 {
82 int rc = 0;
83 uint32_t nestlevel = 0;
84 BOOL literal = FALSE;
85 PCRE2_SPTR ptr = *ptrptr;
86 
87 for (; ptr < ptrend; ptr++)
88   {
89   if (literal)
90     {
91     if (ptr[0] == CHAR_BACKSLASH && ptr < ptrend - 1 && ptr[1] == CHAR_E)
92       {
93       literal = FALSE;
94       ptr += 1;
95       }
96     }
97 
98   else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
99     {
100     if (nestlevel == 0) goto EXIT;
101     nestlevel--;
102     }
103 
104   else if (*ptr == CHAR_COLON && !last && nestlevel == 0) goto EXIT;
105 
106   else if (*ptr == CHAR_DOLLAR_SIGN)
107     {
108     if (ptr < ptrend - 1 && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
109       {
110       nestlevel++;
111       ptr += 1;
112       }
113     }
114 
115   else if (*ptr == CHAR_BACKSLASH)
116     {
117     int erc;
118     int errorcode;
119     uint32_t ch;
120 
121     if (ptr < ptrend - 1) switch (ptr[1])
122       {
123       case CHAR_L:
124       case CHAR_l:
125       case CHAR_U:
126       case CHAR_u:
127       ptr += 1;
128       continue;
129       }
130 
131     ptr += 1;  /* Must point after \ */
132     erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode,
133       code->overall_options, code->extra_options, FALSE, NULL);
134     ptr -= 1;  /* Back to last code unit of escape */
135     if (errorcode != 0)
136       {
137       rc = errorcode;
138       goto EXIT;
139       }
140 
141     switch(erc)
142       {
143       case 0:      /* Data character */
144       case ESC_E:  /* Isolated \E is ignored */
145       break;
146 
147       case ESC_Q:
148       literal = TRUE;
149       break;
150 
151       default:
152       rc = PCRE2_ERROR_BADREPESCAPE;
153       goto EXIT;
154       }
155     }
156   }
157 
158 rc = PCRE2_ERROR_REPMISSINGBRACE;   /* Terminator not found */
159 
160 EXIT:
161 *ptrptr = ptr;
162 return rc;
163 }
164 
165 
166 
167 /*************************************************
168 *              Match and substitute              *
169 *************************************************/
170 
171 /* This function applies a compiled re to a subject string and creates a new
172 string with substitutions. The first 7 arguments are the same as for
173 pcre2_match(). Either string length may be PCRE2_ZERO_TERMINATED.
174 
175 Arguments:
176   code            points to the compiled expression
177   subject         points to the subject string
178   length          length of subject string (may contain binary zeros)
179   start_offset    where to start in the subject string
180   options         option bits
181   match_data      points to a match_data block, or is NULL
182   mcontext        points to a PCRE2 match context, or is NULL
183   replacement     points to the replacement string
184   rlength         length of replacement string
185   buffer          where to put the substituted string
186   blength         points to length of buffer; updated to length of string
187 
188 Returns:          >= 0 number of substitutions made
189                   < 0 an error code
190                   PCRE2_ERROR_BADREPLACEMENT means invalid use of $
191 */
192 
193 /* This macro checks for space in the buffer before copying into it. On
194 overflow, either give an error immediately, or keep on, accumulating the
195 length. */
196 
/* Append "length" code units starting at "from" to the output buffer, or, if
they do not fit, either jump to NOROOM or (when
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set) switch to overflow mode and add up the
extra space that would have been needed. Once in overflow mode nothing more is
copied.

The macro relies on these names in the enclosing function: buffer, buff_offset,
lengthleft, extra_needed, overflowed, suboptions, and the label NOROOM. It is
wrapped in do { } while (0) so that it behaves as a single statement (for
example before an "else"), and its arguments are parenthesized. Note that
"length" is evaluated more than once, so it must not have side effects. */

#define CHECKMEMCPY(from,length) \
  do \
    { \
    if (!overflowed && lengthleft < (length)) \
      { \
      if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
      overflowed = TRUE; \
      extra_needed = (length) - lengthleft; \
      } \
    else if (overflowed) \
      { \
      extra_needed += (length); \
      } \
    else \
      { \
      memcpy(buffer + buff_offset, (from), CU2BYTES((length))); \
      buff_offset += (length); \
      lengthleft -= (length); \
      } \
    } \
  while (0)
216 
217 /* Here's the function */
218 
219 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substitute(const pcre2_code * code,PCRE2_SPTR subject,PCRE2_SIZE length,PCRE2_SIZE start_offset,uint32_t options,pcre2_match_data * match_data,pcre2_match_context * mcontext,PCRE2_SPTR replacement,PCRE2_SIZE rlength,PCRE2_UCHAR * buffer,PCRE2_SIZE * blength)220 pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
221   PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
222   pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength,
223   PCRE2_UCHAR *buffer, PCRE2_SIZE *blength)
224 {
225 int rc;
226 int subs;
227 int forcecase = 0;
228 int forcecasereset = 0;
229 uint32_t ovector_count;
230 uint32_t goptions = 0;
231 uint32_t suboptions;
232 pcre2_match_data *internal_match_data = NULL;
233 BOOL escaped_literal = FALSE;
234 BOOL overflowed = FALSE;
235 BOOL use_existing_match;
236 BOOL replacement_only;
237 #ifdef SUPPORT_UNICODE
238 BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
239 BOOL ucp = (code->overall_options & PCRE2_UCP) != 0;
240 #endif
241 PCRE2_UCHAR temp[6];
242 PCRE2_SPTR ptr;
243 PCRE2_SPTR repend;
244 PCRE2_SIZE extra_needed = 0;
245 PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength;
246 PCRE2_SIZE *ovector;
247 PCRE2_SIZE ovecsave[3];
248 pcre2_substitute_callout_block scb;
249 
250 /* General initialization */
251 
252 buff_offset = 0;
253 lengthleft = buff_length = *blength;
254 *blength = PCRE2_UNSET;
255 ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET;
256 
257 /* Partial matching is not valid. This must come after setting *blength to
258 PCRE2_UNSET, so as not to imply an offset in the replacement. */
259 
260 if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0)
261   return PCRE2_ERROR_BADOPTION;
262 
263 /* Check for using a match that has already happened. Note that the subject
264 pointer in the match data may be NULL after a no-match. */
265 
266 use_existing_match = ((options & PCRE2_SUBSTITUTE_MATCHED) != 0);
267 replacement_only = ((options & PCRE2_SUBSTITUTE_REPLACEMENT_ONLY) != 0);
268 
269 /* If starting from an existing match, there must be an externally provided
270 match data block. We create an internal match_data block in two cases: (a) an
271 external one is not supplied (and we are not starting from an existing match);
272 (b) an existing match is to be used for the first substitution. In the latter
273 case, we copy the existing match into the internal block. This ensures that no
274 changes are made to the existing match data block. */
275 
276 if (match_data == NULL)
277   {
278   pcre2_general_context *gcontext;
279   if (use_existing_match) return PCRE2_ERROR_NULL;
280   gcontext = (mcontext == NULL)?
281     (pcre2_general_context *)code :
282     (pcre2_general_context *)mcontext;
283   match_data = internal_match_data =
284     pcre2_match_data_create_from_pattern(code, gcontext);
285   if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
286   }
287 
288 else if (use_existing_match)
289   {
290   pcre2_general_context *gcontext = (mcontext == NULL)?
291     (pcre2_general_context *)code :
292     (pcre2_general_context *)mcontext;
293   int pairs = (code->top_bracket + 1 < match_data->oveccount)?
294     code->top_bracket + 1 : match_data->oveccount;
295   internal_match_data = pcre2_match_data_create(match_data->oveccount,
296     gcontext);
297   if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
298   memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector)
299     + 2*pairs*sizeof(PCRE2_SIZE));
300   match_data = internal_match_data;
301   }
302 
303 /* Remember ovector details */
304 
305 ovector = pcre2_get_ovector_pointer(match_data);
306 ovector_count = pcre2_get_ovector_count(match_data);
307 
308 /* Fixed things in the callout block */
309 
310 scb.version = 0;
311 scb.input = subject;
312 scb.output = (PCRE2_SPTR)buffer;
313 scb.ovector = ovector;
314 
315 /* Find lengths of zero-terminated strings and the end of the replacement. */
316 
317 if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject);
318 if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement);
319 repend = replacement + rlength;
320 
321 /* Check UTF replacement string if necessary. */
322 
323 #ifdef SUPPORT_UNICODE
324 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
325   {
326   rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar));
327   if (rc != 0)
328     {
329     match_data->leftchar = 0;
330     goto EXIT;
331     }
332   }
333 #endif  /* SUPPORT_UNICODE */
334 
335 /* Save the substitute options and remove them from the match options. */
336 
337 suboptions = options & SUBSTITUTE_OPTIONS;
338 options &= ~SUBSTITUTE_OPTIONS;
339 
340 /* Error if the start match offset is greater than the length of the subject. */
341 
342 if (start_offset > length)
343   {
344   match_data->leftchar = 0;
345   rc = PCRE2_ERROR_BADOFFSET;
346   goto EXIT;
347   }
348 
349 /* Copy up to the start offset, unless only the replacement is required. */
350 
351 if (!replacement_only) CHECKMEMCPY(subject, start_offset);
352 
353 /* Loop for global substituting. If PCRE2_SUBSTITUTE_MATCHED is set, the first
354 match is taken from the match_data that was passed in. */
355 
356 subs = 0;
357 do
358   {
359   PCRE2_SPTR ptrstack[PTR_STACK_SIZE];
360   uint32_t ptrstackptr = 0;
361 
362   if (use_existing_match)
363     {
364     rc = match_data->rc;
365     use_existing_match = FALSE;
366     }
367   else rc = pcre2_match(code, subject, length, start_offset, options|goptions,
368     match_data, mcontext);
369 
370 #ifdef SUPPORT_UNICODE
371   if (utf) options |= PCRE2_NO_UTF_CHECK;  /* Only need to check once */
372 #endif
373 
374   /* Any error other than no match returns the error code. No match when not
375   doing the special after-empty-match global rematch, or when at the end of the
376   subject, breaks the global loop. Otherwise, advance the starting point by one
377   character, copying it to the output, and try again. */
378 
379   if (rc < 0)
380     {
381     PCRE2_SIZE save_start;
382 
383     if (rc != PCRE2_ERROR_NOMATCH) goto EXIT;
384     if (goptions == 0 || start_offset >= length) break;
385 
386     /* Advance by one code point. Then, if CRLF is a valid newline sequence and
387     we have advanced into the middle of it, advance one more code point. In
388     other words, do not start in the middle of CRLF, even if CR and LF on their
389     own are valid newlines. */
390 
391     save_start = start_offset++;
392     if (subject[start_offset-1] == CHAR_CR &&
393         code->newline_convention != PCRE2_NEWLINE_CR &&
394         code->newline_convention != PCRE2_NEWLINE_LF &&
395         start_offset < length &&
396         subject[start_offset] == CHAR_LF)
397       start_offset++;
398 
399     /* Otherwise, in UTF mode, advance past any secondary code points. */
400 
401     else if ((code->overall_options & PCRE2_UTF) != 0)
402       {
403 #if PCRE2_CODE_UNIT_WIDTH == 8
404       while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80)
405         start_offset++;
406 #elif PCRE2_CODE_UNIT_WIDTH == 16
407       while (start_offset < length &&
408             (subject[start_offset] & 0xfc00) == 0xdc00)
409         start_offset++;
410 #endif
411       }
412 
413     /* Copy what we have advanced past (unless not required), reset the special
414     global options, and continue to the next match. */
415 
416     fraglength = start_offset - save_start;
417     if (!replacement_only) CHECKMEMCPY(subject + save_start, fraglength);
418     goptions = 0;
419     continue;
420     }
421 
422   /* Handle a successful match. Matches that use \K to end before they start
423   or start before the current point in the subject are not supported. */
424 
425   if (ovector[1] < ovector[0] || ovector[0] < start_offset)
426     {
427     rc = PCRE2_ERROR_BADSUBSPATTERN;
428     goto EXIT;
429     }
430 
431   /* Check for the same match as previous. This is legitimate after matching an
432   empty string that starts after the initial match offset. We have tried again
433   at the match point in case the pattern is one like /(?<=\G.)/ which can never
434   match at its starting point, so running the match achieves the bumpalong. If
435   we do get the same (null) match at the original match point, it isn't such a
436   pattern, so we now do the empty string magic. In all other cases, a repeat
437   match should never occur. */
438 
439   if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1])
440     {
441     if (ovector[0] == ovector[1] && ovecsave[2] != start_offset)
442       {
443       goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
444       ovecsave[2] = start_offset;
445       continue;    /* Back to the top of the loop */
446       }
447     rc = PCRE2_ERROR_INTERNAL_DUPMATCH;
448     goto EXIT;
449     }
450 
451   /* Count substitutions with a paranoid check for integer overflow; surely no
452   real call to this function would ever hit this! */
453 
454   if (subs == INT_MAX)
455     {
456     rc = PCRE2_ERROR_TOOMANYREPLACE;
457     goto EXIT;
458     }
459   subs++;
460 
461   /* Copy the text leading up to the match (unless not required), and remember
462   where the insert begins and how many ovector pairs are set. */
463 
464   if (rc == 0) rc = ovector_count;
465   fraglength = ovector[0] - start_offset;
466   if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength);
467   scb.output_offsets[0] = buff_offset;
468   scb.oveccount = rc;
469 
470   /* Process the replacement string. If the entire replacement is literal, just
471   copy it with length check. */
472 
473   ptr = replacement;
474   if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0)
475     {
476     CHECKMEMCPY(ptr, rlength);
477     }
478 
479   /* Within a non-literal replacement, which must be scanned character by
480   character, local literal mode can be set by \Q, but only in extended mode
481   when backslashes are being interpreted. In extended mode we must handle
482   nested substrings that are to be reprocessed. */
483 
484   else for (;;)
485     {
486     uint32_t ch;
487     unsigned int chlen;
488 
489     /* If at the end of a nested substring, pop the stack. */
490 
491     if (ptr >= repend)
492       {
493       if (ptrstackptr == 0) break;       /* End of replacement string */
494       repend = ptrstack[--ptrstackptr];
495       ptr = ptrstack[--ptrstackptr];
496       continue;
497       }
498 
499     /* Handle the next character */
500 
501     if (escaped_literal)
502       {
503       if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E)
504         {
505         escaped_literal = FALSE;
506         ptr += 2;
507         continue;
508         }
509       goto LOADLITERAL;
510       }
511 
512     /* Not in literal mode. */
513 
514     if (*ptr == CHAR_DOLLAR_SIGN)
515       {
516       int group, n;
517       uint32_t special = 0;
518       BOOL inparens;
519       BOOL star;
520       PCRE2_SIZE sublength;
521       PCRE2_SPTR text1_start = NULL;
522       PCRE2_SPTR text1_end = NULL;
523       PCRE2_SPTR text2_start = NULL;
524       PCRE2_SPTR text2_end = NULL;
525       PCRE2_UCHAR next;
526       PCRE2_UCHAR name[33];
527 
528       if (++ptr >= repend) goto BAD;
529       if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL;
530 
531       group = -1;
532       n = 0;
533       inparens = FALSE;
534       star = FALSE;
535 
536       if (next == CHAR_LEFT_CURLY_BRACKET)
537         {
538         if (++ptr >= repend) goto BAD;
539         next = *ptr;
540         inparens = TRUE;
541         }
542 
543       if (next == CHAR_ASTERISK)
544         {
545         if (++ptr >= repend) goto BAD;
546         next = *ptr;
547         star = TRUE;
548         }
549 
550       if (!star && next >= CHAR_0 && next <= CHAR_9)
551         {
552         group = next - CHAR_0;
553         while (++ptr < repend)
554           {
555           next = *ptr;
556           if (next < CHAR_0 || next > CHAR_9) break;
557           group = group * 10 + next - CHAR_0;
558 
559           /* A check for a number greater than the hightest captured group
560           is sufficient here; no need for a separate overflow check. If unknown
561           groups are to be treated as unset, just skip over any remaining
562           digits and carry on. */
563 
564           if (group > code->top_bracket)
565             {
566             if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
567               {
568               while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9);
569               break;
570               }
571             else
572               {
573               rc = PCRE2_ERROR_NOSUBSTRING;
574               goto PTREXIT;
575               }
576             }
577           }
578         }
579       else
580         {
581         const uint8_t *ctypes = code->tables + ctypes_offset;
582         while (MAX_255(next) && (ctypes[next] & ctype_word) != 0)
583           {
584           name[n++] = next;
585           if (n > 32) goto BAD;
586           if (++ptr >= repend) break;
587           next = *ptr;
588           }
589         if (n == 0) goto BAD;
590         name[n] = 0;
591         }
592 
593       /* In extended mode we recognize ${name:+set text:unset text} and
594       ${name:-default text}. */
595 
596       if (inparens)
597         {
598         if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
599              !star && ptr < repend - 2 && next == CHAR_COLON)
600           {
601           special = *(++ptr);
602           if (special != CHAR_PLUS && special != CHAR_MINUS)
603             {
604             rc = PCRE2_ERROR_BADSUBSTITUTION;
605             goto PTREXIT;
606             }
607 
608           text1_start = ++ptr;
609           rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS);
610           if (rc != 0) goto PTREXIT;
611           text1_end = ptr;
612 
613           if (special == CHAR_PLUS && *ptr == CHAR_COLON)
614             {
615             text2_start = ++ptr;
616             rc = find_text_end(code, &ptr, repend, TRUE);
617             if (rc != 0) goto PTREXIT;
618             text2_end = ptr;
619             }
620           }
621 
622         else
623           {
624           if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET)
625             {
626             rc = PCRE2_ERROR_REPMISSINGBRACE;
627             goto PTREXIT;
628             }
629           }
630 
631         ptr++;
632         }
633 
634       /* Have found a syntactically correct group number or name, or *name.
635       Only *MARK is currently recognized. */
636 
637       if (star)
638         {
639         if (PRIV(strcmp_c8)(name, STRING_MARK) == 0)
640           {
641           PCRE2_SPTR mark = pcre2_get_mark(match_data);
642           if (mark != NULL)
643             {
644             PCRE2_SPTR mark_start = mark;
645             while (*mark != 0) mark++;
646             fraglength = mark - mark_start;
647             CHECKMEMCPY(mark_start, fraglength);
648             }
649           }
650         else goto BAD;
651         }
652 
653       /* Substitute the contents of a group. We don't use substring_copy
654       functions any more, in order to support case forcing. */
655 
656       else
657         {
658         PCRE2_SPTR subptr, subptrend;
659 
660         /* Find a number for a named group. In case there are duplicate names,
661         search for the first one that is set. If the name is not found when
662         PCRE2_SUBSTITUTE_UNKNOWN_EMPTY is set, set the group number to a
663         non-existent group. */
664 
665         if (group < 0)
666           {
667           PCRE2_SPTR first, last, entry;
668           rc = pcre2_substring_nametable_scan(code, name, &first, &last);
669           if (rc == PCRE2_ERROR_NOSUBSTRING &&
670               (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
671             {
672             group = code->top_bracket + 1;
673             }
674           else
675             {
676             if (rc < 0) goto PTREXIT;
677             for (entry = first; entry <= last; entry += rc)
678               {
679               uint32_t ng = GET2(entry, 0);
680               if (ng < ovector_count)
681                 {
682                 if (group < 0) group = ng;          /* First in ovector */
683                 if (ovector[ng*2] != PCRE2_UNSET)
684                   {
685                   group = ng;                       /* First that is set */
686                   break;
687                   }
688                 }
689               }
690 
691             /* If group is still negative, it means we did not find a group
692             that is in the ovector. Just set the first group. */
693 
694             if (group < 0) group = GET2(first, 0);
695             }
696           }
697 
698         /* We now have a group that is identified by number. Find the length of
699         the captured string. If a group in a non-special substitution is unset
700         when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. */
701 
702         rc = pcre2_substring_length_bynumber(match_data, group, &sublength);
703         if (rc < 0)
704           {
705           if (rc == PCRE2_ERROR_NOSUBSTRING &&
706               (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
707             {
708             rc = PCRE2_ERROR_UNSET;
709             }
710           if (rc != PCRE2_ERROR_UNSET) goto PTREXIT;  /* Non-unset errors */
711           if (special == 0)                           /* Plain substitution */
712             {
713             if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue;
714             goto PTREXIT;                             /* Else error */
715             }
716           }
717 
718         /* If special is '+' we have a 'set' and possibly an 'unset' text,
719         both of which are reprocessed when used. If special is '-' we have a
720         default text for when the group is unset; it must be reprocessed. */
721 
722         if (special != 0)
723           {
724           if (special == CHAR_MINUS)
725             {
726             if (rc == 0) goto LITERAL_SUBSTITUTE;
727             text2_start = text1_start;
728             text2_end = text1_end;
729             }
730 
731           if (ptrstackptr >= PTR_STACK_SIZE) goto BAD;
732           ptrstack[ptrstackptr++] = ptr;
733           ptrstack[ptrstackptr++] = repend;
734 
735           if (rc == 0)
736             {
737             ptr = text1_start;
738             repend = text1_end;
739             }
740           else
741             {
742             ptr = text2_start;
743             repend = text2_end;
744             }
745           continue;
746           }
747 
748         /* Otherwise we have a literal substitution of a group's contents. */
749 
750         LITERAL_SUBSTITUTE:
751         subptr = subject + ovector[group*2];
752         subptrend = subject + ovector[group*2 + 1];
753 
754         /* Substitute a literal string, possibly forcing alphabetic case. */
755 
756         while (subptr < subptrend)
757           {
758           GETCHARINCTEST(ch, subptr);
759           if (forcecase != 0)
760             {
761 #ifdef SUPPORT_UNICODE
762             if (utf || ucp)
763               {
764               uint32_t type = UCD_CHARTYPE(ch);
765               if (PRIV(ucp_gentype)[type] == ucp_L &&
766                   type != ((forcecase > 0)? ucp_Lu : ucp_Ll))
767                 ch = UCD_OTHERCASE(ch);
768               }
769             else
770 #endif
771               {
772               if (((code->tables + cbits_offset +
773                   ((forcecase > 0)? cbit_upper:cbit_lower)
774                   )[ch/8] & (1u << (ch%8))) == 0)
775                 ch = (code->tables + fcc_offset)[ch];
776               }
777             forcecase = forcecasereset;
778             }
779 
780 #ifdef SUPPORT_UNICODE
781           if (utf) chlen = PRIV(ord2utf)(ch, temp); else
782 #endif
783             {
784             temp[0] = ch;
785             chlen = 1;
786             }
787           CHECKMEMCPY(temp, chlen);
788           }
789         }
790       }
791 
792     /* Handle an escape sequence in extended mode. We can use check_escape()
793     to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but
794     the case-forcing escapes are not supported in pcre2_compile() so must be
795     recognized here. */
796 
797     else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
798               *ptr == CHAR_BACKSLASH)
799       {
800       int errorcode;
801 
802       if (ptr < repend - 1) switch (ptr[1])
803         {
804         case CHAR_L:
805         forcecase = forcecasereset = -1;
806         ptr += 2;
807         continue;
808 
809         case CHAR_l:
810         forcecase = -1;
811         forcecasereset = 0;
812         ptr += 2;
813         continue;
814 
815         case CHAR_U:
816         forcecase = forcecasereset = 1;
817         ptr += 2;
818         continue;
819 
820         case CHAR_u:
821         forcecase = 1;
822         forcecasereset = 0;
823         ptr += 2;
824         continue;
825 
826         default:
827         break;
828         }
829 
830       ptr++;  /* Point after \ */
831       rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode,
832         code->overall_options, code->extra_options, FALSE, NULL);
833       if (errorcode != 0) goto BADESCAPE;
834 
835       switch(rc)
836         {
837         case ESC_E:
838         forcecase = forcecasereset = 0;
839         continue;
840 
841         case ESC_Q:
842         escaped_literal = TRUE;
843         continue;
844 
845         case 0:      /* Data character */
846         goto LITERAL;
847 
848         default:
849         goto BADESCAPE;
850         }
851       }
852 
853     /* Handle a literal code unit */
854 
855     else
856       {
857       LOADLITERAL:
858       GETCHARINCTEST(ch, ptr);    /* Get character value, increment pointer */
859 
860       LITERAL:
861       if (forcecase != 0)
862         {
863 #ifdef SUPPORT_UNICODE
864         if (utf || ucp)
865           {
866           uint32_t type = UCD_CHARTYPE(ch);
867           if (PRIV(ucp_gentype)[type] == ucp_L &&
868               type != ((forcecase > 0)? ucp_Lu : ucp_Ll))
869             ch = UCD_OTHERCASE(ch);
870           }
871         else
872 #endif
873           {
874           if (((code->tables + cbits_offset +
875               ((forcecase > 0)? cbit_upper:cbit_lower)
876               )[ch/8] & (1u << (ch%8))) == 0)
877             ch = (code->tables + fcc_offset)[ch];
878           }
879         forcecase = forcecasereset;
880         }
881 
882 #ifdef SUPPORT_UNICODE
883       if (utf) chlen = PRIV(ord2utf)(ch, temp); else
884 #endif
885         {
886         temp[0] = ch;
887         chlen = 1;
888         }
889       CHECKMEMCPY(temp, chlen);
890       } /* End handling a literal code unit */
891     }   /* End of loop for scanning the replacement. */
892 
893   /* The replacement has been copied to the output, or its size has been
894   remembered. Do the callout if there is one and we have done an actual
895   replacement. */
896 
897   if (!overflowed && mcontext != NULL && mcontext->substitute_callout != NULL)
898     {
899     scb.subscount = subs;
900     scb.output_offsets[1] = buff_offset;
901     rc = mcontext->substitute_callout(&scb, mcontext->substitute_callout_data);
902 
903     /* A non-zero return means cancel this substitution. Instead, copy the
904     matched string fragment. */
905 
906     if (rc != 0)
907       {
908       PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0];
909       PCRE2_SIZE oldlength = ovector[1] - ovector[0];
910 
911       buff_offset -= newlength;
912       lengthleft += newlength;
913       if (!replacement_only) CHECKMEMCPY(subject + ovector[0], oldlength);
914 
915       /* A negative return means do not do any more. */
916 
917       if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL);
918       }
919     }
920 
921   /* Save the details of this match. See above for how this data is used. If we
922   matched an empty string, do the magic for global matches. Update the start
923   offset to point to the rest of the subject string. If we re-used an existing
924   match for the first match, switch to the internal match data block. */
925 
926   ovecsave[0] = ovector[0];
927   ovecsave[1] = ovector[1];
928   ovecsave[2] = start_offset;
929 
930   goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 :
931     PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART;
932   start_offset = ovector[1];
933   } while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0);  /* Repeat "do" loop */
934 
935 /* Copy the rest of the subject unless not required, and terminate the output
936 with a binary zero. */
937 
938 if (!replacement_only)
939   {
940   fraglength = length - start_offset;
941   CHECKMEMCPY(subject + start_offset, fraglength);
942   }
943 
944 temp[0] = 0;
945 CHECKMEMCPY(temp, 1);
946 
947 /* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set,
948 and matching has carried on after a full buffer, in order to compute the length
949 needed. Otherwise, an overflow generates an immediate error return. */
950 
951 if (overflowed)
952   {
953   rc = PCRE2_ERROR_NOMEMORY;
954   *blength = buff_length + extra_needed;
955   }
956 
957 /* After a successful execution, return the number of substitutions and set the
958 length of buffer used, excluding the trailing zero. */
959 
960 else
961   {
962   rc = subs;
963   *blength = buff_offset - 1;
964   }
965 
966 EXIT:
967 if (internal_match_data != NULL) pcre2_match_data_free(internal_match_data);
968   else match_data->rc = rc;
969 return rc;
970 
971 NOROOM:
972 rc = PCRE2_ERROR_NOMEMORY;
973 goto EXIT;
974 
975 BAD:
976 rc = PCRE2_ERROR_BADREPLACEMENT;
977 goto PTREXIT;
978 
979 BADESCAPE:
980 rc = PCRE2_ERROR_BADREPESCAPE;
981 
982 PTREXIT:
983 *blength = (PCRE2_SIZE)(ptr - replacement);
984 goto EXIT;
985 }
986 
987 /* End of pcre2_substitute.c */
988