1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45
46 #define NLBLOCK cb /* Block containing newline information */
47 #define PSSTART start_pattern /* Field containing processed string start */
48 #define PSEND end_pattern /* Field containing processed string end */
49
50 #include "pcre2_internal.h"
51
52 /* In rare error cases debugging might require calling pcre2_printint(). */
53
54 #if 0
55 #ifdef EBCDIC
56 #define PRINTABLE(c) ((c) >= 64 && (c) < 255)
57 #else
58 #define PRINTABLE(c) ((c) >= 32 && (c) < 127)
59 #endif
60 #include "pcre2_printint.c"
61 #define CALL_PRINTINT
62 #endif
63
64 /* There are a few things that vary with different code unit sizes. Handle them
65 by defining macros in order to minimize #if usage. */
66
67 #if PCRE2_CODE_UNIT_WIDTH == 8
68 #define STRING_UTFn_RIGHTPAR STRING_UTF8_RIGHTPAR, 5
69 #define XDIGIT(c) xdigitab[c]
70
71 #else /* Either 16-bit or 32-bit */
72 #define XDIGIT(c) (MAX_255(c)? xdigitab[c] : 0xff)
73
74 #if PCRE2_CODE_UNIT_WIDTH == 16
75 #define STRING_UTFn_RIGHTPAR STRING_UTF16_RIGHTPAR, 6
76
77 #else /* 32-bit */
78 #define STRING_UTFn_RIGHTPAR STRING_UTF32_RIGHTPAR, 6
79 #endif
80 #endif
81
/* Forward declarations (prototypes) to allow mutual recursion */
83
84 static unsigned int
85 add_list_to_class(uint8_t *, PCRE2_UCHAR **, uint32_t, compile_block *,
86 const uint32_t *, unsigned int);
87
88 static BOOL
89 compile_regex(uint32_t, PCRE2_UCHAR **, PCRE2_SPTR *, int *, BOOL, BOOL,
90 uint32_t, int, uint32_t *, int32_t *, uint32_t *, int32_t *,
91 branch_chain *, compile_block *, size_t *);
92
93
94
95 /*************************************************
96 * Code parameters and static tables *
97 *************************************************/
98
99 /* This value specifies the size of stack workspace, which is used in different
100 ways in the different pattern scans. The group-identifying pre-scan uses it to
101 handle nesting, and needs it to be 16-bit aligned.
102
103 During the first compiling phase, when determining how much memory is required,
104 the regex is partly compiled into this space, but the compiled parts are
105 discarded as soon as they can be, so that hopefully there will never be an
106 overrun. The code does, however, check for an overrun, which can occur for
107 pathological patterns. The size of the workspace depends on LINK_SIZE because
108 the length of compiled items varies with this.
109
110 In the real compile phase, the workspace is used for remembering data about
111 numbered groups, provided there are not too many of them (if there are, extra
112 memory is acquired). For this phase the memory must be 32-bit aligned. Having
113 defined the size in code units, we set up C32_WORK_SIZE as the number of
114 elements in the 32-bit vector. */
115
116 #define COMPILE_WORK_SIZE (2048*LINK_SIZE) /* Size in code units */
117
118 #define C32_WORK_SIZE \
119 ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint32_t))
120
121 /* The overrun tests check for a slightly smaller size so that they detect the
122 overrun before it actually does run off the end of the data block. */
123
124 #define WORK_SIZE_SAFETY_MARGIN (100)
125
126 /* This value determines the size of the initial vector that is used for
127 remembering named groups during the pre-compile. It is allocated on the stack,
128 but if it is too small, it is expanded, in a similar way to the workspace. The
129 value is the number of slots in the list. */
130
131 #define NAMED_GROUP_LIST_SIZE 20
132
133 /* The original PCRE required patterns to be zero-terminated, and it simplifies
134 the compiling code if it is guaranteed that there is a zero code unit at the
135 end of the pattern, because this means that tests for coding sequences such as
136 (*SKIP) or even just (?<= can check a sequence of code units without having to
137 keep checking for the end of the pattern. The new PCRE2 API allows zero code
138 units within patterns if a positive length is given, but in order to keep most
139 of the compiling code as it was, we copy such patterns and add a zero on the
140 end. This value determines the size of space on the stack that is used if the
141 pattern fits; if not, heap memory is used. */
142
143 #define COPIED_PATTERN_SIZE 1024
144
145 /* Maximum length value to check against when making sure that the variable
146 that holds the compiled pattern length does not overflow. We make it a bit less
147 than INT_MAX to allow for adding in group terminating bytes, so that we don't
148 have to check them every time. */
149
150 #define OFLOW_MAX (INT_MAX - 20)
151
/* Macro for setting individual bits in class bitmaps. It took some
experimenting to figure out how to stop gcc 5.3.0 from warning with
-Wconversion. This version gets a warning:

  #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1 << ((b)&7))

Let's hope the apparently less efficient version isn't actually so bad if the
compiler is clever with identical subexpressions.

Arguments:
  a    the bitmap (an array of, or pointer to, uint8_t)
  b    the number of the bit to set; bit b lives in byte b/8 at position b%8

Both arguments and the whole expansion are parenthesized so that expressions
such as "map + offset" can safely be passed as the bitmap. Each argument is
evaluated more than once, so neither may have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] = (uint8_t)((a)[(b)/8] | (1 << ((b)&7))))
162
163 /* Private flags added to firstcu and reqcu. */
164
165 #define REQ_CASELESS (1 << 0) /* Indicates caselessness */
166 #define REQ_VARY (1 << 1) /* reqcu followed non-literal item */
167 /* Negative values for the firstcu and reqcu flags */
168 #define REQ_UNSET (-2) /* Not yet found anything */
169 #define REQ_NONE (-1) /* Found not fixed char */
170
171 /* These flags are used in the groupinfo vector. */
172
173 #define GI_SET_COULD_BE_EMPTY 0x80000000u
174 #define GI_COULD_BE_EMPTY 0x40000000u
175 #define GI_NOT_FIXED_LENGTH 0x20000000u
176 #define GI_SET_FIXED_LENGTH 0x10000000u
177 #define GI_FIXED_LENGTH_MASK 0x0000ffffu
178
179 /* This bit (which is greater than any UTF value) is used to indicate that a
180 variable contains a number of code units instead of an actual code point. */
181
182 #define UTF_LENGTH 0x10000000l
183
184 /* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
185 and is fast (a good compiler can turn it into a subtraction and unsigned
186 comparison). */
187
188 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
189
/* Table to identify hex digits. The tables in chartables are dependent on the
locale, and may mark arbitrary characters as digits. We want to recognize only
0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
costs 256 bytes, but it is a lot faster than doing character value tests (at
least in some simple cases I timed), and in some applications one wants PCRE to
compile efficiently as well as match efficiently. The value in the table is
the binary hex digit value, or 0xff for non-hex digits. */
197
198 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
199 UTF-8 mode. */
200
201 #ifndef EBCDIC
/* One entry per character code 0-255 (32 rows of 8). An entry holds the value
of that character as a hex digit (0x00-0x0f), or 0xff if the character is not
a hex digit. The comment on each row gives the range of codes it covers. */

static const uint8_t xdigitab[] =
  {
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* space - ' */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* ( - / */
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /* 0 - 7 */
  0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /* 8 - ? */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* @ - G */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* H - O */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* P - W */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* X - _ */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* ` - g */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* h - o */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* p - w */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* x -127 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */
236
237 #else
238
239 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
240
241 static const uint8_t xdigitab[] =
242 {
243 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0- 7 0 */
244 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 8- 15 */
245 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 16- 23 10 */
246 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 24- 31 */
247 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 32- 39 20 */
248 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 40- 47 */
249 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 48- 55 30 */
250 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 56- 63 */
251 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - 71 40 */
252 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 72- | */
253 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* & - 87 50 */
254 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 88- 95 */
255 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - -103 60 */
256 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ? */
257 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */
258 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- " */
259 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g 80 */
260 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* h -143 */
261 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p 90 */
262 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* q -159 */
263 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x A0 */
264 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* y -175 */
265 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* ^ -183 B0 */
266 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
267 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* { - G C0 */
268 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* H -207 */
269 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* } - P D0 */
270 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* Q -223 */
271 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* \ - X E0 */
272 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* Y -239 */
273 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /* 0 - 7 F0 */
274 0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/* 8 -255 */
275 #endif /* EBCDIC */
276
277
278 /* Table for handling alphanumeric escaped characters. Positive returns are
279 simple data values; negative values are for special things like \d and so on.
280 Zero means further processing is needed (for things like \x), or the escape is
281 invalid. */
282
283 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
284 in UTF-8 mode. It runs from '0' to 'z'. */
285
286 #ifndef EBCDIC
287 #define ESCAPES_FIRST CHAR_0
288 #define ESCAPES_LAST CHAR_z
/* Converts an ASCII lower case letter code to the corresponding upper case
code (the two cases are 32 apart). The argument is parenthesized so that any
expression can be passed safely. */
#define UPPER_CASE(c) ((c)-32)
290
/* One entry for each character from '0' to 'z' in character-code order, two
entries per row. The comment on each row names the characters the row's
entries stand for. Meaning of the values is as described above: positive =
literal data value, negative = special escape, zero = needs further
processing or is invalid. */

static const short int escapes[] = {
     0,                       0,                          /* 0 1 */
     0,                       0,                          /* 2 3 */
     0,                       0,                          /* 4 5 */
     0,                       0,                          /* 6 7 */
     0,                       0,                          /* 8 9 */
     CHAR_COLON,              CHAR_SEMICOLON,             /* : ; */
     CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,           /* < = */
     CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,         /* > ? */
     CHAR_COMMERCIAL_AT,      -ESC_A,                     /* @ A */
     -ESC_B,                  -ESC_C,                     /* B C */
     -ESC_D,                  -ESC_E,                     /* D E */
     0,                       -ESC_G,                     /* F G */
     -ESC_H,                  0,                          /* H I */
     0,                       -ESC_K,                     /* J K */
     0,                       0,                          /* L M */
     -ESC_N,                  0,                          /* N O */
     -ESC_P,                  -ESC_Q,                     /* P Q */
     -ESC_R,                  -ESC_S,                     /* R S */
     0,                       0,                          /* T U */
     -ESC_V,                  -ESC_W,                     /* V W */
     -ESC_X,                  0,                          /* X Y */
     -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,   /* Z [ */
     CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,  /* backslash ] */
     CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,            /* ^ _ */
     CHAR_GRAVE_ACCENT,       ESC_a,                      /* ` a */
     -ESC_b,                  0,                          /* b c */
     -ESC_d,                  ESC_e,                      /* d e */
     ESC_f,                   0,                          /* f g */
     -ESC_h,                  0,                          /* h i */
     0,                       -ESC_k,                     /* j k */
     0,                       0,                          /* l m */
     ESC_n,                   0,                          /* n o */
     -ESC_p,                  0,                          /* p q */
     ESC_r,                   -ESC_s,                     /* r s */
     ESC_tee,                 0,                          /* t u */
     -ESC_v,                  -ESC_w,                     /* v w */
     0,                       0,                          /* x y */
     -ESC_z                                               /* z */
};
331
332 #else
333
334 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
335 It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
336 is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
337 because it is defined as 'a', which of course picks up the ASCII value. */
338
339 #if 'a' == 0x81 /* Check for a real EBCDIC environment */
340 #define ESCAPES_FIRST CHAR_a
341 #define ESCAPES_LAST CHAR_9
/* In real EBCDIC the upper case letters are 64 above the lower case ones
(0x81 'a' -> 0xC1 'A'). The argument is parenthesized so that any expression
can be passed safely. */
#define UPPER_CASE(c) ((c)+64)
343 #else /* Testing in an ASCII environment */
344 #define ESCAPES_FIRST ((unsigned char)'\x81') /* EBCDIC 'a' */
345 #define ESCAPES_LAST ((unsigned char)'\xf9') /* EBCDIC '9' */
/* When EBCDIC support is being tested on an ASCII host, the ASCII case offset
of 32 is used. The argument is parenthesized so that any expression can be
passed safely. */
#define UPPER_CASE(c) ((c)-32)
347 #endif
348
349 static const short int escapes[] = {
350 /* 80 */ ESC_a, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
351 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
352 /* 90 */ 0, 0, -ESC_k, 0, 0, ESC_n, 0, -ESC_p,
353 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
354 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
355 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
356 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
357 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
358 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
359 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
360 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
361 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
362 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
363 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
364 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
365 /* F8 */ 0, 0
366 };
367
368 /* We also need a table of characters that may follow \c in an EBCDIC
369 environment for characters 0-31. */
370
371 static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
372
373 #endif /* EBCDIC */
374
375
376 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
377 searched linearly. Put all the names into a single string, in order to reduce
378 the number of relocations when a shared library is dynamically linked. The
379 string is built from string macros so that it works in UTF-8 mode on EBCDIC
380 platforms. */
381
382 typedef struct verbitem {
383 int len; /* Length of verb name */
384 int op; /* Op when no arg, or -1 if arg mandatory */
385 int op_arg; /* Op when arg present, or -1 if not allowed */
386 } verbitem;
387
388 static const char verbnames[] =
389 "\0" /* Empty name is a shorthand for MARK */
390 STRING_MARK0
391 STRING_ACCEPT0
392 STRING_COMMIT0
393 STRING_F0
394 STRING_FAIL0
395 STRING_PRUNE0
396 STRING_SKIP0
397 STRING_THEN;
398
/* One entry per name in verbnames, in the same order. Each entry is
{ name length, opcode without argument (-1 = argument mandatory), opcode with
argument (-1 = argument not allowed) }. */

static const verbitem verbs[] = {
  { 0, -1,        OP_MARK },        /* (empty name, shorthand for MARK) */
  { 4, -1,        OP_MARK },        /* MARK */
  { 6, OP_ACCEPT, -1 },             /* ACCEPT */
  { 6, OP_COMMIT, -1 },             /* COMMIT */
  { 1, OP_FAIL,   -1 },             /* F */
  { 4, OP_FAIL,   -1 },             /* FAIL */
  { 5, OP_PRUNE,  OP_PRUNE_ARG },   /* PRUNE */
  { 4, OP_SKIP,   OP_SKIP_ARG  },   /* SKIP */
  { 4, OP_THEN,   OP_THEN_ARG  }    /* THEN */
};
410
411 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
412
413
414 /* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
415 another regex library. */
416
417 static const PCRE2_UCHAR sub_start_of_word[] = {
418 CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
419 CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
420
421 static const PCRE2_UCHAR sub_end_of_word[] = {
422 CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
423 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
424 CHAR_RIGHT_PARENTHESIS, '\0' };
425
426
427 /* Tables of names of POSIX character classes and their lengths. The names are
428 now all in a single string, to reduce the number of relocations when a shared
429 library is dynamically loaded. The list of lengths is terminated by a zero
430 length entry. The first three must be alpha, lower, upper, as this is assumed
431 for handling case independence. The indices for graph, print, and punct are
432 needed, so identify them. */
433
434 static const char posix_names[] =
435 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
436 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
437 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
438 STRING_word0 STRING_xdigit;
439
/* Lengths of the names in posix_names, in the same order, followed by the
zero terminator. */

static const uint8_t posix_name_lengths[] = {
  5, 5, 5, 5,   /* alpha, lower, upper, alnum */
  5, 5, 5, 5,   /* ascii, blank, cntrl, digit */
  5, 5, 5, 5,   /* graph, print, punct, space */
  4, 6,         /* word, xdigit */
  0 };          /* end-of-list marker */
442
443 #define PC_GRAPH 8
444 #define PC_PRINT 9
445 #define PC_PUNCT 10
446
447
448 /* Table of class bit maps for each POSIX class. Each class is formed from a
449 base map, with an optional addition or removal of another map. Then, for some
450 classes, there is some additional tweaking: for [:blank:] the vertical space
451 characters are removed, and for [:alpha:] and [:alnum:] the underscore
452 character is removed. The triples in the table consist of the base map offset,
453 second map offset or -1 if no second map, and a non-negative value for map
454 addition or a negative value for map subtraction (if there are two maps). The
455 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
456 remove vertical space characters, 2 => remove underscore. */
457
458 static const int posix_class_maps[] = {
459 cbit_word, cbit_digit, -2, /* alpha */
460 cbit_lower, -1, 0, /* lower */
461 cbit_upper, -1, 0, /* upper */
462 cbit_word, -1, 2, /* alnum - word without underscore */
463 cbit_print, cbit_cntrl, 0, /* ascii */
464 cbit_space, -1, 1, /* blank - a GNU extension */
465 cbit_cntrl, -1, 0, /* cntrl */
466 cbit_digit, -1, 0, /* digit */
467 cbit_graph, -1, 0, /* graph */
468 cbit_print, -1, 0, /* print */
469 cbit_punct, -1, 0, /* punct */
470 cbit_space, -1, 0, /* space */
471 cbit_word, -1, 0, /* word - a Perl extension */
472 cbit_xdigit,-1, 0 /* xdigit */
473 };
474
475 /* Table of substitutes for \d etc when PCRE2_UCP is set. They are replaced by
476 Unicode property escapes. */
477
478 #ifdef SUPPORT_UNICODE
479 static const PCRE2_UCHAR string_PNd[] = {
480 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
481 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
482 static const PCRE2_UCHAR string_pNd[] = {
483 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
484 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
485 static const PCRE2_UCHAR string_PXsp[] = {
486 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
487 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
488 static const PCRE2_UCHAR string_pXsp[] = {
489 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
490 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
491 static const PCRE2_UCHAR string_PXwd[] = {
492 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
493 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
494 static const PCRE2_UCHAR string_pXwd[] = {
495 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
496 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
497
498 static PCRE2_SPTR substitutes[] = {
499 string_PNd, /* \D */
500 string_pNd, /* \d */
501 string_PXsp, /* \S */ /* Xsp is Perl space, but from 8.34, Perl */
502 string_pXsp, /* \s */ /* space and POSIX space are the same. */
503 string_PXwd, /* \W */
504 string_pXwd /* \w */
505 };
506
507 /* The POSIX class substitutes must be in the order of the POSIX class names,
508 defined above, and there are both positive and negative cases. NULL means no
509 general substitute of a Unicode property escape (\p or \P). However, for some
510 POSIX classes (e.g. graph, print, punct) a special property code is compiled
511 directly. */
512
513 static const PCRE2_UCHAR string_pCc[] = {
514 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
515 CHAR_C, CHAR_c, CHAR_RIGHT_CURLY_BRACKET, '\0' };
516 static const PCRE2_UCHAR string_pL[] = {
517 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
518 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
519 static const PCRE2_UCHAR string_pLl[] = {
520 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
521 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
522 static const PCRE2_UCHAR string_pLu[] = {
523 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
524 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
525 static const PCRE2_UCHAR string_pXan[] = {
526 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
527 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
528 static const PCRE2_UCHAR string_h[] = {
529 CHAR_BACKSLASH, CHAR_h, '\0' };
530 static const PCRE2_UCHAR string_pXps[] = {
531 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
532 CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
533 static const PCRE2_UCHAR string_PCc[] = {
534 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
535 CHAR_C, CHAR_c, CHAR_RIGHT_CURLY_BRACKET, '\0' };
536 static const PCRE2_UCHAR string_PL[] = {
537 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
538 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
539 static const PCRE2_UCHAR string_PLl[] = {
540 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
541 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
542 static const PCRE2_UCHAR string_PLu[] = {
543 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
544 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
545 static const PCRE2_UCHAR string_PXan[] = {
546 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
547 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
548 static const PCRE2_UCHAR string_H[] = {
549 CHAR_BACKSLASH, CHAR_H, '\0' };
550 static const PCRE2_UCHAR string_PXps[] = {
551 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
552 CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
553
554 static PCRE2_SPTR posix_substitutes[] = {
555 string_pL, /* alpha */
556 string_pLl, /* lower */
557 string_pLu, /* upper */
558 string_pXan, /* alnum */
559 NULL, /* ascii */
560 string_h, /* blank */
561 string_pCc, /* cntrl */
562 string_pNd, /* digit */
563 NULL, /* graph */
564 NULL, /* print */
565 NULL, /* punct */
566 string_pXps, /* space */ /* Xps is POSIX space, but from 8.34 */
567 string_pXwd, /* word */ /* Perl and POSIX space are the same */
568 NULL, /* xdigit */
569 /* Negated cases */
570 string_PL, /* ^alpha */
571 string_PLl, /* ^lower */
572 string_PLu, /* ^upper */
573 string_PXan, /* ^alnum */
574 NULL, /* ^ascii */
575 string_H, /* ^blank */
576 string_PCc, /* ^cntrl */
577 string_PNd, /* ^digit */
578 NULL, /* ^graph */
579 NULL, /* ^print */
580 NULL, /* ^punct */
581 string_PXps, /* ^space */ /* Xps is POSIX space, but from 8.34 */
582 string_PXwd, /* ^word */ /* Perl and POSIX space are the same */
583 NULL /* ^xdigit */
584 };
585 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(PCRE2_UCHAR *))
586 #endif /* SUPPORT_UNICODE */
587
588 /* Masks for checking option settings. */
589
590 #define PUBLIC_COMPILE_OPTIONS \
591 (PCRE2_ANCHORED|PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \
592 PCRE2_ALT_VERBNAMES|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_DOLLAR_ENDONLY| \
593 PCRE2_DOTALL|PCRE2_DUPNAMES|PCRE2_EXTENDED|PCRE2_FIRSTLINE| \
594 PCRE2_MATCH_UNSET_BACKREF|PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C| \
595 PCRE2_NEVER_UCP|PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE| \
596 PCRE2_NO_AUTO_POSSESS|PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_NO_START_OPTIMIZE| \
597 PCRE2_NO_UTF_CHECK|PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_USE_OFFSET_LIMIT| \
598 PCRE2_UTF)
599
600 /* Compile time error code numbers. They are given names so that they can more
601 easily be tracked. When a new number is added, the tables called eint1 and
602 eint2 in pcre2posix.c may need to be updated, and a new error text must be
603 added to compile_error_texts in pcre2_error.c. */
604
605 enum { ERR0 = COMPILE_ERROR_BASE,
606 ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10,
607 ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
608 ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
609 ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
610 ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
611 ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
612 ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
613 ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
614 ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88 };
615
616 /* Error codes that correspond to negative error codes returned by
617 find_fixedlength(). */
618
619 static int fixed_length_errors[] =
620 {
621 ERR0, /* Not an error */
622 ERR0, /* Not an error; -1 is used for "process later" */
623 ERR25, /* Lookbehind is not fixed length */
624 ERR36, /* \C in lookbehind is not allowed */
625 ERR87, /* Lookbehind is too long */
626 ERR86, /* Pattern too complicated */
627 ERR70 /* Internal error: unknown opcode encountered */
628 };
629
630 /* This is a table of start-of-pattern options such as (*UTF) and settings such
631 as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
632 compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
633 generic and always supported. */
634
635 enum { PSO_OPT, /* Value is an option bit */
636 PSO_FLG, /* Value is a flag bit */
637 PSO_NL, /* Value is a newline type */
638 PSO_BSR, /* Value is a \R type */
639 PSO_LIMM, /* Read integer value for match limit */
640 PSO_LIMR }; /* Read integer value for recursion limit */
641
/* Describes one start-of-pattern item; see pso_list below for the values that
are actually used. */

typedef struct pso {
  const uint8_t *name;   /* Text of the item, e.g. STRING_UCP_RIGHTPAR */
  uint16_t length;       /* Length of name */
  uint16_t type;         /* One of the PSO_xxx values above */
  uint32_t value;        /* Option bit, flag bit, newline type or \R type,
                            according to type; 0 for the limit items */
} pso;
648
649 /* NB: STRING_UTFn_RIGHTPAR contains the length as well */
650
651 static pso pso_list[] = {
652 { (uint8_t *)STRING_UTFn_RIGHTPAR, PSO_OPT, PCRE2_UTF },
653 { (uint8_t *)STRING_UTF_RIGHTPAR, 4, PSO_OPT, PCRE2_UTF },
654 { (uint8_t *)STRING_UCP_RIGHTPAR, 4, PSO_OPT, PCRE2_UCP },
655 { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR, 9, PSO_FLG, PCRE2_NOTEMPTY_SET },
656 { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR, 17, PSO_FLG, PCRE2_NE_ATST_SET },
657 { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR, 16, PSO_OPT, PCRE2_NO_AUTO_POSSESS },
658 { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
659 { (uint8_t *)STRING_NO_JIT_RIGHTPAR, 7, PSO_FLG, PCRE2_NOJIT },
660 { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR, 13, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
661 { (uint8_t *)STRING_LIMIT_MATCH_EQ, 12, PSO_LIMM, 0 },
662 { (uint8_t *)STRING_LIMIT_RECURSION_EQ, 16, PSO_LIMR, 0 },
663 { (uint8_t *)STRING_CR_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_CR },
664 { (uint8_t *)STRING_LF_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_LF },
665 { (uint8_t *)STRING_CRLF_RIGHTPAR, 5, PSO_NL, PCRE2_NEWLINE_CRLF },
666 { (uint8_t *)STRING_ANY_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_ANY },
667 { (uint8_t *)STRING_ANYCRLF_RIGHTPAR, 8, PSO_NL, PCRE2_NEWLINE_ANYCRLF },
668 { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_ANYCRLF },
669 { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_UNICODE }
670 };
671
672 /* This table is used when converting repeating opcodes into possessified
673 versions as a result of an explicit possessive quantifier such as ++. A zero
674 value means there is no possessified version - in those cases the item in
675 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
676 because all relevant opcodes are less than that. */
677
678 static const uint8_t opcode_possessify[] = {
679 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */
680 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */
681
682 0, /* NOTI */
683 OP_POSSTAR, 0, /* STAR, MINSTAR */
684 OP_POSPLUS, 0, /* PLUS, MINPLUS */
685 OP_POSQUERY, 0, /* QUERY, MINQUERY */
686 OP_POSUPTO, 0, /* UPTO, MINUPTO */
687 0, /* EXACT */
688 0, 0, 0, 0, /* POS{STAR,PLUS,QUERY,UPTO} */
689
690 OP_POSSTARI, 0, /* STARI, MINSTARI */
691 OP_POSPLUSI, 0, /* PLUSI, MINPLUSI */
692 OP_POSQUERYI, 0, /* QUERYI, MINQUERYI */
693 OP_POSUPTOI, 0, /* UPTOI, MINUPTOI */
694 0, /* EXACTI */
695 0, 0, 0, 0, /* POS{STARI,PLUSI,QUERYI,UPTOI} */
696
697 OP_NOTPOSSTAR, 0, /* NOTSTAR, NOTMINSTAR */
698 OP_NOTPOSPLUS, 0, /* NOTPLUS, NOTMINPLUS */
699 OP_NOTPOSQUERY, 0, /* NOTQUERY, NOTMINQUERY */
700 OP_NOTPOSUPTO, 0, /* NOTUPTO, NOTMINUPTO */
701 0, /* NOTEXACT */
702 0, 0, 0, 0, /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
703
704 OP_NOTPOSSTARI, 0, /* NOTSTARI, NOTMINSTARI */
705 OP_NOTPOSPLUSI, 0, /* NOTPLUSI, NOTMINPLUSI */
706 OP_NOTPOSQUERYI, 0, /* NOTQUERYI, NOTMINQUERYI */
707 OP_NOTPOSUPTOI, 0, /* NOTUPTOI, NOTMINUPTOI */
708 0, /* NOTEXACTI */
709 0, 0, 0, 0, /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
710
711 OP_TYPEPOSSTAR, 0, /* TYPESTAR, TYPEMINSTAR */
712 OP_TYPEPOSPLUS, 0, /* TYPEPLUS, TYPEMINPLUS */
713 OP_TYPEPOSQUERY, 0, /* TYPEQUERY, TYPEMINQUERY */
714 OP_TYPEPOSUPTO, 0, /* TYPEUPTO, TYPEMINUPTO */
715 0, /* TYPEEXACT */
716 0, 0, 0, 0, /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
717
718 OP_CRPOSSTAR, 0, /* CRSTAR, CRMINSTAR */
719 OP_CRPOSPLUS, 0, /* CRPLUS, CRMINPLUS */
720 OP_CRPOSQUERY, 0, /* CRQUERY, CRMINQUERY */
721 OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */
722 0, 0, 0, 0, /* CRPOS{STAR,PLUS,QUERY,RANGE} */
723
724 0, 0, 0, /* CLASS, NCLASS, XCLASS */
725 0, 0, /* REF, REFI */
726 0, 0, /* DNREF, DNREFI */
727 0, 0 /* RECURSE, CALLOUT */
728 };
729
730
731
732 /*************************************************
733 * Copy compiled code *
734 *************************************************/
735
736 /* Compiled JIT code cannot be copied, so the new compiled block has no
737 associated JIT data. */
738
739 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy(const pcre2_code * code)740 pcre2_code_copy(const pcre2_code *code)
741 {
742 PCRE2_SIZE* ref_count;
743 pcre2_code *newcode;
744
745 if (code == NULL) return NULL;
746 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
747 if (newcode == NULL) return NULL;
748 memcpy(newcode, code, code->blocksize);
749 newcode->executable_jit = NULL;
750
751 /* If the code is one that has been deserialized, increment the reference count
752 in the decoded tables. */
753
754 if ((code->flags & PCRE2_DEREF_TABLES) != 0)
755 {
756 ref_count = (PCRE2_SIZE *)(code->tables + tables_length);
757 (*ref_count)++;
758 }
759
760 return newcode;
761 }
762
763
764
765 /*************************************************
766 * Free compiled code *
767 *************************************************/
768
769 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_code_free(pcre2_code * code)770 pcre2_code_free(pcre2_code *code)
771 {
772 PCRE2_SIZE* ref_count;
773
774 if (code != NULL)
775 {
776 if (code->executable_jit != NULL)
777 PRIV(jit_free)(code->executable_jit, &code->memctl);
778
779 if ((code->flags & PCRE2_DEREF_TABLES) != 0)
780 {
781 /* Decoded tables belong to the codes after deserialization, and they must
782 be freed when there are no more reference to them. The *ref_count should
783 always be > 0. */
784
785 ref_count = (PCRE2_SIZE *)(code->tables + tables_length);
786 if (*ref_count > 0)
787 {
788 (*ref_count)--;
789 if (*ref_count == 0)
790 code->memctl.free((void *)code->tables, code->memctl.memory_data);
791 }
792 }
793
794 code->memctl.free(code, code->memctl.memory_data);
795 }
796 }
797
798
799
800 /*************************************************
801 * Insert an automatic callout point *
802 *************************************************/
803
804 /* This function is called when the PCRE2_AUTO_CALLOUT option is set, to insert
805 callout points before each pattern item.
806
807 Arguments:
808 code current code pointer
809 ptr current pattern pointer
810 cb general compile-time data
811
812 Returns: new code pointer
813 */
814
815 static PCRE2_UCHAR *
auto_callout(PCRE2_UCHAR * code,PCRE2_SPTR ptr,compile_block * cb)816 auto_callout(PCRE2_UCHAR *code, PCRE2_SPTR ptr, compile_block *cb)
817 {
818 code[0] = OP_CALLOUT;
819 PUT(code, 1, ptr - cb->start_pattern); /* Pattern offset */
820 PUT(code, 1 + LINK_SIZE, 0); /* Default length */
821 code[1 + 2*LINK_SIZE] = 255;
822 return code + PRIV(OP_lengths)[OP_CALLOUT];
823 }
824
825
826
827 /*************************************************
828 * Complete a callout item *
829 *************************************************/
830
831 /* A callout item contains the length of the next item in the pattern, which
832 we can't fill in till after we have reached the relevant point. This is used
833 for both automatic and manual callouts.
834
835 Arguments:
836 previous_callout points to previous callout item
837 ptr current pattern pointer
838 cb general compile-time data
839
840 Returns: nothing
841 */
842
843 static void
complete_callout(PCRE2_UCHAR * previous_callout,PCRE2_SPTR ptr,compile_block * cb)844 complete_callout(PCRE2_UCHAR *previous_callout, PCRE2_SPTR ptr,
845 compile_block *cb)
846 {
847 size_t length = (size_t)(ptr - cb->start_pattern - GET(previous_callout, 1));
848 PUT(previous_callout, 1 + LINK_SIZE, length);
849 }
850
851
852
853 /*************************************************
854 * Find the fixed length of a branch *
855 *************************************************/
856
857 /* Scan a branch and compute the fixed length of subject that will match it, if
858 the length is fixed. This is needed for dealing with lookbehind assertions. In
859 UTF mode, the result is in code units rather than bytes. The branch is
860 temporarily terminated with OP_END when this function is called.
861
862 This function is called when a lookbehind assertion is encountered, so that if
863 it fails, the error message can point to the correct place in the pattern.
864 However, we cannot do this when the assertion contains subroutine calls,
865 because they can be forward references. We solve this by remembering this case
866 and doing the check at the end; a flag specifies which mode we are running in.
867
868 Lookbehind lengths are held in 16-bit fields and the maximum value is defined
869 as LOOKBEHIND_MAX.
870
871 Arguments:
872 code points to the start of the pattern (the bracket)
873 utf TRUE in UTF mode
874 atend TRUE if called when the pattern is complete
875 cb the "compile data" structure
876 recurses chain of recurse_check to catch mutual recursion
877 countptr pointer to counter, to catch over-complexity
878
879 Returns: if non-negative, the fixed length,
880 or -1 if an OP_RECURSE item was encountered and atend is FALSE
881 or -2 if there is no fixed length,
882 or -3 if \C was encountered (in UTF mode only)
883 or -4 if length is too long
884 or -5 if regex is too complicated
885 or -6 if an unknown opcode was encountered (internal error)
886 */
887
888 #define FFL_LATER (-1)
889 #define FFL_NOTFIXED (-2)
890 #define FFL_BACKSLASHC (-3)
891 #define FFL_TOOLONG (-4)
892 #define FFL_TOOCOMPLICATED (-5)
893 #define FFL_UNKNOWNOP (-6)
894
895 static int
find_fixedlength(PCRE2_UCHAR * code,BOOL utf,BOOL atend,compile_block * cb,recurse_check * recurses,int * countptr)896 find_fixedlength(PCRE2_UCHAR *code, BOOL utf, BOOL atend, compile_block *cb,
897 recurse_check *recurses, int *countptr)
898 {
899 uint32_t length = 0xffffffffu; /* Unset */
900 uint32_t group = 0;
901 uint32_t groupinfo = 0;
902 recurse_check this_recurse;
903 register uint32_t branchlength = 0;
904 register PCRE2_UCHAR *cc = code + 1 + LINK_SIZE;
905
906 /* If this is a capturing group, we may have the answer cached, but we can only
907 use this information if there are no (?| groups in the pattern, because
908 otherwise group numbers are not unique. */
909
910 if (*code == OP_CBRA || *code == OP_CBRAPOS || *code == OP_SCBRA ||
911 *code == OP_SCBRAPOS)
912 {
913 group = GET2(cc, 0);
914 cc += IMM2_SIZE;
915 groupinfo = cb->groupinfo[group];
916 if ((cb->external_flags & PCRE2_DUPCAPUSED) == 0)
917 {
918 if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return FFL_NOTFIXED;
919 if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
920 return groupinfo & GI_FIXED_LENGTH_MASK;
921 }
922 }
923
924 /* A large and/or complex regex can take too long to process. This can happen
925 more often when (?| groups are present in the pattern. */
926
927 if ((*countptr)++ > 2000) return FFL_TOOCOMPLICATED;
928
929 /* Scan along the opcodes for this branch. If we get to the end of the
930 branch, check the length against that of the other branches. */
931
932 for (;;)
933 {
934 int d;
935 PCRE2_UCHAR *ce, *cs;
936 register PCRE2_UCHAR op = *cc;
937
938 if (branchlength > LOOKBEHIND_MAX) return FFL_TOOLONG;
939
940 switch (op)
941 {
942 /* We only need to continue for OP_CBRA (normal capturing bracket) and
943 OP_BRA (normal non-capturing bracket) because the other variants of these
944 opcodes are all concerned with unlimited repeated groups, which of course
945 are not of fixed length. */
946
947 case OP_CBRA:
948 case OP_BRA:
949 case OP_ONCE:
950 case OP_ONCE_NC:
951 case OP_COND:
952 d = find_fixedlength(cc, utf, atend, cb, recurses, countptr);
953 if (d < 0) return d;
954 branchlength += (uint32_t)d;
955 do cc += GET(cc, 1); while (*cc == OP_ALT);
956 cc += 1 + LINK_SIZE;
957 break;
958
959 /* Reached end of a branch; if it's a ket it is the end of a nested call.
960 If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
961 an ALT. If it is END it's the end of the outer call. All can be handled by
962 the same code. Note that we must not include the OP_KETRxxx opcodes here,
963 because they all imply an unlimited repeat. */
964
965 case OP_ALT:
966 case OP_KET:
967 case OP_END:
968 case OP_ACCEPT:
969 case OP_ASSERT_ACCEPT:
970 if (length == 0xffffffffu) length = branchlength;
971 else if (length != branchlength) goto ISNOTFIXED;
972 if (*cc != OP_ALT)
973 {
974 if (group > 0)
975 {
976 groupinfo |= (uint32_t)(GI_SET_FIXED_LENGTH | length);
977 cb->groupinfo[group] = groupinfo;
978 }
979 return (int)length;
980 }
981 cc += 1 + LINK_SIZE;
982 branchlength = 0;
983 break;
984
985 /* A true recursion implies not fixed length, but a subroutine call may
986 be OK. If the subroutine is a forward reference, we can't deal with
987 it until the end of the pattern, so return FFL_LATER. */
988
989 case OP_RECURSE:
990 if (!atend) return FFL_LATER;
991 cs = ce = (PCRE2_UCHAR *)cb->start_code + GET(cc, 1); /* Start subpattern */
992 do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
993 if (cc > cs && cc < ce) goto ISNOTFIXED; /* Recursion */
994 else /* Check for mutual recursion */
995 {
996 recurse_check *r = recurses;
997 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
998 if (r != NULL) goto ISNOTFIXED; /* Mutual recursion */
999 }
1000 this_recurse.prev = recurses;
1001 this_recurse.group = cs;
1002 d = find_fixedlength(cs, utf, atend, cb, &this_recurse, countptr);
1003 if (d < 0) return d;
1004 branchlength += (uint32_t)d;
1005 cc += 1 + LINK_SIZE;
1006 break;
1007
1008 /* Skip over assertive subpatterns. Note that we must increment cc by
1009 1 + LINK_SIZE at the end, not by OP_length[*cc] because in a recursive
1010 situation this assertion may be the one that is ultimately being checked
1011 for having a fixed length, in which case its terminating OP_KET will have
1012 been temporarily replaced by OP_END. */
1013
1014 case OP_ASSERT:
1015 case OP_ASSERT_NOT:
1016 case OP_ASSERTBACK:
1017 case OP_ASSERTBACK_NOT:
1018 do cc += GET(cc, 1); while (*cc == OP_ALT);
1019 cc += 1 + LINK_SIZE;
1020 break;
1021
1022 /* Skip over things that don't match chars */
1023
1024 case OP_MARK:
1025 case OP_PRUNE_ARG:
1026 case OP_SKIP_ARG:
1027 case OP_THEN_ARG:
1028 cc += cc[1] + PRIV(OP_lengths)[*cc];
1029 break;
1030
1031 case OP_CALLOUT:
1032 case OP_CIRC:
1033 case OP_CIRCM:
1034 case OP_CLOSE:
1035 case OP_COMMIT:
1036 case OP_CREF:
1037 case OP_FALSE:
1038 case OP_TRUE:
1039 case OP_DNCREF:
1040 case OP_DNRREF:
1041 case OP_DOLL:
1042 case OP_DOLLM:
1043 case OP_EOD:
1044 case OP_EODN:
1045 case OP_FAIL:
1046 case OP_NOT_WORD_BOUNDARY:
1047 case OP_PRUNE:
1048 case OP_REVERSE:
1049 case OP_RREF:
1050 case OP_SET_SOM:
1051 case OP_SKIP:
1052 case OP_SOD:
1053 case OP_SOM:
1054 case OP_THEN:
1055 case OP_WORD_BOUNDARY:
1056 cc += PRIV(OP_lengths)[*cc];
1057 break;
1058
1059 case OP_CALLOUT_STR:
1060 cc += GET(cc, 1 + 2*LINK_SIZE);
1061 break;
1062
1063 /* Handle literal characters */
1064
1065 case OP_CHAR:
1066 case OP_CHARI:
1067 case OP_NOT:
1068 case OP_NOTI:
1069 branchlength++;
1070 cc += 2;
1071 #ifdef SUPPORT_UNICODE
1072 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1073 #endif
1074 break;
1075
1076 /* Handle exact repetitions. The count is already in characters, but we
1077 need to skip over a multibyte character in UTF8 mode. */
1078
1079 case OP_EXACT:
1080 case OP_EXACTI:
1081 case OP_NOTEXACT:
1082 case OP_NOTEXACTI:
1083 branchlength += GET2(cc,1);
1084 cc += 2 + IMM2_SIZE;
1085 #ifdef SUPPORT_UNICODE
1086 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1087 #endif
1088 break;
1089
1090 case OP_TYPEEXACT:
1091 branchlength += GET2(cc,1);
1092 if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1093 cc += 2;
1094 cc += 1 + IMM2_SIZE + 1;
1095 break;
1096
1097 /* Handle single-char matchers */
1098
1099 case OP_PROP:
1100 case OP_NOTPROP:
1101 cc += 2;
1102 /* Fall through */
1103
1104 case OP_HSPACE:
1105 case OP_VSPACE:
1106 case OP_NOT_HSPACE:
1107 case OP_NOT_VSPACE:
1108 case OP_NOT_DIGIT:
1109 case OP_DIGIT:
1110 case OP_NOT_WHITESPACE:
1111 case OP_WHITESPACE:
1112 case OP_NOT_WORDCHAR:
1113 case OP_WORDCHAR:
1114 case OP_ANY:
1115 case OP_ALLANY:
1116 branchlength++;
1117 cc++;
1118 break;
1119
1120 /* The single-byte matcher isn't allowed. This only happens in UTF-8 or
1121 UTF-16 mode; otherwise \C is coded as OP_ALLANY. */
1122
1123 case OP_ANYBYTE:
1124 return FFL_BACKSLASHC;
1125
1126 /* Check a class for variable quantification */
1127
1128 case OP_CLASS:
1129 case OP_NCLASS:
1130 #ifdef SUPPORT_WIDE_CHARS
1131 case OP_XCLASS:
1132 /* The original code caused an unsigned overflow in 64 bit systems,
1133 so now we use a conditional statement. */
1134 if (op == OP_XCLASS)
1135 cc += GET(cc, 1);
1136 else
1137 cc += PRIV(OP_lengths)[OP_CLASS];
1138 #else
1139 cc += PRIV(OP_lengths)[OP_CLASS];
1140 #endif
1141
1142 switch (*cc)
1143 {
1144 case OP_CRSTAR:
1145 case OP_CRMINSTAR:
1146 case OP_CRPLUS:
1147 case OP_CRMINPLUS:
1148 case OP_CRQUERY:
1149 case OP_CRMINQUERY:
1150 case OP_CRPOSSTAR:
1151 case OP_CRPOSPLUS:
1152 case OP_CRPOSQUERY:
1153 goto ISNOTFIXED;
1154
1155 case OP_CRRANGE:
1156 case OP_CRMINRANGE:
1157 case OP_CRPOSRANGE:
1158 if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) goto ISNOTFIXED;
1159 branchlength += GET2(cc,1);
1160 cc += 1 + 2 * IMM2_SIZE;
1161 break;
1162
1163 default:
1164 branchlength++;
1165 }
1166 break;
1167
1168 /* Anything else is variable length */
1169
1170 case OP_ANYNL:
1171 case OP_BRAMINZERO:
1172 case OP_BRAPOS:
1173 case OP_BRAPOSZERO:
1174 case OP_BRAZERO:
1175 case OP_CBRAPOS:
1176 case OP_EXTUNI:
1177 case OP_KETRMAX:
1178 case OP_KETRMIN:
1179 case OP_KETRPOS:
1180 case OP_MINPLUS:
1181 case OP_MINPLUSI:
1182 case OP_MINQUERY:
1183 case OP_MINQUERYI:
1184 case OP_MINSTAR:
1185 case OP_MINSTARI:
1186 case OP_MINUPTO:
1187 case OP_MINUPTOI:
1188 case OP_NOTMINPLUS:
1189 case OP_NOTMINPLUSI:
1190 case OP_NOTMINQUERY:
1191 case OP_NOTMINQUERYI:
1192 case OP_NOTMINSTAR:
1193 case OP_NOTMINSTARI:
1194 case OP_NOTMINUPTO:
1195 case OP_NOTMINUPTOI:
1196 case OP_NOTPLUS:
1197 case OP_NOTPLUSI:
1198 case OP_NOTPOSPLUS:
1199 case OP_NOTPOSPLUSI:
1200 case OP_NOTPOSQUERY:
1201 case OP_NOTPOSQUERYI:
1202 case OP_NOTPOSSTAR:
1203 case OP_NOTPOSSTARI:
1204 case OP_NOTPOSUPTO:
1205 case OP_NOTPOSUPTOI:
1206 case OP_NOTQUERY:
1207 case OP_NOTQUERYI:
1208 case OP_NOTSTAR:
1209 case OP_NOTSTARI:
1210 case OP_NOTUPTO:
1211 case OP_NOTUPTOI:
1212 case OP_PLUS:
1213 case OP_PLUSI:
1214 case OP_POSPLUS:
1215 case OP_POSPLUSI:
1216 case OP_POSQUERY:
1217 case OP_POSQUERYI:
1218 case OP_POSSTAR:
1219 case OP_POSSTARI:
1220 case OP_POSUPTO:
1221 case OP_POSUPTOI:
1222 case OP_QUERY:
1223 case OP_QUERYI:
1224 case OP_REF:
1225 case OP_REFI:
1226 case OP_DNREF:
1227 case OP_DNREFI:
1228 case OP_SBRA:
1229 case OP_SBRAPOS:
1230 case OP_SCBRA:
1231 case OP_SCBRAPOS:
1232 case OP_SCOND:
1233 case OP_SKIPZERO:
1234 case OP_STAR:
1235 case OP_STARI:
1236 case OP_TYPEMINPLUS:
1237 case OP_TYPEMINQUERY:
1238 case OP_TYPEMINSTAR:
1239 case OP_TYPEMINUPTO:
1240 case OP_TYPEPLUS:
1241 case OP_TYPEPOSPLUS:
1242 case OP_TYPEPOSQUERY:
1243 case OP_TYPEPOSSTAR:
1244 case OP_TYPEPOSUPTO:
1245 case OP_TYPEQUERY:
1246 case OP_TYPESTAR:
1247 case OP_TYPEUPTO:
1248 case OP_UPTO:
1249 case OP_UPTOI:
1250 goto ISNOTFIXED;
1251
1252 /* Catch unrecognized opcodes so that when new ones are added they
1253 are not forgotten, as has happened in the past. */
1254
1255 default:
1256 return FFL_UNKNOWNOP;
1257 }
1258 }
1259 /* Control never gets here except by goto. */
1260
1261 ISNOTFIXED:
1262 if (group > 0)
1263 {
1264 groupinfo |= GI_NOT_FIXED_LENGTH;
1265 cb->groupinfo[group] = groupinfo;
1266 }
1267 return FFL_NOTFIXED;
1268 }
1269
1270
1271
1272 /*************************************************
1273 * Find first significant op code *
1274 *************************************************/
1275
1276 /* This is called by several functions that scan a compiled expression looking
1277 for a fixed first character, or an anchoring op code etc. It skips over things
1278 that do not influence this. For some calls, it makes sense to skip negative
1279 forward and all backward assertions, and also the \b assertion; for others it
1280 does not.
1281
1282 Arguments:
1283 code pointer to the start of the group
1284 skipassert TRUE if certain assertions are to be skipped
1285
1286 Returns: pointer to the first significant opcode
1287 */
1288
1289 static const PCRE2_UCHAR*
first_significant_code(PCRE2_SPTR code,BOOL skipassert)1290 first_significant_code(PCRE2_SPTR code, BOOL skipassert)
1291 {
1292 for (;;)
1293 {
1294 switch ((int)*code)
1295 {
1296 case OP_ASSERT_NOT:
1297 case OP_ASSERTBACK:
1298 case OP_ASSERTBACK_NOT:
1299 if (!skipassert) return code;
1300 do code += GET(code, 1); while (*code == OP_ALT);
1301 code += PRIV(OP_lengths)[*code];
1302 break;
1303
1304 case OP_WORD_BOUNDARY:
1305 case OP_NOT_WORD_BOUNDARY:
1306 if (!skipassert) return code;
1307 /* Fall through */
1308
1309 case OP_CALLOUT:
1310 case OP_CREF:
1311 case OP_DNCREF:
1312 case OP_RREF:
1313 case OP_DNRREF:
1314 case OP_FALSE:
1315 case OP_TRUE:
1316 code += PRIV(OP_lengths)[*code];
1317 break;
1318
1319 case OP_CALLOUT_STR:
1320 code += GET(code, 1 + 2*LINK_SIZE);
1321 break;
1322
1323 default:
1324 return code;
1325 }
1326 }
1327 /* Control never reaches here */
1328 }
1329
1330
1331
1332 /*************************************************
1333 * Scan compiled branch for non-emptiness *
1334 *************************************************/
1335
1336 /* This function scans through a branch of a compiled pattern to see whether it
1337 can match the empty string. It is called at the end of compiling to check the
1338 entire pattern, and from compile_branch() when checking for an unlimited repeat
1339 of a group that can match nothing. In the latter case it is called only when
1340 doing the real compile, not during the pre-compile that measures the size of
1341 the compiled pattern.
1342
1343 Note that first_significant_code() skips over backward and negative forward
1344 assertions when its final argument is TRUE. If we hit an unclosed bracket, we
1345 return "empty" - this means we've struck an inner bracket whose current branch
1346 will already have been scanned.
1347
1348 Arguments:
1349 code points to start of search
1350 endcode points to where to stop
1351 utf TRUE if in UTF mode
1352 cb compile data
1353 atend TRUE if being called to check an entire pattern
1354 recurses chain of recurse_check to catch mutual recursion
1355 countptr pointer to count to catch over-complicated pattern
1356
1357 Returns: 0 if what is matched cannot be empty
1358 1 if what is matched could be empty
1359 -1 if the pattern is too complicated
1360 */
1361
1362 #define CBE_NOTEMPTY 0
1363 #define CBE_EMPTY 1
1364 #define CBE_TOOCOMPLICATED (-1)
1365
1366
1367 static int
could_be_empty_branch(PCRE2_SPTR code,PCRE2_SPTR endcode,BOOL utf,compile_block * cb,BOOL atend,recurse_check * recurses,int * countptr)1368 could_be_empty_branch(PCRE2_SPTR code, PCRE2_SPTR endcode, BOOL utf,
1369 compile_block *cb, BOOL atend, recurse_check *recurses, int *countptr)
1370 {
1371 uint32_t group = 0;
1372 uint32_t groupinfo = 0;
1373 register PCRE2_UCHAR c;
1374 recurse_check this_recurse;
1375
1376 /* If what we are checking has already been set as "could be empty", we know
1377 the answer. */
1378
1379 if (*code >= OP_SBRA && *code <= OP_SCOND) return CBE_EMPTY;
1380
1381 /* If this is a capturing group, we may have the answer cached, but we can only
1382 use this information if there are no (?| groups in the pattern, because
1383 otherwise group numbers are not unique. */
1384
1385 if ((cb->external_flags & PCRE2_DUPCAPUSED) == 0 &&
1386 (*code == OP_CBRA || *code == OP_CBRAPOS))
1387 {
1388 group = GET2(code, 1 + LINK_SIZE);
1389 groupinfo = cb->groupinfo[group];
1390 if ((groupinfo & GI_SET_COULD_BE_EMPTY) != 0)
1391 return ((groupinfo & GI_COULD_BE_EMPTY) != 0)? CBE_EMPTY : CBE_NOTEMPTY;
1392 }
1393
1394 /* A large and/or complex regex can take too long to process. We have to assume
1395 it can match an empty string. This can happen more often when (?| groups are
1396 present in the pattern and the caching is disabled. Setting the cap at 1100
1397 allows the test for more than 1023 capturing patterns to work. */
1398
1399 if ((*countptr)++ > 1100) return CBE_TOOCOMPLICATED;
1400
1401 /* Scan the opcodes for this branch. */
1402
1403 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
1404 code < endcode;
1405 code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
1406 {
1407 PCRE2_SPTR ccode;
1408
1409 c = *code;
1410
1411 /* Skip over forward assertions; the other assertions are skipped by
1412 first_significant_code() with a TRUE final argument. */
1413
1414 if (c == OP_ASSERT)
1415 {
1416 do code += GET(code, 1); while (*code == OP_ALT);
1417 c = *code;
1418 continue;
1419 }
1420
1421 /* For a recursion/subroutine call we can scan the recursion when this
1422 function is called at the end, to check a complete pattern. Before then,
1423 recursions just have the group number as their argument and in any case may
1424 be forward references. In that situation, we return CBE_EMPTY, just in case.
1425 It means that unlimited repeats of groups that contain recursions are always
1426 treated as "could be empty" - which just adds a bit more processing time
1427 because of the runtime check. */
1428
1429 if (c == OP_RECURSE)
1430 {
1431 PCRE2_SPTR scode, endgroup;
1432 BOOL empty_branch;
1433
1434 if (!atend) goto ISTRUE;
1435 scode = cb->start_code + GET(code, 1);
1436 endgroup = scode;
1437
1438 /* We need to detect whether this is a recursive call, as otherwise there
1439 will be an infinite loop. If it is a recursion, just skip over it. Simple
1440 recursions are easily detected. For mutual recursions we keep a chain on
1441 the stack. */
1442
1443 do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
1444 if (code >= scode && code <= endgroup) continue; /* Simple recursion */
1445 else
1446 {
1447 recurse_check *r = recurses;
1448 for (r = recurses; r != NULL; r = r->prev)
1449 if (r->group == scode) break;
1450 if (r != NULL) continue; /* Mutual recursion */
1451 }
1452
1453 /* Scan the referenced group, remembering it on the stack chain to detect
1454 mutual recursions. */
1455
1456 empty_branch = FALSE;
1457 this_recurse.prev = recurses;
1458 this_recurse.group = scode;
1459
1460 do
1461 {
1462 int rc = could_be_empty_branch(scode, endcode, utf, cb, atend,
1463 &this_recurse, countptr);
1464 if (rc < 0) return rc;
1465 if (rc > 0)
1466 {
1467 empty_branch = TRUE;
1468 break;
1469 }
1470 scode += GET(scode, 1);
1471 }
1472 while (*scode == OP_ALT);
1473
1474 if (!empty_branch) goto ISFALSE; /* All branches are non-empty */
1475 continue;
1476 }
1477
1478 /* Groups with zero repeats can of course be empty; skip them. */
1479
1480 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
1481 c == OP_BRAPOSZERO)
1482 {
1483 code += PRIV(OP_lengths)[c];
1484 do code += GET(code, 1); while (*code == OP_ALT);
1485 c = *code;
1486 continue;
1487 }
1488
1489 /* A nested group that is already marked as "could be empty" can just be
1490 skipped. */
1491
1492 if (c == OP_SBRA || c == OP_SBRAPOS ||
1493 c == OP_SCBRA || c == OP_SCBRAPOS)
1494 {
1495 do code += GET(code, 1); while (*code == OP_ALT);
1496 c = *code;
1497 continue;
1498 }
1499
1500 /* For other groups, scan the branches. */
1501
1502 if (c == OP_BRA || c == OP_BRAPOS ||
1503 c == OP_CBRA || c == OP_CBRAPOS ||
1504 c == OP_ONCE || c == OP_ONCE_NC ||
1505 c == OP_COND || c == OP_SCOND)
1506 {
1507 BOOL empty_branch;
1508 if (GET(code, 1) == 0) goto ISTRUE; /* Hit unclosed bracket */
1509
1510 /* If a conditional group has only one branch, there is a second, implied,
1511 empty branch, so just skip over the conditional, because it could be empty.
1512 Otherwise, scan the individual branches of the group. */
1513
1514 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1515 code += GET(code, 1);
1516 else
1517 {
1518 empty_branch = FALSE;
1519 do
1520 {
1521 if (!empty_branch)
1522 {
1523 int rc = could_be_empty_branch(code, endcode, utf, cb, atend,
1524 recurses, countptr);
1525 if (rc < 0) return rc;
1526 if (rc > 0) empty_branch = TRUE;
1527 }
1528 code += GET(code, 1);
1529 }
1530 while (*code == OP_ALT);
1531 if (!empty_branch) goto ISFALSE; /* All branches are non-empty */
1532 }
1533
1534 c = *code;
1535 continue;
1536 }
1537
1538 /* Handle the other opcodes */
1539
1540 switch (c)
1541 {
1542 /* Check for quantifiers after a class. XCLASS is used for classes that
1543 cannot be represented just by a bit map. This includes negated single
1544 high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
1545 actual length is stored in the compiled code, so we must update "code"
1546 here. */
1547
1548 #if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
1549 case OP_XCLASS:
1550 ccode = code += GET(code, 1);
1551 goto CHECK_CLASS_REPEAT;
1552 #endif
1553
1554 case OP_CLASS:
1555 case OP_NCLASS:
1556 ccode = code + PRIV(OP_lengths)[OP_CLASS];
1557
1558 #if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
1559 CHECK_CLASS_REPEAT:
1560 #endif
1561
1562 switch (*ccode)
1563 {
1564 case OP_CRSTAR: /* These could be empty; continue */
1565 case OP_CRMINSTAR:
1566 case OP_CRQUERY:
1567 case OP_CRMINQUERY:
1568 case OP_CRPOSSTAR:
1569 case OP_CRPOSQUERY:
1570 break;
1571
1572 default: /* Non-repeat => class must match */
1573 case OP_CRPLUS: /* These repeats aren't empty */
1574 case OP_CRMINPLUS:
1575 case OP_CRPOSPLUS:
1576 goto ISFALSE;
1577
1578 case OP_CRRANGE:
1579 case OP_CRMINRANGE:
1580 case OP_CRPOSRANGE:
1581 if (GET2(ccode, 1) > 0) goto ISFALSE; /* Minimum > 0 */
1582 break;
1583 }
1584 break;
1585
1586 /* Opcodes that must match a character */
1587
1588 case OP_ANY:
1589 case OP_ALLANY:
1590 case OP_ANYBYTE:
1591
1592 case OP_PROP:
1593 case OP_NOTPROP:
1594 case OP_ANYNL:
1595
1596 case OP_NOT_HSPACE:
1597 case OP_HSPACE:
1598 case OP_NOT_VSPACE:
1599 case OP_VSPACE:
1600 case OP_EXTUNI:
1601
1602 case OP_NOT_DIGIT:
1603 case OP_DIGIT:
1604 case OP_NOT_WHITESPACE:
1605 case OP_WHITESPACE:
1606 case OP_NOT_WORDCHAR:
1607 case OP_WORDCHAR:
1608
1609 case OP_CHAR:
1610 case OP_CHARI:
1611 case OP_NOT:
1612 case OP_NOTI:
1613
1614 case OP_PLUS:
1615 case OP_PLUSI:
1616 case OP_MINPLUS:
1617 case OP_MINPLUSI:
1618
1619 case OP_NOTPLUS:
1620 case OP_NOTPLUSI:
1621 case OP_NOTMINPLUS:
1622 case OP_NOTMINPLUSI:
1623
1624 case OP_POSPLUS:
1625 case OP_POSPLUSI:
1626 case OP_NOTPOSPLUS:
1627 case OP_NOTPOSPLUSI:
1628
1629 case OP_EXACT:
1630 case OP_EXACTI:
1631 case OP_NOTEXACT:
1632 case OP_NOTEXACTI:
1633
1634 case OP_TYPEPLUS:
1635 case OP_TYPEMINPLUS:
1636 case OP_TYPEPOSPLUS:
1637 case OP_TYPEEXACT:
1638 goto ISFALSE;
1639
1640 /* These are going to continue, as they may be empty, but we have to
1641 fudge the length for the \p and \P cases. */
1642
1643 case OP_TYPESTAR:
1644 case OP_TYPEMINSTAR:
1645 case OP_TYPEPOSSTAR:
1646 case OP_TYPEQUERY:
1647 case OP_TYPEMINQUERY:
1648 case OP_TYPEPOSQUERY:
1649 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1650 break;
1651
1652 /* Same for these */
1653
1654 case OP_TYPEUPTO:
1655 case OP_TYPEMINUPTO:
1656 case OP_TYPEPOSUPTO:
1657 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
1658 code += 2;
1659 break;
1660
1661 /* End of branch */
1662
1663 case OP_KET:
1664 case OP_KETRMAX:
1665 case OP_KETRMIN:
1666 case OP_KETRPOS:
1667 case OP_ALT:
1668 goto ISTRUE;
1669
1670 /* In UTF-8 or UTF-16 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY,
1671 POSQUERY, UPTO, MINUPTO, and POSUPTO and their caseless and negative
1672 versions may be followed by a multibyte character. */
1673
1674 #ifdef MAYBE_UTF_MULTI
1675 case OP_STAR:
1676 case OP_STARI:
1677 case OP_NOTSTAR:
1678 case OP_NOTSTARI:
1679
1680 case OP_MINSTAR:
1681 case OP_MINSTARI:
1682 case OP_NOTMINSTAR:
1683 case OP_NOTMINSTARI:
1684
1685 case OP_POSSTAR:
1686 case OP_POSSTARI:
1687 case OP_NOTPOSSTAR:
1688 case OP_NOTPOSSTARI:
1689
1690 case OP_QUERY:
1691 case OP_QUERYI:
1692 case OP_NOTQUERY:
1693 case OP_NOTQUERYI:
1694
1695 case OP_MINQUERY:
1696 case OP_MINQUERYI:
1697 case OP_NOTMINQUERY:
1698 case OP_NOTMINQUERYI:
1699
1700 case OP_POSQUERY:
1701 case OP_POSQUERYI:
1702 case OP_NOTPOSQUERY:
1703 case OP_NOTPOSQUERYI:
1704 if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
1705 break;
1706
1707 case OP_UPTO:
1708 case OP_UPTOI:
1709 case OP_NOTUPTO:
1710 case OP_NOTUPTOI:
1711
1712 case OP_MINUPTO:
1713 case OP_MINUPTOI:
1714 case OP_NOTMINUPTO:
1715 case OP_NOTMINUPTOI:
1716
1717 case OP_POSUPTO:
1718 case OP_POSUPTOI:
1719 case OP_NOTPOSUPTO:
1720 case OP_NOTPOSUPTOI:
1721 if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
1722 break;
1723 #endif /* MAYBE_UTF_MULTI */
1724
1725 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
1726 string. */
1727
1728 case OP_MARK:
1729 case OP_PRUNE_ARG:
1730 case OP_SKIP_ARG:
1731 case OP_THEN_ARG:
1732 code += code[1];
1733 break;
1734
1735 /* None of the remaining opcodes are required to match a character. */
1736
1737 default:
1738 break;
1739 }
1740 }
1741
1742 ISTRUE:
1743 groupinfo |= GI_COULD_BE_EMPTY;
1744
1745 ISFALSE:
1746 if (group > 0) cb->groupinfo[group] = groupinfo | GI_SET_COULD_BE_EMPTY;
1747
1748 return ((groupinfo & GI_COULD_BE_EMPTY) != 0)? CBE_EMPTY : CBE_NOTEMPTY;
1749 }
1750
1751
1752
1753 /*************************************************
1754 * Check for counted repeat *
1755 *************************************************/
1756
1757 /* This function is called when a '{' is encountered in a place where it might
1758 start a quantifier. It looks ahead to see if it really is a quantifier, that
1759 is, one of the forms {ddd} {ddd,} or {ddd,ddd} where the ddds are digits.
1760
1761 Argument: pointer to the first char after '{'
1762 Returns: TRUE or FALSE
1763 */
1764
1765 static BOOL
is_counted_repeat(PCRE2_SPTR p)1766 is_counted_repeat(PCRE2_SPTR p)
1767 {
1768 if (!IS_DIGIT(*p)) return FALSE;
1769 p++;
1770 while (IS_DIGIT(*p)) p++;
1771 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
1772
1773 if (*p++ != CHAR_COMMA) return FALSE;
1774 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
1775
1776 if (!IS_DIGIT(*p)) return FALSE;
1777 p++;
1778 while (IS_DIGIT(*p)) p++;
1779
1780 return (*p == CHAR_RIGHT_CURLY_BRACKET);
1781 }
1782
1783
1784
1785 /*************************************************
1786 * Handle escapes *
1787 *************************************************/
1788
1789 /* This function is called when a \ has been encountered. It either returns a
1790 positive value for a simple escape such as \d, or 0 for a data character, which
1791 is placed in chptr. A backreference to group n is returned as negative n. On
1792 entry, ptr is pointing at the \. On exit, it points to the final code unit of the
1793 escape sequence.
1794
1795 This function is also called from pcre2_substitute() to handle escape sequences
1796 in replacement strings. In this case, the cb argument is NULL, and only
1797 sequences that define a data character are recognised. The isclass argument is
1798 not relevant, but the options argument is the final value of the compiled
1799 pattern's options.
1800
1801 There is one "trick" case: when a sequence such as [[:>:]] or \s in UCP mode is
1802 processed, it is replaced by a nested alternative sequence. If this contains a
1803 backslash (which it usually does), ptrend does not point to its end - it still
1804 points to the end of the whole pattern. However, we can detect this case
1805 because cb->nestptr[0] will be non-NULL. The nested sequences are all zero-
1806 terminated and there are only ever two levels of nesting.
1807
1808 Arguments:
1809 ptrptr points to the input position pointer
1810 ptrend points to the end of the input
1811 chptr points to a returned data character
1812 errorcodeptr points to the errorcode variable (containing zero)
1813 options the current options bits
1814 isclass TRUE if inside a character class
1815 cb compile data block
1816
1817 Returns: zero => a data character
1818 positive => a special escape sequence
1819 negative => a back reference
1820 on error, errorcodeptr is set non-zero
1821 */
1822
1823 int
PRIV(check_escape)1824 PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1825 int *errorcodeptr, uint32_t options, BOOL isclass, compile_block *cb)
1826 {
1827 BOOL utf = (options & PCRE2_UTF) != 0;
1828 PCRE2_SPTR ptr = *ptrptr + 1;
1829 register uint32_t c, cc;
1830 int escape = 0;
1831 int i;
1832
1833 /* Find the end of a nested insert. */
1834
1835 if (cb != NULL && cb->nestptr[0] != NULL)
1836 ptrend = ptr + PRIV(strlen)(ptr);
1837
1838 /* If backslash is at the end of the string, it's an error. */
1839
1840 if (ptr >= ptrend)
1841 {
1842 *errorcodeptr = ERR1;
1843 return 0;
1844 }
1845
1846 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
1847 ptr--; /* Set pointer back to the last code unit */
1848
1849 /* Non-alphanumerics are literals, so we just leave the value in c. An initial
1850 value test saves a memory lookup for code points outside the alphanumeric
1851 range. Otherwise, do a table lookup. A non-zero result is something that can be
1852 returned immediately. Otherwise further processing is required. */
1853
1854 if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {} /* Definitely literal */
1855
1856 else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1857 {
1858 if (i > 0) c = (uint32_t)i; else /* Positive is a data character */
1859 {
1860 escape = -i; /* Else return a special escape */
1861 if (escape == ESC_P || escape == ESC_p || escape == ESC_X)
1862 cb->external_flags |= PCRE2_HASBKPORX; /* Note \P, \p, or \X */
1863 }
1864 }
1865
1866 /* Escapes that need further processing, including those that are unknown.
1867 When called from pcre2_substitute(), only \c, \o, and \x are recognized (and \u
1868 when BSUX is set). */
1869
1870 else
1871 {
1872 PCRE2_SPTR oldptr;
1873 BOOL braced, negated, overflow;
1874 unsigned int s;
1875
1876 /* Filter calls from pcre2_substitute(). */
1877
1878 if (cb == NULL && c != CHAR_c && c != CHAR_o && c != CHAR_x &&
1879 (c != CHAR_u || (options & PCRE2_ALT_BSUX) != 0))
1880 {
1881 *errorcodeptr = ERR3;
1882 return 0;
1883 }
1884
1885 switch (c)
1886 {
1887 /* A number of Perl escapes are not handled by PCRE. We give an explicit
1888 error. */
1889
1890 case CHAR_l:
1891 case CHAR_L:
1892 *errorcodeptr = ERR37;
1893 break;
1894
1895 /* \u is unrecognized when PCRE2_ALT_BSUX is not set. When it is treated
1896 specially, \u must be followed by four hex digits. Otherwise it is a
1897 lowercase u letter. */
1898
1899 case CHAR_u:
1900 if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37; else
1901 {
1902 uint32_t xc;
1903 if ((cc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
1904 if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */
1905 cc = (cc << 4) | xc;
1906 if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */
1907 cc = (cc << 4) | xc;
1908 if ((xc = XDIGIT(ptr[4])) == 0xff) break; /* Not a hex digit */
1909 c = (cc << 4) | xc;
1910 ptr += 4;
1911 if (utf)
1912 {
1913 if (c > 0x10ffffU) *errorcodeptr = ERR77;
1914 else if (c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1915 }
1916 else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1917 }
1918 break;
1919
1920 case CHAR_U:
1921 /* \U is unrecognized unless PCRE2_ALT_BSUX is set, in which case it is an
1922 upper case letter. */
1923 if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37;
1924 break;
1925
1926 /* In a character class, \g is just a literal "g". Outside a character
1927 class, \g must be followed by one of a number of specific things:
1928
1929 (1) A number, either plain or braced. If positive, it is an absolute
1930 backreference. If negative, it is a relative backreference. This is a Perl
1931 5.10 feature.
1932
1933 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1934 is part of Perl's movement towards a unified syntax for back references. As
1935 this is synonymous with \k{name}, we fudge it up by pretending it really
1936 was \k.
1937
1938 (3) For Oniguruma compatibility we also support \g followed by a name or a
1939 number either in angle brackets or in single quotes. However, these are
1940 (possibly recursive) subroutine calls, _not_ backreferences. Just return
1941 the ESC_g code (cf \k). */
1942
1943 case CHAR_g:
1944 if (isclass) break;
1945 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
1946 {
1947 escape = ESC_g;
1948 break;
1949 }
1950
1951 /* Handle the Perl-compatible cases */
1952
1953 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1954 {
1955 PCRE2_SPTR p;
1956 for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
1957 if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
1958 if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
1959 {
1960 escape = ESC_k;
1961 break;
1962 }
1963 braced = TRUE;
1964 ptr++;
1965 }
1966 else braced = FALSE;
1967
1968 if (ptr[1] == CHAR_MINUS)
1969 {
1970 negated = TRUE;
1971 ptr++;
1972 }
1973 else negated = FALSE;
1974
1975 /* The integer range is limited by the machine's int representation. */
1976 s = 0;
1977 overflow = FALSE;
1978 while (IS_DIGIT(ptr[1]))
1979 {
1980 if (s > INT_MAX / 10 - 1) /* Integer overflow */
1981 {
1982 overflow = TRUE;
1983 break;
1984 }
1985 s = s * 10 + (unsigned int)(*(++ptr) - CHAR_0);
1986 }
1987 if (overflow) /* Integer overflow */
1988 {
1989 while (IS_DIGIT(ptr[1])) ptr++;
1990 *errorcodeptr = ERR61;
1991 break;
1992 }
1993
1994 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
1995 {
1996 *errorcodeptr = ERR57;
1997 break;
1998 }
1999
2000 if (s == 0)
2001 {
2002 *errorcodeptr = ERR58;
2003 break;
2004 }
2005
2006 if (negated)
2007 {
2008 if (s > cb->bracount)
2009 {
2010 *errorcodeptr = ERR15;
2011 break;
2012 }
2013 s = cb->bracount - (s - 1);
2014 }
2015
2016 escape = -(int)s;
2017 break;
2018
2019 /* The handling of escape sequences consisting of a string of digits
2020 starting with one that is not zero is not straightforward. Perl has changed
2021 over the years. Nowadays \g{} for backreferences and \o{} for octal are
2022 recommended to avoid the ambiguities in the old syntax.
2023
2024 Outside a character class, the digits are read as a decimal number. If the
2025 number is less than 10, or if there are that many previous extracting left
2026 brackets, it is a back reference. Otherwise, up to three octal digits are
2027 read to form an escaped character code. Thus \123 is likely to be octal 123
2028 (cf \0123, which is octal 012 followed by the literal 3).
2029
2030 Inside a character class, \ followed by a digit is always either a literal
2031 8 or 9 or an octal number. */
2032
2033 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
2034 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
2035
2036 if (!isclass)
2037 {
2038 oldptr = ptr;
2039 /* The integer range is limited by the machine's int representation. */
2040 s = c - CHAR_0;
2041 overflow = FALSE;
2042 while (IS_DIGIT(ptr[1]))
2043 {
2044 if (s > INT_MAX / 10 - 1) /* Integer overflow */
2045 {
2046 overflow = TRUE;
2047 break;
2048 }
2049 s = s * 10 + (unsigned int)(*(++ptr) - CHAR_0);
2050 }
2051 if (overflow) /* Integer overflow */
2052 {
2053 while (IS_DIGIT(ptr[1])) ptr++;
2054 *errorcodeptr = ERR61;
2055 break;
2056 }
2057
2058 /* \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
2059 are octal escapes if there are not that many previous captures. */
2060
2061 if (s < 10 || *oldptr >= CHAR_8 || s <= cb->bracount)
2062 {
2063 escape = -(int)s; /* Indicates a back reference */
2064 break;
2065 }
2066 ptr = oldptr; /* Put the pointer back and fall through */
2067 }
2068
2069 /* Handle a digit following \ when the number is not a back reference, or
2070 we are within a character class. If the first digit is 8 or 9, Perl used to
2071 generate a binary zero byte and then treat the digit as a following
2072 literal. At least by Perl 5.18 this changed so as not to insert the binary
2073 zero. */
2074
2075 if ((c = *ptr) >= CHAR_8) break;
2076
2077 /* Fall through with a digit less than 8 */
2078
2079 /* \0 always starts an octal number, but we may drop through to here with a
2080 larger first octal digit. The original code used just to take the least
2081 significant 8 bits of octal numbers (I think this is what early Perls used
2082 to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
2083 but no more than 3 octal digits. */
2084
2085 case CHAR_0:
2086 c -= CHAR_0;
2087 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
2088 c = c * 8 + *(++ptr) - CHAR_0;
2089 #if PCRE2_CODE_UNIT_WIDTH == 8
2090 if (!utf && c > 0xff) *errorcodeptr = ERR51;
2091 #endif
2092 break;
2093
2094 /* \o is a relatively new Perl feature, supporting a more general way of
2095 specifying character codes in octal. The only supported form is \o{ddd}. */
2096
2097 case CHAR_o:
2098 if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR55; else
2099 if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR78; else
2100 {
2101 ptr += 2;
2102 c = 0;
2103 overflow = FALSE;
2104 while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
2105 {
2106 cc = *ptr++;
2107 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
2108 #if PCRE2_CODE_UNIT_WIDTH == 32
2109 if (c >= 0x20000000l) { overflow = TRUE; break; }
2110 #endif
2111 c = (c << 3) + (cc - CHAR_0);
2112 #if PCRE2_CODE_UNIT_WIDTH == 8
2113 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
2114 #elif PCRE2_CODE_UNIT_WIDTH == 16
2115 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
2116 #elif PCRE2_CODE_UNIT_WIDTH == 32
2117 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
2118 #endif
2119 }
2120 if (overflow)
2121 {
2122 while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
2123 *errorcodeptr = ERR34;
2124 }
2125 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
2126 {
2127 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
2128 }
2129 else *errorcodeptr = ERR64;
2130 }
2131 break;
2132
2133 /* \x is complicated. When PCRE2_ALT_BSUX is set, \x must be followed by
2134 two hexadecimal digits. Otherwise it is a lowercase x letter. */
2135
2136 case CHAR_x:
2137 if ((options & PCRE2_ALT_BSUX) != 0)
2138 {
2139 uint32_t xc;
2140 if ((cc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
2141 if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */
2142 c = (cc << 4) | xc;
2143 ptr += 2;
2144 } /* End PCRE2_ALT_BSUX handling */
2145
2146 /* Handle \x in Perl's style. \x{ddd} is a character number which can be
2147 greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
2148 digits. If not, { used to be treated as a data character. However, Perl
2149 seems to read hex digits up to the first non-such, and ignore the rest, so
2150 that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
2151 now gives an error. */
2152
2153 else
2154 {
2155 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
2156 {
2157 ptr += 2;
2158 if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
2159 {
2160 *errorcodeptr = ERR78;
2161 break;
2162 }
2163 c = 0;
2164 overflow = FALSE;
2165
2166 while ((cc = XDIGIT(*ptr)) != 0xff)
2167 {
2168 ptr++;
2169 if (c == 0 && cc == 0) continue; /* Leading zeroes */
2170 #if PCRE2_CODE_UNIT_WIDTH == 32
2171 if (c >= 0x10000000l) { overflow = TRUE; break; }
2172 #endif
2173 c = (c << 4) | cc;
2174 if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
2175 {
2176 overflow = TRUE;
2177 break;
2178 }
2179 }
2180
2181 if (overflow)
2182 {
2183 while (XDIGIT(*ptr) != 0xff) ptr++;
2184 *errorcodeptr = ERR34;
2185 }
2186 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
2187 {
2188 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
2189 }
2190
2191 /* If the sequence of hex digits does not end with '}', give an error.
2192 We used just to recognize this construct and fall through to the normal
2193 \x handling, but nowadays Perl gives an error, which seems much more
2194 sensible, so we do too. */
2195
2196 else *errorcodeptr = ERR67;
2197 } /* End of \x{} processing */
2198
2199 /* Read a single-byte hex-defined char (up to two hex digits after \x) */
2200
2201 else
2202 {
2203 c = 0;
2204 if ((cc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
2205 ptr++;
2206 c = cc;
2207 if ((cc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
2208 ptr++;
2209 c = (c << 4) | cc;
2210 } /* End of \xdd handling */
2211 } /* End of Perl-style \x handling */
2212 break;
2213
2214 /* The handling of \c is different in ASCII and EBCDIC environments. In an
2215 ASCII (or Unicode) environment, an error is given if the character
2216 following \c is not a printable ASCII character. Otherwise, the following
2217 character is upper-cased if it is a letter, and after that the 0x40 bit is
2218 flipped. The result is the value of the escape.
2219
2220 In an EBCDIC environment the handling of \c is compatible with the
2221 specification in the perlebcdic document. The following character must be
2222 a letter or one of small number of special characters. These provide a
2223 means of defining the character values 0-31.
2224
2225 For testing the EBCDIC handling of \c in an ASCII environment, recognize
2226 the EBCDIC value of 'c' explicitly. */
2227
2228 #if defined EBCDIC && 'a' != 0x81
2229 case 0x83:
2230 #else
2231 case CHAR_c:
2232 #endif
2233
2234 c = *(++ptr);
2235 if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
2236 if (c == CHAR_NULL && ptr >= ptrend)
2237 {
2238 *errorcodeptr = ERR2;
2239 break;
2240 }
2241
2242 /* Handle \c in an ASCII/Unicode environment. */
2243
2244 #ifndef EBCDIC /* ASCII/UTF-8 coding */
2245 if (c < 32 || c > 126) /* Excludes all non-printable ASCII */
2246 {
2247 *errorcodeptr = ERR68;
2248 break;
2249 }
2250 c ^= 0x40;
2251
2252 /* Handle \c in an EBCDIC environment. The special case \c? is converted to
2253 255 (0xff) or 95 (0x5f) if other character suggest we are using th POSIX-BC
2254 encoding. (This is the way Perl indicates that it handles \c?.) The other
2255 valid sequences correspond to a list of specific characters. */
2256
2257 #else
2258 if (c == CHAR_QUESTION_MARK)
2259 c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
2260 else
2261 {
2262 for (i = 0; i < 32; i++)
2263 {
2264 if (c == ebcdic_escape_c[i]) break;
2265 }
2266 if (i < 32) c = i; else *errorcodeptr = ERR68;
2267 }
2268 #endif /* EBCDIC */
2269
2270 break;
2271
2272 /* Any other alphanumeric following \ is an error. Perl gives an error only
2273 if in warning mode, but PCRE doesn't have a warning mode. */
2274
2275 default:
2276 *errorcodeptr = ERR3;
2277 break;
2278 }
2279 }
2280
2281 /* Perl supports \N{name} for character names, as well as plain \N for "not
2282 newline". PCRE does not support \N{name}. However, it does support
2283 quantification such as \N{2,3}. */
2284
2285 if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
2286 !is_counted_repeat(ptr+2))
2287 *errorcodeptr = ERR37;
2288
2289 /* If PCRE2_UCP is set, we change the values for \d etc. */
2290
2291 if ((options & PCRE2_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
2292 escape += (ESC_DU - ESC_D);
2293
2294 /* Set the pointer to the final character before returning. */
2295
2296 *ptrptr = ptr;
2297 *chptr = c;
2298 return escape;
2299 }
2300
2301
2302
#ifdef SUPPORT_UNICODE
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Read the property name that follows \P or \p and look it up in the table of
known property names. The name is either a single letter or a string enclosed
in curly brackets, where a leading circumflex requests negation. On entry,
*ptrptr points at the P or p; on exit it points at the last code unit that was
consumed, both on success and on failure.

Arguments:
  ptrptr         the pattern position pointer
  negptr         a boolean that is set TRUE for negation else FALSE
  ptypeptr       an unsigned int that is set to the type value
  pdataptr       an unsigned int that is set to the detailed property value
  errorcodeptr   the error code variable
  cb             the compile data

Returns:         TRUE if the type value was found, or FALSE for an invalid type
                 (ERR46 for a malformed item, ERR47 for an unknown name)
*/

static BOOL
get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, unsigned int *ptypeptr,
  unsigned int *pdataptr, int *errorcodeptr, compile_block *cb)
{
PCRE2_UCHAR name[32];
PCRE2_SPTR ptr = *ptrptr;
PCRE2_UCHAR ch;
size_t len, lo, hi;

*negptr = FALSE;
ptr++;
ch = *ptr;

/* The short form: exactly one following character, which must be a letter
according to the character type table. */

if (ch != CHAR_LEFT_CURLY_BRACKET)
  {
  if (!MAX_255(ch) || (cb->ctypes[ch] & ctype_letter) == 0) goto MALFORMED;
  name[0] = ch;
  name[1] = 0;
  }

/* The long form: a name in {}, optionally preceded by ^ for negation. The
name must fit in the buffer with room left for the terminating zero. */

else
  {
  const size_t maxlen = sizeof(name) / sizeof(PCRE2_UCHAR) - 1;

  if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
    {
    *negptr = TRUE;
    ptr++;
    }

  len = 0;
  while (len < maxlen)
    {
    ch = *(++ptr);
    if (ch == CHAR_NULL) goto MALFORMED;
    if (ch == CHAR_RIGHT_CURLY_BRACKET) break;
    name[len++] = ch;
    }

  /* Leaving the loop without having seen '}' means the name was too long. */

  if (ch != CHAR_RIGHT_CURLY_BRACKET) goto MALFORMED;
  name[len] = 0;
  }

*ptrptr = ptr;

/* Binary chop through the table of property names. */

lo = 0;
hi = PRIV(utt_size);

while (lo < hi)
  {
  size_t mid = (lo + hi) >> 1;
  int cmp = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[mid].name_offset);

  if (cmp > 0) lo = mid + 1;
  else if (cmp < 0) hi = mid;
  else
    {
    *ptypeptr = PRIV(utt)[mid].type;
    *pdataptr = PRIV(utt)[mid].value;
    return TRUE;
    }
  }

*errorcodeptr = ERR47;   /* Unrecognized name */
return FALSE;

MALFORMED:               /* Malformed \P or \p */
*errorcodeptr = ERR46;
*ptrptr = ptr;
return FALSE;
}
#endif
2396
2397
2398
2399 /*************************************************
2400 * Read repeat counts *
2401 *************************************************/
2402
2403 /* Read an item of the form {n,m} and return the values. This is called only
2404 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
2405 so the syntax is guaranteed to be correct, but we need to check the values.
2406
2407 Arguments:
2408 p pointer to first char after '{'
2409 minp pointer to int for min
2410 maxp pointer to int for max
2411 returned as -1 if no max
2412 errorcodeptr points to error code variable
2413
2414 Returns: pointer to '}' on success;
2415 current ptr on error, with errorcodeptr set non-zero
2416 */
2417
2418 static PCRE2_SPTR
read_repeat_counts(PCRE2_SPTR p,int * minp,int * maxp,int * errorcodeptr)2419 read_repeat_counts(PCRE2_SPTR p, int *minp, int *maxp, int *errorcodeptr)
2420 {
2421 int min = 0;
2422 int max = -1;
2423
2424 while (IS_DIGIT(*p))
2425 {
2426 min = min * 10 + (int)(*p++ - CHAR_0);
2427 if (min > 65535)
2428 {
2429 *errorcodeptr = ERR5;
2430 return p;
2431 }
2432 }
2433
2434 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
2435 {
2436 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
2437 {
2438 max = 0;
2439 while(IS_DIGIT(*p))
2440 {
2441 max = max * 10 + (int)(*p++ - CHAR_0);
2442 if (max > 65535)
2443 {
2444 *errorcodeptr = ERR5;
2445 return p;
2446 }
2447 }
2448 if (max < min)
2449 {
2450 *errorcodeptr = ERR4;
2451 return p;
2452 }
2453 }
2454 }
2455
2456 *minp = min;
2457 *maxp = max;
2458 return p;
2459 }
2460
2461
2462
2463 /*************************************************
2464 * Scan compiled regex for recursion reference *
2465 *************************************************/
2466
2467 /* This function scans through a compiled pattern until it finds an instance of
2468 OP_RECURSE.
2469
2470 Arguments:
2471 code points to start of expression
2472 utf TRUE in UTF mode
2473
2474 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
2475 */
2476
2477 static PCRE2_SPTR
find_recurse(PCRE2_SPTR code,BOOL utf)2478 find_recurse(PCRE2_SPTR code, BOOL utf)
2479 {
2480 for (;;)
2481 {
2482 register PCRE2_UCHAR c = *code;
2483 if (c == OP_END) return NULL;
2484 if (c == OP_RECURSE) return code;
2485
2486 /* XCLASS is used for classes that cannot be represented just by a bit map.
2487 This includes negated single high-valued characters. CALLOUT_STR is used for
2488 callouts with string arguments. In both cases the length in the table is
2489 zero; the actual length is stored in the compiled code. */
2490
2491 if (c == OP_XCLASS) code += GET(code, 1);
2492 else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
2493
2494 /* Otherwise, we can get the item's length from the table, except that for
2495 repeated character types, we have to test for \p and \P, which have an extra
2496 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2497 must add in its length. */
2498
2499 else
2500 {
2501 switch(c)
2502 {
2503 case OP_TYPESTAR:
2504 case OP_TYPEMINSTAR:
2505 case OP_TYPEPLUS:
2506 case OP_TYPEMINPLUS:
2507 case OP_TYPEQUERY:
2508 case OP_TYPEMINQUERY:
2509 case OP_TYPEPOSSTAR:
2510 case OP_TYPEPOSPLUS:
2511 case OP_TYPEPOSQUERY:
2512 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2513 break;
2514
2515 case OP_TYPEPOSUPTO:
2516 case OP_TYPEUPTO:
2517 case OP_TYPEMINUPTO:
2518 case OP_TYPEEXACT:
2519 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2520 code += 2;
2521 break;
2522
2523 case OP_MARK:
2524 case OP_PRUNE_ARG:
2525 case OP_SKIP_ARG:
2526 case OP_THEN_ARG:
2527 code += code[1];
2528 break;
2529 }
2530
2531 /* Add in the fixed length from the table */
2532
2533 code += PRIV(OP_lengths)[c];
2534
2535 /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
2536 be followed by a multi-unit character. The length in the table is a
2537 minimum, so we have to arrange to skip the extra units. */
2538
2539 #ifdef MAYBE_UTF_MULTI
2540 if (utf) switch(c)
2541 {
2542 case OP_CHAR:
2543 case OP_CHARI:
2544 case OP_NOT:
2545 case OP_NOTI:
2546 case OP_EXACT:
2547 case OP_EXACTI:
2548 case OP_NOTEXACT:
2549 case OP_NOTEXACTI:
2550 case OP_UPTO:
2551 case OP_UPTOI:
2552 case OP_NOTUPTO:
2553 case OP_NOTUPTOI:
2554 case OP_MINUPTO:
2555 case OP_MINUPTOI:
2556 case OP_NOTMINUPTO:
2557 case OP_NOTMINUPTOI:
2558 case OP_POSUPTO:
2559 case OP_POSUPTOI:
2560 case OP_NOTPOSUPTO:
2561 case OP_NOTPOSUPTOI:
2562 case OP_STAR:
2563 case OP_STARI:
2564 case OP_NOTSTAR:
2565 case OP_NOTSTARI:
2566 case OP_MINSTAR:
2567 case OP_MINSTARI:
2568 case OP_NOTMINSTAR:
2569 case OP_NOTMINSTARI:
2570 case OP_POSSTAR:
2571 case OP_POSSTARI:
2572 case OP_NOTPOSSTAR:
2573 case OP_NOTPOSSTARI:
2574 case OP_PLUS:
2575 case OP_PLUSI:
2576 case OP_NOTPLUS:
2577 case OP_NOTPLUSI:
2578 case OP_MINPLUS:
2579 case OP_MINPLUSI:
2580 case OP_NOTMINPLUS:
2581 case OP_NOTMINPLUSI:
2582 case OP_POSPLUS:
2583 case OP_POSPLUSI:
2584 case OP_NOTPOSPLUS:
2585 case OP_NOTPOSPLUSI:
2586 case OP_QUERY:
2587 case OP_QUERYI:
2588 case OP_NOTQUERY:
2589 case OP_NOTQUERYI:
2590 case OP_MINQUERY:
2591 case OP_MINQUERYI:
2592 case OP_NOTMINQUERY:
2593 case OP_NOTMINQUERYI:
2594 case OP_POSQUERY:
2595 case OP_POSQUERYI:
2596 case OP_NOTPOSQUERY:
2597 case OP_NOTPOSQUERYI:
2598 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2599 break;
2600 }
2601 #else
2602 (void)(utf); /* Keep compiler happy by referencing function argument */
2603 #endif /* MAYBE_UTF_MULTI */
2604 }
2605 }
2606 }
2607
2608
2609
2610 /*************************************************
2611 * Check for POSIX class syntax *
2612 *************************************************/
2613
2614 /* This function is called when the sequence "[:" or "[." or "[=" is
2615 encountered in a character class. It checks whether this is followed by a
2616 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2617 reach an unescaped ']' without the special preceding character, return FALSE.
2618
2619 Originally, this function only recognized a sequence of letters between the
2620 terminators, but it seems that Perl recognizes any sequence of characters,
2621 though of course unknown POSIX names are subsequently rejected. Perl gives an
2622 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2623 didn't consider this to be a POSIX class. Likewise for [:1234:].
2624
2625 The problem in trying to be exactly like Perl is in the handling of escapes. We
2626 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2627 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2628 below handles the special cases \\ and \], but does not try to do any other
2629 escape processing. This makes it different from Perl for cases such as
2630 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2631 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2632 when Perl does, I think.
2633
2634 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2635 It seems that the appearance of a nested POSIX class supersedes an apparent
2636 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2637 a digit. This is handled by returning FALSE if the start of a new group with
2638 the same terminator is encountered, since the next closing sequence must close
2639 the nested group, not the outer one.
2640
2641 In Perl, unescaped square brackets may also appear as part of class names. For
2642 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2643 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2644 seem right at all. PCRE does not allow closing square brackets in POSIX class
2645 names.
2646
2647 Arguments:
2648 ptr pointer to the initial [
2649 endptr where to return a pointer to the terminating ':', '.', or '='
2650
2651 Returns: TRUE or FALSE
2652 */
2653
2654 static BOOL
check_posix_syntax(PCRE2_SPTR ptr,PCRE2_SPTR * endptr)2655 check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR *endptr)
2656 {
2657 PCRE2_UCHAR terminator; /* Don't combine these lines; the Solaris cc */
2658 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2659
2660 for (++ptr; *ptr != CHAR_NULL; ptr++)
2661 {
2662 if (*ptr == CHAR_BACKSLASH &&
2663 (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2664 ptr++;
2665 else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2666 *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2667 else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2668 {
2669 *endptr = ptr;
2670 return TRUE;
2671 }
2672 }
2673
2674 return FALSE;
2675 }
2676
2677
2678
2679 /*************************************************
2680 * Check POSIX class name *
2681 *************************************************/
2682
2683 /* This function is called to check the name given in a POSIX-style class entry
2684 such as [:alnum:].
2685
2686 Arguments:
2687 ptr points to the first letter
2688 len the length of the name
2689
2690 Returns: a value representing the name, or -1 if unknown
2691 */
2692
2693 static int
check_posix_name(PCRE2_SPTR ptr,int len)2694 check_posix_name(PCRE2_SPTR ptr, int len)
2695 {
2696 const char *pn = posix_names;
2697 register int yield = 0;
2698 while (posix_name_lengths[yield] != 0)
2699 {
2700 if (len == posix_name_lengths[yield] &&
2701 PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2702 pn += posix_name_lengths[yield] + 1;
2703 yield++;
2704 }
2705 return -1;
2706 }
2707
2708
2709
#ifdef SUPPORT_UNICODE
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the remaining part of a class range, find the next run of characters
whose other-case characters are themselves consecutive, and return that run of
other-case values. The start value is advanced past whatever was examined, so
repeated calls work through the whole range. A character that has multiple
other cases is returned on its own with a special return value.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        -1 when no more
               0 when a range is returned
              >0 the CASESET offset for char with multiple other cases
                in this case, ocptr contains the original
*/

static int
get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
  uint32_t *odptr)
{
uint32_t ch = *cptr;
uint32_t first_other = 0;
uint32_t expect;

/* Skip characters that are their own other case. Stop at the first one that
is not, but if a character with a set of several other cases turns up first,
hand it back immediately. */

while (ch <= d)
  {
  unsigned int caseset = UCD_CASESET(ch);

  if (caseset != 0)
    {
    *ocptr = ch;         /* Character that has the set */
    *cptr = ch + 1;      /* Rest of input range */
    return (int)caseset;
    }

  first_other = UCD_OTHERCASE(ch);
  if (first_other != ch) break;
  ch++;
  }

if (ch > d) return -1;   /* Reached end of range */

/* A character with a single other case has been found. Extend the run for as
long as the following characters have no case set and their other cases follow
on consecutively. */

*ocptr = first_other;
expect = first_other + 1;
ch++;

while (ch <= d && UCD_CASESET(ch) == 0 && UCD_OTHERCASE(ch) == expect)
  {
  ch++;
  expect++;
  }

*odptr = expect - 1;     /* End of othercase range */
*cptr = ch;              /* Rest of input range */
return 0;
}
#endif  /* SUPPORT_UNICODE */
2773
2774
2775
2776 /*************************************************
2777 * Add a character or range to a class *
2778 *************************************************/
2779
2780 /* This function packages up the logic of adding a character or range of
2781 characters to a class. The character values in the arguments will be within the
2782 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
2783 mutually recursive with the function immediately below.
2784
2785 Arguments:
2786 classbits the bit map for characters < 256
2787 uchardptr points to the pointer for extra data
2788 options the options word
2789 cb compile data
2790 start start of range character
2791 end end of range character
2792
2793 Returns: the number of < 256 characters added
2794 the pointer to extra data is updated
2795 */
2796
/* Add the inclusive range start..end to a character class. Values below 256
are recorded in the classbits bit map; when SUPPORT_WIDE_CHARS is defined, the
part of the range above 255 is appended to the extra data at *uchardptr as an
XCL_RANGE or XCL_SINGLE item. The function is mutually recursive with
add_list_to_class(). */

static unsigned int
add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
  compile_block *cb, uint32_t start, uint32_t end)
{
uint32_t c;
uint32_t classbits_end = (end <= 0xff ? end : 0xff);  /* Last value for the bit map */
unsigned int n8 = 0;   /* Incremented each time a bit is set in the bit map */

/* If caseless matching is required, scan the range and process alternate
cases. In Unicode, there are 8-bit characters that have alternate cases that
are greater than 255 and vice-versa. Sometimes we can just extend the original
range. */

if ((options & PCRE2_CASELESS) != 0)
  {
#ifdef SUPPORT_UNICODE
  if ((options & PCRE2_UTF) != 0)
    {
    int rc;
    uint32_t oc, od;

    options &= ~PCRE2_CASELESS;   /* Remove for recursive calls */
    c = start;

    /* Each call yields an other-case range oc..od; a negative result ends the
    scan. */

    while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
      {
      /* Handle a single character that has more than one other case. The
      positive rc is used as an offset into the caseless sets table, and oc is
      passed as the character to omit from the list. */

      if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cb,
        PRIV(ucd_caseless_sets) + rc, oc);

      /* Do nothing if the other case range is within the original range. */

      else if (oc >= start && od <= end) continue;

      /* Extend the original range if there is overlap, noting that if oc < c, we
      can't have od > end because a subrange is always shorter than the basic
      range. Otherwise, use a recursive call to add the additional range. */

      else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
      else if (od > end && oc <= end + 1)
        {
        end = od;       /* Extend upwards */
        /* Keep the bit map limit in step with the extended range. */
        if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
        }
      else n8 += add_to_class(classbits, uchardptr, options, cb, oc, od);
      }
    }
  else
#endif  /* SUPPORT_UNICODE */

  /* Not UTF mode: set the bit that the cb->fcc table maps each character in
  the bit map part of the range to. */

  for (c = start; c <= classbits_end; c++)
    {
    SETBIT(classbits, cb->fcc[c]);
    n8++;
    }
  }

/* Now handle the original range. Adjust the final value according to the bit
length - this means that the same lists of (e.g.) horizontal spaces can be used
in all cases. */

if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
  end = MAX_NON_UTF_CHAR;

/* Use the bitmap for characters < 256. Otherwise use extra data. */

for (c = start; c <= classbits_end; c++)
  {
  /* Regardless of start, c will always be <= 255. */
  SETBIT(classbits, c);
  n8++;
  }

#ifdef SUPPORT_WIDE_CHARS
/* Whatever was below 256 has been dealt with above, so only the part of the
range from 256 upwards remains. */

if (start <= 0xff) start = 0xff + 1;

if (end >= start)
  {
  PCRE2_UCHAR *uchardata = *uchardptr;

#ifdef SUPPORT_UNICODE
  if ((options & PCRE2_UTF) != 0)
    {
    if (start < end)
      {
      *uchardata++ = XCL_RANGE;
      uchardata += PRIV(ord2utf)(start, uchardata);
      uchardata += PRIV(ord2utf)(end, uchardata);
      }
    else if (start == end)
      {
      *uchardata++ = XCL_SINGLE;
      uchardata += PRIV(ord2utf)(start, uchardata);
      }
    }
  else
#endif  /* SUPPORT_UNICODE */

  /* Without UTF support, character values are constrained by the bit length,
  and can only be > 256 for 16-bit and 32-bit libraries. The empty block in the
  8-bit case is the statement that completes the "else" above. */

#if PCRE2_CODE_UNIT_WIDTH == 8
    {}
#else
  if (start < end)
    {
    *uchardata++ = XCL_RANGE;
    *uchardata++ = start;
    *uchardata++ = end;
    }
  else if (start == end)
    {
    *uchardata++ = XCL_SINGLE;
    *uchardata++ = start;
    }
#endif
  *uchardptr = uchardata;   /* Update extra data pointer */
  }
#else
  (void)uchardptr;    /* Avoid compiler warning */
#endif /* SUPPORT_WIDE_CHARS */

return n8;    /* Number of 8-bit characters */
}
2924
2925
2926
2927 /*************************************************
2928 * Add a list of characters to a class *
2929 *************************************************/
2930
2931 /* This function is used for adding a list of case-equivalent characters to a
2932 class, and also for adding a list of horizontal or vertical whitespace. If the
2933 list is in order (which it should be), ranges of characters are detected and
2934 handled appropriately. This function is mutually recursive with the function
2935 above.
2936
2937 Arguments:
2938 classbits the bit map for characters < 256
2939 uchardptr points to the pointer for extra data
2940 options the options word
2941 cb contains pointers to tables etc.
2942 p points to row of 32-bit values, terminated by NOTACHAR
2943 except character to omit; this is used when adding lists of
2944 case-equivalent characters to avoid including the one we
2945 already know about
2946
2947 Returns: the number of < 256 characters added
2948 the pointer to extra data is updated
2949 */
2950
2951 static unsigned int
add_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p,unsigned int except)2952 add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
2953 compile_block *cb, const uint32_t *p, unsigned int except)
2954 {
2955 unsigned int n8 = 0;
2956 while (p[0] < NOTACHAR)
2957 {
2958 unsigned int n = 0;
2959 if (p[0] != except)
2960 {
2961 while(p[n+1] == p[0] + n + 1) n++;
2962 n8 += add_to_class(classbits, uchardptr, options, cb, p[0], p[n]);
2963 }
2964 p += n + 1;
2965 }
2966 return n8;
2967 }
2968
2969
2970
2971 /*************************************************
2972 * Add characters not in a list to a class *
2973 *************************************************/
2974
2975 /* This function is used for adding the complement of a list of horizontal or
2976 vertical whitespace to a class. The list must be in order.
2977
2978 Arguments:
2979 classbits the bit map for characters < 256
2980 uchardptr points to the pointer for extra data
2981 options the options word
2982 cb contains pointers to tables etc.
2983 p points to row of 32-bit values, terminated by NOTACHAR
2984
2985 Returns: the number of < 256 characters added
2986 the pointer to extra data is updated
2987 */
2988
2989 static unsigned int
add_not_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p)2990 add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
2991 uint32_t options, compile_block *cb, const uint32_t *p)
2992 {
2993 BOOL utf = (options & PCRE2_UTF) != 0;
2994 unsigned int n8 = 0;
2995 if (p[0] > 0)
2996 n8 += add_to_class(classbits, uchardptr, options, cb, 0, p[0] - 1);
2997 while (p[0] < NOTACHAR)
2998 {
2999 while (p[1] == p[0] + 1) p++;
3000 n8 += add_to_class(classbits, uchardptr, options, cb, p[0] + 1,
3001 (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
3002 p++;
3003 }
3004 return n8;
3005 }
3006
3007
3008
3009 /*************************************************
3010 * Process (*VERB) name for escapes *
3011 *************************************************/
3012
3013 /* This function is called when the PCRE2_ALT_VERBNAMES option is set, to
3014 process the characters in a verb's name argument. It is called twice, once with
3015 codeptr == NULL, to find out the length of the processed name, and again to put
3016 the name into memory.
3017
3018 Arguments:
3019 ptrptr pointer to the input pointer
3020 codeptr pointer to the compiled code pointer
3021 errorcodeptr pointer to the error code
3022 options the options bits
3023 utf TRUE if processing UTF
3024 cb compile data block
3025
3026 Returns: length of the processed name, or < 0 on error
3027 */
3028
3029 static int
process_verb_name(PCRE2_SPTR * ptrptr,PCRE2_UCHAR ** codeptr,int * errorcodeptr,uint32_t options,BOOL utf,compile_block * cb)3030 process_verb_name(PCRE2_SPTR *ptrptr, PCRE2_UCHAR **codeptr, int *errorcodeptr,
3031 uint32_t options, BOOL utf, compile_block *cb)
3032 {
3033 int32_t arglen = 0;
3034 BOOL inescq = FALSE;
3035 PCRE2_SPTR ptr = *ptrptr;
3036 PCRE2_UCHAR *code = (codeptr == NULL)? NULL : *codeptr;
3037
3038 for (; ptr < cb->end_pattern; ptr++)
3039 {
3040 uint32_t x = *ptr;
3041
3042 /* Skip over literals */
3043
3044 if (inescq)
3045 {
3046 if (x == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3047 {
3048 inescq = FALSE;
3049 ptr++;;
3050 continue;
3051 }
3052 }
3053
3054 else /* Not a literal character */
3055 {
3056 if (x == CHAR_RIGHT_PARENTHESIS) break;
3057
3058 /* Skip over comments and whitespace in extended mode. */
3059
3060 if ((options & PCRE2_EXTENDED) != 0)
3061 {
3062 PCRE2_SPTR wscptr = ptr;
3063 while (MAX_255(x) && (cb->ctypes[x] & ctype_space) != 0) x = *(++ptr);
3064 if (x == CHAR_NUMBER_SIGN)
3065 {
3066 ptr++;
3067 while (*ptr != CHAR_NULL || ptr < cb->end_pattern)
3068 {
3069 if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */
3070 { /* IS_NEWLINE sets cb->nllen. */
3071 ptr += cb->nllen;
3072 break;
3073 }
3074 ptr++;
3075 #ifdef SUPPORT_UNICODE
3076 if (utf) FORWARDCHAR(ptr);
3077 #endif
3078 }
3079 }
3080
3081 /* If we have skipped any characters, restart the loop. */
3082
3083 if (ptr > wscptr)
3084 {
3085 ptr--;
3086 continue;
3087 }
3088 }
3089
3090 /* Process escapes */
3091
3092 if (x == '\\')
3093 {
3094 int rc;
3095 *errorcodeptr = 0;
3096 rc = PRIV(check_escape)(&ptr, cb->end_pattern, &x, errorcodeptr, options,
3097 FALSE, cb);
3098 *ptrptr = ptr; /* For possible error */
3099 if (*errorcodeptr != 0) return -1;
3100 if (rc != 0)
3101 {
3102 if (rc == ESC_Q)
3103 {
3104 inescq = TRUE;
3105 continue;
3106 }
3107 if (rc == ESC_E) continue;
3108 *errorcodeptr = ERR40;
3109 return -1;
3110 }
3111 }
3112 }
3113
3114 /* We have the next character in the name. */
3115
3116 #ifdef SUPPORT_UNICODE
3117 if (utf)
3118 {
3119 if (code == NULL) /* Just want the length */
3120 {
3121 #if PCRE2_CODE_UNIT_WIDTH == 8
3122 int i;
3123 for (i = 0; i < PRIV(utf8_table1_size); i++)
3124 if ((int)x <= PRIV(utf8_table1)[i]) break;
3125 arglen += i;
3126 #elif PCRE2_CODE_UNIT_WIDTH == 16
3127 if (x > 0xffff) arglen++;
3128 #endif
3129 }
3130 else
3131 {
3132 PCRE2_UCHAR cbuff[8];
3133 x = PRIV(ord2utf)(x, cbuff);
3134 memcpy(code, cbuff, CU2BYTES(x));
3135 code += x;
3136 }
3137 }
3138 else
3139 #endif /* SUPPORT_UNICODE */
3140
3141 /* Not UTF */
3142 {
3143 if (code != NULL) *code++ = (PCRE2_UCHAR)x;
3144 }
3145
3146 arglen++;
3147
3148 if ((unsigned int)arglen > MAX_MARK)
3149 {
3150 *errorcodeptr = ERR76;
3151 *ptrptr = ptr;
3152 return -1;
3153 }
3154 }
3155
3156 /* Update the pointers before returning. */
3157
3158 *ptrptr = ptr;
3159 if (codeptr != NULL) *codeptr = code;
3160 return arglen;
3161 }
3162
3163
3164
3165 /*************************************************
3166 * Macro for the next two functions *
3167 *************************************************/
3168
3169 /* Both scan_for_captures() and compile_branch() use this macro to generate a
3170 fragment of code that reads the characters of a name and sets its length
3171 (checking for not being too long). Count the characters dynamically, to avoid
3172 the possibility of integer overflow. The same macro is used for reading *VERB
3173 names. */
3174
/* READ_NAME(ctype, errcode, errset)

  ctype     character type bit tested against cb->ctypes[] for each character
  errcode   value to store when the name is longer than MAX_NAME_SIZE
  errset    lvalue that receives errcode

The expansion uses the variables ptr, namelen and cb, and the label FAILED, all
of which must exist in the invoking function. On normal completion ptr is left
at the first character that does not have the given type and namelen holds the
number of characters passed over. The expansion is a sequence of statements,
not a single one, so it must not form the unbraced body of an if or a loop.

The second parameter is deliberately not called "errno", because that
identifier is reserved by <errno.h>. */

#define READ_NAME(ctype, errcode, errset) \
  namelen = 0; \
  while (MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype) != 0) \
    { \
    ptr++; \
    namelen++; \
    if (namelen > MAX_NAME_SIZE) \
      { \
      errset = errcode; \
      goto FAILED; \
      } \
    }
3187
3188
3189
3190 /*************************************************
3191 * Scan regex to identify named groups *
3192 *************************************************/
3193
3194 /* This function is called first of all, to scan for named capturing groups so
3195 that information about them is fully available to both the compiling scans.
3196 It skips over everything except parenthesized items.
3197
3198 Arguments:
3199 ptrptr points to pointer to the start of the pattern
3200 options compiling dynamic options
3201 cb pointer to the compile data block
3202
3203 Returns: zero on success or a non-zero error code, with pointer updated
3204 */
3205
/* One entry in the stack that scan_for_captures() keeps in the compile
workspace. An entry is pushed for each (?| group and for each (?<options>
item. */

typedef struct nest_save {
  uint16_t  nest_depth;    /* Parenthesis nesting depth the entry belongs to */
  uint16_t  reset_group;   /* Capture count to go back to at each | in (?| */
  uint16_t  max_group;     /* Highest capture count seen so far in a (?| group */
  uint16_t  flags;         /* NSF_xxx bits, see below */
} nest_save;

/* Bits for the flags field of nest_save */

#define NSF_RESET    0x0001u   /* Entry is for a (?| group */
#define NSF_EXTENDED 0x0002u   /* PCRE2_EXTENDED was set when the entry was made */
#define NSF_DUPNAMES 0x0004u   /* PCRE2_DUPNAMES was set when the entry was made */
3216
/* Pre-scan of the whole pattern. Capturing groups are counted in cb->bracount
(copied to cb->final_bracount on success) and each named group is recorded in
cb->named_groups. The stack of nest_save blocks used for (?| groups and option
settings is kept in the compile workspace, starting at cb->start_workspace. On
failure, *ptrptr is set to the place in the pattern that was reached. */

static int scan_for_captures(PCRE2_SPTR *ptrptr, uint32_t options,
  compile_block *cb)
{
uint32_t c;
uint32_t delimiter;
uint32_t set, unset, *optset;
uint32_t skiptoket = 0;       /* Non-zero while skipping to the next ')' */
uint16_t nest_depth = 0;      /* Current parenthesis nesting depth */
int errorcode = 0;
int escape;
int namelen;
int i;
BOOL inescq = FALSE;          /* TRUE while inside \Q...\E */
BOOL isdupname;
BOOL utf = (options & PCRE2_UTF) != 0;
BOOL negate_class;
PCRE2_SPTR name;
PCRE2_SPTR start;
PCRE2_SPTR ptr = *ptrptr;
named_group *ng;
nest_save *top_nest = NULL;   /* NULL means that the nest stack is empty */
nest_save *end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);

/* The size of the nest_save structure might not be a factor of the size of the
workspace. Therefore we must round down end_nests so as to correctly avoid
creating a nest_save that spans the end of the workspace. */

end_nests = (nest_save *)((char *)end_nests -
  ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));

/* Now scan the pattern */

for (; ptr < cb->end_pattern; ptr++)
  {
  c = *ptr;

  /* Parenthesized groups set skiptoket when all following characters up to the
  next closing parenthesis must be ignored. The parenthesis itself must be
  processed (to end the nested parenthesized item). */

  if (skiptoket != 0)
    {
    if (c != CHAR_RIGHT_PARENTHESIS) continue;
    skiptoket = 0;
    }

  /* Skip over literals */

  if (inescq)
    {
    if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
      {
      inescq = FALSE;
      ptr++;
      }
    continue;
    }

  /* Skip over # comments and whitespace in extended mode. */

  if ((options & PCRE2_EXTENDED) != 0)
    {
    PCRE2_SPTR wscptr = ptr;
    while (MAX_255(c) && (cb->ctypes[c] & ctype_space) != 0) c = *(++ptr);
    if (c == CHAR_NUMBER_SIGN)
      {
      ptr++;
      while (ptr < cb->end_pattern)
        {
        if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
          {                          /* IS_NEWLINE sets cb->nllen. */
          ptr += cb->nllen;
          break;
          }
        ptr++;
#ifdef SUPPORT_UNICODE
        if (utf) FORWARDCHAR(ptr);
#endif
        }
      }

    /* If we skipped any characters, restart the loop. Otherwise, we didn't see
    a comment. The decrement compensates for the increment in the for
    statement. */

    if (ptr > wscptr)
      {
      ptr--;
      continue;
      }
    }

  /* Process the next pattern item. */

  switch(c)
    {
    default:              /* Most characters are just skipped */
    break;

    /* Skip escapes except for \Q */

    case CHAR_BACKSLASH:
    errorcode = 0;
    escape = PRIV(check_escape)(&ptr, cb->end_pattern, &c, &errorcode, options,
      FALSE, cb);
    if (errorcode != 0) goto FAILED;
    if (escape == ESC_Q) inescq = TRUE;
    break;

    /* Skip a character class. The syntax is complicated so we have to
    replicate some of what happens when a class is processed for real. The
    first test recognizes the two special six-character sequences that can
    follow an opening bracket and skips them as a unit. */

    case CHAR_LEFT_SQUARE_BRACKET:
    if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0 ||
        PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
      {
      ptr += 6;
      break;
      }

    /* If the first character is '^', set the negation flag (not actually used
    here, except to recognize only one ^) and skip it. If the first few
    characters (either before or after ^) are \Q\E or \E we skip them too. This
    makes for compatibility with Perl. */

    negate_class = FALSE;
    for (;;)
      {
      c = *(++ptr);   /* First character in class */
      if (c == CHAR_BACKSLASH)
        {
        if (ptr[1] == CHAR_E)
          ptr++;
        else if (PRIV(strncmp_c8)(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
          ptr += 3;
        else
          break;
        }
      else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
        negate_class = TRUE;
      else break;
      }

    /* A ']' here ends the class only when empty classes are allowed;
    otherwise it is treated as the first character of the class. */

    if (c == CHAR_RIGHT_SQUARE_BRACKET &&
        (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
      break;

    /* Loop for the contents of the class */

    for (;;)
      {
      PCRE2_SPTR tempptr;

      if (c == CHAR_NULL && ptr >= cb->end_pattern)
        {
        errorcode = ERR6;  /* Missing terminating ']' */
        goto FAILED;
        }

#ifdef SUPPORT_UNICODE
      if (utf && HAS_EXTRALEN(c))
        {                           /* Braces are required because the */
        GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
        }
#endif

      /* Inside \Q...\E everything is literal except \E */

      if (inescq)
        {
        if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
          {
          inescq = FALSE;                   /* Reset literal state */
          ptr++;                            /* Skip the 'E' */
          }
        goto CONTINUE_CLASS;
        }

      /* Skip POSIX class names. */
      if (c == CHAR_LEFT_SQUARE_BRACKET &&
          (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
           ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
        {
        ptr = tempptr + 1;
        }
      else if (c == CHAR_BACKSLASH)
        {
        errorcode = 0;
        escape = PRIV(check_escape)(&ptr, cb->end_pattern, &c, &errorcode,
          options, TRUE, cb);
        if (errorcode != 0) goto FAILED;
        if (escape == ESC_Q) inescq = TRUE;
        }

      CONTINUE_CLASS:
      c = *(++ptr);
      if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
      }     /* End of class-processing loop */
    break;

    /* This is the real work of this function - handling parentheses. */

    case CHAR_LEFT_PARENTHESIS:
    nest_depth++;

    if (ptr[1] != CHAR_QUESTION_MARK)
      {
      /* A plain group counts as a capture unless auto-capture is disabled. */

      if (ptr[1] != CHAR_ASTERISK)
        {
        if ((options & PCRE2_NO_AUTO_CAPTURE) == 0) cb->bracount++;
        }

      /* (*something) - skip over a name, and then just skip to closing ket
      unless PCRE2_ALT_VERBNAMES is set, in which case we have to process
      escapes in the string after a verb name terminated by a colon. */

      else
        {
        ptr += 2;
        while (MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0) ptr++;
        if (*ptr == CHAR_COLON && (options & PCRE2_ALT_VERBNAMES) != 0)
          {
          ptr++;
          if (process_verb_name(&ptr, NULL, &errorcode, options, utf, cb) < 0)
            goto FAILED;
          }
        else
          {
          while (ptr < cb->end_pattern && *ptr != CHAR_RIGHT_PARENTHESIS)
            ptr++;
          }
        nest_depth--;   /* Undo the increment above: this is not a group */
        }
      }

    /* Handle (?...) groups */

    else switch(ptr[2])
      {
      default:
      ptr += 2;
      if (ptr[0] == CHAR_R ||                           /* (?R) */
          ptr[0] == CHAR_NUMBER_SIGN ||                 /* (?#) */
          IS_DIGIT(ptr[0]) ||                           /* (?n) */
          (ptr[0] == CHAR_MINUS && IS_DIGIT(ptr[1])))   /* (?-n) */
        {
        skiptoket = ptr[0];
        break;
        }

      /* Handle (?| and (?imsxJU: which are the only other valid forms. Both
      need a new block on the nest stack. */

      if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
      else if (++top_nest >= end_nests)
        {
        errorcode = ERR84;
        goto FAILED;
        }
      top_nest->nest_depth = nest_depth;
      top_nest->flags = 0;
      if ((options & PCRE2_EXTENDED) != 0) top_nest->flags |= NSF_EXTENDED;
      if ((options & PCRE2_DUPNAMES) != 0) top_nest->flags |= NSF_DUPNAMES;

      if (*ptr == CHAR_VERTICAL_LINE)
        {
        top_nest->reset_group = (uint16_t)cb->bracount;
        top_nest->max_group = (uint16_t)cb->bracount;
        top_nest->flags |= NSF_RESET;
        cb->external_flags |= PCRE2_DUPCAPUSED;
        break;
        }

      /* Scan options */

      top_nest->reset_group = 0;
      top_nest->max_group = 0;

      set = unset = 0;
      optset = &set;

      /* Need only track (?x: and (?J: at this stage */

      while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
        {
        switch (*ptr++)
          {
          case CHAR_MINUS: optset = &unset; break;

          case CHAR_x: *optset |= PCRE2_EXTENDED; break;

          case CHAR_J:
          *optset |= PCRE2_DUPNAMES;
          cb->external_flags |= PCRE2_JCHANGED;
          break;

          case CHAR_i:
          case CHAR_m:
          case CHAR_s:
          case CHAR_U:
          break;

          default:
          errorcode = ERR11;
          ptr--;    /* Correct the offset */
          goto FAILED;
          }
        }

      options = (options | set) & (~unset);

      /* If the options ended with ')' this is not the start of a nested
      group with option changes, so the options change at this level. If the
      previous level set up a nest block, discard the one we have just created.
      Otherwise adjust it for the previous level. */

      if (*ptr == CHAR_RIGHT_PARENTHESIS)
        {
        nest_depth--;
        if (top_nest > (nest_save *)(cb->start_workspace) &&
            (top_nest-1)->nest_depth == nest_depth) top_nest --;
        else top_nest->nest_depth = nest_depth;
        }
      break;

      /* Skip over a numerical or string argument for a callout. */

      case CHAR_C:
      ptr += 2;
      if (ptr[1] == CHAR_RIGHT_PARENTHESIS) break;
      if (IS_DIGIT(ptr[1]))
        {
        while (IS_DIGIT(ptr[1])) ptr++;
        }

      /* Handle a string argument. The opening delimiter selects the closing
      one from the parallel table of end delimiters. */

      else
        {
        ptr++;
        delimiter = 0;
        for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
          {
          if (*ptr == PRIV(callout_start_delims)[i])
            {
            delimiter = PRIV(callout_end_delims)[i];
            break;
            }
          }

        if (delimiter == 0)
          {
          errorcode = ERR82;
          goto FAILED;
          }

        /* Find the closing delimiter; a doubled delimiter is skipped over and
        does not end the string. */

        start = ptr;
        do
          {
          if (++ptr >= cb->end_pattern)
            {
            errorcode = ERR81;
            ptr = start;   /* To give a more useful message */
            goto FAILED;
            }
          if (ptr[0] == delimiter && ptr[1] == delimiter) ptr += 2;
          }
        while (ptr[0] != delimiter);
        }

      /* Check terminating ) */

      if (ptr[1] != CHAR_RIGHT_PARENTHESIS)
        {
        errorcode = ERR39;
        ptr++;
        goto FAILED;
        }
      break;

      /* Conditional group */

      case CHAR_LEFT_PARENTHESIS:
      if (ptr[3] != CHAR_QUESTION_MARK)   /* Not assertion or callout */
        {
        nest_depth++;
        ptr += 2;
        break;
        }

      /* Must be an assertion or a callout */

      switch(ptr[4])
        {
        case CHAR_LESS_THAN_SIGN:
        if (ptr[5] != CHAR_EXCLAMATION_MARK && ptr[5] != CHAR_EQUALS_SIGN)
          goto MISSING_ASSERTION;
        /* Fall through */

        case CHAR_C:
        case CHAR_EXCLAMATION_MARK:
        case CHAR_EQUALS_SIGN:
        ptr++;
        break;

        default:
        MISSING_ASSERTION:
        ptr += 3;            /* To improve error message */
        errorcode = ERR28;
        goto FAILED;
        }
      break;

      /* Groups that neither capture nor need a nest block */

      case CHAR_COLON:
      case CHAR_GREATER_THAN_SIGN:
      case CHAR_EQUALS_SIGN:
      case CHAR_EXCLAMATION_MARK:
      case CHAR_AMPERSAND:
      case CHAR_PLUS:
      ptr += 2;
      break;

      /* (?P<name> defines a named group; any other (?P form is skipped. The
      extra increment of ptr makes up for the additional 'P' character, so that
      the common code at DEFINE_NAME finds the name in the same place. */

      case CHAR_P:
      if (ptr[3] != CHAR_LESS_THAN_SIGN)
        {
        ptr += 3;
        break;
        }
      ptr++;
      c = CHAR_GREATER_THAN_SIGN;   /* Terminator */
      goto DEFINE_NAME;

      /* (?<= and (?<! are skipped; otherwise this is (?<name> */

      case CHAR_LESS_THAN_SIGN:
      if (ptr[3] == CHAR_EQUALS_SIGN || ptr[3] == CHAR_EXCLAMATION_MARK)
        {
        ptr += 3;
        break;
        }
      c = CHAR_GREATER_THAN_SIGN;   /* Terminator */
      goto DEFINE_NAME;

      case CHAR_APOSTROPHE:
      c = CHAR_APOSTROPHE;    /* Terminator */

      /* Common code for all the named group forms; c holds the character that
      must terminate the name. */

      DEFINE_NAME:
      name = ptr = ptr + 3;

      if (*ptr == c)          /* Empty name */
        {
        errorcode = ERR62;
        goto FAILED;
        }

      if (IS_DIGIT(*ptr))
        {
        errorcode = ERR44;   /* Group name must start with non-digit */
        goto FAILED;
        }

      if (MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) == 0)
        {
        errorcode = ERR24;
        goto FAILED;
        }

      /* Advance ptr, set namelen and check its length. */
      READ_NAME(ctype_word, ERR48, errorcode);

      if (*ptr != c)
        {
        errorcode = ERR42;
        goto FAILED;
        }

      if (cb->names_found >= MAX_NAME_COUNT)
        {
        errorcode = ERR49;
        goto FAILED;
        }

      /* Keep track of the largest entry size that any name requires. */

      if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
        cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);

      /* We have a valid name for this capturing group. */

      cb->bracount++;

      /* Scan the list to check for duplicates. For duplicate names, if the
      number is the same, break the loop, which causes the name to be
      discarded; otherwise, if DUPNAMES is not set, give an error.
      If it is set, allow the name with a different number, but continue
      scanning in case this is a duplicate with the same number. For
      non-duplicate names, give an error if the number is duplicated. */

      isdupname = FALSE;
      ng = cb->named_groups;
      for (i = 0; i < cb->names_found; i++, ng++)
        {
        if (namelen == ng->length &&
            PRIV(strncmp)(name, ng->name, (size_t)namelen) == 0)
          {
          if (ng->number == cb->bracount) break;
          if ((options & PCRE2_DUPNAMES) == 0)
            {
            errorcode = ERR43;
            goto FAILED;
            }
          isdupname = ng->isdup = TRUE;     /* Mark as a duplicate */
          cb->dupnames = TRUE;              /* Duplicate names exist */
          }
        else if (ng->number == cb->bracount)
          {
          errorcode = ERR65;
          goto FAILED;
          }
        }

      if (i < cb->names_found) break;   /* Ignore duplicate with same number */

      /* Increase the list size if necessary. The list is doubled in size; the
      old list is freed only if its size is greater than NAMED_GROUP_LIST_SIZE. */

      if (cb->names_found >= cb->named_group_list_size)
        {
        uint32_t newsize = cb->named_group_list_size * 2;
        named_group *newspace =
          cb->cx->memctl.malloc(newsize * sizeof(named_group),
          cb->cx->memctl.memory_data);
        if (newspace == NULL)
          {
          errorcode = ERR21;
          goto FAILED;
          }

        memcpy(newspace, cb->named_groups,
          cb->named_group_list_size * sizeof(named_group));
        if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
          cb->cx->memctl.free((void *)cb->named_groups,
          cb->cx->memctl.memory_data);
        cb->named_groups = newspace;
        cb->named_group_list_size = newsize;
        }

      /* Add this name to the list */

      cb->named_groups[cb->names_found].name = name;
      cb->named_groups[cb->names_found].length = (uint16_t)namelen;
      cb->named_groups[cb->names_found].number = cb->bracount;
      cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
      cb->names_found++;
      break;
      }        /* End of (? switch */
    break;     /* End of ( handling */

    /* At an alternation, reset the capture count if we are in a (?| group. */

    case CHAR_VERTICAL_LINE:
    if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
        (top_nest->flags & NSF_RESET) != 0)
      {
      if (cb->bracount > top_nest->max_group)
        top_nest->max_group = (uint16_t)cb->bracount;
      cb->bracount = top_nest->reset_group;
      }
    break;

    /* At a right parenthesis, reset the capture count to the maximum if we
    are in a (?| group and/or reset the extended and dupnames options to the
    values saved in the nest block, which is then removed from the stack. */

    case CHAR_RIGHT_PARENTHESIS:
    if (top_nest != NULL && top_nest->nest_depth == nest_depth)
      {
      if ((top_nest->flags & NSF_RESET) != 0 &&
          top_nest->max_group > cb->bracount)
        cb->bracount = top_nest->max_group;
      if ((top_nest->flags & NSF_EXTENDED) != 0) options |= PCRE2_EXTENDED;
        else options &= ~PCRE2_EXTENDED;
      if ((top_nest->flags & NSF_DUPNAMES) != 0) options |= PCRE2_DUPNAMES;
        else options &= ~PCRE2_DUPNAMES;
      if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
        else top_nest--;
      }
    if (nest_depth == 0)    /* Unmatched closing parenthesis */
      {
      errorcode = ERR22;
      goto FAILED;
      }
    nest_depth--;
    break;
    }
  }

/* The whole pattern has been scanned. All is well if every group was closed. */

if (nest_depth == 0)
  {
  cb->final_bracount = cb->bracount;
  return 0;
  }

/* We give a special error for a missing closing parenthesis after (?# because
it might otherwise be hard to see where the missing character is. */

errorcode = (skiptoket == CHAR_NUMBER_SIGN)? ERR18 : ERR14;

FAILED:
*ptrptr = ptr;
return errorcode;
}
3822
3823
3824
3825 /*************************************************
3826 * Compile one branch *
3827 *************************************************/
3828
/* Scan the pattern, compiling it into a vector. If the options are
3830 changed during the branch, the pointer is used to change the external options
3831 bits. This function is used during the pre-compile phase when we are trying
3832 to find out the amount of memory needed, as well as during the real compile
3833 phase. The value of lengthptr distinguishes the two phases.
3834
3835 Arguments:
3836 optionsptr pointer to the option bits
3837 codeptr points to the pointer to the current code point
3838 ptrptr points to the current pattern pointer
3839 errorcodeptr points to error code variable
3840 firstcuptr place to put the first required code unit
3841 firstcuflagsptr place to put the first code unit flags, or a negative number
3842 reqcuptr place to put the last required code unit
3843 reqcuflagsptr place to put the last required code unit flags, or a negative number
3844 bcptr points to current branch chain
3845 cond_depth conditional nesting depth
3846 cb contains pointers to tables etc.
3847 lengthptr NULL during the real compile phase
3848 points to length accumulator during pre-compile phase
3849
3850 Returns: TRUE on success
3851 FALSE, with *errorcodeptr set non-zero on error
3852 */
3853
3854 static BOOL
compile_branch(uint32_t * optionsptr,PCRE2_UCHAR ** codeptr,PCRE2_SPTR * ptrptr,int * errorcodeptr,uint32_t * firstcuptr,int32_t * firstcuflagsptr,uint32_t * reqcuptr,int32_t * reqcuflagsptr,branch_chain * bcptr,int cond_depth,compile_block * cb,size_t * lengthptr)3855 compile_branch(uint32_t *optionsptr, PCRE2_UCHAR **codeptr,
3856 PCRE2_SPTR *ptrptr, int *errorcodeptr,
3857 uint32_t *firstcuptr, int32_t *firstcuflagsptr,
3858 uint32_t *reqcuptr, int32_t *reqcuflagsptr,
3859 branch_chain *bcptr, int cond_depth,
3860 compile_block *cb, size_t *lengthptr)
3861 {
3862 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
3863 int bravalue = 0;
3864 uint32_t greedy_default, greedy_non_default;
3865 uint32_t repeat_type, op_type;
3866 uint32_t options = *optionsptr; /* May change dynamically */
3867 uint32_t firstcu, reqcu;
3868 int32_t firstcuflags, reqcuflags;
3869 uint32_t zeroreqcu, zerofirstcu;
3870 int32_t zeroreqcuflags, zerofirstcuflags;
3871 int32_t req_caseopt, reqvary, tempreqvary;
3872 int after_manual_callout = 0;
3873 int escape;
3874 size_t length_prevgroup = 0;
3875 register uint32_t c;
3876 register PCRE2_UCHAR *code = *codeptr;
3877 PCRE2_UCHAR *last_code = code;
3878 PCRE2_UCHAR *orig_code = code;
3879 PCRE2_UCHAR *tempcode;
3880 BOOL inescq = FALSE;
3881 BOOL groupsetfirstcu = FALSE;
3882 PCRE2_SPTR ptr = *ptrptr;
3883 PCRE2_SPTR tempptr;
3884 PCRE2_UCHAR *previous = NULL;
3885 PCRE2_UCHAR *previous_callout = NULL;
3886 uint8_t classbits[32];
3887
3888 /* We can fish out the UTF setting once and for all into a BOOL, but we must
3889 not do this for other options (e.g. PCRE2_EXTENDED) because they may change
3890 dynamically as we process the pattern. */
3891
3892 #ifdef SUPPORT_UNICODE
3893 BOOL utf = (options & PCRE2_UTF) != 0;
3894 #if PCRE2_CODE_UNIT_WIDTH != 32
3895 PCRE2_UCHAR utf_units[6]; /* For setting up multi-cu chars */
3896 #endif
3897
3898 #else /* No UTF support */
3899 BOOL utf = FALSE;
3900 #endif
3901
3902 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
3903 class_uchardata always so that it can be passed to add_to_class() always,
3904 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
3905 alternative calls for the different cases. */
3906
3907 PCRE2_UCHAR *class_uchardata;
3908 #ifdef SUPPORT_WIDE_CHARS
3909 BOOL xclass;
3910 PCRE2_UCHAR *class_uchardata_base;
3911 #endif
3912
3913 /* Set up the default and non-default settings for greediness */
3914
3915 greedy_default = ((options & PCRE2_UNGREEDY) != 0);
3916 greedy_non_default = greedy_default ^ 1;
3917
3918 /* Initialize no first unit, no required unit. REQ_UNSET means "no char
3919 matching encountered yet". It gets changed to REQ_NONE if we hit something that
3920 matches a non-fixed first unit; reqcu just remains unset if we never find one.
3921
3922 When we hit a repeat whose minimum is zero, we may have to adjust these values
3923 to take the zero repeat into account. This is implemented by setting them to
3924 zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
3925 item types that can be repeated set these backoff variables appropriately. */
3926
3927 firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
3928 firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
3929
3930 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
3931 according to the current setting of the caseless flag. The REQ_CASELESS value
3932 leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables
3933 to record the case status of the value. This is used only for ASCII characters.
3934 */
3935
3936 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS:0;
3937
3938 /* Switch on next character until the end of the branch */
3939
3940 for (;; ptr++)
3941 {
3942 BOOL negate_class;
3943 BOOL should_flip_negation;
3944 BOOL match_all_or_no_wide_chars;
3945 BOOL possessive_quantifier;
3946 BOOL is_quantifier;
3947 BOOL is_recurse;
3948 BOOL is_dupname;
3949 BOOL reset_bracount;
3950 int class_has_8bitchar;
3951 int class_one_char;
3952 #ifdef SUPPORT_WIDE_CHARS
3953 BOOL xclass_has_prop;
3954 #endif
3955 int recno; /* Must be signed */
3956 int refsign; /* Must be signed */
3957 int terminator; /* Must be signed */
3958 unsigned int mclength;
3959 unsigned int tempbracount;
3960 uint32_t ec;
3961 uint32_t newoptions;
3962 uint32_t skipunits;
3963 uint32_t subreqcu, subfirstcu;
3964 int32_t subreqcuflags, subfirstcuflags; /* Must be signed */
3965 PCRE2_UCHAR mcbuffer[8];
3966
3967 /* Come here to restart the loop. */
3968
3969 REDO_LOOP:
3970
3971 /* Get next character in the pattern */
3972
3973 c = *ptr;
3974
3975 /* If we are at the end of a nested substitution, revert to the outer level
3976 string. Nesting only happens one or two levels deep, and the inserted string
3977 is always zero terminated. */
3978
3979 if (c == CHAR_NULL && cb->nestptr[0] != NULL)
3980 {
3981 ptr = cb->nestptr[0];
3982 cb->nestptr[0] = cb->nestptr[1];
3983 cb->nestptr[1] = NULL;
3984 c = *ptr;
3985 }
3986
3987 /* If we are in the pre-compile phase, accumulate the length used for the
3988 previous cycle of this loop. */
3989
3990 if (lengthptr != NULL)
3991 {
3992 if (code > cb->start_workspace + cb->workspace_size -
3993 WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
3994 {
3995 *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)?
3996 ERR52 : ERR86;
3997 goto FAILED;
3998 }
3999
4000 /* There is at least one situation where code goes backwards: this is the
4001 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
4002 the class is simply eliminated. However, it is created first, so we have to
4003 allow memory for it. Therefore, don't ever reduce the length at this point.
4004 */
4005
4006 if (code < last_code) code = last_code;
4007
4008 /* Paranoid check for integer overflow */
4009
4010 if (OFLOW_MAX - *lengthptr < (size_t)(code - last_code))
4011 {
4012 *errorcodeptr = ERR20;
4013 goto FAILED;
4014 }
4015 *lengthptr += (size_t)(code - last_code);
4016
4017 /* If "previous" is set and it is not at the start of the work space, move
4018 it back to there, in order to avoid filling up the work space. Otherwise,
4019 if "previous" is NULL, reset the current code pointer to the start. */
4020
4021 if (previous != NULL)
4022 {
4023 if (previous > orig_code)
4024 {
4025 memmove(orig_code, previous, (size_t)CU2BYTES(code - previous));
4026 code -= previous - orig_code;
4027 previous = orig_code;
4028 }
4029 }
4030 else code = orig_code;
4031
4032 /* Remember where this code item starts so we can pick up the length
4033 next time round. */
4034
4035 last_code = code;
4036 }
4037
4038 /* Before doing anything else we must handle all the special items that do
4039 nothing, and which may come between an item and its quantifier. Otherwise,
4040 when auto-callouts are enabled, a callout gets incorrectly inserted before
4041 the quantifier is recognized. After recognizing a "do nothing" item, restart
4042 the loop in case another one follows. */
4043
4044 /* If c is not NULL we are not at the end of the pattern. If it is NULL, we
4045 may still be in the pattern with a NULL data item. In these cases, if we are
4046 in \Q...\E, check for the \E that ends the literal string; if not, we have a
4047 literal character. If not in \Q...\E, an isolated \E is ignored. */
4048
4049 if (c != CHAR_NULL || ptr < cb->end_pattern)
4050 {
4051 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4052 {
4053 inescq = FALSE;
4054 ptr++;
4055 continue;
4056 }
4057 else if (inescq) /* Literal character */
4058 {
4059 if (previous_callout != NULL)
4060 {
4061 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
4062 complete_callout(previous_callout, ptr, cb);
4063 previous_callout = NULL;
4064 }
4065 if ((options & PCRE2_AUTO_CALLOUT) != 0)
4066 {
4067 previous_callout = code;
4068 code = auto_callout(code, ptr, cb);
4069 }
4070 goto NORMAL_CHAR;
4071 }
4072
4073 /* Check for the start of a \Q...\E sequence. We must do this here rather
4074 than later in case it is immediately followed by \E, which turns it into a
4075 "do nothing" sequence. */
4076
4077 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
4078 {
4079 inescq = TRUE;
4080 ptr++;
4081 continue;
4082 }
4083 }
4084
4085 /* In extended mode, skip white space and #-comments that end at newline. */
4086
4087 if ((options & PCRE2_EXTENDED) != 0)
4088 {
4089 PCRE2_SPTR wscptr = ptr;
4090 while (MAX_255(c) && (cb->ctypes[c] & ctype_space) != 0) c = *(++ptr);
4091 if (c == CHAR_NUMBER_SIGN)
4092 {
4093 ptr++;
4094 while (ptr < cb->end_pattern)
4095 {
4096 if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */
4097 { /* IS_NEWLINE sets cb->nllen. */
4098 ptr += cb->nllen;
4099 break;
4100 }
4101 ptr++;
4102 #ifdef SUPPORT_UNICODE
4103 if (utf) FORWARDCHAR(ptr);
4104 #endif
4105 }
4106 }
4107
4108 /* If we skipped any characters, restart the loop. Otherwise, we didn't see
4109 a comment. */
4110
4111 if (ptr > wscptr) goto REDO_LOOP;
4112 }
4113
4114 /* Skip over (?# comments. */
4115
4116 if (c == CHAR_LEFT_PARENTHESIS && ptr[1] == CHAR_QUESTION_MARK &&
4117 ptr[2] == CHAR_NUMBER_SIGN)
4118 {
4119 ptr += 3;
4120 while (ptr < cb->end_pattern && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4121 if (*ptr != CHAR_RIGHT_PARENTHESIS)
4122 {
4123 *errorcodeptr = ERR18;
4124 goto FAILED;
4125 }
4126 continue;
4127 }
4128
4129 /* End of processing "do nothing" items. See if the next thing is a
4130 quantifier. */
4131
4132 is_quantifier =
4133 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4134 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4135
4136 /* Fill in length of a previous callout and create an auto callout if
4137 required, except when the next thing is a quantifier or when processing a
4138 property substitution string for \w etc in UCP mode. */
4139
4140 if (!is_quantifier && cb->nestptr[0] == NULL)
4141 {
4142 if (previous_callout != NULL && after_manual_callout-- <= 0)
4143 {
4144 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
4145 complete_callout(previous_callout, ptr, cb);
4146 previous_callout = NULL;
4147 }
4148
4149 if ((options & PCRE2_AUTO_CALLOUT) != 0)
4150 {
4151 previous_callout = code;
4152 code = auto_callout(code, ptr, cb);
4153 }
4154 }
4155
4156 /* Process the next pattern item. */
4157
4158 switch(c)
4159 {
4160 /* ===================================================================*/
4161 /* The branch terminates at string end or | or ) */
4162
4163 case CHAR_NULL:
4164 if (ptr < cb->end_pattern) goto NORMAL_CHAR; /* Zero data character */
4165 /* Fall through */
4166
4167 case CHAR_VERTICAL_LINE:
4168 case CHAR_RIGHT_PARENTHESIS:
4169 *firstcuptr = firstcu;
4170 *firstcuflagsptr = firstcuflags;
4171 *reqcuptr = reqcu;
4172 *reqcuflagsptr = reqcuflags;
4173 *codeptr = code;
4174 *ptrptr = ptr;
4175 if (lengthptr != NULL)
4176 {
4177 if (OFLOW_MAX - *lengthptr < (size_t)(code - last_code))
4178 {
4179 *errorcodeptr = ERR20;
4180 goto FAILED;
4181 }
4182 *lengthptr += (size_t)(code - last_code); /* To include callout length */
4183 }
4184 return TRUE;
4185
4186
4187 /* ===================================================================*/
4188 /* Handle single-character metacharacters. In multiline mode, ^ disables
4189 the setting of any following char as a first character. */
4190
4191 case CHAR_CIRCUMFLEX_ACCENT:
4192 previous = NULL;
4193 if ((options & PCRE2_MULTILINE) != 0)
4194 {
4195 if (firstcuflags == REQ_UNSET)
4196 zerofirstcuflags = firstcuflags = REQ_NONE;
4197 *code++ = OP_CIRCM;
4198 }
4199 else *code++ = OP_CIRC;
4200 break;
4201
4202 case CHAR_DOLLAR_SIGN:
4203 previous = NULL;
4204 *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
4205 break;
4206
4207 /* There can never be a first char if '.' is first, whatever happens about
4208 repeats. The value of reqcu doesn't change either. */
4209
4210 case CHAR_DOT:
4211 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
4212 zerofirstcu = firstcu;
4213 zerofirstcuflags = firstcuflags;
4214 zeroreqcu = reqcu;
4215 zeroreqcuflags = reqcuflags;
4216 previous = code;
4217 *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4218 break;
4219
4220
4221 /* ===================================================================*/
4222 /* Character classes. If the included characters are all < 256, we build a
4223 32-byte bitmap of the permitted characters, except in the special case
4224 where there is only one such character. For negated classes, we build the
4225 map as usual, then invert it at the end. However, we use a different opcode
4226 so that data characters > 255 can be handled correctly.
4227
4228 If the class contains characters outside the 0-255 range, a different
4229 opcode is compiled. It may optionally have a bit map for characters < 256,
4230 but those above are explicitly listed afterwards. A flag byte tells
4231 whether the bitmap is present, and whether this is a negated class or not.
4232
4233 An isolated ']' character is not treated specially, so is just another data
4234 character. In earlier versions of PCRE that used the original API there was
4235 a "JavaScript compatibility mode" in which it gave an error. However,
4236 JavaScript itself has changed in this respect so there is no longer any
4237 need for this special handling.
4238
4239 In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
4240 used for "start of word" and "end of word". As these are otherwise illegal
4241 sequences, we don't break anything by recognizing them. They are replaced
4242 by \b(?=\w) and \b(?<=\w) respectively. This can only happen at the top
4243 nesting level, as no other inserted sequences will contain these oddities.
4244 Sequences like [a[:<:]] are erroneous and are handled by the normal code
4245 below. */
4246
4247 case CHAR_LEFT_SQUARE_BRACKET:
4248 if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
4249 {
4250 cb->nestptr[0] = ptr + 7;
4251 ptr = sub_start_of_word;
4252 goto REDO_LOOP;
4253 }
4254
4255 if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
4256 {
4257 cb->nestptr[0] = ptr + 7;
4258 ptr = sub_end_of_word;
4259 goto REDO_LOOP;
4260 }
4261
4262 /* Handle a real character class. */
4263
4264 previous = code;
4265
4266 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
4267 they are encountered at the top level, so we'll do that too. */
4268
4269 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4270 ptr[1] == CHAR_EQUALS_SIGN) &&
4271 check_posix_syntax(ptr, &tempptr))
4272 {
4273 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR12 : ERR13;
4274 goto FAILED;
4275 }
4276
4277 /* If the first character is '^', set the negation flag and skip it. Also,
4278 if the first few characters (either before or after ^) are \Q\E or \E we
4279 skip them too. This makes for compatibility with Perl. */
4280
4281 negate_class = FALSE;
4282 for (;;)
4283 {
4284 c = *(++ptr);
4285 if (c == CHAR_BACKSLASH)
4286 {
4287 if (ptr[1] == CHAR_E)
4288 ptr++;
4289 else if (PRIV(strncmp_c8)(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4290 ptr += 3;
4291 else
4292 break;
4293 }
4294 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4295 negate_class = TRUE;
4296 else break;
4297 }
4298
4299 /* Empty classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set. Otherwise,
4300 an initial ']' is taken as a data character -- the code below handles
4301 that. When empty classes are allowed, [] must always fail, so generate
4302 OP_FAIL, whereas [^] must match any character, so generate OP_ALLANY. */
4303
4304 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4305 (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
4306 {
4307 *code++ = negate_class? OP_ALLANY : OP_FAIL;
4308 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
4309 zerofirstcu = firstcu;
4310 zerofirstcuflags = firstcuflags;
4311 break;
4312 }
4313
4314 /* If a non-extended class contains a negative special such as \S, we need
4315 to flip the negation flag at the end, so that support for characters > 255
4316 works correctly (they are all included in the class). An extended class may
4317 need to insert specific matching or non-matching code for wide characters.
4318 */
4319
4320 should_flip_negation = match_all_or_no_wide_chars = FALSE;
4321
4322 /* Extended class (xclass) will be used when characters > 255
4323 might match. */
4324
4325 #ifdef SUPPORT_WIDE_CHARS
4326 xclass = FALSE;
4327 class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
4328 class_uchardata_base = class_uchardata; /* Save the start */
4329 #endif
4330
4331 /* For optimization purposes, we track some properties of the class:
4332 class_has_8bitchar will be non-zero if the class contains at least one
4333 character with a code point less than 256; class_one_char will be 1 if the
4334 class contains just one character; xclass_has_prop will be TRUE if Unicode
4335 property checks are present in the class. */
4336
4337 class_has_8bitchar = 0;
4338 class_one_char = 0;
4339 #ifdef SUPPORT_WIDE_CHARS
4340 xclass_has_prop = FALSE;
4341 #endif
4342
4343 /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
4344 in a temporary bit of memory, in case the class contains fewer than two
4345 8-bit characters because in that case the compiled code doesn't use the bit
4346 map. */
4347
4348 memset(classbits, 0, 32 * sizeof(uint8_t));
4349
4350 /* Process characters until ] is reached. As the test is at the end of the
4351 loop, an initial ] is taken as a data character. At the start of the loop,
4352 c contains the first code unit of the character. If it is zero, check for
4353 the end of the pattern, to allow binary zero as data. */
4354
4355 for(;;)
4356 {
4357 PCRE2_SPTR oldptr;
4358 #ifdef EBCDIC
4359 BOOL range_is_literal = TRUE;
4360 #endif
4361
4362 if (c == CHAR_NULL && ptr >= cb->end_pattern)
4363 {
4364 *errorcodeptr = ERR6; /* Missing terminating ']' */
4365 goto FAILED;
4366 }
4367
4368 #ifdef SUPPORT_UNICODE
4369 if (utf && HAS_EXTRALEN(c))
4370 { /* Braces are required because the */
4371 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
4372 }
4373 #endif
4374
4375 /* Inside \Q...\E everything is literal except \E */
4376
4377 if (inescq)
4378 {
4379 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
4380 {
4381 inescq = FALSE; /* Reset literal state */
4382 ptr++; /* Skip the 'E' */
4383 goto CONTINUE_CLASS; /* Carry on with next char */
4384 }
4385 goto CHECK_RANGE; /* Could be range if \E follows */
4386 }
4387
4388 /* Handle POSIX class names. Perl allows a negation extension of the
4389 form [:^name:]. A square bracket that doesn't match the syntax is
4390 treated as a literal. We also recognize the POSIX constructions
4391 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
4392 5.6 and 5.8 do. */
4393
4394 if (c == CHAR_LEFT_SQUARE_BRACKET &&
4395 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4396 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
4397 {
4398 BOOL local_negate = FALSE;
4399 int posix_class, taboffset, tabopt;
4400 register const uint8_t *cbits = cb->cbits;
4401 uint8_t pbits[32];
4402
4403 if (ptr[1] != CHAR_COLON)
4404 {
4405 *errorcodeptr = ERR13;
4406 goto FAILED;
4407 }
4408
4409 ptr += 2;
4410 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
4411 {
4412 local_negate = TRUE;
4413 should_flip_negation = TRUE; /* Note negative special */
4414 ptr++;
4415 }
4416
4417 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
4418 if (posix_class < 0)
4419 {
4420 *errorcodeptr = ERR30;
4421 goto FAILED;
4422 }
4423
4424 /* If matching is caseless, upper and lower are converted to
4425 alpha. This relies on the fact that the class table starts with
4426 alpha, lower, upper as the first 3 entries. */
4427
4428 if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
4429 posix_class = 0;
4430
4431 /* When PCRE2_UCP is set, some of the POSIX classes are converted to
4432 different escape sequences that use Unicode properties \p or \P. Others
4433 that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
4434 directly. UCP support is not available unless UTF support is.*/
4435
4436 #ifdef SUPPORT_UNICODE
4437 if ((options & PCRE2_UCP) != 0)
4438 {
4439 unsigned int ptype = 0;
4440 int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
4441
4442 /* The posix_substitutes table specifies which POSIX classes can be
4443 converted to \p or \P items. This can only happen at top nesting
4444 level, as there will never be a POSIX class in a string that is
4445 substituted for something else. */
4446
4447 if (posix_substitutes[pc] != NULL)
4448 {
4449 cb->nestptr[0] = tempptr + 1;
4450 ptr = posix_substitutes[pc] - 1;
4451 goto CONTINUE_CLASS;
4452 }
4453
4454 /* There are three other classes that generate special property calls
4455 that are recognized only in an XCLASS. */
4456
4457 else switch(posix_class)
4458 {
4459 case PC_GRAPH:
4460 ptype = PT_PXGRAPH;
4461 /* Fall through */
4462 case PC_PRINT:
4463 if (ptype == 0) ptype = PT_PXPRINT;
4464 /* Fall through */
4465 case PC_PUNCT:
4466 if (ptype == 0) ptype = PT_PXPUNCT;
4467 *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
4468 *class_uchardata++ = (PCRE2_UCHAR)ptype;
4469 *class_uchardata++ = 0;
4470 xclass_has_prop = TRUE;
4471 ptr = tempptr + 1;
4472 goto CONTINUE_CLASS;
4473
4474 /* For the other POSIX classes (ascii, xdigit) we are going to fall
4475 through to the non-UCP case and build a bit map for characters with
4476 code points less than 256. However, if we are in a negated POSIX
4477 class, characters with code points greater than 255 must either all
4478 match or all not match, depending on whether the whole class is not
4479 or is negated. For example, for [[:^ascii:]... they must all match,
4480 whereas for [^[:^xdigit:]... they must not.
4481
4482 In the special case where there are no xclass items, this is
4483 automatically handled by the use of OP_CLASS or OP_NCLASS, but an
4484 explicit range is needed for OP_XCLASS. Setting a flag here causes
4485 the range to be generated later when it is known that OP_XCLASS is
4486 required. */
4487
4488 default:
4489 match_all_or_no_wide_chars |= local_negate;
4490 break;
4491 }
4492 }
4493 #endif /* SUPPORT_UNICODE */
4494
4495 /* In the non-UCP case, or when UCP makes no difference, we build the
4496 bit map for the POSIX class in a chunk of local store because we may be
4497 adding and subtracting from it, and we don't want to subtract bits that
4498 may be in the main map already. At the end we or the result into the
4499 bit map that is being built. */
4500
4501 posix_class *= 3;
4502
4503 /* Copy in the first table (always present) */
4504
4505 memcpy(pbits, cbits + posix_class_maps[posix_class],
4506 32 * sizeof(uint8_t));
4507
4508 /* If there is a second table, add or remove it as required. */
4509
4510 taboffset = posix_class_maps[posix_class + 1];
4511 tabopt = posix_class_maps[posix_class + 2];
4512
4513 if (taboffset >= 0)
4514 {
4515 if (tabopt >= 0)
4516 for (c = 0; c < 32; c++) pbits[c] |= cbits[(int)c + taboffset];
4517 else
4518 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[(int)c + taboffset];
4519 }
4520
4521 /* Now see if we need to remove any special characters. An option
4522 value of 1 removes vertical space and 2 removes underscore. */
4523
4524 if (tabopt < 0) tabopt = -tabopt;
4525 if (tabopt == 1) pbits[1] &= ~0x3c;
4526 else if (tabopt == 2) pbits[11] &= 0x7f;
4527
4528 /* Add the POSIX table or its complement into the main table that is
4529 being built and we are done. */
4530
4531 if (local_negate)
4532 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
4533 else
4534 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
4535
4536 ptr = tempptr + 1;
4537 /* Every class contains at least one < 256 character. */
4538 class_has_8bitchar = 1;
4539 /* Every class contains at least two characters. */
4540 class_one_char = 2;
4541 goto CONTINUE_CLASS; /* End of POSIX syntax handling */
4542 }
4543
4544 /* Backslash may introduce a single character, or it may introduce one
4545 of the specials, which just set a flag. The sequence \b is a special
4546 case. Inside a class (and only there) it is treated as backspace. We
4547 assume that other escapes have more than one character in them, so
4548 speculatively set both class_has_8bitchar and class_one_char bigger
4549 than one. Unrecognized escapes fall through and are faulted. */
4550
4551 if (c == CHAR_BACKSLASH)
4552 {
4553 escape = PRIV(check_escape)(&ptr, cb->end_pattern, &ec, errorcodeptr,
4554 options, TRUE, cb);
4555 if (*errorcodeptr != 0) goto FAILED;
4556 if (escape == 0) /* Escaped single char */
4557 {
4558 c = ec;
4559 #ifdef EBCDIC
4560 range_is_literal = FALSE;
4561 #endif
4562 }
4563 else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
4564 else if (escape == ESC_N) /* \N is not supported in a class */
4565 {
4566 *errorcodeptr = ERR71;
4567 goto FAILED;
4568 }
4569 else if (escape == ESC_Q) /* Handle start of quoted string */
4570 {
4571 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4572 {
4573 ptr += 2; /* avoid empty string */
4574 }
4575 else inescq = TRUE;
4576 goto CONTINUE_CLASS;
4577 }
4578 else if (escape == ESC_E) goto CONTINUE_CLASS; /* Ignore orphan \E */
4579
4580 else /* Handle \d-type escapes */
4581 {
4582 register const uint8_t *cbits = cb->cbits;
4583 /* Every class contains at least two < 256 characters. */
4584 class_has_8bitchar++;
4585 /* Every class contains at least two characters. */
4586 class_one_char += 2;
4587
4588 switch (escape)
4589 {
4590 #ifdef SUPPORT_UNICODE
4591 case ESC_du: /* These are the values given for \d etc */
4592 case ESC_DU: /* when PCRE2_UCP is set. We replace the */
4593 case ESC_wu: /* escape sequence with an appropriate \p */
4594 case ESC_WU: /* or \P to test Unicode properties instead */
4595 case ESC_su: /* of the default ASCII testing. This might be */
4596 case ESC_SU: /* a 2nd-level nesting for [[:<:]] or [[:>:]]. */
4597 cb->nestptr[1] = cb->nestptr[0];
4598 cb->nestptr[0] = ptr;
4599 ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
4600 class_has_8bitchar--; /* Undo! */
4601 break;
4602 #endif
4603 case ESC_d:
4604 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
4605 break;
4606
4607 case ESC_D:
4608 should_flip_negation = TRUE;
4609 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
4610 break;
4611
4612 case ESC_w:
4613 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
4614 break;
4615
4616 case ESC_W:
4617 should_flip_negation = TRUE;
4618 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
4619 break;
4620
4621 /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
4622 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
4623 previously set by something earlier in the character class.
4624 Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
4625 we could just adjust the appropriate bit. From PCRE 8.34 we no
4626 longer treat \s and \S specially. */
4627
4628 case ESC_s:
4629 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
4630 break;
4631
4632 case ESC_S:
4633 should_flip_negation = TRUE;
4634 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
4635 break;
4636
4637 /* The rest apply in both UCP and non-UCP cases. */
4638
4639 case ESC_h:
4640 (void)add_list_to_class(classbits, &class_uchardata, options, cb,
4641 PRIV(hspace_list), NOTACHAR);
4642 break;
4643
4644 case ESC_H:
4645 (void)add_not_list_to_class(classbits, &class_uchardata, options,
4646 cb, PRIV(hspace_list));
4647 break;
4648
4649 case ESC_v:
4650 (void)add_list_to_class(classbits, &class_uchardata, options, cb,
4651 PRIV(vspace_list), NOTACHAR);
4652 break;
4653
4654 case ESC_V:
4655 (void)add_not_list_to_class(classbits, &class_uchardata, options,
4656 cb, PRIV(vspace_list));
4657 break;
4658
4659 case ESC_p:
4660 case ESC_P:
4661 #ifdef SUPPORT_UNICODE
4662 {
4663 BOOL negated;
4664 unsigned int ptype = 0, pdata = 0;
4665 if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr, cb))
4666 goto FAILED;
4667 *class_uchardata++ = ((escape == ESC_p) != negated)?
4668 XCL_PROP : XCL_NOTPROP;
4669 *class_uchardata++ = ptype;
4670 *class_uchardata++ = pdata;
4671 xclass_has_prop = TRUE;
4672 class_has_8bitchar--; /* Undo! */
4673 }
4674 break;
4675 #else
4676 *errorcodeptr = ERR45;
4677 goto FAILED;
4678 #endif
4679 /* Unrecognized escapes are faulted. */
4680
4681 default:
4682 *errorcodeptr = ERR7;
4683 goto FAILED;
4684 }
4685
4686 /* Handled \d-type escape */
4687
4688 goto CONTINUE_CLASS;
4689 }
4690
4691 /* Control gets here if the escape just defined a single character.
4692 This is in c and may be greater than 256. */
4693
4694 escape = 0;
4695 } /* End of backslash handling */
4696
4697 /* A character may be followed by '-' to form a range. However, Perl does
4698 not permit ']' to be the end of the range. A '-' character at the end is
4699 treated as a literal. Perl ignores orphaned \E sequences entirely. The
4700 code for handling \Q and \E is messy. */
4701
4702 CHECK_RANGE:
4703 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4704 {
4705 inescq = FALSE;
4706 ptr += 2;
4707 }
4708 oldptr = ptr;
4709
4710 /* Remember if \r or \n were explicitly used */
4711
4712 if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
4713
4714 /* Check for range */
4715
4716 if (!inescq && ptr[1] == CHAR_MINUS)
4717 {
4718 uint32_t d;
4719 ptr += 2;
4720 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
4721
4722 /* If we hit \Q (not followed by \E) at this point, go into escaped
4723 mode. */
4724
4725 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
4726 {
4727 ptr += 2;
4728 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4729 { ptr += 2; continue; }
4730 inescq = TRUE;
4731 break;
4732 }
4733
4734 /* Minus (hyphen) at the end of a class is treated as a literal, so put
4735 back the pointer and jump to handle the character that preceded it. */
4736
4737 if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
4738 {
4739 ptr = oldptr;
4740 goto CLASS_SINGLE_CHARACTER;
4741 }
4742
4743 /* Otherwise, we have a potential range; pick up the next character */
4744
4745 #ifdef SUPPORT_UNICODE
4746 if (utf)
4747 { /* Braces are required because the */
4748 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
4749 }
4750 else
4751 #endif
4752 d = *ptr; /* Not UTF mode */
4753
4754 /* The second part of a range can be a single-character escape
4755 sequence, but not any of the other escapes. Perl treats a hyphen as a
4756 literal in such circumstances. However, in Perl's warning mode, a
4757 warning is given, so PCRE now faults it as it is almost certainly a
4758 mistake on the user's part. */
4759
4760 if (!inescq)
4761 {
4762 if (d == CHAR_BACKSLASH)
4763 {
4764 int descape;
4765 descape = PRIV(check_escape)(&ptr, cb->end_pattern, &d,
4766 errorcodeptr, options, TRUE, cb);
4767 if (*errorcodeptr != 0) goto FAILED;
4768 #ifdef EBCDIC
4769 range_is_literal = FALSE;
4770 #endif
4771 /* 0 means a character was put into d; \b is backspace; any other
4772 special causes an error. */
4773
4774 if (descape != 0)
4775 {
4776 if (descape == ESC_b) d = CHAR_BS; else
4777 {
4778 *errorcodeptr = ERR50;
4779 goto FAILED;
4780 }
4781 }
4782 }
4783
4784 /* A hyphen followed by a POSIX class is treated in the same way. */
4785
4786 else if (d == CHAR_LEFT_SQUARE_BRACKET &&
4787 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4788 ptr[1] == CHAR_EQUALS_SIGN) &&
4789 check_posix_syntax(ptr, &tempptr))
4790 {
4791 *errorcodeptr = ERR50;
4792 goto FAILED;
4793 }
4794 }
4795
4796 /* Check that the two values are in the correct order. Optimize
4797 one-character ranges. */
4798
4799 if (d < c)
4800 {
4801 *errorcodeptr = ERR8;
4802 goto FAILED;
4803 }
4804 if (d == c) goto CLASS_SINGLE_CHARACTER; /* A few lines below */
4805
4806 /* We have found a character range, so single character optimizations
4807 cannot be done anymore. Any value greater than 1 indicates that there
4808 is more than one character. */
4809
4810 class_one_char = 2;
4811
4812 /* Remember an explicit \r or \n, and add the range to the class. */
4813
4814 if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
4815
4816 /* In an EBCDIC environment, Perl treats alphabetic ranges specially
4817 because there are holes in the encoding, and simply using the range A-Z
4818 (for example) would include the characters in the holes. This applies
4819 only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
4820
4821 #ifdef EBCDIC
4822 if (range_is_literal &&
4823 (cb->ctypes[c] & ctype_letter) != 0 &&
4824 (cb->ctypes[d] & ctype_letter) != 0 &&
4825 (c <= CHAR_z) == (d <= CHAR_z))
4826 {
4827 uint32_t uc = (c <= CHAR_z)? 0 : 64;
4828 uint32_t C = c - uc;
4829 uint32_t D = d - uc;
4830
4831 if (C <= CHAR_i)
4832 {
4833 class_has_8bitchar +=
4834 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
4835 ((D < CHAR_i)? D : CHAR_i) + uc);
4836 C = CHAR_j;
4837 }
4838
4839 if (C <= D && C <= CHAR_r)
4840 {
4841 class_has_8bitchar +=
4842 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
4843 ((D < CHAR_r)? D : CHAR_r) + uc);
4844 C = CHAR_s;
4845 }
4846
4847 if (C <= D)
4848 {
4849 class_has_8bitchar +=
4850 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
4851 D + uc);
4852 }
4853 }
4854 else
4855 #endif
4856 class_has_8bitchar +=
4857 add_to_class(classbits, &class_uchardata, options, cb, c, d);
4858 goto CONTINUE_CLASS; /* Go get the next char in the class */
4859 }
4860
4861 /* Handle a single character - we can get here for a normal non-escape
4862 char, or after \ that introduces a single character or for an apparent
4863 range that isn't. Only the value 1 matters for class_one_char, so don't
4864 increase it if it is already 2 or more ... just in case there's a class
4865 with a zillion characters in it. */
4866
4867 CLASS_SINGLE_CHARACTER:
4868 if (class_one_char < 2) class_one_char++;
4869
4870 /* If class_one_char is 1 and xclass_has_prop is false, we have the first
4871 single character in the class, and there have been no prior ranges, or
4872 XCLASS items generated by escapes. If this is the final character in the
4873 class, we can optimize by turning the item into a 1-character OP_CHAR[I]
4874 if it's positive, or OP_NOT[I] if it's negative. In the positive case, it
4875 can cause firstcu to be set. Otherwise, there can be no first char if
4876 this item is first, whatever repeat count may follow. In the case of
4877 reqcu, save the previous value for reinstating. */
4878
4879 if (!inescq &&
4880 #ifdef SUPPORT_UNICODE
4881 !xclass_has_prop &&
4882 #endif
4883 class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
4884 {
4885 ptr++;
4886 zeroreqcu = reqcu;
4887 zeroreqcuflags = reqcuflags;
4888
4889 if (negate_class)
4890 {
4891 #ifdef SUPPORT_UNICODE
4892 int d;
4893 #endif
4894 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
4895 zerofirstcu = firstcu;
4896 zerofirstcuflags = firstcuflags;
4897
4898 /* For caseless UTF mode, check whether this character has more than
4899 one other case. If so, generate a special OP_NOTPROP item instead of
4900 OP_NOTI. */
4901
4902 #ifdef SUPPORT_UNICODE
4903 if (utf && (options & PCRE2_CASELESS) != 0 &&
4904 (d = UCD_CASESET(c)) != 0)
4905 {
4906 *code++ = OP_NOTPROP;
4907 *code++ = PT_CLIST;
4908 *code++ = d;
4909 }
4910 else
4911 #endif
4912 /* Char has only one other case, or UCP not available */
4913
4914 {
4915 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
4916 code += PUTCHAR(c, code);
4917 }
4918
4919 /* We are finished with this character class */
4920
4921 goto END_CLASS;
4922 }
4923
4924 /* For a single, positive character, get the value into mcbuffer, and
4925 then we can handle this with the normal one-character code. */
4926
4927 mclength = PUTCHAR(c, mcbuffer);
4928 goto ONE_CHAR;
4929 } /* End of 1-char optimization */
4930
4931 /* There is more than one character in the class, or an XCLASS item
4932 has been generated. Add this character to the class. */
4933
4934 class_has_8bitchar +=
4935 add_to_class(classbits, &class_uchardata, options, cb, c, c);
4936
4937 /* Continue to the next character in the class. Closing square bracket
4938 not within \Q..\E ends the class. A NULL character terminates a
4939 nested substitution string, but may be a data character in the main
4940 pattern (tested at the start of this loop). */
4941
4942 CONTINUE_CLASS:
4943 c = *(++ptr);
4944 if (c == CHAR_NULL && cb->nestptr[0] != NULL)
4945 {
4946 ptr = cb->nestptr[0];
4947 cb->nestptr[0] = cb->nestptr[1];
4948 cb->nestptr[1] = NULL;
4949 c = *(++ptr);
4950 }
4951
4952 #ifdef SUPPORT_WIDE_CHARS
4953 /* If any wide characters have been encountered, set xclass = TRUE. Then,
4954 in the pre-compile phase, accumulate the length of the wide characters
4955 and reset the pointer. This is so that very large classes that contain a
4956 zillion wide characters do not overwrite the work space (which is on the
4957 stack). */
4958
4959 if (class_uchardata > class_uchardata_base)
4960 {
4961 xclass = TRUE;
4962 if (lengthptr != NULL)
4963 {
4964 *lengthptr += class_uchardata - class_uchardata_base;
4965 class_uchardata = class_uchardata_base;
4966 }
4967 }
4968 #endif
4969 /* An unescaped ] ends the class */
4970
4971 if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
4972 } /* End of main class-processing loop */
4973
4974 /* If this is the first thing in the branch, there can be no first char
4975 setting, whatever the repeat count. Any reqcu setting must remain
4976 unchanged after any kind of repeat. */
4977
4978 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
4979 zerofirstcu = firstcu;
4980 zerofirstcuflags = firstcuflags;
4981 zeroreqcu = reqcu;
4982 zeroreqcuflags = reqcuflags;
4983
4984 /* If there are characters with values > 255, or Unicode property settings
4985 (\p or \P), we have to compile an extended class, with its own opcode,
4986 unless there were no property settings and there was a negated special such
4987 as \S in the class, and PCRE2_UCP is not set, because in that case all
4988 characters > 255 are in or not in the class, so any that were explicitly
4989 given as well can be ignored.
4990
4991 In the UCP case, if certain negated POSIX classes ([:^ascii:] or
4992 [:^xdigit:]) were present in a class, we either have to match or not match
4993 all wide characters (depending on whether the whole class is or is not
4994 negated). This requirement is indicated by match_all_or_no_wide_chars being
4995 true. We do this by including an explicit range, which works in both cases.
4996
4997 If, when generating an xclass, there are no characters < 256, we can omit
4998 the bitmap in the actual compiled code. */
4999
5000 #ifdef SUPPORT_WIDE_CHARS
5001 #ifdef SUPPORT_UNICODE
5002 if (xclass && (xclass_has_prop || !should_flip_negation ||
5003 (options & PCRE2_UCP) != 0))
5004 #elif PCRE2_CODE_UNIT_WIDTH != 8
5005 if (xclass && (xclass_has_prop || !should_flip_negation))
5006 #endif
5007 {
5008 if (match_all_or_no_wide_chars)
5009 {
5010 *class_uchardata++ = XCL_RANGE;
5011 class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
5012 class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
5013 }
5014 *class_uchardata++ = XCL_END; /* Marks the end of extra data */
5015 *code++ = OP_XCLASS;
5016 code += LINK_SIZE;
5017 *code = negate_class? XCL_NOT:0;
5018 if (xclass_has_prop) *code |= XCL_HASPROP;
5019
5020 /* If the map is required, move up the extra data to make room for it;
5021 otherwise just move the code pointer to the end of the extra data. */
5022
5023 if (class_has_8bitchar > 0)
5024 {
5025 *code++ |= XCL_MAP;
5026 memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
5027 CU2BYTES(class_uchardata - code));
5028 if (negate_class && !xclass_has_prop)
5029 for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5030 memcpy(code, classbits, 32);
5031 code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
5032 }
5033 else code = class_uchardata;
5034
5035 /* Now fill in the complete length of the item */
5036
5037 PUT(previous, 1, (int)(code - previous));
5038 break; /* End of class handling */
5039 }
5040 #endif
5041
5042 /* If there are no characters > 255, or they are all to be included or
5043 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5044 whole class was negated and whether there were negative specials such as \S
5045 (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5046 negating it if necessary. */
5047
5048 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5049 if (lengthptr == NULL) /* Save time in the pre-compile phase */
5050 {
5051 if (negate_class)
5052 for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5053 memcpy(code, classbits, 32);
5054 }
5055 code += 32 / sizeof(PCRE2_UCHAR);
5056
5057 END_CLASS:
5058 break;
5059
5060
5061 /* ===================================================================*/
5062 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
5063 has been tested above. */
5064
5065 case CHAR_LEFT_CURLY_BRACKET:
5066 if (!is_quantifier) goto NORMAL_CHAR;
5067 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
5068 if (*errorcodeptr != 0) goto FAILED;
5069 goto REPEAT;
5070
5071 case CHAR_ASTERISK:
5072 repeat_min = 0;
5073 repeat_max = -1;
5074 goto REPEAT;
5075
5076 case CHAR_PLUS:
5077 repeat_min = 1;
5078 repeat_max = -1;
5079 goto REPEAT;
5080
5081 case CHAR_QUESTION_MARK:
5082 repeat_min = 0;
5083 repeat_max = 1;
5084
5085 REPEAT:
5086 if (previous == NULL)
5087 {
5088 *errorcodeptr = ERR9;
5089 goto FAILED;
5090 }
5091
5092 if (repeat_min == 0)
5093 {
5094 firstcu = zerofirstcu; /* Adjust for zero repeat */
5095 firstcuflags = zerofirstcuflags;
5096 reqcu = zeroreqcu; /* Ditto */
5097 reqcuflags = zeroreqcuflags;
5098 }
5099
5100 /* Remember whether this is a variable length repeat */
5101
5102 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
5103
5104 op_type = 0; /* Default single-char op codes */
5105 possessive_quantifier = FALSE; /* Default not possessive quantifier */
5106
5107 /* Save start of previous item, in case we have to move it up in order to
5108 insert something before it. */
5109
5110 tempcode = previous;
5111
5112 /* Before checking for a possessive quantifier, we must skip over
5113 whitespace and comments in extended mode because Perl allows white space at
5114 this point. */
5115
5116 if ((options & PCRE2_EXTENDED) != 0)
5117 {
5118 ptr++;
5119 for (;;)
5120 {
5121 while (MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_space) != 0) ptr++;
5122 if (*ptr != CHAR_NUMBER_SIGN) break;
5123 ptr++;
5124 while (ptr < cb->end_pattern)
5125 {
5126 if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */
5127 { /* IS_NEWLINE sets cb->nllen. */
5128 ptr += cb->nllen;
5129 break;
5130 }
5131 ptr++;
5132 #ifdef SUPPORT_UNICODE
5133 if (utf) FORWARDCHAR(ptr);
5134 #endif
5135 } /* Loop for comment characters */
5136 } /* Loop for multiple comments */
5137 ptr--; /* Last code unit of previous character. */
5138 }
5139
5140 /* If the next character is '+', we have a possessive quantifier. This
5141 implies greediness, whatever the setting of the PCRE2_UNGREEDY option.
5142 If the next character is '?' this is a minimizing repeat, by default,
5143 but if PCRE2_UNGREEDY is set, it works the other way round. We change the
5144 repeat type to the non-default. */
5145
5146 if (ptr[1] == CHAR_PLUS)
5147 {
5148 repeat_type = 0; /* Force greedy */
5149 possessive_quantifier = TRUE;
5150 ptr++;
5151 }
5152 else if (ptr[1] == CHAR_QUESTION_MARK)
5153 {
5154 repeat_type = greedy_non_default;
5155 ptr++;
5156 }
5157 else repeat_type = greedy_default;
5158
5159 /* If the repeat is {1} we can ignore it. */
5160
5161 if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
5162
5163 /* If previous was a recursion call, wrap it in atomic brackets so that
5164 previous becomes the atomic group. All recursions were so wrapped in the
5165 past, but it no longer happens for non-repeated recursions. In fact, the
5166 repeated ones could be re-implemented independently so as not to need this,
5167 but for the moment we rely on the code for repeating groups. */
5168
5169 if (*previous == OP_RECURSE)
5170 {
5171 memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));
5172 *previous = OP_ONCE;
5173 PUT(previous, 1, 2 + 2*LINK_SIZE);
5174 previous[2 + 2*LINK_SIZE] = OP_KET;
5175 PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
5176 code += 2 + 2 * LINK_SIZE;
5177 length_prevgroup = 3 + 3*LINK_SIZE;
5178 }
5179
5180 /* Now handle repetition for the different types of item. */
5181
5182 /* If previous was a character or negated character match, abolish the item
5183 and generate a repeat item instead. If a char item has a minimum of more
5184 than one, ensure that it is set in reqcu - it might not be if a sequence
5185 such as x{3} is the first thing in a branch because the x will have gone
5186 into firstcu instead. */
5187
5188 if (*previous == OP_CHAR || *previous == OP_CHARI
5189 || *previous == OP_NOT || *previous == OP_NOTI)
5190 {
5191 switch (*previous)
5192 {
5193 default: /* Make compiler happy. */
5194 case OP_CHAR: op_type = OP_STAR - OP_STAR; break;
5195 case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
5196 case OP_NOT: op_type = OP_NOTSTAR - OP_STAR; break;
5197 case OP_NOTI: op_type = OP_NOTSTARI - OP_STAR; break;
5198 }
5199
5200 /* Deal with UTF characters that take up more than one code unit. It's
5201 easier to write this out separately than try to macrify it. Use c to
5202 hold the length of the character in code units, plus UTF_LENGTH to flag
5203 that it's a length rather than a small character. */
5204
5205 #ifdef MAYBE_UTF_MULTI
5206 if (utf && NOT_FIRSTCU(code[-1]))
5207 {
5208 PCRE2_UCHAR *lastchar = code - 1;
5209 BACKCHAR(lastchar);
5210 c = (int)(code - lastchar); /* Length of UTF character */
5211 memcpy(utf_units, lastchar, CU2BYTES(c)); /* Save the char */
5212 c |= UTF_LENGTH; /* Flag c as a length */
5213 }
5214 else
5215 #endif /* MAYBE_UTF_MULTI */
5216
5217 /* Handle the case of a single character - either with no UTF support, or
5218 with UTF disabled, or for a single-code-unit UTF character. */
5219 {
5220 c = code[-1];
5221 if (*previous <= OP_CHARI && repeat_min > 1)
5222 {
5223 reqcu = c;
5224 reqcuflags = req_caseopt | cb->req_varyopt;
5225 }
5226 }
5227
5228 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
5229 }
5230
5231 /* If previous was a character type match (\d or similar), abolish it and
5232 create a suitable repeat item. The code is shared with single-character
5233 repeats by setting op_type to add a suitable offset into repeat_type. Note
5234 that the Unicode property types will be present only when SUPPORT_UNICODE is
5235 defined, but we don't wrap the little bits of code here because it just
5236 makes it horribly messy. */
5237
5238 else if (*previous < OP_EODN)
5239 {
5240 PCRE2_UCHAR *oldcode;
5241 int prop_type, prop_value;
5242 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
5243 c = *previous; /* Save previous opcode */
5244 if (c == OP_PROP || c == OP_NOTPROP)
5245 {
5246 prop_type = previous[1];
5247 prop_value = previous[2];
5248 }
5249 else
5250 {
5251 /* Come here from just above with a character in c */
5252 OUTPUT_SINGLE_REPEAT:
5253 prop_type = prop_value = -1;
5254 }
5255
5256 /* At this point we either have prop_type == prop_value == -1 and either
5257 a code point or a character type that is not OP_[NOT]PROP in c, or we
5258 have OP_[NOT]PROP in c and prop_type/prop_value not negative. */
5259
5260 oldcode = code; /* Save where we were */
5261 code = previous; /* Usually overwrite previous item */
5262
5263 /* If the maximum is zero then the minimum must also be zero; Perl allows
5264 this case, so we do too - by simply omitting the item altogether. */
5265
5266 if (repeat_max == 0) goto END_REPEAT;
5267
5268 /* Combine the op_type with the repeat_type */
5269
5270 repeat_type += op_type;
5271
5272 /* A minimum of zero is handled either as the special case * or ?, or as
5273 an UPTO, with the maximum given. */
5274
5275 if (repeat_min == 0)
5276 {
5277 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
5278 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
5279 else
5280 {
5281 *code++ = OP_UPTO + repeat_type;
5282 PUT2INC(code, 0, repeat_max);
5283 }
5284 }
5285
5286 /* A repeat minimum of 1 is optimized into some special cases. If the
5287 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
5288 left in place and, if the maximum is greater than 1, we use OP_UPTO with
5289 one less than the maximum. */
5290
5291 else if (repeat_min == 1)
5292 {
5293 if (repeat_max == -1)
5294 *code++ = OP_PLUS + repeat_type;
5295 else
5296 {
5297 code = oldcode; /* Leave previous item in place */
5298 if (repeat_max == 1) goto END_REPEAT;
5299 *code++ = OP_UPTO + repeat_type;
5300 PUT2INC(code, 0, repeat_max - 1);
5301 }
5302 }
5303
5304 /* The case {n,n} is just an EXACT, while the general case {n,m} is
5305 handled as an EXACT followed by an UPTO or STAR or QUERY. */
5306
5307 else
5308 {
5309 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
5310 PUT2INC(code, 0, repeat_min);
5311
5312 /* Unless repeat_max equals repeat_min, fill in the data for EXACT, and
5313 then generate the second opcode. In UTF mode, multi-code-unit
5314 characters have their length in c, with the UTF_LENGTH bit as a flag,
5315 and the code units in utf_units. For a repeated Unicode property match,
5316 there are two extra values that define the required property, and c
5317 never has the UTF_LENGTH bit set. */
5318
5319 if (repeat_max != repeat_min)
5320 {
5321 #ifdef MAYBE_UTF_MULTI
5322 if (utf && (c & UTF_LENGTH) != 0)
5323 {
5324 memcpy(code, utf_units, CU2BYTES(c & 7));
5325 code += c & 7;
5326 }
5327 else
5328 #endif /* MAYBE_UTF_MULTI */
5329 {
5330 *code++ = c;
5331 if (prop_type >= 0)
5332 {
5333 *code++ = prop_type;
5334 *code++ = prop_value;
5335 }
5336 }
5337
5338 /* Now set up the following opcode */
5339
5340 if (repeat_max < 0) *code++ = OP_STAR + repeat_type; else
5341 {
5342 repeat_max -= repeat_min;
5343 if (repeat_max == 1)
5344 {
5345 *code++ = OP_QUERY + repeat_type;
5346 }
5347 else
5348 {
5349 *code++ = OP_UPTO + repeat_type;
5350 PUT2INC(code, 0, repeat_max);
5351 }
5352 }
5353 }
5354 }
5355
5356 /* Fill in the character or character type for the final opcode. */
5357
5358 #ifdef MAYBE_UTF_MULTI
5359 if (utf && (c & UTF_LENGTH) != 0)
5360 {
5361 memcpy(code, utf_units, CU2BYTES(c & 7));
5362 code += c & 7;
5363 }
5364 else
5365 #endif /* MAYBE_UTF_MULTI */
5366 {
5367 *code++ = c;
5368 if (prop_type >= 0)
5369 {
5370 *code++ = prop_type;
5371 *code++ = prop_value;
5372 }
5373 }
5374 }
5375
5376 /* If previous was a character class or a back reference, we put the repeat
5377 stuff after it, but just skip the item if the repeat was {0,0}. */
5378
5379 else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
5380 #ifdef SUPPORT_WIDE_CHARS
5381 *previous == OP_XCLASS ||
5382 #endif
5383 *previous == OP_REF || *previous == OP_REFI ||
5384 *previous == OP_DNREF || *previous == OP_DNREFI)
5385 {
5386 if (repeat_max == 0)
5387 {
5388 code = previous;
5389 goto END_REPEAT;
5390 }
5391
5392 if (repeat_min == 0 && repeat_max == -1)
5393 *code++ = OP_CRSTAR + repeat_type;
5394 else if (repeat_min == 1 && repeat_max == -1)
5395 *code++ = OP_CRPLUS + repeat_type;
5396 else if (repeat_min == 0 && repeat_max == 1)
5397 *code++ = OP_CRQUERY + repeat_type;
5398 else
5399 {
5400 *code++ = OP_CRRANGE + repeat_type;
5401 PUT2INC(code, 0, repeat_min);
5402 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
5403 PUT2INC(code, 0, repeat_max);
5404 }
5405 }
5406
5407 /* If previous was a bracket group, we may have to replicate it in certain
5408 cases. Note that at this point we can encounter only the "basic" bracket
5409 opcodes such as BRA and CBRA, as this is the place where they get converted
5410 into the more special varieties such as BRAPOS and SBRA. A test for >=
5411 OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5412 ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
5413 Originally, PCRE did not allow repetition of assertions, but now it does,
5414 for Perl compatibility. */
5415
5416 else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5417 {
5418 register int i;
5419 int len = (int)(code - previous);
5420 PCRE2_UCHAR *bralink = NULL;
5421 PCRE2_UCHAR *brazeroptr = NULL;
5422
5423 /* Repeating a DEFINE group (or any group where the condition is always
5424 FALSE and there is only one branch) is pointless, but Perl allows the
5425 syntax, so we just ignore the repeat. */
5426
5427 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
5428 previous[GET(previous, 1)] != OP_ALT)
5429 goto END_REPEAT;
5430
5431 /* There is no sense in actually repeating assertions. The only potential
5432 use of repetition is in cases when the assertion is optional. Therefore,
5433 if the minimum is greater than zero, just ignore the repeat. If the
5434 maximum is not zero or one, set it to 1. */
5435
5436 if (*previous < OP_ONCE) /* Assertion */
5437 {
5438 if (repeat_min > 0) goto END_REPEAT;
5439 if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
5440 }
5441
5442 /* The case of a zero minimum is special because of the need to stick
5443 OP_BRAZERO in front of it, and because the group appears once in the
5444 data, whereas in other cases it appears the minimum number of times. For
5445 this reason, it is simplest to treat this case separately, as otherwise
5446 the code gets far too messy. There are several special subcases when the
5447 minimum is zero. */
5448
5449 if (repeat_min == 0)
5450 {
5451 /* If the maximum is also zero, we used to just omit the group from the
5452 output altogether, like this:
5453
5454 ** if (repeat_max == 0)
5455 ** {
5456 ** code = previous;
5457 ** goto END_REPEAT;
5458 ** }
5459
5460 However, that fails when a group or a subgroup within it is referenced
5461 as a subroutine from elsewhere in the pattern, so now we stick in
5462 OP_SKIPZERO in front of it so that it is skipped on execution. As we
5463 don't have a list of which groups are referenced, we cannot do this
5464 selectively.
5465
5466 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
5467 and do no more at this point. */
5468
5469 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
5470 {
5471 memmove(previous + 1, previous, CU2BYTES(len));
5472 code++;
5473 if (repeat_max == 0)
5474 {
5475 *previous++ = OP_SKIPZERO;
5476 goto END_REPEAT;
5477 }
5478 brazeroptr = previous; /* Save for possessive optimizing */
5479 *previous++ = OP_BRAZERO + repeat_type;
5480 }
5481
5482 /* If the maximum is greater than 1 and limited, we have to replicate
5483 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
5484 The first one has to be handled carefully because it's the original
5485 copy, which has to be moved up. The remainder can be handled by code
5486 that is common with the non-zero minimum case below. We have to
5487 adjust the value of repeat_max, since one less copy is required. */
5488
5489 else
5490 {
5491 int offset;
5492 memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
5493 code += 2 + LINK_SIZE;
5494 *previous++ = OP_BRAZERO + repeat_type;
5495 *previous++ = OP_BRA;
5496
5497 /* We chain together the bracket offset fields that have to be
5498 filled in later when the ends of the brackets are reached. */
5499
5500 offset = (bralink == NULL)? 0 : (int)(previous - bralink);
5501 bralink = previous;
5502 PUTINC(previous, 0, offset);
5503 }
5504
5505 repeat_max--;
5506 }
5507
5508 /* If the minimum is greater than zero, replicate the group as many
5509 times as necessary, and adjust the maximum to the number of subsequent
5510 copies that we need. */
5511
5512 else
5513 {
5514 if (repeat_min > 1)
5515 {
5516 /* In the pre-compile phase, we don't actually do the replication. We
5517 just adjust the length as if we had. Do some paranoid checks for
5518 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
5519 integer type when available, otherwise double. */
5520
5521 if (lengthptr != NULL)
5522 {
5523 size_t delta = (repeat_min - 1)*length_prevgroup;
5524 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
5525 (INT64_OR_DOUBLE)length_prevgroup >
5526 (INT64_OR_DOUBLE)INT_MAX ||
5527 OFLOW_MAX - *lengthptr < delta)
5528 {
5529 *errorcodeptr = ERR20;
5530 goto FAILED;
5531 }
5532 *lengthptr += delta;
5533 }
5534
5535 /* This is compiling for real. If there is a set first byte for
5536 the group, and we have not yet set a "required byte", set it. */
5537
5538 else
5539 {
5540 if (groupsetfirstcu && reqcuflags < 0)
5541 {
5542 reqcu = firstcu;
5543 reqcuflags = firstcuflags;
5544 }
5545 for (i = 1; i < repeat_min; i++)
5546 {
5547 memcpy(code, previous, CU2BYTES(len));
5548 code += len;
5549 }
5550 }
5551 }
5552
5553 if (repeat_max > 0) repeat_max -= repeat_min;
5554 }
5555
5556 /* This code is common to both the zero and non-zero minimum cases. If
5557 the maximum is limited, it replicates the group in a nested fashion,
5558 remembering the bracket starts on a stack. In the case of a zero minimum,
5559 the first one was set up above. In all cases the repeat_max now specifies
5560 the number of additional copies needed. Again, we must remember to
5561 replicate entries on the forward reference list. */
5562
5563 if (repeat_max >= 0)
5564 {
5565 /* In the pre-compile phase, we don't actually do the replication. We
5566 just adjust the length as if we had. For each repetition we must add 1
5567 to the length for BRAZERO and for all but the last repetition we must
5568 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
5569 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
5570 a 64-bit integer type when available, otherwise double. */
5571
5572 if (lengthptr != NULL && repeat_max > 0)
5573 {
5574 size_t delta = repeat_max*(length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
5575 2 - 2*LINK_SIZE; /* Last one doesn't nest */
5576 if ((INT64_OR_DOUBLE)repeat_max *
5577 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
5578 > (INT64_OR_DOUBLE)INT_MAX ||
5579 OFLOW_MAX - *lengthptr < delta)
5580 {
5581 *errorcodeptr = ERR20;
5582 goto FAILED;
5583 }
5584 *lengthptr += delta;
5585 }
5586
5587 /* This is compiling for real */
5588
5589 else for (i = repeat_max - 1; i >= 0; i--)
5590 {
5591 *code++ = OP_BRAZERO + repeat_type;
5592
5593 /* All but the final copy start a new nesting, maintaining the
5594 chain of brackets outstanding. */
5595
5596 if (i != 0)
5597 {
5598 int offset;
5599 *code++ = OP_BRA;
5600 offset = (bralink == NULL)? 0 : (int)(code - bralink);
5601 bralink = code;
5602 PUTINC(code, 0, offset);
5603 }
5604
5605 memcpy(code, previous, CU2BYTES(len));
5606 code += len;
5607 }
5608
5609 /* Now chain through the pending brackets, and fill in their length
5610 fields (which are holding the chain links pro tem). */
5611
5612 while (bralink != NULL)
5613 {
5614 int oldlinkoffset;
5615 int offset = (int)(code - bralink + 1);
5616 PCRE2_UCHAR *bra = code - offset;
5617 oldlinkoffset = GET(bra, 1);
5618 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
5619 *code++ = OP_KET;
5620 PUTINC(code, 0, offset);
5621 PUT(bra, 1, offset);
5622 }
5623 }
5624
5625 /* If the maximum is unlimited, set a repeater in the final copy. For
5626 ONCE brackets, that's all we need to do. However, possessively repeated
5627 ONCE brackets can be converted into non-capturing brackets, as the
5628 behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
5629 deal with possessive ONCEs specially.
5630
5631 Otherwise, when we are doing the actual compile phase, check to see
5632 whether this group is one that could match an empty string. If so,
5633 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
5634 that runtime checking can be done. [This check is also applied to ONCE
5635 groups at runtime, but in a different way.]
5636
5637 Then, if the quantifier was possessive and the bracket is not a
5638 conditional, we convert the BRA code to the POS form, and the KET code to
5639 KETRPOS. (It turns out to be convenient at runtime to detect this kind of
5640 subpattern at both the start and at the end.) The use of special opcodes
5641 makes it possible to reduce greatly the stack usage in pcre2_match(). If
5642 the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
5643
5644 Then, if the minimum number of matches is 1 or 0, cancel the possessive
5645 flag so that the default action below, of wrapping everything inside
5646 atomic brackets, does not happen. When the minimum is greater than 1,
5647 there will be earlier copies of the group, and so we still have to wrap
5648 the whole thing. */
5649
5650 else
5651 {
5652 PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
5653 PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
5654
5655 /* Convert possessive ONCE brackets to non-capturing */
5656
5657 if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
5658 possessive_quantifier) *bracode = OP_BRA;
5659
5660 /* For non-possessive ONCE brackets, all we need to do is to
5661 set the KET. */
5662
5663 if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
5664 *ketcode = OP_KETRMAX + repeat_type;
5665
5666 /* Handle non-ONCE brackets and possessive ONCEs (which have been
5667 converted to non-capturing above). */
5668
5669 else
5670 {
5671 /* In the compile phase, check whether the group could match an empty
5672 string. */
5673
5674 if (lengthptr == NULL)
5675 {
5676 PCRE2_UCHAR *scode = bracode;
5677 do
5678 {
5679 int count = 0;
5680 int rc = could_be_empty_branch(scode, ketcode, utf, cb, FALSE,
5681 NULL, &count);
5682 if (rc < 0)
5683 {
5684 *errorcodeptr = ERR86;
5685 goto FAILED;
5686 }
5687 if (rc > 0)
5688 {
5689 *bracode += OP_SBRA - OP_BRA;
5690 break;
5691 }
5692 scode += GET(scode, 1);
5693 }
5694 while (*scode == OP_ALT);
5695
5696 /* A conditional group with only one branch has an implicit empty
5697 alternative branch. */
5698
5699 if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
5700 *bracode = OP_SCOND;
5701 }
5702
5703 /* Handle possessive quantifiers. */
5704
5705 if (possessive_quantifier)
5706 {
5707 /* For COND brackets, we wrap the whole thing in a possessively
5708 repeated non-capturing bracket, because we have not invented POS
5709 versions of the COND opcodes. */
5710
5711 if (*bracode == OP_COND || *bracode == OP_SCOND)
5712 {
5713 int nlen = (int)(code - bracode);
5714 memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
5715 code += 1 + LINK_SIZE;
5716 nlen += 1 + LINK_SIZE;
5717 *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
5718 *code++ = OP_KETRPOS;
5719 PUTINC(code, 0, nlen);
5720 PUT(bracode, 1, nlen);
5721 }
5722
5723 /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
5724
5725 else
5726 {
5727 *bracode += 1; /* Switch to xxxPOS opcodes */
5728 *ketcode = OP_KETRPOS;
5729 }
5730
5731 /* If the minimum is zero, mark it as possessive, then unset the
5732 possessive flag when the minimum is 0 or 1. */
5733
5734 if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
5735 if (repeat_min < 2) possessive_quantifier = FALSE;
5736 }
5737
5738 /* Non-possessive quantifier */
5739
5740 else *ketcode = OP_KETRMAX + repeat_type;
5741 }
5742 }
5743 }
5744
5745 /* If previous is OP_FAIL, it was generated by an empty class []
5746 (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be
5747 generated, that is by (*FAIL) or (?!), set previous to NULL, which gives a
5748 "nothing to repeat" error above. We can just ignore the repeat in the empty
5749 class case. */
5750
5751 else if (*previous == OP_FAIL) goto END_REPEAT;
5752
5753 /* Else there's some kind of shambles */
5754
5755 else
5756 {
5757 *errorcodeptr = ERR10;
5758 goto FAILED;
5759 }
5760
5761 /* If the character following a repeat is '+', possessive_quantifier is
5762 TRUE. For some opcodes, there are special alternative opcodes for this
5763 case. For anything else, we wrap the entire repeated item inside OP_ONCE
5764 brackets. Logically, the '+' notation is just syntactic sugar, taken from
5765 Sun's Java package, but the special opcodes can optimize it.
5766
5767 Some (but not all) possessively repeated subpatterns have already been
5768 completely handled in the code just above. For them, possessive_quantifier
5769 is always FALSE at this stage. Note that the repeated item starts at
5770 tempcode, not at previous, which might be the first part of a string whose
5771 (former) last char we repeated. */
5772
5773 if (possessive_quantifier)
5774 {
5775 int len;
5776
5777 /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
5778 However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
5779 {5,}, or {5,10}). We skip over an EXACT item; if the length of what
5780 remains is greater than zero, there's a further opcode that can be
5781 handled. If not, do nothing, leaving the EXACT alone. */
5782
5783 switch(*tempcode)
5784 {
5785 case OP_TYPEEXACT:
5786 tempcode += PRIV(OP_lengths)[*tempcode] +
5787 ((tempcode[1 + IMM2_SIZE] == OP_PROP
5788 || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
5789 break;
5790
5791 /* CHAR opcodes are used for exacts whose count is 1. */
5792
5793 case OP_CHAR:
5794 case OP_CHARI:
5795 case OP_NOT:
5796 case OP_NOTI:
5797 case OP_EXACT:
5798 case OP_EXACTI:
5799 case OP_NOTEXACT:
5800 case OP_NOTEXACTI:
5801 tempcode += PRIV(OP_lengths)[*tempcode];
5802 #ifdef SUPPORT_UNICODE
5803 if (utf && HAS_EXTRALEN(tempcode[-1]))
5804 tempcode += GET_EXTRALEN(tempcode[-1]);
5805 #endif
5806 break;
5807
5808 /* For the class opcodes, the repeat operator appears at the end;
5809 adjust tempcode to point to it. */
5810
5811 case OP_CLASS:
5812 case OP_NCLASS:
5813 tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
5814 break;
5815
5816 #ifdef SUPPORT_WIDE_CHARS
5817 case OP_XCLASS:
5818 tempcode += GET(tempcode, 1);
5819 break;
5820 #endif
5821 }
5822
5823 /* If tempcode is equal to code (which points to the end of the repeated
5824 item), it means we have skipped an EXACT item but there is no following
5825 QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
5826 all other cases, tempcode will be pointing to the repeat opcode, and will
5827 be less than code, so the value of len will be greater than 0. */
5828
5829 len = (int)(code - tempcode);
5830 if (len > 0)
5831 {
5832 unsigned int repcode = *tempcode;
5833
5834 /* There is a table for possessifying opcodes, all of which are less
5835 than OP_CALLOUT. A zero entry means there is no possessified version.
5836 */
5837
5838 if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
5839 *tempcode = opcode_possessify[repcode];
5840
5841 /* For opcode without a special possessified version, wrap the item in
5842 ONCE brackets. */
5843
5844 else
5845 {
5846 memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
5847 code += 1 + LINK_SIZE;
5848 len += 1 + LINK_SIZE;
5849 tempcode[0] = OP_ONCE;
5850 *code++ = OP_KET;
5851 PUTINC(code, 0, len);
5852 PUT(tempcode, 1, len);
5853 }
5854 }
5855 }
5856
5857 /* In all cases we no longer have a previous item. We also set the
5858 "follows varying string" flag for subsequently encountered reqcus if
5859 it isn't already set and we have just passed a varying length item. */
5860
5861 END_REPEAT:
5862 previous = NULL;
5863 cb->req_varyopt |= reqvary;
5864 break;
5865
5866
5867 /* ===================================================================*/
5868 /* Start of nested parenthesized sub-expression, or lookahead or lookbehind
5869 or option setting or condition or all the other extended parenthesis forms.
5870 We must save the current high-water-mark for the forward reference list so
5871 that we know where they start for this group. However, because the list may
5872 be extended when there are very many forward references (usually the result
5873 of a replicated inner group), we must use an offset rather than an absolute
5874 address. Note that (?# comments are dealt with at the top of the loop;
5875 they do not get this far. */
5876
5877 case CHAR_LEFT_PARENTHESIS:
5878 ptr++;
5879
5880 /* Deal with various "verbs" that can be introduced by '*'. */
5881
5882 if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
5883 || (MAX_255(ptr[1]) && ((cb->ctypes[ptr[1]] & ctype_letter) != 0))))
5884 {
5885 int i, namelen;
5886 int arglen = 0;
5887 const char *vn = verbnames;
5888 PCRE2_SPTR name = ptr + 1;
5889 PCRE2_SPTR arg = NULL;
5890 previous = NULL;
5891 ptr++;
5892
5893 /* Increment ptr, set namelen, check length */
5894
5895 READ_NAME(ctype_letter, ERR60, *errorcodeptr);
5896
5897 /* It appears that Perl allows any characters whatsoever, other than
5898 a closing parenthesis, to appear in arguments, so we no longer insist on
5899 letters, digits, and underscores. Perl does not, however, do any
5900 interpretation within arguments, and has no means of including a closing
5901 parenthesis. PCRE supports escape processing but only when it is
5902 requested by an option. Note that check_escape() will not return values
5903 greater than the code unit maximum when not in UTF mode. */
5904
5905 if (*ptr == CHAR_COLON)
5906 {
5907 arg = ++ptr;
5908
5909 if ((options & PCRE2_ALT_VERBNAMES) == 0)
5910 {
5911 arglen = 0;
5912 while (ptr < cb->end_pattern && *ptr != CHAR_RIGHT_PARENTHESIS)
5913 {
5914 ptr++; /* Check length as we go */
5915 arglen++; /* along, to avoid the */
5916 if ((unsigned int)arglen > MAX_MARK) /* possibility of overflow. */
5917 {
5918 *errorcodeptr = ERR76;
5919 goto FAILED;
5920 }
5921 }
5922 }
5923 else
5924 {
5925 /* The length check is in process_verb_name() */
5926 arglen = process_verb_name(&ptr, NULL, errorcodeptr, options,
5927 utf, cb);
5928 if (arglen < 0) goto FAILED;
5929 }
5930 }
5931
5932 if (*ptr != CHAR_RIGHT_PARENTHESIS)
5933 {
5934 *errorcodeptr = ERR60;
5935 goto FAILED;
5936 }
5937
5938 /* Scan the table of verb names */
5939
5940 for (i = 0; i < verbcount; i++)
5941 {
5942 if (namelen == verbs[i].len &&
5943 PRIV(strncmp_c8)(name, vn, namelen) == 0)
5944 {
5945 int setverb;
5946
5947 /* Check for open captures before ACCEPT and convert it to
5948 ASSERT_ACCEPT if in an assertion. */
5949
5950 if (verbs[i].op == OP_ACCEPT)
5951 {
5952 open_capitem *oc;
5953 if (arglen != 0)
5954 {
5955 *errorcodeptr = ERR59;
5956 goto FAILED;
5957 }
5958 cb->had_accept = TRUE;
5959
5960 /* In the first pass, just accumulate the length required;
5961 otherwise hitting (*ACCEPT) inside many nested parentheses can
5962 cause workspace overflow. */
5963
5964 for (oc = cb->open_caps; oc != NULL; oc = oc->next)
5965 {
5966 if (lengthptr != NULL)
5967 {
5968 *lengthptr += CU2BYTES(1) + IMM2_SIZE;
5969 }
5970 else
5971 {
5972 *code++ = OP_CLOSE;
5973 PUT2INC(code, 0, oc->number);
5974 }
5975 }
5976 setverb = *code++ =
5977 (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
5978
5979 /* Do not set firstcu after *ACCEPT */
5980 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5981 }
5982
5983 /* Handle other cases with/without an argument */
5984
5985 else if (arglen == 0) /* There is no argument */
5986 {
5987 if (verbs[i].op < 0) /* Argument is mandatory */
5988 {
5989 *errorcodeptr = ERR66;
5990 goto FAILED;
5991 }
5992 setverb = *code++ = verbs[i].op;
5993 }
5994
5995 else /* An argument is present */
5996 {
5997 if (verbs[i].op_arg < 0) /* Argument is forbidden */
5998 {
5999 *errorcodeptr = ERR59;
6000 goto FAILED;
6001 }
6002 setverb = *code++ = verbs[i].op_arg;
6003
6004 /* Arguments can be very long, especially in 16- and 32-bit modes,
6005 and can overflow the workspace in the first pass. Instead of
6006 putting the argument into memory, we just update the length counter
6007 and set up an empty argument. */
6008
6009 if (lengthptr != NULL)
6010 {
6011 *lengthptr += arglen;
6012 *code++ = 0;
6013 }
6014 else
6015 {
6016 *code++ = arglen;
6017 if ((options & PCRE2_ALT_VERBNAMES) != 0)
6018 {
6019 PCRE2_UCHAR *memcode = code; /* code is "register" */
6020 (void)process_verb_name(&arg, &memcode, errorcodeptr, options,
6021 utf, cb);
6022 code = memcode;
6023 }
6024 else /* No argument processing */
6025 {
6026 memcpy(code, arg, CU2BYTES(arglen));
6027 code += arglen;
6028 }
6029 }
6030
6031 *code++ = 0;
6032 }
6033
6034 switch (setverb)
6035 {
6036 case OP_THEN:
6037 case OP_THEN_ARG:
6038 cb->external_flags |= PCRE2_HASTHEN;
6039 break;
6040
6041 case OP_PRUNE:
6042 case OP_PRUNE_ARG:
6043 case OP_SKIP:
6044 case OP_SKIP_ARG:
6045 cb->had_pruneorskip = TRUE;
6046 break;
6047 }
6048
6049 break; /* Found verb, exit loop */
6050 }
6051
6052 vn += verbs[i].len + 1;
6053 }
6054
6055 if (i < verbcount) continue; /* Successfully handled a verb */
6056 *errorcodeptr = ERR60; /* Verb not recognized */
6057 goto FAILED;
6058 }
6059
6060 /* Initialization for "real" parentheses */
6061
6062 newoptions = options;
6063 skipunits = 0;
6064 bravalue = OP_CBRA;
6065 reset_bracount = FALSE;
6066
6067 /* Deal with the extended parentheses; all are introduced by '?', and the
6068 appearance of any of them means that this is not a capturing group. */
6069
6070 if (*ptr == CHAR_QUESTION_MARK)
6071 {
6072 int i, count;
6073 int namelen; /* Must be signed */
6074 uint32_t index;
6075 uint32_t set, unset, *optset;
6076 named_group *ng;
6077 PCRE2_SPTR name;
6078 PCRE2_UCHAR *slot;
6079
6080 switch (*(++ptr))
6081 {
6082 /* ------------------------------------------------------------ */
6083 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
6084 reset_bracount = TRUE;
6085 /* Fall through */
6086
6087 /* ------------------------------------------------------------ */
6088 case CHAR_COLON: /* Non-capturing bracket */
6089 bravalue = OP_BRA;
6090 ptr++;
6091 break;
6092
6093 /* ------------------------------------------------------------ */
6094 case CHAR_LEFT_PARENTHESIS:
6095 bravalue = OP_COND; /* Conditional group */
6096 tempptr = ptr;
6097
6098 /* A condition can be an assertion, a number (referring to a numbered
6099 group's having been set), a name (referring to a named group), or 'R',
6100 referring to recursion. R<digits> and R&name are also permitted for
6101 recursion tests.
6102
6103 There are ways of testing a named group: (?(name)) is used by Python;
6104 Perl 5.10 onwards uses (?(<name>) or (?('name')).
6105
6106 There is one unfortunate ambiguity, caused by history. 'R' can be the
6107 recursive thing or the name 'R' (and similarly for 'R' followed by
6108 digits). We look for a name first; if not found, we try the other case.
6109
6110 For compatibility with auto-callouts, we allow a callout to be
6111 specified before a condition that is an assertion. First, check for the
6112 syntax of a callout; if found, adjust the temporary pointer that is
6113 used to check for an assertion condition. That's all that is needed! */
6114
6115 if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
6116 {
6117 if (IS_DIGIT(ptr[3]) || ptr[3] == CHAR_RIGHT_PARENTHESIS)
6118 {
6119 for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
6120 if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
6121 tempptr += i + 1;
6122 }
6123 else
6124 {
6125 uint32_t delimiter = 0;
6126 for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
6127 {
6128 if (ptr[3] == PRIV(callout_start_delims)[i])
6129 {
6130 delimiter = PRIV(callout_end_delims)[i];
6131 break;
6132 }
6133 }
6134 if (delimiter != 0)
6135 {
6136 for (i = 4; ptr + i < cb->end_pattern; i++)
6137 {
6138 if (ptr[i] == delimiter)
6139 {
6140 if (ptr[i+1] == delimiter) i++;
6141 else
6142 {
6143 if (ptr[i+1] == CHAR_RIGHT_PARENTHESIS) tempptr += i + 2;
6144 break;
6145 }
6146 }
6147 }
6148 }
6149 }
6150
6151 /* tempptr should now be pointing to the opening parenthesis of the
6152 assertion condition. */
6153
6154 if (*tempptr != CHAR_LEFT_PARENTHESIS)
6155 {
6156 *errorcodeptr = ERR28;
6157 goto FAILED;
6158 }
6159 }
6160
6161 /* For conditions that are assertions, check the syntax, and then exit
6162 the switch. This will take control down to where bracketed groups
6163 are processed. The assertion will be handled as part of the group,
6164 but we need to identify this case because the conditional assertion may
6165 not be quantified. */
6166
6167 if (tempptr[1] == CHAR_QUESTION_MARK &&
6168 (tempptr[2] == CHAR_EQUALS_SIGN ||
6169 tempptr[2] == CHAR_EXCLAMATION_MARK ||
6170 (tempptr[2] == CHAR_LESS_THAN_SIGN &&
6171 (tempptr[3] == CHAR_EQUALS_SIGN ||
6172 tempptr[3] == CHAR_EXCLAMATION_MARK))))
6173 {
6174 cb->iscondassert = TRUE;
6175 break;
6176 }
6177
6178 /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6179 need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6180
6181 code[1+LINK_SIZE] = OP_CREF;
6182 skipunits = 1+IMM2_SIZE;
6183 refsign = -1; /* => not a number */
6184 namelen = -1; /* => not a name; must set to avoid warning */
6185 name = NULL; /* Always set to avoid warning */
6186 recno = 0; /* Always set to avoid warning */
6187
6188 /* Point at character after (?( */
6189
6190 ptr++;
6191
6192 /* Check for (?(VERSION[>]=n.m), which is a facility whereby indirect
6193 users of PCRE2 via an application can discover which release of PCRE2
6194 is being used. */
6195
6196 if (PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
6197 ptr[7] != CHAR_RIGHT_PARENTHESIS)
6198 {
6199 BOOL ge = FALSE;
6200 int major = 0;
6201 int minor = 0;
6202
6203 ptr += 7;
6204 if (*ptr == CHAR_GREATER_THAN_SIGN)
6205 {
6206 ge = TRUE;
6207 ptr++;
6208 }
6209
6210 /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
6211 references its argument twice. */
6212
6213 if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
6214 {
6215 *errorcodeptr = ERR79;
6216 goto FAILED;
6217 }
6218
6219 while (IS_DIGIT(*ptr)) major = major * 10 + *ptr++ - '0';
6220 if (*ptr == CHAR_DOT)
6221 {
6222 ptr++;
6223 while (IS_DIGIT(*ptr)) minor = minor * 10 + *ptr++ - '0';
6224 if (minor < 10) minor *= 10;
6225 }
6226
6227 if (*ptr != CHAR_RIGHT_PARENTHESIS || minor > 99)
6228 {
6229 *errorcodeptr = ERR79;
6230 goto FAILED;
6231 }
6232
6233 if (ge)
6234 code[1+LINK_SIZE] = ((PCRE2_MAJOR > major) ||
6235 (PCRE2_MAJOR == major && PCRE2_MINOR >= minor))?
6236 OP_TRUE : OP_FALSE;
6237 else
6238 code[1+LINK_SIZE] = (PCRE2_MAJOR == major && PCRE2_MINOR == minor)?
6239 OP_TRUE : OP_FALSE;
6240
6241 ptr++;
6242 skipunits = 1;
6243 break; /* End of condition processing */
6244 }
6245
6246 /* Check for a test for recursion in a named group. */
6247
6248 if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND)
6249 {
6250 terminator = -1;
6251 ptr += 2;
6252 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
6253 }
6254
6255 /* Check for a test for a named group's having been set, using the Perl
6256 syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6257 syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). */
6258
6259 else if (*ptr == CHAR_LESS_THAN_SIGN)
6260 {
6261 terminator = CHAR_GREATER_THAN_SIGN;
6262 ptr++;
6263 }
6264 else if (*ptr == CHAR_APOSTROPHE)
6265 {
6266 terminator = CHAR_APOSTROPHE;
6267 ptr++;
6268 }
6269 else
6270 {
6271 terminator = CHAR_NULL;
6272 if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++;
6273 else if (IS_DIGIT(*ptr)) refsign = 0;
6274 }
6275
6276 /* Handle a number */
6277
6278 if (refsign >= 0)
6279 {
6280 while (IS_DIGIT(*ptr))
6281 {
6282 if (recno > INT_MAX / 10 - 1) /* Integer overflow */
6283 {
6284 while (IS_DIGIT(*ptr)) ptr++;
6285 *errorcodeptr = ERR61;
6286 goto FAILED;
6287 }
6288 recno = recno * 10 + (int)(*ptr - CHAR_0);
6289 ptr++;
6290 }
6291 }
6292
6293 /* Otherwise we expect to read a name; anything else is an error. When
6294 the referenced name is one of a number of duplicates, a different
6295 opcode is used and it needs more memory. Unfortunately we cannot tell
6296 whether this is the case in the first pass, so we have to allow for
6297 more memory always. In the second pass, the addition to skipunits
6298 happens later. */
6299
6300 else
6301 {
6302 if (IS_DIGIT(*ptr))
6303 {
6304 *errorcodeptr = ERR44; /* Group name must start with non-digit */
6305 goto FAILED;
6306 }
6307 if (!MAX_255(*ptr) || (cb->ctypes[*ptr] & ctype_word) == 0)
6308 {
6309 *errorcodeptr = ERR28; /* Assertion expected */
6310 goto FAILED;
6311 }
6312 name = ptr;
6313 /* Increment ptr, set namelen, check length */
6314 READ_NAME(ctype_word, ERR48, *errorcodeptr);
6315 if (lengthptr != NULL) skipunits += IMM2_SIZE;
6316 }
6317
6318 /* Check the terminator */
6319
6320 if ((terminator > 0 && *ptr++ != (PCRE2_UCHAR)terminator) ||
6321 *ptr++ != CHAR_RIGHT_PARENTHESIS)
6322 {
6323 ptr--; /* Error offset */
6324 *errorcodeptr = ERR26; /* Malformed number or name */
6325 goto FAILED;
6326 }
6327
6328 /* Do no further checking in the pre-compile phase. */
6329
6330 if (lengthptr != NULL) break;
6331
6332 /* In the real compile we do the work of looking for the actual
6333 reference. If refsign is not negative, it means we have a number in
6334 recno. */
6335
6336 if (refsign >= 0)
6337 {
6338 if (recno <= 0)
6339 {
6340 *errorcodeptr = ERR35;
6341 goto FAILED;
6342 }
6343 if (refsign != 0) recno = (refsign == CHAR_MINUS)?
6344 (cb->bracount + 1) - recno : recno + cb->bracount;
6345 if (recno <= 0 || (uint32_t)recno > cb->final_bracount)
6346 {
6347 *errorcodeptr = ERR15;
6348 goto FAILED;
6349 }
6350 PUT2(code, 2+LINK_SIZE, recno);
6351 if ((uint32_t)recno > cb->top_backref) cb->top_backref = recno;
6352 break;
6353 }
6354
6355 /* Otherwise look for the name. */
6356
6357 slot = cb->name_table;
6358 for (i = 0; i < cb->names_found; i++)
6359 {
6360 if (PRIV(strncmp)(name, slot+IMM2_SIZE, namelen) == 0) break;
6361 slot += cb->name_entry_size;
6362 }
6363
6364 /* Found the named subpattern. If the name is duplicated, add one to
6365 the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6366 appropriate data values. Otherwise, just insert the unique subpattern
6367 number. */
6368
6369 if (i < cb->names_found)
6370 {
6371 int offset = i; /* Offset of first name found */
6372
6373 count = 0;
6374 for (;;)
6375 {
6376 recno = GET2(slot, 0); /* Number for last found */
6377 if ((uint32_t)recno > cb->top_backref) cb->top_backref = recno;
6378 count++;
6379 if (++i >= cb->names_found) break;
6380 slot += cb->name_entry_size;
6381 if (PRIV(strncmp)(name, slot+IMM2_SIZE, namelen) != 0 ||
6382 (slot+IMM2_SIZE)[namelen] != 0) break;
6383 }
6384
6385 if (count > 1)
6386 {
6387 PUT2(code, 2+LINK_SIZE, offset);
6388 PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6389 skipunits += IMM2_SIZE;
6390 code[1+LINK_SIZE]++;
6391 }
6392 else /* Not a duplicated name */
6393 {
6394 PUT2(code, 2+LINK_SIZE, recno);
6395 }
6396 }
6397
6398 /* If terminator == CHAR_NULL it means that the name followed directly
6399 after the opening parenthesis [e.g. (?(abc)...] and in this case there
6400 are some further alternatives to try. For the cases where terminator !=
6401 CHAR_NULL [things like (?(<name>... or (?('name')... or (?(R&name)... ]
6402 we have now checked all the possibilities, so give an error. */
6403
6404 else if (terminator != CHAR_NULL)
6405 {
6406 *errorcodeptr = ERR15;
6407 goto FAILED;
6408 }
6409
6410 /* Check for (?(R) for recursion. Allow digits after R to specify a
6411 specific group number. */
6412
6413 else if (*name == CHAR_R)
6414 {
6415 recno = 0;
6416 for (i = 1; i < namelen; i++)
6417 {
6418 if (!IS_DIGIT(name[i]))
6419 {
6420 *errorcodeptr = ERR15; /* Non-existent subpattern */
6421 goto FAILED;
6422 }
6423 if (recno > INT_MAX / 10 - 1) /* Integer overflow */
6424 {
6425 *errorcodeptr = ERR61;
6426 goto FAILED;
6427 }
6428 recno = recno * 10 + name[i] - CHAR_0;
6429 }
6430 if (recno == 0) recno = RREF_ANY;
6431 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
6432 PUT2(code, 2+LINK_SIZE, recno);
6433 }
6434
6435 /* Similarly, check for the (?(DEFINE) "condition", which is always
6436 false. During compilation we set OP_DEFINE to distinguish this from
6437 other OP_FALSE conditions so that it can be checked for having only one
6438 branch, but after that the opcode is changed to OP_FALSE. */
6439
6440 else if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
6441 {
6442 code[1+LINK_SIZE] = OP_DEFINE;
6443 skipunits = 1;
6444 }
6445
6446 /* Reference to an unidentified subpattern. */
6447
6448 else
6449 {
6450 *errorcodeptr = ERR15;
6451 goto FAILED;
6452 }
6453 break;
6454
6455
6456 /* ------------------------------------------------------------ */
6457 case CHAR_EQUALS_SIGN: /* Positive lookahead */
6458 bravalue = OP_ASSERT;
6459 cb->assert_depth += 1;
6460 ptr++;
6461 break;
6462
6463 /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6464 thing to do, but Perl allows all assertions to be quantified, and when
6465 they contain capturing parentheses there may be a potential use for
6466 this feature. Not that that applies to a quantified (?!) but we allow
6467 it for uniformity. */
6468
6469 /* ------------------------------------------------------------ */
6470 case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
6471 ptr++;
6472 if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[1] != CHAR_ASTERISK &&
6473 ptr[1] != CHAR_PLUS && ptr[1] != CHAR_QUESTION_MARK &&
6474 (ptr[1] != CHAR_LEFT_CURLY_BRACKET || !is_counted_repeat(ptr+2)))
6475 {
6476 *code++ = OP_FAIL;
6477 previous = NULL;
6478 continue;
6479 }
6480 bravalue = OP_ASSERT_NOT;
6481 cb->assert_depth += 1;
6482 break;
6483
6484
6485 /* ------------------------------------------------------------ */
6486 case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
6487 switch (ptr[1])
6488 {
6489 case CHAR_EQUALS_SIGN: /* Positive lookbehind */
6490 bravalue = OP_ASSERTBACK;
6491 cb->assert_depth += 1;
6492 ptr += 2;
6493 break;
6494
6495 case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
6496 bravalue = OP_ASSERTBACK_NOT;
6497 cb->assert_depth += 1;
6498 ptr += 2;
6499 break;
6500
6501 /* Must be a name definition - as the syntax was checked in the
6502 pre-pass, we can assume here that it is valid. Skip over the name
6503 and go to handle the numbered group. */
6504
6505 default:
6506 while (*(++ptr) != CHAR_GREATER_THAN_SIGN);
6507 ptr++;
6508 goto NUMBERED_GROUP;
6509 }
6510 break;
6511
6512
6513 /* ------------------------------------------------------------ */
6514 case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
6515 bravalue = OP_ONCE;
6516 ptr++;
6517 break;
6518
6519
6520 /* ------------------------------------------------------------ */
6521 case CHAR_C: /* Callout */
6522 previous_callout = code; /* Save for later completion */
6523 after_manual_callout = 1; /* Skip one item before completing */
6524 ptr++; /* Character after (?C */
6525
6526 /* A callout may have a string argument, delimited by one of a fixed
6527 number of characters, or an undelimited numerical argument, or no
6528 argument, which is the same as (?C0). Different opcodes are used for
6529 the two cases. */
6530
6531 if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
6532 {
6533 uint32_t delimiter = 0;
6534
6535 for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
6536 {
6537 if (*ptr == PRIV(callout_start_delims)[i])
6538 {
6539 delimiter = PRIV(callout_end_delims)[i];
6540 break;
6541 }
6542 }
6543
6544 if (delimiter == 0)
6545 {
6546 *errorcodeptr = ERR82;
6547 goto FAILED;
6548 }
6549
6550 /* During the pre-compile phase, we parse the string and update the
6551 length. There is no need to generate any code. (In fact, the string
6552 has already been parsed in the pre-pass that looks for named
6553 parentheses, but it does no harm to leave this code in.) */
6554
6555 if (lengthptr != NULL) /* Only check the string */
6556 {
6557 PCRE2_SPTR start = ptr;
6558 do
6559 {
6560 if (++ptr >= cb->end_pattern)
6561 {
6562 *errorcodeptr = ERR81;
6563 ptr = start; /* To give a more useful message */
6564 goto FAILED;
6565 }
6566 if (ptr[0] == delimiter && ptr[1] == delimiter) ptr += 2;
6567 }
6568 while (ptr[0] != delimiter);
6569
6570 /* Start points to the opening delimiter, ptr points to the
6571 closing delimiter. We must allow for including the delimiter and
6572 for the terminating zero. Any doubled delimiters within the string
6573 make this an overestimate, but it is not worth bothering about. */
6574
6575 (*lengthptr) += (ptr - start) + 2 + (1 + 4*LINK_SIZE);
6576 }
6577
6578 /* In the real compile we can copy the string, knowing that it is
6579 syntactically OK. The starting delimiter is included so that the
6580 client can discover it if they want. We also pass the start offset to
6581 help a script language give better error messages. */
6582
6583 else
6584 {
6585 PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
6586 *callout_string++ = *ptr++;
6587 PUT(code, 1 + 3*LINK_SIZE, (int)(ptr - cb->start_pattern)); /* Start offset */
6588 for(;;)
6589 {
6590 if (*ptr == delimiter)
6591 {
6592 if (ptr[1] == delimiter) ptr++; else break;
6593 }
6594 *callout_string++ = *ptr++;
6595 }
6596 *callout_string++ = CHAR_NULL;
6597 code[0] = OP_CALLOUT_STR;
6598 PUT(code, 1, (int)(ptr + 2 - cb->start_pattern)); /* Next offset */
6599 PUT(code, 1 + LINK_SIZE, 0); /* Default length */
6600 PUT(code, 1 + 2*LINK_SIZE, /* Compute size */
6601 (int)(callout_string - code));
6602 code = callout_string;
6603 }
6604
6605 /* Advance to what should be the closing parenthesis, which is
6606 checked below. */
6607
6608 ptr++;
6609 }
6610
6611 /* Handle a callout with an optional numerical argument, which must be
6612 less than or equal to 255. A missing argument gives 0. */
6613
6614 else
6615 {
6616 int n = 0;
6617 code[0] = OP_CALLOUT; /* Numerical callout */
6618 while (IS_DIGIT(*ptr))
6619 {
6620 n = n * 10 + *ptr++ - CHAR_0;
6621 if (n > 255)
6622 {
6623 *errorcodeptr = ERR38;
6624 goto FAILED;
6625 }
6626 }
6627 PUT(code, 1, (int)(ptr - cb->start_pattern + 1)); /* Next offset */
6628 PUT(code, 1 + LINK_SIZE, 0); /* Default length */
6629 code[1 + 2*LINK_SIZE] = n; /* Callout number */
6630 code += PRIV(OP_lengths)[OP_CALLOUT];
6631 }
6632
6633 /* Both formats must have a closing parenthesis */
6634
6635 if (*ptr != CHAR_RIGHT_PARENTHESIS)
6636 {
6637 *errorcodeptr = ERR39;
6638 goto FAILED;
6639 }
6640
6641 /* Callouts cannot be quantified. */
6642
6643 previous = NULL;
6644 continue;
6645
6646
6647 /* ------------------------------------------------------------ */
6648 case CHAR_P: /* Python-style named subpattern handling */
6649 if (*(++ptr) == CHAR_EQUALS_SIGN ||
6650 *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */
6651 {
6652 is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
6653 terminator = CHAR_RIGHT_PARENTHESIS;
6654 goto NAMED_REF_OR_RECURSE;
6655 }
6656 else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */
6657 {
6658 *errorcodeptr = ERR41;
6659 goto FAILED;
6660 }
6661 /* Fall through to handle (?P< as (?< is handled */
6662
6663
6664 /* ------------------------------------------------------------ */
6665 case CHAR_APOSTROPHE: /* Define a name - note fall through above */
6666
6667 /* The syntax was checked and the list of names was set up in the
6668 pre-pass, so there is nothing to be done now except to skip over the
6669 name. */
6670
6671 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
6672 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6673 while (*(++ptr) != (unsigned int)terminator);
6674 ptr++;
6675 goto NUMBERED_GROUP; /* Set up numbered group */
6676
6677
6678 /* ------------------------------------------------------------ */
6679 case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
6680 terminator = CHAR_RIGHT_PARENTHESIS;
6681 is_recurse = TRUE;
6682 /* Fall through */
6683
6684 /* We come here from the Python syntax above that handles both
6685 references (?P=name) and recursion (?P>name), as well as falling
6686 through from the Perl recursion syntax (?&name). We also come here from
6687 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
6688 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
6689
6690 NAMED_REF_OR_RECURSE:
6691 name = ++ptr;
6692 if (IS_DIGIT(*ptr))
6693 {
6694 *errorcodeptr = ERR44; /* Group name must start with non-digit */
6695 goto FAILED;
6696 }
6697 /* Increment ptr, set namelen, check length */
6698 READ_NAME(ctype_word, ERR48, *errorcodeptr);
6699
6700 /* In the pre-compile phase, do a syntax check. */
6701
6702 if (lengthptr != NULL)
6703 {
6704 if (namelen == 0)
6705 {
6706 *errorcodeptr = ERR62;
6707 goto FAILED;
6708 }
6709 if (*ptr != (PCRE2_UCHAR)terminator)
6710 {
6711 *errorcodeptr = ERR42;
6712 goto FAILED;
6713 }
6714 }
6715
6716 /* Scan the list of names generated in the pre-pass in order to get
6717 a number and whether or not this name is duplicated. */
6718
6719 recno = 0;
6720 is_dupname = FALSE;
6721 ng = cb->named_groups;
6722
6723 for (i = 0; i < cb->names_found; i++, ng++)
6724 {
6725 if (namelen == ng->length &&
6726 PRIV(strncmp)(name, ng->name, namelen) == 0)
6727 {
6728 open_capitem *oc;
6729 is_dupname = ng->isdup;
6730 recno = ng->number;
6731
6732 /* For a recursion, that's all that is needed. We can now go to the
6733 code that handles numerical recursion. */
6734
6735 if (is_recurse) goto HANDLE_RECURSION;
6736
6737 /* For a back reference, update the back reference map and the
6738 maximum back reference. Then for each group we must check to see if
6739 it is recursive, that is, it is inside the group that it
6740 references. A flag is set so that the group can be made atomic. */
6741
6742 cb->backref_map |= (recno < 32)? (1u << recno) : 1;
6743 if ((uint32_t)recno > cb->top_backref) cb->top_backref = recno;
6744
6745 for (oc = cb->open_caps; oc != NULL; oc = oc->next)
6746 {
6747 if (oc->number == recno)
6748 {
6749 oc->flag = TRUE;
6750 break;
6751 }
6752 }
6753 }
6754 }
6755
6756 /* If the name was not found we have a bad reference. */
6757
6758 if (recno == 0)
6759 {
6760 *errorcodeptr = ERR15;
6761 goto FAILED;
6762 }
6763
6764 /* If a back reference name is not duplicated, we can handle it as a
6765 numerical reference. */
6766
6767 if (!is_dupname) goto HANDLE_REFERENCE;
6768
6769 /* If a back reference name is duplicated, we generate a different
6770 opcode to a numerical back reference. In the second pass we must search
6771 for the index and count in the final name table. */
6772
6773 count = 0;
6774 index = 0;
6775
6776 if (lengthptr == NULL)
6777 {
6778 slot = cb->name_table;
6779 for (i = 0; i < cb->names_found; i++)
6780 {
6781 if (PRIV(strncmp)(name, slot+IMM2_SIZE, namelen) == 0 &&
6782 slot[IMM2_SIZE+namelen] == 0)
6783 {
6784 if (count == 0) index = i;
6785 count++;
6786 }
6787 slot += cb->name_entry_size;
6788 }
6789
6790 if (count == 0)
6791 {
6792 *errorcodeptr = ERR15;
6793 goto FAILED;
6794 }
6795 }
6796
6797 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6798 previous = code;
6799 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
6800 PUT2INC(code, 0, index);
6801 PUT2INC(code, 0, count);
6802 continue; /* End of back ref handling */
6803
6804
6805 /* ------------------------------------------------------------ */
6806 case CHAR_R: /* Recursion, same as (?0) */
6807 recno = 0;
6808 if (*(++ptr) != CHAR_RIGHT_PARENTHESIS)
6809 {
6810 *errorcodeptr = ERR29;
6811 goto FAILED;
6812 }
6813 goto HANDLE_RECURSION;
6814
6815
6816 /* ------------------------------------------------------------ */
6817 case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */
6818 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
6819 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
6820 {
6821 terminator = CHAR_RIGHT_PARENTHESIS;
6822
6823 /* Come here from the \g<...> and \g'...' code (Oniguruma
6824 compatibility). However, the syntax has been checked to ensure that
6825 the ... are a (signed) number, so that neither ERR63 nor ERR29 will
6826 be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
6827 ever be taken. */
6828
6829 HANDLE_NUMERICAL_RECURSION:
6830
6831 if ((refsign = *ptr) == CHAR_PLUS)
6832 {
6833 ptr++;
6834 if (!IS_DIGIT(*ptr))
6835 {
6836 *errorcodeptr = ERR63;
6837 goto FAILED;
6838 }
6839 }
6840 else if (refsign == CHAR_MINUS)
6841 {
6842 if (!IS_DIGIT(ptr[1]))
6843 goto OTHER_CHAR_AFTER_QUERY;
6844 ptr++;
6845 }
6846
6847 recno = 0;
6848 while (IS_DIGIT(*ptr))
6849 {
6850 if (recno > INT_MAX / 10 - 1) /* Integer overflow */
6851 {
6852 while (IS_DIGIT(*ptr)) ptr++;
6853 *errorcodeptr = ERR61;
6854 goto FAILED;
6855 }
6856 recno = recno * 10 + *ptr++ - CHAR_0;
6857 }
6858
6859 if (*ptr != (PCRE2_UCHAR)terminator)
6860 {
6861 *errorcodeptr = ERR29;
6862 goto FAILED;
6863 }
6864
6865 if (refsign == CHAR_MINUS)
6866 {
6867 if (recno == 0)
6868 {
6869 *errorcodeptr = ERR58;
6870 goto FAILED;
6871 }
6872 recno = (int)(cb->bracount + 1) - recno;
6873 if (recno <= 0)
6874 {
6875 *errorcodeptr = ERR15;
6876 goto FAILED;
6877 }
6878 }
6879 else if (refsign == CHAR_PLUS)
6880 {
6881 if (recno == 0)
6882 {
6883 *errorcodeptr = ERR58;
6884 goto FAILED;
6885 }
6886 recno += cb->bracount;
6887 }
6888
6889 if ((uint32_t)recno > cb->final_bracount)
6890 {
6891 *errorcodeptr = ERR15;
6892 goto FAILED;
6893 }
6894
6895 /* Come here from code above that handles a named recursion.
6896 We insert the number of the called group after OP_RECURSE. At the
6897 end of compiling the pattern is scanned and these numbers are
6898 replaced by offsets within the pattern. It is done like this to avoid
6899 problems with forward references and adjusting offsets when groups
6900 are duplicated and moved (as discovered in previous implementations).
6901 Note that a recursion does not have a set first character (relevant
6902 if it is repeated, because it will then be wrapped with ONCE
6903 brackets). */
6904
6905 HANDLE_RECURSION:
6906 previous = code;
6907 *code = OP_RECURSE;
6908 PUT(code, 1, recno);
6909 code += 1 + LINK_SIZE;
6910 groupsetfirstcu = FALSE;
6911 cb->had_recurse = TRUE;
6912 }
6913
6914 /* Can't determine a first byte now */
6915
6916 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6917 continue;
6918
6919
6920 /* ------------------------------------------------------------ */
6921 default: /* Other characters: check option setting */
6922 OTHER_CHAR_AFTER_QUERY:
6923 set = unset = 0;
6924 optset = &set;
6925
6926 while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
6927 {
6928 switch (*ptr++)
6929 {
6930 case CHAR_MINUS: optset = &unset; break;
6931
6932 case CHAR_J: /* Record that it changed in the external options */
6933 *optset |= PCRE2_DUPNAMES;
6934 cb->external_flags |= PCRE2_JCHANGED;
6935 break;
6936
6937 case CHAR_i: *optset |= PCRE2_CASELESS; break;
6938 case CHAR_m: *optset |= PCRE2_MULTILINE; break;
6939 case CHAR_s: *optset |= PCRE2_DOTALL; break;
6940 case CHAR_x: *optset |= PCRE2_EXTENDED; break;
6941 case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
6942
6943 default: *errorcodeptr = ERR11;
6944 ptr--; /* Correct the offset */
6945 goto FAILED;
6946 }
6947 }
6948
6949 /* Set up the changed option bits, but don't change anything yet. */
6950
6951 newoptions = (options | set) & (~unset);
6952
6953 /* If the options ended with ')' this is not the start of a nested
6954 group with option changes, so the options change at this level. They
6955 must also be passed back for use in subsequent branches. Reset the
6956 greedy defaults and the case value for firstcu and reqcu. */
6957
6958 if (*ptr == CHAR_RIGHT_PARENTHESIS)
6959 {
6960 *optionsptr = options = newoptions;
6961 greedy_default = ((newoptions & PCRE2_UNGREEDY) != 0);
6962 greedy_non_default = greedy_default ^ 1;
6963 req_caseopt = ((newoptions & PCRE2_CASELESS) != 0)? REQ_CASELESS:0;
6964 previous = NULL; /* This item can't be repeated */
6965 continue; /* It is complete */
6966 }
6967
6968 /* If the options ended with ':' we are heading into a nested group
6969 with possible change of options. Such groups are non-capturing and are
6970 not assertions of any kind. All we need to do is skip over the ':';
6971 the newoptions value is handled below. */
6972
6973 bravalue = OP_BRA;
6974 ptr++;
6975 } /* End of switch for character following (? */
6976 } /* End of (? handling */
6977
6978 /* Opening parenthesis not followed by '*' or '?'. If PCRE2_NO_AUTO_CAPTURE
6979 is set, all unadorned brackets become non-capturing and behave like (?:...)
6980 brackets. */
6981
6982 else if ((options & PCRE2_NO_AUTO_CAPTURE) != 0)
6983 {
6984 bravalue = OP_BRA;
6985 }
6986
6987 /* Else we have a capturing group. */
6988
6989 else
6990 {
6991 NUMBERED_GROUP:
6992 cb->bracount += 1;
6993 PUT2(code, 1+LINK_SIZE, cb->bracount);
6994 skipunits = IMM2_SIZE;
6995 }
6996
6997 /* Process nested bracketed regex. First check for parentheses nested too
6998 deeply. */
6999
7000 if ((cb->parens_depth += 1) > (int)(cb->cx->parens_nest_limit))
7001 {
7002 *errorcodeptr = ERR19;
7003 goto FAILED;
7004 }
7005
7006 /* All assertions used not to be repeatable, but this was changed for Perl
7007 compatibility. All kinds can now be repeated except for assertions that are
7008 conditions (Perl also forbids these to be repeated). We copy code into a
7009 non-register variable (tempcode) in order to be able to pass its address
7010 because some compilers complain otherwise. At the start of a conditional
7011 group whose condition is an assertion, cb->iscondassert is set. We unset it
7012 here so as to allow assertions later in the group to be quantified. */
7013
7014 if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT &&
7015 cb->iscondassert)
7016 {
7017 previous = NULL;
7018 cb->iscondassert = FALSE;
7019 }
7020 else
7021 {
7022 previous = code;
7023 }
7024
7025 *code = bravalue;
7026 tempcode = code;
7027 tempreqvary = cb->req_varyopt; /* Save value before bracket */
7028 tempbracount = cb->bracount; /* Save value before bracket */
7029 length_prevgroup = 0; /* Initialize for pre-compile phase */
7030
7031 if (!compile_regex(
7032 newoptions, /* The complete new option state */
7033 &tempcode, /* Where to put code (updated) */
7034 &ptr, /* Input pointer (updated) */
7035 errorcodeptr, /* Where to put an error message */
7036 (bravalue == OP_ASSERTBACK ||
7037 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
7038 reset_bracount, /* True if (?| group */
7039 skipunits, /* Skip over bracket number */
7040 cond_depth +
7041 ((bravalue == OP_COND)?1:0), /* Depth of condition subpatterns */
7042 &subfirstcu, /* For possible first char */
7043 &subfirstcuflags,
7044 &subreqcu, /* For possible last char */
7045 &subreqcuflags,
7046 bcptr, /* Current branch chain */
7047 cb, /* Compile data block */
7048 (lengthptr == NULL)? NULL : /* Actual compile phase */
7049 &length_prevgroup /* Pre-compile phase */
7050 ))
7051 goto FAILED;
7052
7053 cb->parens_depth -= 1;
7054
7055 /* If this was an atomic group and there are no capturing groups within it,
7056 generate OP_ONCE_NC instead of OP_ONCE. */
7057
7058 if (bravalue == OP_ONCE && cb->bracount <= tempbracount)
7059 *code = OP_ONCE_NC;
7060
7061 if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
7062 cb->assert_depth -= 1;
7063
7064 /* At the end of compiling, code is still pointing to the start of the
7065 group, while tempcode has been updated to point past the end of the group.
7066 The pattern pointer (ptr) is on the bracket.
7067
7068 If this is a conditional bracket, check that there are no more than
7069 two branches in the group, or just one if it's a DEFINE group. We do this
7070 in the real compile phase, not in the pre-pass, where the whole group may
7071 not be available. */
7072
7073 if (bravalue == OP_COND && lengthptr == NULL)
7074 {
7075 PCRE2_UCHAR *tc = code;
7076 int condcount = 0;
7077
7078 do {
7079 condcount++;
7080 tc += GET(tc,1);
7081 }
7082 while (*tc != OP_KET);
7083
7084 /* A DEFINE group is never obeyed inline (the "condition" is always
7085 false). It must have only one branch. Having checked this, change the
7086 opcode to OP_FALSE. */
7087
7088 if (code[LINK_SIZE+1] == OP_DEFINE)
7089 {
7090 if (condcount > 1)
7091 {
7092 *errorcodeptr = ERR54;
7093 goto FAILED;
7094 }
7095 code[LINK_SIZE+1] = OP_FALSE;
7096 bravalue = OP_DEFINE; /* Just a flag to suppress char handling below */
7097 }
7098
7099 /* A "normal" conditional group. If there is just one branch, we must not
7100 make use of its firstcu or reqcu, because this is equivalent to an
7101 empty second branch. */
7102
7103 else
7104 {
7105 if (condcount > 2)
7106 {
7107 *errorcodeptr = ERR27;
7108 goto FAILED;
7109 }
7110 if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
7111 }
7112 }
7113
7114 /* At the end of a group, it's an error if we hit end of pattern or
7115 any non-closing parenthesis. This check also happens in the pre-scan,
7116 so should not trigger here, but leave this code as an insurance. */
7117
7118 if (*ptr != CHAR_RIGHT_PARENTHESIS)
7119 {
7120 *errorcodeptr = ERR14;
7121 goto FAILED;
7122 }
7123
7124 /* In the pre-compile phase, update the length by the length of the group,
7125 less the brackets at either end. Then reduce the compiled code to just a
7126 set of non-capturing brackets so that it doesn't use much memory if it is
7127 duplicated by a quantifier.*/
7128
7129 if (lengthptr != NULL)
7130 {
7131 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
7132 {
7133 *errorcodeptr = ERR20;
7134 goto FAILED;
7135 }
7136 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
7137 code++; /* This already contains bravalue */
7138 PUTINC(code, 0, 1 + LINK_SIZE);
7139 *code++ = OP_KET;
7140 PUTINC(code, 0, 1 + LINK_SIZE);
7141 break; /* No need to waste time with special character handling */
7142 }
7143
7144 /* Otherwise update the main code pointer to the end of the group. */
7145
7146 code = tempcode;
7147
7148 /* For a DEFINE group, required and first character settings are not
7149 relevant. */
7150
7151 if (bravalue == OP_DEFINE) break;
7152
7153 /* Handle updating of the required and first characters for other types of
7154 group. Update for normal brackets of all kinds, and conditions with two
7155 branches (see code above). If the bracket is followed by a quantifier with
7156 zero repeat, we have to back off. Hence the definition of zeroreqcu and
7157 zerofirstcu outside the main loop so that they can be accessed for the
7158 back off. */
7159
7160 zeroreqcu = reqcu;
7161 zeroreqcuflags = reqcuflags;
7162 zerofirstcu = firstcu;
7163 zerofirstcuflags = firstcuflags;
7164 groupsetfirstcu = FALSE;
7165
7166 if (bravalue >= OP_ONCE)
7167 {
7168 /* If we have not yet set a firstcu in this branch, take it from the
7169 subpattern, remembering that it was set here so that a repeat of more
7170 than one can replicate it as reqcu if necessary. If the subpattern has
7171 no firstcu, set "none" for the whole branch. In both cases, a zero
7172 repeat forces firstcu to "none". */
7173
7174 if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
7175 {
7176 if (subfirstcuflags >= 0)
7177 {
7178 firstcu = subfirstcu;
7179 firstcuflags = subfirstcuflags;
7180 groupsetfirstcu = TRUE;
7181 }
7182 else firstcuflags = REQ_NONE;
7183 zerofirstcuflags = REQ_NONE;
7184 }
7185
7186 /* If firstcu was previously set, convert the subpattern's firstcu
7187 into reqcu if there wasn't one, using the vary flag that was in
7188 existence beforehand. */
7189
7190 else if (subfirstcuflags >= 0 && subreqcuflags < 0)
7191 {
7192 subreqcu = subfirstcu;
7193 subreqcuflags = subfirstcuflags | tempreqvary;
7194 }
7195
7196 /* If the subpattern set a required byte (or set a first byte that isn't
7197 really the first byte - see above), set it. */
7198
7199 if (subreqcuflags >= 0)
7200 {
7201 reqcu = subreqcu;
7202 reqcuflags = subreqcuflags;
7203 }
7204 }
7205
7206 /* For a forward assertion, we take the reqcu, if set. This can be
7207 helpful if the pattern that follows the assertion doesn't set a different
7208 char. For example, it's useful for /(?=abcde).+/. We can't set firstcu
7209 for an assertion, however because it leads to incorrect effect for patterns
7210 such as /(?=a)a.+/ when the "real" "a" would then become a reqcu instead
7211 of a firstcu. This is overcome by a scan at the end if there's no
7212 firstcu, looking for an asserted first char. */
7213
7214 else if (bravalue == OP_ASSERT && subreqcuflags >= 0)
7215 {
7216 reqcu = subreqcu;
7217 reqcuflags = subreqcuflags;
7218 }
7219 break; /* End of processing '(' */
7220
7221
7222 /* ===================================================================*/
7223 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
7224 are arranged to be the negation of the corresponding OP_values in the
7225 default case when PCRE2_UCP is not set. For the back references, the values
7226 are negative the reference number. Only back references and those types
7227 that consume a character may be repeated. We can test for values between
7228 ESC_b and ESC_Z for the latter; this may have to change if any new ones are
7229 ever created.
7230
7231 Note: \Q and \E are handled at the start of the character-processing loop,
7232 not here. */
7233
7234 case CHAR_BACKSLASH:
7235 tempptr = ptr;
7236 escape = PRIV(check_escape)(&ptr, cb->end_pattern, &ec, errorcodeptr,
7237 options, FALSE, cb);
7238 if (*errorcodeptr != 0) goto FAILED;
7239
7240 if (escape == 0) /* The escape coded a single character */
7241 c = ec;
7242 else
7243 {
7244 /* For metasequences that actually match a character, we disable the
7245 setting of a first character if it hasn't already been set. */
7246
7247 if (firstcuflags == REQ_UNSET && escape > ESC_b && escape < ESC_Z)
7248 firstcuflags = REQ_NONE;
7249
7250 /* Set values to reset to if this is followed by a zero repeat. */
7251
7252 zerofirstcu = firstcu;
7253 zerofirstcuflags = firstcuflags;
7254 zeroreqcu = reqcu;
7255 zeroreqcuflags = reqcuflags;
7256
7257 /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
7258 is a subroutine call by number (Oniguruma syntax). In fact, the value
7259 ESC_g is returned only for these cases. So we don't need to check for <
7260 or ' if the value is ESC_g. For the Perl syntax \g{n} the value is
7261 -n, and for the Perl syntax \g{name} the result is ESC_k (as
7262 that is a synonym for a named back reference). */
7263
7264 if (escape == ESC_g)
7265 {
7266 PCRE2_SPTR p;
7267 uint32_t cf;
7268
7269 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
7270 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7271
7272 /* These two statements stop the compiler from warning about possibly
7273 unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
7274 fact, because we do the check for a number below, the paths that
7275 would actually be in error are never taken. */
7276
7277 skipunits = 0;
7278 reset_bracount = FALSE;
7279
7280 /* If it's not a signed or unsigned number, treat it as a name. */
7281
7282 cf = ptr[1];
7283 if (cf != CHAR_PLUS && cf != CHAR_MINUS && !IS_DIGIT(cf))
7284 {
7285 is_recurse = TRUE;
7286 goto NAMED_REF_OR_RECURSE;
7287 }
7288
7289 /* Signed or unsigned number (cf = ptr[1]) is known to be plus or minus
7290 or a digit. */
7291
7292 p = ptr + 2;
7293 while (IS_DIGIT(*p)) p++;
7294 if (*p != (PCRE2_UCHAR)terminator)
7295 {
7296 *errorcodeptr = ERR57;
7297 goto FAILED;
7298 }
7299 ptr++;
7300 goto HANDLE_NUMERICAL_RECURSION;
7301 }
7302
7303 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
7304 We also support \k{name} (.NET syntax). */
7305
7306 if (escape == ESC_k)
7307 {
7308 if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
7309 ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
7310 {
7311 *errorcodeptr = ERR69;
7312 goto FAILED;
7313 }
7314 is_recurse = FALSE;
7315 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
7316 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
7317 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
7318 goto NAMED_REF_OR_RECURSE;
7319 }
7320
7321 /* Back references are handled specially; must disable firstcu if
7322 not set to cope with cases like (?=(\w+))\1: which would otherwise set
7323 ':' later. */
7324
7325 if (escape < 0)
7326 {
7327 open_capitem *oc;
7328 recno = -escape;
7329
7330 /* Come here from named backref handling when the reference is to a
7331 single group (i.e. not to a duplicated name). */
7332
7333 HANDLE_REFERENCE:
7334 if (recno > (int)cb->final_bracount)
7335 {
7336 *errorcodeptr = ERR15;
7337 goto FAILED;
7338 }
7339 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7340 previous = code;
7341 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
7342 PUT2INC(code, 0, recno);
7343 cb->backref_map |= (recno < 32)? (1u << recno) : 1;
7344 if ((uint32_t)recno > cb->top_backref) cb->top_backref = recno;
7345
7346 /* Check to see if this back reference is recursive, that is, it
7347 is inside the group that it references. A flag is set so that the
7348 group can be made atomic. */
7349
7350 for (oc = cb->open_caps; oc != NULL; oc = oc->next)
7351 {
7352 if (oc->number == recno)
7353 {
7354 oc->flag = TRUE;
7355 break;
7356 }
7357 }
7358 }
7359
7360 /* So are Unicode property matches, if supported. */
7361
7362 #ifdef SUPPORT_UNICODE
7363 else if (escape == ESC_P || escape == ESC_p)
7364 {
7365 BOOL negated;
7366 unsigned int ptype = 0, pdata = 0;
7367 if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr, cb))
7368 goto FAILED;
7369 previous = code;
7370 *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
7371 *code++ = ptype;
7372 *code++ = pdata;
7373 }
7374 #else
7375
7376 /* If Unicode properties are not supported, \X, \P, and \p are not
7377 allowed. */
7378
7379 else if (escape == ESC_X || escape == ESC_P || escape == ESC_p)
7380 {
7381 *errorcodeptr = ERR45;
7382 goto FAILED;
7383 }
7384 #endif
7385
7386 /* The use of \C can be locked out. */
7387
7388 #ifdef NEVER_BACKSLASH_C
7389 else if (escape == ESC_C)
7390 {
7391 *errorcodeptr = ERR85;
7392 goto FAILED;
7393 }
7394 #else
7395 else if (escape == ESC_C && (options & PCRE2_NEVER_BACKSLASH_C) != 0)
7396 {
7397 *errorcodeptr = ERR83;
7398 goto FAILED;
7399 }
7400 #endif
7401
7402 /* For the rest (including \X when Unicode properties are supported), we
7403 can obtain the OP value by negating the escape value in the default
7404 situation when PCRE2_UCP is not set. When it *is* set, we substitute
7405 Unicode property tests. Note that \b and \B do a one-character
7406 lookbehind, and \A also behaves as if it does. */
7407
7408 else
7409 {
7410 if (escape == ESC_C) cb->external_flags |= PCRE2_HASBKC; /* Record */
7411 if ((escape == ESC_b || escape == ESC_B || escape == ESC_A) &&
7412 cb->max_lookbehind == 0)
7413 cb->max_lookbehind = 1;
7414 #ifdef SUPPORT_UNICODE
7415 if (escape >= ESC_DU && escape <= ESC_wu)
7416 {
7417 cb->nestptr[1] = cb->nestptr[0]; /* Back up if at 2nd level */
7418 cb->nestptr[0] = ptr + 1; /* Where to resume */
7419 ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
7420 }
7421 else
7422 #endif
7423 /* In non-UTF mode, and for both 32-bit modes, we turn \C into
7424 OP_ALLANY instead of OP_ANYBYTE so that it works in DFA mode and in
7425 lookbehinds. */
7426
7427 {
7428 previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
7429 #if PCRE2_CODE_UNIT_WIDTH == 32
7430 *code++ = (escape == ESC_C)? OP_ALLANY : escape;
7431 #else
7432 *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
7433 #endif
7434 }
7435 }
7436 continue;
7437 }
7438
7439 /* We have a data character whose value is in c. In UTF-8 mode it may have
7440 a value > 127. We set its representation in the length/buffer, and then
7441 handle it as a data character. */
7442
7443 mclength = PUTCHAR(c, mcbuffer);
7444 goto ONE_CHAR;
7445
7446
7447 /* ===================================================================*/
7448 /* Handle a literal character. It is guaranteed not to be whitespace or #
7449 when the extended flag is set. If we are in a UTF mode, it may be a
7450 multi-unit literal character. */
7451
7452 default:
7453 NORMAL_CHAR:
7454 mclength = 1;
7455 mcbuffer[0] = c;
7456
7457 #ifdef SUPPORT_UNICODE
7458 if (utf && HAS_EXTRALEN(c))
7459 ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
7460 #endif
7461
7462 /* At this point we have the character's bytes in mcbuffer, and the length
7463 in mclength. When not in UTF mode, the length is always 1. */
7464
7465 ONE_CHAR:
7466 previous = code;
7467
7468 /* For caseless UTF mode, check whether this character has more than one
7469 other case. If so, generate a special OP_PROP item instead of OP_CHARI. */
7470
7471 #ifdef SUPPORT_UNICODE
7472 if (utf && (options & PCRE2_CASELESS) != 0)
7473 {
7474 GETCHAR(c, mcbuffer);
7475 if ((c = UCD_CASESET(c)) != 0)
7476 {
7477 *code++ = OP_PROP;
7478 *code++ = PT_CLIST;
7479 *code++ = c;
7480 if (firstcuflags == REQ_UNSET)
7481 firstcuflags = zerofirstcuflags = REQ_NONE;
7482 break;
7483 }
7484 }
7485 #endif
7486
7487 /* Caseful matches, or not one of the multicase characters. */
7488
7489 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
7490 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
7491
7492 /* Remember if \r or \n were seen */
7493
7494 if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
7495 cb->external_flags |= PCRE2_HASCRORLF;
7496
7497 /* Set the first and required bytes appropriately. If no previous first
7498 byte, set it from this character, but revert to none on a zero repeat.
7499 Otherwise, leave the firstcu value alone, and don't change it on a zero
7500 repeat. */
7501
7502 if (firstcuflags == REQ_UNSET)
7503 {
7504 zerofirstcuflags = REQ_NONE;
7505 zeroreqcu = reqcu;
7506 zeroreqcuflags = reqcuflags;
7507
7508 /* If the character is more than one byte long, we can set firstcu
7509 only if it is not to be matched caselessly. */
7510
7511 if (mclength == 1 || req_caseopt == 0)
7512 {
7513 firstcu = mcbuffer[0] | req_caseopt;
7514 firstcu = mcbuffer[0];
7515 firstcuflags = req_caseopt;
7516
7517 if (mclength != 1)
7518 {
7519 reqcu = code[-1];
7520 reqcuflags = cb->req_varyopt;
7521 }
7522 }
7523 else firstcuflags = reqcuflags = REQ_NONE;
7524 }
7525
7526 /* firstcu was previously set; we can set reqcu only if the length is
7527 1 or the matching is caseful. */
7528
7529 else
7530 {
7531 zerofirstcu = firstcu;
7532 zerofirstcuflags = firstcuflags;
7533 zeroreqcu = reqcu;
7534 zeroreqcuflags = reqcuflags;
7535 if (mclength == 1 || req_caseopt == 0)
7536 {
7537 reqcu = code[-1];
7538 reqcuflags = req_caseopt | cb->req_varyopt;
7539 }
7540 }
7541
7542 break; /* End of literal character handling */
7543 }
7544 } /* end of big loop */
7545
7546 /* Control never reaches here by falling through, only by a goto for all the
7547 error states. Pass back the position in the pattern so that it can be displayed
7548 to the user for diagnosing the error. */
7549
7550 FAILED:
7551 *ptrptr = ptr;
7552 return FALSE;
7553 }
7554
7555
7556
7557 /*************************************************
7558 * Compile regex: a sequence of alternatives *
7559 *************************************************/
7560
7561 /* On entry, ptr is pointing past the bracket character, but on return it
7562 points to the closing bracket, or vertical bar, or end of string. The code
7563 variable is pointing at the byte into which the BRA operator has been stored.
7564 This function is used during the pre-compile phase when we are trying to find
7565 out the amount of memory needed, as well as during the real compile phase. The
7566 value of lengthptr distinguishes the two phases.
7567
7568 Arguments:
7569 options option bits, including any changes for this subpattern
7570 codeptr -> the address of the current code pointer
7571 ptrptr -> the address of the current pattern pointer
7572 errorcodeptr -> pointer to error code variable
7573 lookbehind TRUE if this is a lookbehind assertion
7574 reset_bracount TRUE to reset the count for each branch
7575 skipunits skip this many code units at start (for brackets and OP_COND)
7576 cond_depth depth of nesting for conditional subpatterns
7577 firstcuptr place to put the first required code unit
7578 firstcuflagsptr place to put the first code unit flags, or a negative number
7579 reqcuptr place to put the last required code unit
7580 reqcuflagsptr place to put the last required code unit flags, or a negative number
7581 bcptr pointer to the chain of currently open branches
7582 cb points to the data block with tables pointers etc.
7583 lengthptr NULL during the real compile phase
7584 points to length accumulator during pre-compile phase
7585
7586 Returns: TRUE on success
7587 */
7588
7589 static BOOL
compile_regex(uint32_t options,PCRE2_UCHAR ** codeptr,PCRE2_SPTR * ptrptr,int * errorcodeptr,BOOL lookbehind,BOOL reset_bracount,uint32_t skipunits,int cond_depth,uint32_t * firstcuptr,int32_t * firstcuflagsptr,uint32_t * reqcuptr,int32_t * reqcuflagsptr,branch_chain * bcptr,compile_block * cb,size_t * lengthptr)7590 compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, PCRE2_SPTR *ptrptr,
7591 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, uint32_t skipunits,
7592 int cond_depth, uint32_t *firstcuptr, int32_t *firstcuflagsptr,
7593 uint32_t *reqcuptr, int32_t *reqcuflagsptr, branch_chain *bcptr,
7594 compile_block *cb, size_t *lengthptr)
7595 {
7596 PCRE2_SPTR ptr = *ptrptr;
7597 PCRE2_UCHAR *code = *codeptr;
7598 PCRE2_UCHAR *last_branch = code;
7599 PCRE2_UCHAR *start_bracket = code;
7600 PCRE2_UCHAR *reverse_count = NULL;
7601 open_capitem capitem;
7602 int capnumber = 0;
7603 uint32_t firstcu, reqcu;
7604 int32_t firstcuflags, reqcuflags;
7605 uint32_t branchfirstcu, branchreqcu;
7606 int32_t branchfirstcuflags, branchreqcuflags;
7607 size_t length;
7608 unsigned int orig_bracount;
7609 unsigned int max_bracount;
7610 branch_chain bc;
7611
7612 /* If set, call the external function that checks for stack availability. */
7613
7614 if (cb->cx->stack_guard != NULL &&
7615 cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
7616 {
7617 *errorcodeptr= ERR33;
7618 return FALSE;
7619 }
7620
7621 /* Miscellaneous initialization */
7622
7623 bc.outer = bcptr;
7624 bc.current_branch = code;
7625
7626 firstcu = reqcu = 0;
7627 firstcuflags = reqcuflags = REQ_UNSET;
7628
7629 /* Accumulate the length for use in the pre-compile phase. Start with the
7630 length of the BRA and KET and any extra code units that are required at the
7631 beginning. We accumulate in a local variable to save frequent testing of
7632 lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
7633 start and end of each alternative, because compiled items are discarded during
7634 the pre-compile phase so that the work space is not exceeded. */
7635
7636 length = 2 + 2*LINK_SIZE + skipunits;
7637
7638 /* WARNING: If the above line is changed for any reason, you must also change
7639 the code that abstracts option settings at the start of the pattern and makes
7640 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
7641 pre-compile phase to find out whether or not anything has yet been compiled.
7642
7643 If this is a capturing subpattern, add to the chain of open capturing items
7644 so that we can detect them if (*ACCEPT) is encountered. This is also used to
7645 detect groups that contain recursive back references to themselves. Note that
7646 only OP_CBRA need be tested here; changing this opcode to one of its variants,
7647 e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
7648
7649 if (*code == OP_CBRA)
7650 {
7651 capnumber = GET2(code, 1 + LINK_SIZE);
7652 capitem.number = capnumber;
7653 capitem.next = cb->open_caps;
7654 capitem.flag = FALSE;
7655 cb->open_caps = &capitem;
7656 }
7657
7658 /* Offset is set zero to mark that this bracket is still open */
7659
7660 PUT(code, 1, 0);
7661 code += 1 + LINK_SIZE + skipunits;
7662
7663 /* Loop for each alternative branch */
7664
7665 orig_bracount = max_bracount = cb->bracount;
7666
7667 for (;;)
7668 {
7669 /* For a (?| group, reset the capturing bracket count so that each branch
7670 uses the same numbers. */
7671
7672 if (reset_bracount) cb->bracount = orig_bracount;
7673
7674 /* Set up dummy OP_REVERSE if lookbehind assertion */
7675
7676 if (lookbehind)
7677 {
7678 *code++ = OP_REVERSE;
7679 reverse_count = code;
7680 PUTINC(code, 0, 0);
7681 length += 1 + LINK_SIZE;
7682 }
7683
7684 /* Now compile the branch; in the pre-compile phase its length gets added
7685 into the length. */
7686
7687 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstcu,
7688 &branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc,
7689 cond_depth, cb, (lengthptr == NULL)? NULL : &length))
7690 {
7691 *ptrptr = ptr;
7692 return FALSE;
7693 }
7694
7695 /* Keep the highest bracket count in case (?| was used and some branch
7696 has fewer than the rest. */
7697
7698 if (cb->bracount > max_bracount) max_bracount = cb->bracount;
7699
7700 /* In the real compile phase, there is some post-processing to be done. */
7701
7702 if (lengthptr == NULL)
7703 {
7704 /* If this is the first branch, the firstcu and reqcu values for the
7705 branch become the values for the regex. */
7706
7707 if (*last_branch != OP_ALT)
7708 {
7709 firstcu = branchfirstcu;
7710 firstcuflags = branchfirstcuflags;
7711 reqcu = branchreqcu;
7712 reqcuflags = branchreqcuflags;
7713 }
7714
7715 /* If this is not the first branch, the first char and reqcu have to
7716 match the values from all the previous branches, except that if the
7717 previous value for reqcu didn't have REQ_VARY set, it can still match,
7718 and we set REQ_VARY for the regex. */
7719
7720 else
7721 {
7722 /* If we previously had a firstcu, but it doesn't match the new branch,
7723 we have to abandon the firstcu for the regex, but if there was
7724 previously no reqcu, it takes on the value of the old firstcu. */
7725
7726 if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
7727 {
7728 if (firstcuflags >= 0)
7729 {
7730 if (reqcuflags < 0)
7731 {
7732 reqcu = firstcu;
7733 reqcuflags = firstcuflags;
7734 }
7735 }
7736 firstcuflags = REQ_NONE;
7737 }
7738
7739 /* If we (now or from before) have no firstcu, a firstcu from the
7740 branch becomes a reqcu if there isn't a branch reqcu. */
7741
7742 if (firstcuflags < 0 && branchfirstcuflags >= 0 &&
7743 branchreqcuflags < 0)
7744 {
7745 branchreqcu = branchfirstcu;
7746 branchreqcuflags = branchfirstcuflags;
7747 }
7748
7749 /* Now ensure that the reqcus match */
7750
7751 if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
7752 reqcu != branchreqcu)
7753 reqcuflags = REQ_NONE;
7754 else
7755 {
7756 reqcu = branchreqcu;
7757 reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY */
7758 }
7759 }
7760
7761 /* If lookbehind, check that this branch matches a fixed-length string, and
7762 put the length into the OP_REVERSE item. Temporarily mark the end of the
7763 branch with OP_END. If the branch contains OP_RECURSE, the result is
7764 FFL_LATER (a negative value) because there may be forward references that
7765 we can't check here. Set a flag to cause another lookbehind check at the
7766 end. Why not do it all at the end? Because common errors can be picked up
7767 here and the offset of the problem can be shown. */
7768
7769 if (lookbehind)
7770 {
7771 int fixed_length;
7772 int count = 0;
7773 *code = OP_END;
7774 fixed_length = find_fixedlength(last_branch, (options & PCRE2_UTF) != 0,
7775 FALSE, cb, NULL, &count);
7776 if (fixed_length == FFL_LATER)
7777 {
7778 cb->check_lookbehind = TRUE;
7779 }
7780 else if (fixed_length < 0)
7781 {
7782 *errorcodeptr = fixed_length_errors[-fixed_length];
7783 *ptrptr = ptr;
7784 return FALSE;
7785 }
7786 else
7787 {
7788 if (fixed_length > cb->max_lookbehind)
7789 cb->max_lookbehind = fixed_length;
7790 PUT(reverse_count, 0, fixed_length);
7791 }
7792 }
7793 }
7794
7795 /* Reached end of expression, either ')' or end of pattern. In the real
7796 compile phase, go back through the alternative branches and reverse the chain
7797 of offsets, with the field in the BRA item now becoming an offset to the
7798 first alternative. If there are no alternatives, it points to the end of the
7799 group. The length in the terminating ket is always the length of the whole
7800 bracketed item. Return leaving the pointer at the terminating char. */
7801
7802 if (*ptr != CHAR_VERTICAL_LINE)
7803 {
7804 if (lengthptr == NULL)
7805 {
7806 size_t branch_length = code - last_branch;
7807 do
7808 {
7809 size_t prev_length = GET(last_branch, 1);
7810 PUT(last_branch, 1, branch_length);
7811 branch_length = prev_length;
7812 last_branch -= branch_length;
7813 }
7814 while (branch_length > 0);
7815 }
7816
7817 /* Fill in the ket */
7818
7819 *code = OP_KET;
7820 PUT(code, 1, (int)(code - start_bracket));
7821 code += 1 + LINK_SIZE;
7822
7823 /* If it was a capturing subpattern, check to see if it contained any
7824 recursive back references. If so, we must wrap it in atomic brackets. In
7825 any event, remove the block from the chain. */
7826
7827 if (capnumber > 0)
7828 {
7829 if (cb->open_caps->flag)
7830 {
7831 memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
7832 CU2BYTES(code - start_bracket));
7833 *start_bracket = OP_ONCE;
7834 code += 1 + LINK_SIZE;
7835 PUT(start_bracket, 1, (int)(code - start_bracket));
7836 *code = OP_KET;
7837 PUT(code, 1, (int)(code - start_bracket));
7838 code += 1 + LINK_SIZE;
7839 length += 2 + 2*LINK_SIZE;
7840 }
7841 cb->open_caps = cb->open_caps->next;
7842 }
7843
7844 /* Retain the highest bracket number, in case resetting was used. */
7845
7846 cb->bracount = max_bracount;
7847
7848 /* Set values to pass back */
7849
7850 *codeptr = code;
7851 *ptrptr = ptr;
7852 *firstcuptr = firstcu;
7853 *firstcuflagsptr = firstcuflags;
7854 *reqcuptr = reqcu;
7855 *reqcuflagsptr = reqcuflags;
7856 if (lengthptr != NULL)
7857 {
7858 if (OFLOW_MAX - *lengthptr < length)
7859 {
7860 *errorcodeptr = ERR20;
7861 return FALSE;
7862 }
7863 *lengthptr += length;
7864 }
7865 return TRUE;
7866 }
7867
7868 /* Another branch follows. In the pre-compile phase, we can move the code
7869 pointer back to where it was for the start of the first branch. (That is,
7870 pretend that each branch is the only one.)
7871
7872 In the real compile phase, insert an ALT node. Its length field points back
7873 to the previous branch while the bracket remains open. At the end the chain
7874 is reversed. It's done like this so that the start of the bracket has a
7875 zero offset until it is closed, making it possible to detect recursion. */
7876
7877 if (lengthptr != NULL)
7878 {
7879 code = *codeptr + 1 + LINK_SIZE + skipunits;
7880 length += 1 + LINK_SIZE;
7881 }
7882 else
7883 {
7884 *code = OP_ALT;
7885 PUT(code, 1, (int)(code - last_branch));
7886 bc.current_branch = last_branch = code;
7887 code += 1 + LINK_SIZE;
7888 }
7889
7890 /* Advance past the vertical bar */
7891
7892 ptr++;
7893 }
7894 /* Control never reaches here */
7895 }
7896
7897
7898
7899 /*************************************************
7900 * Check for anchored pattern *
7901 *************************************************/
7902
7903 /* Try to find out if this is an anchored regular expression. Consider each
7904 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
7905 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
7906 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
7907 be found, because ^ generates OP_CIRCM in that mode.
7908
7909 We can also consider a regex to be anchored if OP_SOM starts all its branches.
7910 This is the code for \G, which means "match at start of match position, taking
7911 into account the match offset".
7912
7913 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
7914 because that will try the rest of the pattern at all possible matching points,
7915 so there is no point trying again.... er ....
7916
7917 .... except when the .* appears inside capturing parentheses, and there is a
7918 subsequent back reference to those parentheses. We haven't enough information
7919 to catch that case precisely.
7920
7921 At first, the best we could do was to detect when .* was in capturing brackets
7922 and the highest back reference was greater than or equal to that level.
7923 However, by keeping a bitmap of the first 31 back references, we can catch some
7924 of the more common cases more precisely.
7925
7926 ... A second exception is when the .* appears inside an atomic group, because
7927 this prevents the number of characters it matches from being adjusted.
7928
7929 Arguments:
7930 code points to start of the compiled pattern
7931 bracket_map a bitmap of which brackets we are inside while testing; this
7932 handles up to substring 31; after that we just have to take
7933 the less precise approach
7934 cb points to the compile data block
7935 atomcount atomic group level
7936
7937 Returns: TRUE or FALSE
7938 */
7939
7940 static BOOL
is_anchored(register PCRE2_SPTR code,unsigned int bracket_map,compile_block * cb,int atomcount)7941 is_anchored(register PCRE2_SPTR code, unsigned int bracket_map,
7942 compile_block *cb, int atomcount)
7943 {
7944 do {
7945 PCRE2_SPTR scode = first_significant_code(
7946 code + PRIV(OP_lengths)[*code], FALSE);
7947 register int op = *scode;
7948
7949 /* Non-capturing brackets */
7950
7951 if (op == OP_BRA || op == OP_BRAPOS ||
7952 op == OP_SBRA || op == OP_SBRAPOS)
7953 {
7954 if (!is_anchored(scode, bracket_map, cb, atomcount)) return FALSE;
7955 }
7956
7957 /* Capturing brackets */
7958
7959 else if (op == OP_CBRA || op == OP_CBRAPOS ||
7960 op == OP_SCBRA || op == OP_SCBRAPOS)
7961 {
7962 int n = GET2(scode, 1+LINK_SIZE);
7963 int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
7964 if (!is_anchored(scode, new_map, cb, atomcount)) return FALSE;
7965 }
7966
7967 /* Positive forward assertions and conditions */
7968
7969 else if (op == OP_ASSERT || op == OP_COND)
7970 {
7971 if (!is_anchored(scode, bracket_map, cb, atomcount)) return FALSE;
7972 }
7973
7974 /* Atomic groups */
7975
7976 else if (op == OP_ONCE || op == OP_ONCE_NC)
7977 {
7978 if (!is_anchored(scode, bracket_map, cb, atomcount + 1))
7979 return FALSE;
7980 }
7981
7982 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
7983 it isn't in brackets that are or may be referenced or inside an atomic
7984 group. There is also an option that disables auto-anchoring. */
7985
7986 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
7987 op == OP_TYPEPOSSTAR))
7988 {
7989 if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
7990 atomcount > 0 || cb->had_pruneorskip ||
7991 (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
7992 return FALSE;
7993 }
7994
7995 /* Check for explicit anchoring */
7996
7997 else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
7998
7999 code += GET(code, 1);
8000 }
8001 while (*code == OP_ALT); /* Loop for each alternative */
8002 return TRUE;
8003 }
8004
8005
8006
8007 /*************************************************
8008 * Check for starting with ^ or .* *
8009 *************************************************/
8010
8011 /* This is called to find out if every branch starts with ^ or .* so that
8012 "first char" processing can be done to speed things up in multiline
8013 matching and for non-DOTALL patterns that start with .* (which must start at
8014 the beginning or after \n). As in the case of is_anchored() (see above), we
8015 have to take account of back references to capturing brackets that contain .*
8016 because in that case we can't make the assumption. Also, the appearance of .*
8017 inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
8018 count, because once again the assumption no longer holds.
8019
8020 Arguments:
8021 code points to start of the compiled pattern or a group
8022 bracket_map a bitmap of which brackets we are inside while testing; this
8023 handles up to substring 31; after that we just have to take
8024 the less precise approach
8025 cb points to the compile data
8026 atomcount atomic group level
8027
8028 Returns: TRUE or FALSE
8029 */
8030
8031 static BOOL
is_startline(PCRE2_SPTR code,unsigned int bracket_map,compile_block * cb,int atomcount)8032 is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8033 int atomcount)
8034 {
8035 do {
8036 PCRE2_SPTR scode = first_significant_code(
8037 code + PRIV(OP_lengths)[*code], FALSE);
8038 register int op = *scode;
8039
8040 /* If we are at the start of a conditional assertion group, *both* the
8041 conditional assertion *and* what follows the condition must satisfy the test
8042 for start of line. Other kinds of condition fail. Note that there may be an
8043 auto-callout at the start of a condition. */
8044
8045 if (op == OP_COND)
8046 {
8047 scode += 1 + LINK_SIZE;
8048
8049 if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8050 else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
8051
8052 switch (*scode)
8053 {
8054 case OP_CREF:
8055 case OP_DNCREF:
8056 case OP_RREF:
8057 case OP_DNRREF:
8058 case OP_FAIL:
8059 case OP_FALSE:
8060 case OP_TRUE:
8061 return FALSE;
8062
8063 default: /* Assertion */
8064 if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
8065 do scode += GET(scode, 1); while (*scode == OP_ALT);
8066 scode += 1 + LINK_SIZE;
8067 break;
8068 }
8069 scode = first_significant_code(scode, FALSE);
8070 op = *scode;
8071 }
8072
8073 /* Non-capturing brackets */
8074
8075 if (op == OP_BRA || op == OP_BRAPOS ||
8076 op == OP_SBRA || op == OP_SBRAPOS)
8077 {
8078 if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
8079 }
8080
8081 /* Capturing brackets */
8082
8083 else if (op == OP_CBRA || op == OP_CBRAPOS ||
8084 op == OP_SCBRA || op == OP_SCBRAPOS)
8085 {
8086 int n = GET2(scode, 1+LINK_SIZE);
8087 int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8088 if (!is_startline(scode, new_map, cb, atomcount)) return FALSE;
8089 }
8090
8091 /* Positive forward assertions */
8092
8093 else if (op == OP_ASSERT)
8094 {
8095 if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
8096 }
8097
8098 /* Atomic brackets */
8099
8100 else if (op == OP_ONCE || op == OP_ONCE_NC)
8101 {
8102 if (!is_startline(scode, bracket_map, cb, atomcount + 1)) return FALSE;
8103 }
8104
8105 /* .* means "start at start or after \n" if it isn't in atomic brackets or
8106 brackets that may be referenced, as long as the pattern does not contain
8107 *PRUNE or *SKIP, because these break the feature. Consider, for example,
8108 /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
8109 start of a line. There is also an option that disables this optimization. */
8110
8111 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8112 {
8113 if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
8114 atomcount > 0 || cb->had_pruneorskip ||
8115 (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8116 return FALSE;
8117 }
8118
8119 /* Check for explicit circumflex; anything else gives a FALSE result. Note
8120 in particular that this includes atomic brackets OP_ONCE and OP_ONCE_NC
8121 because the number of characters matched by .* cannot be adjusted inside
8122 them. */
8123
8124 else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8125
8126 /* Move on to the next alternative */
8127
8128 code += GET(code, 1);
8129 }
8130 while (*code == OP_ALT); /* Loop for each alternative */
8131 return TRUE;
8132 }
8133
8134
8135
8136 /*************************************************
8137 * Check for asserted fixed first code unit *
8138 *************************************************/
8139
8140 /* During compilation, the "first code unit" settings from forward assertions
8141 are discarded, because they can cause conflicts with actual literals that
8142 follow. However, if we end up without a first code unit setting for an
8143 unanchored pattern, it is worth scanning the regex to see if there is an
8144 initial asserted first code unit. If all branches start with the same asserted
8145 code unit, or with a non-conditional bracket all of whose alternatives start
8146 with the same asserted code unit (recurse ad lib), then we return that code
8147 unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
8148 REQ_NONE in the flags.
8149
8150 Arguments:
8151 code points to start of compiled pattern
8152 flags points to the first code unit flags
8153 inassert TRUE if in an assertion
8154
8155 Returns: the fixed first code unit, or 0 with REQ_NONE in flags
8156 */
8157
8158 static uint32_t
find_firstassertedcu(PCRE2_SPTR code,int32_t * flags,BOOL inassert)8159 find_firstassertedcu(PCRE2_SPTR code, int32_t *flags, BOOL inassert)
8160 {
8161 register uint32_t c = 0;
8162 int cflags = REQ_NONE;
8163
8164 *flags = REQ_NONE;
8165 do {
8166 uint32_t d;
8167 int dflags;
8168 int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
8169 *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
8170 PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
8171 register PCRE2_UCHAR op = *scode;
8172
8173 switch(op)
8174 {
8175 default:
8176 return 0;
8177
8178 case OP_BRA:
8179 case OP_BRAPOS:
8180 case OP_CBRA:
8181 case OP_SCBRA:
8182 case OP_CBRAPOS:
8183 case OP_SCBRAPOS:
8184 case OP_ASSERT:
8185 case OP_ONCE:
8186 case OP_ONCE_NC:
8187 d = find_firstassertedcu(scode, &dflags, op == OP_ASSERT);
8188 if (dflags < 0)
8189 return 0;
8190 if (cflags < 0) { c = d; cflags = dflags; }
8191 else if (c != d || cflags != dflags) return 0;
8192 break;
8193
8194 case OP_EXACT:
8195 scode += IMM2_SIZE;
8196 /* Fall through */
8197
8198 case OP_CHAR:
8199 case OP_PLUS:
8200 case OP_MINPLUS:
8201 case OP_POSPLUS:
8202 if (!inassert) return 0;
8203 if (cflags < 0) { c = scode[1]; cflags = 0; }
8204 else if (c != scode[1]) return 0;
8205 break;
8206
8207 case OP_EXACTI:
8208 scode += IMM2_SIZE;
8209 /* Fall through */
8210
8211 case OP_CHARI:
8212 case OP_PLUSI:
8213 case OP_MINPLUSI:
8214 case OP_POSPLUSI:
8215 if (!inassert) return 0;
8216 if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
8217 else if (c != scode[1]) return 0;
8218 break;
8219 }
8220
8221 code += GET(code, 1);
8222 }
8223 while (*code == OP_ALT);
8224
8225 *flags = cflags;
8226 return c;
8227 }
8228
8229
8230
8231 /*************************************************
8232 * Add an entry to the name/number table *
8233 *************************************************/
8234
8235 /* This function is called between compiling passes to add an entry to the
8236 name/number table, maintaining alphabetical order. Checking for permitted
8237 and forbidden duplicates has already been done.
8238
8239 Arguments:
8240 cb the compile data block
8241 name the name to add
8242 length the length of the name
8243 groupno the group number
8244
8245 Returns: nothing
8246 */
8247
8248 static void
add_name_to_table(compile_block * cb,PCRE2_SPTR name,int length,unsigned int groupno)8249 add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
8250 unsigned int groupno)
8251 {
8252 int i;
8253 PCRE2_UCHAR *slot = cb->name_table;
8254
8255 for (i = 0; i < cb->names_found; i++)
8256 {
8257 int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
8258 if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8259 crc = -1; /* Current name is a substring */
8260
8261 /* Make space in the table and break the loop for an earlier name. For a
8262 duplicate or later name, carry on. We do this for duplicates so that in the
8263 simple case (when ?(| is not used) they are in order of their numbers. In all
8264 cases they are in the order in which they appear in the pattern. */
8265
8266 if (crc < 0)
8267 {
8268 memmove(slot + cb->name_entry_size, slot,
8269 CU2BYTES((cb->names_found - i) * cb->name_entry_size));
8270 break;
8271 }
8272
8273 /* Continue the loop for a later or duplicate name */
8274
8275 slot += cb->name_entry_size;
8276 }
8277
8278 PUT2(slot, 0, groupno);
8279 memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
8280 cb->names_found++;
8281
8282 /* Add a terminating zero and fill the rest of the slot with zeroes so that
8283 the memory is all initialized. Otherwise valgrind moans about uninitialized
8284 memory when saving serialized compiled patterns. */
8285
8286 memset(slot + IMM2_SIZE + length, 0,
8287 CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
8288 }
8289
8290
8291
8292 /*************************************************
8293 * External function to compile a pattern *
8294 *************************************************/
8295
8296 /* This function reads a regular expression in the form of a string and returns
8297 a pointer to a block of store holding a compiled version of the expression.
8298
8299 Arguments:
8300 pattern the regular expression
8301 patlen the length of the pattern, or PCRE2_ZERO_TERMINATED
8302 options option bits
8303 errorptr pointer to errorcode
8304 erroroffset pointer to error offset
8305 ccontext points to a compile context or is NULL
8306
8307 Returns: pointer to compiled data block, or NULL on error,
8308 with errorcode and erroroffset set
8309 */
8310
8311 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_compile(PCRE2_SPTR pattern,PCRE2_SIZE patlen,uint32_t options,int * errorptr,PCRE2_SIZE * erroroffset,pcre2_compile_context * ccontext)8312 pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
8313 int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
8314 {
8315 BOOL utf; /* Set TRUE for UTF mode */
8316 pcre2_real_code *re = NULL; /* What we will return */
8317 compile_block cb; /* "Static" compile-time data */
8318 const uint8_t *tables; /* Char tables base pointer */
8319
8320 PCRE2_UCHAR *code; /* Current pointer in compiled code */
8321 PCRE2_SPTR codestart; /* Start of compiled code */
8322 PCRE2_SPTR ptr; /* Current pointer in pattern */
8323
8324 size_t length = 1; /* Allow or final END opcode */
8325 size_t usedlength; /* Actual length used */
8326 size_t re_blocksize; /* Size of memory block */
8327
8328 int32_t firstcuflags, reqcuflags; /* Type of first/req code unit */
8329 uint32_t firstcu, reqcu; /* Value of first/req code unit */
8330 uint32_t setflags = 0; /* NL and BSR set flags */
8331
8332 uint32_t skipatstart; /* When checking (*UTF) etc */
8333 uint32_t limit_match = UINT32_MAX; /* Unset match limits */
8334 uint32_t limit_recursion = UINT32_MAX;
8335
8336 int newline = 0; /* Unset; can be set by the pattern */
8337 int bsr = 0; /* Unset; can be set by the pattern */
8338 int errorcode = 0; /* Initialize to avoid compiler warn */
8339
8340 /* Comments at the head of this file explain about these variables. */
8341
8342 PCRE2_UCHAR *copied_pattern = NULL;
8343 PCRE2_UCHAR stack_copied_pattern[COPIED_PATTERN_SIZE];
8344 named_group named_groups[NAMED_GROUP_LIST_SIZE];
8345
8346 /* The workspace is used in different ways in the different compiling phases.
8347 It needs to be 16-bit aligned for the preliminary group scan, and 32-bit
8348 aligned for the group information cache. */
8349
8350 uint32_t c32workspace[C32_WORK_SIZE];
8351 PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c32workspace;
8352
8353
8354 /* -------------- Check arguments and set up the pattern ----------------- */
8355
8356 /* There must be error code and offset pointers. */
8357
8358 if (errorptr == NULL || erroroffset == NULL) return NULL;
8359 *errorptr = ERR0;
8360 *erroroffset = 0;
8361
8362 /* There must be a pattern! */
8363
8364 if (pattern == NULL)
8365 {
8366 *errorptr = ERR16;
8367 return NULL;
8368 }
8369
8370 /* Check that all undefined public option bits are zero. */
8371
8372 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
8373 {
8374 *errorptr = ERR17;
8375 return NULL;
8376 }
8377
8378 /* A NULL compile context means "use a default context" */
8379
8380 if (ccontext == NULL)
8381 ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
8382
8383 /* A zero-terminated pattern is indicated by the special length value
8384 PCRE2_ZERO_TERMINATED. Otherwise, we make a copy of the pattern and add a zero,
8385 to ensure that it is always possible to look one code unit beyond the end of
8386 the pattern's characters. In both cases, check that the pattern is not overlong. */
8387
8388 if (patlen == PCRE2_ZERO_TERMINATED)
8389 {
8390 patlen = PRIV(strlen)(pattern);
8391 if (patlen > ccontext->max_pattern_length)
8392 {
8393 *errorptr = ERR88;
8394 return NULL;
8395 }
8396 }
8397 else
8398 {
8399 if (patlen > ccontext->max_pattern_length)
8400 {
8401 *errorptr = ERR88;
8402 return NULL;
8403 }
8404 if (patlen < COPIED_PATTERN_SIZE)
8405 copied_pattern = stack_copied_pattern;
8406 else
8407 {
8408 copied_pattern = ccontext->memctl.malloc(CU2BYTES(patlen + 1),
8409 ccontext->memctl.memory_data);
8410 if (copied_pattern == NULL)
8411 {
8412 *errorptr = ERR21;
8413 return NULL;
8414 }
8415 }
8416 memcpy(copied_pattern, pattern, CU2BYTES(patlen));
8417 copied_pattern[patlen] = 0;
8418 pattern = copied_pattern;
8419 }
8420
8421 /* ------------ Initialize the "static" compile data -------------- */
8422
8423 tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
8424
8425 cb.lcc = tables + lcc_offset; /* Individual */
8426 cb.fcc = tables + fcc_offset; /* character */
8427 cb.cbits = tables + cbits_offset; /* tables */
8428 cb.ctypes = tables + ctypes_offset;
8429
8430 cb.assert_depth = 0;
8431 cb.bracount = cb.final_bracount = 0;
8432 cb.cx = ccontext;
8433 cb.dupnames = FALSE;
8434 cb.end_pattern = pattern + patlen;
8435 cb.nestptr[0] = cb.nestptr[1] = NULL;
8436 cb.external_flags = 0;
8437 cb.external_options = options;
8438 cb.groupinfo = c32workspace;
8439 cb.had_recurse = FALSE;
8440 cb.iscondassert = FALSE;
8441 cb.max_lookbehind = 0;
8442 cb.name_entry_size = 0;
8443 cb.name_table = NULL;
8444 cb.named_groups = named_groups;
8445 cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
8446 cb.names_found = 0;
8447 cb.open_caps = NULL;
8448 cb.parens_depth = 0;
8449 cb.req_varyopt = 0;
8450 cb.start_code = cworkspace;
8451 cb.start_pattern = pattern;
8452 cb.start_workspace = cworkspace;
8453 cb.workspace_size = COMPILE_WORK_SIZE;
8454
8455 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
8456 references to help in deciding whether (.*) can be treated as anchored or not.
8457 */
8458
8459 cb.top_backref = 0;
8460 cb.backref_map = 0;
8461
8462 /* --------------- Start looking at the pattern --------------- */
8463
8464 /* Check for global one-time option settings at the start of the pattern, and
8465 remember the offset to the actual regex. */
8466
8467 ptr = pattern;
8468 skipatstart = 0;
8469
8470 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
8471 ptr[skipatstart+1] == CHAR_ASTERISK)
8472 {
8473 unsigned int i;
8474 for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
8475 {
8476 pso *p = pso_list + i;
8477
8478 if (PRIV(strncmp_c8)(ptr+skipatstart+2, (char *)(p->name), p->length) == 0)
8479 {
8480 uint32_t c, pp;
8481
8482 skipatstart += p->length + 2;
8483 switch(p->type)
8484 {
8485 case PSO_OPT:
8486 cb.external_options |= p->value;
8487 break;
8488
8489 case PSO_FLG:
8490 setflags |= p->value;
8491 break;
8492
8493 case PSO_NL:
8494 newline = p->value;
8495 setflags |= PCRE2_NL_SET;
8496 break;
8497
8498 case PSO_BSR:
8499 bsr = p->value;
8500 setflags |= PCRE2_BSR_SET;
8501 break;
8502
8503 case PSO_LIMM:
8504 case PSO_LIMR:
8505 c = 0;
8506 pp = skipatstart;
8507 if (!IS_DIGIT(ptr[pp]))
8508 {
8509 errorcode = ERR60;
8510 ptr += pp;
8511 goto HAD_ERROR;
8512 }
8513 while (IS_DIGIT(ptr[pp]))
8514 {
8515 if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */
8516 c = c*10 + (ptr[pp++] - CHAR_0);
8517 }
8518 if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
8519 {
8520 errorcode = ERR60;
8521 ptr += pp;
8522 goto HAD_ERROR;
8523 }
8524 if (p->type == PSO_LIMM) limit_match = c;
8525 else limit_recursion = c;
8526 skipatstart += pp - skipatstart;
8527 break;
8528 }
8529 break; /* Out of the table scan loop */
8530 }
8531 }
8532 if (i >= sizeof(pso_list)/sizeof(pso)) break; /* Out of pso loop */
8533 }
8534
8535 /* End of pattern-start options; advance to start of real regex. */
8536
8537 ptr += skipatstart;
8538
8539 /* Can't support UTF or UCP unless PCRE2 has been compiled with UTF support. */
8540
8541 #ifndef SUPPORT_UNICODE
8542 if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
8543 {
8544 errorcode = ERR32;
8545 goto HAD_ERROR;
8546 }
8547 #endif
8548
8549 /* Check UTF. We have the original options in 'options', with that value as
8550 modified by (*UTF) etc in cb->external_options. */
8551
8552 utf = (cb.external_options & PCRE2_UTF) != 0;
8553 if (utf)
8554 {
8555 if ((options & PCRE2_NEVER_UTF) != 0)
8556 {
8557 errorcode = ERR74;
8558 goto HAD_ERROR;
8559 }
8560 if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
8561 (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
8562 goto HAD_UTF_ERROR;
8563 }
8564
8565 /* Check UCP lockout. */
8566
8567 if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) ==
8568 (PCRE2_UCP|PCRE2_NEVER_UCP))
8569 {
8570 errorcode = ERR75;
8571 goto HAD_ERROR;
8572 }
8573
8574 /* Process the BSR setting. */
8575
8576 if (bsr == 0) bsr = ccontext->bsr_convention;
8577
8578 /* Process the newline setting. */
8579
8580 if (newline == 0) newline = ccontext->newline_convention;
8581 cb.nltype = NLTYPE_FIXED;
8582 switch(newline)
8583 {
8584 case PCRE2_NEWLINE_CR:
8585 cb.nllen = 1;
8586 cb.nl[0] = CHAR_CR;
8587 break;
8588
8589 case PCRE2_NEWLINE_LF:
8590 cb.nllen = 1;
8591 cb.nl[0] = CHAR_NL;
8592 break;
8593
8594 case PCRE2_NEWLINE_CRLF:
8595 cb.nllen = 2;
8596 cb.nl[0] = CHAR_CR;
8597 cb.nl[1] = CHAR_NL;
8598 break;
8599
8600 case PCRE2_NEWLINE_ANY:
8601 cb.nltype = NLTYPE_ANY;
8602 break;
8603
8604 case PCRE2_NEWLINE_ANYCRLF:
8605 cb.nltype = NLTYPE_ANYCRLF;
8606 break;
8607
8608 default:
8609 errorcode = ERR56;
8610 goto HAD_ERROR;
8611 }
8612
8613 /* Before we do anything else, do a pre-scan of the pattern in order to
8614 discover the named groups and their numerical equivalents, so that this
8615 information is always available for the remaining processing. */
8616
8617 errorcode = scan_for_captures(&ptr, cb.external_options, &cb);
8618 if (errorcode != 0) goto HAD_ERROR;
8619
8620 /* For obscure debugging this code can be enabled. */
8621
8622 #if 0
8623 {
8624 int i;
8625 named_group *ng = cb.named_groups;
8626 fprintf(stderr, "+++Captures: %d\n", cb.final_bracount);
8627 for (i = 0; i < cb.names_found; i++, ng++)
8628 {
8629 fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
8630 }
8631 }
8632 #endif
8633
8634 /* Reset current bracket count to zero and current pointer to the start of the
8635 pattern. */
8636
8637 cb.bracount = 0;
8638 ptr = pattern + skipatstart;
8639
8640 /* Pretend to compile the pattern while actually just accumulating the amount
8641 of memory required in the 'length' variable. This behaviour is triggered by
8642 passing a non-NULL final argument to compile_regex(). We pass a block of
8643 workspace (cworkspace) for it to compile parts of the pattern into; the
8644 compiled code is discarded when it is no longer needed, so hopefully this
8645 workspace will never overflow, though there is a test for its doing so.
8646
8647 On error, errorcode will be set non-zero, so we don't need to look at the
8648 result of the function. The initial options have been put into the cb block so
8649 that they can be changed if an option setting is found within the regex right
8650 at the beginning. Bringing initial option settings outside can help speed up
8651 starting point checks. We still have to pass a separate options variable (the
8652 first argument) because that may change as the pattern is processed. */
8653
8654 code = cworkspace;
8655 *code = OP_BRA;
8656
8657 (void)compile_regex(cb.external_options, &code, &ptr, &errorcode, FALSE,
8658 FALSE, 0, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL,
8659 &cb, &length);
8660
8661 if (errorcode != 0) goto HAD_ERROR;
8662 if (length > MAX_PATTERN_SIZE)
8663 {
8664 errorcode = ERR20;
8665 goto HAD_ERROR;
8666 }
8667
8668 /* Compute the size of, and then get and initialize, the data block for storing
8669 the compiled pattern and names table. Integer overflow should no longer be
8670 possible because nowadays we limit the maximum value of cb.names_found and
8671 cb.name_entry_size. */
8672
8673 re_blocksize = sizeof(pcre2_real_code) +
8674 CU2BYTES(length + cb.names_found * cb.name_entry_size);
8675 re = (pcre2_real_code *)
8676 ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
8677 if (re == NULL)
8678 {
8679 errorcode = ERR21;
8680 goto HAD_ERROR;
8681 }
8682
8683 re->memctl = ccontext->memctl;
8684 re->tables = tables;
8685 re->executable_jit = NULL;
8686 memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
8687 re->blocksize = re_blocksize;
8688 re->magic_number = MAGIC_NUMBER;
8689 re->compile_options = options;
8690 re->overall_options = cb.external_options;
8691 re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
8692 re->limit_match = limit_match;
8693 re->limit_recursion = limit_recursion;
8694 re->first_codeunit = 0;
8695 re->last_codeunit = 0;
8696 re->bsr_convention = bsr;
8697 re->newline_convention = newline;
8698 re->max_lookbehind = 0;
8699 re->minlength = 0;
8700 re->top_bracket = 0;
8701 re->top_backref = 0;
8702 re->name_entry_size = cb.name_entry_size;
8703 re->name_count = cb.names_found;
8704
8705 /* The basic block is immediately followed by the name table, and the compiled
8706 code follows after that. */
8707
8708 codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) +
8709 re->name_entry_size * re->name_count;
8710
8711 /* Workspace is needed to remember information about numbered groups: whether a
8712 group can match an empty string and what its fixed length is. This is done to
8713 avoid the possibility of recursive references causing very long compile times
8714 when checking these features. Unnumbered groups do not have this exposure since
8715 they cannot be referenced. We use an indexed vector for this purpose. If there
8716 are sufficiently few groups, it can be the c32workspace vector, as set up
8717 above. Otherwise we have to get/free a special vector. The vector must be
8718 initialized to zero. */
8719
8720 if (cb.final_bracount >= C32_WORK_SIZE)
8721 {
8722 cb.groupinfo = ccontext->memctl.malloc(
8723 (cb.final_bracount + 1)*sizeof(uint32_t), ccontext->memctl.memory_data);
8724 if (cb.groupinfo == NULL)
8725 {
8726 errorcode = ERR21;
8727 goto HAD_ERROR;
8728 }
8729 }
8730 memset(cb.groupinfo, 0, (cb.final_bracount + 1) * sizeof(uint32_t));
8731
8732 /* Update the compile data block for the actual compile. The starting points of
8733 the name/number translation table and of the code are passed around in the
8734 compile data block. The start/end pattern and initial options are already set
8735 from the pre-compile phase, as is the name_entry_size field. Reset the bracket
8736 count and the names_found field. */
8737
8738 cb.parens_depth = 0;
8739 cb.assert_depth = 0;
8740 cb.bracount = 0;
8741 cb.max_lookbehind = 0;
8742 cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
8743 cb.start_code = codestart;
8744 cb.iscondassert = FALSE;
8745 cb.req_varyopt = 0;
8746 cb.had_accept = FALSE;
8747 cb.had_pruneorskip = FALSE;
8748 cb.check_lookbehind = FALSE;
8749 cb.open_caps = NULL;
8750
8751 /* If any named groups were found, create the name/number table from the list
8752 created in the pre-pass. */
8753
8754 if (cb.names_found > 0)
8755 {
8756 int i = cb.names_found;
8757 named_group *ng = cb.named_groups;
8758 cb.names_found = 0;
8759 for (; i > 0; i--, ng++)
8760 add_name_to_table(&cb, ng->name, ng->length, ng->number);
8761 }
8762
8763 /* Set up a starting, non-extracting bracket, then compile the expression. On
8764 error, errorcode will be set non-zero, so we don't need to look at the result
8765 of the function here. */
8766
8767 ptr = pattern + skipatstart;
8768 code = (PCRE2_UCHAR *)codestart;
8769 *code = OP_BRA;
8770 (void)compile_regex(re->overall_options, &code, &ptr, &errorcode, FALSE, FALSE,
8771 0, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL);
8772
8773 re->top_bracket = cb.bracount;
8774 re->top_backref = cb.top_backref;
8775 re->max_lookbehind = cb.max_lookbehind;
8776
8777 if (cb.had_accept)
8778 {
8779 reqcu = 0; /* Must disable after (*ACCEPT) */
8780 reqcuflags = REQ_NONE;
8781 }
8782
8783 /* Fill in the final opcode and check for disastrous overflow. If no overflow,
8784 but the estimated length exceeds the really used length, adjust the value of
8785 re->blocksize, and if valgrind support is configured, mark the extra allocated
8786 memory as unaddressable, so that any out-of-bound reads can be detected. */
8787
8788 *code++ = OP_END;
8789 usedlength = code - codestart;
8790 if (usedlength > length) errorcode = ERR23; else
8791 {
8792 re->blocksize -= CU2BYTES(length - usedlength);
8793 #ifdef SUPPORT_VALGRIND
8794 VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
8795 #endif
8796 }
8797
8798 /* Scan the pattern for recursion/subroutine calls and convert the group
8799 numbers into offsets. Maintain a small cache so that repeated groups containing
8800 recursions are efficiently handled. */
8801
8802 #define RSCAN_CACHE_SIZE 8
8803
8804 if (errorcode == 0 && cb.had_recurse)
8805 {
8806 PCRE2_UCHAR *rcode;
8807 PCRE2_SPTR rgroup;
8808 int ccount = 0;
8809 int start = RSCAN_CACHE_SIZE;
8810 recurse_cache rc[RSCAN_CACHE_SIZE];
8811
8812 for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf);
8813 rcode != NULL;
8814 rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf))
8815 {
8816 int i, p, recno;
8817
8818 recno = (int)GET(rcode, 1);
8819 if (recno == 0) rgroup = codestart; else
8820 {
8821 PCRE2_SPTR search_from = codestart;
8822 rgroup = NULL;
8823 for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
8824 {
8825 if (recno == rc[p].recno)
8826 {
8827 rgroup = rc[p].group;
8828 break;
8829 }
8830
8831 /* Group n+1 must always start to the right of group n, so we can save
8832 search time below when the new group number is greater than any of the
8833 previously found groups. */
8834
8835 if (recno > rc[p].recno) search_from = rc[p].group;
8836 }
8837
8838 if (rgroup == NULL)
8839 {
8840 rgroup = PRIV(find_bracket)(search_from, utf, recno);
8841 if (rgroup == NULL)
8842 {
8843 errorcode = ERR53;
8844 break;
8845 }
8846 if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
8847 rc[start].recno = recno;
8848 rc[start].group = rgroup;
8849 if (ccount < RSCAN_CACHE_SIZE) ccount++;
8850 }
8851 }
8852
8853 PUT(rcode, 1, rgroup - codestart);
8854 }
8855 }
8856
8857 /* In rare debugging situations we sometimes need to look at the compiled code
8858 at this stage. */
8859
8860 #ifdef CALL_PRINTINT
8861 pcre2_printint(re, stderr, TRUE);
8862 fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
8863 #endif
8864
8865 /* After a successful compile, give an error if there's back reference to a
8866 non-existent capturing subpattern. Then, unless disabled, check whether any
8867 single character iterators can be auto-possessified. The function overwrites
8868 the appropriate opcode values, so the type of the pointer must be cast. NOTE:
8869 the intermediate variable "temp" is used in this code because at least one
8870 compiler gives a warning about loss of "const" attribute if the cast
8871 (PCRE2_UCHAR *)codestart is used directly in the function call. */
8872
8873 if (errorcode == 0)
8874 {
8875 if (re->top_backref > re->top_bracket) errorcode = ERR15;
8876 else if ((re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
8877 {
8878 PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
8879 if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80;
8880 }
8881 }
8882
8883 /* If there were any lookbehind assertions that contained OP_RECURSE
8884 (recursions or subroutine calls), a flag is set for them to be checked here,
8885 because they may contain forward references. Actual recursions cannot be fixed
8886 length, but subroutine calls can. It is done like this so that those without
8887 OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
8888 exceptional ones forgo this. We scan the pattern to check that they are fixed
8889 length, and set their lengths. */
8890
8891 if (errorcode == 0 && cb.check_lookbehind)
8892 {
8893 PCRE2_UCHAR *cc = (PCRE2_UCHAR *)codestart;
8894
8895 /* Loop, searching for OP_REVERSE items, and process those that do not have
8896 their length set. (Actually, it will also re-process any that have a length
8897 of zero, but that is a pathological case, and it does no harm.) When we find
8898 one, we temporarily terminate the branch it is in while we scan it. Note that
8899 calling find_bracket() with a negative group number returns a pointer to the
8900 OP_REVERSE item, not the actual lookbehind. */
8901
8902 for (cc = (PCRE2_UCHAR *)PRIV(find_bracket)(codestart, utf, -1);
8903 cc != NULL;
8904 cc = (PCRE2_UCHAR *)PRIV(find_bracket)(cc, utf, -1))
8905 {
8906 if (GET(cc, 1) == 0)
8907 {
8908 int fixed_length;
8909 int count = 0;
8910 PCRE2_UCHAR *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
8911 int end_op = *be;
8912 *be = OP_END;
8913 fixed_length = find_fixedlength(cc, utf, TRUE, &cb, NULL, &count);
8914 *be = end_op;
8915 if (fixed_length < 0)
8916 {
8917 errorcode = fixed_length_errors[-fixed_length];
8918 break;
8919 }
8920 if (fixed_length > cb.max_lookbehind) cb.max_lookbehind = fixed_length;
8921 PUT(cc, 1, fixed_length);
8922 }
8923 cc += 1 + LINK_SIZE;
8924 }
8925
8926 /* The previous value of the maximum lookbehind was transferred to the
8927 compiled regex block above. We could have updated this value in the loop
8928 above, but keep the two values in step, just in case some later code below
8929 uses the cb value. */
8930
8931 re->max_lookbehind = cb.max_lookbehind;
8932 }
8933
8934 /* Failed to compile, or error while post-processing. Earlier errors get here
8935 via the dreaded goto. */
8936
8937 if (errorcode != 0)
8938 {
8939 HAD_ERROR:
8940 *erroroffset = (int)(ptr - pattern);
8941 HAD_UTF_ERROR:
8942 *errorptr = errorcode;
8943 pcre2_code_free(re);
8944 re = NULL;
8945 goto EXIT;
8946 }
8947
8948 /* Successful compile. If the anchored option was not passed, set it if
8949 we can determine that the pattern is anchored by virtue of ^ characters or \A
8950 or anything else, such as starting with non-atomic .* when DOTALL is set and
8951 there are no occurrences of *PRUNE or *SKIP (though there is an option to
8952 disable this case). */
8953
8954 if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
8955 is_anchored(codestart, 0, &cb, 0))
8956 re->overall_options |= PCRE2_ANCHORED;
8957
8958 /* If the pattern is still not anchored and we do not have a first code unit,
8959 see if there is one that is asserted (these are not saved during the compile
8960 because they can cause conflicts with actual literals that follow). This code
8961 need not be obeyed if PCRE2_NO_START_OPTIMIZE is set, as the data it would
8962 create will not be used. */
8963
8964 if ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0)
8965 {
8966 if (firstcuflags < 0)
8967 firstcu = find_firstassertedcu(codestart, &firstcuflags, FALSE);
8968
8969 /* Save the data for a first code unit. */
8970
8971 if (firstcuflags >= 0)
8972 {
8973 re->first_codeunit = firstcu;
8974 re->flags |= PCRE2_FIRSTSET;
8975
8976 /* Handle caseless first code units. */
8977
8978 if ((firstcuflags & REQ_CASELESS) != 0)
8979 {
8980 if (firstcu < 128 || (!utf && firstcu < 255))
8981 {
8982 if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
8983 }
8984
8985 /* The first code unit is >= 128 in UTF mode, or >= 255 otherwise. In
8986 8-bit UTF mode, codepoints in the range 128-255 are introductory code
8987 points and cannot have another case. In 16-bit and 32-bit modes, we can
8988 check wide characters when UTF (and therefore UCP) is supported. */
8989
8990 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
8991 else if (firstcu <= MAX_UTF_CODE_POINT &&
8992 UCD_OTHERCASE(firstcu) != firstcu)
8993 re->flags |= PCRE2_FIRSTCASELESS;
8994 #endif
8995 }
8996 }
8997
8998 /* When there is no first code unit, see if we can set the PCRE2_STARTLINE
8999 flag. This is helpful for multiline matches when all branches start with ^
9000 and also when all branches start with non-atomic .* for non-DOTALL matches
9001 when *PRUNE and *SKIP are not present. (There is an option that disables this
9002 case.) */
9003
9004 else if (is_startline(codestart, 0, &cb, 0)) re->flags |= PCRE2_STARTLINE;
9005 }
9006
9007 /* Handle the "required code unit", if one is set. In the case of an anchored
9008 pattern, do this only if it follows a variable length item in the pattern.
9009 Again, skip this if PCRE2_NO_START_OPTIMIZE is set. */
9010
9011 if (reqcuflags >= 0 &&
9012 ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0 ||
9013 (reqcuflags & REQ_VARY) != 0))
9014 {
9015 re->last_codeunit = reqcu;
9016 re->flags |= PCRE2_LASTSET;
9017
9018 /* Handle caseless required code units as for first code units (above). */
9019
9020 if ((reqcuflags & REQ_CASELESS) != 0)
9021 {
9022 if (reqcu < 128 || (!utf && reqcu < 255))
9023 {
9024 if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
9025 }
9026 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
9027 else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu)
9028 re->flags |= PCRE2_LASTCASELESS;
9029 #endif
9030 }
9031 }
9032
9033 /* Check for a pattern that can match an empty string, so that this information
9034 can be provided to applications. */
9035
9036 do
9037 {
9038 int count = 0;
9039 int rc = could_be_empty_branch(codestart, code, utf, &cb, TRUE, NULL, &count);
9040 if (rc < 0)
9041 {
9042 errorcode = ERR86;
9043 goto HAD_ERROR;
9044 }
9045 if (rc > 0)
9046 {
9047 re->flags |= PCRE2_MATCH_EMPTY;
9048 break;
9049 }
9050 codestart += GET(codestart, 1);
9051 }
9052 while (*codestart == OP_ALT);
9053
9054 /* Finally, unless PCRE2_NO_START_OPTIMIZE is set, study the compiled pattern
9055 to set up information such as a bitmap of starting code units and a minimum
9056 matching length. */
9057
9058 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
9059 PRIV(study)(re) != 0)
9060 {
9061 errorcode = ERR31;
9062 goto HAD_ERROR;
9063 }
9064
9065 /* Control ends up here in all cases. If memory was obtained for a
9066 zero-terminated copy of the pattern, remember to free it before returning. Also
9067 free the list of named groups if a larger one had to be obtained, and likewise
9068 the group information vector. */
9069
9070 EXIT:
9071 if (copied_pattern != stack_copied_pattern)
9072 ccontext->memctl.free(copied_pattern, ccontext->memctl.memory_data);
9073 if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
9074 ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
9075 if (cb.groupinfo != c32workspace)
9076 ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
9077
9078 return re; /* Will be NULL after an error */
9079 }
9080
9081 /* End of pcre2_compile.c */
9082