1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2020 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45
46 #define NLBLOCK cb /* Block containing newline information */
47 #define PSSTART start_pattern /* Field containing processed string start */
48 #define PSEND end_pattern /* Field containing processed string end */
49
50 #include "pcre2_internal.h"
51
52 /* In rare error cases debugging might require calling pcre2_printint(). */
53
/* The block below is compiled out by "#if 0". Changing it to "#if 1" defines
PRINTABLE(c) (with a different character range when EBCDIC is defined),
includes pcre2_printint.c, and defines DEBUG_CALL_PRINTINT. */
54 #if 0
55 #ifdef EBCDIC
56 #define PRINTABLE(c) ((c) >= 64 && (c) < 255)
57 #else
58 #define PRINTABLE(c) ((c) >= 32 && (c) < 127)
59 #endif
60 #include "pcre2_printint.c"
61 #define DEBUG_CALL_PRINTINT
62 #endif
63
64 /* Other debugging code can be enabled by these defines. */
65
66 /* #define DEBUG_SHOW_CAPTURES */
67 /* #define DEBUG_SHOW_PARSED */
68
/* A few definitions depend on the code unit width. They are collected here as
macros so that the rest of the file needs fewer #if tests. */

/* XDIGIT(c) looks c up in xdigitab. With 8-bit code units the lookup is done
directly; for wider code units MAX_255(c) is tested first and 0xff is the
result when that test fails. */

#if PCRE2_CODE_UNIT_WIDTH == 8
#define XDIGIT(c) xdigitab[c]
#else
#define XDIGIT(c) (MAX_255(c)? xdigitab[c] : 0xff)
#endif

/* STRING_UTFn_RIGHTPAR expands to TWO comma-separated items: the
width-specific string macro followed by a length. */

#if PCRE2_CODE_UNIT_WIDTH == 8
#define STRING_UTFn_RIGHTPAR STRING_UTF8_RIGHTPAR, 5
#elif PCRE2_CODE_UNIT_WIDTH == 16
#define STRING_UTFn_RIGHTPAR STRING_UTF16_RIGHTPAR, 6
#else  /* 32-bit */
#define STRING_UTFn_RIGHTPAR STRING_UTF32_RIGHTPAR, 6
#endif
86
/* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which
consists of uint32_t elements. Assume that if uint32_t can't hold it, two of
them will be able to (i.e. assume a 64-bit world).

  PUTOFFSET(s,p)       store s at p and advance p past it
  GETOFFSET(s,p)       load s from p and advance p past it
  GETPLUSOFFSET(s,p)   load s from the element(s) following p[0] and leave p
                         pointing at the last element that was read
  READPLUSOFFSET(s,p)  load s from the element(s) following p[0]; p unchanged
  SKIPOFFSET(p)        advance p over one stored value
  SIZEOFFSET           number of uint32_t elements used by one stored value

Every argument is parenthesized in the expansions so that compound expressions
can be passed safely, and the multi-statement forms are wrapped in
do { } while (0) so that they behave as a single statement (for example in an
unbraced if/else). In the two-element forms the arguments are evaluated more
than once, so they must not have side effects. All of these macros are to be
used as statements, terminated by a semicolon. */

#if PCRE2_SIZE_MAX <= UINT32_MAX
#define PUTOFFSET(s,p)      (*(p)++ = (s))
#define GETOFFSET(s,p)      ((s) = *(p)++)
#define GETPLUSOFFSET(s,p)  ((s) = *(++(p)))
#define READPLUSOFFSET(s,p) ((s) = (p)[1])
#define SKIPOFFSET(p)       ((p)++)
#define SIZEOFFSET 1
#else
#define PUTOFFSET(s,p) \
  do { *(p)++ = (uint32_t)((s) >> 32); \
       *(p)++ = (uint32_t)((s) & 0xffffffff); } while (0)
#define GETOFFSET(s,p) \
  do { (s) = ((PCRE2_SIZE)(p)[0] << 32) | (PCRE2_SIZE)(p)[1]; \
       (p) += 2; } while (0)
#define GETPLUSOFFSET(s,p) \
  do { (s) = ((PCRE2_SIZE)(p)[1] << 32) | (PCRE2_SIZE)(p)[2]; \
       (p) += 2; } while (0)
#define READPLUSOFFSET(s,p) \
  do { (s) = ((PCRE2_SIZE)(p)[1] << 32) | (PCRE2_SIZE)(p)[2]; } while (0)
#define SKIPOFFSET(p)       ((p) += 2)
#define SIZEOFFSET 2
#endif
110
/* Macros for manipulating elements of the parsed pattern vector.

  META_CODE(x)    x with its low 16 bits cleared (the item code)
  META_DATA(x)    the low 16 bits of x (the item's data)
  META_DIFF(x,y)  the difference between x and y, shifted down by 16 bits

The arguments are parenthesized so that compound expressions such as
META_DATA(a | b) or META_DIFF(x, y + z) are handled correctly. */

#define META_CODE(x)   ((x) & 0xffff0000u)
#define META_DATA(x)   ((x) & 0x0000ffffu)
#define META_DIFF(x,y) (((x)-(y))>>16)
116
117 /* Forward declarations (prototypes only) to allow mutual recursion */
118
/* add_list_to_class_internal() is declared only when SUPPORT_UNICODE is
defined. */
119 #ifdef SUPPORT_UNICODE
120 static unsigned int
121 add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t,
122 compile_block *, const uint32_t *, unsigned int);
123 #endif
124
125 static int
126 compile_regex(uint32_t, PCRE2_UCHAR **, uint32_t **, int *, uint32_t,
127 uint32_t *, int32_t *, uint32_t *, int32_t *, branch_chain *,
128 compile_block *, PCRE2_SIZE *);
129
130 static int
131 get_branchlength(uint32_t **, int *, int *, parsed_recurse_check *,
132 compile_block *);
133
/* Same parameter list as get_branchlength(), but returns a BOOL. */
134 static BOOL
135 set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
136 compile_block *);
137
138 static int
139 check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
140 compile_block *);
141
142
143 /*************************************************
144 * Code parameters and static tables *
145 *************************************************/
146
/* Upper limits for group numbers and explicit repeat counts. REPEAT_UNLIMITED
is defined as one more than MAX_REPEAT_COUNT, that is, 65536. */
147 #define MAX_GROUP_NUMBER 65535u
148 #define MAX_REPEAT_COUNT 65535u
149 #define REPEAT_UNLIMITED (MAX_REPEAT_COUNT+1)
150
151 /* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
152 different ways in the different pattern scans. The parsing and group-
153 identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
154 aligned for this. Having defined the size in code units, we set up
155 C16_WORK_SIZE as the number of elements in the 16-bit vector.
156
157 During the first compiling phase, when determining how much memory is required,
158 the regex is partly compiled into this space, but the compiled parts are
159 discarded as soon as they can be, so that hopefully there will never be an
160 overrun. The code does, however, check for an overrun, which can occur for
161 pathological patterns. The size of the workspace depends on LINK_SIZE because
162 the length of compiled items varies with this.
163
164 In the real compile phase, this workspace is not currently used. */
165
166 #define COMPILE_WORK_SIZE (3000*LINK_SIZE) /* Size in code units */
167
/* The workspace size in bytes (code units times the size of a code unit)
divided by the size of a uint16_t. */
168 #define C16_WORK_SIZE \
169 ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))
170
171 /* A uint32_t vector is used for caching information about the size of
172 capturing groups, to improve performance. A default is created on the stack of
173 this size. */
174
175 #define GROUPINFO_DEFAULT_SIZE 256
176
177 /* The overrun tests check for a slightly smaller size so that they detect the
178 overrun before it actually does run off the end of the data block. */
179
/* NOTE(review): the unit of this margin is not stated here; presumably code
units, like COMPILE_WORK_SIZE - confirm at the use sites. */
180 #define WORK_SIZE_SAFETY_MARGIN (100)
181
182 /* This value determines the size of the initial vector that is used for
183 remembering named groups during the pre-compile. It is allocated on the stack,
184 but if it is too small, it is expanded, in a similar way to the workspace. The
185 value is the number of slots in the list. */
186
187 #define NAMED_GROUP_LIST_SIZE 20
188
189 /* The pre-compiling pass over the pattern creates a parsed pattern in a vector
190 of uint32_t. For short patterns this lives on the stack, with this size. Heap
191 memory is used for longer patterns. */
192
193 #define PARSED_PATTERN_DEFAULT_SIZE 1024
194
195 /* Maximum length value to check against when making sure that the variable
196 that holds the compiled pattern length does not overflow. We make it a bit less
197 than INT_MAX to allow for adding in group terminating code units, so that we
198 don't have to check them every time. */
199
200 #define OFLOW_MAX (INT_MAX - 20)
201
202 /* Code values for parsed patterns, which are stored in a vector of 32-bit
203 unsigned ints. Values less than META_END are literal data values. The coding
204 for identifying the item is in the top 16-bits, leaving 16 bits for the
205 additional data that some of them need. The META_CODE, META_DATA, and META_DIFF
206 macros are used to manipulate parsed pattern elements.
207
208 NOTE: When these definitions are changed, the table of extra lengths for each
209 code (meta_extra_lengths, just below) must be updated to remain in step. */
210
/* Every code defined here has the most significant bit set and zero in its low
16 bits. From META_END (0x8000 in the top half) to META_MINMAX_QUERY (0x803f)
the top-half values are consecutive, with no gaps. */
211 #define META_END 0x80000000u /* End of pattern */
212
213 #define META_ALT 0x80010000u /* alternation */
214 #define META_ATOMIC 0x80020000u /* atomic group */
215 #define META_BACKREF 0x80030000u /* Back ref */
216 #define META_BACKREF_BYNAME 0x80040000u /* \k'name' */
217 #define META_BIGVALUE 0x80050000u /* Next is a literal > META_END */
218 #define META_CALLOUT_NUMBER 0x80060000u /* (?C with numerical argument */
219 #define META_CALLOUT_STRING 0x80070000u /* (?C with string argument */
220 #define META_CAPTURE 0x80080000u /* Capturing parenthesis */
221 #define META_CIRCUMFLEX 0x80090000u /* ^ metacharacter */
222 #define META_CLASS 0x800a0000u /* start non-empty class */
223 #define META_CLASS_EMPTY 0x800b0000u /* empty class */
224 #define META_CLASS_EMPTY_NOT 0x800c0000u /* negative empty class */
225 #define META_CLASS_END 0x800d0000u /* end of non-empty class */
226 #define META_CLASS_NOT 0x800e0000u /* start non-empty negative class */
227 #define META_COND_ASSERT 0x800f0000u /* (?(?assertion)... */
228 #define META_COND_DEFINE 0x80100000u /* (?(DEFINE)... */
229 #define META_COND_NAME 0x80110000u /* (?(<name>)... */
230 #define META_COND_NUMBER 0x80120000u /* (?(digits)... */
231 #define META_COND_RNAME 0x80130000u /* (?(R&name)... */
232 #define META_COND_RNUMBER 0x80140000u /* (?(Rdigits)... */
233 #define META_COND_VERSION 0x80150000u /* (?(VERSION<op>x.y)... */
234 #define META_DOLLAR 0x80160000u /* $ metacharacter */
235 #define META_DOT 0x80170000u /* . metacharacter */
236 #define META_ESCAPE 0x80180000u /* \d and friends */
237 #define META_KET 0x80190000u /* closing parenthesis */
238 #define META_NOCAPTURE 0x801a0000u /* no capture parens */
239 #define META_OPTIONS 0x801b0000u /* (?i) and friends */
240 #define META_POSIX 0x801c0000u /* POSIX class item */
241 #define META_POSIX_NEG 0x801d0000u /* negative POSIX class item */
242 #define META_RANGE_ESCAPED 0x801e0000u /* range with at least one escape */
243 #define META_RANGE_LITERAL 0x801f0000u /* range defined literally */
244 #define META_RECURSE 0x80200000u /* Recursion */
245 #define META_RECURSE_BYNAME 0x80210000u /* (?&name) */
246 #define META_SCRIPT_RUN 0x80220000u /* (*script_run:...) */
247
248 /* These must be kept together to make it easy to check that an assertion
249 is present where expected in a conditional group. */
250
251 #define META_LOOKAHEAD 0x80230000u /* (?= */
252 #define META_LOOKAHEADNOT 0x80240000u /* (?! */
253 #define META_LOOKBEHIND 0x80250000u /* (?<= */
254 #define META_LOOKBEHINDNOT 0x80260000u /* (?<! */
255
256 /* These cannot be conditions */
257
258 #define META_LOOKAHEAD_NA 0x80270000u /* (*napla: */
259 #define META_LOOKBEHIND_NA 0x80280000u /* (*naplb: */
260
261 /* These must be kept in this order, with consecutive values, and the _ARG
262 versions of COMMIT, PRUNE, SKIP, and THEN immediately after their non-argument
263 versions. */
264
265 #define META_MARK 0x80290000u /* (*MARK) */
266 #define META_ACCEPT 0x802a0000u /* (*ACCEPT) */
267 #define META_FAIL 0x802b0000u /* (*FAIL) */
268 #define META_COMMIT 0x802c0000u /* These */
269 #define META_COMMIT_ARG 0x802d0000u /* pairs */
270 #define META_PRUNE 0x802e0000u /* must */
271 #define META_PRUNE_ARG 0x802f0000u /* be */
272 #define META_SKIP 0x80300000u /* kept */
273 #define META_SKIP_ARG 0x80310000u /* in */
274 #define META_THEN 0x80320000u /* this */
275 #define META_THEN_ARG 0x80330000u /* order */
276
277 /* These must be kept in groups of adjacent 3 values, and all together. */
278
/* Within each group of three the order is: the plain quantifier, then the form
followed by '+', then the form followed by '?'. */
279 #define META_ASTERISK 0x80340000u /* * */
280 #define META_ASTERISK_PLUS 0x80350000u /* *+ */
281 #define META_ASTERISK_QUERY 0x80360000u /* *? */
282 #define META_PLUS 0x80370000u /* + */
283 #define META_PLUS_PLUS 0x80380000u /* ++ */
284 #define META_PLUS_QUERY 0x80390000u /* +? */
285 #define META_QUERY 0x803a0000u /* ? */
286 #define META_QUERY_PLUS 0x803b0000u /* ?+ */
287 #define META_QUERY_QUERY 0x803c0000u /* ?? */
288 #define META_MINMAX 0x803d0000u /* {n,m} repeat */
289 #define META_MINMAX_PLUS 0x803e0000u /* {n,m}+ repeat */
290 #define META_MINMAX_QUERY 0x803f0000u /* {n,m}? repeat */
291
/* First and last codes of the quantifier range defined just above. */
292 #define META_FIRST_QUANTIFIER META_ASTERISK
293 #define META_LAST_QUANTIFIER META_MINMAX_QUERY
294
295 /* This is a special "meta code" that is used only to distinguish (*asr: from
296 (*sr: in the table of alphabetic assertions. It is never stored in the parsed
297 pattern because (*asr: is turned into (*sr:(*atomic: at that stage. There is
298 therefore no need for it to have a length entry, so use a high value. */
299
300 #define META_ATOMIC_SCRIPT_RUN 0x8fff0000u
301
302 /* Table of extra lengths for each of the meta codes. Must be kept in step with
303 the definitions above. For some items these values are a basic length to which
304 a variable amount has to be added. */
305
/* There are 64 entries, one per code from META_END to META_MINMAX_QUERY, in
the order of the code values. Entries written in terms of SIZEOFFSET depend on
how many uint32_t elements one stored offset occupies.
NOTE(review): presumably indexed by a code's top 16 bits relative to META_END
(i.e. (code - META_END) >> 16) - confirm at the use sites. */
306 static unsigned char meta_extra_lengths[] = {
307 0, /* META_END */
308 0, /* META_ALT */
309 0, /* META_ATOMIC */
310 0, /* META_BACKREF - more if group is >= 10 */
311 1+SIZEOFFSET, /* META_BACKREF_BYNAME */
312 1, /* META_BIGVALUE */
313 3, /* META_CALLOUT_NUMBER */
314 3+SIZEOFFSET, /* META_CALLOUT_STRING */
315 0, /* META_CAPTURE */
316 0, /* META_CIRCUMFLEX */
317 0, /* META_CLASS */
318 0, /* META_CLASS_EMPTY */
319 0, /* META_CLASS_EMPTY_NOT */
320 0, /* META_CLASS_END */
321 0, /* META_CLASS_NOT */
322 0, /* META_COND_ASSERT */
323 SIZEOFFSET, /* META_COND_DEFINE */
324 1+SIZEOFFSET, /* META_COND_NAME */
325 1+SIZEOFFSET, /* META_COND_NUMBER */
326 1+SIZEOFFSET, /* META_COND_RNAME */
327 1+SIZEOFFSET, /* META_COND_RNUMBER */
328 3, /* META_COND_VERSION */
329 0, /* META_DOLLAR */
330 0, /* META_DOT */
331 0, /* META_ESCAPE - more for ESC_P, ESC_p, ESC_g, ESC_k */
332 0, /* META_KET */
333 0, /* META_NOCAPTURE */
334 1, /* META_OPTIONS */
335 1, /* META_POSIX */
336 1, /* META_POSIX_NEG */
337 0, /* META_RANGE_ESCAPED */
338 0, /* META_RANGE_LITERAL */
339 SIZEOFFSET, /* META_RECURSE */
340 1+SIZEOFFSET, /* META_RECURSE_BYNAME */
341 0, /* META_SCRIPT_RUN */
342 0, /* META_LOOKAHEAD */
343 0, /* META_LOOKAHEADNOT */
344 SIZEOFFSET, /* META_LOOKBEHIND */
345 SIZEOFFSET, /* META_LOOKBEHINDNOT */
346 0, /* META_LOOKAHEAD_NA */
347 SIZEOFFSET, /* META_LOOKBEHIND_NA */
348 1, /* META_MARK - plus the string length */
349 0, /* META_ACCEPT */
350 0, /* META_FAIL */
351 0, /* META_COMMIT */
352 1, /* META_COMMIT_ARG - plus the string length */
353 0, /* META_PRUNE */
354 1, /* META_PRUNE_ARG - plus the string length */
355 0, /* META_SKIP */
356 1, /* META_SKIP_ARG - plus the string length */
357 0, /* META_THEN */
358 1, /* META_THEN_ARG - plus the string length */
359 0, /* META_ASTERISK */
360 0, /* META_ASTERISK_PLUS */
361 0, /* META_ASTERISK_QUERY */
362 0, /* META_PLUS */
363 0, /* META_PLUS_PLUS */
364 0, /* META_PLUS_QUERY */
365 0, /* META_QUERY */
366 0, /* META_QUERY_PLUS */
367 0, /* META_QUERY_QUERY */
368 2, /* META_MINMAX */
369 2, /* META_MINMAX_PLUS */
370 2 /* META_MINMAX_QUERY */
371 };
372
/* The kinds of skip that can be requested when stepping over part of a parsed
pattern. The values are 0, 1, 2 in the order listed. */

enum {
  PSKIP_ALT,
  PSKIP_CLASS,
  PSKIP_KET
};
376
/* Macro for setting individual bits in class bitmaps: SETBIT(a,b) sets bit
number (b & 7) in byte a[b/8]. It took some experimenting to figure out how to
stop gcc 5.3.0 from warning with -Wconversion. This version gets a warning:

  #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1u << ((b)&7))

Let's hope the apparently less efficient version isn't actually so bad if the
compiler is clever with identical subexpressions.

Both arguments and the whole expansion are parenthesized, so that a pointer
expression such as "map + 4" can be passed as the first argument. Each argument
is evaluated more than once, so neither may have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] = (uint8_t)((a)[(b)/8] | (1u << ((b)&7))))
387
388 /* Private flags added to firstcu and reqcu. */
389
/* REQ_CASELESS and REQ_VARY are single-bit values (bits 0 and 1); REQ_UNSET and
REQ_NONE are negative whole values rather than bits. */
390 #define REQ_CASELESS (1u << 0) /* Indicates caselessness */
391 #define REQ_VARY (1u << 1) /* reqcu followed non-literal item */
392 /* Negative values for the firstcu and reqcu flags */
393 #define REQ_UNSET (-2) /* Not yet found anything */
394 #define REQ_NONE (-1) /* Found not fixed char */
395
396 /* These flags are used in the groupinfo vector. */
397
/* The two flags are the top two bits of a 32-bit value; the mask selects the
low 16 bits. */
398 #define GI_SET_FIXED_LENGTH 0x80000000u
399 #define GI_NOT_FIXED_LENGTH 0x40000000u
400 #define GI_FIXED_LENGTH_MASK 0x0000ffffu
401
402 /* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
403 and is fast (a good compiler can turn it into a subtraction and unsigned
404 comparison). */
405
/* The argument is evaluated twice, so it must not have side effects. */
406 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
407
408 /* Table to identify hex digits. The tables in chartables are dependent on the
409 locale, and may mark arbitrary characters as digits. We want to recognize only
410 0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
411 costs 256 bytes, but it is a lot faster than doing character value tests (at
412 least in some simple cases I timed), and in some applications one wants PCRE2
413 to compile efficiently as well as match efficiently. The value in the table is
414 the binary hex digit value, or 0xff for non-hex digits. */
415
416 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
417 UTF-8 mode. */
418
419 #ifndef EBCDIC
/* 256 entries, one per byte value, laid out 16 to a row; the comment on each
row gives the index of its first entry. Only '0'-'9' (0x30-0x39), 'A'-'F'
(0x41-0x46) and 'a'-'f' (0x61-0x66) hold a digit value; every other entry is
0xff. */

static const uint8_t xdigitab[] =
  {
  /* 0x00 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x10 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x20 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x30 */ 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x40 */ 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x50 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x60 */ 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x70 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x80 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x90 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xa0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xb0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xc0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xd0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xe0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xf0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
  };
454
455 #else
456
457 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
458
/* 256 entries, one per byte value, laid out 16 to a row; the comment on each
row gives the index of its first entry. Only 0x81-0x86 (a-f), 0xC1-0xC6 (A-F)
and 0xF0-0xF9 (0-9) hold a digit value; every other entry is 0xff. */

static const uint8_t xdigitab[] =
  {
  /* 0x00 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x10 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x20 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x30 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x40 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x50 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x60 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x70 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x80 */ 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x90 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xa0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xb0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xc0 */ 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xd0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xe0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xf0 */ 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff
  };
493 #endif /* EBCDIC */
494
495
496 /* Table for handling alphanumeric escaped characters. Positive returns are
497 simple data values; negative values are for special things like \d and so on.
498 Zero means further processing is needed (for things like \x), or the escape is
499 invalid. */
500
501 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
502 in UTF-8 mode. It runs from '0' to 'z'. */
503
504 #ifndef EBCDIC
/* ESCAPES_FIRST and ESCAPES_LAST are the characters for the first and last
entries of the escapes table ('0' and 'z'). UPPER_CASE(c) subtracts 32 from c;
its argument is parenthesized so that a compound expression can be passed. */
#define ESCAPES_FIRST CHAR_0
#define ESCAPES_LAST CHAR_z
#define UPPER_CASE(c) ((c)-32)
508
/* The first entry is for '0' (ESCAPES_FIRST) and the last for 'z'
(ESCAPES_LAST), 75 entries in all. The comment at the end of each row names the
characters that the row's entries are for. */
509 static const short int escapes[] = {
510 0, 0, /* 0 1 */
511 0, 0, /* 2 3 */
512 0, 0, /* 4 5 */
513 0, 0, /* 6 7 */
514 0, 0, /* 8 9 */
515 CHAR_COLON, CHAR_SEMICOLON, /* : ; */
516 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, /* < = */
517 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, /* > ? */
518 CHAR_COMMERCIAL_AT, -ESC_A, /* @ A */
519 -ESC_B, -ESC_C, /* B C */
520 -ESC_D, -ESC_E, /* D E */
521 0, -ESC_G, /* F G */
522 -ESC_H, 0, /* H I */
523 0, -ESC_K, /* J K */
524 0, 0, /* L M */
525 -ESC_N, 0, /* N O */
526 -ESC_P, -ESC_Q, /* P Q */
527 -ESC_R, -ESC_S, /* R S */
528 0, 0, /* T U */
529 -ESC_V, -ESC_W, /* V W */
530 -ESC_X, 0, /* X Y */
531 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, /* Z [ */
532 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, /* backslash ] */
533 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, /* ^ _ */
534 CHAR_GRAVE_ACCENT, CHAR_BEL, /* ` a */
535 -ESC_b, 0, /* b c */
536 -ESC_d, CHAR_ESC, /* d e */
537 CHAR_FF, 0, /* f g */
538 -ESC_h, 0, /* h i */
539 0, -ESC_k, /* j k */
540 0, 0, /* l m */
541 CHAR_LF, 0, /* n o */
542 -ESC_p, 0, /* p q */
543 CHAR_CR, -ESC_s, /* r s */
544 CHAR_HT, 0, /* t u */
545 -ESC_v, -ESC_w, /* v w */
546 0, 0, /* x y */
547 -ESC_z /* z */
548 };
549
550 #else
551
552 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
553 It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
554 is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
555 because it is defined as 'a', which of course picks up the ASCII value. */
556
/* ESCAPES_FIRST and ESCAPES_LAST are the EBCDIC codes for 'a' (0x81) and '9'
(0xf9). In a real EBCDIC environment the CHAR_ macros already have those
values; when the EBCDIC code is being tested on an ASCII system the codes are
written out explicitly. UPPER_CASE(c) adds 64 in the first case and subtracts
32 in the second; its argument is parenthesized so that a compound expression
can be passed. */
#if 'a' == 0x81 /* Check for a real EBCDIC environment */
#define ESCAPES_FIRST CHAR_a
#define ESCAPES_LAST CHAR_9
#define UPPER_CASE(c) ((c)+64)
#else /* Testing in an ASCII environment */
#define ESCAPES_FIRST ((unsigned char)'\x81') /* EBCDIC 'a' */
#define ESCAPES_LAST ((unsigned char)'\xf9') /* EBCDIC '9' */
#define UPPER_CASE(c) ((c)-32)
#endif
566
/* The comment at the start of each row is the EBCDIC code (in hex) of the
row's position. The table begins at 0x81 (ESCAPES_FIRST), not 0x80, which is
why the row labelled 80 holds only seven entries (0x81-0x87). The final row
holds the entries for 0xF8 and 0xF9 (ESCAPES_LAST), giving 121 entries in
all. */
567 static const short int escapes[] = {
568 /* 80 */ CHAR_BEL, -ESC_b, 0, -ESC_d, CHAR_ESC, CHAR_FF, 0,
569 /* 88 */ -ESC_h, 0, 0, '{', 0, 0, 0, 0,
570 /* 90 */ 0, 0, -ESC_k, 0, 0, CHAR_LF, 0, -ESC_p,
571 /* 98 */ 0, CHAR_CR, 0, '}', 0, 0, 0, 0,
572 /* A0 */ 0, '~', -ESC_s, CHAR_HT, 0, -ESC_v, -ESC_w, 0,
573 /* A8 */ 0, -ESC_z, 0, 0, 0, '[', 0, 0,
574 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
575 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
576 /* C0 */ '{', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G,
577 /* C8 */ -ESC_H, 0, 0, 0, 0, 0, 0, 0,
578 /* D0 */ '}', 0, -ESC_K, 0, 0, -ESC_N, 0, -ESC_P,
579 /* D8 */ -ESC_Q, -ESC_R, 0, 0, 0, 0, 0, 0,
580 /* E0 */ '\\', 0, -ESC_S, 0, 0, -ESC_V, -ESC_W, -ESC_X,
581 /* E8 */ 0, -ESC_Z, 0, 0, 0, 0, 0, 0,
582 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
583 /* F8 */ 0, 0
584 };
585
586 /* We also need a table of characters that may follow \c in an EBCDIC
587 environment for characters 0-31. */
588
/* The string holds 32 characters ('@', 'A'-'Z', then [ \ ] ^ _) followed by
the terminating NUL.
NOTE(review): presumably the position of a character in this string is the
control character value it produces - confirm at the use site. */
589 static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
590
591 #endif /* EBCDIC */
592
593
594 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
595 searched linearly. Put all the names into a single string, in order to reduce
596 the number of relocations when a shared library is dynamically linked. The
597 string is built from string macros so that it works in UTF-8 mode on EBCDIC
598 platforms. */
599
/* Descriptor for one entry in the table of verbs. */

typedef struct verbitem {
  /* Length of verb name */
  unsigned int len;

  /* Base META_ code */
  uint32_t meta;

  /* Argument requirement */
  int has_arg;
} verbitem;
605
/* The names are concatenated into one string, each terminated by a zero
character. The order is the same as the order of the entries in verbs[]
below; the leading "\0" is the empty name. */
606 static const char verbnames[] =
607 "\0" /* Empty name is a shorthand for MARK */
608 STRING_MARK0
609 STRING_ACCEPT0
610 STRING_F0
611 STRING_FAIL0
612 STRING_COMMIT0
613 STRING_PRUNE0
614 STRING_SKIP0
615 STRING_THEN;
616
/* One entry per name in verbnames, in the same order: name length, base META
code, and argument requirement. The three kinds of argument requirement
(positive, negative, zero) are described by the comments on the entries. */
617 static const verbitem verbs[] = {
618 { 0, META_MARK, +1 }, /* > 0 => must have an argument */
619 { 4, META_MARK, +1 },
620 { 6, META_ACCEPT, -1 }, /* < 0 => Optional argument, convert to pre-MARK */
621 { 1, META_FAIL, -1 },
622 { 4, META_FAIL, -1 },
623 { 6, META_COMMIT, 0 },
624 { 5, META_PRUNE, 0 }, /* Optional argument; bump META code if found */
625 { 4, META_SKIP, 0 },
626 { 4, META_THEN, 0 }
627 };
628
/* Number of entries in verbs[]. */
629 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
630
631 /* Verb opcodes, indexed by their META code offset from META_MARK. */
632
/* Eleven entries, in the order of the META codes from META_MARK to
META_THEN_ARG. */
633 static const uint32_t verbops[] = {
634 OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
635 OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
636
637 /* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */
638
/* Descriptor for one entry in the table of "alpha assertions". */

typedef struct alasitem {
  /* Length of name */
  unsigned int len;

  /* Base META_ code */
  uint32_t meta;
} alasitem;
643
/* The names are concatenated into one string, in the same order as the entries
in alasmeta[] below. The short names come first, followed by the long forms. */
644 static const char alasnames[] =
645 STRING_pla0
646 STRING_plb0
647 STRING_napla0
648 STRING_naplb0
649 STRING_nla0
650 STRING_nlb0
651 STRING_positive_lookahead0
652 STRING_positive_lookbehind0
653 STRING_non_atomic_positive_lookahead0
654 STRING_non_atomic_positive_lookbehind0
655 STRING_negative_lookahead0
656 STRING_negative_lookbehind0
657 STRING_atomic0
658 STRING_sr0
659 STRING_asr0
660 STRING_script_run0
661 STRING_atomic_script_run;
662
/* One entry per name in alasnames, in the same order: the length of the name
and the META code it maps to. A short name and its long form map to the same
META code. */
663 static const alasitem alasmeta[] = {
664 { 3, META_LOOKAHEAD },
665 { 3, META_LOOKBEHIND },
666 { 5, META_LOOKAHEAD_NA },
667 { 5, META_LOOKBEHIND_NA },
668 { 3, META_LOOKAHEADNOT },
669 { 3, META_LOOKBEHINDNOT },
670 { 18, META_LOOKAHEAD },
671 { 19, META_LOOKBEHIND },
672 { 29, META_LOOKAHEAD_NA },
673 { 30, META_LOOKBEHIND_NA },
674 { 18, META_LOOKAHEADNOT },
675 { 19, META_LOOKBEHINDNOT },
676 { 6, META_ATOMIC },
677 { 2, META_SCRIPT_RUN }, /* sr = script run */
678 { 3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
679 { 10, META_SCRIPT_RUN }, /* script run */
680 { 17, META_ATOMIC_SCRIPT_RUN } /* atomic script run */
681 };
682
/* Number of entries in alasmeta[]. */
683 static const int alascount = sizeof(alasmeta)/sizeof(alasitem);
684
685 /* Offsets from OP_STAR for case-independent and negative repeat opcodes. */
686
/* Four entries, each the distance from OP_STAR to the first opcode of one
family: OP_STAR itself (so the first entry is zero), OP_STARI, OP_NOTSTAR and
OP_NOTSTARI. */
687 static uint32_t chartypeoffset[] = {
688 OP_STAR - OP_STAR, OP_STARI - OP_STAR,
689 OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
690
691 /* Tables of names of POSIX character classes and their lengths. The names are
692 now all in a single string, to reduce the number of relocations when a shared
693 library is dynamically loaded. The list of lengths is terminated by a zero
694 length entry. The first three must be alpha, lower, upper, as this is assumed
695 for handling case independence. The indices for graph, print, and punct are
696 needed, so identify them. */
697
698 static const char posix_names[] =
699 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
700 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
701 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
702 STRING_word0 STRING_xdigit;
703
/* One length per name above, in the same order (fourteen names), followed by
the terminating zero. */
704 static const uint8_t posix_name_lengths[] = {
705 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
706
/* Zero-based positions of graph, print and punct in the list of names. */
707 #define PC_GRAPH 8
708 #define PC_PRINT 9
709 #define PC_PUNCT 10
710
711 /* Table of class bit maps for each POSIX class. Each class is formed from a
712 base map, with an optional addition or removal of another map. Then, for some
713 classes, there is some additional tweaking: for [:blank:] the vertical space
714 characters are removed, and for [:alpha:] and [:alnum:] the underscore
715 character is removed. The triples in the table consist of the base map offset,
716 second map offset or -1 if no second map, and a non-negative value for map
717 addition or a negative value for map subtraction (if there are two maps). The
718 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
719 remove vertical space characters, 2 => remove underscore. */
720
/* The triples are in the same order as the names in posix_names. Example: the
first triple (alpha) starts from the word map, subtracts the digit map (the
third field is negative), and removes underscore (its absolute value is 2). */
721 static const int posix_class_maps[] = {
722 cbit_word, cbit_digit, -2, /* alpha */
723 cbit_lower, -1, 0, /* lower */
724 cbit_upper, -1, 0, /* upper */
725 cbit_word, -1, 2, /* alnum - word without underscore */
726 cbit_print, cbit_cntrl, 0, /* ascii */
727 cbit_space, -1, 1, /* blank - a GNU extension */
728 cbit_cntrl, -1, 0, /* cntrl */
729 cbit_digit, -1, 0, /* digit */
730 cbit_graph, -1, 0, /* graph */
731 cbit_print, -1, 0, /* print */
732 cbit_punct, -1, 0, /* punct */
733 cbit_space, -1, 0, /* space */
734 cbit_word, -1, 0, /* word - a Perl extension */
735 cbit_xdigit,-1, 0 /* xdigit */
736 };
737
738 #ifdef SUPPORT_UNICODE
739
740 /* The POSIX class Unicode property substitutes that are used in UCP mode must
741 be in the order of the POSIX class names, defined above. */
742
/* Two values per class. A first value of -1 marks the classes that are not
replaced by a property test (ascii, blank, xdigit); see the comments on those
entries.
NOTE(review): for the other classes the pair is presumably a property type
followed by its value - confirm at the use site. */
743 static int posix_substitutes[] = {
744 PT_GC, ucp_L, /* alpha */
745 PT_PC, ucp_Ll, /* lower */
746 PT_PC, ucp_Lu, /* upper */
747 PT_ALNUM, 0, /* alnum */
748 -1, 0, /* ascii, treat as non-UCP */
749 -1, 1, /* blank, treat as \h */
750 PT_PC, ucp_Cc, /* cntrl */
751 PT_PC, ucp_Nd, /* digit */
752 PT_PXGRAPH, 0, /* graph */
753 PT_PXPRINT, 0, /* print */
754 PT_PXPUNCT, 0, /* punct */
755 PT_PXSPACE, 0, /* space */ /* Xps is POSIX space, but from 8.34 */
756 PT_WORD, 0, /* word */ /* Perl and POSIX space are the same */
757 -1, 0 /* xdigit, treat as non-UCP */
758 };
/* Number of value pairs in posix_substitutes. The element size is taken from
the array itself, so the count does not depend on the array's element type
having the same size as uint32_t. */
#define POSIX_SUBSIZE \
  (sizeof(posix_substitutes) / (2*sizeof(posix_substitutes[0])))
760 #endif /* SUPPORT_UNICODE */
761
762 /* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset
763 are allowed. */
764
/* The subset of option bits that is accepted together with PCRE2_LITERAL. */
765 #define PUBLIC_LITERAL_COMPILE_OPTIONS \
766 (PCRE2_ANCHORED|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_ENDANCHORED| \
767 PCRE2_FIRSTLINE|PCRE2_LITERAL|PCRE2_MATCH_INVALID_UTF| \
768 PCRE2_NO_START_OPTIMIZE|PCRE2_NO_UTF_CHECK|PCRE2_USE_OFFSET_LIMIT|PCRE2_UTF)
769
/* The full set: every bit of the literal subset plus the options listed
here. */
770 #define PUBLIC_COMPILE_OPTIONS \
771 (PUBLIC_LITERAL_COMPILE_OPTIONS| \
772 PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \
773 PCRE2_ALT_VERBNAMES|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL|PCRE2_DUPNAMES| \
774 PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MATCH_UNSET_BACKREF| \
775 PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \
776 PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \
777 PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY)
778
/* The same pair of masks for the PCRE2_EXTRA_ option bits: a literal subset,
and a full set that includes it. */
779 #define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
780 (PCRE2_EXTRA_MATCH_LINE|PCRE2_EXTRA_MATCH_WORD)
781
782 #define PUBLIC_COMPILE_EXTRA_OPTIONS \
783 (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
784 PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
785 PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX)
786
787 /* Compile time error code numbers. They are given names so that they can more
788 easily be tracked. When a new number is added, the tables called eint1 and
789 eint2 in pcre2posix.c may need to be updated, and a new error text must be
790 added to compile_error_texts in pcre2_error.c. */
791
792 enum { ERR0 = COMPILE_ERROR_BASE,
793 ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10,
794 ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
795 ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
796 ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
797 ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
798 ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
799 ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
800 ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
801 ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
802 ERR91, ERR92, ERR93, ERR94, ERR95, ERR96, ERR97, ERR98 };
803
804 /* This is a table of start-of-pattern options such as (*UTF) and settings such
805 as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
806 compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
807 generic and always supported. */
808
/* Kinds of start-of-pattern item; each entry in pso_list carries one of these
in its type field. */

enum {
  PSO_OPT = 0,   /* Value is an option bit */
  PSO_FLG,       /* Value is a flag bit */
  PSO_NL,        /* Value is a newline type */
  PSO_BSR,       /* Value is a \R type */
  PSO_LIMH,      /* Read integer value for heap limit */
  PSO_LIMM,      /* Read integer value for match limit */
  PSO_LIMD       /* Read integer value for depth limit */
};
816
/* One start-of-pattern item, as stored in pso_list. */

typedef struct pso {
  const uint8_t *name;    /* Item text (one of the STRING_xxx macros) */
  uint16_t length;        /* Length that goes with name */
  uint16_t type;          /* One of the PSO_xxx kinds */
  uint32_t value;         /* Option bit, flag bit, newline type or \R type,
                             according to type; zero for the limit items */
} pso;
823
824 /* NB: STRING_UTFn_RIGHTPAR contains the length as well */
825
826 static pso pso_list[] = {
827 { (uint8_t *)STRING_UTFn_RIGHTPAR, PSO_OPT, PCRE2_UTF },
828 { (uint8_t *)STRING_UTF_RIGHTPAR, 4, PSO_OPT, PCRE2_UTF },
829 { (uint8_t *)STRING_UCP_RIGHTPAR, 4, PSO_OPT, PCRE2_UCP },
830 { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR, 9, PSO_FLG, PCRE2_NOTEMPTY_SET },
831 { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR, 17, PSO_FLG, PCRE2_NE_ATST_SET },
832 { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR, 16, PSO_OPT, PCRE2_NO_AUTO_POSSESS },
833 { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
834 { (uint8_t *)STRING_NO_JIT_RIGHTPAR, 7, PSO_FLG, PCRE2_NOJIT },
835 { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR, 13, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
836 { (uint8_t *)STRING_LIMIT_HEAP_EQ, 11, PSO_LIMH, 0 },
837 { (uint8_t *)STRING_LIMIT_MATCH_EQ, 12, PSO_LIMM, 0 },
838 { (uint8_t *)STRING_LIMIT_DEPTH_EQ, 12, PSO_LIMD, 0 },
839 { (uint8_t *)STRING_LIMIT_RECURSION_EQ, 16, PSO_LIMD, 0 },
840 { (uint8_t *)STRING_CR_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_CR },
841 { (uint8_t *)STRING_LF_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_LF },
842 { (uint8_t *)STRING_CRLF_RIGHTPAR, 5, PSO_NL, PCRE2_NEWLINE_CRLF },
843 { (uint8_t *)STRING_ANY_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_ANY },
844 { (uint8_t *)STRING_NUL_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_NUL },
845 { (uint8_t *)STRING_ANYCRLF_RIGHTPAR, 8, PSO_NL, PCRE2_NEWLINE_ANYCRLF },
846 { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_ANYCRLF },
847 { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_UNICODE }
848 };
849
850 /* This table is used when converting repeating opcodes into possessified
851 versions as a result of an explicit possessive quantifier such as ++. A zero
852 value means there is no possessified version - in those cases the item in
853 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
854 because all relevant opcodes are less than that. */
855
856 static const uint8_t opcode_possessify[] = {
857 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */
858 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */
859
860 0, /* NOTI */
861 OP_POSSTAR, 0, /* STAR, MINSTAR */
862 OP_POSPLUS, 0, /* PLUS, MINPLUS */
863 OP_POSQUERY, 0, /* QUERY, MINQUERY */
864 OP_POSUPTO, 0, /* UPTO, MINUPTO */
865 0, /* EXACT */
866 0, 0, 0, 0, /* POS{STAR,PLUS,QUERY,UPTO} */
867
868 OP_POSSTARI, 0, /* STARI, MINSTARI */
869 OP_POSPLUSI, 0, /* PLUSI, MINPLUSI */
870 OP_POSQUERYI, 0, /* QUERYI, MINQUERYI */
871 OP_POSUPTOI, 0, /* UPTOI, MINUPTOI */
872 0, /* EXACTI */
873 0, 0, 0, 0, /* POS{STARI,PLUSI,QUERYI,UPTOI} */
874
875 OP_NOTPOSSTAR, 0, /* NOTSTAR, NOTMINSTAR */
876 OP_NOTPOSPLUS, 0, /* NOTPLUS, NOTMINPLUS */
877 OP_NOTPOSQUERY, 0, /* NOTQUERY, NOTMINQUERY */
878 OP_NOTPOSUPTO, 0, /* NOTUPTO, NOTMINUPTO */
879 0, /* NOTEXACT */
880 0, 0, 0, 0, /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
881
882 OP_NOTPOSSTARI, 0, /* NOTSTARI, NOTMINSTARI */
883 OP_NOTPOSPLUSI, 0, /* NOTPLUSI, NOTMINPLUSI */
884 OP_NOTPOSQUERYI, 0, /* NOTQUERYI, NOTMINQUERYI */
885 OP_NOTPOSUPTOI, 0, /* NOTUPTOI, NOTMINUPTOI */
886 0, /* NOTEXACTI */
887 0, 0, 0, 0, /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
888
889 OP_TYPEPOSSTAR, 0, /* TYPESTAR, TYPEMINSTAR */
890 OP_TYPEPOSPLUS, 0, /* TYPEPLUS, TYPEMINPLUS */
891 OP_TYPEPOSQUERY, 0, /* TYPEQUERY, TYPEMINQUERY */
892 OP_TYPEPOSUPTO, 0, /* TYPEUPTO, TYPEMINUPTO */
893 0, /* TYPEEXACT */
894 0, 0, 0, 0, /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
895
896 OP_CRPOSSTAR, 0, /* CRSTAR, CRMINSTAR */
897 OP_CRPOSPLUS, 0, /* CRPLUS, CRMINPLUS */
898 OP_CRPOSQUERY, 0, /* CRQUERY, CRMINQUERY */
899 OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */
900 0, 0, 0, 0, /* CRPOS{STAR,PLUS,QUERY,RANGE} */
901
902 0, 0, 0, /* CLASS, NCLASS, XCLASS */
903 0, 0, /* REF, REFI */
904 0, 0, /* DNREF, DNREFI */
905 0, 0 /* RECURSE, CALLOUT */
906 };
907
908
909 #ifdef DEBUG_SHOW_PARSED
910 /*************************************************
911 * Show the parsed pattern for debugging *
912 *************************************************/
913
914 /* For debugging the pre-scan, this code, which outputs the parsed data vector,
915 can be enabled. */
916
/* Write the parsed pattern vector to stderr, one item per line. The dump stops
at META_END, or at the first META code that is not recognized. Unsigned values
are printed with unsigned conversions, and every argument is cast to the exact
type its conversion expects.

Argument:  cb    compile block; its parsed_pattern vector is dumped
Returns:   nothing
*/

static void show_parsed(compile_block *cb)
{
uint32_t *pptr = cb->parsed_pattern;

for (;;)
  {
  PCRE2_SIZE offset;
  uint32_t i;
  uint32_t length;
  uint32_t min, max;
  const char *suffix;
  uint32_t meta_arg = META_DATA(*pptr);

  fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern),
    (unsigned int)*pptr);

  /* Values below META_END are not META items. Only those in the printable
  ASCII range are echoed as a character after the hex value. */

  if (*pptr < META_END)
    {
    if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", (int)*pptr);
    pptr++;
    }

  else switch (META_CODE(*pptr++))
    {
    default:
    fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
    return;

    case META_END:
    fprintf(stderr, "META_END\n");
    return;

    case META_CAPTURE:
    fprintf(stderr, "META_CAPTURE %u", (unsigned int)meta_arg);
    break;

    case META_RECURSE:
    GETOFFSET(offset, pptr);
    fprintf(stderr, "META_RECURSE %u %zu", (unsigned int)meta_arg,
      (size_t)offset);
    break;

    /* For references below 10 the offset comes from the compile block rather
    than from the parsed vector. */

    case META_BACKREF:
    if (meta_arg < 10)
      offset = cb->small_ref_offset[meta_arg];
    else
      GETOFFSET(offset, pptr);
    fprintf(stderr, "META_BACKREF %u %zu", (unsigned int)meta_arg,
      (size_t)offset);
    break;

    case META_ESCAPE:
    if (meta_arg == ESC_P || meta_arg == ESC_p)
      {
      uint32_t ptype = *pptr >> 16;
      uint32_t pvalue = *pptr++ & 0xffff;
      fprintf(stderr, "META \\%c %u %u", (meta_arg == ESC_P)? 'P':'p',
        (unsigned int)ptype, (unsigned int)pvalue);
      }
    else
      {
      uint32_t cc;

      /* There's just one escape we might have here that isn't negated in the
      escapes table. For the others, search the table for the letter. */

      if (meta_arg == ESC_g) cc = CHAR_g;
      else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
        {
        if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
        }
      if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
      fprintf(stderr, "META \\%c", (int)cc);
      }
    break;

    /* The three {min,max} forms differ only in the trailing character. */

    case META_MINMAX:
    suffix = "";
    goto SHOWMINMAX;

    case META_MINMAX_QUERY:
    suffix = "?";
    goto SHOWMINMAX;

    case META_MINMAX_PLUS:
    suffix = "+";
    SHOWMINMAX:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%u,%u}%s", (unsigned int)min, (unsigned int)max,
        suffix);
    else
      fprintf(stderr, "META {%u,}%s", (unsigned int)min, suffix);
    break;

    case META_BIGVALUE:
    fprintf(stderr, "META_BIGVALUE %.8x", (unsigned int)*pptr++);
    break;

    case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
    case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
    case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
    case META_DOT: fprintf(stderr, "META_DOT"); break;
    case META_ASTERISK: fprintf(stderr, "META *"); break;
    case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
    case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
    case META_PLUS: fprintf(stderr, "META +"); break;
    case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
    case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
    case META_QUERY: fprintf(stderr, "META ?"); break;
    case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
    case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;

    case META_ATOMIC: fprintf(stderr, "META (?>"); break;
    case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
    case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
    case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
    case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break;
    case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
    case META_KET: fprintf(stderr, "META )"); break;

    case META_ALT:
    fprintf(stderr, "META | %u", (unsigned int)meta_arg);
    break;

    case META_CLASS: fprintf(stderr, "META ["); break;
    case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
    case META_CLASS_END: fprintf(stderr, "META ]"); break;
    case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
    case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;

    case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
    case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;

    case META_POSIX:
    fprintf(stderr, "META_POSIX %u", (unsigned int)*pptr++);
    break;

    case META_POSIX_NEG:
    fprintf(stderr, "META_POSIX_NEG %u", (unsigned int)*pptr++);
    break;

    case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
    case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
    case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
    case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
    case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
    case META_THEN: fprintf(stderr, "META (*THEN)"); break;

    case META_OPTIONS:
    fprintf(stderr, "META_OPTIONS 0x%02x", (unsigned int)*pptr++);
    break;

    case META_LOOKBEHIND:
    fprintf(stderr, "META (?<= %u offset=", (unsigned int)meta_arg);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", (size_t)offset);
    break;

    case META_LOOKBEHIND_NA:
    fprintf(stderr, "META (*naplb: %u offset=", (unsigned int)meta_arg);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", (size_t)offset);
    break;

    case META_LOOKBEHINDNOT:
    fprintf(stderr, "META (?<! %u offset=", (unsigned int)meta_arg);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", (size_t)offset);
    break;

    /* Three values follow the META code; the callout number is the third. */

    case META_CALLOUT_NUMBER:
    fprintf(stderr, "META (?C%u) next=%u/%u", (unsigned int)pptr[2],
      (unsigned int)pptr[0], (unsigned int)pptr[1]);
    pptr += 3;
    break;

    case META_CALLOUT_STRING:
      {
      uint32_t patoffset = *pptr++;    /* Offset of next pattern item */
      uint32_t patlength = *pptr++;    /* Length of next pattern item */
      fprintf(stderr, "META (?Cstring) length=%u offset=",
        (unsigned int)*pptr++);
      GETOFFSET(offset, pptr);
      fprintf(stderr, "%zu next=%u/%u", (size_t)offset,
        (unsigned int)patoffset, (unsigned int)patlength);
      }
    break;

    case META_RECURSE_BYNAME:
    fprintf(stderr, "META (?(&name) length=%u offset=", (unsigned int)*pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", (size_t)offset);
    break;

    case META_BACKREF_BYNAME:
    fprintf(stderr, "META_BACKREF_BYNAME length=%u offset=",
      (unsigned int)*pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", (size_t)offset);
    break;

    /* The number is stored after the offset, so it is picked up by index
    before the offset is consumed, and then stepped over. */

    case META_COND_NUMBER:
    fprintf(stderr, "META_COND_NUMBER %u offset=",
      (unsigned int)pptr[SIZEOFFSET]);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", (size_t)offset);
    pptr++;
    break;

    case META_COND_DEFINE:
    fprintf(stderr, "META (?(DEFINE) offset=");
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", (size_t)offset);
    break;

    /* Separate calls keep the three reads of the vector in a defined order. */

    case META_COND_VERSION:
    fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
    fprintf(stderr, "%u.", (unsigned int)*pptr++);
    fprintf(stderr, "%u)", (unsigned int)*pptr++);
    break;

    case META_COND_NAME:
    fprintf(stderr, "META (?(<name>) length=%u offset=",
      (unsigned int)*pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", (size_t)offset);
    break;

    case META_COND_RNAME:
    fprintf(stderr, "META (?(R&name) length=%u offset=",
      (unsigned int)*pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", (size_t)offset);
    break;

    /* This is kept as a name, because it might be one. */

    case META_COND_RNUMBER:
    fprintf(stderr, "META (?(Rnumber) length=%u offset=",
      (unsigned int)*pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", (size_t)offset);
    break;

    /* Verbs with an argument share the code that prints the argument. */

    case META_MARK:
    fprintf(stderr, "META (*MARK:");
    goto SHOWARG;

    case META_COMMIT_ARG:
    fprintf(stderr, "META (*COMMIT:");
    goto SHOWARG;

    case META_PRUNE_ARG:
    fprintf(stderr, "META (*PRUNE:");
    goto SHOWARG;

    case META_SKIP_ARG:
    fprintf(stderr, "META (*SKIP:");
    goto SHOWARG;

    case META_THEN_ARG:
    fprintf(stderr, "META (*THEN:");
    SHOWARG:
    length = *pptr++;
    for (i = 0; i < length; i++)
      {
      uint32_t cc = *pptr++;
      if (cc > 32 && cc < 128) fprintf(stderr, "%c", (int)cc);
        else fprintf(stderr, "\\x{%x}", (unsigned int)cc);
      }
    fprintf(stderr, ") length=%u", (unsigned int)length);
    break;
    }

  fprintf(stderr, "\n");
  }
}
1177 #endif /* DEBUG_SHOW_PARSED */
1178
1179
1180
1181 /*************************************************
1182 * Copy compiled code *
1183 *************************************************/
1184
1185 /* Compiled JIT code cannot be copied, so the new compiled block has no
1186 associated JIT data. */
1187
1188 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy(const pcre2_code * code)1189 pcre2_code_copy(const pcre2_code *code)
1190 {
1191 PCRE2_SIZE* ref_count;
1192 pcre2_code *newcode;
1193
1194 if (code == NULL) return NULL;
1195 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1196 if (newcode == NULL) return NULL;
1197 memcpy(newcode, code, code->blocksize);
1198 newcode->executable_jit = NULL;
1199
1200 /* If the code is one that has been deserialized, increment the reference count
1201 in the decoded tables. */
1202
1203 if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1204 {
1205 ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1206 (*ref_count)++;
1207 }
1208
1209 return newcode;
1210 }
1211
1212
1213
1214 /*************************************************
1215 * Copy compiled code and character tables *
1216 *************************************************/
1217
1218 /* Compiled JIT code cannot be copied, so the new compiled block has no
1219 associated JIT data. This version of code_copy also makes a separate copy of
1220 the character tables. */
1221
1222 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy_with_tables(const pcre2_code * code)1223 pcre2_code_copy_with_tables(const pcre2_code *code)
1224 {
1225 PCRE2_SIZE* ref_count;
1226 pcre2_code *newcode;
1227 uint8_t *newtables;
1228
1229 if (code == NULL) return NULL;
1230 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1231 if (newcode == NULL) return NULL;
1232 memcpy(newcode, code, code->blocksize);
1233 newcode->executable_jit = NULL;
1234
1235 newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),
1236 code->memctl.memory_data);
1237 if (newtables == NULL)
1238 {
1239 code->memctl.free((void *)newcode, code->memctl.memory_data);
1240 return NULL;
1241 }
1242 memcpy(newtables, code->tables, TABLES_LENGTH);
1243 ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH);
1244 *ref_count = 1;
1245
1246 newcode->tables = newtables;
1247 newcode->flags |= PCRE2_DEREF_TABLES;
1248 return newcode;
1249 }
1250
1251
1252
1253 /*************************************************
1254 * Free compiled code *
1255 *************************************************/
1256
1257 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_code_free(pcre2_code * code)1258 pcre2_code_free(pcre2_code *code)
1259 {
1260 PCRE2_SIZE* ref_count;
1261
1262 if (code != NULL)
1263 {
1264 if (code->executable_jit != NULL)
1265 PRIV(jit_free)(code->executable_jit, &code->memctl);
1266
1267 if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1268 {
1269 /* Decoded tables belong to the codes after deserialization, and they must
1270 be freed when there are no more references to them. The *ref_count should
1271 always be > 0. */
1272
1273 ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1274 if (*ref_count > 0)
1275 {
1276 (*ref_count)--;
1277 if (*ref_count == 0)
1278 code->memctl.free((void *)code->tables, code->memctl.memory_data);
1279 }
1280 }
1281
1282 code->memctl.free(code, code->memctl.memory_data);
1283 }
1284 }
1285
1286
1287
1288 /*************************************************
1289 * Read a number, possibly signed *
1290 *************************************************/
1291
1292 /* This function is used to read numbers in the pattern. The initial pointer
1293 must be the sign or first digit of the number. When relative values (introduced
1294 by + or -) are allowed, they are relative group numbers, and the result must be
1295 greater than zero.
1296
1297 Arguments:
1298 ptrptr points to the character pointer variable
1299 ptrend points to the end of the input string
1300 allow_sign if < 0, sign not allowed; if >= 0, sign is relative to this
1301 max_value the largest number allowed
1302 max_error the error to give for an over-large number
1303 intptr where to put the result
  errorcodeptr  where to put an error code
1305
1306 Returns: TRUE - a number was read
1307 FALSE - errorcode == 0 => no number was found
1308 errorcode != 0 => an error occurred
1309 */
1310
1311 static BOOL
read_number(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,int32_t allow_sign,uint32_t max_value,uint32_t max_error,int * intptr,int * errorcodeptr)1312 read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1313 uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
1314 {
1315 int sign = 0;
1316 uint32_t n = 0;
1317 PCRE2_SPTR ptr = *ptrptr;
1318 BOOL yield = FALSE;
1319
1320 *errorcodeptr = 0;
1321
1322 if (allow_sign >= 0 && ptr < ptrend)
1323 {
1324 if (*ptr == CHAR_PLUS)
1325 {
1326 sign = +1;
1327 max_value -= allow_sign;
1328 ptr++;
1329 }
1330 else if (*ptr == CHAR_MINUS)
1331 {
1332 sign = -1;
1333 ptr++;
1334 }
1335 }
1336
1337 if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
1338 while (ptr < ptrend && IS_DIGIT(*ptr))
1339 {
1340 n = n * 10 + *ptr++ - CHAR_0;
1341 if (n > max_value)
1342 {
1343 *errorcodeptr = max_error;
1344 goto EXIT;
1345 }
1346 }
1347
1348 if (allow_sign >= 0 && sign != 0)
1349 {
1350 if (n == 0)
1351 {
1352 *errorcodeptr = ERR26; /* +0 and -0 are not allowed */
1353 goto EXIT;
1354 }
1355
1356 if (sign > 0) n += allow_sign;
1357 else if ((int)n > allow_sign)
1358 {
1359 *errorcodeptr = ERR15; /* Non-existent subpattern */
1360 goto EXIT;
1361 }
1362 else n = allow_sign + 1 - n;
1363 }
1364
1365 yield = TRUE;
1366
1367 EXIT:
1368 *intptr = n;
1369 *ptrptr = ptr;
1370 return yield;
1371 }
1372
1373
1374
1375 /*************************************************
1376 * Read repeat counts *
1377 *************************************************/
1378
1379 /* Read an item of the form {n,m} and return the values if non-NULL pointers
1380 are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1381 larger value is used for "unlimited". We have to use signed arguments for
1382 read_number() because it is capable of returning a signed value.
1383
1384 Arguments:
  ptrptr         points to pointer to character after '{'
  ptrend         pointer to end of input
  minp           if not NULL, pointer to uint32_t for min
  maxp           if not NULL, pointer to uint32_t for max; this is set to
                   REPEAT_UNLIMITED if no max is given
1390 errorcodeptr points to error code variable
1391
1392 Returns: FALSE if not a repeat quantifier, errorcode set zero
1393 FALSE on error, with errorcode set non-zero
1394 TRUE on success, with pointer updated to point after '}'
1395 */
1396
1397 static BOOL
read_repeat_counts(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,uint32_t * minp,uint32_t * maxp,int * errorcodeptr)1398 read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
1399 uint32_t *maxp, int *errorcodeptr)
1400 {
1401 PCRE2_SPTR p = *ptrptr;
1402 BOOL yield = FALSE;
1403 int32_t min = 0;
1404 int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
1405
1406 /* NB read_number() initializes the error code to zero. The only error is for a
1407 number that is too big. */
1408
1409 if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1410 goto EXIT;
1411
1412 if (p >= ptrend) goto EXIT;
1413
1414 if (*p == CHAR_RIGHT_CURLY_BRACKET)
1415 {
1416 p++;
1417 max = min;
1418 }
1419
1420 else
1421 {
1422 if (*p++ != CHAR_COMMA || p >= ptrend) goto EXIT;
1423 if (*p != CHAR_RIGHT_CURLY_BRACKET)
1424 {
1425 if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max,
1426 errorcodeptr) || p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1427 goto EXIT;
1428 if (max < min)
1429 {
1430 *errorcodeptr = ERR4;
1431 goto EXIT;
1432 }
1433 }
1434 p++;
1435 }
1436
1437 yield = TRUE;
1438 if (minp != NULL) *minp = (uint32_t)min;
1439 if (maxp != NULL) *maxp = (uint32_t)max;
1440
1441 /* Update the pattern pointer on success, or after an error, but not when
1442 the result is "not a repeat quantifier". */
1443
1444 EXIT:
1445 if (yield || *errorcodeptr != 0) *ptrptr = p;
1446 return yield;
1447 }
1448
1449
1450
1451 /*************************************************
1452 * Handle escapes *
1453 *************************************************/
1454
1455 /* This function is called when a \ has been encountered. It either returns a
1456 positive value for a simple escape such as \d, or 0 for a data character, which
1457 is placed in chptr. A backreference to group n is returned as negative n. On
1458 entry, ptr is pointing at the character after \. On exit, it points after the
1459 final code unit of the escape sequence.
1460
1461 This function is also called from pcre2_substitute() to handle escape sequences
1462 in replacement strings. In this case, the cb argument is NULL, and in the case
1463 of escapes that have further processing, only sequences that define a data
1464 character are recognised. The isclass argument is not relevant; the options
1465 argument is the final value of the compiled pattern's options.
1466
1467 Arguments:
1468 ptrptr points to the input position pointer
1469 ptrend points to the end of the input
1470 chptr points to a returned data character
1471 errorcodeptr points to the errorcode variable (containing zero)
  options        the current options bits
  extra_options  the current extra options bits
1473 isclass TRUE if inside a character class
1474 cb compile data block or NULL when called from pcre2_substitute()
1475
1476 Returns: zero => a data character
1477 positive => a special escape sequence
1478 negative => a numerical back reference
1479 on error, errorcodeptr is set non-zero
1480 */
1481
1482 int
PRIV(check_escape)1483 PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1484 int *errorcodeptr, uint32_t options, uint32_t extra_options, BOOL isclass,
1485 compile_block *cb)
1486 {
1487 BOOL utf = (options & PCRE2_UTF) != 0;
1488 PCRE2_SPTR ptr = *ptrptr;
1489 uint32_t c, cc;
1490 int escape = 0;
1491 int i;
1492
1493 /* If backslash is at the end of the string, it's an error. */
1494
1495 if (ptr >= ptrend)
1496 {
1497 *errorcodeptr = ERR1;
1498 return 0;
1499 }
1500
1501 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
1502 *errorcodeptr = 0; /* Be optimistic */
1503
1504 /* Non-alphanumerics are literals, so we just leave the value in c. An initial
1505 value test saves a memory lookup for code points outside the alphanumeric
1506 range. */
1507
1508 if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {} /* Definitely literal */
1509
1510 /* Otherwise, do a table lookup. Non-zero values need little processing here. A
1511 positive value is a literal value for something like \n. A negative value is
1512 the negation of one of the ESC_ macros that is passed back for handling by the
1513 calling function. Some extra checking is needed for \N because only \N{U+dddd}
1514 is supported. If the value is zero, further processing is handled below. */
1515
1516 else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1517 {
1518 if (i > 0)
1519 {
1520 c = (uint32_t)i;
1521 if (c == CHAR_CR && (extra_options & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
1522 c = CHAR_LF;
1523 }
1524 else /* Negative table entry */
1525 {
1526 escape = -i; /* Else return a special escape */
1527 if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
1528 cb->external_flags |= PCRE2_HASBKPORX; /* Note \P, \p, or \X */
1529
1530 /* Perl supports \N{name} for character names and \N{U+dddd} for numerical
1531 Unicode code points, as well as plain \N for "not newline". PCRE does not
1532 support \N{name}. However, it does support quantification such as \N{2,3},
1533 so if \N{ is not followed by U+dddd we check for a quantifier. */
1534
1535 if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1536 {
1537 PCRE2_SPTR p = ptr + 1;
1538
1539 /* \N{U+ can be handled by the \x{ code. However, this construction is
1540 not valid in EBCDIC environments because it specifies a Unicode
1541 character, not a codepoint in the local code. For example \N{U+0041}
1542 must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1543 casing semantics for the entire pattern, so allow it only in UTF (i.e.
1544 Unicode) mode. */
1545
1546 if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
1547 {
1548 #ifdef EBCDIC
1549 *errorcodeptr = ERR93;
1550 #else
1551 if (utf)
1552 {
1553 ptr = p + 1;
1554 escape = 0; /* Not a fancy escape after all */
1555 goto COME_FROM_NU;
1556 }
1557 else *errorcodeptr = ERR93;
1558 #endif
1559 }
1560
1561 /* Give an error if what follows is not a quantifier, but don't override
1562 an error set by the quantifier reader (e.g. number overflow). */
1563
1564 else
1565 {
1566 if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1567 *errorcodeptr == 0)
1568 *errorcodeptr = ERR37;
1569 }
1570 }
1571 }
1572 }
1573
1574 /* Escapes that need further processing, including those that are unknown, have
1575 a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
1576 \o, and \x are recognized (\u and \U can never appear as they are used for case
1577 forcing). */
1578
1579 else
1580 {
1581 int s;
1582 PCRE2_SPTR oldptr;
1583 BOOL overflow;
1584 BOOL alt_bsux =
1585 ((options & PCRE2_ALT_BSUX) | (extra_options & PCRE2_EXTRA_ALT_BSUX)) != 0;
1586
1587 /* Filter calls from pcre2_substitute(). */
1588
1589 if (cb == NULL)
1590 {
1591 if (c != CHAR_c && c != CHAR_o && c != CHAR_x)
1592 {
1593 *errorcodeptr = ERR3;
1594 return 0;
1595 }
1596 alt_bsux = FALSE; /* Do not modify \x handling */
1597 }
1598
1599 switch (c)
1600 {
1601 /* A number of Perl escapes are not handled by PCRE. We give an explicit
1602 error. */
1603
1604 case CHAR_F:
1605 case CHAR_l:
1606 case CHAR_L:
1607 *errorcodeptr = ERR37;
1608 break;
1609
1610 /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
1611 is set. Otherwise, \u must be followed by exactly four hex digits or, if
1612 PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
1613 Otherwise it is a lowercase u letter. This gives some compatibility with
1614 ECMAScript (aka JavaScript). */
1615
1616 case CHAR_u:
1617 if (!alt_bsux) *errorcodeptr = ERR37; else
1618 {
1619 uint32_t xc;
1620
1621 if (ptr >= ptrend) break;
1622 if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
1623 (extra_options & PCRE2_EXTRA_ALT_BSUX) != 0)
1624 {
1625 PCRE2_SPTR hptr = ptr + 1;
1626 cc = 0;
1627
1628 while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
1629 {
1630 if ((cc & 0xf0000000) != 0) /* Test for 32-bit overflow */
1631 {
1632 *errorcodeptr = ERR77;
1633 ptr = hptr; /* Show where */
1634 break; /* *hptr != } will cause another break below */
1635 }
1636 cc = (cc << 4) | xc;
1637 hptr++;
1638 }
1639
1640 if (hptr == ptr + 1 || /* No hex digits */
1641 hptr >= ptrend || /* Hit end of input */
1642 *hptr != CHAR_RIGHT_CURLY_BRACKET) /* No } terminator */
1643 break; /* Hex escape not recognized */
1644
1645 c = cc; /* Accept the code point */
1646 ptr = hptr + 1;
1647 }
1648
1649 else /* Must be exactly 4 hex digits */
1650 {
1651 if (ptrend - ptr < 4) break; /* Less than 4 chars */
1652 if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
1653 if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
1654 cc = (cc << 4) | xc;
1655 if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */
1656 cc = (cc << 4) | xc;
1657 if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */
1658 c = (cc << 4) | xc;
1659 ptr += 4;
1660 }
1661
1662 if (utf)
1663 {
1664 if (c > 0x10ffffU) *errorcodeptr = ERR77;
1665 else
1666 if (c >= 0xd800 && c <= 0xdfff &&
1667 (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1668 *errorcodeptr = ERR73;
1669 }
1670 else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1671 }
1672 break;
1673
1674 /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
1675 in which case it is an upper case letter. */
1676
1677 case CHAR_U:
1678 if (!alt_bsux) *errorcodeptr = ERR37;
1679 break;
1680
1681 /* In a character class, \g is just a literal "g". Outside a character
1682 class, \g must be followed by one of a number of specific things:
1683
1684 (1) A number, either plain or braced. If positive, it is an absolute
1685 backreference. If negative, it is a relative backreference. This is a Perl
1686 5.10 feature.
1687
1688 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1689 is part of Perl's movement towards a unified syntax for back references. As
1690 this is synonymous with \k{name}, we fudge it up by pretending it really
1691 was \k{name}.
1692
1693 (3) For Oniguruma compatibility we also support \g followed by a name or a
1694 number either in angle brackets or in single quotes. However, these are
1695 (possibly recursive) subroutine calls, _not_ backreferences. We return
1696 the ESC_g code.
1697
1698 Summary: Return a negative number for a numerical back reference, ESC_k for
1699 a named back reference, and ESC_g for a named or numbered subroutine call.
1700 */
1701
1702 case CHAR_g:
1703 if (isclass) break;
1704
1705 if (ptr >= ptrend)
1706 {
1707 *errorcodeptr = ERR57;
1708 break;
1709 }
1710
1711 if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
1712 {
1713 escape = ESC_g;
1714 break;
1715 }
1716
1717 /* If there is a brace delimiter, try to read a numerical reference. If
1718 there isn't one, assume we have a name and treat it as \k. */
1719
1720 if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1721 {
1722 PCRE2_SPTR p = ptr + 1;
1723 if (!read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1724 errorcodeptr))
1725 {
1726 if (*errorcodeptr == 0) escape = ESC_k; /* No number found */
1727 break;
1728 }
1729 if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1730 {
1731 *errorcodeptr = ERR57;
1732 break;
1733 }
1734 ptr = p + 1;
1735 }
1736
1737 /* Read an undelimited number */
1738
1739 else
1740 {
1741 if (!read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1742 errorcodeptr))
1743 {
1744 if (*errorcodeptr == 0) *errorcodeptr = ERR57; /* No number found */
1745 break;
1746 }
1747 }
1748
1749 if (s <= 0)
1750 {
1751 *errorcodeptr = ERR15;
1752 break;
1753 }
1754
1755 escape = -s;
1756 break;
1757
1758 /* The handling of escape sequences consisting of a string of digits
1759 starting with one that is not zero is not straightforward. Perl has changed
1760 over the years. Nowadays \g{} for backreferences and \o{} for octal are
1761 recommended to avoid the ambiguities in the old syntax.
1762
1763 Outside a character class, the digits are read as a decimal number. If the
1764 number is less than 10, or if there are that many previous extracting left
1765 brackets, it is a back reference. Otherwise, up to three octal digits are
1766 read to form an escaped character code. Thus \123 is likely to be octal 123
1767 (cf \0123, which is octal 012 followed by the literal 3).
1768
1769 Inside a character class, \ followed by a digit is always either a literal
1770 8 or 9 or an octal number. */
1771
1772 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1773 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1774
1775 if (!isclass)
1776 {
1777 oldptr = ptr;
1778 ptr--; /* Back to the digit */
1779 if (!read_number(&ptr, ptrend, -1, INT_MAX/10 - 1, ERR61, &s,
1780 errorcodeptr))
1781 break;
1782
1783 /* \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1784 are octal escapes if there are not that many previous captures. */
1785
1786 if (s < 10 || oldptr[-1] >= CHAR_8 || s <= (int)cb->bracount)
1787 {
1788 if (s > (int)MAX_GROUP_NUMBER) *errorcodeptr = ERR61;
1789 else escape = -s; /* Indicates a back reference */
1790 break;
1791 }
1792 ptr = oldptr; /* Put the pointer back and fall through */
1793 }
1794
1795 /* Handle a digit following \ when the number is not a back reference, or
1796 we are within a character class. If the first digit is 8 or 9, Perl used to
1797 generate a binary zero and then treat the digit as a following literal. At
1798 least by Perl 5.18 this changed so as not to insert the binary zero. */
1799
1800 if (c >= CHAR_8) break;
1801
1802 /* Fall through */
1803
1804 /* \0 always starts an octal number, but we may drop through to here with a
1805 larger first octal digit. The original code used just to take the least
1806 significant 8 bits of octal numbers (I think this is what early Perls used
1807 to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1808 but no more than 3 octal digits. */
1809
1810 case CHAR_0:
1811 c -= CHAR_0;
1812 while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1813 c = c * 8 + *ptr++ - CHAR_0;
1814 #if PCRE2_CODE_UNIT_WIDTH == 8
1815 if (!utf && c > 0xff) *errorcodeptr = ERR51;
1816 #endif
1817 break;
1818
1819 /* \o is a relatively new Perl feature, supporting a more general way of
1820 specifying character codes in octal. The only supported form is \o{ddd}. */
1821
1822 case CHAR_o:
1823 if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET)
1824 {
1825 ptr--;
1826 *errorcodeptr = ERR55;
1827 }
1828 else if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1829 *errorcodeptr = ERR78;
1830 else
1831 {
1832 c = 0;
1833 overflow = FALSE;
1834 while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1835 {
1836 cc = *ptr++;
1837 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1838 #if PCRE2_CODE_UNIT_WIDTH == 32
1839 if (c >= 0x20000000l) { overflow = TRUE; break; }
1840 #endif
1841 c = (c << 3) + (cc - CHAR_0);
1842 #if PCRE2_CODE_UNIT_WIDTH == 8
1843 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1844 #elif PCRE2_CODE_UNIT_WIDTH == 16
1845 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1846 #elif PCRE2_CODE_UNIT_WIDTH == 32
1847 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1848 #endif
1849 }
1850 if (overflow)
1851 {
1852 while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1853 *errorcodeptr = ERR34;
1854 }
1855 else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1856 {
1857 if (utf && c >= 0xd800 && c <= 0xdfff &&
1858 (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1859 {
1860 ptr--;
1861 *errorcodeptr = ERR73;
1862 }
1863 }
1864 else
1865 {
1866 ptr--;
1867 *errorcodeptr = ERR64;
1868 }
1869 }
1870 break;
1871
1872 /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
1873 by two hexadecimal digits. Otherwise it is a lowercase x letter. */
1874
1875 case CHAR_x:
1876 if (alt_bsux)
1877 {
1878 uint32_t xc;
1879 if (ptrend - ptr < 2) break; /* Less than 2 characters */
1880 if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
1881 if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
1882 c = (cc << 4) | xc;
1883 ptr += 2;
1884 }
1885
1886 /* Handle \x in Perl's style. \x{ddd} is a character code which can be
1887 greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
1888 digits. If not, { used to be treated as a data character. However, Perl
1889 seems to read hex digits up to the first non-such, and ignore the rest, so
1890 that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1891 now gives an error. */
1892
1893 else
1894 {
1895 if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1896 {
1897 #ifndef EBCDIC
1898 COME_FROM_NU:
1899 #endif
1900 if (++ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1901 {
1902 *errorcodeptr = ERR78;
1903 break;
1904 }
1905 c = 0;
1906 overflow = FALSE;
1907
1908 while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
1909 {
1910 ptr++;
1911 if (c == 0 && cc == 0) continue; /* Leading zeroes */
1912 #if PCRE2_CODE_UNIT_WIDTH == 32
1913 if (c >= 0x10000000l) { overflow = TRUE; break; }
1914 #endif
1915 c = (c << 4) | cc;
1916 if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
1917 {
1918 overflow = TRUE;
1919 break;
1920 }
1921 }
1922
1923 if (overflow)
1924 {
1925 while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
1926 *errorcodeptr = ERR34;
1927 }
1928 else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1929 {
1930 if (utf && c >= 0xd800 && c <= 0xdfff &&
1931 (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1932 {
1933 ptr--;
1934 *errorcodeptr = ERR73;
1935 }
1936 }
1937
1938 /* If the sequence of hex digits does not end with '}', give an error.
1939 We used just to recognize this construct and fall through to the normal
1940 \x handling, but nowadays Perl gives an error, which seems much more
1941 sensible, so we do too. */
1942
1943 else
1944 {
1945 ptr--;
1946 *errorcodeptr = ERR67;
1947 }
1948 } /* End of \x{} processing */
1949
1950 /* Read up to two hex digits after \x */
1951
1952 else
1953 {
1954 c = 0;
1955 if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */
1956 ptr++;
1957 c = cc;
1958 if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */
1959 ptr++;
1960 c = (c << 4) | cc;
1961 } /* End of \xdd handling */
1962 } /* End of Perl-style \x handling */
1963 break;
1964
1965 /* The handling of \c is different in ASCII and EBCDIC environments. In an
1966 ASCII (or Unicode) environment, an error is given if the character
1967 following \c is not a printable ASCII character. Otherwise, the following
1968 character is upper-cased if it is a letter, and after that the 0x40 bit is
1969 flipped. The result is the value of the escape.
1970
1971 In an EBCDIC environment the handling of \c is compatible with the
1972 specification in the perlebcdic document. The following character must be
1973 a letter or one of small number of special characters. These provide a
1974 means of defining the character values 0-31.
1975
1976 For testing the EBCDIC handling of \c in an ASCII environment, recognize
1977 the EBCDIC value of 'c' explicitly. */
1978
1979 #if defined EBCDIC && 'a' != 0x81
1980 case 0x83:
1981 #else
1982 case CHAR_c:
1983 #endif
1984 if (ptr >= ptrend)
1985 {
1986 *errorcodeptr = ERR2;
1987 break;
1988 }
1989 c = *ptr;
1990 if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
1991
1992 /* Handle \c in an ASCII/Unicode environment. */
1993
1994 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1995 if (c < 32 || c > 126) /* Excludes all non-printable ASCII */
1996 {
1997 *errorcodeptr = ERR68;
1998 break;
1999 }
2000 c ^= 0x40;
2001
2002 /* Handle \c in an EBCDIC environment. The special case \c? is converted to
2003 255 (0xff) or 95 (0x5f) if other characters suggest we are using the
2004 POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)
2005 The other valid sequences correspond to a list of specific characters. */
2006
2007 #else
2008 if (c == CHAR_QUESTION_MARK)
2009 c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
2010 else
2011 {
2012 for (i = 0; i < 32; i++)
2013 {
2014 if (c == ebcdic_escape_c[i]) break;
2015 }
2016 if (i < 32) c = i; else *errorcodeptr = ERR68;
2017 }
2018 #endif /* EBCDIC */
2019
2020 ptr++;
2021 break;
2022
2023 /* Any other alphanumeric following \ is an error. Perl gives an error only
2024 if in warning mode, but PCRE doesn't have a warning mode. */
2025
2026 default:
2027 *errorcodeptr = ERR3;
2028 *ptrptr = ptr - 1; /* Point to the character at fault */
2029 return 0;
2030 }
2031 }
2032
2033 /* Set the pointer to the next character before returning. */
2034
2035 *ptrptr = ptr;
2036 *chptr = c;
2037 return escape;
2038 }
2039
2040
2041
2042 #ifdef SUPPORT_UNICODE
2043 /*************************************************
2044 * Handle \P and \p *
2045 *************************************************/
2046
2047 /* This function is called after \P or \p has been encountered, provided that
2048 PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
2049 contents of ptrptr are pointing after the P or p. On exit, it is left pointing
2050 after the final code unit of the escape sequence.
2051
2052 Arguments:
2053 ptrptr the pattern position pointer
2054 negptr a boolean that is set TRUE for negation else FALSE
2055 ptypeptr an unsigned int that is set to the type value
2056 pdataptr an unsigned int that is set to the detailed property value
2057 errorcodeptr the error code variable
2058 cb the compile data
2059
2060 Returns: TRUE if the type value was found, or FALSE for an invalid type
2061 */
2062
2063 static BOOL
get_ucp(PCRE2_SPTR * ptrptr,BOOL * negptr,uint16_t * ptypeptr,uint16_t * pdataptr,int * errorcodeptr,compile_block * cb)2064 get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
2065 uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
2066 {
2067 PCRE2_UCHAR c;
2068 PCRE2_SIZE i, bot, top;
2069 PCRE2_SPTR ptr = *ptrptr;
2070 PCRE2_UCHAR name[32];
2071
2072 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2073 c = *ptr++;
2074 *negptr = FALSE;
2075
2076 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
2077 negation. */
2078
2079 if (c == CHAR_LEFT_CURLY_BRACKET)
2080 {
2081 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2082 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
2083 {
2084 *negptr = TRUE;
2085 ptr++;
2086 }
2087 for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
2088 {
2089 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2090 c = *ptr++;
2091 if (c == CHAR_NUL) goto ERROR_RETURN;
2092 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
2093 name[i] = c;
2094 }
2095 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
2096 name[i] = 0;
2097 }
2098
2099 /* Otherwise there is just one following character, which must be an ASCII
2100 letter. */
2101
2102 else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
2103 {
2104 name[0] = c;
2105 name[1] = 0;
2106 }
2107 else goto ERROR_RETURN;
2108
2109 *ptrptr = ptr;
2110
2111 /* Search for a recognized property name using binary chop. */
2112
2113 bot = 0;
2114 top = PRIV(utt_size);
2115
2116 while (bot < top)
2117 {
2118 int r;
2119 i = (bot + top) >> 1;
2120 r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
2121 if (r == 0)
2122 {
2123 *ptypeptr = PRIV(utt)[i].type;
2124 *pdataptr = PRIV(utt)[i].value;
2125 return TRUE;
2126 }
2127 if (r > 0) bot = i + 1; else top = i;
2128 }
2129 *errorcodeptr = ERR47; /* Unrecognized name */
2130 return FALSE;
2131
2132 ERROR_RETURN: /* Malformed \P or \p */
2133 *errorcodeptr = ERR46;
2134 *ptrptr = ptr;
2135 return FALSE;
2136 }
2137 #endif
2138
2139
2140
2141 /*************************************************
2142 * Check for POSIX class syntax *
2143 *************************************************/
2144
2145 /* This function is called when the sequence "[:" or "[." or "[=" is
2146 encountered in a character class. It checks whether this is followed by a
2147 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2148 reach an unescaped ']' without the special preceding character, return FALSE.
2149
2150 Originally, this function only recognized a sequence of letters between the
2151 terminators, but it seems that Perl recognizes any sequence of characters,
2152 though of course unknown POSIX names are subsequently rejected. Perl gives an
2153 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2154 didn't consider this to be a POSIX class. Likewise for [:1234:].
2155
2156 The problem in trying to be exactly like Perl is in the handling of escapes. We
2157 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2158 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2159 below handles the special cases \\ and \], but does not try to do any other
2160 escape processing. This makes it different from Perl for cases such as
2161 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2162 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2163 when Perl does, I think.
2164
2165 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2166 It seems that the appearance of a nested POSIX class supersedes an apparent
2167 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2168 a digit. This is handled by returning FALSE if the start of a new group with
2169 the same terminator is encountered, since the next closing sequence must close
2170 the nested group, not the outer one.
2171
2172 In Perl, unescaped square brackets may also appear as part of class names. For
2173 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2174 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2175 seem right at all. PCRE does not allow closing square brackets in POSIX class
2176 names.
2177
2178 Arguments:
2179 ptr pointer to the character after the initial [ (colon, dot, equals)
2180 ptrend pointer to the end of the pattern
2181 endptr where to return a pointer to the terminating ':', '.', or '='
2182
2183 Returns: TRUE or FALSE
2184 */
2185
2186 static BOOL
check_posix_syntax(PCRE2_SPTR ptr,PCRE2_SPTR ptrend,PCRE2_SPTR * endptr)2187 check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
2188 {
2189 PCRE2_UCHAR terminator; /* Don't combine these lines; the Solaris cc */
2190 terminator = *ptr++; /* compiler warns about "non-constant" initializer. */
2191
2192 for (; ptrend - ptr >= 2; ptr++)
2193 {
2194 if (*ptr == CHAR_BACKSLASH &&
2195 (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2196 ptr++;
2197
2198 else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2199 *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2200
2201 else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2202 {
2203 *endptr = ptr;
2204 return TRUE;
2205 }
2206 }
2207
2208 return FALSE;
2209 }
2210
2211
2212
2213 /*************************************************
2214 * Check POSIX class name *
2215 *************************************************/
2216
2217 /* This function is called to check the name given in a POSIX-style class entry
2218 such as [:alnum:].
2219
2220 Arguments:
2221 ptr points to the first letter
2222 len the length of the name
2223
2224 Returns: a value representing the name, or -1 if unknown
2225 */
2226
2227 static int
check_posix_name(PCRE2_SPTR ptr,int len)2228 check_posix_name(PCRE2_SPTR ptr, int len)
2229 {
2230 const char *pn = posix_names;
2231 int yield = 0;
2232 while (posix_name_lengths[yield] != 0)
2233 {
2234 if (len == posix_name_lengths[yield] &&
2235 PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2236 pn += posix_name_lengths[yield] + 1;
2237 yield++;
2238 }
2239 return -1;
2240 }
2241
2242
2243
2244 /*************************************************
2245 * Read a subpattern or VERB name *
2246 *************************************************/
2247
2248 /* This function is called from parse_regex() below whenever it needs to read
2249 the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial
2250 pointer must be to the character before the name. If that character is '*' we
2251 are reading a verb or alpha assertion name. The pointer is updated to point
2252 after the name, for a VERB or alpha assertion name, or after the name's
2253 terminator for a subpattern name. Returning both the offset and the name
2254 pointer is redundant information, but some callers use one and some the other,
2255 so it is simplest just to return both.
2256
2257 Arguments:
2258 ptrptr points to the character pointer variable
2259 ptrend points to the end of the input string
2260 utf true if the input is UTF-encoded
2261 terminator the terminator of a subpattern name must be this
2262 offsetptr where to put the offset from the start of the pattern
2263 nameptr where to put a pointer to the name in the input
2264 namelenptr where to put the length of the name
2265 errorcodeptr where to put an error code
2266 cb pointer to the compile data block
2267
2268 Returns: TRUE if a name was read
2269 FALSE otherwise, with error code set
2270 */
2271
2272 static BOOL
read_name(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,BOOL utf,uint32_t terminator,PCRE2_SIZE * offsetptr,PCRE2_SPTR * nameptr,uint32_t * namelenptr,int * errorcodeptr,compile_block * cb)2273 read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
2274 PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
2275 int *errorcodeptr, compile_block *cb)
2276 {
2277 PCRE2_SPTR ptr = *ptrptr;
2278 BOOL is_group = (*ptr != CHAR_ASTERISK);
2279
2280 if (++ptr >= ptrend) /* No characters in name */
2281 {
2282 *errorcodeptr = is_group? ERR62: /* Subpattern name expected */
2283 ERR60; /* Verb not recognized or malformed */
2284 goto FAILED;
2285 }
2286
2287 *nameptr = ptr;
2288 *offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
2289
2290 /* In UTF mode, a group name may contain letters and decimal digits as defined
2291 by Unicode properties, and underscores, but must not start with a digit. */
2292
2293 #ifdef SUPPORT_UNICODE
2294 if (utf && is_group)
2295 {
2296 uint32_t c, type;
2297
2298 GETCHAR(c, ptr);
2299 type = UCD_CHARTYPE(c);
2300
2301 if (type == ucp_Nd)
2302 {
2303 *errorcodeptr = ERR44;
2304 goto FAILED;
2305 }
2306
2307 for(;;)
2308 {
2309 if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
2310 c != CHAR_UNDERSCORE) break;
2311 ptr++;
2312 FORWARDCHARTEST(ptr, ptrend);
2313 if (ptr >= ptrend) break;
2314 GETCHAR(c, ptr);
2315 type = UCD_CHARTYPE(c);
2316 }
2317 }
2318 else
2319 #else
2320 (void)utf; /* Avoid compiler warning */
2321 #endif /* SUPPORT_UNICODE */
2322
2323 /* Handle non-group names and group names in non-UTF modes. A group name must
2324 not start with a digit. If either of the others start with a digit it just
2325 won't be recognized. */
2326
2327 {
2328 if (is_group && IS_DIGIT(*ptr))
2329 {
2330 *errorcodeptr = ERR44;
2331 goto FAILED;
2332 }
2333
2334 while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
2335 {
2336 ptr++;
2337 }
2338 }
2339
2340 /* Check name length */
2341
2342 if (ptr > *nameptr + MAX_NAME_SIZE)
2343 {
2344 *errorcodeptr = ERR48;
2345 goto FAILED;
2346 }
2347 *namelenptr = (uint32_t)(ptr - *nameptr);
2348
2349 /* Subpattern names must not be empty, and their terminator is checked here.
2350 (What follows a verb or alpha assertion name is checked separately.) */
2351
2352 if (is_group)
2353 {
2354 if (ptr == *nameptr)
2355 {
2356 *errorcodeptr = ERR62; /* Subpattern name expected */
2357 goto FAILED;
2358 }
2359 if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
2360 {
2361 *errorcodeptr = ERR42;
2362 goto FAILED;
2363 }
2364 ptr++;
2365 }
2366
2367 *ptrptr = ptr;
2368 return TRUE;
2369
2370 FAILED:
2371 *ptrptr = ptr;
2372 return FALSE;
2373 }
2374
2375
2376
2377 /*************************************************
2378 * Manage callouts at start of cycle *
2379 *************************************************/
2380
2381 /* At the start of a new item in parse_regex() we are able to record the
2382 details of the previous item in a prior callout, and also to set up an
2383 automatic callout if enabled. Avoid having two adjacent automatic callouts,
2384 which would otherwise happen for items such as \Q that contribute nothing to
2385 the parsed pattern.
2386
2387 Arguments:
2388 ptr current pattern pointer
2389 pcalloutptr points to a pointer to previous callout, or NULL
2390 auto_callout TRUE if auto_callouts are enabled
2391 parsed_pattern the parsed pattern pointer
2392 cb compile block
2393
2394 Returns: possibly updated parsed_pattern pointer.
2395 */
2396
2397 static uint32_t *
manage_callouts(PCRE2_SPTR ptr,uint32_t ** pcalloutptr,BOOL auto_callout,uint32_t * parsed_pattern,compile_block * cb)2398 manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2399 uint32_t *parsed_pattern, compile_block *cb)
2400 {
2401 uint32_t *previous_callout = *pcalloutptr;
2402
2403 if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
2404 cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
2405
2406 if (!auto_callout) previous_callout = NULL; else
2407 {
2408 if (previous_callout == NULL ||
2409 previous_callout != parsed_pattern - 4 ||
2410 previous_callout[3] != 255)
2411 {
2412 previous_callout = parsed_pattern; /* Set up new automatic callout */
2413 parsed_pattern += 4;
2414 previous_callout[0] = META_CALLOUT_NUMBER;
2415 previous_callout[2] = 0;
2416 previous_callout[3] = 255;
2417 }
2418 previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
2419 }
2420
2421 *pcalloutptr = previous_callout;
2422 return parsed_pattern;
2423 }
2424
2425
2426
2427 /*************************************************
2428 * Parse regex and identify named groups *
2429 *************************************************/
2430
2431 /* This function is called first of all. It scans the pattern and does two
2432 things: (1) It identifies capturing groups and makes a table of named capturing
2433 groups so that information about them is fully available to both the compiling
2434 scans. (2) It writes a parsed version of the pattern with comments omitted and
2435 escapes processed into the parsed_pattern vector.
2436
2437 Arguments:
2438 ptr points to the start of the pattern
2439 options compiling dynamic options (may change during the scan)
2440 has_lookbehind points to a boolean, set TRUE if a lookbehind is found
2441 cb pointer to the compile data block
2442
2443 Returns: zero on success or a non-zero error code, with the
2444 error offset placed in the cb field
2445 */
2446
2447 /* A structure and some flags for dealing with nested groups. */
2448
/* Record saved for a nested group while parsing. NOTE(review): the uses of
the individual fields lie in parse_regex() and are not described here; the
flags field presumably holds the NSF_ bits defined below - confirm. */

typedef struct nest_save nest_save;

struct nest_save {
  uint16_t nest_depth;
  uint16_t reset_group;
  uint16_t max_group;
  uint16_t flags;
  uint32_t options;
};

/* Flag bits for use with nest_save. */

#define NSF_RESET       (1u << 0)
#define NSF_CONDASSERT  (1u << 1)
#define NSF_ATOMICSR    (1u << 2)

/* The set of options that can change within a pattern and so have to be
tracked while parsing. Some of them (e.g. PCRE2_EXTENDED) are implemented
entirely during parsing, but every one must be tracked so that META_OPTIONS
items carry the correct values into the main compiling phase. */

#define PARSE_TRACKED_OPTIONS ( \
  PCRE2_CASELESS | \
  PCRE2_DOTALL | \
  PCRE2_DUPNAMES | \
  PCRE2_EXTENDED | \
  PCRE2_EXTENDED_MORE | \
  PCRE2_MULTILINE | \
  PCRE2_NO_AUTO_CAPTURE | \
  PCRE2_UNGREEDY)

/* States for analyzing ranges in character classes. The two OK values must
remain the last (highest) ones. */

enum {
  RANGE_NO = 0,
  RANGE_STARTED = 1,
  RANGE_OK_ESCAPED = 2,
  RANGE_OK_LITERAL = 3
};

/* Store a literal value in the main parsed pattern, where it can always be
quantified. Only in 32-bit mode can a literal be >= META_END, in which case it
is preceded by META_BIGVALUE. Both forms assign to a variable called
okquantifier, which must be in scope at the point of use. The non-32-bit form
expands to two separate statements, so it must not be used as the unbraced
body of an if, else or loop. */

#if PCRE2_CODE_UNIT_WIDTH == 32
#define PARSED_LITERAL(c, p) \
  { \
  if (c >= META_END) *p++ = META_BIGVALUE; \
  *p++ = c; \
  okquantifier = TRUE; \
  }
#else
#define PARSED_LITERAL(c, p) *p++ = c; okquantifier = TRUE;
#endif
2489
2490 /* Here's the actual function. */
2491
parse_regex(PCRE2_SPTR ptr,uint32_t options,BOOL * has_lookbehind,compile_block * cb)2492 static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind,
2493 compile_block *cb)
2494 {
2495 uint32_t c;
2496 uint32_t delimiter;
2497 uint32_t namelen;
2498 uint32_t class_range_state;
2499 uint32_t *verblengthptr = NULL; /* Value avoids compiler warning */
2500 uint32_t *verbstartptr = NULL;
2501 uint32_t *previous_callout = NULL;
2502 uint32_t *parsed_pattern = cb->parsed_pattern;
2503 uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
2504 uint32_t meta_quantifier = 0;
2505 uint32_t add_after_mark = 0;
2506 uint32_t extra_options = cb->cx->extra_options;
2507 uint16_t nest_depth = 0;
2508 int after_manual_callout = 0;
2509 int expect_cond_assert = 0;
2510 int errorcode = 0;
2511 int escape;
2512 int i;
2513 BOOL inescq = FALSE;
2514 BOOL inverbname = FALSE;
2515 BOOL utf = (options & PCRE2_UTF) != 0;
2516 BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
2517 BOOL isdupname;
2518 BOOL negate_class;
2519 BOOL okquantifier = FALSE;
2520 PCRE2_SPTR thisptr;
2521 PCRE2_SPTR name;
2522 PCRE2_SPTR ptrend = cb->end_pattern;
2523 PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */
2524 named_group *ng;
2525 nest_save *top_nest, *end_nests;
2526
2527 /* Insert leading items for word and line matching (features provided for the
2528 benefit of pcre2grep). */
2529
2530 if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
2531 {
2532 *parsed_pattern++ = META_CIRCUMFLEX;
2533 *parsed_pattern++ = META_NOCAPTURE;
2534 }
2535 else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
2536 {
2537 *parsed_pattern++ = META_ESCAPE + ESC_b;
2538 *parsed_pattern++ = META_NOCAPTURE;
2539 }
2540
2541 /* If the pattern is actually a literal string, process it separately to avoid
2542 cluttering up the main loop. */
2543
2544 if ((options & PCRE2_LITERAL) != 0)
2545 {
2546 while (ptr < ptrend)
2547 {
2548 if (parsed_pattern >= parsed_pattern_end)
2549 {
2550 errorcode = ERR63; /* Internal error (parsed pattern overflow) */
2551 goto FAILED;
2552 }
2553 thisptr = ptr;
2554 GETCHARINCTEST(c, ptr);
2555 if (auto_callout)
2556 parsed_pattern = manage_callouts(thisptr, &previous_callout,
2557 auto_callout, parsed_pattern, cb);
2558 PARSED_LITERAL(c, parsed_pattern);
2559 }
2560 goto PARSED_END;
2561 }
2562
2563 /* Process a real regex which may contain meta-characters. */
2564
2565 top_nest = NULL;
2566 end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
2567
2568 /* The size of the nest_save structure might not be a factor of the size of the
2569 workspace. Therefore we must round down end_nests so as to correctly avoid
2570 creating a nest_save that spans the end of the workspace. */
2571
2572 end_nests = (nest_save *)((char *)end_nests -
2573 ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
2574
2575 /* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
2576
2577 if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
2578
2579 /* Now scan the pattern */
2580
2581 while (ptr < ptrend)
2582 {
2583 int prev_expect_cond_assert;
2584 uint32_t min_repeat, max_repeat;
2585 uint32_t set, unset, *optset;
2586 uint32_t terminator;
2587 uint32_t prev_meta_quantifier;
2588 BOOL prev_okquantifier;
2589 PCRE2_SPTR tempptr;
2590 PCRE2_SIZE offset;
2591
2592 if (parsed_pattern >= parsed_pattern_end)
2593 {
2594 errorcode = ERR63; /* Internal error (parsed pattern overflow) */
2595 goto FAILED;
2596 }
2597
2598 if (nest_depth > cb->cx->parens_nest_limit)
2599 {
2600 errorcode = ERR19;
2601 goto FAILED; /* Parentheses too deeply nested */
2602 }
2603
2604 /* Get next input character, save its position for callout handling. */
2605
2606 thisptr = ptr;
2607 GETCHARINCTEST(c, ptr);
2608
2609 /* Copy quoted literals until \E, allowing for the possibility of automatic
2610 callouts, except when processing a (*VERB) "name". */
2611
2612 if (inescq)
2613 {
2614 if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
2615 {
2616 inescq = FALSE;
2617 ptr++; /* Skip E */
2618 }
2619 else
2620 {
2621 if (expect_cond_assert > 0) /* A literal is not allowed if we are */
2622 { /* expecting a conditional assertion, */
2623 ptr--; /* but an empty \Q\E sequence is OK. */
2624 errorcode = ERR28;
2625 goto FAILED;
2626 }
2627 if (inverbname)
2628 { /* Don't use PARSED_LITERAL() because it */
2629 #if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
2630 if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2631 #endif
2632 *parsed_pattern++ = c;
2633 }
2634 else
2635 {
2636 if (after_manual_callout-- <= 0)
2637 parsed_pattern = manage_callouts(thisptr, &previous_callout,
2638 auto_callout, parsed_pattern, cb);
2639 PARSED_LITERAL(c, parsed_pattern);
2640 }
2641 meta_quantifier = 0;
2642 }
2643 continue; /* Next character */
2644 }
2645
2646 /* If we are processing the "name" part of a (*VERB:NAME) item, all
2647 characters up to the closing parenthesis are literals except when
2648 PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
2649 and \E and escaped characters are allowed (no character types such as \d). If
2650 PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
2651 this by not entering the special (*VERB:NAME) processing - they are then
2652 picked up below. Note that c is a character, not a code unit, so we must not
2653 use MAX_255 to test its size because MAX_255 tests code units and is assumed
2654 TRUE in 8-bit mode. */
2655
2656 if (inverbname &&
2657 (
2658 /* EITHER: not both options set */
2659 ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
2660 (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
2661 #ifdef SUPPORT_UNICODE
2662 /* OR: character > 255 AND not Unicode Pattern White Space */
2663 (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
2664 #endif
2665 /* OR: not a # comment or isspace() white space */
2666 (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
2667 #ifdef SUPPORT_UNICODE
2668 /* and not CHAR_NEL when Unicode is supported */
2669 && c != CHAR_NEL
2670 #endif
2671 )))
2672 {
2673 PCRE2_SIZE verbnamelength;
2674
2675 switch(c)
2676 {
2677 default: /* Don't use PARSED_LITERAL() because it */
2678 #if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
2679 if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2680 #endif
2681 *parsed_pattern++ = c;
2682 break;
2683
2684 case CHAR_RIGHT_PARENTHESIS:
2685 inverbname = FALSE;
2686 /* This is the length in characters */
2687 verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
2688 /* But the limit on the length is in code units */
2689 if (ptr - verbnamestart - 1 > (int)MAX_MARK)
2690 {
2691 ptr--;
2692 errorcode = ERR76;
2693 goto FAILED;
2694 }
2695 *verblengthptr = (uint32_t)verbnamelength;
2696
2697 /* If this name was on a verb such as (*ACCEPT) which does not continue,
2698 a (*MARK) was generated for the name. We now add the original verb as the
2699 next item. */
2700
2701 if (add_after_mark != 0)
2702 {
2703 *parsed_pattern++ = add_after_mark;
2704 add_after_mark = 0;
2705 }
2706 break;
2707
2708 case CHAR_BACKSLASH:
2709 if ((options & PCRE2_ALT_VERBNAMES) != 0)
2710 {
2711 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
2712 cb->cx->extra_options, FALSE, cb);
2713 if (errorcode != 0) goto FAILED;
2714 }
2715 else escape = 0; /* Treat all as literal */
2716
2717 switch(escape)
2718 {
2719 case 0: /* Don't use PARSED_LITERAL() because it */
2720 #if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
2721 if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2722 #endif
2723 *parsed_pattern++ = c;
2724 break;
2725
2726 case ESC_Q:
2727 inescq = TRUE;
2728 break;
2729
2730 case ESC_E: /* Ignore */
2731 break;
2732
2733 default:
2734 errorcode = ERR40; /* Invalid in verb name */
2735 goto FAILED;
2736 }
2737 }
2738 continue; /* Next character in pattern */
2739 }
2740
2741 /* Not a verb name character. At this point we must process everything that
2742 must not change the quantification state. This is mainly comments, but we
2743 handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
2744 A+, as in Perl. An isolated \E is ignored. */
2745
2746 if (c == CHAR_BACKSLASH && ptr < ptrend)
2747 {
2748 if (*ptr == CHAR_Q || *ptr == CHAR_E)
2749 {
2750 inescq = *ptr == CHAR_Q;
2751 ptr++;
2752 continue;
2753 }
2754 }
2755
2756 /* Skip over whitespace and # comments in extended mode. Note that c is a
2757 character, not a code unit, so we must not use MAX_255 to test its size
2758 because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
2759 whitespace characters are those designated as "Pattern White Space" by
2760 Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
2761 U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
2762 subset of space characters that match \h and \v. */
2763
2764 if ((options & PCRE2_EXTENDED) != 0)
2765 {
2766 if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
2767 #ifdef SUPPORT_UNICODE
2768 if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
2769 #endif
2770 if (c == CHAR_NUMBER_SIGN)
2771 {
2772 while (ptr < ptrend)
2773 {
2774 if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */
2775 { /* IS_NEWLINE sets cb->nllen. */
2776 ptr += cb->nllen;
2777 break;
2778 }
2779 ptr++;
2780 #ifdef SUPPORT_UNICODE
2781 if (utf) FORWARDCHARTEST(ptr, ptrend);
2782 #endif
2783 }
2784 continue; /* Next character in pattern */
2785 }
2786 }
2787
2788 /* Skip over bracketed comments */
2789
2790 if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
2791 ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
2792 {
2793 while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
2794 if (ptr >= ptrend)
2795 {
2796 errorcode = ERR18; /* A special error for missing ) in a comment */
2797 goto FAILED; /* to make it easier to debug. */
2798 }
2799 ptr++;
2800 continue; /* Next character in pattern */
2801 }
2802
2803 /* If the next item is not a quantifier, fill in length of any previous
2804 callout and create an auto callout if required. */
2805
2806 if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
2807 (c != CHAR_LEFT_CURLY_BRACKET ||
2808 (tempptr = ptr,
2809 !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
2810 {
2811 if (after_manual_callout-- <= 0)
2812 parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
2813 parsed_pattern, cb);
2814 }
2815
2816 /* If expect_cond_assert is 2, we have just passed (?( and are expecting an
2817 assertion, possibly preceded by a callout. If the value is 1, we have just
2818 had the callout and expect an assertion. There must be at least 3 more
2819 characters in all cases. When expect_cond_assert is 2, we know that the
2820 current character is an opening parenthesis, as otherwise we wouldn't be
2821 here. However, when it is 1, we need to check, and it's easiest just to check
2822 always. Note that expect_cond_assert may be negative, since all callouts just
2823 decrement it. */
2824
2825 if (expect_cond_assert > 0)
2826 {
2827 BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
2828 (ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK);
2829 if (ok)
2830 {
2831 if (ptr[0] == CHAR_ASTERISK) /* New alpha assertion format, possibly */
2832 {
2833 ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;
2834 }
2835 else switch(ptr[1]) /* Traditional symbolic format */
2836 {
2837 case CHAR_C:
2838 ok = expect_cond_assert == 2;
2839 break;
2840
2841 case CHAR_EQUALS_SIGN:
2842 case CHAR_EXCLAMATION_MARK:
2843 break;
2844
2845 case CHAR_LESS_THAN_SIGN:
2846 ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
2847 break;
2848
2849 default:
2850 ok = FALSE;
2851 }
2852 }
2853
2854 if (!ok)
2855 {
2856 ptr--; /* Adjust error offset */
2857 errorcode = ERR28;
2858 goto FAILED;
2859 }
2860 }
2861
2862 /* Remember whether we are expecting a conditional assertion, and set the
2863 default for this item. */
2864
2865 prev_expect_cond_assert = expect_cond_assert;
2866 expect_cond_assert = 0;
2867
2868 /* Remember quantification status for the previous significant item, then set
2869 default for this item. */
2870
2871 prev_okquantifier = okquantifier;
2872 prev_meta_quantifier = meta_quantifier;
2873 okquantifier = FALSE;
2874 meta_quantifier = 0;
2875
2876 /* If the previous significant item was a quantifier, adjust the parsed code
2877 if there is a following modifier. The base meta value is always followed by
2878 the PLUS and QUERY values, in that order. We do this here rather than after
2879 reading a quantifier so that intervening comments and /x whitespace can be
2880 ignored without having to replicate code. */
2881
2882 if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
2883 {
2884 parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
2885 prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
2886 0x00020000u : 0x00010000u);
2887 continue; /* Next character in pattern */
2888 }
2889
2890
2891 /* Process the next item in the main part of a pattern. */
2892
2893 switch(c)
2894 {
2895 default: /* Non-special character */
2896 PARSED_LITERAL(c, parsed_pattern);
2897 break;
2898
2899
2900 /* ---- Escape sequence ---- */
2901
2902 case CHAR_BACKSLASH:
2903 tempptr = ptr;
2904 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
2905 cb->cx->extra_options, FALSE, cb);
2906 if (errorcode != 0)
2907 {
2908 ESCAPE_FAILED:
2909 if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
2910 goto FAILED;
2911 ptr = tempptr;
2912 if (ptr >= ptrend) c = CHAR_BACKSLASH; else
2913 {
2914 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
2915 }
2916 escape = 0; /* Treat as literal character */
2917 }
2918
2919 /* The escape was a data escape or literal character. */
2920
2921 if (escape == 0)
2922 {
2923 PARSED_LITERAL(c, parsed_pattern);
2924 }
2925
2926 /* The escape was a back (or forward) reference. We keep the offset in
2927 order to give a more useful diagnostic for a bad forward reference. For
2928 references to groups numbered less than 10 we can't use more than two items
2929 in parsed_pattern because they may be just two characters in the input (and
2930 in a 64-bit world an offset may need two elements). So for them, the offset
2931 of the first occurrence is held in a special vector. */
2932
2933 else if (escape < 0)
2934 {
2935 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1);
2936 escape = -escape;
2937 *parsed_pattern++ = META_BACKREF | (uint32_t)escape;
2938 if (escape < 10)
2939 {
2940 if (cb->small_ref_offset[escape] == PCRE2_UNSET)
2941 cb->small_ref_offset[escape] = offset;
2942 }
2943 else
2944 {
2945 PUTOFFSET(offset, parsed_pattern);
2946 }
2947 okquantifier = TRUE;
2948 }
2949
2950 /* The escape was a character class such as \d etc. or other special
2951 escape indicator such as \A or \X. Most of them generate just a single
2952 parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
2953 value. They are supported only when Unicode is available. The type and
2954 value are packed into a single 32-bit value so that the whole sequence
2955 uses only two elements in the parsed_vector. This is because the same
2956 coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
2957 set.
2958
2959 There are also some cases where the escape sequence is followed by a name:
2960 \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
2961 and \g'name' are subroutine calls by name; \g{name} is a synonym for
2962 \k{name}. Note that \g<number> and \g'number' are handled by check_escape()
2963 and returned as a negative value (handled above). A name is coded as an
2964 offset into the pattern and a length. */
2965
2966 else switch (escape)
2967 {
2968 case ESC_C:
2969 #ifdef NEVER_BACKSLASH_C
2970 errorcode = ERR85;
2971 goto ESCAPE_FAILED;
2972 #else
2973 if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
2974 {
2975 errorcode = ERR83;
2976 goto ESCAPE_FAILED;
2977 }
2978 #endif
2979 okquantifier = TRUE;
2980 *parsed_pattern++ = META_ESCAPE + escape;
2981 break;
2982
2983 case ESC_X:
2984 #ifndef SUPPORT_UNICODE
2985 errorcode = ERR45; /* Supported only with Unicode support */
2986 goto ESCAPE_FAILED;
2987 #endif
2988 case ESC_H:
2989 case ESC_h:
2990 case ESC_N:
2991 case ESC_R:
2992 case ESC_V:
2993 case ESC_v:
2994 okquantifier = TRUE;
2995 *parsed_pattern++ = META_ESCAPE + escape;
2996 break;
2997
2998 default: /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
2999 *parsed_pattern++ = META_ESCAPE + escape;
3000 break;
3001
3002 /* Escapes that change in UCP mode. Note that PCRE2_UCP will never be set
3003 without Unicode support because it is checked when pcre2_compile() is
3004 called. */
3005
3006 case ESC_d:
3007 case ESC_D:
3008 case ESC_s:
3009 case ESC_S:
3010 case ESC_w:
3011 case ESC_W:
3012 okquantifier = TRUE;
3013 if ((options & PCRE2_UCP) == 0)
3014 {
3015 *parsed_pattern++ = META_ESCAPE + escape;
3016 }
3017 else
3018 {
3019 *parsed_pattern++ = META_ESCAPE +
3020 ((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
3021 ESC_p : ESC_P);
3022 switch(escape)
3023 {
3024 case ESC_d:
3025 case ESC_D:
3026 *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
3027 break;
3028
3029 case ESC_s:
3030 case ESC_S:
3031 *parsed_pattern++ = PT_SPACE << 16;
3032 break;
3033
3034 case ESC_w:
3035 case ESC_W:
3036 *parsed_pattern++ = PT_WORD << 16;
3037 break;
3038 }
3039 }
3040 break;
3041
3042 /* Unicode property matching */
3043
3044 case ESC_P:
3045 case ESC_p:
3046 #ifdef SUPPORT_UNICODE
3047 {
3048 BOOL negated;
3049 uint16_t ptype = 0, pdata = 0;
3050 if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3051 goto ESCAPE_FAILED;
3052 if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3053 *parsed_pattern++ = META_ESCAPE + escape;
3054 *parsed_pattern++ = (ptype << 16) | pdata;
3055 okquantifier = TRUE;
3056 }
3057 #else
3058 errorcode = ERR45;
3059 goto ESCAPE_FAILED;
3060 #endif
3061 break; /* End \P and \p */
3062
3063 /* When \g is used with quotes or angle brackets as delimiters, it is a
3064 numerical or named subroutine call, and control comes here. When used
3065 with brace delimiters it is a numerical back reference and does not come
3066 here because check_escape() returns it directly as a reference. \k is
3067 always a named back reference. */
3068
3069 case ESC_g:
3070 case ESC_k:
3071 if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
3072 *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
3073 {
3074 errorcode = (escape == ESC_g)? ERR57 : ERR69;
3075 goto ESCAPE_FAILED;
3076 }
3077 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
3078 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
3079 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
3080
3081 /* For a non-braced \g, check for a numerical recursion. */
3082
3083 if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
3084 {
3085 PCRE2_SPTR p = ptr + 1;
3086
3087 if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3088 &errorcode))
3089 {
3090 if (p >= ptrend || *p != terminator)
3091 {
3092 errorcode = ERR57;
3093 goto ESCAPE_FAILED;
3094 }
3095 ptr = p;
3096 goto SET_RECURSION;
3097 }
3098 if (errorcode != 0) goto ESCAPE_FAILED;
3099 }
3100
3101 /* Not a numerical recursion */
3102
3103 if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
3104 &errorcode, cb)) goto ESCAPE_FAILED;
3105
3106 /* \k and \g when used with braces are back references, whereas \g used
3107 with quotes or angle brackets is a recursion */
3108
3109 *parsed_pattern++ =
3110 (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
3111 META_BACKREF_BYNAME : META_RECURSE_BYNAME;
3112 *parsed_pattern++ = namelen;
3113
3114 PUTOFFSET(offset, parsed_pattern);
3115 okquantifier = TRUE;
3116 break; /* End special escape processing */
3117 }
3118 break; /* End escape sequence processing */
3119
3120
3121 /* ---- Single-character special items ---- */
3122
3123 case CHAR_CIRCUMFLEX_ACCENT:
3124 *parsed_pattern++ = META_CIRCUMFLEX;
3125 break;
3126
3127 case CHAR_DOLLAR_SIGN:
3128 *parsed_pattern++ = META_DOLLAR;
3129 break;
3130
3131 case CHAR_DOT:
3132 *parsed_pattern++ = META_DOT;
3133 okquantifier = TRUE;
3134 break;
3135
3136
3137 /* ---- Single-character quantifiers ---- */
3138
3139 case CHAR_ASTERISK:
3140 meta_quantifier = META_ASTERISK;
3141 goto CHECK_QUANTIFIER;
3142
3143 case CHAR_PLUS:
3144 meta_quantifier = META_PLUS;
3145 goto CHECK_QUANTIFIER;
3146
3147 case CHAR_QUESTION_MARK:
3148 meta_quantifier = META_QUERY;
3149 goto CHECK_QUANTIFIER;
3150
3151
3152 /* ---- Potential {n,m} quantifier ---- */
3153
3154 case CHAR_LEFT_CURLY_BRACKET:
3155 if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
3156 &errorcode))
3157 {
3158 if (errorcode != 0) goto FAILED; /* Error in quantifier. */
3159 PARSED_LITERAL(c, parsed_pattern); /* Not a quantifier */
3160 break; /* No more quantifier processing */
3161 }
3162 meta_quantifier = META_MINMAX;
3163 /* Fall through */
3164
3165
3166 /* ---- Quantifier post-processing ---- */
3167
3168 /* Check that a quantifier is allowed after the previous item. */
3169
3170 CHECK_QUANTIFIER:
3171 if (!prev_okquantifier)
3172 {
3173 errorcode = ERR9;
3174 goto FAILED_BACK;
3175 }
3176
3177 /* Most (*VERB)s are not allowed to be quantified, but an ungreedy
3178 quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a
3179 sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by
3180 wrapping it in non-capturing brackets, but we have to allow for a preceding
3181 (*MARK) for when (*ACCEPT) has an argument. */
3182
3183 if (parsed_pattern[-1] == META_ACCEPT)
3184 {
3185 uint32_t *p;
3186 for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0];
3187 *verbstartptr = META_NOCAPTURE;
3188 parsed_pattern[1] = META_KET;
3189 parsed_pattern += 2;
3190 }
3191
3192 /* Now we can put the quantifier into the parsed pattern vector. At this
3193 stage, we have only the basic quantifier. The check for a following + or ?
3194 modifier happens at the top of the loop, after any intervening comments
3195 have been removed. */
3196
3197 *parsed_pattern++ = meta_quantifier;
3198 if (c == CHAR_LEFT_CURLY_BRACKET)
3199 {
3200 *parsed_pattern++ = min_repeat;
3201 *parsed_pattern++ = max_repeat;
3202 }
3203 break;
3204
3205
3206 /* ---- Character class ---- */
3207
3208 case CHAR_LEFT_SQUARE_BRACKET:
3209 okquantifier = TRUE;
3210
3211 /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
3212 used for "start of word" and "end of word". As these are otherwise illegal
3213 sequences, we don't break anything by recognizing them. They are replaced
3214 by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3215 erroneous and are handled by the normal code below. */
3216
3217 if (ptrend - ptr >= 6 &&
3218 (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
3219 PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
3220 {
3221 *parsed_pattern++ = META_ESCAPE + ESC_b;
3222
3223 if (ptr[2] == CHAR_LESS_THAN_SIGN)
3224 {
3225 *parsed_pattern++ = META_LOOKAHEAD;
3226 }
3227 else
3228 {
3229 *parsed_pattern++ = META_LOOKBEHIND;
3230 *has_lookbehind = TRUE;
3231
3232 /* The offset is used only for the "non-fixed length" error; this won't
3233 occur here, so just store zero. */
3234
3235 PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
3236 }
3237
3238 if ((options & PCRE2_UCP) == 0)
3239 *parsed_pattern++ = META_ESCAPE + ESC_w;
3240 else
3241 {
3242 *parsed_pattern++ = META_ESCAPE + ESC_p;
3243 *parsed_pattern++ = PT_WORD << 16;
3244 }
3245 *parsed_pattern++ = META_KET;
3246 ptr += 6;
3247 break;
3248 }
3249
3250 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3251 they are encountered at the top level, so we'll do that too. */
3252
3253 if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3254 *ptr == CHAR_EQUALS_SIGN) &&
3255 check_posix_syntax(ptr, ptrend, &tempptr))
3256 {
3257 errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3258 goto FAILED;
3259 }
3260
3261 /* Process a regular character class. If the first character is '^', set
3262 the negation flag. If the first few characters (either before or after ^)
3263 are \Q\E or \E or space or tab in extended-more mode, we skip them too.
3264 This makes for compatibility with Perl. */
3265
3266 negate_class = FALSE;
3267 while (ptr < ptrend)
3268 {
3269 GETCHARINCTEST(c, ptr);
3270 if (c == CHAR_BACKSLASH)
3271 {
3272 if (ptr < ptrend && *ptr == CHAR_E) ptr++;
3273 else if (ptrend - ptr >= 3 &&
3274 PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3275 ptr += 3;
3276 else
3277 break;
3278 }
3279 else if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3280 (c == CHAR_SPACE || c == CHAR_HT)) /* Note: just these two */
3281 continue;
3282 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3283 negate_class = TRUE;
3284 else break;
3285 }
3286
3287 /* Now the real contents of the class; c has the first "real" character.
3288 Empty classes are permitted only if the option is set. */
3289
3290 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3291 (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
3292 {
3293 *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
3294 break; /* End of class processing */
3295 }
3296
3297 /* Process a non-empty class. */
3298
3299 *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
3300 class_range_state = RANGE_NO;
3301
3302 /* In an EBCDIC environment, Perl treats alphabetic ranges specially
3303 because there are holes in the encoding, and simply using the range A-Z
3304 (for example) would include the characters in the holes. This applies only
3305 to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3306 in this respect. In order to accommodate this, we keep track of whether
3307 character values are literal or not, and a state variable for handling
3308 ranges. */
3309
3310 /* Loop for the contents of the class */
3311
3312 for (;;)
3313 {
3314 BOOL char_is_literal = TRUE;
3315
3316 /* Inside \Q...\E everything is literal except \E */
3317
3318 if (inescq)
3319 {
3320 if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3321 {
3322 inescq = FALSE; /* Reset literal state */
3323 ptr++; /* Skip the 'E' */
3324 goto CLASS_CONTINUE;
3325 }
3326 goto CLASS_LITERAL;
3327 }
3328
3329 /* Skip over space and tab (only) in extended-more mode. */
3330
3331 if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3332 (c == CHAR_SPACE || c == CHAR_HT))
3333 goto CLASS_CONTINUE;
3334
3335 /* Handle POSIX class names. Perl allows a negation extension of the
3336 form [:^name:]. A square bracket that doesn't match the syntax is
3337 treated as a literal. We also recognize the POSIX constructions
3338 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3339 5.6 and 5.8 do. */
3340
3341 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3342 ptrend - ptr >= 3 &&
3343 (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3344 *ptr == CHAR_EQUALS_SIGN) &&
3345 check_posix_syntax(ptr, ptrend, &tempptr))
3346 {
3347 BOOL posix_negate = FALSE;
3348 int posix_class;
3349
3350 /* Perl treats a hyphen before a POSIX class as a literal, not the
3351 start of a range. However, it gives a warning in its warning mode. PCRE
3352 does not have a warning mode, so we give an error, because this is
3353 likely an error on the user's part. */
3354
3355 if (class_range_state == RANGE_STARTED)
3356 {
3357 errorcode = ERR50;
3358 goto FAILED;
3359 }
3360
3361 if (*ptr != CHAR_COLON)
3362 {
3363 errorcode = ERR13;
3364 goto FAILED_BACK;
3365 }
3366
3367 if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
3368 {
3369 posix_negate = TRUE;
3370 ptr++;
3371 }
3372
3373 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3374 if (posix_class < 0)
3375 {
3376 errorcode = ERR30;
3377 goto FAILED;
3378 }
3379 ptr = tempptr + 2;
3380
3381 /* Perl treats a hyphen after a POSIX class as a literal, not the
3382 start of a range. However, it gives a warning in its warning mode
3383 unless the hyphen is the last character in the class. PCRE does not
3384 have a warning mode, so we give an error, because this is likely an
3385 error on the user's part. */
3386
3387 if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3388 ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3389 {
3390 errorcode = ERR50;
3391 goto FAILED;
3392 }
3393
3394 /* Set "a hyphen is not the start of a range" for the -] case, and also
3395 in case the POSIX class is followed by \E or \Q\E (possibly repeated -
3396 fuzzers do that kind of thing) and *then* a hyphen. This causes that
3397 hyphen to be treated as a literal. I don't think it's worth setting up
3398 special apparatus to do otherwise. */
3399
3400 class_range_state = RANGE_NO;
3401
3402 /* When PCRE2_UCP is set, some of the POSIX classes are converted to
3403 use Unicode properties \p or \P or, in one case, \h or \H. The
3404 substitutes table has two values per class, containing the type and
3405 value of a \p or \P item. The special cases are specified with a
3406 negative type: a non-zero value causes \h or \H to be used, and a zero
3407 value falls through to behave like a non-UCP POSIX class. */
3408
3409 #ifdef SUPPORT_UNICODE
3410 if ((options & PCRE2_UCP) != 0)
3411 {
3412 int ptype = posix_substitutes[2*posix_class];
3413 int pvalue = posix_substitutes[2*posix_class + 1];
3414 if (ptype >= 0)
3415 {
3416 *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
3417 *parsed_pattern++ = (ptype << 16) | pvalue;
3418 goto CLASS_CONTINUE;
3419 }
3420
3421 if (pvalue != 0)
3422 {
3423 *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
3424 goto CLASS_CONTINUE;
3425 }
3426
3427 /* Fall through */
3428 }
3429 #endif /* SUPPORT_UNICODE */
3430
3431 /* Non-UCP POSIX class */
3432
3433 *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
3434 *parsed_pattern++ = posix_class;
3435 }
3436
3437 /* Handle potential start of range */
3438
3439 else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
3440 {
3441 *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
3442 META_RANGE_LITERAL : META_RANGE_ESCAPED;
3443 class_range_state = RANGE_STARTED;
3444 }
3445
3446 /* Handle a literal character */
3447
3448 else if (c != CHAR_BACKSLASH)
3449 {
3450 CLASS_LITERAL:
3451 if (class_range_state == RANGE_STARTED)
3452 {
3453 if (c == parsed_pattern[-2]) /* Optimize one-char range */
3454 parsed_pattern--;
3455 else if (parsed_pattern[-2] > c) /* Check range is in order */
3456 {
3457 errorcode = ERR8;
3458 goto FAILED_BACK;
3459 }
3460 else
3461 {
3462 if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
3463 parsed_pattern[-1] = META_RANGE_ESCAPED;
3464 PARSED_LITERAL(c, parsed_pattern);
3465 }
3466 class_range_state = RANGE_NO;
3467 }
3468 else /* Potential start of range */
3469 {
3470 class_range_state = char_is_literal?
3471 RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
3472 PARSED_LITERAL(c, parsed_pattern);
3473 }
3474 }
3475
3476 /* Handle escapes in a class */
3477
3478 else
3479 {
3480 tempptr = ptr;
3481 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3482 cb->cx->extra_options, TRUE, cb);
3483
3484 if (errorcode != 0)
3485 {
3486 if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3487 goto FAILED;
3488 ptr = tempptr;
3489 if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3490 {
3491 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
3492 }
3493 escape = 0; /* Treat as literal character */
3494 }
3495
3496 switch(escape)
3497 {
3498 case 0: /* Escaped character code point is in c */
3499 char_is_literal = FALSE;
3500 goto CLASS_LITERAL;
3501
3502 case ESC_b:
3503 c = CHAR_BS; /* \b is backspace in a class */
3504 char_is_literal = FALSE;
3505 goto CLASS_LITERAL;
3506
3507 case ESC_Q:
3508 inescq = TRUE; /* Enter literal mode */
3509 goto CLASS_CONTINUE;
3510
3511 case ESC_E: /* Ignore orphan \E */
3512 goto CLASS_CONTINUE;
3513
3514 case ESC_B: /* Always an error in a class */
3515 case ESC_R:
3516 case ESC_X:
3517 errorcode = ERR7;
3518 ptr--;
3519 goto FAILED;
3520 }
3521
3522 /* The second part of a range can be a single-character escape
3523 sequence (detected above), but not any of the other escapes. Perl
3524 treats a hyphen as a literal in such circumstances. However, in Perl's
3525 warning mode, a warning is given, so PCRE now faults it, as it is
3526 almost certainly a mistake on the user's part. */
3527
3528 if (class_range_state == RANGE_STARTED)
3529 {
3530 errorcode = ERR50;
3531 goto FAILED; /* Not CLASS_ESCAPE_FAILED; always an error */
3532 }
3533
3534 /* Of the remaining escapes, only those that define characters are
3535 allowed in a class. None may start a range. */
3536
3537 class_range_state = RANGE_NO;
3538 switch(escape)
3539 {
3540 case ESC_N:
3541 errorcode = ERR71;
3542 goto FAILED;
3543
3544 case ESC_H:
3545 case ESC_h:
3546 case ESC_V:
3547 case ESC_v:
3548 *parsed_pattern++ = META_ESCAPE + escape;
3549 break;
3550
3551 /* These escapes are converted to Unicode property tests when
3552 PCRE2_UCP is set. */
3553
3554 case ESC_d:
3555 case ESC_D:
3556 case ESC_s:
3557 case ESC_S:
3558 case ESC_w:
3559 case ESC_W:
3560 if ((options & PCRE2_UCP) == 0)
3561 {
3562 *parsed_pattern++ = META_ESCAPE + escape;
3563 }
3564 else
3565 {
3566 *parsed_pattern++ = META_ESCAPE +
3567 ((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
3568 ESC_p : ESC_P);
3569 switch(escape)
3570 {
3571 case ESC_d:
3572 case ESC_D:
3573 *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
3574 break;
3575
3576 case ESC_s:
3577 case ESC_S:
3578 *parsed_pattern++ = PT_SPACE << 16;
3579 break;
3580
3581 case ESC_w:
3582 case ESC_W:
3583 *parsed_pattern++ = PT_WORD << 16;
3584 break;
3585 }
3586 }
3587 break;
3588
3589 /* Explicit Unicode property matching */
3590
3591 case ESC_P:
3592 case ESC_p:
3593 #ifdef SUPPORT_UNICODE
3594 {
3595 BOOL negated;
3596 uint16_t ptype = 0, pdata = 0;
3597 if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3598 goto FAILED;
3599 if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3600 *parsed_pattern++ = META_ESCAPE + escape;
3601 *parsed_pattern++ = (ptype << 16) | pdata;
3602 }
3603 #else
3604 errorcode = ERR45;
3605 goto FAILED;
3606 #endif
3607 break; /* End \P and \p */
3608
3609 default: /* All others are not allowed in a class */
3610 errorcode = ERR7;
3611 ptr--;
3612 goto FAILED;
3613 }
3614
3615 /* Perl gives a warning unless a following hyphen is the last character
3616 in the class. PCRE throws an error. */
3617
3618 if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3619 ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3620 {
3621 errorcode = ERR50;
3622 goto FAILED;
3623 }
3624 }
3625
3626 /* Proceed to next thing in the class. */
3627
3628 CLASS_CONTINUE:
3629 if (ptr >= ptrend)
3630 {
3631 errorcode = ERR6; /* Missing terminating ']' */
3632 goto FAILED;
3633 }
3634 GETCHARINCTEST(c, ptr);
3635 if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
3636 } /* End of class-processing loop */
3637
3638 /* -] at the end of a class is a literal '-' */
3639
3640 if (class_range_state == RANGE_STARTED)
3641 {
3642 parsed_pattern[-1] = CHAR_MINUS;
3643 class_range_state = RANGE_NO;
3644 }
3645
3646 *parsed_pattern++ = META_CLASS_END;
3647 break; /* End of character class */
3648
3649
3650 /* ---- Opening parenthesis ---- */
3651
3652 case CHAR_LEFT_PARENTHESIS:
3653 if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3654
3655 /* If ( is not followed by ? it is either a capture or a special verb or an
3656 alpha assertion or a positive non-atomic lookahead. */
3657
3658 if (*ptr != CHAR_QUESTION_MARK)
3659 {
3660 const char *vn;
3661
3662 /* Handle capturing brackets (or non-capturing if auto-capture is turned
3663 off). */
3664
3665 if (*ptr != CHAR_ASTERISK)
3666 {
3667 nest_depth++;
3668 if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
3669 {
3670 if (cb->bracount >= MAX_GROUP_NUMBER)
3671 {
3672 errorcode = ERR97;
3673 goto FAILED;
3674 }
3675 cb->bracount++;
3676 *parsed_pattern++ = META_CAPTURE | cb->bracount;
3677 }
3678 else *parsed_pattern++ = META_NOCAPTURE;
3679 }
3680
3681 /* Do nothing for (* followed by end of pattern or ) so it gives a "bad
3682 quantifier" error rather than "(*MARK) must have an argument". */
3683
3684 else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)
3685 break;
3686
3687 /* Handle "alpha assertions" such as (*pla:...). Most of these are
3688 synonyms for the historical symbolic assertions, but the script run and
3689 non-atomic lookaround ones are new. They are distinguished by starting
3690 with a lower case letter, which is tested using the ctype_lcletter bit in
3691 the cb->ctypes table. */
3692
3693 else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
3694 {
3695 uint32_t meta;
3696
3697 vn = alasnames;
3698 if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
3699 &errorcode, cb)) goto FAILED;
3700 if (ptr >= ptrend || *ptr != CHAR_COLON)
3701 {
3702 errorcode = ERR95; /* Malformed */
3703 goto FAILED;
3704 }
3705
3706 /* Scan the table of alpha assertion names */
3707
3708 for (i = 0; i < alascount; i++)
3709 {
3710 if (namelen == alasmeta[i].len &&
3711 PRIV(strncmp_c8)(name, vn, namelen) == 0)
3712 break;
3713 vn += alasmeta[i].len + 1;
3714 }
3715
3716 if (i >= alascount)
3717 {
3718 errorcode = ERR95; /* Alpha assertion not recognized */
3719 goto FAILED;
3720 }
3721
3722 /* Check for expecting an assertion condition. If so, only atomic
3723 lookaround assertions are valid. */
3724
3725 meta = alasmeta[i].meta;
3726 if (prev_expect_cond_assert > 0 &&
3727 (meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))
3728 {
3729 errorcode = (meta == META_LOOKAHEAD_NA || meta == META_LOOKBEHIND_NA)?
3730 ERR98 : ERR28; /* (Atomic) assertion expected */
3731 goto FAILED;
3732 }
3733
3734 /* The lookaround alphabetic synonyms can mostly be handled by jumping
3735 to the code that handles the traditional symbolic forms. */
3736
3737 switch(meta)
3738 {
3739 default:
3740 errorcode = ERR89; /* Unknown code; should never occur because */
3741 goto FAILED; /* the meta values come from a table above. */
3742
3743 case META_ATOMIC:
3744 goto ATOMIC_GROUP;
3745
3746 case META_LOOKAHEAD:
3747 goto POSITIVE_LOOK_AHEAD;
3748
3749 case META_LOOKAHEAD_NA:
3750 goto POSITIVE_NONATOMIC_LOOK_AHEAD;
3751
3752 case META_LOOKAHEADNOT:
3753 goto NEGATIVE_LOOK_AHEAD;
3754
3755 case META_LOOKBEHIND:
3756 case META_LOOKBEHINDNOT:
3757 case META_LOOKBEHIND_NA:
3758 *parsed_pattern++ = meta;
3759 ptr--;
3760 goto POST_LOOKBEHIND;
3761
3762 /* The script run facilities are handled here. Unicode support is
3763 required (give an error if not, as this is a security issue). Always
3764 record a META_SCRIPT_RUN item. Then, for the atomic version, insert
3765 META_ATOMIC and remember that we need two META_KETs at the end. */
3766
3767 case META_SCRIPT_RUN:
3768 case META_ATOMIC_SCRIPT_RUN:
3769 #ifdef SUPPORT_UNICODE
3770 *parsed_pattern++ = META_SCRIPT_RUN;
3771 nest_depth++;
3772 ptr++;
3773 if (meta == META_ATOMIC_SCRIPT_RUN)
3774 {
3775 *parsed_pattern++ = META_ATOMIC;
3776 if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
3777 else if (++top_nest >= end_nests)
3778 {
3779 errorcode = ERR84;
3780 goto FAILED;
3781 }
3782 top_nest->nest_depth = nest_depth;
3783 top_nest->flags = NSF_ATOMICSR;
3784 top_nest->options = options & PARSE_TRACKED_OPTIONS;
3785 }
3786 break;
3787 #else /* SUPPORT_UNICODE */
3788 errorcode = ERR96;
3789 goto FAILED;
3790 #endif
3791 }
3792 }
3793
3794
3795 /* ---- Handle (*VERB) and (*VERB:NAME) ---- */
3796
3797 else
3798 {
3799 vn = verbnames;
3800 if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
3801 &errorcode, cb)) goto FAILED;
3802 if (ptr >= ptrend || (*ptr != CHAR_COLON &&
3803 *ptr != CHAR_RIGHT_PARENTHESIS))
3804 {
3805 errorcode = ERR60; /* Malformed */
3806 goto FAILED;
3807 }
3808
3809 /* Scan the table of verb names */
3810
3811 for (i = 0; i < verbcount; i++)
3812 {
3813 if (namelen == verbs[i].len &&
3814 PRIV(strncmp_c8)(name, vn, namelen) == 0)
3815 break;
3816 vn += verbs[i].len + 1;
3817 }
3818
3819 if (i >= verbcount)
3820 {
3821 errorcode = ERR60; /* Verb not recognized */
3822 goto FAILED;
3823 }
3824
3825 /* An empty argument is treated as no argument. */
3826
3827 if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
3828 ptr[1] == CHAR_RIGHT_PARENTHESIS)
3829 ptr++; /* Advance to the closing parens */
3830
3831 /* Check for mandatory non-empty argument; this is (*MARK) */
3832
3833 if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
3834 {
3835 errorcode = ERR66;
3836 goto FAILED;
3837 }
3838
3839 /* Remember where this verb, possibly with a preceding (*MARK), starts,
3840 for handling quantified (*ACCEPT). */
3841
3842 verbstartptr = parsed_pattern;
3843 okquantifier = (verbs[i].meta == META_ACCEPT);
3844
3845 /* It appears that Perl allows any characters whatsoever, other than a
3846 closing parenthesis, to appear in arguments ("names"), so we no longer
3847 insist on letters, digits, and underscores. Perl does not, however, do
3848 any interpretation within arguments, and has no means of including a
3849 closing parenthesis. PCRE supports escape processing but only when it
3850 is requested by an option. We set inverbname TRUE here, and let the
3851 main loop take care of this so that escape and \x processing is done by
3852 the main code above. */
3853
3854 if (*ptr++ == CHAR_COLON) /* Skip past : or ) */
3855 {
3856 /* Some optional arguments can be treated as a preceding (*MARK) */
3857
3858 if (verbs[i].has_arg < 0)
3859 {
3860 add_after_mark = verbs[i].meta;
3861 *parsed_pattern++ = META_MARK;
3862 }
3863
3864 /* The remaining verbs with arguments (except *MARK) need a different
3865 opcode. */
3866
3867 else
3868 {
3869 *parsed_pattern++ = verbs[i].meta +
3870 ((verbs[i].meta != META_MARK)? 0x00010000u:0);
3871 }
3872
3873 /* Set up for reading the name in the main loop. */
3874
3875 verblengthptr = parsed_pattern++;
3876 verbnamestart = ptr;
3877 inverbname = TRUE;
3878 }
3879 else /* No verb "name" argument */
3880 {
3881 *parsed_pattern++ = verbs[i].meta;
3882 }
3883 } /* End of (*VERB) handling */
3884 break; /* Done with this parenthesis */
3885 } /* End of groups that don't start with (? */
3886
3887
3888 /* ---- Items starting (? ---- */
3889
3890 /* The type of item is determined by what follows (?. Handle (?| and option
3891 changes under "default" because both need a new block on the nest stack.
3892 Comments starting with (?# are handled above. Note that there is some
3893 ambiguity about the sequence (?- because if a digit follows it's a relative
3894 recursion or subroutine call whereas otherwise it's an option unsetting. */
3895
3896 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3897
3898 switch(*ptr)
3899 {
3900 default:
3901 if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
3902 goto RECURSION_BYNUMBER; /* The + case is handled by CHAR_PLUS */
3903
3904 /* We now have either (?| or a (possibly empty) option setting,
3905 optionally followed by a non-capturing group. */
3906
3907 nest_depth++;
3908 if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
3909 else if (++top_nest >= end_nests)
3910 {
3911 errorcode = ERR84;
3912 goto FAILED;
3913 }
3914 top_nest->nest_depth = nest_depth;
3915 top_nest->flags = 0;
3916 top_nest->options = options & PARSE_TRACKED_OPTIONS;
3917
3918 /* Start of non-capturing group that resets the capture count for each
3919 branch. */
3920
3921 if (*ptr == CHAR_VERTICAL_LINE)
3922 {
3923 top_nest->reset_group = (uint16_t)cb->bracount;
3924 top_nest->max_group = (uint16_t)cb->bracount;
3925 top_nest->flags |= NSF_RESET;
3926 cb->external_flags |= PCRE2_DUPCAPUSED;
3927 *parsed_pattern++ = META_NOCAPTURE;
3928 ptr++;
3929 }
3930
3931 /* Scan for options imnsxJU to be set or unset. */
3932
3933 else
3934 {
3935 BOOL hyphenok = TRUE;
3936 uint32_t oldoptions = options;
3937
3938 top_nest->reset_group = 0;
3939 top_nest->max_group = 0;
3940 set = unset = 0;
3941 optset = &set;
3942
3943 /* ^ at the start unsets imnsx and disables the subsequent use of - */
3944
3945 if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
3946 {
3947 options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
3948 PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
3949 hyphenok = FALSE;
3950 ptr++;
3951 }
3952
3953 while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
3954 *ptr != CHAR_COLON)
3955 {
3956 switch (*ptr++)
3957 {
3958 case CHAR_MINUS:
3959 if (!hyphenok)
3960 {
3961 errorcode = ERR94;
3962 ptr--; /* Correct the offset */
3963 goto FAILED;
3964 }
3965 optset = &unset;
3966 hyphenok = FALSE;
3967 break;
3968
3969 case CHAR_J: /* Record that it changed in the external options */
3970 *optset |= PCRE2_DUPNAMES;
3971 cb->external_flags |= PCRE2_JCHANGED;
3972 break;
3973
3974 case CHAR_i: *optset |= PCRE2_CASELESS; break;
3975 case CHAR_m: *optset |= PCRE2_MULTILINE; break;
3976 case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
3977 case CHAR_s: *optset |= PCRE2_DOTALL; break;
3978 case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
3979
3980 /* If x appears twice it sets the extended extended option. */
3981
3982 case CHAR_x:
3983 *optset |= PCRE2_EXTENDED;
3984 if (ptr < ptrend && *ptr == CHAR_x)
3985 {
3986 *optset |= PCRE2_EXTENDED_MORE;
3987 ptr++;
3988 }
3989 break;
3990
3991 default:
3992 errorcode = ERR11;
3993 ptr--; /* Correct the offset */
3994 goto FAILED;
3995 }
3996 }
3997
3998 /* If we are setting extended without extended-more, ensure that any
3999 existing extended-more gets unset. Also, unsetting extended must also
4000 unset extended-more. */
4001
4002 if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
4003 (unset & PCRE2_EXTENDED) != 0)
4004 unset |= PCRE2_EXTENDED_MORE;
4005
4006 options = (options | set) & (~unset);
4007
4008 /* If the options ended with ')' this is not the start of a nested
4009 group with option changes, so the options change at this level.
4010 In this case, if the previous level set up a nest block, discard the
4011 one we have just created. Otherwise adjust it for the previous level.
4012 If the options ended with ':' we are starting a non-capturing group,
4013 possibly with an options setting. */
4014
4015 if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4016 if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
4017 {
4018 nest_depth--; /* This is not a nested group after all. */
4019 if (top_nest > (nest_save *)(cb->start_workspace) &&
4020 (top_nest-1)->nest_depth == nest_depth) top_nest--;
4021 else top_nest->nest_depth = nest_depth;
4022 }
4023 else *parsed_pattern++ = META_NOCAPTURE;
4024
4025 /* If nothing changed, no need to record. */
4026
4027 if (options != oldoptions)
4028 {
4029 *parsed_pattern++ = META_OPTIONS;
4030 *parsed_pattern++ = options;
4031 }
4032 } /* End options processing */
4033 break; /* End default case after (? */
4034
4035
4036 /* ---- Python syntax support ---- */
4037
4038 case CHAR_P:
4039 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4040
4041 /* (?P<name> is the same as (?<name>, which defines a named group. */
4042
4043 if (*ptr == CHAR_LESS_THAN_SIGN)
4044 {
4045 terminator = CHAR_GREATER_THAN_SIGN;
4046 goto DEFINE_NAME;
4047 }
4048
4049 /* (?P>name) is the same as (?&name), which is a recursion or subroutine
4050 call. */
4051
4052 if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
4053
4054 /* (?P=name) is the same as \k<name>, a back reference by name. Anything
4055 else after (?P is an error. */
4056
4057 if (*ptr != CHAR_EQUALS_SIGN)
4058 {
4059 errorcode = ERR41;
4060 goto FAILED;
4061 }
4062 if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4063 &namelen, &errorcode, cb)) goto FAILED;
4064 *parsed_pattern++ = META_BACKREF_BYNAME;
4065 *parsed_pattern++ = namelen;
4066 PUTOFFSET(offset, parsed_pattern);
4067 okquantifier = TRUE;
4068 break; /* End of (?P processing */
4069
4070
4071 /* ---- Recursion/subroutine calls by number ---- */
4072
4073 case CHAR_R:
4074 i = 0; /* (?R) == (?R0) */
4075 ptr++;
4076 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4077 {
4078 errorcode = ERR58;
4079 goto FAILED;
4080 }
4081 goto SET_RECURSION;
4082
4083 /* An item starting (?- followed by a digit comes here via the "default"
4084 case because (?- followed by a non-digit is an options setting. */
4085
4086 case CHAR_PLUS:
4087 if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1]))
4088 {
4089 errorcode = ERR29; /* Missing number */
4090 goto FAILED;
4091 }
4092 /* Fall through */
4093
4094 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
4095 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
4096 RECURSION_BYNUMBER:
4097 if (!read_number(&ptr, ptrend,
4098 (IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
4099 MAX_GROUP_NUMBER, ERR61,
4100 &i, &errorcode)) goto FAILED;
4101 if (i < 0) /* NB (?0) is permitted */
4102 {
4103 errorcode = ERR15; /* Unknown group */
4104 goto FAILED_BACK;
4105 }
4106 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4107 goto UNCLOSED_PARENTHESIS;
4108
4109 SET_RECURSION:
4110 *parsed_pattern++ = META_RECURSE | (uint32_t)i;
4111 offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4112 ptr++;
4113 PUTOFFSET(offset, parsed_pattern);
4114 okquantifier = TRUE;
4115 break; /* End of recursive call by number handling */
4116
4117
4118 /* ---- Recursion/subroutine calls by name ---- */
4119
4120 case CHAR_AMPERSAND:
4121 RECURSE_BY_NAME:
4122 if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4123 &namelen, &errorcode, cb)) goto FAILED;
4124 *parsed_pattern++ = META_RECURSE_BYNAME;
4125 *parsed_pattern++ = namelen;
4126 PUTOFFSET(offset, parsed_pattern);
4127 okquantifier = TRUE;
4128 break;
4129
4130 /* ---- Callout with numerical or string argument ---- */
4131
4132 case CHAR_C:
4133 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4134
4135 /* If the previous item was a condition starting (?(? an assertion,
4136 optionally preceded by a callout, is expected. This is checked later on,
4137 during actual compilation. However we need to identify this kind of
4138 assertion in this pass because it must not be qualified. The value of
4139 expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4140 for a callout - still leaving a positive value that identifies the
4141 assertion. Multiple callouts or any other items will make it zero or
4142 less, which doesn't matter because they will cause an error later. */
4143
4144 expect_cond_assert = prev_expect_cond_assert - 1;
4145
4146 /* If previous_callout is not NULL, it means this follows a previous
4147 callout. If it was a manual callout, do nothing; this means its "length
4148 of next pattern item" field will remain zero. If it was an automatic
4149 callout, abolish it. */
4150
4151 if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
4152 previous_callout == parsed_pattern - 4 &&
4153 parsed_pattern[-1] == 255)
4154 parsed_pattern = previous_callout;
4155
4156 /* Save for updating next pattern item length, and skip one item before
4157 completing. */
4158
4159 previous_callout = parsed_pattern;
4160 after_manual_callout = 1;
4161
4162 /* Handle a string argument; specific delimiter is required. */
4163
4164 if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
4165 {
4166 PCRE2_SIZE calloutlength;
4167 PCRE2_SPTR startptr = ptr;
4168
4169 delimiter = 0;
4170 for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
4171 {
4172 if (*ptr == PRIV(callout_start_delims)[i])
4173 {
4174 delimiter = PRIV(callout_end_delims)[i];
4175 break;
4176 }
4177 }
4178 if (delimiter == 0)
4179 {
4180 errorcode = ERR82;
4181 goto FAILED;
4182 }
4183
4184 *parsed_pattern = META_CALLOUT_STRING;
4185 parsed_pattern += 3; /* Skip pattern info */
4186
4187 for (;;)
4188 {
4189 if (++ptr >= ptrend)
4190 {
4191 errorcode = ERR81;
4192 ptr = startptr; /* To give a more useful message */
4193 goto FAILED;
4194 }
4195 if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
4196 break;
4197 }
4198
4199 calloutlength = (PCRE2_SIZE)(ptr - startptr);
4200 if (calloutlength > UINT32_MAX)
4201 {
4202 errorcode = ERR72;
4203 goto FAILED;
4204 }
4205 *parsed_pattern++ = (uint32_t)calloutlength;
4206 offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
4207 PUTOFFSET(offset, parsed_pattern);
4208 }
4209
4210 /* Handle a callout with an optional numerical argument, which must be
4211 less than or equal to 255. A missing argument gives 0. */
4212
4213 else
4214 {
4215 int n = 0;
4216 *parsed_pattern = META_CALLOUT_NUMBER; /* Numerical callout */
4217 parsed_pattern += 3; /* Skip pattern info */
4218 while (ptr < ptrend && IS_DIGIT(*ptr))
4219 {
4220 n = n * 10 + *ptr++ - CHAR_0;
4221 if (n > 255)
4222 {
4223 errorcode = ERR38;
4224 goto FAILED;
4225 }
4226 }
4227 *parsed_pattern++ = n;
4228 }
4229
4230 /* Both formats must have a closing parenthesis */
4231
4232 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4233 {
4234 errorcode = ERR39;
4235 goto FAILED;
4236 }
4237 ptr++;
4238
4239 /* Remember the offset to the next item in the pattern, and set a default
4240 length. This should get updated after the next item is read. */
4241
4242 previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
4243 previous_callout[2] = 0;
4244 break; /* End callout */
4245
4246
4247 /* ---- Conditional group ---- */
4248
4249 /* A condition can be an assertion, a number (referring to a numbered
4250 group's having been set), a name (referring to a named group), or 'R',
4251 referring to overall recursion. R<digits> and R&name are also permitted
4252 for recursion state tests. Numbers may be preceded by + or - to specify a
4253 relative group number.
4254
4255 There are several syntaxes for testing a named group: (?(name)) is used
4256 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4257
4258 There are two unfortunate ambiguities. 'R' can be the recursive thing or
4259 the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
4260 the Perl DEFINE feature or the Python named test. We look for a name
4261 first; if not found, we try the other case.
4262
4263 For compatibility with auto-callouts, we allow a callout to be specified
4264 before a condition that is an assertion. */
4265
4266 case CHAR_LEFT_PARENTHESIS:
4267 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4268 nest_depth++;
4269
4270 /* If the next character is ? or * there must be an assertion next
4271 (optionally preceded by a callout). We do not check this here, but
4272 instead we set expect_cond_assert to 2. If this is still greater than
4273 zero (callouts decrement it) when the next assertion is read, it will be
4274 marked as a condition that must not be repeated. A value greater than
4275 zero also causes checking that an assertion (possibly with callout)
4276 follows. */
4277
4278 if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK)
4279 {
4280 *parsed_pattern++ = META_COND_ASSERT;
4281 ptr--; /* Pull pointer back to the opening parenthesis. */
4282 expect_cond_assert = 2;
4283 break; /* End of conditional */
4284 }
4285
4286 /* Handle (?([+-]number)... */
4287
4288 if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
4289 &errorcode))
4290 {
4291 if (i <= 0)
4292 {
4293 errorcode = ERR15;
4294 goto FAILED;
4295 }
4296 *parsed_pattern++ = META_COND_NUMBER;
4297 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4298 PUTOFFSET(offset, parsed_pattern);
4299 *parsed_pattern++ = i;
4300 }
4301 else if (errorcode != 0) goto FAILED; /* Number too big */
4302
4303 /* No number found. Handle the special case (?(VERSION[>]=n.m)... */
4304
4305 else if (ptrend - ptr >= 10 &&
4306 PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
4307 ptr[7] != CHAR_RIGHT_PARENTHESIS)
4308 {
4309 uint32_t ge = 0;
4310 int major = 0;
4311 int minor = 0;
4312
4313 ptr += 7;
4314 if (*ptr == CHAR_GREATER_THAN_SIGN)
4315 {
4316 ge = 1;
4317 ptr++;
4318 }
4319
4320 /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
4321 references its argument twice. */
4322
4323 if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
4324 goto BAD_VERSION_CONDITION;
4325
4326 if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
4327 goto FAILED;
4328
4329 if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4330 if (*ptr == CHAR_DOT)
4331 {
4332 if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION;
4333 minor = (*ptr++ - CHAR_0) * 10;
4334 if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4335 if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0;
4336 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4337 goto BAD_VERSION_CONDITION;
4338 }
4339
4340 *parsed_pattern++ = META_COND_VERSION;
4341 *parsed_pattern++ = ge;
4342 *parsed_pattern++ = major;
4343 *parsed_pattern++ = minor;
4344 }
4345
4346 /* All the remaining cases now require us to read a name. We cannot at
4347 this stage distinguish ambiguous cases such as (?(R12) which might be a
4348 recursion test by number or a name, because the named groups have not yet
4349 all been identified. Those cases are treated as names, but given a
4350 different META code. */
4351
4352 else
4353 {
4354 BOOL was_r_ampersand = FALSE;
4355
4356 if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
4357 {
4358 terminator = CHAR_RIGHT_PARENTHESIS;
4359 was_r_ampersand = TRUE;
4360 ptr++;
4361 }
4362 else if (*ptr == CHAR_LESS_THAN_SIGN)
4363 terminator = CHAR_GREATER_THAN_SIGN;
4364 else if (*ptr == CHAR_APOSTROPHE)
4365 terminator = CHAR_APOSTROPHE;
4366 else
4367 {
4368 terminator = CHAR_RIGHT_PARENTHESIS;
4369 ptr--; /* Point to char before name */
4370 }
4371 if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4372 &errorcode, cb)) goto FAILED;
4373
4374 /* Handle (?(R&name) */
4375
4376 if (was_r_ampersand)
4377 {
4378 *parsed_pattern = META_COND_RNAME;
4379 ptr--; /* Back to closing parens */
4380 }
4381
4382 /* Handle (?(name). If the name is "DEFINE" we identify it with a
4383 special code. Likewise if the name consists of R followed only by
4384 digits. Otherwise, handle it like a quoted name. */
4385
4386 else if (terminator == CHAR_RIGHT_PARENTHESIS)
4387 {
4388 if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
4389 *parsed_pattern = META_COND_DEFINE;
4390 else
4391 {
4392 for (i = 1; i < (int)namelen; i++)
4393 if (!IS_DIGIT(name[i])) break;
4394 *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
4395 META_COND_RNUMBER : META_COND_NAME;
4396 }
4397 ptr--; /* Back to closing parens */
4398 }
4399
4400 /* Handle (?('name') or (?(<name>) */
4401
4402 else *parsed_pattern = META_COND_NAME;
4403
4404 /* All these cases except DEFINE end with the name length and offset;
4405 DEFINE just has an offset (for the "too many branches" error). */
4406
4407 if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
4408 PUTOFFSET(offset, parsed_pattern);
4409 } /* End cases that read a name */
4410
4411 /* Check the closing parenthesis of the condition */
4412
4413 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4414 {
4415 errorcode = ERR24;
4416 goto FAILED;
4417 }
4418 ptr++;
4419 break; /* End of condition processing */
4420
4421
4422 /* ---- Atomic group ---- */
4423
4424 case CHAR_GREATER_THAN_SIGN:
4425 ATOMIC_GROUP: /* Come from (*atomic: */
4426 *parsed_pattern++ = META_ATOMIC;
4427 nest_depth++;
4428 ptr++;
4429 break;
4430
4431
4432 /* ---- Lookahead assertions ---- */
4433
4434 case CHAR_EQUALS_SIGN:
4435 POSITIVE_LOOK_AHEAD: /* Come from (*pla: */
4436 *parsed_pattern++ = META_LOOKAHEAD;
4437 ptr++;
4438 goto POST_ASSERTION;
4439
4440 case CHAR_ASTERISK:
4441 POSITIVE_NONATOMIC_LOOK_AHEAD: /* Come from (?* */
4442 *parsed_pattern++ = META_LOOKAHEAD_NA;
4443 ptr++;
4444 goto POST_ASSERTION;
4445
4446 case CHAR_EXCLAMATION_MARK:
4447 NEGATIVE_LOOK_AHEAD: /* Come from (*nla: */
4448 *parsed_pattern++ = META_LOOKAHEADNOT;
4449 ptr++;
4450 goto POST_ASSERTION;
4451
4452
4453 /* ---- Lookbehind assertions ---- */
4454
4455 /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
4456 is the start of the name of a capturing group. */
4457
4458 case CHAR_LESS_THAN_SIGN:
4459 if (ptrend - ptr <= 1 ||
4460 (ptr[1] != CHAR_EQUALS_SIGN &&
4461 ptr[1] != CHAR_EXCLAMATION_MARK &&
4462 ptr[1] != CHAR_ASTERISK))
4463 {
4464 terminator = CHAR_GREATER_THAN_SIGN;
4465 goto DEFINE_NAME;
4466 }
4467 *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
4468 META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
4469 META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
4470
4471 POST_LOOKBEHIND: /* Come from (*plb: (*naplb: and (*nlb: */
4472 *has_lookbehind = TRUE;
4473 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4474 PUTOFFSET(offset, parsed_pattern);
4475 ptr += 2;
4476 /* Fall through */
4477
4478 /* If the previous item was a condition starting (?(? an assertion,
4479 optionally preceded by a callout, is expected. This is checked later on,
4480 during actual compilation. However we need to identify this kind of
4481 assertion in this pass because it must not be qualified. The value of
4482 expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4483 for a callout - still leaving a positive value that identifies the
4484 assertion. Multiple callouts or any other items will make it zero or
4485 less, which doesn't matter because they will cause an error later. */
4486
4487 POST_ASSERTION:
4488 nest_depth++;
4489 if (prev_expect_cond_assert > 0)
4490 {
4491 if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4492 else if (++top_nest >= end_nests)
4493 {
4494 errorcode = ERR84;
4495 goto FAILED;
4496 }
4497 top_nest->nest_depth = nest_depth;
4498 top_nest->flags = NSF_CONDASSERT;
4499 top_nest->options = options & PARSE_TRACKED_OPTIONS;
4500 }
4501 break;
4502
4503
4504 /* ---- Define a named group ---- */
4505
4506 /* A named group may be defined as (?'name') or (?<name>). In the latter
4507 case we jump to DEFINE_NAME from the disambiguation of (?< above with the
4508 terminator set to '>'. */
4509
4510 case CHAR_APOSTROPHE:
4511 terminator = CHAR_APOSTROPHE; /* Terminator */
4512
4513 DEFINE_NAME:
4514 if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4515 &errorcode, cb)) goto FAILED;
4516
4517 /* We have a name for this capturing group. It is also assigned a number,
4518 which is its primary means of identification. */
4519
4520 if (cb->bracount >= MAX_GROUP_NUMBER)
4521 {
4522 errorcode = ERR97;
4523 goto FAILED;
4524 }
4525 cb->bracount++;
4526 *parsed_pattern++ = META_CAPTURE | cb->bracount;
4527 nest_depth++;
4528
4529 /* Check not too many names */
4530
4531 if (cb->names_found >= MAX_NAME_COUNT)
4532 {
4533 errorcode = ERR49;
4534 goto FAILED;
4535 }
4536
4537 /* Adjust the entry size to accommodate the longest name found. */
4538
4539 if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
4540 cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
4541
4542 /* Scan the list to check for duplicates. For duplicate names, if the
4543 number is the same, break the loop, which causes the name to be
4544 discarded; otherwise, if DUPNAMES is not set, give an error.
4545 If it is set, allow the name with a different number, but continue
4546 scanning in case this is a duplicate with the same number. For
4547 non-duplicate names, give an error if the number is duplicated. */
4548
4549 isdupname = FALSE;
4550 ng = cb->named_groups;
4551 for (i = 0; i < cb->names_found; i++, ng++)
4552 {
4553 if (namelen == ng->length &&
4554 PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
4555 {
4556 if (ng->number == cb->bracount) break;
4557 if ((options & PCRE2_DUPNAMES) == 0)
4558 {
4559 errorcode = ERR43;
4560 goto FAILED;
4561 }
4562 isdupname = ng->isdup = TRUE; /* Mark as a duplicate */
4563 cb->dupnames = TRUE; /* Duplicate names exist */
4564 }
4565 else if (ng->number == cb->bracount)
4566 {
4567 errorcode = ERR65;
4568 goto FAILED;
4569 }
4570 }
4571
4572 if (i < cb->names_found) break; /* Ignore duplicate with same number */
4573
4574 /* Increase the list size if necessary */
4575
4576 if (cb->names_found >= cb->named_group_list_size)
4577 {
4578 uint32_t newsize = cb->named_group_list_size * 2;
4579 named_group *newspace =
4580 cb->cx->memctl.malloc(newsize * sizeof(named_group),
4581 cb->cx->memctl.memory_data);
4582 if (newspace == NULL)
4583 {
4584 errorcode = ERR21;
4585 goto FAILED;
4586 }
4587
4588 memcpy(newspace, cb->named_groups,
4589 cb->named_group_list_size * sizeof(named_group));
4590 if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
4591 cb->cx->memctl.free((void *)cb->named_groups,
4592 cb->cx->memctl.memory_data);
4593 cb->named_groups = newspace;
4594 cb->named_group_list_size = newsize;
4595 }
4596
4597 /* Add this name to the list */
4598
4599 cb->named_groups[cb->names_found].name = name;
4600 cb->named_groups[cb->names_found].length = (uint16_t)namelen;
4601 cb->named_groups[cb->names_found].number = cb->bracount;
4602 cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
4603 cb->names_found++;
4604 break;
4605 } /* End of (? switch */
4606 break; /* End of ( handling */
4607
4608
4609 /* ---- Branch terminators ---- */
4610
4611 /* Alternation: reset the capture count if we are in a (?| group. */
4612
4613 case CHAR_VERTICAL_LINE:
4614 if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
4615 (top_nest->flags & NSF_RESET) != 0)
4616 {
4617 if (cb->bracount > top_nest->max_group)
4618 top_nest->max_group = (uint16_t)cb->bracount;
4619 cb->bracount = top_nest->reset_group;
4620 }
4621 *parsed_pattern++ = META_ALT;
4622 break;
4623
4624 /* End of group; reset the capture count to the maximum if we are in a (?|
4625 group and/or reset the options that are tracked during parsing. Disallow
4626 quantifier for a condition that is an assertion. */
4627
4628 case CHAR_RIGHT_PARENTHESIS:
4629 okquantifier = TRUE;
4630 if (top_nest != NULL && top_nest->nest_depth == nest_depth)
4631 {
4632 options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
4633 if ((top_nest->flags & NSF_RESET) != 0 &&
4634 top_nest->max_group > cb->bracount)
4635 cb->bracount = top_nest->max_group;
4636 if ((top_nest->flags & NSF_CONDASSERT) != 0)
4637 okquantifier = FALSE;
4638
4639 if ((top_nest->flags & NSF_ATOMICSR) != 0)
4640 {
4641 *parsed_pattern++ = META_KET;
4642 }
4643
4644 if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
4645 else top_nest--;
4646 }
4647 if (nest_depth == 0) /* Unmatched closing parenthesis */
4648 {
4649 errorcode = ERR22;
4650 goto FAILED_BACK;
4651 }
4652 nest_depth--;
4653 *parsed_pattern++ = META_KET;
4654 break;
4655 } /* End of switch on pattern character */
4656 } /* End of main character scan loop */
4657
4658 /* End of pattern reached. Check for missing ) at the end of a verb name. */
4659
4660 if (inverbname && ptr >= ptrend)
4661 {
4662 errorcode = ERR60;
4663 goto FAILED;
4664 }
4665
4666 /* Manage callout for the final item */
4667
4668 PARSED_END:
4669 parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
4670 parsed_pattern, cb);
4671
4672 /* Insert trailing items for word and line matching (features provided for the
4673 benefit of pcre2grep). */
4674
4675 if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
4676 {
4677 *parsed_pattern++ = META_KET;
4678 *parsed_pattern++ = META_DOLLAR;
4679 }
4680 else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
4681 {
4682 *parsed_pattern++ = META_KET;
4683 *parsed_pattern++ = META_ESCAPE + ESC_b;
4684 }
4685
4686 /* Terminate the parsed pattern, then return success if all groups are closed.
4687 Otherwise we have unclosed parentheses. */
4688
4689 if (parsed_pattern >= parsed_pattern_end)
4690 {
4691 errorcode = ERR63; /* Internal error (parsed pattern overflow) */
4692 goto FAILED;
4693 }
4694
4695 *parsed_pattern = META_END;
4696 if (nest_depth == 0) return 0;
4697
4698 UNCLOSED_PARENTHESIS:
4699 errorcode = ERR14;
4700
4701 /* Come here for all failures. */
4702
4703 FAILED:
4704 cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4705 return errorcode;
4706
4707 /* Some errors need to indicate the previous character. */
4708
4709 FAILED_BACK:
4710 ptr--;
4711 goto FAILED;
4712
4713 /* This failure happens several times. */
4714
4715 BAD_VERSION_CONDITION:
4716 errorcode = ERR79;
4717 goto FAILED;
4718 }
4719
4720
4721
4722 /*************************************************
4723 * Find first significant opcode *
4724 *************************************************/
4725
4726 /* This is called by several functions that scan a compiled expression looking
4727 for a fixed first character, or an anchoring opcode etc. It skips over things
4728 that do not influence this. For some calls, it makes sense to skip negative
4729 forward and all backward assertions, and also the \b assertion; for others it
4730 does not.
4731
4732 Arguments:
4733 code pointer to the start of the group
4734 skipassert TRUE if certain assertions are to be skipped
4735
4736 Returns: pointer to the first significant opcode
4737 */
4738
4739 static const PCRE2_UCHAR*
first_significant_code(PCRE2_SPTR code,BOOL skipassert)4740 first_significant_code(PCRE2_SPTR code, BOOL skipassert)
4741 {
4742 for (;;)
4743 {
4744 switch ((int)*code)
4745 {
4746 case OP_ASSERT_NOT:
4747 case OP_ASSERTBACK:
4748 case OP_ASSERTBACK_NOT:
4749 case OP_ASSERTBACK_NA:
4750 if (!skipassert) return code;
4751 do code += GET(code, 1); while (*code == OP_ALT);
4752 code += PRIV(OP_lengths)[*code];
4753 break;
4754
4755 case OP_WORD_BOUNDARY:
4756 case OP_NOT_WORD_BOUNDARY:
4757 if (!skipassert) return code;
4758 /* Fall through */
4759
4760 case OP_CALLOUT:
4761 case OP_CREF:
4762 case OP_DNCREF:
4763 case OP_RREF:
4764 case OP_DNRREF:
4765 case OP_FALSE:
4766 case OP_TRUE:
4767 code += PRIV(OP_lengths)[*code];
4768 break;
4769
4770 case OP_CALLOUT_STR:
4771 code += GET(code, 1 + 2*LINK_SIZE);
4772 break;
4773
4774 case OP_SKIPZERO:
4775 code += 2 + GET(code, 2) + LINK_SIZE;
4776 break;
4777
4778 case OP_COND:
4779 case OP_SCOND:
4780 if (code[1+LINK_SIZE] != OP_FALSE || /* Not DEFINE */
4781 code[GET(code, 1)] != OP_KET) /* More than one branch */
4782 return code;
4783 code += GET(code, 1) + 1 + LINK_SIZE;
4784 break;
4785
4786 case OP_MARK:
4787 case OP_COMMIT_ARG:
4788 case OP_PRUNE_ARG:
4789 case OP_SKIP_ARG:
4790 case OP_THEN_ARG:
4791 code += code[1] + PRIV(OP_lengths)[*code];
4792 break;
4793
4794 default:
4795 return code;
4796 }
4797 }
4798 /* Control never reaches here */
4799 }
4800
4801
4802
4803 #ifdef SUPPORT_UNICODE
4804 /*************************************************
4805 * Get othercase range *
4806 *************************************************/
4807
4808 /* This function is passed the start and end of a class range in UCP mode. It
4809 searches up the characters, looking for ranges of characters in the "other"
4810 case. Each call returns the next one, updating the start address. A character
4811 with multiple other cases is returned on its own with a special return value.
4812
4813 Arguments:
4814 cptr points to starting character value; updated
4815 d end value
4816 ocptr where to put start of othercase range
4817 odptr where to put end of othercase range
4818
4819 Yield: -1 when no more
4820 0 when a range is returned
4821 >0 the CASESET offset for char with multiple other cases
4822 in this case, ocptr contains the original
4823 */
4824
static int
get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
  uint32_t *odptr)
{
/* Summary of the contract: *cptr is the next character to examine and d is the
last one. The result is -1 when nothing (more) with an other case lies in the
range, 0 when an other-case range has been stored in *ocptr..*odptr, or a
positive caseless-set offset when the character stored in *ocptr has several
other cases. Except for the -1 result, *cptr is advanced past what has been
dealt with. */

const uint32_t unicode_max = 0x10ffffu;   /* Highest Unicode code point */
uint32_t c, next;
uint32_t othercase = 0;                   /* Always set before it is used */
unsigned int co;

/* Find the first character that has an other case. If it has multiple other
cases, return its case offset value.

The scan gives up as soon as c passes the Unicode maximum. The end value d can
be as large as 0xffffffff (add_not_list_to_class() uses that value as the end
of a non-UTF range), and then "c <= d" is true for every uint32_t value, so
without this check c++ would wrap round to zero instead of ending the loop.
Unicode defines no code points above 0x10ffff, so no character with an other
case can be missed by stopping here. */

for (c = *cptr; c <= d; c++)
  {
  if (c > unicode_max) return -1;
  if ((co = UCD_CASESET(c)) != 0)
    {
    *ocptr = c++;   /* Character that has the set */
    *cptr = c;      /* Rest of input range */
    return (int)co;
    }
  if ((othercase = UCD_OTHERCASE(c)) != c) break;
  }

if (c > d) return -1;  /* Reached end of range */

/* Found a character that has a single other case. Search for the end of the
range, which is either the end of the input range, or a character that has
zero or more than one other cases. For the same reason as above, the search
never looks beyond the Unicode maximum. */

*ocptr = othercase;
next = othercase + 1;

for (++c; c <= d && c <= unicode_max; c++)
  {
  if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
  next++;
  }

*odptr = next - 1;     /* End of othercase range */
*cptr = c;             /* Rest of input range */
return 0;
}
4865 #endif /* SUPPORT_UNICODE */
4866
4867
4868
4869 /*************************************************
4870 * Add a character or range to a class (internal) *
4871 *************************************************/
4872
4873 /* This function packages up the logic of adding a character or range of
4874 characters to a class. The character values in the arguments will be within the
4875 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4876 called only from within the "add to class" group of functions, some of which
4877 are recursive and mutually recursive. The external entry point is
4878 add_to_class().
4879
4880 Arguments:
4881 classbits the bit map for characters < 256
4882 uchardptr points to the pointer for extra data
4883 options the options word
4884 cb compile data
4885 start start of range character
4886 end end of range character
4887
4888 Returns: the number of < 256 characters added
4889 the pointer to extra data is updated
4890 */
4891
4892 static unsigned int
add_to_class_internal(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,uint32_t start,uint32_t end)4893 add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
4894 uint32_t options, compile_block *cb, uint32_t start, uint32_t end)
4895 {
4896 uint32_t c;
4897 uint32_t classbits_end = (end <= 0xff ? end : 0xff);
4898 unsigned int n8 = 0;
4899
4900 /* If caseless matching is required, scan the range and process alternate
4901 cases. In Unicode, there are 8-bit characters that have alternate cases that
4902 are greater than 255 and vice-versa. Sometimes we can just extend the original
4903 range. */
4904
4905 if ((options & PCRE2_CASELESS) != 0)
4906 {
4907 #ifdef SUPPORT_UNICODE
4908 if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
4909 {
4910 int rc;
4911 uint32_t oc, od;
4912
4913 options &= ~PCRE2_CASELESS; /* Remove for recursive calls */
4914 c = start;
4915
4916 while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
4917 {
4918 /* Handle a single character that has more than one other case. */
4919
4920 if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr, options, cb,
4921 PRIV(ucd_caseless_sets) + rc, oc);
4922
4923 /* Do nothing if the other case range is within the original range. */
4924
4925 else if (oc >= cb->class_range_start && od <= cb->class_range_end) continue;
4926
4927 /* Extend the original range if there is overlap, noting that if oc < c, we
4928 can't have od > end because a subrange is always shorter than the basic
4929 range. Otherwise, use a recursive call to add the additional range. */
4930
4931 else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
4932 else if (od > end && oc <= end + 1)
4933 {
4934 end = od; /* Extend upwards */
4935 if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
4936 }
4937 else n8 += add_to_class_internal(classbits, uchardptr, options, cb, oc, od);
4938 }
4939 }
4940 else
4941 #endif /* SUPPORT_UNICODE */
4942
4943 /* Not UTF mode */
4944
4945 for (c = start; c <= classbits_end; c++)
4946 {
4947 SETBIT(classbits, cb->fcc[c]);
4948 n8++;
4949 }
4950 }
4951
4952 /* Now handle the originally supplied range. Adjust the final value according
4953 to the bit length - this means that the same lists of (e.g.) horizontal spaces
4954 can be used in all cases. */
4955
4956 if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
4957 end = MAX_NON_UTF_CHAR;
4958
4959 if (start > cb->class_range_start && end < cb->class_range_end) return n8;
4960
4961 /* Use the bitmap for characters < 256. Otherwise use extra data.*/
4962
4963 for (c = start; c <= classbits_end; c++)
4964 {
4965 /* Regardless of start, c will always be <= 255. */
4966 SETBIT(classbits, c);
4967 n8++;
4968 }
4969
4970 #ifdef SUPPORT_WIDE_CHARS
4971 if (start <= 0xff) start = 0xff + 1;
4972
4973 if (end >= start)
4974 {
4975 PCRE2_UCHAR *uchardata = *uchardptr;
4976
4977 #ifdef SUPPORT_UNICODE
4978 if ((options & PCRE2_UTF) != 0)
4979 {
4980 if (start < end)
4981 {
4982 *uchardata++ = XCL_RANGE;
4983 uchardata += PRIV(ord2utf)(start, uchardata);
4984 uchardata += PRIV(ord2utf)(end, uchardata);
4985 }
4986 else if (start == end)
4987 {
4988 *uchardata++ = XCL_SINGLE;
4989 uchardata += PRIV(ord2utf)(start, uchardata);
4990 }
4991 }
4992 else
4993 #endif /* SUPPORT_UNICODE */
4994
4995 /* Without UTF support, character values are constrained by the bit length,
4996 and can only be > 256 for 16-bit and 32-bit libraries. */
4997
4998 #if PCRE2_CODE_UNIT_WIDTH == 8
4999 {}
5000 #else
5001 if (start < end)
5002 {
5003 *uchardata++ = XCL_RANGE;
5004 *uchardata++ = start;
5005 *uchardata++ = end;
5006 }
5007 else if (start == end)
5008 {
5009 *uchardata++ = XCL_SINGLE;
5010 *uchardata++ = start;
5011 }
5012 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
5013 *uchardptr = uchardata; /* Updata extra data pointer */
5014 }
5015 #else /* SUPPORT_WIDE_CHARS */
5016 (void)uchardptr; /* Avoid compiler warning */
5017 #endif /* SUPPORT_WIDE_CHARS */
5018
5019 return n8; /* Number of 8-bit characters */
5020 }
5021
5022
5023
5024 #ifdef SUPPORT_UNICODE
5025 /*************************************************
5026 * Add a list of characters to a class (internal) *
5027 *************************************************/
5028
5029 /* This function is used for adding a list of case-equivalent characters to a
5030 class when in UTF mode. This function is called only from within
5031 add_to_class_internal(), with which it is mutually recursive.
5032
5033 Arguments:
5034 classbits the bit map for characters < 256
5035 uchardptr points to the pointer for extra data
5036 options the options word
5037 cb contains pointers to tables etc.
5038 p points to row of 32-bit values, terminated by NOTACHAR
5039 except character to omit; this is used when adding lists of
5040 case-equivalent characters to avoid including the one we
5041 already know about
5042
5043 Returns: the number of < 256 characters added
5044 the pointer to extra data is updated
5045 */
5046
5047 static unsigned int
add_list_to_class_internal(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p,unsigned int except)5048 add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5049 uint32_t options, compile_block *cb, const uint32_t *p, unsigned int except)
5050 {
5051 unsigned int n8 = 0;
5052 while (p[0] < NOTACHAR)
5053 {
5054 unsigned int n = 0;
5055 if (p[0] != except)
5056 {
5057 while(p[n+1] == p[0] + n + 1) n++;
5058 n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
5059 }
5060 p += n + 1;
5061 }
5062 return n8;
5063 }
5064 #endif
5065
5066
5067
5068 /*************************************************
5069 * External entry point for add range to class *
5070 *************************************************/
5071
5072 /* This function sets the overall range so that the internal functions can try
5073 to avoid duplication when handling case-independence.
5074
5075 Arguments:
5076 classbits the bit map for characters < 256
5077 uchardptr points to the pointer for extra data
5078 options the options word
5079 cb compile data
5080 start start of range character
5081 end end of range character
5082
5083 Returns: the number of < 256 characters added
5084 the pointer to extra data is updated
5085 */
5086
5087 static unsigned int
add_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,uint32_t start,uint32_t end)5088 add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5089 compile_block *cb, uint32_t start, uint32_t end)
5090 {
5091 cb->class_range_start = start;
5092 cb->class_range_end = end;
5093 return add_to_class_internal(classbits, uchardptr, options, cb, start, end);
5094 }
5095
5096
5097 /*************************************************
5098 * External entry point for add list to class *
5099 *************************************************/
5100
5101 /* This function is used for adding a list of horizontal or vertical whitespace
5102 characters to a class. The list must be in order so that ranges of characters
5103 can be detected and handled appropriately. This function sets the overall range
5104 so that the internal functions can try to avoid duplication when handling
5105 case-independence.
5106
5107 Arguments:
5108 classbits the bit map for characters < 256
5109 uchardptr points to the pointer for extra data
5110 options the options word
5111 cb contains pointers to tables etc.
5112 p points to row of 32-bit values, terminated by NOTACHAR
5113 except character to omit; this is used when adding lists of
5114 case-equivalent characters to avoid including the one we
5115 already know about
5116
5117 Returns: the number of < 256 characters added
5118 the pointer to extra data is updated
5119 */
5120
5121 static unsigned int
add_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p,unsigned int except)5122 add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5123 compile_block *cb, const uint32_t *p, unsigned int except)
5124 {
5125 unsigned int n8 = 0;
5126 while (p[0] < NOTACHAR)
5127 {
5128 unsigned int n = 0;
5129 if (p[0] != except)
5130 {
5131 while(p[n+1] == p[0] + n + 1) n++;
5132 cb->class_range_start = p[0];
5133 cb->class_range_end = p[n];
5134 n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
5135 }
5136 p += n + 1;
5137 }
5138 return n8;
5139 }
5140
5141
5142
5143 /*************************************************
5144 * Add characters not in a list to a class *
5145 *************************************************/
5146
5147 /* This function is used for adding the complement of a list of horizontal or
5148 vertical whitespace to a class. The list must be in order.
5149
5150 Arguments:
5151 classbits the bit map for characters < 256
5152 uchardptr points to the pointer for extra data
5153 options the options word
5154 cb contains pointers to tables etc.
5155 p points to row of 32-bit values, terminated by NOTACHAR
5156
5157 Returns: the number of < 256 characters added
5158 the pointer to extra data is updated
5159 */
5160
5161 static unsigned int
add_not_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p)5162 add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5163 uint32_t options, compile_block *cb, const uint32_t *p)
5164 {
5165 BOOL utf = (options & PCRE2_UTF) != 0;
5166 unsigned int n8 = 0;
5167 if (p[0] > 0)
5168 n8 += add_to_class(classbits, uchardptr, options, cb, 0, p[0] - 1);
5169 while (p[0] < NOTACHAR)
5170 {
5171 while (p[1] == p[0] + 1) p++;
5172 n8 += add_to_class(classbits, uchardptr, options, cb, p[0] + 1,
5173 (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
5174 p++;
5175 }
5176 return n8;
5177 }
5178
5179
5180
5181 /*************************************************
5182 * Find details of duplicate group names *
5183 *************************************************/
5184
5185 /* This is called from compile_branch() when it needs to know the index and
5186 count of duplicates in the names table when processing named backreferences,
5187 either directly, or as conditions.
5188
5189 Arguments:
5190 name points to the name
5191 length the length of the name
5192 indexptr where to put the index
5193 countptr where to put the count of duplicates
5194 errorcodeptr where to put an error code
5195 cb the compile block
5196
5197 Returns: TRUE if OK, FALSE if not, error code set
5198 */
5199
5200 static BOOL
find_dupname_details(PCRE2_SPTR name,uint32_t length,int * indexptr,int * countptr,int * errorcodeptr,compile_block * cb)5201 find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr,
5202 int *countptr, int *errorcodeptr, compile_block *cb)
5203 {
5204 uint32_t i, groupnumber;
5205 int count;
5206 PCRE2_UCHAR *slot = cb->name_table;
5207
5208 /* Find the first entry in the table */
5209
5210 for (i = 0; i < cb->names_found; i++)
5211 {
5212 if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 &&
5213 slot[IMM2_SIZE+length] == 0) break;
5214 slot += cb->name_entry_size;
5215 }
5216
5217 /* This should not occur, because this function is called only when we know we
5218 have duplicate names. Give an internal error. */
5219
5220 if (i >= cb->names_found)
5221 {
5222 *errorcodeptr = ERR53;
5223 cb->erroroffset = name - cb->start_pattern;
5224 return FALSE;
5225 }
5226
5227 /* Record the index and then see how many duplicates there are, updating the
5228 backref map and maximum back reference as we do. */
5229
5230 *indexptr = i;
5231 count = 0;
5232
5233 for (;;)
5234 {
5235 count++;
5236 groupnumber = GET2(slot,0);
5237 cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
5238 if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
5239 if (++i >= cb->names_found) break;
5240 slot += cb->name_entry_size;
5241 if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 ||
5242 (slot+IMM2_SIZE)[length] != 0) break;
5243 }
5244
5245 *countptr = count;
5246 return TRUE;
5247 }
5248
5249
5250
5251 /*************************************************
5252 * Compile one branch *
5253 *************************************************/
5254
5255 /* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
5256 the options are changed during the branch, the pointer is used to change the
5257 external options bits. This function is used during the pre-compile phase when
5258 we are trying to find out the amount of memory needed, as well as during the
5259 real compile phase. The value of lengthptr distinguishes the two phases.
5260
5261 Arguments:
5262 optionsptr pointer to the option bits
5263 codeptr points to the pointer to the current code point
5264 pptrptr points to the current parsed pattern pointer
5265 errorcodeptr points to error code variable
5266 firstcuptr place to put the first required code unit
5267 firstcuflagsptr place to put the first code unit flags, or a negative number
5268 reqcuptr place to put the last required code unit
5269 reqcuflagsptr place to put the last required code unit flags, or a negative number
5270 bcptr points to current branch chain
5271 cb contains pointers to tables etc.
5272 lengthptr NULL during the real compile phase
5273 points to length accumulator during pre-compile phase
5274
5275 Returns: 0 There's been an error, *errorcodeptr is non-zero
5276 +1 Success, this branch must match at least one character
5277 -1 Success, this branch may match an empty string
5278 */
5279
5280 static int
compile_branch(uint32_t * optionsptr,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t * firstcuptr,int32_t * firstcuflagsptr,uint32_t * reqcuptr,int32_t * reqcuflagsptr,branch_chain * bcptr,compile_block * cb,PCRE2_SIZE * lengthptr)5281 compile_branch(uint32_t *optionsptr, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
5282 int *errorcodeptr, uint32_t *firstcuptr, int32_t *firstcuflagsptr,
5283 uint32_t *reqcuptr, int32_t *reqcuflagsptr, branch_chain *bcptr,
5284 compile_block *cb, PCRE2_SIZE *lengthptr)
5285 {
5286 int bravalue = 0;
5287 int okreturn = -1;
5288 int group_return = 0;
5289 uint32_t repeat_min = 0, repeat_max = 0; /* To please picky compilers */
5290 uint32_t greedy_default, greedy_non_default;
5291 uint32_t repeat_type, op_type;
5292 uint32_t options = *optionsptr; /* May change dynamically */
5293 uint32_t firstcu, reqcu;
5294 uint32_t zeroreqcu, zerofirstcu;
5295 uint32_t escape;
5296 uint32_t *pptr = *pptrptr;
5297 uint32_t meta, meta_arg;
5298 int32_t firstcuflags, reqcuflags;
5299 int32_t zeroreqcuflags, zerofirstcuflags;
5300 int32_t req_caseopt, reqvary, tempreqvary;
5301 PCRE2_SIZE offset = 0;
5302 PCRE2_SIZE length_prevgroup = 0;
5303 PCRE2_UCHAR *code = *codeptr;
5304 PCRE2_UCHAR *last_code = code;
5305 PCRE2_UCHAR *orig_code = code;
5306 PCRE2_UCHAR *tempcode;
5307 PCRE2_UCHAR *previous = NULL;
5308 PCRE2_UCHAR op_previous;
5309 BOOL groupsetfirstcu = FALSE;
5310 BOOL had_accept = FALSE;
5311 BOOL matched_char = FALSE;
5312 BOOL previous_matched_char = FALSE;
5313 BOOL reset_caseful = FALSE;
5314 const uint8_t *cbits = cb->cbits;
5315 uint8_t classbits[32];
5316
5317 /* We can fish out the UTF setting once and for all into a BOOL, but we must
5318 not do this for other options (e.g. PCRE2_EXTENDED) because they may change
5319 dynamically as we process the pattern. */
5320
5321 #ifdef SUPPORT_UNICODE
5322 BOOL utf = (options & PCRE2_UTF) != 0;
5323 BOOL ucp = (options & PCRE2_UCP) != 0;
5324 #else /* No Unicode support */
5325 BOOL utf = FALSE;
5326 #endif
5327
5328 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
5329 class_uchardata always so that it can be passed to add_to_class() always,
5330 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
5331 alternative calls for the different cases. */
5332
5333 PCRE2_UCHAR *class_uchardata;
5334 #ifdef SUPPORT_WIDE_CHARS
5335 BOOL xclass;
5336 PCRE2_UCHAR *class_uchardata_base;
5337 #endif
5338
5339 /* Set up the default and non-default settings for greediness */
5340
5341 greedy_default = ((options & PCRE2_UNGREEDY) != 0);
5342 greedy_non_default = greedy_default ^ 1;
5343
5344 /* Initialize no first unit, no required unit. REQ_UNSET means "no char
5345 matching encountered yet". It gets changed to REQ_NONE if we hit something that
5346 matches a non-fixed first unit; reqcu just remains unset if we never find one.
5347
5348 When we hit a repeat whose minimum is zero, we may have to adjust these values
5349 to take the zero repeat into account. This is implemented by setting them to
5350 zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
5351 item types that can be repeated set these backoff variables appropriately. */
5352
5353 firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
5354 firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
5355
5356 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
5357 according to the current setting of the caseless flag. The REQ_CASELESS value
5358 leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables
5359 to record the case status of the value. This is used only for ASCII characters.
5360 */
5361
5362 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS:0;
5363
5364 /* Switch on next META item until the end of the branch */
5365
5366 for (;; pptr++)
5367 {
5368 #ifdef SUPPORT_WIDE_CHARS
5369 BOOL xclass_has_prop;
5370 #endif
5371 BOOL negate_class;
5372 BOOL should_flip_negation;
5373 BOOL match_all_or_no_wide_chars;
5374 BOOL possessive_quantifier;
5375 BOOL note_group_empty;
5376 int class_has_8bitchar;
5377 int i;
5378 uint32_t mclength;
5379 uint32_t skipunits;
5380 uint32_t subreqcu, subfirstcu;
5381 uint32_t groupnumber;
5382 uint32_t verbarglen, verbculen;
5383 int32_t subreqcuflags, subfirstcuflags; /* Must be signed */
5384 open_capitem *oc;
5385 PCRE2_UCHAR mcbuffer[8];
5386
5387 /* Get next META item in the pattern and its potential argument. */
5388
5389 meta = META_CODE(*pptr);
5390 meta_arg = META_DATA(*pptr);
5391
5392 /* If we are in the pre-compile phase, accumulate the length used for the
5393 previous cycle of this loop, unless the next item is a quantifier. */
5394
5395 if (lengthptr != NULL)
5396 {
5397 if (code > cb->start_workspace + cb->workspace_size -
5398 WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
5399 {
5400 *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)?
5401 ERR52 : ERR86;
5402 return 0;
5403 }
5404
5405 /* There is at least one situation where code goes backwards: this is the
5406 case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
5407 is processed, the whole class is eliminated. However, it is created first,
5408 so we have to allow memory for it. Therefore, don't ever reduce the length
5409 at this point. */
5410
5411 if (code < last_code) code = last_code;
5412
5413 /* If the next thing is not a quantifier, we add the length of the previous
5414 item into the total, and reset the code pointer to the start of the
5415 workspace. Otherwise leave the previous item available to be quantified. */
5416
5417 if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5418 {
5419 if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
5420 {
5421 *errorcodeptr = ERR20; /* Integer overflow */
5422 return 0;
5423 }
5424 *lengthptr += (PCRE2_SIZE)(code - orig_code);
5425 if (*lengthptr > MAX_PATTERN_SIZE)
5426 {
5427 *errorcodeptr = ERR20; /* Pattern is too large */
5428 return 0;
5429 }
5430 code = orig_code;
5431 }
5432
5433 /* Remember where this code item starts so we can catch the "backwards"
5434 case above next time round. */
5435
5436 last_code = code;
5437 }
5438
5439 /* Process the next parsed pattern item. If it is not a quantifier, remember
5440 where it starts so that it can be quantified when a quantifier follows.
5441 Checking for the legality of quantifiers happens in parse_regex(), except for
5442 a quantifier after an assertion that is a condition. */
5443
5444 if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5445 {
5446 previous = code;
5447 if (matched_char && !had_accept) okreturn = 1;
5448 }
5449
5450 previous_matched_char = matched_char;
5451 matched_char = FALSE;
5452 note_group_empty = FALSE;
5453 skipunits = 0; /* Default value for most subgroups */
5454
5455 switch(meta)
5456 {
5457 /* ===================================================================*/
5458 /* The branch terminates at pattern end or | or ) */
5459
5460 case META_END:
5461 case META_ALT:
5462 case META_KET:
5463 *firstcuptr = firstcu;
5464 *firstcuflagsptr = firstcuflags;
5465 *reqcuptr = reqcu;
5466 *reqcuflagsptr = reqcuflags;
5467 *codeptr = code;
5468 *pptrptr = pptr;
5469 return okreturn;
5470
5471
5472 /* ===================================================================*/
5473 /* Handle single-character metacharacters. In multiline mode, ^ disables
5474 the setting of any following char as a first character. */
5475
5476 case META_CIRCUMFLEX:
5477 if ((options & PCRE2_MULTILINE) != 0)
5478 {
5479 if (firstcuflags == REQ_UNSET)
5480 zerofirstcuflags = firstcuflags = REQ_NONE;
5481 *code++ = OP_CIRCM;
5482 }
5483 else *code++ = OP_CIRC;
5484 break;
5485
5486 case META_DOLLAR:
5487 *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
5488 break;
5489
5490 /* There can never be a first char if '.' is first, whatever happens about
5491 repeats. The value of reqcu doesn't change either. */
5492
5493 case META_DOT:
5494 matched_char = TRUE;
5495 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5496 zerofirstcu = firstcu;
5497 zerofirstcuflags = firstcuflags;
5498 zeroreqcu = reqcu;
5499 zeroreqcuflags = reqcuflags;
5500 *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
5501 break;
5502
5503
5504 /* ===================================================================*/
5505 /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
5506 Otherwise, an initial ']' is taken as a data character. When empty classes
5507 are allowed, [] must always fail, so generate OP_FAIL, whereas [^] must
5508 match any character, so generate OP_ALLANY. */
5509
5510 case META_CLASS_EMPTY:
5511 case META_CLASS_EMPTY_NOT:
5512 matched_char = TRUE;
5513 *code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL;
5514 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5515 zerofirstcu = firstcu;
5516 zerofirstcuflags = firstcuflags;
5517 break;
5518
5519
5520 /* ===================================================================*/
5521 /* Non-empty character class. If the included characters are all < 256, we
5522 build a 32-byte bitmap of the permitted characters, except in the special
5523 case where there is only one such character. For negated classes, we build
5524 the map as usual, then invert it at the end. However, we use a different
5525 opcode so that data characters > 255 can be handled correctly.
5526
5527 If the class contains characters outside the 0-255 range, a different
5528 opcode is compiled. It may optionally have a bit map for characters < 256,
5529 but those above are are explicitly listed afterwards. A flag code unit
5530 tells whether the bitmap is present, and whether this is a negated class or
5531 not. */
5532
5533 case META_CLASS_NOT:
5534 case META_CLASS:
5535 matched_char = TRUE;
5536 negate_class = meta == META_CLASS_NOT;
5537
5538 /* We can optimize the case of a single character in a class by generating
5539 OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
5540 negative. In the negative case there can be no first char if this item is
5541 first, whatever repeat count may follow. In the case of reqcu, save the
5542 previous value for reinstating. */
5543
5544 /* NOTE: at present this optimization is not effective if the only
5545 character in a class in 32-bit, non-UCP mode has its top bit set. */
5546
5547 if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
5548 {
5549 #ifdef SUPPORT_UNICODE
5550 uint32_t d;
5551 #endif
5552 uint32_t c = pptr[1];
5553
5554 pptr += 2; /* Move on to class end */
5555 if (meta == META_CLASS) /* A positive one-char class can be */
5556 { /* handled as a normal literal character. */
5557 meta = c; /* Set up the character */
5558 goto NORMAL_CHAR_SET;
5559 }
5560
5561 /* Handle a negative one-character class */
5562
5563 zeroreqcu = reqcu;
5564 zeroreqcuflags = reqcuflags;
5565 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5566 zerofirstcu = firstcu;
5567 zerofirstcuflags = firstcuflags;
5568
5569 /* For caseless UTF or UCP mode, check whether this character has more
5570 than one other case. If so, generate a special OP_NOTPROP item instead of
5571 OP_NOTI. */
5572
5573 #ifdef SUPPORT_UNICODE
5574 if ((utf||ucp) && (options & PCRE2_CASELESS) != 0 &&
5575 (d = UCD_CASESET(c)) != 0)
5576 {
5577 *code++ = OP_NOTPROP;
5578 *code++ = PT_CLIST;
5579 *code++ = d;
5580 break; /* We are finished with this class */
5581 }
5582 #endif
5583 /* Char has only one other case, or UCP not available */
5584
5585 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
5586 code += PUTCHAR(c, code);
5587 break; /* We are finished with this class */
5588 } /* End of 1-char optimization */
5589
5590 /* Handle character classes that contain more than just one literal
5591 character. If there are exactly two characters in a positive class, see if
5592 they are case partners. This can be optimized to generate a caseless single
5593 character match (which also sets first/required code units if relevant). */
5594
5595 if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&
5596 pptr[3] == META_CLASS_END)
5597 {
5598 uint32_t c = pptr[1];
5599
5600 #ifdef SUPPORT_UNICODE
5601 if (UCD_CASESET(c) == 0)
5602 #endif
5603 {
5604 uint32_t d;
5605
5606 #ifdef SUPPORT_UNICODE
5607 if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
5608 #endif
5609 {
5610 #if PCRE2_CODE_UNIT_WIDTH != 8
5611 if (c > 255) d = c; else
5612 #endif
5613 d = TABLE_GET(c, cb->fcc, c);
5614 }
5615
5616 if (c != d && pptr[2] == d)
5617 {
5618 pptr += 3; /* Move on to class end */
5619 meta = c;
5620 if ((options & PCRE2_CASELESS) == 0)
5621 {
5622 reset_caseful = TRUE;
5623 options |= PCRE2_CASELESS;
5624 req_caseopt = REQ_CASELESS;
5625 }
5626 goto CLASS_CASELESS_CHAR;
5627 }
5628 }
5629 }
5630
5631 /* If a non-extended class contains a negative special such as \S, we need
5632 to flip the negation flag at the end, so that support for characters > 255
5633 works correctly (they are all included in the class). An extended class may
5634 need to insert specific matching or non-matching code for wide characters.
5635 */
5636
5637 should_flip_negation = match_all_or_no_wide_chars = FALSE;
5638
5639 /* Extended class (xclass) will be used when characters > 255
5640 might match. */
5641
5642 #ifdef SUPPORT_WIDE_CHARS
5643 xclass = FALSE;
5644 class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
5645 class_uchardata_base = class_uchardata; /* Save the start */
5646 #endif
5647
5648 /* For optimization purposes, we track some properties of the class:
5649 class_has_8bitchar will be non-zero if the class contains at least one
5650 character with a code point less than 256; xclass_has_prop will be TRUE if
5651 Unicode property checks are present in the class. */
5652
5653 class_has_8bitchar = 0;
5654 #ifdef SUPPORT_WIDE_CHARS
5655 xclass_has_prop = FALSE;
5656 #endif
5657
5658 /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
5659 in a temporary bit of memory, in case the class contains fewer than two
5660 8-bit characters because in that case the compiled code doesn't use the bit
5661 map. */
5662
5663 memset(classbits, 0, 32 * sizeof(uint8_t));
5664
5665 /* Process items until META_CLASS_END is reached. */
5666
5667 while ((meta = *(++pptr)) != META_CLASS_END)
5668 {
5669 /* Handle POSIX classes such as [:alpha:] etc. */
5670
5671 if (meta == META_POSIX || meta == META_POSIX_NEG)
5672 {
5673 BOOL local_negate = (meta == META_POSIX_NEG);
5674 int posix_class = *(++pptr);
5675 int taboffset, tabopt;
5676 uint8_t pbits[32];
5677
5678 should_flip_negation = local_negate; /* Note negative special */
5679
5680 /* If matching is caseless, upper and lower are converted to alpha.
5681 This relies on the fact that the class table starts with alpha,
5682 lower, upper as the first 3 entries. */
5683
5684 if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
5685 posix_class = 0;
5686
5687 /* When PCRE2_UCP is set, some of the POSIX classes are converted to
5688 different escape sequences that use Unicode properties \p or \P.
5689 Others that are not available via \p or \P have to generate
5690 XCL_PROP/XCL_NOTPROP directly, which is done here. */
5691
5692 #ifdef SUPPORT_UNICODE
5693 if ((options & PCRE2_UCP) != 0) switch(posix_class)
5694 {
5695 case PC_GRAPH:
5696 case PC_PRINT:
5697 case PC_PUNCT:
5698 *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
5699 *class_uchardata++ = (PCRE2_UCHAR)
5700 ((posix_class == PC_GRAPH)? PT_PXGRAPH :
5701 (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT);
5702 *class_uchardata++ = 0;
5703 xclass_has_prop = TRUE;
5704 goto CONTINUE_CLASS;
5705
5706 /* For the other POSIX classes (ascii, xdigit) we are going to
5707 fall through to the non-UCP case and build a bit map for
5708 characters with code points less than 256. However, if we are in
5709 a negated POSIX class, characters with code points greater than
5710 255 must either all match or all not match, depending on whether
5711 the whole class is not or is negated. For example, for
5712 [[:^ascii:]... they must all match, whereas for [^[:^xdigit:]...
5713 they must not.
5714
5715 In the special case where there are no xclass items, this is
5716 automatically handled by the use of OP_CLASS or OP_NCLASS, but an
5717 explicit range is needed for OP_XCLASS. Setting a flag here
5718 causes the range to be generated later when it is known that
5719 OP_XCLASS is required. In the 8-bit library this is relevant only in
5720 utf mode, since no wide characters can exist otherwise. */
5721
5722 default:
5723 #if PCRE2_CODE_UNIT_WIDTH == 8
5724 if (utf)
5725 #endif
5726 match_all_or_no_wide_chars |= local_negate;
5727 break;
5728 }
5729 #endif /* SUPPORT_UNICODE */
5730
5731 /* In the non-UCP case, or when UCP makes no difference, we build the
5732 bit map for the POSIX class in a chunk of local store because we may
5733 be adding and subtracting from it, and we don't want to subtract bits
5734 that may be in the main map already. At the end we or the result into
5735 the bit map that is being built. */
5736
5737 posix_class *= 3;
5738
5739 /* Copy in the first table (always present) */
5740
5741 memcpy(pbits, cbits + posix_class_maps[posix_class],
5742 32 * sizeof(uint8_t));
5743
5744 /* If there is a second table, add or remove it as required. */
5745
5746 taboffset = posix_class_maps[posix_class + 1];
5747 tabopt = posix_class_maps[posix_class + 2];
5748
5749 if (taboffset >= 0)
5750 {
5751 if (tabopt >= 0)
5752 for (i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset];
5753 else
5754 for (i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset];
5755 }
5756
5757 /* Now see if we need to remove any special characters. An option
5758 value of 1 removes vertical space and 2 removes underscore. */
5759
5760 if (tabopt < 0) tabopt = -tabopt;
5761 if (tabopt == 1) pbits[1] &= ~0x3c;
5762 else if (tabopt == 2) pbits[11] &= 0x7f;
5763
5764 /* Add the POSIX table or its complement into the main table that is
5765 being built and we are done. */
5766
5767 if (local_negate)
5768 for (i = 0; i < 32; i++) classbits[i] |= ~pbits[i];
5769 else
5770 for (i = 0; i < 32; i++) classbits[i] |= pbits[i];
5771
5772 /* Every class contains at least one < 256 character. */
5773
5774 class_has_8bitchar = 1;
5775 goto CONTINUE_CLASS; /* End of POSIX handling */
5776 }
5777
5778 /* Other than POSIX classes, the only items we should encounter are
5779 \d-type escapes and literal characters (possibly as ranges). */
5780
5781 if (meta == META_BIGVALUE)
5782 {
5783 meta = *(++pptr);
5784 goto CLASS_LITERAL;
5785 }
5786
5787 /* Any other non-literal must be an escape */
5788
5789 if (meta >= META_END)
5790 {
5791 if (META_CODE(meta) != META_ESCAPE)
5792 {
5793 #ifdef DEBUG_SHOW_PARSED
5794 fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x "
5795 "in character class\n", meta);
5796 #endif
5797 *errorcodeptr = ERR89; /* Internal error - unrecognized. */
5798 return 0;
5799 }
5800 escape = META_DATA(meta);
5801
5802 /* Every class contains at least one < 256 character. */
5803
5804 class_has_8bitchar++;
5805
5806 switch(escape)
5807 {
5808 case ESC_d:
5809 for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
5810 break;
5811
5812 case ESC_D:
5813 should_flip_negation = TRUE;
5814 for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_digit];
5815 break;
5816
5817 case ESC_w:
5818 for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
5819 break;
5820
5821 case ESC_W:
5822 should_flip_negation = TRUE;
5823 for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_word];
5824 break;
5825
5826 /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5827 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5828 previously set by something earlier in the character class.
5829 Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5830 we could just adjust the appropriate bit. From PCRE 8.34 we no
5831 longer treat \s and \S specially. */
5832
5833 case ESC_s:
5834 for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
5835 break;
5836
5837 case ESC_S:
5838 should_flip_negation = TRUE;
5839 for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_space];
5840 break;
5841
5842 /* When adding the horizontal or vertical space lists to a class, or
5843 their complements, disable PCRE2_CASELESS, because it just wastes
5844 time, and in the "not-x" UTF cases can create unwanted duplicates in
5845 the XCLASS list (provoked by characters that have more than one other
5846 case and by both cases being in the same "not-x" sublist). */
5847
5848 case ESC_h:
5849 (void)add_list_to_class(classbits, &class_uchardata,
5850 options & ~PCRE2_CASELESS, cb, PRIV(hspace_list), NOTACHAR);
5851 break;
5852
5853 case ESC_H:
5854 (void)add_not_list_to_class(classbits, &class_uchardata,
5855 options & ~PCRE2_CASELESS, cb, PRIV(hspace_list));
5856 break;
5857
5858 case ESC_v:
5859 (void)add_list_to_class(classbits, &class_uchardata,
5860 options & ~PCRE2_CASELESS, cb, PRIV(vspace_list), NOTACHAR);
5861 break;
5862
5863 case ESC_V:
5864 (void)add_not_list_to_class(classbits, &class_uchardata,
5865 options & ~PCRE2_CASELESS, cb, PRIV(vspace_list));
5866 break;
5867
5868 /* If Unicode is not supported, \P and \p are not allowed and are
5869 faulted at parse time, so will never appear here. */
5870
5871 #ifdef SUPPORT_UNICODE
5872 case ESC_p:
5873 case ESC_P:
5874 {
5875 uint32_t ptype = *(++pptr) >> 16;
5876 uint32_t pdata = *pptr & 0xffff;
5877 *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
5878 *class_uchardata++ = ptype;
5879 *class_uchardata++ = pdata;
5880 xclass_has_prop = TRUE;
5881 class_has_8bitchar--; /* Undo! */
5882 }
5883 break;
5884 #endif
5885 }
5886
5887 goto CONTINUE_CLASS;
5888 } /* End handling \d-type escapes */
5889
5890 /* A literal character may be followed by a range meta. At parse time
5891 there are checks for out-of-order characters, for ranges where the two
5892 characters are equal, and for hyphens that cannot indicate a range. At
5893 this point, therefore, no checking is needed. */
5894
5895 else
5896 {
5897 uint32_t c, d;
5898
5899 CLASS_LITERAL:
5900 c = d = meta;
5901
5902 /* Remember if \r or \n were explicitly used */
5903
5904 if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
5905
5906 /* Process a character range */
5907
5908 if (pptr[1] == META_RANGE_LITERAL || pptr[1] == META_RANGE_ESCAPED)
5909 {
5910 #ifdef EBCDIC
5911 BOOL range_is_literal = (pptr[1] == META_RANGE_LITERAL);
5912 #endif
5913 pptr += 2;
5914 d = *pptr;
5915 if (d == META_BIGVALUE) d = *(++pptr);
5916
5917 /* Remember an explicit \r or \n, and add the range to the class. */
5918
5919 if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
5920
5921 /* In an EBCDIC environment, Perl treats alphabetic ranges specially
5922 because there are holes in the encoding, and simply using the range
5923 A-Z (for example) would include the characters in the holes. This
5924 applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
5925
5926 #ifdef EBCDIC
5927 if (range_is_literal &&
5928 (cb->ctypes[c] & ctype_letter) != 0 &&
5929 (cb->ctypes[d] & ctype_letter) != 0 &&
5930 (c <= CHAR_z) == (d <= CHAR_z))
5931 {
5932 uint32_t uc = (d <= CHAR_z)? 0 : 64;
5933 uint32_t C = c - uc;
5934 uint32_t D = d - uc;
5935
5936 if (C <= CHAR_i)
5937 {
5938 class_has_8bitchar +=
5939 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
5940 ((D < CHAR_i)? D : CHAR_i) + uc);
5941 C = CHAR_j;
5942 }
5943
5944 if (C <= D && C <= CHAR_r)
5945 {
5946 class_has_8bitchar +=
5947 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
5948 ((D < CHAR_r)? D : CHAR_r) + uc);
5949 C = CHAR_s;
5950 }
5951
5952 if (C <= D)
5953 {
5954 class_has_8bitchar +=
5955 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
5956 D + uc);
5957 }
5958 }
5959 else
5960 #endif
5961 /* Not an EBCDIC special range */
5962
5963 class_has_8bitchar +=
5964 add_to_class(classbits, &class_uchardata, options, cb, c, d);
5965 goto CONTINUE_CLASS; /* Go get the next char in the class */
5966 } /* End of range handling */
5967
5968
5969 /* Handle a single character. */
5970
5971 class_has_8bitchar +=
5972 add_to_class(classbits, &class_uchardata, options, cb, meta, meta);
5973 }
5974
5975 /* Continue to the next item in the class. */
5976
5977 CONTINUE_CLASS:
5978
5979 #ifdef SUPPORT_WIDE_CHARS
5980 /* If any wide characters or Unicode properties have been encountered,
5981 set xclass = TRUE. Then, in the pre-compile phase, accumulate the length
5982 of the extra data and reset the pointer. This is so that very large
5983 classes that contain a zillion wide characters or Unicode property tests
5984 do not overwrite the workspace (which is on the stack). */
5985
5986 if (class_uchardata > class_uchardata_base)
5987 {
5988 xclass = TRUE;
5989 if (lengthptr != NULL)
5990 {
5991 *lengthptr += class_uchardata - class_uchardata_base;
5992 class_uchardata = class_uchardata_base;
5993 }
5994 }
5995 #endif
5996
5997 continue; /* Needed to avoid error when not supporting wide chars */
5998 } /* End of main class-processing loop */
5999
6000 /* If this class is the first thing in the branch, there can be no first
6001 char setting, whatever the repeat count. Any reqcu setting must remain
6002 unchanged after any kind of repeat. */
6003
6004 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6005 zerofirstcu = firstcu;
6006 zerofirstcuflags = firstcuflags;
6007 zeroreqcu = reqcu;
6008 zeroreqcuflags = reqcuflags;
6009
6010 /* If there are characters with values > 255, or Unicode property settings
6011 (\p or \P), we have to compile an extended class, with its own opcode,
6012 unless there were no property settings and there was a negated special such
6013 as \S in the class, and PCRE2_UCP is not set, because in that case all
6014 characters > 255 are in or not in the class, so any that were explicitly
6015 given as well can be ignored.
6016
6017 In the UCP case, if certain negated POSIX classes ([:^ascii:] or
6018 [:^xdigit:]) were present in a class, we either have to match or not match
6019 all wide characters (depending on whether the whole class is or is not
6020 negated). This requirement is indicated by match_all_or_no_wide_chars being
6021 true. We do this by including an explicit range, which works in both cases.
6022 This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
6023 cannot be any wide characters in 8-bit non-UTF mode.
6024
6025 When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit
6026 class where \S etc is present without PCRE2_UCP, causing an extended class
6027 to be compiled, we make sure that all characters > 255 are included by
6028 forcing match_all_or_no_wide_chars to be true.
6029
6030 If, when generating an xclass, there are no characters < 256, we can omit
6031 the bitmap in the actual compiled code. */
6032
6033 #ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */
6034 if (xclass && (
6035 #ifdef SUPPORT_UNICODE
6036 (options & PCRE2_UCP) != 0 ||
6037 #endif
6038 xclass_has_prop || !should_flip_negation))
6039 {
6040 if (match_all_or_no_wide_chars || (
6041 #if PCRE2_CODE_UNIT_WIDTH == 8
6042 utf &&
6043 #endif
6044 should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0))
6045 {
6046 *class_uchardata++ = XCL_RANGE;
6047 if (utf) /* Will always be utf in the 8-bit library */
6048 {
6049 class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
6050 class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
6051 }
6052 else /* Can only happen for the 16-bit & 32-bit libraries */
6053 {
6054 #if PCRE2_CODE_UNIT_WIDTH == 16
6055 *class_uchardata++ = 0x100;
6056 *class_uchardata++ = 0xffffu;
6057 #elif PCRE2_CODE_UNIT_WIDTH == 32
6058 *class_uchardata++ = 0x100;
6059 *class_uchardata++ = 0xffffffffu;
6060 #endif
6061 }
6062 }
6063 *class_uchardata++ = XCL_END; /* Marks the end of extra data */
6064 *code++ = OP_XCLASS;
6065 code += LINK_SIZE;
6066 *code = negate_class? XCL_NOT:0;
6067 if (xclass_has_prop) *code |= XCL_HASPROP;
6068
6069 /* If the map is required, move up the extra data to make room for it;
6070 otherwise just move the code pointer to the end of the extra data. */
6071
6072 if (class_has_8bitchar > 0)
6073 {
6074 *code++ |= XCL_MAP;
6075 (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
6076 CU2BYTES(class_uchardata - code));
6077 if (negate_class && !xclass_has_prop)
6078 {
6079 /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6080 for (i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6081 }
6082 memcpy(code, classbits, 32);
6083 code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
6084 }
6085 else code = class_uchardata;
6086
6087 /* Now fill in the complete length of the item */
6088
6089 PUT(previous, 1, (int)(code - previous));
6090 break; /* End of class handling */
6091 }
6092 #endif /* SUPPORT_WIDE_CHARS */
6093
6094 /* If there are no characters > 255, or they are all to be included or
6095 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
6096 whole class was negated and whether there were negative specials such as \S
6097 (non-UCP) in the class. Then copy the 32-byte map into the code vector,
6098 negating it if necessary. */
6099
6100 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
6101 if (lengthptr == NULL) /* Save time in the pre-compile phase */
6102 {
6103 if (negate_class)
6104 {
6105 /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6106 for (i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6107 }
6108 memcpy(code, classbits, 32);
6109 }
6110 code += 32 / sizeof(PCRE2_UCHAR);
6111 break; /* End of class processing */
6112
6113
6114 /* ===================================================================*/
6115 /* Deal with (*VERB)s. */
6116
6117 /* Check for open captures before ACCEPT and close those that are within
6118 the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
6119 assertion. In the first pass, just accumulate the length required;
6120 otherwise hitting (*ACCEPT) inside many nested parentheses can cause
6121 workspace overflow. Do not set firstcu after *ACCEPT. */
6122
6123 case META_ACCEPT:
6124 cb->had_accept = had_accept = TRUE;
6125 for (oc = cb->open_caps;
6126 oc != NULL && oc->assert_depth >= cb->assert_depth;
6127 oc = oc->next)
6128 {
6129 if (lengthptr != NULL)
6130 {
6131 *lengthptr += CU2BYTES(1) + IMM2_SIZE;
6132 }
6133 else
6134 {
6135 *code++ = OP_CLOSE;
6136 PUT2INC(code, 0, oc->number);
6137 }
6138 }
6139 *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6140 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6141 break;
6142
6143 case META_PRUNE:
6144 case META_SKIP:
6145 cb->had_pruneorskip = TRUE;
6146 /* Fall through */
6147 case META_COMMIT:
6148 case META_FAIL:
6149 *code++ = verbops[(meta - META_MARK) >> 16];
6150 break;
6151
6152 case META_THEN:
6153 cb->external_flags |= PCRE2_HASTHEN;
6154 *code++ = OP_THEN;
6155 break;
6156
6157 /* Handle verbs with arguments. Arguments can be very long, especially in
6158 16- and 32-bit modes, and can overflow the workspace in the first pass.
6159 However, the argument length is constrained to be small enough to fit in
6160 one code unit. This check happens in parse_regex(). In the first pass,
6161 instead of putting the argument into memory, we just update the length
6162 counter and set up an empty argument. */
6163
6164 case META_THEN_ARG:
6165 cb->external_flags |= PCRE2_HASTHEN;
6166 goto VERB_ARG;
6167
6168 case META_PRUNE_ARG:
6169 case META_SKIP_ARG:
6170 cb->had_pruneorskip = TRUE;
6171 /* Fall through */
6172 case META_MARK:
6173 case META_COMMIT_ARG:
6174 VERB_ARG:
6175 *code++ = verbops[(meta - META_MARK) >> 16];
6176 /* The length is in characters. */
6177 verbarglen = *(++pptr);
6178 verbculen = 0;
6179 tempcode = code++;
6180 for (i = 0; i < (int)verbarglen; i++)
6181 {
6182 meta = *(++pptr);
6183 #ifdef SUPPORT_UNICODE
6184 if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
6185 #endif
6186 {
6187 mclength = 1;
6188 mcbuffer[0] = meta;
6189 }
6190 if (lengthptr != NULL) *lengthptr += mclength; else
6191 {
6192 memcpy(code, mcbuffer, CU2BYTES(mclength));
6193 code += mclength;
6194 verbculen += mclength;
6195 }
6196 }
6197
6198 *tempcode = verbculen; /* Fill in the code unit length */
6199 *code++ = 0; /* Terminating zero */
6200 break;
6201
6202
6203 /* ===================================================================*/
6204 /* Handle options change. The new setting must be passed back for use in
6205 subsequent branches. Reset the greedy defaults and the case value for
6206 firstcu and reqcu. */
6207
6208 case META_OPTIONS:
6209 *optionsptr = options = *(++pptr);
6210 greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6211 greedy_non_default = greedy_default ^ 1;
6212 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6213 break;
6214
6215
6216 /* ===================================================================*/
6217 /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
6218 because it could be a numerical check on recursion, or a name check on a
6219 group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
6220 we can handle it either way. We first try for a name; if not found, process
6221 the number. */
6222
6223 case META_COND_RNUMBER: /* (?(Rdigits) */
6224 case META_COND_NAME: /* (?(name) or (?'name') or ?(<name>) */
6225 case META_COND_RNAME: /* (?(R&name) - test for recursion */
6226 bravalue = OP_COND;
6227 {
6228 int count, index;
6229 PCRE2_SPTR name;
6230 named_group *ng = cb->named_groups;
6231 uint32_t length = *(++pptr);
6232
6233 GETPLUSOFFSET(offset, pptr);
6234 name = cb->start_pattern + offset;
6235
6236 /* In the first pass, the names generated in the pre-pass are available,
6237 but the main name table has not yet been created. Scan the list of names
6238 generated in the pre-pass in order to get a number and whether or not
6239 this name is duplicated. If it is not duplicated, we can handle it as a
6240 numerical group. */
6241
6242 for (i = 0; i < cb->names_found; i++, ng++)
6243 {
6244 if (length == ng->length &&
6245 PRIV(strncmp)(name, ng->name, length) == 0)
6246 {
6247 if (!ng->isdup)
6248 {
6249 code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6250 PUT2(code, 2+LINK_SIZE, ng->number);
6251 if (ng->number > cb->top_backref) cb->top_backref = ng->number;
6252 skipunits = 1+IMM2_SIZE;
6253 goto GROUP_PROCESS_NOTE_EMPTY;
6254 }
6255 break; /* Found a duplicated name */
6256 }
6257 }
6258
6259 /* If the name was not found we have a bad reference, unless we are
6260 dealing with R<digits>, which is treated as a recursion test by number.
6261 */
6262
6263 if (i >= cb->names_found)
6264 {
6265 groupnumber = 0;
6266 if (meta == META_COND_RNUMBER)
6267 {
6268 for (i = 1; i < (int)length; i++)
6269 {
6270 groupnumber = groupnumber * 10 + name[i] - CHAR_0;
6271 if (groupnumber > MAX_GROUP_NUMBER)
6272 {
6273 *errorcodeptr = ERR61;
6274 cb->erroroffset = offset + i;
6275 return 0;
6276 }
6277 }
6278 }
6279
6280 if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
6281 {
6282 *errorcodeptr = ERR15;
6283 cb->erroroffset = offset;
6284 return 0;
6285 }
6286
6287 /* (?(Rdigits) treated as a recursion test by number. A value of
6288 zero (which is the result of both (?(R) and (?(R0)) means "any", and is
6289 translated into RREF_ANY (which is 0xffff). */
6290
6291 if (groupnumber == 0) groupnumber = RREF_ANY;
6292 code[1+LINK_SIZE] = OP_RREF;
6293 PUT2(code, 2+LINK_SIZE, groupnumber);
6294 skipunits = 1+IMM2_SIZE;
6295 goto GROUP_PROCESS_NOTE_EMPTY;
6296 }
6297
6298 /* A duplicated name was found. Note that if an R<digits> name is found
6299 (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6300
6301 code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6302
6303 /* We have a duplicated name. In the compile pass we have to search the
6304 main table in order to get the index and count values. */
6305
6306 count = 0; /* Values for first pass (avoids compiler warning) */
6307 index = 0;
6308 if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6309 &count, errorcodeptr, cb)) return 0;
6310
6311 /* Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and
6312 insert appropriate data values. */
6313
6314 code[1+LINK_SIZE]++;
6315 skipunits = 1+2*IMM2_SIZE;
6316 PUT2(code, 2+LINK_SIZE, index);
6317 PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6318 }
6319 goto GROUP_PROCESS_NOTE_EMPTY;
6320
6321 /* The DEFINE condition is always false. Its internal groups may never
6322 be called, so matched_char must remain false, hence the jump to
6323 GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
6324
6325 case META_COND_DEFINE:
6326 bravalue = OP_COND;
6327 GETPLUSOFFSET(offset, pptr);
6328 code[1+LINK_SIZE] = OP_DEFINE;
6329 skipunits = 1;
6330 goto GROUP_PROCESS;
6331
6332 /* Conditional test of a group's being set. */
6333
6334 case META_COND_NUMBER:
6335 bravalue = OP_COND;
6336 GETPLUSOFFSET(offset, pptr);
6337 groupnumber = *(++pptr);
6338 if (groupnumber > cb->bracount)
6339 {
6340 *errorcodeptr = ERR15;
6341 cb->erroroffset = offset;
6342 return 0;
6343 }
6344 if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
6345 offset -= 2; /* Point at initial ( for too many branches error */
6346 code[1+LINK_SIZE] = OP_CREF;
6347 skipunits = 1+IMM2_SIZE;
6348 PUT2(code, 2+LINK_SIZE, groupnumber);
6349 goto GROUP_PROCESS_NOTE_EMPTY;
6350
6351 /* Test for the PCRE2 version. */
6352
6353 case META_COND_VERSION:
6354 bravalue = OP_COND;
6355 if (pptr[1] > 0)
6356 code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
6357 (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
6358 OP_TRUE : OP_FALSE;
6359 else
6360 code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
6361 OP_TRUE : OP_FALSE;
6362 skipunits = 1;
6363 pptr += 3;
6364 goto GROUP_PROCESS_NOTE_EMPTY;
6365
6366 /* The condition is an assertion, possibly preceded by a callout. */
6367
6368 case META_COND_ASSERT:
6369 bravalue = OP_COND;
6370 goto GROUP_PROCESS_NOTE_EMPTY;
6371
6372
6373 /* ===================================================================*/
6374 /* Handle all kinds of nested bracketed groups. The non-capturing,
6375 non-conditional cases are here; others come to GROUP_PROCESS via goto. */
6376
6377 case META_LOOKAHEAD:
6378 bravalue = OP_ASSERT;
6379 cb->assert_depth += 1;
6380 goto GROUP_PROCESS;
6381
6382 case META_LOOKAHEAD_NA:
6383 bravalue = OP_ASSERT_NA;
6384 cb->assert_depth += 1;
6385 goto GROUP_PROCESS;
6386
6387 /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6388 thing to do, but Perl allows all assertions to be quantified, and when
6389 they contain capturing parentheses there may be a potential use for
6390 this feature. Not that that applies to a quantified (?!) but we allow
6391 it for uniformity. */
6392
6393 case META_LOOKAHEADNOT:
6394 if (pptr[1] == META_KET &&
6395 (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
6396 {
6397 *code++ = OP_FAIL;
6398 pptr++;
6399 }
6400 else
6401 {
6402 bravalue = OP_ASSERT_NOT;
6403 cb->assert_depth += 1;
6404 goto GROUP_PROCESS;
6405 }
6406 break;
6407
6408 case META_LOOKBEHIND:
6409 bravalue = OP_ASSERTBACK;
6410 cb->assert_depth += 1;
6411 goto GROUP_PROCESS;
6412
6413 case META_LOOKBEHINDNOT:
6414 bravalue = OP_ASSERTBACK_NOT;
6415 cb->assert_depth += 1;
6416 goto GROUP_PROCESS;
6417
6418 case META_LOOKBEHIND_NA:
6419 bravalue = OP_ASSERTBACK_NA;
6420 cb->assert_depth += 1;
6421 goto GROUP_PROCESS;
6422
6423 case META_ATOMIC:
6424 bravalue = OP_ONCE;
6425 goto GROUP_PROCESS_NOTE_EMPTY;
6426
6427 case META_SCRIPT_RUN:
6428 bravalue = OP_SCRIPT_RUN;
6429 goto GROUP_PROCESS_NOTE_EMPTY;
6430
6431 case META_NOCAPTURE:
6432 bravalue = OP_BRA;
6433 /* Fall through */
6434
6435 /* Process nested bracketed regex. The nesting depth is maintained for the
6436 benefit of the stackguard function. The test for too deep nesting is now
6437 done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6438 others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6439 note of whether or not they may match an empty string. */
6440
6441 GROUP_PROCESS_NOTE_EMPTY:
6442 note_group_empty = TRUE;
6443
6444 GROUP_PROCESS:
6445 cb->parens_depth += 1;
6446 *code = bravalue;
6447 pptr++;
6448 tempcode = code;
6449 tempreqvary = cb->req_varyopt; /* Save value before group */
6450 length_prevgroup = 0; /* Initialize for pre-compile phase */
6451
6452 if ((group_return =
6453 compile_regex(
6454 options, /* The option state */
6455 &tempcode, /* Where to put code (updated) */
6456 &pptr, /* Input pointer (updated) */
6457 errorcodeptr, /* Where to put an error message */
6458 skipunits, /* Skip over bracket number */
6459 &subfirstcu, /* For possible first char */
6460 &subfirstcuflags,
6461 &subreqcu, /* For possible last char */
6462 &subreqcuflags,
6463 bcptr, /* Current branch chain */
6464 cb, /* Compile data block */
6465 (lengthptr == NULL)? NULL : /* Actual compile phase */
6466 &length_prevgroup /* Pre-compile phase */
6467 )) == 0)
6468 return 0; /* Error */
6469
6470 cb->parens_depth -= 1;
6471
6472 /* If that was a non-conditional significant group (not an assertion, not a
6473 DEFINE) that matches at least one character, then the current item matches
6474 a character. Conditionals are handled below. */
6475
6476 if (note_group_empty && bravalue != OP_COND && group_return > 0)
6477 matched_char = TRUE;
6478
6479 /* If we've just compiled an assertion, pop the assert depth. */
6480
6481 if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NA)
6482 cb->assert_depth -= 1;
6483
6484 /* At the end of compiling, code is still pointing to the start of the
6485 group, while tempcode has been updated to point past the end of the group.
6486 The parsed pattern pointer (pptr) is on the closing META_KET.
6487
6488 If this is a conditional bracket, check that there are no more than
6489 two branches in the group, or just one if it's a DEFINE group. We do this
6490 in the real compile phase, not in the pre-pass, where the whole group may
6491 not be available. */
6492
6493 if (bravalue == OP_COND && lengthptr == NULL)
6494 {
6495 PCRE2_UCHAR *tc = code;
6496 int condcount = 0;
6497
6498 do {
6499 condcount++;
6500 tc += GET(tc,1);
6501 }
6502 while (*tc != OP_KET);
6503
6504 /* A DEFINE group is never obeyed inline (the "condition" is always
6505 false). It must have only one branch. Having checked this, change the
6506 opcode to OP_FALSE. */
6507
6508 if (code[LINK_SIZE+1] == OP_DEFINE)
6509 {
6510 if (condcount > 1)
6511 {
6512 cb->erroroffset = offset;
6513 *errorcodeptr = ERR54;
6514 return 0;
6515 }
6516 code[LINK_SIZE+1] = OP_FALSE;
6517 bravalue = OP_DEFINE; /* A flag to suppress char handling below */
6518 }
6519
6520 /* A "normal" conditional group. If there is just one branch, we must not
6521 make use of its firstcu or reqcu, because this is equivalent to an
6522 empty second branch. Also, it may match an empty string. If there are two
6523 branches, this item must match a character if the group must. */
6524
6525 else
6526 {
6527 if (condcount > 2)
6528 {
6529 cb->erroroffset = offset;
6530 *errorcodeptr = ERR27;
6531 return 0;
6532 }
6533 if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
6534 else if (group_return > 0) matched_char = TRUE;
6535 }
6536 }
6537
6538 /* In the pre-compile phase, update the length by the length of the group,
6539 less the brackets at either end. Then reduce the compiled code to just a
6540 set of non-capturing brackets so that it doesn't use much memory if it is
6541 duplicated by a quantifier. */
6542
6543 if (lengthptr != NULL)
6544 {
6545 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6546 {
6547 *errorcodeptr = ERR20;
6548 return 0;
6549 }
6550 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6551 code++; /* This already contains bravalue */
6552 PUTINC(code, 0, 1 + LINK_SIZE);
6553 *code++ = OP_KET;
6554 PUTINC(code, 0, 1 + LINK_SIZE);
6555 break; /* No need to waste time with special character handling */
6556 }
6557
6558 /* Otherwise update the main code pointer to the end of the group. */
6559
6560 code = tempcode;
6561
6562 /* For a DEFINE group, required and first character settings are not
6563 relevant. */
6564
6565 if (bravalue == OP_DEFINE) break;
6566
6567 /* Handle updating of the required and first code units for other types of
6568 group. Update for normal brackets of all kinds, and conditions with two
6569 branches (see code above). If the bracket is followed by a quantifier with
6570 zero repeat, we have to back off. Hence the definition of zeroreqcu and
6571 zerofirstcu outside the main loop so that they can be accessed for the back
6572 off. */
6573
6574 zeroreqcu = reqcu;
6575 zeroreqcuflags = reqcuflags;
6576 zerofirstcu = firstcu;
6577 zerofirstcuflags = firstcuflags;
6578 groupsetfirstcu = FALSE;
6579
6580 if (bravalue >= OP_ONCE) /* Not an assertion */
6581 {
6582 /* If we have not yet set a firstcu in this branch, take it from the
6583 subpattern, remembering that it was set here so that a repeat of more
6584 than one can replicate it as reqcu if necessary. If the subpattern has
6585 no firstcu, set "none" for the whole branch. In both cases, a zero
6586 repeat forces firstcu to "none". */
6587
6588 if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
6589 {
6590 if (subfirstcuflags >= 0)
6591 {
6592 firstcu = subfirstcu;
6593 firstcuflags = subfirstcuflags;
6594 groupsetfirstcu = TRUE;
6595 }
6596 else firstcuflags = REQ_NONE;
6597 zerofirstcuflags = REQ_NONE;
6598 }
6599
6600 /* If firstcu was previously set, convert the subpattern's firstcu
6601 into reqcu if there wasn't one, using the vary flag that was in
6602 existence beforehand. */
6603
6604 else if (subfirstcuflags >= 0 && subreqcuflags < 0)
6605 {
6606 subreqcu = subfirstcu;
6607 subreqcuflags = subfirstcuflags | tempreqvary;
6608 }
6609
6610 /* If the subpattern set a required code unit (or set a first code unit
6611 that isn't really the first code unit - see above), set it. */
6612
6613 if (subreqcuflags >= 0)
6614 {
6615 reqcu = subreqcu;
6616 reqcuflags = subreqcuflags;
6617 }
6618 }
6619
6620 /* For a forward assertion, we take the reqcu, if set, provided that the
6621 group has also set a firstcu. This can be helpful if the pattern that
6622 follows the assertion doesn't set a different char. For example, it's
6623 useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
6624 because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
6625 the "real" "a" would then become a reqcu instead of a firstcu. This is
6626 overcome by a scan at the end if there's no firstcu, looking for an
6627 asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
6628 we must only take the reqcu when the group also set a firstcu. Otherwise,
6629 in that example, 'X' ends up set for both. */
6630
6631 else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
6632 subreqcuflags >= 0 && subfirstcuflags >= 0)
6633 {
6634 reqcu = subreqcu;
6635 reqcuflags = subreqcuflags;
6636 }
6637
6638 break; /* End of nested group handling */
6639
6640
6641 /* ===================================================================*/
6642 /* Handle named backreferences and recursions. */
6643
6644 case META_BACKREF_BYNAME:
6645 case META_RECURSE_BYNAME:
6646 {
6647 int count, index;
6648 PCRE2_SPTR name;
6649 BOOL is_dupname = FALSE;
6650 named_group *ng = cb->named_groups;
6651 uint32_t length = *(++pptr);
6652
6653 GETPLUSOFFSET(offset, pptr);
6654 name = cb->start_pattern + offset;
6655
6656 /* In the first pass, the names generated in the pre-pass are available,
6657 but the main name table has not yet been created. Scan the list of names
6658 generated in the pre-pass in order to get a number and whether or not
6659 this name is duplicated. */
6660
6661 groupnumber = 0;
6662 for (i = 0; i < cb->names_found; i++, ng++)
6663 {
6664 if (length == ng->length &&
6665 PRIV(strncmp)(name, ng->name, length) == 0)
6666 {
6667 is_dupname = ng->isdup;
6668 groupnumber = ng->number;
6669
6670 /* For a recursion, that's all that is needed. We can now go to
6671 the code that handles numerical recursion, applying it to the first
6672 group with the given name. */
6673
6674 if (meta == META_RECURSE_BYNAME)
6675 {
6676 meta_arg = groupnumber;
6677 goto HANDLE_NUMERICAL_RECURSION;
6678 }
6679
6680 /* For a back reference, update the back reference map and the
6681 maximum back reference. */
6682
6683 cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
6684 if (groupnumber > cb->top_backref)
6685 cb->top_backref = groupnumber;
6686 }
6687 }
6688
6689 /* If the name was not found we have a bad reference. */
6690
6691 if (groupnumber == 0)
6692 {
6693 *errorcodeptr = ERR15;
6694 cb->erroroffset = offset;
6695 return 0;
6696 }
6697
6698 /* If a back reference name is not duplicated, we can handle it as
6699 a numerical reference. */
6700
6701 if (!is_dupname)
6702 {
6703 meta_arg = groupnumber;
6704 goto HANDLE_SINGLE_REFERENCE;
6705 }
6706
6707 /* If a back reference name is duplicated, we generate a different
6708 opcode to a numerical back reference. In the second pass we must
6709 search for the index and count in the final name table. */
6710
6711 count = 0; /* Values for first pass (avoids compiler warning) */
6712 index = 0;
6713 if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6714 &count, errorcodeptr, cb)) return 0;
6715
6716 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6717 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
6718 PUT2INC(code, 0, index);
6719 PUT2INC(code, 0, count);
6720 }
6721 break;
6722
6723
6724 /* ===================================================================*/
6725 /* Handle a numerical callout. */
6726
6727 case META_CALLOUT_NUMBER:
6728 code[0] = OP_CALLOUT;
6729 PUT(code, 1, pptr[1]); /* Offset to next pattern item */
6730 PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */
6731 code[1 + 2*LINK_SIZE] = pptr[3];
6732 pptr += 3;
6733 code += PRIV(OP_lengths)[OP_CALLOUT];
6734 break;
6735
6736
6737 /* ===================================================================*/
6738 /* Handle a callout with a string argument. In the pre-pass we just compute
6739 the length without generating anything. The length in pptr[3] includes both
6740 delimiters; in the actual compile only the first one is copied, but a
6741 terminating zero is added. Any doubled delimiters within the string make
6742 this an overestimate, but it is not worth bothering about. */
6743
6744 case META_CALLOUT_STRING:
6745 if (lengthptr != NULL)
6746 {
6747 *lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
6748 pptr += 3;
6749 SKIPOFFSET(pptr);
6750 }
6751
6752 /* In the real compile we can copy the string. The starting delimiter is
6753 included so that the client can discover it if they want. We also pass the
6754 start offset to help a script language give better error messages. */
6755
6756 else
6757 {
6758 PCRE2_SPTR pp;
6759 uint32_t delimiter;
6760 uint32_t length = pptr[3];
6761 PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
6762
6763 code[0] = OP_CALLOUT_STR;
6764 PUT(code, 1, pptr[1]); /* Offset to next pattern item */
6765 PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */
6766
6767 pptr += 3;
6768 GETPLUSOFFSET(offset, pptr); /* Offset to string in pattern */
6769 pp = cb->start_pattern + offset;
6770 delimiter = *callout_string++ = *pp++;
6771 if (delimiter == CHAR_LEFT_CURLY_BRACKET)
6772 delimiter = CHAR_RIGHT_CURLY_BRACKET;
6773 PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1)); /* One after delimiter */
6774
6775 /* The syntax of the pattern was checked in the parsing scan. The length
6776 includes both delimiters, but we have passed the opening one just above,
6777 so we reduce length before testing it. The test is for > 1 because we do
6778 not want to copy the final delimiter. This also ensures that pp[1] is
6779 accessible. */
6780
6781 while (--length > 1)
6782 {
6783 if (*pp == delimiter && pp[1] == delimiter)
6784 {
6785 *callout_string++ = delimiter;
6786 pp += 2;
6787 length--;
6788 }
6789 else *callout_string++ = *pp++;
6790 }
6791 *callout_string++ = CHAR_NUL;
6792
6793 /* Set the length of the entire item, then advance to its end. */
6794
6795 PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
6796 code = callout_string;
6797 }
6798 break;
6799
6800
6801 /* ===================================================================*/
6802 /* Handle repetition. The different types are all sorted out in the parsing
6803 pass. */
6804
6805 case META_MINMAX_PLUS:
6806 case META_MINMAX_QUERY:
6807 case META_MINMAX:
6808 repeat_min = *(++pptr);
6809 repeat_max = *(++pptr);
6810 goto REPEAT;
6811
6812 case META_ASTERISK:
6813 case META_ASTERISK_PLUS:
6814 case META_ASTERISK_QUERY:
6815 repeat_min = 0;
6816 repeat_max = REPEAT_UNLIMITED;
6817 goto REPEAT;
6818
6819 case META_PLUS:
6820 case META_PLUS_PLUS:
6821 case META_PLUS_QUERY:
6822 repeat_min = 1;
6823 repeat_max = REPEAT_UNLIMITED;
6824 goto REPEAT;
6825
6826 case META_QUERY:
6827 case META_QUERY_PLUS:
6828 case META_QUERY_QUERY:
6829 repeat_min = 0;
6830 repeat_max = 1;
6831
6832 REPEAT:
6833 if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
6834
6835 /* Remember whether this is a variable length repeat, and default to
6836 single-char opcodes. */
6837
6838 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
6839 op_type = 0;
6840
6841 /* Adjust first and required code units for a zero repeat. */
6842
6843 if (repeat_min == 0)
6844 {
6845 firstcu = zerofirstcu;
6846 firstcuflags = zerofirstcuflags;
6847 reqcu = zeroreqcu;
6848 reqcuflags = zeroreqcuflags;
6849 }
6850
6851 /* Note the greediness and possessiveness. */
6852
6853 switch (meta)
6854 {
6855 case META_MINMAX_PLUS:
6856 case META_ASTERISK_PLUS:
6857 case META_PLUS_PLUS:
6858 case META_QUERY_PLUS:
6859 repeat_type = 0; /* Force greedy */
6860 possessive_quantifier = TRUE;
6861 break;
6862
6863 case META_MINMAX_QUERY:
6864 case META_ASTERISK_QUERY:
6865 case META_PLUS_QUERY:
6866 case META_QUERY_QUERY:
6867 repeat_type = greedy_non_default;
6868 possessive_quantifier = FALSE;
6869 break;
6870
6871 default:
6872 repeat_type = greedy_default;
6873 possessive_quantifier = FALSE;
6874 break;
6875 }
6876
6877 /* Save start of previous item, in case we have to move it up in order to
6878 insert something before it, and remember what it was. */
6879
6880 tempcode = previous;
6881 op_previous = *previous;
6882
6883 /* Now handle repetition for the different types of item. If the repeat
6884 minimum and the repeat maximum are both 1, we can ignore the quantifier for
6885 non-parenthesized items, as they have only one alternative. For anything in
6886 parentheses, we must not ignore if {1} is possessive. */
6887
6888 switch (op_previous)
6889 {
6890 /* If previous was a character or negated character match, abolish the
6891 item and generate a repeat item instead. If a char item has a minimum of
6892 more than one, ensure that it is set in reqcu - it might not be if a
6893 sequence such as x{3} is the first thing in a branch because the x will
6894 have gone into firstcu instead. */
6895
6896 case OP_CHAR:
6897 case OP_CHARI:
6898 case OP_NOT:
6899 case OP_NOTI:
6900 if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
6901 op_type = chartypeoffset[op_previous - OP_CHAR];
6902
6903 /* Deal with UTF characters that take up more than one code unit. */
6904
6905 #ifdef MAYBE_UTF_MULTI
6906 if (utf && NOT_FIRSTCU(code[-1]))
6907 {
6908 PCRE2_UCHAR *lastchar = code - 1;
6909 BACKCHAR(lastchar);
6910 mclength = (uint32_t)(code - lastchar); /* Length of UTF character */
6911 memcpy(mcbuffer, lastchar, CU2BYTES(mclength)); /* Save the char */
6912 }
6913 else
6914 #endif /* MAYBE_UTF_MULTI */
6915
6916 /* Handle the case of a single code unit - either with no UTF support, or
6917 with UTF disabled, or for a single-code-unit UTF character. */
6918 {
6919 mcbuffer[0] = code[-1];
6920 mclength = 1;
6921 if (op_previous <= OP_CHARI && repeat_min > 1)
6922 {
6923 reqcu = mcbuffer[0];
6924 reqcuflags = req_caseopt | cb->req_varyopt;
6925 }
6926 }
6927 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
6928
6929 /* If previous was a character class or a back reference, we put the
6930 repeat stuff after it, but just skip the item if the repeat was {0,0}. */
6931
6932 #ifdef SUPPORT_WIDE_CHARS
6933 case OP_XCLASS:
6934 #endif
6935 case OP_CLASS:
6936 case OP_NCLASS:
6937 case OP_REF:
6938 case OP_REFI:
6939 case OP_DNREF:
6940 case OP_DNREFI:
6941
6942 if (repeat_max == 0)
6943 {
6944 code = previous;
6945 goto END_REPEAT;
6946 }
6947 if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
6948
6949 if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
6950 *code++ = OP_CRSTAR + repeat_type;
6951 else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
6952 *code++ = OP_CRPLUS + repeat_type;
6953 else if (repeat_min == 0 && repeat_max == 1)
6954 *code++ = OP_CRQUERY + repeat_type;
6955 else
6956 {
6957 *code++ = OP_CRRANGE + repeat_type;
6958 PUT2INC(code, 0, repeat_min);
6959 if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0; /* 2-byte encoding for max */
6960 PUT2INC(code, 0, repeat_max);
6961 }
6962 break;
6963
6964 /* If previous is OP_FAIL, it was generated by an empty class []
6965 (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be
6966 generated, that is by (*FAIL) or (?!), disallow a quantifier at parse
6967 time. We can just ignore this repeat. */
6968
6969 case OP_FAIL:
6970 goto END_REPEAT;
6971
6972 /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
6973 because pcre2_match() could not handle backtracking into recursively
6974 called groups. Now that this backtracking is available, we no longer need
6975 to do this. However, we still need to replicate recursions as we do for
6976 groups so as to have independent backtracking points. We can replicate
6977 for the minimum number of repeats directly. For optional repeats we now
6978 wrap the recursion in OP_BRA brackets and make use of the bracket
6979 repetition. */
6980
6981 case OP_RECURSE:
6982 if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
6983 goto END_REPEAT;
6984
6985 /* Generate unwrapped repeats for a non-zero minimum, except when the
6986 minimum is 1 and the maximum unlimited, because that can be handled with
6987 OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
6988 minimum, we just need to generate the appropriate additional copies.
6989 Otherwise we need to generate one more, to simulate the situation when
6990 the minimum is zero. */
6991
6992 if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
6993 {
6994 int replicate = repeat_min;
6995 if (repeat_min == repeat_max) replicate--;
6996
6997 /* In the pre-compile phase, we don't actually do the replication. We
6998 just adjust the length as if we had. Do some paranoid checks for
6999 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
7000 integer type when available, otherwise double. */
7001
7002 if (lengthptr != NULL)
7003 {
7004 PCRE2_SIZE delta = replicate*(1 + LINK_SIZE);
7005 if ((INT64_OR_DOUBLE)replicate*
7006 (INT64_OR_DOUBLE)(1 + LINK_SIZE) >
7007 (INT64_OR_DOUBLE)INT_MAX ||
7008 OFLOW_MAX - *lengthptr < delta)
7009 {
7010 *errorcodeptr = ERR20;
7011 return 0;
7012 }
7013 *lengthptr += delta;
7014 }
7015
7016 else for (i = 0; i < replicate; i++)
7017 {
7018 memcpy(code, previous, CU2BYTES(1 + LINK_SIZE));
7019 previous = code;
7020 code += 1 + LINK_SIZE;
7021 }
7022
7023 /* If the number of repeats is fixed, we are done. Otherwise, adjust
7024 the counts and fall through. */
7025
7026 if (repeat_min == repeat_max) break;
7027 if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7028 repeat_min = 0;
7029 }
7030
7031 /* Wrap the recursion call in OP_BRA brackets. */
7032
7033 (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));
7034 op_previous = *previous = OP_BRA;
7035 PUT(previous, 1, 2 + 2*LINK_SIZE);
7036 previous[2 + 2*LINK_SIZE] = OP_KET;
7037 PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
7038 code += 2 + 2 * LINK_SIZE;
7039 length_prevgroup = 3 + 3*LINK_SIZE;
7040 group_return = -1; /* Set "may match empty string" */
7041
7042 /* Now treat as a repeated OP_BRA. */
7043 /* Fall through */
7044
7045 /* If previous was a bracket group, we may have to replicate it in
7046 certain cases. Note that at this point we can encounter only the "basic"
7047 bracket opcodes such as BRA and CBRA, as this is the place where they get
7048 converted into the more special varieties such as BRAPOS and SBRA.
7049 Originally, PCRE did not allow repetition of assertions, but now it does,
7050 for Perl compatibility. */
7051
7052 case OP_ASSERT:
7053 case OP_ASSERT_NOT:
7054 case OP_ASSERT_NA:
7055 case OP_ASSERTBACK:
7056 case OP_ASSERTBACK_NOT:
7057 case OP_ASSERTBACK_NA:
7058 case OP_ONCE:
7059 case OP_SCRIPT_RUN:
7060 case OP_BRA:
7061 case OP_CBRA:
7062 case OP_COND:
7063 {
7064 int len = (int)(code - previous);
7065 PCRE2_UCHAR *bralink = NULL;
7066 PCRE2_UCHAR *brazeroptr = NULL;
7067
7068 if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7069 goto END_REPEAT;
7070
7071 /* Repeating a DEFINE group (or any group where the condition is always
7072 FALSE and there is only one branch) is pointless, but Perl allows the
7073 syntax, so we just ignore the repeat. */
7074
7075 if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
7076 previous[GET(previous, 1)] != OP_ALT)
7077 goto END_REPEAT;
7078
7079 /* Perl allows all assertions to be quantified, and when they contain
7080 capturing parentheses and/or are optional there are potential uses for
7081 this feature. PCRE2 used to force the maximum quantifier to 1 on the
7082 invalid grounds that further repetition was never useful. This was
7083 always a bit pointless, since an assertion could be wrapped with a
7084 repeated group to achieve the effect. General repetition is now
7085 permitted, but if the maximum is unlimited it is set to one more than
7086 the minimum. */
7087
7088 if (op_previous < OP_ONCE) /* Assertion */
7089 {
7090 if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;
7091 }
7092
7093 /* The case of a zero minimum is special because of the need to stick
7094 OP_BRAZERO in front of it, and because the group appears once in the
7095 data, whereas in other cases it appears the minimum number of times. For
7096 this reason, it is simplest to treat this case separately, as otherwise
7097 the code gets far too messy. There are several special subcases when the
7098 minimum is zero. */
7099
7100 if (repeat_min == 0)
7101 {
7102 /* If the maximum is also zero, we used to just omit the group from
7103 the output altogether, like this:
7104
7105 ** if (repeat_max == 0)
7106 ** {
7107 ** code = previous;
7108 ** goto END_REPEAT;
7109 ** }
7110
7111 However, that fails when a group or a subgroup within it is
7112 referenced as a subroutine from elsewhere in the pattern, so now we
7113 stick in OP_SKIPZERO in front of it so that it is skipped on
7114 execution. As we don't have a list of which groups are referenced, we
7115 cannot do this selectively.
7116
7117 If the maximum is 1 or unlimited, we just have to stick in the
7118 BRAZERO and do no more at this point. */
7119
7120 if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
7121 {
7122 (void)memmove(previous + 1, previous, CU2BYTES(len));
7123 code++;
7124 if (repeat_max == 0)
7125 {
7126 *previous++ = OP_SKIPZERO;
7127 goto END_REPEAT;
7128 }
7129 brazeroptr = previous; /* Save for possessive optimizing */
7130 *previous++ = OP_BRAZERO + repeat_type;
7131 }
7132
7133 /* If the maximum is greater than 1 and limited, we have to replicate
7134 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
7135 The first one has to be handled carefully because it's the original
7136 copy, which has to be moved up. The remainder can be handled by code
7137 that is common with the non-zero minimum case below. We have to
7138 adjust the value of repeat_max, since one less copy is required. */
7139
7140 else
7141 {
7142 int linkoffset;
7143 (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
7144 code += 2 + LINK_SIZE;
7145 *previous++ = OP_BRAZERO + repeat_type;
7146 *previous++ = OP_BRA;
7147
7148 /* We chain together the bracket link offset fields that have to be
7149 filled in later when the ends of the brackets are reached. */
7150
7151 linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
7152 bralink = previous;
7153 PUTINC(previous, 0, linkoffset);
7154 }
7155
7156 if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
7157 }
7158
7159 /* If the minimum is greater than zero, replicate the group as many
7160 times as necessary, and adjust the maximum to the number of subsequent
7161 copies that we need. */
7162
7163 else
7164 {
7165 if (repeat_min > 1)
7166 {
7167 /* In the pre-compile phase, we don't actually do the replication.
7168 We just adjust the length as if we had. Do some paranoid checks for
7169 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
7170 integer type when available, otherwise double. */
7171
7172 if (lengthptr != NULL)
7173 {
7174 PCRE2_SIZE delta = (repeat_min - 1)*length_prevgroup;
7175 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
7176 (INT64_OR_DOUBLE)length_prevgroup >
7177 (INT64_OR_DOUBLE)INT_MAX ||
7178 OFLOW_MAX - *lengthptr < delta)
7179 {
7180 *errorcodeptr = ERR20;
7181 return 0;
7182 }
7183 *lengthptr += delta;
7184 }
7185
7186 /* This is compiling for real. If there is a set first code unit
7187 for the group, and we have not yet set a "required code unit", set
7188 it. */
7189
7190 else
7191 {
7192 if (groupsetfirstcu && reqcuflags < 0)
7193 {
7194 reqcu = firstcu;
7195 reqcuflags = firstcuflags;
7196 }
7197 for (i = 1; (uint32_t)i < repeat_min; i++)
7198 {
7199 memcpy(code, previous, CU2BYTES(len));
7200 code += len;
7201 }
7202 }
7203 }
7204
7205 if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7206 }
7207
7208 /* This code is common to both the zero and non-zero minimum cases. If
7209 the maximum is limited, it replicates the group in a nested fashion,
7210 remembering the bracket starts on a stack. In the case of a zero
7211 minimum, the first one was set up above. In all cases the repeat_max
7212 now specifies the number of additional copies needed. Again, we must
7213 remember to replicate entries on the forward reference list. */
7214
7215 if (repeat_max != REPEAT_UNLIMITED)
7216 {
7217 /* In the pre-compile phase, we don't actually do the replication. We
7218 just adjust the length as if we had. For each repetition we must add
7219 1 to the length for BRAZERO and for all but the last repetition we
7220 must add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
7221 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type
7222 is a 64-bit integer type when available, otherwise double. */
7223
7224 if (lengthptr != NULL && repeat_max > 0)
7225 {
7226 PCRE2_SIZE delta = repeat_max*(length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
7227 2 - 2*LINK_SIZE; /* Last one doesn't nest */
7228 if ((INT64_OR_DOUBLE)repeat_max *
7229 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
7230 > (INT64_OR_DOUBLE)INT_MAX ||
7231 OFLOW_MAX - *lengthptr < delta)
7232 {
7233 *errorcodeptr = ERR20;
7234 return 0;
7235 }
7236 *lengthptr += delta;
7237 }
7238
7239 /* This is compiling for real */
7240
7241 else for (i = repeat_max - 1; i >= 0; i--)
7242 {
7243 *code++ = OP_BRAZERO + repeat_type;
7244
7245 /* All but the final copy start a new nesting, maintaining the
7246 chain of brackets outstanding. */
7247
7248 if (i != 0)
7249 {
7250 int linkoffset;
7251 *code++ = OP_BRA;
7252 linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
7253 bralink = code;
7254 PUTINC(code, 0, linkoffset);
7255 }
7256
7257 memcpy(code, previous, CU2BYTES(len));
7258 code += len;
7259 }
7260
7261 /* Now chain through the pending brackets, and fill in their length
7262 fields (which are holding the chain links pro tem). */
7263
7264 while (bralink != NULL)
7265 {
7266 int oldlinkoffset;
7267 int linkoffset = (int)(code - bralink + 1);
7268 PCRE2_UCHAR *bra = code - linkoffset;
7269 oldlinkoffset = GET(bra, 1);
7270 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
7271 *code++ = OP_KET;
7272 PUTINC(code, 0, linkoffset);
7273 PUT(bra, 1, linkoffset);
7274 }
7275 }
7276
7277 /* If the maximum is unlimited, set a repeater in the final copy. For
7278 SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
7279 possessively repeated ONCE brackets can be converted into non-capturing
7280 brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
7281 saves having to deal with possessive ONCEs specially.
7282
7283 Otherwise, when we are doing the actual compile phase, check to see
7284 whether this group is one that could match an empty string. If so,
7285 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
7286 that runtime checking can be done. [This check is also applied to ONCE
7287 and SCRIPT_RUN groups at runtime, but in a different way.]
7288
7289 Then, if the quantifier was possessive and the bracket is not a
7290 conditional, we convert the BRA code to the POS form, and the KET code
7291 to KETRPOS. (It turns out to be convenient at runtime to detect this
7292 kind of subpattern at both the start and at the end.) The use of
7293 special opcodes makes it possible to reduce greatly the stack usage in
7294 pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to
7295 OP_BRAPOSZERO.
7296
7297 Then, if the minimum number of matches is 1 or 0, cancel the possessive
7298 flag so that the default action below, of wrapping everything inside
7299 atomic brackets, does not happen. When the minimum is greater than 1,
7300 there will be earlier copies of the group, and so we still have to wrap
7301 the whole thing. */
7302
7303 else
7304 {
7305 PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
7306 PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
7307
7308 /* Convert possessive ONCE brackets to non-capturing */
7309
7310 if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
7311
7312 /* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
7313 to do is to set the KET. */
7314
7315 if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
7316 *ketcode = OP_KETRMAX + repeat_type;
7317
7318 /* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
7319 (which have been converted to non-capturing above). */
7320
7321 else
7322 {
7323 /* In the compile phase, adjust the opcode if the group can match
7324 an empty string. For a conditional group with only one branch, the
7325 value of group_return will not show "could be empty", so we must
7326 check that separately. */
7327
7328 if (lengthptr == NULL)
7329 {
7330 if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
7331 if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
7332 *bracode = OP_SCOND;
7333 }
7334
7335 /* Handle possessive quantifiers. */
7336
7337 if (possessive_quantifier)
7338 {
7339 /* For COND brackets, we wrap the whole thing in a possessively
7340 repeated non-capturing bracket, because we have not invented POS
7341 versions of the COND opcodes. */
7342
7343 if (*bracode == OP_COND || *bracode == OP_SCOND)
7344 {
7345 int nlen = (int)(code - bracode);
7346 (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
7347 code += 1 + LINK_SIZE;
7348 nlen += 1 + LINK_SIZE;
7349 *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
7350 *code++ = OP_KETRPOS;
7351 PUTINC(code, 0, nlen);
7352 PUT(bracode, 1, nlen);
7353 }
7354
7355 /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
7356
7357 else
7358 {
7359 *bracode += 1; /* Switch to xxxPOS opcodes */
7360 *ketcode = OP_KETRPOS;
7361 }
7362
7363 /* If the minimum is zero, mark it as possessive, then unset the
7364 possessive flag when the minimum is 0 or 1. */
7365
7366 if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
7367 if (repeat_min < 2) possessive_quantifier = FALSE;
7368 }
7369
7370 /* Non-possessive quantifier */
7371
7372 else *ketcode = OP_KETRMAX + repeat_type;
7373 }
7374 }
7375 }
7376 break;
7377
7378 /* If previous was a character type match (\d or similar), abolish it and
7379 create a suitable repeat item. The code is shared with single-character
7380 repeats by setting op_type to add a suitable offset into repeat_type.
7381 Note that the Unicode property types will be present only when
7382 SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
7383 here because it just makes it horribly messy. */
7384
7385 default:
7386 if (op_previous >= OP_EODN) /* Not a character type - internal error */
7387 {
7388 *errorcodeptr = ERR10;
7389 return 0;
7390 }
7391 else
7392 {
7393 int prop_type, prop_value;
7394 PCRE2_UCHAR *oldcode;
7395
7396 if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7397
7398 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
7399 mclength = 0; /* Not a character */
7400
7401 if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
7402 {
7403 prop_type = previous[1];
7404 prop_value = previous[2];
7405 }
7406 else
7407 {
7408 /* Come here from just above with a character in mcbuffer/mclength. */
7409 OUTPUT_SINGLE_REPEAT:
7410 prop_type = prop_value = -1;
7411 }
7412
7413 /* At this point, if prop_type == prop_value == -1 we either have a
7414 character in mcbuffer when mclength is greater than zero, or we have
7415 mclength zero, in which case there is a non-property character type in
7416 op_previous. If prop_type/value are not negative, we have a property
7417 character type in op_previous. */
7418
7419 oldcode = code; /* Save where we were */
7420 code = previous; /* Usually overwrite previous item */
7421
7422 /* If the maximum is zero then the minimum must also be zero; Perl allows
7423 this case, so we do too - by simply omitting the item altogether. */
7424
7425 if (repeat_max == 0) goto END_REPEAT;
7426
7427 /* Combine the op_type with the repeat_type */
7428
7429 repeat_type += op_type;
7430
7431 /* A minimum of zero is handled either as the special case * or ?, or as
7432 an UPTO, with the maximum given. */
7433
7434 if (repeat_min == 0)
7435 {
7436 if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7437 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
7438 else
7439 {
7440 *code++ = OP_UPTO + repeat_type;
7441 PUT2INC(code, 0, repeat_max);
7442 }
7443 }
7444
7445 /* A repeat minimum of 1 is optimized into some special cases. If the
7446 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7447 left in place and, if the maximum is greater than 1, we use OP_UPTO with
7448 one less than the maximum. */
7449
7450 else if (repeat_min == 1)
7451 {
7452 if (repeat_max == REPEAT_UNLIMITED)
7453 *code++ = OP_PLUS + repeat_type;
7454 else
7455 {
7456 code = oldcode; /* Leave previous item in place */
7457 if (repeat_max == 1) goto END_REPEAT;
7458 *code++ = OP_UPTO + repeat_type;
7459 PUT2INC(code, 0, repeat_max - 1);
7460 }
7461 }
7462
7463 /* The case {n,n} is just an EXACT, while the general case {n,m} is
7464 handled as an EXACT followed by an UPTO or STAR or QUERY. */
7465
7466 else
7467 {
7468 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
7469 PUT2INC(code, 0, repeat_min);
7470
7471 /* Unless repeat_max equals repeat_min, fill in the data for EXACT,
7472 and then generate the second opcode. For a repeated Unicode property
7473 match, there are two extra values that define the required property,
7474 and mclength is set zero to indicate this. */
7475
7476 if (repeat_max != repeat_min)
7477 {
7478 if (mclength > 0)
7479 {
7480 memcpy(code, mcbuffer, CU2BYTES(mclength));
7481 code += mclength;
7482 }
7483 else
7484 {
7485 *code++ = op_previous;
7486 if (prop_type >= 0)
7487 {
7488 *code++ = prop_type;
7489 *code++ = prop_value;
7490 }
7491 }
7492
7493 /* Now set up the following opcode */
7494
7495 if (repeat_max == REPEAT_UNLIMITED)
7496 *code++ = OP_STAR + repeat_type;
7497 else
7498 {
7499 repeat_max -= repeat_min;
7500 if (repeat_max == 1)
7501 {
7502 *code++ = OP_QUERY + repeat_type;
7503 }
7504 else
7505 {
7506 *code++ = OP_UPTO + repeat_type;
7507 PUT2INC(code, 0, repeat_max);
7508 }
7509 }
7510 }
7511 }
7512
7513 /* Fill in the character or character type for the final opcode. */
7514
7515 if (mclength > 0)
7516 {
7517 memcpy(code, mcbuffer, CU2BYTES(mclength));
7518 code += mclength;
7519 }
7520 else
7521 {
7522 *code++ = op_previous;
7523 if (prop_type >= 0)
7524 {
7525 *code++ = prop_type;
7526 *code++ = prop_value;
7527 }
7528 }
7529 }
7530 break;
7531 } /* End of switch on different op_previous values */
7532
7533
7534 /* If the character following a repeat is '+', possessive_quantifier is
7535 TRUE. For some opcodes, there are special alternative opcodes for this
7536 case. For anything else, we wrap the entire repeated item inside OP_ONCE
7537 brackets. Logically, the '+' notation is just syntactic sugar, taken from
7538 Sun's Java package, but the special opcodes can optimize it.
7539
7540 Some (but not all) possessively repeated subpatterns have already been
7541 completely handled in the code just above. For them, possessive_quantifier
7542 is always FALSE at this stage. Note that the repeated item starts at
7543 tempcode, not at previous, which might be the first part of a string whose
7544 (former) last char we repeated. */
7545
7546 if (possessive_quantifier)
7547 {
7548 int len;
7549
7550 /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
7551 However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
7552 {5,}, or {5,10}). We skip over an EXACT item; if the length of what
7553 remains is greater than zero, there's a further opcode that can be
7554 handled. If not, do nothing, leaving the EXACT alone. */
7555
7556 switch(*tempcode)
7557 {
7558 case OP_TYPEEXACT:
7559 tempcode += PRIV(OP_lengths)[*tempcode] +
7560 ((tempcode[1 + IMM2_SIZE] == OP_PROP
7561 || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
7562 break;
7563
7564 /* CHAR opcodes are used for exacts whose count is 1. */
7565
7566 case OP_CHAR:
7567 case OP_CHARI:
7568 case OP_NOT:
7569 case OP_NOTI:
7570 case OP_EXACT:
7571 case OP_EXACTI:
7572 case OP_NOTEXACT:
7573 case OP_NOTEXACTI:
7574 tempcode += PRIV(OP_lengths)[*tempcode];
7575 #ifdef SUPPORT_UNICODE
7576 if (utf && HAS_EXTRALEN(tempcode[-1]))
7577 tempcode += GET_EXTRALEN(tempcode[-1]);
7578 #endif
7579 break;
7580
7581 /* For the class opcodes, the repeat operator appears at the end;
7582 adjust tempcode to point to it. */
7583
7584 case OP_CLASS:
7585 case OP_NCLASS:
7586 tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
7587 break;
7588
7589 #ifdef SUPPORT_WIDE_CHARS
7590 case OP_XCLASS:
7591 tempcode += GET(tempcode, 1);
7592 break;
7593 #endif
7594 }
7595
7596 /* If tempcode is equal to code (which points to the end of the repeated
7597 item), it means we have skipped an EXACT item but there is no following
7598 QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
7599 all other cases, tempcode will be pointing to the repeat opcode, and will
7600 be less than code, so the value of len will be greater than 0. */
7601
7602 len = (int)(code - tempcode);
7603 if (len > 0)
7604 {
7605 unsigned int repcode = *tempcode;
7606
7607 /* There is a table for possessifying opcodes, all of which are less
7608 than OP_CALLOUT. A zero entry means there is no possessified version.
7609 */
7610
7611 if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
7612 *tempcode = opcode_possessify[repcode];
7613
7614 /* For opcode without a special possessified version, wrap the item in
7615 ONCE brackets. */
7616
7617 else
7618 {
7619 (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
7620 code += 1 + LINK_SIZE;
7621 len += 1 + LINK_SIZE;
7622 tempcode[0] = OP_ONCE;
7623 *code++ = OP_KET;
7624 PUTINC(code, 0, len);
7625 PUT(tempcode, 1, len);
7626 }
7627 }
7628 }
7629
7630 /* We set the "follows varying string" flag for subsequently encountered
7631 reqcus if it isn't already set and we have just passed a varying length
7632 item. */
7633
7634 END_REPEAT:
7635 cb->req_varyopt |= reqvary;
7636 break;
7637
7638
7639 /* ===================================================================*/
7640 /* Handle a 32-bit data character with a value greater than META_END. */
7641
7642 case META_BIGVALUE:
7643 pptr++;
7644 goto NORMAL_CHAR;
7645
7646
7647 /* ===============================================================*/
7648 /* Handle a back reference by number, which is the meta argument. The
7649 pattern offsets for back references to group numbers less than 10 are held
7650 in a special vector, to avoid using more than two parsed pattern elements
7651 in 64-bit environments. We only need the offset to the first occurrence,
7652 because if that doesn't fail, subsequent ones will also be OK. */
7653
7654 case META_BACKREF:
7655 if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
7656 else GETPLUSOFFSET(offset, pptr);
7657
7658 if (meta_arg > cb->bracount)
7659 {
7660 cb->erroroffset = offset;
7661 *errorcodeptr = ERR15; /* Non-existent subpattern */
7662 return 0;
7663 }
7664
7665 /* Come here from named backref handling when the reference is to a
7666 single group (that is, not to a duplicated name). The back reference
7667 data will have already been updated. We must disable firstcu if not
7668 set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
7669 later. */
7670
7671 HANDLE_SINGLE_REFERENCE:
7672 if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
7673 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
7674 PUT2INC(code, 0, meta_arg);
7675
7676 /* Update the map of back references, and keep the highest one. We
7677 could do this in parse_regex() for numerical back references, but not
7678 for named back references, because we don't know the numbers to which
7679 named back references refer. So we do it all in this function. */
7680
7681 cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
7682 if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
7683 break;
7684
7685
7686 /* ===============================================================*/
7687 /* Handle recursion by inserting the number of the called group (which is
7688 the meta argument) after OP_RECURSE. At the end of compiling the pattern is
7689 scanned and these numbers are replaced by offsets within the pattern. It is
7690 done like this to avoid problems with forward references and adjusting
7691 offsets when groups are duplicated and moved (as discovered in previous
7692 implementations). Note that a recursion does not have a set first
7693 character. */
7694
7695 case META_RECURSE:
7696 GETPLUSOFFSET(offset, pptr);
7697 if (meta_arg > cb->bracount)
7698 {
7699 cb->erroroffset = offset;
7700 *errorcodeptr = ERR15; /* Non-existent subpattern */
7701 return 0;
7702 }
7703 HANDLE_NUMERICAL_RECURSION:
7704 *code = OP_RECURSE;
7705 PUT(code, 1, meta_arg);
7706 code += 1 + LINK_SIZE;
7707 groupsetfirstcu = FALSE;
7708 cb->had_recurse = TRUE;
7709 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7710 zerofirstcu = firstcu;
7711 zerofirstcuflags = firstcuflags;
7712 break;
7713
7714
7715 /* ===============================================================*/
7716 /* Handle capturing parentheses; the number is the meta argument. */
7717
7718 case META_CAPTURE:
7719 bravalue = OP_CBRA;
7720 skipunits = IMM2_SIZE;
7721 PUT2(code, 1+LINK_SIZE, meta_arg);
7722 cb->lastcapture = meta_arg;
7723 goto GROUP_PROCESS_NOTE_EMPTY;
7724
7725
7726 /* ===============================================================*/
7727 /* Handle escape sequence items. For ones like \d, the ESC_values are
7728 arranged to be the same as the corresponding OP_values in the default case
7729 when PCRE2_UCP is not set (which is the only case in which they will appear
7730 here).
7731
7732 Note: \Q and \E are never seen here, as they were dealt with in
7733 parse_pattern(). Neither are numerical back references or recursions, which
7734 were turned into META_BACKREF or META_RECURSE items, respectively. \k and
7735 \g, when followed by names, are turned into META_BACKREF_BYNAME or
7736 META_RECURSE_BYNAME. */
7737
7738 case META_ESCAPE:
7739
7740 /* We can test for escape sequences that consume a character because their
7741 values lie between ESC_b and ESC_Z; this may have to change if any new ones
7742 are ever created. For these sequences, we disable the setting of a first
7743 character if it hasn't already been set. */
7744
7745 if (meta_arg > ESC_b && meta_arg < ESC_Z)
7746 {
7747 matched_char = TRUE;
7748 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7749 }
7750
7751 /* Set values to reset to if this is followed by a zero repeat. */
7752
7753 zerofirstcu = firstcu;
7754 zerofirstcuflags = firstcuflags;
7755 zeroreqcu = reqcu;
7756 zeroreqcuflags = reqcuflags;
7757
7758 /* If Unicode is not supported, \P and \p are not allowed and are
7759 faulted at parse time, so will never appear here. */
7760
7761 #ifdef SUPPORT_UNICODE
7762 if (meta_arg == ESC_P || meta_arg == ESC_p)
7763 {
7764 uint32_t ptype = *(++pptr) >> 16;
7765 uint32_t pdata = *pptr & 0xffff;
7766
7767 /* The special case of \p{Any} is compiled to OP_ALLANY so as to benefit
7768 from the auto-anchoring code. */
7769
7770 if (meta_arg == ESC_p && ptype == PT_ANY)
7771 {
7772 *code++ = OP_ALLANY;
7773 }
7774 else
7775 {
7776 *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
7777 *code++ = ptype;
7778 *code++ = pdata;
7779 }
7780 break; /* End META_ESCAPE */
7781 }
7782 #endif
7783
7784 /* For the rest (including \X when Unicode is supported - if not it's
7785 faulted at parse time), the OP value is the escape value when PCRE2_UCP is
7786 not set; if it is set, these escapes do not show up here because they are
7787 converted into Unicode property tests in parse_regex(). Note that \b and \B
7788 do a one-character lookbehind, and \A also behaves as if it does. */
7789
7790 if (meta_arg == ESC_C) cb->external_flags |= PCRE2_HASBKC; /* Record */
7791 if ((meta_arg == ESC_b || meta_arg == ESC_B || meta_arg == ESC_A) &&
7792 cb->max_lookbehind == 0)
7793 cb->max_lookbehind = 1;
7794
7795 /* In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
7796 instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds. */
7797
7798 #if PCRE2_CODE_UNIT_WIDTH == 32
7799 *code++ = (meta_arg == ESC_C)? OP_ALLANY : meta_arg;
7800 #else
7801 *code++ = (!utf && meta_arg == ESC_C)? OP_ALLANY : meta_arg;
7802 #endif
7803 break; /* End META_ESCAPE */
7804
7805
7806 /* ===================================================================*/
7807 /* Handle an unrecognized meta value. A parsed pattern value less than
7808 META_END is a literal. Otherwise we have a problem. */
7809
7810 default:
7811 if (meta >= META_END)
7812 {
7813 #ifdef DEBUG_SHOW_PARSED
7814 fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x\n", *pptr);
7815 #endif
7816 *errorcodeptr = ERR89; /* Internal error - unrecognized. */
7817 return 0;
7818 }
7819
7820 /* Handle a literal character. We come here by goto in the case of a
7821 32-bit, non-UTF character whose value is greater than META_END. */
7822
7823 NORMAL_CHAR:
7824 meta = *pptr; /* Get the full 32 bits */
7825 NORMAL_CHAR_SET: /* Character is already in meta */
7826 matched_char = TRUE;
7827
7828 /* For caseless UTF or UCP mode, check whether this character has more than
7829 one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
7830 */
7831
7832 #ifdef SUPPORT_UNICODE
7833 if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
7834 {
7835 uint32_t caseset = UCD_CASESET(meta);
7836 if (caseset != 0)
7837 {
7838 *code++ = OP_PROP;
7839 *code++ = PT_CLIST;
7840 *code++ = caseset;
7841 if (firstcuflags == REQ_UNSET)
7842 firstcuflags = zerofirstcuflags = REQ_NONE;
7843 break; /* End handling this meta item */
7844 }
7845 }
7846 #endif
7847
7848 /* Caseful matches, or caseless and not one of the multicase characters. We
7849 come here by goto in the case of a positive class that contains only
7850 case-partners of a character with just two cases; matched_char has already
7851 been set TRUE and options fudged if necessary. */
7852
7853 CLASS_CASELESS_CHAR:
7854
7855 /* Get the character's code units into mcbuffer, with the length in
7856 mclength. When not in UTF mode, the length is always 1. */
7857
7858 #ifdef SUPPORT_UNICODE
7859 if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
7860 #endif
7861 {
7862 mclength = 1;
7863 mcbuffer[0] = meta;
7864 }
7865
7866 /* Generate the appropriate code */
7867
7868 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
7869 memcpy(code, mcbuffer, CU2BYTES(mclength));
7870 code += mclength;
7871
7872 /* Remember if \r or \n were seen */
7873
7874 if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
7875 cb->external_flags |= PCRE2_HASCRORLF;
7876
7877 /* Set the first and required code units appropriately. If no previous
7878 first code unit, set it from this character, but revert to none on a zero
7879 repeat. Otherwise, leave the firstcu value alone, and don't change it on
7880 a zero repeat. */
7881
7882 if (firstcuflags == REQ_UNSET)
7883 {
7884 zerofirstcuflags = REQ_NONE;
7885 zeroreqcu = reqcu;
7886 zeroreqcuflags = reqcuflags;
7887
7888 /* If the character is more than one code unit long, we can set a single
7889 firstcu only if it is not to be matched caselessly. Multiple possible
7890 starting code units may be picked up later in the studying code. */
7891
7892 if (mclength == 1 || req_caseopt == 0)
7893 {
7894 firstcu = mcbuffer[0];
7895 firstcuflags = req_caseopt;
7896 if (mclength != 1)
7897 {
7898 reqcu = code[-1];
7899 reqcuflags = cb->req_varyopt;
7900 }
7901 }
7902 else firstcuflags = reqcuflags = REQ_NONE;
7903 }
7904
7905 /* firstcu was previously set; we can set reqcu only if the length is
7906 1 or the matching is caseful. */
7907
7908 else
7909 {
7910 zerofirstcu = firstcu;
7911 zerofirstcuflags = firstcuflags;
7912 zeroreqcu = reqcu;
7913 zeroreqcuflags = reqcuflags;
7914 if (mclength == 1 || req_caseopt == 0)
7915 {
7916 reqcu = code[-1];
7917 reqcuflags = req_caseopt | cb->req_varyopt;
7918 }
7919 }
7920
7921 /* If caselessness was temporarily instated, reset it. */
7922
7923 if (reset_caseful)
7924 {
7925 options &= ~PCRE2_CASELESS;
7926 req_caseopt = 0;
7927 reset_caseful = FALSE;
7928 }
7929
7930 break; /* End literal character handling */
7931 } /* End of big switch */
7932 } /* End of big loop */
7933
7934 /* Control never reaches here. */
7935 }
7936
7937
7938
7939 /*************************************************
7940 * Compile regex: a sequence of alternatives *
7941 *************************************************/
7942
7943 /* On entry, pptr is pointing past the bracket meta, but on return it points to
7944 the closing bracket or META_END. The code variable is pointing at the code unit
7945 into which the BRA operator has been stored. This function is used during the
7946 pre-compile phase when we are trying to find out the amount of memory needed,
7947 as well as during the real compile phase. The value of lengthptr distinguishes
7948 the two phases.
7949
7950 Arguments:
7951 options option bits, including any changes for this subpattern
7952 codeptr -> the address of the current code pointer
7953 pptrptr -> the address of the current parsed pattern pointer
7954 errorcodeptr -> pointer to error code variable
7955 skipunits skip this many code units at start (for brackets and OP_COND)
7956 firstcuptr place to put the first required code unit
7957 firstcuflagsptr place to put the first code unit flags, or a negative number
7958 reqcuptr place to put the last required code unit
7959 reqcuflagsptr place to put the last required code unit flags, or a negative number
7960 bcptr pointer to the chain of currently open branches
7961 cb points to the data block with tables pointers etc.
7962 lengthptr NULL during the real compile phase
7963 points to length accumulator during pre-compile phase
7964
7965 Returns: 0 There has been an error
7966 +1 Success, this group must match at least one character
7967 -1 Success, this group may match an empty string
7968 */
7969
7970 static int
compile_regex(uint32_t options,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t skipunits,uint32_t * firstcuptr,int32_t * firstcuflagsptr,uint32_t * reqcuptr,int32_t * reqcuflagsptr,branch_chain * bcptr,compile_block * cb,PCRE2_SIZE * lengthptr)7971 compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
7972 int *errorcodeptr, uint32_t skipunits, uint32_t *firstcuptr,
7973 int32_t *firstcuflagsptr, uint32_t *reqcuptr,int32_t *reqcuflagsptr,
7974 branch_chain *bcptr, compile_block *cb, PCRE2_SIZE *lengthptr)
7975 {
7976 PCRE2_UCHAR *code = *codeptr;
7977 PCRE2_UCHAR *last_branch = code;
7978 PCRE2_UCHAR *start_bracket = code;
7979 BOOL lookbehind;
7980 open_capitem capitem;
7981 int capnumber = 0;
7982 int okreturn = 1;
7983 uint32_t *pptr = *pptrptr;
7984 uint32_t firstcu, reqcu;
7985 uint32_t lookbehindlength;
7986 int32_t firstcuflags, reqcuflags;
7987 uint32_t branchfirstcu, branchreqcu;
7988 int32_t branchfirstcuflags, branchreqcuflags;
7989 PCRE2_SIZE length;
7990 branch_chain bc;
7991
7992 /* If set, call the external function that checks for stack availability. */
7993
7994 if (cb->cx->stack_guard != NULL &&
7995 cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
7996 {
7997 *errorcodeptr= ERR33;
7998 return 0;
7999 }
8000
8001 /* Miscellaneous initialization */
8002
8003 bc.outer = bcptr;
8004 bc.current_branch = code;
8005
8006 firstcu = reqcu = 0;
8007 firstcuflags = reqcuflags = REQ_UNSET;
8008
8009 /* Accumulate the length for use in the pre-compile phase. Start with the
8010 length of the BRA and KET and any extra code units that are required at the
8011 beginning. We accumulate in a local variable to save frequent testing of
8012 lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
8013 start and end of each alternative, because compiled items are discarded during
8014 the pre-compile phase so that the workspace is not exceeded. */
8015
8016 length = 2 + 2*LINK_SIZE + skipunits;
8017
8018 /* Remember if this is a lookbehind assertion, and if it is, save its length
8019 and skip over the pattern offset. */
8020
8021 lookbehind = *code == OP_ASSERTBACK ||
8022 *code == OP_ASSERTBACK_NOT ||
8023 *code == OP_ASSERTBACK_NA;
8024
8025 if (lookbehind)
8026 {
8027 lookbehindlength = META_DATA(pptr[-1]);
8028 pptr += SIZEOFFSET;
8029 }
8030 else lookbehindlength = 0;
8031
8032 /* If this is a capturing subpattern, add to the chain of open capturing items
8033 so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
8034 need be tested here; changing this opcode to one of its variants, e.g.
8035 OP_SCBRAPOS, happens later, after the group has been compiled. */
8036
8037 if (*code == OP_CBRA)
8038 {
8039 capnumber = GET2(code, 1 + LINK_SIZE);
8040 capitem.number = capnumber;
8041 capitem.next = cb->open_caps;
8042 capitem.assert_depth = cb->assert_depth;
8043 cb->open_caps = &capitem;
8044 }
8045
8046 /* Offset is set zero to mark that this bracket is still open */
8047
8048 PUT(code, 1, 0);
8049 code += 1 + LINK_SIZE + skipunits;
8050
8051 /* Loop for each alternative branch */
8052
8053 for (;;)
8054 {
8055 int branch_return;
8056
8057 /* Insert OP_REVERSE if this is as lookbehind assertion. */
8058
8059 if (lookbehind && lookbehindlength > 0)
8060 {
8061 *code++ = OP_REVERSE;
8062 PUTINC(code, 0, lookbehindlength);
8063 length += 1 + LINK_SIZE;
8064 }
8065
8066 /* Now compile the branch; in the pre-compile phase its length gets added
8067 into the length. */
8068
8069 if ((branch_return =
8070 compile_branch(&options, &code, &pptr, errorcodeptr, &branchfirstcu,
8071 &branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc,
8072 cb, (lengthptr == NULL)? NULL : &length)) == 0)
8073 return 0;
8074
8075 /* If a branch can match an empty string, so can the whole group. */
8076
8077 if (branch_return < 0) okreturn = -1;
8078
8079 /* In the real compile phase, there is some post-processing to be done. */
8080
8081 if (lengthptr == NULL)
8082 {
8083 /* If this is the first branch, the firstcu and reqcu values for the
8084 branch become the values for the regex. */
8085
8086 if (*last_branch != OP_ALT)
8087 {
8088 firstcu = branchfirstcu;
8089 firstcuflags = branchfirstcuflags;
8090 reqcu = branchreqcu;
8091 reqcuflags = branchreqcuflags;
8092 }
8093
8094 /* If this is not the first branch, the first char and reqcu have to
8095 match the values from all the previous branches, except that if the
8096 previous value for reqcu didn't have REQ_VARY set, it can still match,
8097 and we set REQ_VARY for the group from this branch's value. */
8098
8099 else
8100 {
8101 /* If we previously had a firstcu, but it doesn't match the new branch,
8102 we have to abandon the firstcu for the regex, but if there was
8103 previously no reqcu, it takes on the value of the old firstcu. */
8104
8105 if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
8106 {
8107 if (firstcuflags >= 0)
8108 {
8109 if (reqcuflags < 0)
8110 {
8111 reqcu = firstcu;
8112 reqcuflags = firstcuflags;
8113 }
8114 }
8115 firstcuflags = REQ_NONE;
8116 }
8117
8118 /* If we (now or from before) have no firstcu, a firstcu from the
8119 branch becomes a reqcu if there isn't a branch reqcu. */
8120
8121 if (firstcuflags < 0 && branchfirstcuflags >= 0 &&
8122 branchreqcuflags < 0)
8123 {
8124 branchreqcu = branchfirstcu;
8125 branchreqcuflags = branchfirstcuflags;
8126 }
8127
8128 /* Now ensure that the reqcus match */
8129
8130 if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
8131 reqcu != branchreqcu)
8132 reqcuflags = REQ_NONE;
8133 else
8134 {
8135 reqcu = branchreqcu;
8136 reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
8137 }
8138 }
8139 }
8140
8141 /* Handle reaching the end of the expression, either ')' or end of pattern.
8142 In the real compile phase, go back through the alternative branches and
8143 reverse the chain of offsets, with the field in the BRA item now becoming an
8144 offset to the first alternative. If there are no alternatives, it points to
8145 the end of the group. The length in the terminating ket is always the length
8146 of the whole bracketed item. Return leaving the pointer at the terminating
8147 char. */
8148
8149 if (META_CODE(*pptr) != META_ALT)
8150 {
8151 if (lengthptr == NULL)
8152 {
8153 PCRE2_SIZE branch_length = code - last_branch;
8154 do
8155 {
8156 PCRE2_SIZE prev_length = GET(last_branch, 1);
8157 PUT(last_branch, 1, branch_length);
8158 branch_length = prev_length;
8159 last_branch -= branch_length;
8160 }
8161 while (branch_length > 0);
8162 }
8163
8164 /* Fill in the ket */
8165
8166 *code = OP_KET;
8167 PUT(code, 1, (int)(code - start_bracket));
8168 code += 1 + LINK_SIZE;
8169
8170 /* If it was a capturing subpattern, remove the block from the chain. */
8171
8172 if (capnumber > 0) cb->open_caps = cb->open_caps->next;
8173
8174 /* Set values to pass back */
8175
8176 *codeptr = code;
8177 *pptrptr = pptr;
8178 *firstcuptr = firstcu;
8179 *firstcuflagsptr = firstcuflags;
8180 *reqcuptr = reqcu;
8181 *reqcuflagsptr = reqcuflags;
8182 if (lengthptr != NULL)
8183 {
8184 if (OFLOW_MAX - *lengthptr < length)
8185 {
8186 *errorcodeptr = ERR20;
8187 return 0;
8188 }
8189 *lengthptr += length;
8190 }
8191 return okreturn;
8192 }
8193
8194 /* Another branch follows. In the pre-compile phase, we can move the code
8195 pointer back to where it was for the start of the first branch. (That is,
8196 pretend that each branch is the only one.)
8197
8198 In the real compile phase, insert an ALT node. Its length field points back
8199 to the previous branch while the bracket remains open. At the end the chain
8200 is reversed. It's done like this so that the start of the bracket has a
8201 zero offset until it is closed, making it possible to detect recursion. */
8202
8203 if (lengthptr != NULL)
8204 {
8205 code = *codeptr + 1 + LINK_SIZE + skipunits;
8206 length += 1 + LINK_SIZE;
8207 }
8208 else
8209 {
8210 *code = OP_ALT;
8211 PUT(code, 1, (int)(code - last_branch));
8212 bc.current_branch = last_branch = code;
8213 code += 1 + LINK_SIZE;
8214 }
8215
8216 /* Set the lookbehind length (if not in a lookbehind the value will be zero)
8217 and then advance past the vertical bar. */
8218
8219 lookbehindlength = META_DATA(*pptr);
8220 pptr++;
8221 }
8222 /* Control never reaches here */
8223 }
8224
8225
8226
8227 /*************************************************
8228 * Check for anchored pattern *
8229 *************************************************/
8230
8231 /* Try to find out if this is an anchored regular expression. Consider each
8232 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8233 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8234 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8235 be found, because ^ generates OP_CIRCM in that mode.
8236
8237 We can also consider a regex to be anchored if OP_SOM starts all its branches.
8238 This is the code for \G, which means "match at start of match position, taking
8239 into account the match offset".
8240
8241 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8242 because that will try the rest of the pattern at all possible matching points,
8243 so there is no point trying again.... er ....
8244
8245 .... except when the .* appears inside capturing parentheses, and there is a
8246 subsequent back reference to those parentheses. We haven't enough information
8247 to catch that case precisely.
8248
8249 At first, the best we could do was to detect when .* was in capturing brackets
8250 and the highest back reference was greater than or equal to that level.
8251 However, by keeping a bitmap of the first 31 back references, we can catch some
8252 of the more common cases more precisely.
8253
8254 ... A second exception is when the .* appears inside an atomic group, because
8255 this prevents the number of characters it matches from being adjusted.
8256
8257 Arguments:
8258 code points to start of the compiled pattern
8259 bracket_map a bitmap of which brackets we are inside while testing; this
8260 handles up to substring 31; after that we just have to take
8261 the less precise approach
8262 cb points to the compile data block
8263 atomcount atomic group level
8264 inassert TRUE if in an assertion
8265
8266 Returns: TRUE or FALSE
8267 */
8268
8269 static BOOL
is_anchored(PCRE2_SPTR code,unsigned int bracket_map,compile_block * cb,int atomcount,BOOL inassert)8270 is_anchored(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8271 int atomcount, BOOL inassert)
8272 {
8273 do {
8274 PCRE2_SPTR scode = first_significant_code(
8275 code + PRIV(OP_lengths)[*code], FALSE);
8276 int op = *scode;
8277
8278 /* Non-capturing brackets */
8279
8280 if (op == OP_BRA || op == OP_BRAPOS ||
8281 op == OP_SBRA || op == OP_SBRAPOS)
8282 {
8283 if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8284 return FALSE;
8285 }
8286
8287 /* Capturing brackets */
8288
8289 else if (op == OP_CBRA || op == OP_CBRAPOS ||
8290 op == OP_SCBRA || op == OP_SCBRAPOS)
8291 {
8292 int n = GET2(scode, 1+LINK_SIZE);
8293 int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8294 if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
8295 }
8296
8297 /* Positive forward assertion */
8298
8299 else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8300 {
8301 if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8302 }
8303
8304 /* Condition. If there is no second branch, it can't be anchored. */
8305
8306 else if (op == OP_COND || op == OP_SCOND)
8307 {
8308 if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8309 if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8310 return FALSE;
8311 }
8312
8313 /* Atomic groups */
8314
8315 else if (op == OP_ONCE)
8316 {
8317 if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert))
8318 return FALSE;
8319 }
8320
8321 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8322 it isn't in brackets that are or may be referenced or inside an atomic
8323 group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
8324 because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
8325 with the subject "aab", which matches "b", i.e. not at the start of a line.
8326 There is also an option that disables auto-anchoring. */
8327
8328 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8329 op == OP_TYPEPOSSTAR))
8330 {
8331 if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
8332 atomcount > 0 || cb->had_pruneorskip || inassert ||
8333 (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8334 return FALSE;
8335 }
8336
8337 /* Check for explicit anchoring */
8338
8339 else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8340
8341 code += GET(code, 1);
8342 }
8343 while (*code == OP_ALT); /* Loop for each alternative */
8344 return TRUE;
8345 }
8346
8347
8348
8349 /*************************************************
8350 * Check for starting with ^ or .* *
8351 *************************************************/
8352
8353 /* This is called to find out if every branch starts with ^ or .* so that
8354 "first char" processing can be done to speed things up in multiline
8355 matching and for non-DOTALL patterns that start with .* (which must start at
8356 the beginning or after \n). As in the case of is_anchored() (see above), we
8357 have to take account of back references to capturing brackets that contain .*
8358 because in that case we can't make the assumption. Also, the appearance of .*
8359 inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8360 or *SKIP does not count, because once again the assumption no longer holds.
8361
8362 Arguments:
8363 code points to start of the compiled pattern or a group
8364 bracket_map a bitmap of which brackets we are inside while testing; this
8365 handles up to substring 31; after that we just have to take
8366 the less precise approach
8367 cb points to the compile data
8368 atomcount atomic group level
8369 inassert TRUE if in an assertion
8370
8371 Returns: TRUE or FALSE
8372 */
8373
8374 static BOOL
is_startline(PCRE2_SPTR code,unsigned int bracket_map,compile_block * cb,int atomcount,BOOL inassert)8375 is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8376 int atomcount, BOOL inassert)
8377 {
8378 do {
8379 PCRE2_SPTR scode = first_significant_code(
8380 code + PRIV(OP_lengths)[*code], FALSE);
8381 int op = *scode;
8382
8383 /* If we are at the start of a conditional assertion group, *both* the
8384 conditional assertion *and* what follows the condition must satisfy the test
8385 for start of line. Other kinds of condition fail. Note that there may be an
8386 auto-callout at the start of a condition. */
8387
8388 if (op == OP_COND)
8389 {
8390 scode += 1 + LINK_SIZE;
8391
8392 if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8393 else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
8394
8395 switch (*scode)
8396 {
8397 case OP_CREF:
8398 case OP_DNCREF:
8399 case OP_RREF:
8400 case OP_DNRREF:
8401 case OP_FAIL:
8402 case OP_FALSE:
8403 case OP_TRUE:
8404 return FALSE;
8405
8406 default: /* Assertion */
8407 if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8408 do scode += GET(scode, 1); while (*scode == OP_ALT);
8409 scode += 1 + LINK_SIZE;
8410 break;
8411 }
8412 scode = first_significant_code(scode, FALSE);
8413 op = *scode;
8414 }
8415
8416 /* Non-capturing brackets */
8417
8418 if (op == OP_BRA || op == OP_BRAPOS ||
8419 op == OP_SBRA || op == OP_SBRAPOS)
8420 {
8421 if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
8422 return FALSE;
8423 }
8424
8425 /* Capturing brackets */
8426
8427 else if (op == OP_CBRA || op == OP_CBRAPOS ||
8428 op == OP_SCBRA || op == OP_SCBRAPOS)
8429 {
8430 int n = GET2(scode, 1+LINK_SIZE);
8431 int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8432 if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
8433 }
8434
8435 /* Positive forward assertions */
8436
8437 else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8438 {
8439 if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
8440 return FALSE;
8441 }
8442
8443 /* Atomic brackets */
8444
8445 else if (op == OP_ONCE)
8446 {
8447 if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
8448 return FALSE;
8449 }
8450
8451 /* .* means "start at start or after \n" if it isn't in atomic brackets or
8452 brackets that may be referenced or an assertion, and as long as the pattern
8453 does not contain *PRUNE or *SKIP, because these break the feature. Consider,
8454 for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
8455 i.e. not at the start of a line. There is also an option that disables this
8456 optimization. */
8457
8458 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8459 {
8460 if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
8461 atomcount > 0 || cb->had_pruneorskip || inassert ||
8462 (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8463 return FALSE;
8464 }
8465
8466 /* Check for explicit circumflex; anything else gives a FALSE result. Note
8467 in particular that this includes atomic brackets OP_ONCE because the number
8468 of characters matched by .* cannot be adjusted inside them. */
8469
8470 else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8471
8472 /* Move on to the next alternative */
8473
8474 code += GET(code, 1);
8475 }
8476 while (*code == OP_ALT); /* Loop for each alternative */
8477 return TRUE;
8478 }
8479
8480
8481
8482 /*************************************************
8483 * Scan compiled regex for recursion reference *
8484 *************************************************/
8485
8486 /* This function scans through a compiled pattern until it finds an instance of
8487 OP_RECURSE.
8488
8489 Arguments:
8490 code points to start of expression
8491 utf TRUE in UTF mode
8492
8493 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
8494 */
8495
8496 static PCRE2_SPTR
find_recurse(PCRE2_SPTR code,BOOL utf)8497 find_recurse(PCRE2_SPTR code, BOOL utf)
8498 {
8499 for (;;)
8500 {
8501 PCRE2_UCHAR c = *code;
8502 if (c == OP_END) return NULL;
8503 if (c == OP_RECURSE) return code;
8504
8505 /* XCLASS is used for classes that cannot be represented just by a bit map.
8506 This includes negated single high-valued characters. CALLOUT_STR is used for
8507 callouts with string arguments. In both cases the length in the table is
8508 zero; the actual length is stored in the compiled code. */
8509
8510 if (c == OP_XCLASS) code += GET(code, 1);
8511 else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
8512
8513 /* Otherwise, we can get the item's length from the table, except that for
8514 repeated character types, we have to test for \p and \P, which have an extra
8515 two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
8516 we must add in its length. */
8517
8518 else
8519 {
8520 switch(c)
8521 {
8522 case OP_TYPESTAR:
8523 case OP_TYPEMINSTAR:
8524 case OP_TYPEPLUS:
8525 case OP_TYPEMINPLUS:
8526 case OP_TYPEQUERY:
8527 case OP_TYPEMINQUERY:
8528 case OP_TYPEPOSSTAR:
8529 case OP_TYPEPOSPLUS:
8530 case OP_TYPEPOSQUERY:
8531 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
8532 break;
8533
8534 case OP_TYPEPOSUPTO:
8535 case OP_TYPEUPTO:
8536 case OP_TYPEMINUPTO:
8537 case OP_TYPEEXACT:
8538 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
8539 code += 2;
8540 break;
8541
8542 case OP_MARK:
8543 case OP_COMMIT_ARG:
8544 case OP_PRUNE_ARG:
8545 case OP_SKIP_ARG:
8546 case OP_THEN_ARG:
8547 code += code[1];
8548 break;
8549 }
8550
8551 /* Add in the fixed length from the table */
8552
8553 code += PRIV(OP_lengths)[c];
8554
8555 /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
8556 be followed by a multi-unit character. The length in the table is a
8557 minimum, so we have to arrange to skip the extra units. */
8558
8559 #ifdef MAYBE_UTF_MULTI
8560 if (utf) switch(c)
8561 {
8562 case OP_CHAR:
8563 case OP_CHARI:
8564 case OP_NOT:
8565 case OP_NOTI:
8566 case OP_EXACT:
8567 case OP_EXACTI:
8568 case OP_NOTEXACT:
8569 case OP_NOTEXACTI:
8570 case OP_UPTO:
8571 case OP_UPTOI:
8572 case OP_NOTUPTO:
8573 case OP_NOTUPTOI:
8574 case OP_MINUPTO:
8575 case OP_MINUPTOI:
8576 case OP_NOTMINUPTO:
8577 case OP_NOTMINUPTOI:
8578 case OP_POSUPTO:
8579 case OP_POSUPTOI:
8580 case OP_NOTPOSUPTO:
8581 case OP_NOTPOSUPTOI:
8582 case OP_STAR:
8583 case OP_STARI:
8584 case OP_NOTSTAR:
8585 case OP_NOTSTARI:
8586 case OP_MINSTAR:
8587 case OP_MINSTARI:
8588 case OP_NOTMINSTAR:
8589 case OP_NOTMINSTARI:
8590 case OP_POSSTAR:
8591 case OP_POSSTARI:
8592 case OP_NOTPOSSTAR:
8593 case OP_NOTPOSSTARI:
8594 case OP_PLUS:
8595 case OP_PLUSI:
8596 case OP_NOTPLUS:
8597 case OP_NOTPLUSI:
8598 case OP_MINPLUS:
8599 case OP_MINPLUSI:
8600 case OP_NOTMINPLUS:
8601 case OP_NOTMINPLUSI:
8602 case OP_POSPLUS:
8603 case OP_POSPLUSI:
8604 case OP_NOTPOSPLUS:
8605 case OP_NOTPOSPLUSI:
8606 case OP_QUERY:
8607 case OP_QUERYI:
8608 case OP_NOTQUERY:
8609 case OP_NOTQUERYI:
8610 case OP_MINQUERY:
8611 case OP_MINQUERYI:
8612 case OP_NOTMINQUERY:
8613 case OP_NOTMINQUERYI:
8614 case OP_POSQUERY:
8615 case OP_POSQUERYI:
8616 case OP_NOTPOSQUERY:
8617 case OP_NOTPOSQUERYI:
8618 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
8619 break;
8620 }
8621 #else
8622 (void)(utf); /* Keep compiler happy by referencing function argument */
8623 #endif /* MAYBE_UTF_MULTI */
8624 }
8625 }
8626 }
8627
8628
8629
8630 /*************************************************
8631 * Check for asserted fixed first code unit *
8632 *************************************************/
8633
8634 /* During compilation, the "first code unit" settings from forward assertions
8635 are discarded, because they can cause conflicts with actual literals that
8636 follow. However, if we end up without a first code unit setting for an
8637 unanchored pattern, it is worth scanning the regex to see if there is an
8638 initial asserted first code unit. If all branches start with the same asserted
8639 code unit, or with a non-conditional bracket all of whose alternatives start
8640 with the same asserted code unit (recurse ad lib), then we return that code
8641 unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
8642 REQ_NONE in the flags.
8643
8644 Arguments:
8645 code points to start of compiled pattern
8646 flags points to the first code unit flags
8647 inassert non-zero if in an assertion
8648
8649 Returns: the fixed first code unit, or 0 with REQ_NONE in flags
8650 */
8651
8652 static uint32_t
find_firstassertedcu(PCRE2_SPTR code,int32_t * flags,uint32_t inassert)8653 find_firstassertedcu(PCRE2_SPTR code, int32_t *flags, uint32_t inassert)
8654 {
8655 uint32_t c = 0;
8656 int cflags = REQ_NONE;
8657
8658 *flags = REQ_NONE;
8659 do {
8660 uint32_t d;
8661 int dflags;
8662 int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
8663 *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
8664 PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
8665 PCRE2_UCHAR op = *scode;
8666
8667 switch(op)
8668 {
8669 default:
8670 return 0;
8671
8672 case OP_BRA:
8673 case OP_BRAPOS:
8674 case OP_CBRA:
8675 case OP_SCBRA:
8676 case OP_CBRAPOS:
8677 case OP_SCBRAPOS:
8678 case OP_ASSERT:
8679 case OP_ASSERT_NA:
8680 case OP_ONCE:
8681 case OP_SCRIPT_RUN:
8682 d = find_firstassertedcu(scode, &dflags, inassert +
8683 ((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
8684 if (dflags < 0)
8685 return 0;
8686 if (cflags < 0) { c = d; cflags = dflags; }
8687 else if (c != d || cflags != dflags) return 0;
8688 break;
8689
8690 case OP_EXACT:
8691 scode += IMM2_SIZE;
8692 /* Fall through */
8693
8694 case OP_CHAR:
8695 case OP_PLUS:
8696 case OP_MINPLUS:
8697 case OP_POSPLUS:
8698 if (inassert == 0) return 0;
8699 if (cflags < 0) { c = scode[1]; cflags = 0; }
8700 else if (c != scode[1]) return 0;
8701 break;
8702
8703 case OP_EXACTI:
8704 scode += IMM2_SIZE;
8705 /* Fall through */
8706
8707 case OP_CHARI:
8708 case OP_PLUSI:
8709 case OP_MINPLUSI:
8710 case OP_POSPLUSI:
8711 if (inassert == 0) return 0;
8712
8713 /* If the character is more than one code unit long, we cannot set its
8714 first code unit when matching caselessly. Later scanning may pick up
8715 multiple code units. */
8716
8717 #ifdef SUPPORT_UNICODE
8718 #if PCRE2_CODE_UNIT_WIDTH == 8
8719 if (scode[1] >= 0x80) return 0;
8720 #elif PCRE2_CODE_UNIT_WIDTH == 16
8721 if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;
8722 #endif
8723 #endif
8724
8725 if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
8726 else if (c != scode[1]) return 0;
8727 break;
8728 }
8729
8730 code += GET(code, 1);
8731 }
8732 while (*code == OP_ALT);
8733
8734 *flags = cflags;
8735 return c;
8736 }
8737
8738
8739
8740 /*************************************************
8741 * Add an entry to the name/number table *
8742 *************************************************/
8743
8744 /* This function is called between compiling passes to add an entry to the
8745 name/number table, maintaining alphabetical order. Checking for permitted
8746 and forbidden duplicates has already been done.
8747
8748 Arguments:
8749 cb the compile data block
8750 name the name to add
8751 length the length of the name
8752 groupno the group number
8753 tablecount the count of names in the table so far
8754
8755 Returns: nothing
8756 */
8757
8758 static void
add_name_to_table(compile_block * cb,PCRE2_SPTR name,int length,unsigned int groupno,uint32_t tablecount)8759 add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
8760 unsigned int groupno, uint32_t tablecount)
8761 {
8762 uint32_t i;
8763 PCRE2_UCHAR *slot = cb->name_table;
8764
8765 for (i = 0; i < tablecount; i++)
8766 {
8767 int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
8768 if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8769 crc = -1; /* Current name is a substring */
8770
8771 /* Make space in the table and break the loop for an earlier name. For a
8772 duplicate or later name, carry on. We do this for duplicates so that in the
8773 simple case (when ?(| is not used) they are in order of their numbers. In all
8774 cases they are in the order in which they appear in the pattern. */
8775
8776 if (crc < 0)
8777 {
8778 (void)memmove(slot + cb->name_entry_size, slot,
8779 CU2BYTES((tablecount - i) * cb->name_entry_size));
8780 break;
8781 }
8782
8783 /* Continue the loop for a later or duplicate name */
8784
8785 slot += cb->name_entry_size;
8786 }
8787
8788 PUT2(slot, 0, groupno);
8789 memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
8790
8791 /* Add a terminating zero and fill the rest of the slot with zeroes so that
8792 the memory is all initialized. Otherwise valgrind moans about uninitialized
8793 memory when saving serialized compiled patterns. */
8794
8795 memset(slot + IMM2_SIZE + length, 0,
8796 CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
8797 }
8798
8799
8800
8801 /*************************************************
8802 * Skip in parsed pattern *
8803 *************************************************/
8804
8805 /* This function is called to skip parts of the parsed pattern when finding the
8806 length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
8807 the end of the branch, it is called to skip over an internal lookaround or
8808 (DEFINE) group, and it is also called to skip to the end of a class, during
8809 which it will never encounter nested groups (but there's no need to have
8810 special code for that).
8811
8812 When called to find the end of a branch or group, pptr must point to the first
8813 meta code inside the branch, not the branch-starting code. In other cases it
8814 can point to the item that causes the function to be called.
8815
8816 Arguments:
8817 pptr current pointer to skip from
8818 skiptype PSKIP_CLASS when skipping to end of class
8819 PSKIP_ALT when META_ALT ends the skip
8820 PSKIP_KET when only META_KET ends the skip
8821
8822 Returns: new value of pptr
8823 NULL if META_END is reached - should never occur
8824 or for an unknown meta value - likewise
8825 */
8826
8827 static uint32_t *
parsed_skip(uint32_t * pptr,uint32_t skiptype)8828 parsed_skip(uint32_t *pptr, uint32_t skiptype)
8829 {
8830 uint32_t nestlevel = 0;
8831
8832 for (;; pptr++)
8833 {
8834 uint32_t meta = META_CODE(*pptr);
8835
8836 switch(meta)
8837 {
8838 default: /* Just skip over most items */
8839 if (meta < META_END) continue; /* Literal */
8840 break;
8841
8842 /* This should never occur. */
8843
8844 case META_END:
8845 return NULL;
8846
8847 /* The data for these items is variable in length. */
8848
8849 case META_BACKREF: /* Offset is present only if group >= 10 */
8850 if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
8851 break;
8852
8853 case META_ESCAPE: /* A few escapes are followed by data items. */
8854 switch (META_DATA(*pptr))
8855 {
8856 case ESC_P:
8857 case ESC_p:
8858 pptr += 1;
8859 break;
8860
8861 case ESC_g:
8862 case ESC_k:
8863 pptr += 1 + SIZEOFFSET;
8864 break;
8865 }
8866 break;
8867
8868 case META_MARK: /* Add the length of the name. */
8869 case META_COMMIT_ARG:
8870 case META_PRUNE_ARG:
8871 case META_SKIP_ARG:
8872 case META_THEN_ARG:
8873 pptr += pptr[1];
8874 break;
8875
8876 /* These are the "active" items in this loop. */
8877
8878 case META_CLASS_END:
8879 if (skiptype == PSKIP_CLASS) return pptr;
8880 break;
8881
8882 case META_ATOMIC:
8883 case META_CAPTURE:
8884 case META_COND_ASSERT:
8885 case META_COND_DEFINE:
8886 case META_COND_NAME:
8887 case META_COND_NUMBER:
8888 case META_COND_RNAME:
8889 case META_COND_RNUMBER:
8890 case META_COND_VERSION:
8891 case META_LOOKAHEAD:
8892 case META_LOOKAHEADNOT:
8893 case META_LOOKAHEAD_NA:
8894 case META_LOOKBEHIND:
8895 case META_LOOKBEHINDNOT:
8896 case META_LOOKBEHIND_NA:
8897 case META_NOCAPTURE:
8898 case META_SCRIPT_RUN:
8899 nestlevel++;
8900 break;
8901
8902 case META_ALT:
8903 if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
8904 break;
8905
8906 case META_KET:
8907 if (nestlevel == 0) return pptr;
8908 nestlevel--;
8909 break;
8910 }
8911
8912 /* The extra data item length for each meta is in a table. */
8913
8914 meta = (meta >> 16) & 0x7fff;
8915 if (meta >= sizeof(meta_extra_lengths)) return NULL;
8916 pptr += meta_extra_lengths[meta];
8917 }
8918 /* Control never reaches here */
8919 return pptr;
8920 }
8921
8922
8923
8924 /*************************************************
8925 * Find length of a parsed group *
8926 *************************************************/
8927
8928 /* This is called for nested groups within a branch of a lookbehind whose
8929 length is being computed. If all the branches in the nested group have the same
8930 length, that is OK. On entry, the pointer must be at the first element after
8931 the group initializing code. On exit it points to OP_KET. Caching is used to
8932 improve processing speed when the same capturing group occurs many times.
8933
8934 Arguments:
8935 pptrptr pointer to pointer in the parsed pattern
8936 isinline FALSE if a reference or recursion; TRUE for inline group
8937 errcodeptr pointer to the errorcode
8938 lcptr pointer to the loop counter
8939 group number of captured group or -1 for a non-capturing group
8940 recurses chain of recurse_check to catch mutual recursion
8941 cb pointer to the compile data
8942
8943 Returns: the group length or a negative number
8944 */
8945
8946 static int
get_grouplength(uint32_t ** pptrptr,BOOL isinline,int * errcodeptr,int * lcptr,int group,parsed_recurse_check * recurses,compile_block * cb)8947 get_grouplength(uint32_t **pptrptr, BOOL isinline, int *errcodeptr, int *lcptr,
8948 int group, parsed_recurse_check *recurses, compile_block *cb)
8949 {
8950 int branchlength;
8951 int grouplength = -1;
8952
8953 /* The cache can be used only if there is no possibility of there being two
8954 groups with the same number. We do not need to set the end pointer for a group
8955 that is being processed as a back reference or recursion, but we must do so for
8956 an inline group. */
8957
8958 if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
8959 {
8960 uint32_t groupinfo = cb->groupinfo[group];
8961 if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
8962 if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
8963 {
8964 if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
8965 return groupinfo & GI_FIXED_LENGTH_MASK;
8966 }
8967 }
8968
8969 /* Scan the group. In this case we find the end pointer of necessity. */
8970
8971 for(;;)
8972 {
8973 branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
8974 if (branchlength < 0) goto ISNOTFIXED;
8975 if (grouplength == -1) grouplength = branchlength;
8976 else if (grouplength != branchlength) goto ISNOTFIXED;
8977 if (**pptrptr == META_KET) break;
8978 *pptrptr += 1; /* Skip META_ALT */
8979 }
8980
8981 if (group > 0)
8982 cb->groupinfo[group] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
8983 return grouplength;
8984
8985 ISNOTFIXED:
8986 if (group > 0) cb->groupinfo[group] |= GI_NOT_FIXED_LENGTH;
8987 return -1;
8988 }
8989
8990
8991
8992 /*************************************************
8993 * Find length of a parsed branch *
8994 *************************************************/
8995
8996 /* Return a fixed length for a branch in a lookbehind, giving an error if the
8997 length is not fixed. On entry, *pptrptr points to the first element inside the
8998 branch. On exit it is set to point to the ALT or KET.
8999
9000 Arguments:
9001 pptrptr pointer to pointer in the parsed pattern
9002 errcodeptr pointer to error code
9003 lcptr pointer to loop counter
9004 recurses chain of recurse_check to catch mutual recursion
9005 cb pointer to compile block
9006
9007 Returns: the length, or a negative value on error
9008 */
9009
9010 static int
get_branchlength(uint32_t ** pptrptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)9011 get_branchlength(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9012 parsed_recurse_check *recurses, compile_block *cb)
9013 {
9014 int branchlength = 0;
9015 int grouplength;
9016 uint32_t lastitemlength = 0;
9017 uint32_t *pptr = *pptrptr;
9018 PCRE2_SIZE offset;
9019 parsed_recurse_check this_recurse;
9020
9021 /* A large and/or complex regex can take too long to process. This can happen
9022 more often when (?| groups are present in the pattern because their length
9023 cannot be cached. */
9024
9025 if ((*lcptr)++ > 2000)
9026 {
9027 *errcodeptr = ERR35; /* Lookbehind is too complicated */
9028 return -1;
9029 }
9030
9031 /* Scan the branch, accumulating the length. */
9032
9033 for (;; pptr++)
9034 {
9035 parsed_recurse_check *r;
9036 uint32_t *gptr, *gptrend;
9037 uint32_t escape;
9038 uint32_t group = 0;
9039 uint32_t itemlength = 0;
9040
9041 if (*pptr < META_END)
9042 {
9043 itemlength = 1;
9044 }
9045
9046 else switch (META_CODE(*pptr))
9047 {
9048 case META_KET:
9049 case META_ALT:
9050 goto EXIT;
9051
9052 /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
9053 actual termination. */
9054
9055 case META_ACCEPT:
9056 case META_FAIL:
9057 pptr = parsed_skip(pptr, PSKIP_ALT);
9058 if (pptr == NULL) goto PARSED_SKIP_FAILED;
9059 goto EXIT;
9060
9061 case META_MARK:
9062 case META_COMMIT_ARG:
9063 case META_PRUNE_ARG:
9064 case META_SKIP_ARG:
9065 case META_THEN_ARG:
9066 pptr += pptr[1] + 1;
9067 break;
9068
9069 case META_CIRCUMFLEX:
9070 case META_COMMIT:
9071 case META_DOLLAR:
9072 case META_PRUNE:
9073 case META_SKIP:
9074 case META_THEN:
9075 break;
9076
9077 case META_OPTIONS:
9078 pptr += 1;
9079 break;
9080
9081 case META_BIGVALUE:
9082 itemlength = 1;
9083 pptr += 1;
9084 break;
9085
9086 case META_CLASS:
9087 case META_CLASS_NOT:
9088 itemlength = 1;
9089 pptr = parsed_skip(pptr, PSKIP_CLASS);
9090 if (pptr == NULL) goto PARSED_SKIP_FAILED;
9091 break;
9092
9093 case META_CLASS_EMPTY_NOT:
9094 case META_DOT:
9095 itemlength = 1;
9096 break;
9097
9098 case META_CALLOUT_NUMBER:
9099 pptr += 3;
9100 break;
9101
9102 case META_CALLOUT_STRING:
9103 pptr += 3 + SIZEOFFSET;
9104 break;
9105
9106 /* Only some escapes consume a character. Of those, \R and \X are never
9107 allowed because they might match more than character. \C is allowed only in
9108 32-bit and non-UTF 8/16-bit modes. */
9109
9110 case META_ESCAPE:
9111 escape = META_DATA(*pptr);
9112 if (escape == ESC_R || escape == ESC_X) return -1;
9113 if (escape > ESC_b && escape < ESC_Z)
9114 {
9115 #if PCRE2_CODE_UNIT_WIDTH != 32
9116 if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
9117 {
9118 *errcodeptr = ERR36;
9119 return -1;
9120 }
9121 #endif
9122 itemlength = 1;
9123 if (escape == ESC_p || escape == ESC_P) pptr++; /* Skip prop data */
9124 }
9125 break;
9126
9127 /* Lookaheads do not contribute to the length of this branch, but they may
9128 contain lookbehinds within them whose lengths need to be set. */
9129
9130 case META_LOOKAHEAD:
9131 case META_LOOKAHEADNOT:
9132 case META_LOOKAHEAD_NA:
9133 *errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb);
9134 if (*errcodeptr != 0) return -1;
9135
9136 /* Ignore any qualifiers that follow a lookahead assertion. */
9137
9138 switch (pptr[1])
9139 {
9140 case META_ASTERISK:
9141 case META_ASTERISK_PLUS:
9142 case META_ASTERISK_QUERY:
9143 case META_PLUS:
9144 case META_PLUS_PLUS:
9145 case META_PLUS_QUERY:
9146 case META_QUERY:
9147 case META_QUERY_PLUS:
9148 case META_QUERY_QUERY:
9149 pptr++;
9150 break;
9151
9152 case META_MINMAX:
9153 case META_MINMAX_PLUS:
9154 case META_MINMAX_QUERY:
9155 pptr += 3;
9156 break;
9157
9158 default:
9159 break;
9160 }
9161 break;
9162
9163 /* A nested lookbehind does not contribute any length to this lookbehind,
9164 but must itself be checked and have its lengths set. */
9165
9166 case META_LOOKBEHIND:
9167 case META_LOOKBEHINDNOT:
9168 case META_LOOKBEHIND_NA:
9169 if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
9170 return -1;
9171 break;
9172
9173 /* Back references and recursions are handled by very similar code. At this
9174 stage, the names generated in the parsing pass are available, but the main
9175 name table has not yet been created. So for the named varieties, scan the
9176 list of names in order to get the number of the first one in the pattern,
9177 and whether or not this name is duplicated. */
9178
9179 case META_BACKREF_BYNAME:
9180 if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
9181 goto ISNOTFIXED;
9182 /* Fall through */
9183
9184 case META_RECURSE_BYNAME:
9185 {
9186 int i;
9187 PCRE2_SPTR name;
9188 BOOL is_dupname = FALSE;
9189 named_group *ng = cb->named_groups;
9190 uint32_t meta_code = META_CODE(*pptr);
9191 uint32_t length = *(++pptr);
9192
9193 GETPLUSOFFSET(offset, pptr);
9194 name = cb->start_pattern + offset;
9195 for (i = 0; i < cb->names_found; i++, ng++)
9196 {
9197 if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0)
9198 {
9199 group = ng->number;
9200 is_dupname = ng->isdup;
9201 break;
9202 }
9203 }
9204
9205 if (group == 0)
9206 {
9207 *errcodeptr = ERR15; /* Non-existent subpattern */
9208 cb->erroroffset = offset;
9209 return -1;
9210 }
9211
9212 /* A numerical back reference can be fixed length if duplicate capturing
9213 groups are not being used. A non-duplicate named back reference can also
9214 be handled. */
9215
9216 if (meta_code == META_RECURSE_BYNAME ||
9217 (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
9218 goto RECURSE_OR_BACKREF_LENGTH; /* Handle as a numbered version. */
9219 }
9220 goto ISNOTFIXED; /* Duplicate name or number */
9221
9222 /* The offset values for back references < 10 are in a separate vector
9223 because otherwise they would use more than two parsed pattern elements on
9224 64-bit systems. */
9225
9226 case META_BACKREF:
9227 if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
9228 (cb->external_flags & PCRE2_DUPCAPUSED) != 0)
9229 goto ISNOTFIXED;
9230 group = META_DATA(*pptr);
9231 if (group < 10)
9232 {
9233 offset = cb->small_ref_offset[group];
9234 goto RECURSE_OR_BACKREF_LENGTH;
9235 }
9236
9237 /* Fall through */
9238 /* For groups >= 10 - picking up group twice does no harm. */
9239
9240 /* A true recursion implies not fixed length, but a subroutine call may
9241 be OK. Back reference "recursions" are also failed. */
9242
9243 case META_RECURSE:
9244 group = META_DATA(*pptr);
9245 GETPLUSOFFSET(offset, pptr);
9246
9247 RECURSE_OR_BACKREF_LENGTH:
9248 if (group > cb->bracount)
9249 {
9250 cb->erroroffset = offset;
9251 *errcodeptr = ERR15; /* Non-existent subpattern */
9252 return -1;
9253 }
9254 if (group == 0) goto ISNOTFIXED; /* Local recursion */
9255 for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
9256 {
9257 if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
9258 else if (*gptr == (META_CAPTURE | group)) break;
9259 }
9260
9261 /* We must start the search for the end of the group at the first meta code
9262 inside the group. Otherwise it will be treated as an enclosed group. */
9263
9264 gptrend = parsed_skip(gptr + 1, PSKIP_KET);
9265 if (gptrend == NULL) goto PARSED_SKIP_FAILED;
9266 if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED; /* Local recursion */
9267 for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
9268 if (r != NULL) goto ISNOTFIXED; /* Mutual recursion */
9269 this_recurse.prev = recurses;
9270 this_recurse.groupptr = gptr;
9271
9272 /* We do not need to know the position of the end of the group, that is,
9273 gptr is not used after the call to get_grouplength(). Setting the second
9274 argument FALSE stops it scanning for the end when the length can be found
9275 in the cache. */
9276
9277 gptr++;
9278 grouplength = get_grouplength(&gptr, FALSE, errcodeptr, lcptr, group,
9279 &this_recurse, cb);
9280 if (grouplength < 0)
9281 {
9282 if (*errcodeptr == 0) goto ISNOTFIXED;
9283 return -1; /* Error already set */
9284 }
9285 itemlength = grouplength;
9286 break;
9287
9288 /* A (DEFINE) group is never obeyed inline and so it does not contribute to
9289 the length of this branch. Skip from the following item to the next
9290 unpaired ket. */
9291
9292 case META_COND_DEFINE:
9293 pptr = parsed_skip(pptr + 1, PSKIP_KET);
9294 break;
9295
9296 /* Check other nested groups - advance past the initial data for each type
9297 and then seek a fixed length with get_grouplength(). */
9298
9299 case META_COND_NAME:
9300 case META_COND_NUMBER:
9301 case META_COND_RNAME:
9302 case META_COND_RNUMBER:
9303 pptr += 2 + SIZEOFFSET;
9304 goto CHECK_GROUP;
9305
9306 case META_COND_ASSERT:
9307 pptr += 1;
9308 goto CHECK_GROUP;
9309
9310 case META_COND_VERSION:
9311 pptr += 4;
9312 goto CHECK_GROUP;
9313
9314 case META_CAPTURE:
9315 group = META_DATA(*pptr);
9316 /* Fall through */
9317
9318 case META_ATOMIC:
9319 case META_NOCAPTURE:
9320 case META_SCRIPT_RUN:
9321 pptr++;
9322 CHECK_GROUP:
9323 grouplength = get_grouplength(&pptr, TRUE, errcodeptr, lcptr, group,
9324 recurses, cb);
9325 if (grouplength < 0) return -1;
9326 itemlength = grouplength;
9327 break;
9328
9329 /* Exact repetition is OK; variable repetition is not. A repetition of zero
9330 must subtract the length that has already been added. */
9331
9332 case META_MINMAX:
9333 case META_MINMAX_PLUS:
9334 case META_MINMAX_QUERY:
9335 if (pptr[1] == pptr[2])
9336 {
9337 switch(pptr[1])
9338 {
9339 case 0:
9340 branchlength -= lastitemlength;
9341 break;
9342
9343 case 1:
9344 itemlength = 0;
9345 break;
9346
9347 default: /* Check for integer overflow */
9348 if (lastitemlength != 0 && /* Should not occur, but just in case */
9349 INT_MAX/lastitemlength < pptr[1] - 1)
9350 {
9351 *errcodeptr = ERR87; /* Integer overflow; lookbehind too big */
9352 return -1;
9353 }
9354 itemlength = (pptr[1] - 1) * lastitemlength;
9355 break;
9356 }
9357 pptr += 2;
9358 break;
9359 }
9360 /* Fall through */
9361
9362 /* Any other item means this branch does not have a fixed length. */
9363
9364 default:
9365 ISNOTFIXED:
9366 *errcodeptr = ERR25; /* Not fixed length */
9367 return -1;
9368 }
9369
9370 /* Add the item length to the branchlength, checking for integer overflow and
9371 for the branch length exceeding the limit. */
9372
9373 if (INT_MAX - branchlength < (int)itemlength ||
9374 (branchlength += itemlength) > LOOKBEHIND_MAX)
9375 {
9376 *errcodeptr = ERR87;
9377 return -1;
9378 }
9379
9380 /* Save this item length for use if the next item is a quantifier. */
9381
9382 lastitemlength = itemlength;
9383 }
9384
9385 EXIT:
9386 *pptrptr = pptr;
9387 return branchlength;
9388
9389 PARSED_SKIP_FAILED:
9390 *errcodeptr = ERR90;
9391 return -1;
9392 }
9393
9394
9395
9396 /*************************************************
9397 * Set lengths in a lookbehind *
9398 *************************************************/
9399
9400 /* This function is called for each lookbehind, to set the lengths in its
9401 branches. An error occurs if any branch does not have a fixed length that is
9402 less than the maximum (65535). On exit, the pointer must be left on the final
9403 ket.
9404
9405 The function also maintains the max_lookbehind value. Any lookbehind branch
9406 that contains a nested lookbehind may actually look further back than the
9407 length of the branch. The additional amount is passed back from
9408 get_branchlength() as an "extra" value.
9409
9410 Arguments:
9411 pptrptr pointer to pointer in the parsed pattern
9412 errcodeptr pointer to error code
9413 lcptr pointer to loop counter
9414 recurses chain of recurse_check to catch mutual recursion
9415 cb pointer to compile block
9416
9417 Returns: TRUE if all is well
9418 FALSE otherwise, with error code and offset set
9419 */
9420
9421 static BOOL
set_lookbehind_lengths(uint32_t ** pptrptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)9422 set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9423 parsed_recurse_check *recurses, compile_block *cb)
9424 {
9425 PCRE2_SIZE offset;
9426 int branchlength;
9427 uint32_t *bptr = *pptrptr;
9428
9429 READPLUSOFFSET(offset, bptr); /* Offset for error messages */
9430 *pptrptr += SIZEOFFSET;
9431
9432 do
9433 {
9434 *pptrptr += 1;
9435 branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
9436 if (branchlength < 0)
9437 {
9438 /* The errorcode and offset may already be set from a nested lookbehind. */
9439 if (*errcodeptr == 0) *errcodeptr = ERR25;
9440 if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
9441 return FALSE;
9442 }
9443 if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
9444 *bptr |= branchlength; /* branchlength never more than 65535 */
9445 bptr = *pptrptr;
9446 }
9447 while (*bptr == META_ALT);
9448
9449 return TRUE;
9450 }
9451
9452
9453
9454 /*************************************************
9455 * Check parsed pattern lookbehinds *
9456 *************************************************/
9457
9458 /* This function is called at the end of parsing a pattern if any lookbehinds
9459 were encountered. It scans the parsed pattern for them, calling
9460 set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
9461 the error offset is marked unset. The enables the functions above not to
9462 override settings from deeper nestings.
9463
9464 This function is called recursively from get_branchlength() for lookaheads in
9465 order to process any lookbehinds that they may contain. It stops when it hits a
9466 non-nested closing parenthesis in this case, returning a pointer to it.
9467
9468 Arguments
9469 pptr points to where to start (start of pattern or start of lookahead)
9470 retptr if not NULL, return the ket pointer here
9471 recurses chain of recurse_check to catch mutual recursion
9472 cb points to the compile block
9473
9474 Returns: 0 on success, or an errorcode (cb->erroroffset will be set)
9475 */
9476
9477 static int
check_lookbehinds(uint32_t * pptr,uint32_t ** retptr,parsed_recurse_check * recurses,compile_block * cb)9478 check_lookbehinds(uint32_t *pptr, uint32_t **retptr,
9479 parsed_recurse_check *recurses, compile_block *cb)
9480 {
9481 int errorcode = 0;
9482 int loopcount = 0;
9483 int nestlevel = 0;
9484
9485 cb->erroroffset = PCRE2_UNSET;
9486
9487 for (; *pptr != META_END; pptr++)
9488 {
9489 if (*pptr < META_END) continue; /* Literal */
9490
9491 switch (META_CODE(*pptr))
9492 {
9493 default:
9494 return ERR70; /* Unrecognized meta code */
9495
9496 case META_ESCAPE:
9497 if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9498 pptr += 1;
9499 break;
9500
9501 case META_KET:
9502 if (--nestlevel < 0)
9503 {
9504 if (retptr != NULL) *retptr = pptr;
9505 return 0;
9506 }
9507 break;
9508
9509 case META_ATOMIC:
9510 case META_CAPTURE:
9511 case META_COND_ASSERT:
9512 case META_LOOKAHEAD:
9513 case META_LOOKAHEADNOT:
9514 case META_LOOKAHEAD_NA:
9515 case META_NOCAPTURE:
9516 case META_SCRIPT_RUN:
9517 nestlevel++;
9518 break;
9519
9520 case META_ACCEPT:
9521 case META_ALT:
9522 case META_ASTERISK:
9523 case META_ASTERISK_PLUS:
9524 case META_ASTERISK_QUERY:
9525 case META_BACKREF:
9526 case META_CIRCUMFLEX:
9527 case META_CLASS:
9528 case META_CLASS_EMPTY:
9529 case META_CLASS_EMPTY_NOT:
9530 case META_CLASS_END:
9531 case META_CLASS_NOT:
9532 case META_COMMIT:
9533 case META_DOLLAR:
9534 case META_DOT:
9535 case META_FAIL:
9536 case META_PLUS:
9537 case META_PLUS_PLUS:
9538 case META_PLUS_QUERY:
9539 case META_PRUNE:
9540 case META_QUERY:
9541 case META_QUERY_PLUS:
9542 case META_QUERY_QUERY:
9543 case META_RANGE_ESCAPED:
9544 case META_RANGE_LITERAL:
9545 case META_SKIP:
9546 case META_THEN:
9547 break;
9548
9549 case META_RECURSE:
9550 pptr += SIZEOFFSET;
9551 break;
9552
9553 case META_BACKREF_BYNAME:
9554 case META_RECURSE_BYNAME:
9555 pptr += 1 + SIZEOFFSET;
9556 break;
9557
9558 case META_COND_DEFINE:
9559 pptr += SIZEOFFSET;
9560 nestlevel++;
9561 break;
9562
9563 case META_COND_NAME:
9564 case META_COND_NUMBER:
9565 case META_COND_RNAME:
9566 case META_COND_RNUMBER:
9567 pptr += 1 + SIZEOFFSET;
9568 nestlevel++;
9569 break;
9570
9571 case META_COND_VERSION:
9572 pptr += 3;
9573 nestlevel++;
9574 break;
9575
9576 case META_CALLOUT_STRING:
9577 pptr += 3 + SIZEOFFSET;
9578 break;
9579
9580 case META_BIGVALUE:
9581 case META_OPTIONS:
9582 case META_POSIX:
9583 case META_POSIX_NEG:
9584 pptr += 1;
9585 break;
9586
9587 case META_MINMAX:
9588 case META_MINMAX_QUERY:
9589 case META_MINMAX_PLUS:
9590 pptr += 2;
9591 break;
9592
9593 case META_CALLOUT_NUMBER:
9594 pptr += 3;
9595 break;
9596
9597 case META_MARK:
9598 case META_COMMIT_ARG:
9599 case META_PRUNE_ARG:
9600 case META_SKIP_ARG:
9601 case META_THEN_ARG:
9602 pptr += 1 + pptr[1];
9603 break;
9604
9605 case META_LOOKBEHIND:
9606 case META_LOOKBEHINDNOT:
9607 case META_LOOKBEHIND_NA:
9608 if (!set_lookbehind_lengths(&pptr, &errorcode, &loopcount, recurses, cb))
9609 return errorcode;
9610 break;
9611 }
9612 }
9613
9614 return 0;
9615 }
9616
9617
9618
9619 /*************************************************
9620 * External function to compile a pattern *
9621 *************************************************/
9622
9623 /* This function reads a regular expression in the form of a string and returns
9624 a pointer to a block of store holding a compiled version of the expression.
9625
9626 Arguments:
9627 pattern the regular expression
9628 patlen the length of the pattern, or PCRE2_ZERO_TERMINATED
9629 options option bits
9630 errorptr pointer to errorcode
9631 erroroffset pointer to error offset
9632 ccontext points to a compile context or is NULL
9633
9634 Returns: pointer to compiled data block, or NULL on error,
9635 with errorcode and erroroffset set
9636 */
9637
9638 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_compile(PCRE2_SPTR pattern,PCRE2_SIZE patlen,uint32_t options,int * errorptr,PCRE2_SIZE * erroroffset,pcre2_compile_context * ccontext)9639 pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
9640 int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
9641 {
9642 BOOL utf; /* Set TRUE for UTF mode */
9643 BOOL ucp; /* Set TRUE for UCP mode */
9644 BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */
9645 BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */
9646 pcre2_real_code *re = NULL; /* What we will return */
9647 compile_block cb; /* "Static" compile-time data */
9648 const uint8_t *tables; /* Char tables base pointer */
9649
9650 PCRE2_UCHAR *code; /* Current pointer in compiled code */
9651 PCRE2_SPTR codestart; /* Start of compiled code */
9652 PCRE2_SPTR ptr; /* Current pointer in pattern */
9653 uint32_t *pptr; /* Current pointer in parsed pattern */
9654
9655 PCRE2_SIZE length = 1; /* Allow for final END opcode */
9656 PCRE2_SIZE usedlength; /* Actual length used */
9657 PCRE2_SIZE re_blocksize; /* Size of memory block */
9658 PCRE2_SIZE big32count = 0; /* 32-bit literals >= 0x80000000 */
9659 PCRE2_SIZE parsed_size_needed; /* Needed for parsed pattern */
9660
9661 int32_t firstcuflags, reqcuflags; /* Type of first/req code unit */
9662 uint32_t firstcu, reqcu; /* Value of first/req code unit */
9663 uint32_t setflags = 0; /* NL and BSR set flags */
9664
9665 uint32_t skipatstart; /* When checking (*UTF) etc */
9666 uint32_t limit_heap = UINT32_MAX;
9667 uint32_t limit_match = UINT32_MAX; /* Unset match limits */
9668 uint32_t limit_depth = UINT32_MAX;
9669
9670 int newline = 0; /* Unset; can be set by the pattern */
9671 int bsr = 0; /* Unset; can be set by the pattern */
9672 int errorcode = 0; /* Initialize to avoid compiler warn */
9673 int regexrc; /* Return from compile */
9674
9675 uint32_t i; /* Local loop counter */
9676
9677 /* Comments at the head of this file explain about these variables. */
9678
9679 uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
9680 uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
9681 named_group named_groups[NAMED_GROUP_LIST_SIZE];
9682
9683 /* The workspace is used in different ways in the different compiling phases.
9684 It needs to be 16-bit aligned for the preliminary parsing scan. */
9685
9686 uint32_t c16workspace[C16_WORK_SIZE];
9687 PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
9688
9689
9690 /* -------------- Check arguments and set up the pattern ----------------- */
9691
9692 /* There must be error code and offset pointers. */
9693
9694 if (errorptr == NULL || erroroffset == NULL) return NULL;
9695 *errorptr = ERR0;
9696 *erroroffset = 0;
9697
9698 /* There must be a pattern! */
9699
9700 if (pattern == NULL)
9701 {
9702 *errorptr = ERR16;
9703 return NULL;
9704 }
9705
9706 /* A NULL compile context means "use a default context" */
9707
9708 if (ccontext == NULL)
9709 ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
9710
9711 /* PCRE2_MATCH_INVALID_UTF implies UTF */
9712
9713 if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF;
9714
9715 /* Check that all undefined public option bits are zero. */
9716
9717 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
9718 (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
9719 {
9720 *errorptr = ERR17;
9721 return NULL;
9722 }
9723
9724 if ((options & PCRE2_LITERAL) != 0 &&
9725 ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
9726 (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
9727 {
9728 *errorptr = ERR92;
9729 return NULL;
9730 }
9731
9732 /* A zero-terminated pattern is indicated by the special length value
9733 PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
9734
9735 if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
9736 patlen = PRIV(strlen)(pattern);
9737
9738 if (patlen > ccontext->max_pattern_length)
9739 {
9740 *errorptr = ERR88;
9741 return NULL;
9742 }
9743
9744 /* From here on, all returns from this function should end up going via the
9745 EXIT label. */
9746
9747
9748 /* ------------ Initialize the "static" compile data -------------- */
9749
9750 tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
9751
9752 cb.lcc = tables + lcc_offset; /* Individual */
9753 cb.fcc = tables + fcc_offset; /* character */
9754 cb.cbits = tables + cbits_offset; /* tables */
9755 cb.ctypes = tables + ctypes_offset;
9756
9757 cb.assert_depth = 0;
9758 cb.bracount = 0;
9759 cb.cx = ccontext;
9760 cb.dupnames = FALSE;
9761 cb.end_pattern = pattern + patlen;
9762 cb.erroroffset = 0;
9763 cb.external_flags = 0;
9764 cb.external_options = options;
9765 cb.groupinfo = stack_groupinfo;
9766 cb.had_recurse = FALSE;
9767 cb.lastcapture = 0;
9768 cb.max_lookbehind = 0;
9769 cb.name_entry_size = 0;
9770 cb.name_table = NULL;
9771 cb.named_groups = named_groups;
9772 cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
9773 cb.names_found = 0;
9774 cb.open_caps = NULL;
9775 cb.parens_depth = 0;
9776 cb.parsed_pattern = stack_parsed_pattern;
9777 cb.req_varyopt = 0;
9778 cb.start_code = cworkspace;
9779 cb.start_pattern = pattern;
9780 cb.start_workspace = cworkspace;
9781 cb.workspace_size = COMPILE_WORK_SIZE;
9782
9783 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
9784 references to help in deciding whether (.*) can be treated as anchored or not.
9785 */
9786
9787 cb.top_backref = 0;
9788 cb.backref_map = 0;
9789
9790 /* Escape sequences \1 to \9 are always back references, but as they are only
9791 two characters long, only two elements can be used in the parsed_pattern
9792 vector. The first contains the reference, and we'd like to use the second to
9793 record the offset in the pattern, so that forward references to non-existent
9794 groups can be diagnosed later with an offset. However, on 64-bit systems,
9795 PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
9796 occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
9797 references have enough space for the offset to be put into the parsed pattern.
9798 */
9799
9800 for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
9801
9802
9803 /* --------------- Start looking at the pattern --------------- */
9804
9805 /* Unless PCRE2_LITERAL is set, check for global one-time option settings at
9806 the start of the pattern, and remember the offset to the actual regex. With
9807 valgrind support, make the terminator of a zero-terminated pattern
9808 inaccessible. This catches bugs that would otherwise only show up for
9809 non-zero-terminated patterns. */
9810
9811 #ifdef SUPPORT_VALGRIND
9812 if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
9813 #endif
9814
9815 ptr = pattern;
9816 skipatstart = 0;
9817
9818 if ((options & PCRE2_LITERAL) == 0)
9819 {
9820 while (patlen - skipatstart >= 2 &&
9821 ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
9822 ptr[skipatstart+1] == CHAR_ASTERISK)
9823 {
9824 for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
9825 {
9826 uint32_t c, pp;
9827 pso *p = pso_list + i;
9828
9829 if (patlen - skipatstart - 2 >= p->length &&
9830 PRIV(strncmp_c8)(ptr + skipatstart + 2, (char *)(p->name),
9831 p->length) == 0)
9832 {
9833 skipatstart += p->length + 2;
9834 switch(p->type)
9835 {
9836 case PSO_OPT:
9837 cb.external_options |= p->value;
9838 break;
9839
9840 case PSO_FLG:
9841 setflags |= p->value;
9842 break;
9843
9844 case PSO_NL:
9845 newline = p->value;
9846 setflags |= PCRE2_NL_SET;
9847 break;
9848
9849 case PSO_BSR:
9850 bsr = p->value;
9851 setflags |= PCRE2_BSR_SET;
9852 break;
9853
9854 case PSO_LIMM:
9855 case PSO_LIMD:
9856 case PSO_LIMH:
9857 c = 0;
9858 pp = skipatstart;
9859 if (!IS_DIGIT(ptr[pp]))
9860 {
9861 errorcode = ERR60;
9862 ptr += pp;
9863 goto HAD_EARLY_ERROR;
9864 }
9865 while (IS_DIGIT(ptr[pp]))
9866 {
9867 if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */
9868 c = c*10 + (ptr[pp++] - CHAR_0);
9869 }
9870 if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
9871 {
9872 errorcode = ERR60;
9873 ptr += pp;
9874 goto HAD_EARLY_ERROR;
9875 }
9876 if (p->type == PSO_LIMH) limit_heap = c;
9877 else if (p->type == PSO_LIMM) limit_match = c;
9878 else limit_depth = c;
9879 skipatstart += pp - skipatstart;
9880 break;
9881 }
9882 break; /* Out of the table scan loop */
9883 }
9884 }
9885 if (i >= sizeof(pso_list)/sizeof(pso)) break; /* Out of pso loop */
9886 }
9887 }
9888
9889 /* End of pattern-start options; advance to start of real regex. */
9890
9891 ptr += skipatstart;
9892
9893 /* Can't support UTF or UCP if PCRE2 was built without Unicode support. */
9894
9895 #ifndef SUPPORT_UNICODE
9896 if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
9897 {
9898 errorcode = ERR32;
9899 goto HAD_EARLY_ERROR;
9900 }
9901 #endif
9902
9903 /* Check UTF. We have the original options in 'options', with that value as
9904 modified by (*UTF) etc in cb->external_options. The extra option
9905 PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
9906 surrogate code points cannot be represented in UTF-16. */
9907
9908 utf = (cb.external_options & PCRE2_UTF) != 0;
9909 if (utf)
9910 {
9911 if ((options & PCRE2_NEVER_UTF) != 0)
9912 {
9913 errorcode = ERR74;
9914 goto HAD_EARLY_ERROR;
9915 }
9916 if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
9917 (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
9918 goto HAD_ERROR; /* Offset was set by valid_utf() */
9919
9920 #if PCRE2_CODE_UNIT_WIDTH == 16
9921 if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
9922 {
9923 errorcode = ERR91;
9924 goto HAD_EARLY_ERROR;
9925 }
9926 #endif
9927 }
9928
9929 /* Check UCP lockout. */
9930
9931 ucp = (cb.external_options & PCRE2_UCP) != 0;
9932 if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
9933 {
9934 errorcode = ERR75;
9935 goto HAD_EARLY_ERROR;
9936 }
9937
9938 /* Process the BSR setting. */
9939
9940 if (bsr == 0) bsr = ccontext->bsr_convention;
9941
9942 /* Process the newline setting. */
9943
9944 if (newline == 0) newline = ccontext->newline_convention;
9945 cb.nltype = NLTYPE_FIXED;
9946 switch(newline)
9947 {
9948 case PCRE2_NEWLINE_CR:
9949 cb.nllen = 1;
9950 cb.nl[0] = CHAR_CR;
9951 break;
9952
9953 case PCRE2_NEWLINE_LF:
9954 cb.nllen = 1;
9955 cb.nl[0] = CHAR_NL;
9956 break;
9957
9958 case PCRE2_NEWLINE_NUL:
9959 cb.nllen = 1;
9960 cb.nl[0] = CHAR_NUL;
9961 break;
9962
9963 case PCRE2_NEWLINE_CRLF:
9964 cb.nllen = 2;
9965 cb.nl[0] = CHAR_CR;
9966 cb.nl[1] = CHAR_NL;
9967 break;
9968
9969 case PCRE2_NEWLINE_ANY:
9970 cb.nltype = NLTYPE_ANY;
9971 break;
9972
9973 case PCRE2_NEWLINE_ANYCRLF:
9974 cb.nltype = NLTYPE_ANYCRLF;
9975 break;
9976
9977 default:
9978 errorcode = ERR56;
9979 goto HAD_EARLY_ERROR;
9980 }
9981
9982 /* Pre-scan the pattern to do two things: (1) Discover the named groups and
9983 their numerical equivalents, so that this information is always available for
9984 the remaining processing. (2) At the same time, parse the pattern and put a
9985 processed version into the parsed_pattern vector. This has escapes interpreted
9986 and comments removed (amongst other things).
9987
9988 In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned
9989 32-bit ints in the parsed pattern is bounded by the length of the pattern plus
9990 one (for the terminator) plus four if PCRE2_EXTRA_WORD or PCRE2_EXTRA_LINE is
9991 set. The exceptional case is when running in 32-bit, non-UTF mode, when literal
9992 characters greater than META_END (0x80000000) have to be coded as two units. In
9993 this case, therefore, we scan the pattern to check for such values. */
9994
9995 #if PCRE2_CODE_UNIT_WIDTH == 32
9996 if (!utf)
9997 {
9998 PCRE2_SPTR p;
9999 for (p = ptr; p < cb.end_pattern; p++) if (*p >= META_END) big32count++;
10000 }
10001 #endif
10002
10003 /* Ensure that the parsed pattern buffer is big enough. When PCRE2_AUTO_CALLOUT
10004 is set we have to assume a numerical callout (4 elements) for each character
10005 plus one at the end. This is overkill, but memory is plentiful these days. For
10006 many smaller patterns the vector on the stack (which was set up above) can be
10007 used. */
10008
10009 parsed_size_needed = patlen - skipatstart + big32count;
10010
10011 if ((ccontext->extra_options &
10012 (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
10013 parsed_size_needed += 4;
10014
10015 if ((options & PCRE2_AUTO_CALLOUT) != 0)
10016 parsed_size_needed = (parsed_size_needed + 1) * 5;
10017
10018 if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE)
10019 {
10020 uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
10021 (parsed_size_needed + 1) * sizeof(uint32_t), ccontext->memctl.memory_data);
10022 if (heap_parsed_pattern == NULL)
10023 {
10024 *errorptr = ERR21;
10025 goto EXIT;
10026 }
10027 cb.parsed_pattern = heap_parsed_pattern;
10028 }
10029 cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed + 1;
10030
10031 /* Do the parsing scan. */
10032
10033 errorcode = parse_regex(ptr, cb.external_options, &has_lookbehind, &cb);
10034 if (errorcode != 0) goto HAD_CB_ERROR;
10035
10036 /* Workspace is needed to remember information about numbered groups: whether a
10037 group can match an empty string and what its fixed length is. This is done to
10038 avoid the possibility of recursive references causing very long compile times
10039 when checking these features. Unnumbered groups do not have this exposure since
10040 they cannot be referenced. We use an indexed vector for this purpose. If there
10041 are sufficiently few groups, the default vector on the stack, as set up above,
10042 can be used. Otherwise we have to get/free a special vector. The vector must be
10043 initialized to zero. */
10044
10045 if (cb.bracount >= GROUPINFO_DEFAULT_SIZE)
10046 {
10047 cb.groupinfo = ccontext->memctl.malloc(
10048 (cb.bracount + 1)*sizeof(uint32_t), ccontext->memctl.memory_data);
10049 if (cb.groupinfo == NULL)
10050 {
10051 errorcode = ERR21;
10052 cb.erroroffset = 0;
10053 goto HAD_CB_ERROR;
10054 }
10055 }
10056 memset(cb.groupinfo, 0, (cb.bracount + 1) * sizeof(uint32_t));
10057
10058 /* If there were any lookbehinds, scan the parsed pattern to figure out their
10059 lengths. */
10060
10061 if (has_lookbehind)
10062 {
10063 errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb);
10064 if (errorcode != 0) goto HAD_CB_ERROR;
10065 }
10066
10067 /* For debugging, there is a function that shows the parsed data vector. */
10068
10069 #ifdef DEBUG_SHOW_PARSED
10070 fprintf(stderr, "+++ Pre-scan complete:\n");
10071 show_parsed(&cb);
10072 #endif
10073
10074 /* For debugging capturing information this code can be enabled. */
10075
10076 #ifdef DEBUG_SHOW_CAPTURES
10077 {
10078 named_group *ng = cb.named_groups;
10079 fprintf(stderr, "+++Captures: %d\n", cb.bracount);
10080 for (i = 0; i < cb.names_found; i++, ng++)
10081 {
10082 fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
10083 }
10084 }
10085 #endif
10086
10087 /* Pretend to compile the pattern while actually just accumulating the amount
10088 of memory required in the 'length' variable. This behaviour is triggered by
10089 passing a non-NULL final argument to compile_regex(). We pass a block of
10090 workspace (cworkspace) for it to compile parts of the pattern into; the
10091 compiled code is discarded when it is no longer needed, so hopefully this
10092 workspace will never overflow, though there is a test for its doing so.
10093
10094 On error, errorcode will be set non-zero, so we don't need to look at the
10095 result of the function. The initial options have been put into the cb block,
10096 but we still have to pass a separate options variable (the first argument)
10097 because the options may change as the pattern is processed. */
10098
10099 cb.erroroffset = patlen; /* For any subsequent errors that do not set it */
10100 pptr = cb.parsed_pattern;
10101 code = cworkspace;
10102 *code = OP_BRA;
10103
10104 (void)compile_regex(cb.external_options, &code, &pptr, &errorcode, 0, &firstcu,
10105 &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, &length);
10106
10107 if (errorcode != 0) goto HAD_CB_ERROR; /* Offset is in cb.erroroffset */
10108
10109 /* This should be caught in compile_regex(), but just in case... */
10110
10111 if (length > MAX_PATTERN_SIZE)
10112 {
10113 errorcode = ERR20;
10114 goto HAD_CB_ERROR;
10115 }
10116
10117 /* Compute the size of, and then get and initialize, the data block for storing
10118 the compiled pattern and names table. Integer overflow should no longer be
10119 possible because nowadays we limit the maximum value of cb.names_found and
10120 cb.name_entry_size. */
10121
10122 re_blocksize = sizeof(pcre2_real_code) +
10123 CU2BYTES(length +
10124 (PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
10125 re = (pcre2_real_code *)
10126 ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
10127 if (re == NULL)
10128 {
10129 errorcode = ERR21;
10130 goto HAD_CB_ERROR;
10131 }
10132
10133 /* The compiler may put padding at the end of the pcre2_real_code structure in
10134 order to round it up to a multiple of 4 or 8 bytes. This means that when a
10135 compiled pattern is copied (for example, when serialized) undefined bytes are
10136 read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
10137 write to the last 8 bytes of the structure before setting the fields. */
10138
10139 memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
10140 re->memctl = ccontext->memctl;
10141 re->tables = tables;
10142 re->executable_jit = NULL;
10143 memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
10144 re->blocksize = re_blocksize;
10145 re->magic_number = MAGIC_NUMBER;
10146 re->compile_options = options;
10147 re->overall_options = cb.external_options;
10148 re->extra_options = ccontext->extra_options;
10149 re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
10150 re->limit_heap = limit_heap;
10151 re->limit_match = limit_match;
10152 re->limit_depth = limit_depth;
10153 re->first_codeunit = 0;
10154 re->last_codeunit = 0;
10155 re->bsr_convention = bsr;
10156 re->newline_convention = newline;
10157 re->max_lookbehind = 0;
10158 re->minlength = 0;
10159 re->top_bracket = 0;
10160 re->top_backref = 0;
10161 re->name_entry_size = cb.name_entry_size;
10162 re->name_count = cb.names_found;
10163
10164 /* The basic block is immediately followed by the name table, and the compiled
10165 code follows after that. */
10166
10167 codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) +
10168 re->name_entry_size * re->name_count;
10169
10170 /* Update the compile data block for the actual compile. The starting points of
10171 the name/number translation table and of the code are passed around in the
10172 compile data block. The start/end pattern and initial options are already set
10173 from the pre-compile phase, as is the name_entry_size field. */
10174
10175 cb.parens_depth = 0;
10176 cb.assert_depth = 0;
10177 cb.lastcapture = 0;
10178 cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
10179 cb.start_code = codestart;
10180 cb.req_varyopt = 0;
10181 cb.had_accept = FALSE;
10182 cb.had_pruneorskip = FALSE;
10183 cb.open_caps = NULL;
10184
10185 /* If any named groups were found, create the name/number table from the list
10186 created in the pre-pass. */
10187
10188 if (cb.names_found > 0)
10189 {
10190 named_group *ng = cb.named_groups;
10191 for (i = 0; i < cb.names_found; i++, ng++)
10192 add_name_to_table(&cb, ng->name, ng->length, ng->number, i);
10193 }
10194
10195 /* Set up a starting, non-extracting bracket, then compile the expression. On
10196 error, errorcode will be set non-zero, so we don't need to look at the result
10197 of the function here. */
10198
10199 pptr = cb.parsed_pattern;
10200 code = (PCRE2_UCHAR *)codestart;
10201 *code = OP_BRA;
10202 regexrc = compile_regex(re->overall_options, &code, &pptr, &errorcode, 0,
10203 &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL);
10204 if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
10205 re->top_bracket = cb.bracount;
10206 re->top_backref = cb.top_backref;
10207 re->max_lookbehind = cb.max_lookbehind;
10208
10209 if (cb.had_accept)
10210 {
10211 reqcu = 0; /* Must disable after (*ACCEPT) */
10212 reqcuflags = REQ_NONE;
10213 re->flags |= PCRE2_HASACCEPT; /* Disables minimum length */
10214 }
10215
10216 /* Fill in the final opcode and check for disastrous overflow. If no overflow,
10217 but the estimated length exceeds the really used length, adjust the value of
10218 re->blocksize, and if valgrind support is configured, mark the extra allocated
10219 memory as unaddressable, so that any out-of-bound reads can be detected. */
10220
10221 *code++ = OP_END;
10222 usedlength = code - codestart;
10223 if (usedlength > length) errorcode = ERR23; else
10224 {
10225 re->blocksize -= CU2BYTES(length - usedlength);
10226 #ifdef SUPPORT_VALGRIND
10227 VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
10228 #endif
10229 }
10230
10231 /* Scan the pattern for recursion/subroutine calls and convert the group
10232 numbers into offsets. Maintain a small cache so that repeated groups containing
10233 recursions are efficiently handled. */
10234
10235 #define RSCAN_CACHE_SIZE 8
10236
10237 if (errorcode == 0 && cb.had_recurse)
10238 {
10239 PCRE2_UCHAR *rcode;
10240 PCRE2_SPTR rgroup;
10241 unsigned int ccount = 0;
10242 int start = RSCAN_CACHE_SIZE;
10243 recurse_cache rc[RSCAN_CACHE_SIZE];
10244
10245 for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf);
10246 rcode != NULL;
10247 rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf))
10248 {
10249 int p, groupnumber;
10250
10251 groupnumber = (int)GET(rcode, 1);
10252 if (groupnumber == 0) rgroup = codestart; else
10253 {
10254 PCRE2_SPTR search_from = codestart;
10255 rgroup = NULL;
10256 for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
10257 {
10258 if (groupnumber == rc[p].groupnumber)
10259 {
10260 rgroup = rc[p].group;
10261 break;
10262 }
10263
10264 /* Group n+1 must always start to the right of group n, so we can save
10265 search time below when the new group number is greater than any of the
10266 previously found groups. */
10267
10268 if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
10269 }
10270
10271 if (rgroup == NULL)
10272 {
10273 rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
10274 if (rgroup == NULL)
10275 {
10276 errorcode = ERR53;
10277 break;
10278 }
10279 if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
10280 rc[start].groupnumber = groupnumber;
10281 rc[start].group = rgroup;
10282 if (ccount < RSCAN_CACHE_SIZE) ccount++;
10283 }
10284 }
10285
10286 PUT(rcode, 1, rgroup - codestart);
10287 }
10288 }
10289
10290 /* In rare debugging situations we sometimes need to look at the compiled code
10291 at this stage. */
10292
10293 #ifdef DEBUG_CALL_PRINTINT
10294 pcre2_printint(re, stderr, TRUE);
10295 fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
10296 #endif
10297
10298 /* Unless disabled, check whether any single character iterators can be
10299 auto-possessified. The function overwrites the appropriate opcode values, so
10300 the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
10301 used in this code because at least one compiler gives a warning about loss of
10302 "const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
10303 function call. */
10304
10305 if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
10306 {
10307 PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
10308 if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;
10309 }
10310
10311 /* Failed to compile, or error while post-processing. */
10312
10313 if (errorcode != 0) goto HAD_CB_ERROR;
10314
10315 /* Successful compile. If the anchored option was not passed, set it if
10316 we can determine that the pattern is anchored by virtue of ^ characters or \A
10317 or anything else, such as starting with non-atomic .* when DOTALL is set and
10318 there are no occurrences of *PRUNE or *SKIP (though there is an option to
10319 disable this case). */
10320
10321 if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10322 is_anchored(codestart, 0, &cb, 0, FALSE))
10323 re->overall_options |= PCRE2_ANCHORED;
10324
10325 /* Set up the first code unit or startline flag, the required code unit, and
10326 then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE
10327 is set, as the data it would create will not be used. Note that a first code
10328 unit (but not the startline flag) is useful for anchored patterns because it
10329 can still give a quick "no match" and also avoid searching for a last code
10330 unit. */
10331
10332 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
10333 {
10334 int minminlength = 0; /* For minimal minlength from first/required CU */
10335
10336 /* If we do not have a first code unit, see if there is one that is asserted
10337 (these are not saved during the compile because they can cause conflicts with
10338 actual literals that follow). */
10339
10340 if (firstcuflags < 0)
10341 firstcu = find_firstassertedcu(codestart, &firstcuflags, 0);
10342
10343 /* Save the data for a first code unit. The existence of one means the
10344 minimum length must be at least 1. */
10345
10346 if (firstcuflags >= 0)
10347 {
10348 re->first_codeunit = firstcu;
10349 re->flags |= PCRE2_FIRSTSET;
10350 minminlength++;
10351
10352 /* Handle caseless first code units. */
10353
10354 if ((firstcuflags & REQ_CASELESS) != 0)
10355 {
10356 if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
10357 {
10358 if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
10359 }
10360
10361 /* The first code unit is > 128 in UTF or UCP mode, or > 255 otherwise.
10362 In 8-bit UTF mode, codepoints in the range 128-255 are introductory code
10363 points and cannot have another case, but if UCP is set they may do. */
10364
10365 #ifdef SUPPORT_UNICODE
10366 #if PCRE2_CODE_UNIT_WIDTH == 8
10367 else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
10368 re->flags |= PCRE2_FIRSTCASELESS;
10369 #else
10370 else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
10371 UCD_OTHERCASE(firstcu) != firstcu)
10372 re->flags |= PCRE2_FIRSTCASELESS;
10373 #endif
10374 #endif /* SUPPORT_UNICODE */
10375 }
10376 }
10377
10378 /* When there is no first code unit, for non-anchored patterns, see if we can
10379 set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
10380 branches start with ^ and also when all branches start with non-atomic .* for
10381 non-DOTALL matches when *PRUNE and SKIP are not present. (There is an option
10382 that disables this case.) */
10383
10384 else if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10385 is_startline(codestart, 0, &cb, 0, FALSE))
10386 re->flags |= PCRE2_STARTLINE;
10387
10388 /* Handle the "required code unit", if one is set. In the UTF case we can
10389 increment the minimum minimum length only if we are sure this really is a
10390 different character and not a non-starting code unit of the first character,
10391 because the minimum length count is in characters, not code units. */
10392
10393 if (reqcuflags >= 0)
10394 {
10395 #if PCRE2_CODE_UNIT_WIDTH == 16
10396 if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */
10397 firstcuflags < 0 || /* First not set */
10398 (firstcu & 0xf800) != 0xd800 || /* First not surrogate */
10399 (reqcu & 0xfc00) != 0xdc00) /* Req not low surrogate */
10400 #elif PCRE2_CODE_UNIT_WIDTH == 8
10401 if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */
10402 firstcuflags < 0 || /* First not set */
10403 (firstcu & 0x80) == 0 || /* First is ASCII */
10404 (reqcu & 0x80) == 0) /* Req is ASCII */
10405 #endif
10406 {
10407 minminlength++;
10408 }
10409
10410 /* In the case of an anchored pattern, set up the value only if it follows
10411 a variable length item in the pattern. */
10412
10413 if ((re->overall_options & PCRE2_ANCHORED) == 0 ||
10414 (reqcuflags & REQ_VARY) != 0)
10415 {
10416 re->last_codeunit = reqcu;
10417 re->flags |= PCRE2_LASTSET;
10418
10419 /* Handle caseless required code units as for first code units (above). */
10420
10421 if ((reqcuflags & REQ_CASELESS) != 0)
10422 {
10423 if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
10424 {
10425 if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
10426 }
10427 #ifdef SUPPORT_UNICODE
10428 #if PCRE2_CODE_UNIT_WIDTH == 8
10429 else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
10430 re->flags |= PCRE2_LASTCASELESS;
10431 #else
10432 else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
10433 UCD_OTHERCASE(reqcu) != reqcu)
10434 re->flags |= PCRE2_LASTCASELESS;
10435 #endif
10436 #endif /* SUPPORT_UNICODE */
10437 }
10438 }
10439 }
10440
10441 /* Study the compiled pattern to set up information such as a bitmap of
10442 starting code units and a minimum matching length. */
10443
10444 if (PRIV(study)(re) != 0)
10445 {
10446 errorcode = ERR31;
10447 goto HAD_CB_ERROR;
10448 }
10449
10450 /* If study() set a bitmap of starting code units, it implies a minimum
10451 length of at least one. */
10452
10453 if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0)
10454 minminlength = 1;
10455
10456 /* If the minimum length set (or not set) by study() is less than the minimum
10457 implied by required code units, override it. */
10458
10459 if (re->minlength < minminlength) re->minlength = minminlength;
10460 } /* End of start-of-match optimizations. */
10461
10462 /* Control ends up here in all cases. When running under valgrind, make a
10463 pattern's terminating zero defined again. If memory was obtained for the parsed
10464 version of the pattern, free it before returning. Also free the list of named
10465 groups if a larger one had to be obtained, and likewise the group information
10466 vector. */
10467
10468 EXIT:
10469 #ifdef SUPPORT_VALGRIND
10470 if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
10471 #endif
10472 if (cb.parsed_pattern != stack_parsed_pattern)
10473 ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
10474 if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
10475 ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
10476 if (cb.groupinfo != stack_groupinfo)
10477 ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
10478 return re; /* Will be NULL after an error */
10479
10480 /* Errors discovered in parse_regex() set the offset value in the compile
10481 block. Errors discovered before it is called must compute it from the ptr
10482 value. After parse_regex() is called, the offset in the compile block is set to
10483 the end of the pattern, but certain errors in compile_regex() may reset it if
10484 an offset is available in the parsed pattern. */
10485
10486 HAD_CB_ERROR:
10487 ptr = pattern + cb.erroroffset;
10488
10489 HAD_EARLY_ERROR:
10490 *erroroffset = ptr - pattern;
10491
10492 HAD_ERROR:
10493 *errorptr = errorcode;
10494 pcre2_code_free(re);
10495 re = NULL;
10496 goto EXIT;
10497 }
10498
10499 /* End of pcre2_compile.c */
10500