1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2019 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45
46 #define NLBLOCK cb /* Block containing newline information */
47 #define PSSTART start_pattern /* Field containing processed string start */
48 #define PSEND end_pattern /* Field containing processed string end */
49
50 #include "pcre2_internal.h"
51
52 /* In rare error cases debugging might require calling pcre2_printint(). */
53
54 #if 0
55 #ifdef EBCDIC
56 #define PRINTABLE(c) ((c) >= 64 && (c) < 255)
57 #else
58 #define PRINTABLE(c) ((c) >= 32 && (c) < 127)
59 #endif
60 #include "pcre2_printint.c"
61 #define DEBUG_CALL_PRINTINT
62 #endif
63
64 /* Other debugging code can be enabled by these defines. */
65
66 /* #define DEBUG_SHOW_CAPTURES */
67 /* #define DEBUG_SHOW_PARSED */
68
69 /* There are a few things that vary with different code unit sizes. Handle them
70 by defining macros in order to minimize #if usage. */
71
72 #if PCRE2_CODE_UNIT_WIDTH == 8
73 #define STRING_UTFn_RIGHTPAR STRING_UTF8_RIGHTPAR, 5
74 #define XDIGIT(c) xdigitab[c]
75
76 #else /* Either 16-bit or 32-bit */
77 #define XDIGIT(c) (MAX_255(c)? xdigitab[c] : 0xff)
78
79 #if PCRE2_CODE_UNIT_WIDTH == 16
80 #define STRING_UTFn_RIGHTPAR STRING_UTF16_RIGHTPAR, 6
81
82 #else /* 32-bit */
83 #define STRING_UTFn_RIGHTPAR STRING_UTF32_RIGHTPAR, 6
84 #endif
85 #endif
86
/* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which
consists of uint32_t elements. If uint32_t can't hold a PCRE2_SIZE, it is
assumed that two of them can (i.e. a 64-bit world); the more significant half
is then stored first.

  PUTOFFSET(s,p)       store s at p and advance p past the stored value
  GETOFFSET(s,p)       load s from p and advance p past the value
  GETPLUSOFFSET(s,p)   load s from the element(s) that follow p[0], leaving p
                         pointing at the last element of the value
  READPLUSOFFSET(s,p)  as GETPLUSOFFSET, but p is not changed
  SKIPOFFSET(p)        advance p past one stored value
  SIZEOFFSET           number of uint32_t elements occupied by one value

Every use of an argument is parenthesized so that expressions can be passed
safely. Note, however, that the two-element versions evaluate s and p more than
once, so arguments with side effects must be avoided, and that p must be a
modifiable lvalue for all the macros except READPLUSOFFSET. */

#if PCRE2_SIZE_MAX <= UINT32_MAX
#define PUTOFFSET(s,p) *(p)++ = (s)
#define GETOFFSET(s,p) (s) = *(p)++
#define GETPLUSOFFSET(s,p) (s) = *(++(p))
#define READPLUSOFFSET(s,p) (s) = (p)[1]
#define SKIPOFFSET(p) (p)++
#define SIZEOFFSET 1
#else
#define PUTOFFSET(s,p) \
  { *(p)++ = (uint32_t)((s) >> 32); *(p)++ = (uint32_t)((s) & 0xffffffff); }
#define GETOFFSET(s,p) \
  { (s) = ((PCRE2_SIZE)(p)[0] << 32) | (PCRE2_SIZE)(p)[1]; (p) += 2; }
#define GETPLUSOFFSET(s,p) \
  { (s) = ((PCRE2_SIZE)(p)[1] << 32) | (PCRE2_SIZE)(p)[2]; (p) += 2; }
#define READPLUSOFFSET(s,p) \
  { (s) = ((PCRE2_SIZE)(p)[1] << 32) | (PCRE2_SIZE)(p)[2]; }
#define SKIPOFFSET(p) (p) += 2
#define SIZEOFFSET 2
#endif
110
/* Macros for manipulating elements of the parsed pattern vector. META_CODE
extracts the top 16 bits of an element (kept in place, not shifted down),
META_DATA extracts the bottom 16 bits, and META_DIFF gives the difference
between two elements shifted down by 16 bits. The arguments are parenthesized
in the expansions so that expressions such as (a | b) can be passed safely. */

#define META_CODE(x)   ((x) & 0xffff0000u)
#define META_DATA(x)   ((x) & 0x0000ffffu)
#define META_DIFF(x,y) (((x)-(y))>>16)
116
117 /* Function declarations (prototypes) to allow mutual recursion */
118
119 #ifdef SUPPORT_UNICODE
120 static unsigned int
121 add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t,
122 compile_block *, const uint32_t *, unsigned int);
123 #endif
124
125 static int
126 compile_regex(uint32_t, PCRE2_UCHAR **, uint32_t **, int *, uint32_t,
127 uint32_t *, int32_t *, uint32_t *, int32_t *, branch_chain *,
128 compile_block *, PCRE2_SIZE *);
129
130 static int
131 get_branchlength(uint32_t **, int *, int *, parsed_recurse_check *,
132 compile_block *);
133
134 static BOOL
135 set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
136 compile_block *);
137
138
139
140 /*************************************************
141 * Code parameters and static tables *
142 *************************************************/
143
144 #define MAX_GROUP_NUMBER 65535u
145 #define MAX_REPEAT_COUNT 65535u
146 #define REPEAT_UNLIMITED (MAX_REPEAT_COUNT+1)
147
148 /* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
149 different ways in the different pattern scans. The parsing and group-
150 identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
151 aligned for this. Having defined the size in code units, we set up
152 C16_WORK_SIZE as the number of elements in the 16-bit vector.
153
154 During the first compiling phase, when determining how much memory is required,
155 the regex is partly compiled into this space, but the compiled parts are
156 discarded as soon as they can be, so that hopefully there will never be an
157 overrun. The code does, however, check for an overrun, which can occur for
158 pathological patterns. The size of the workspace depends on LINK_SIZE because
159 the length of compiled items varies with this.
160
161 In the real compile phase, this workspace is not currently used. */
162
163 #define COMPILE_WORK_SIZE (3000*LINK_SIZE) /* Size in code units */
164
165 #define C16_WORK_SIZE \
166 ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))
167
168 /* A uint32_t vector is used for caching information about the size of
169 capturing groups, to improve performance. A default is created on the stack of
170 this size. */
171
172 #define GROUPINFO_DEFAULT_SIZE 256
173
174 /* The overrun tests check for a slightly smaller size so that they detect the
175 overrun before it actually does run off the end of the data block. */
176
177 #define WORK_SIZE_SAFETY_MARGIN (100)
178
179 /* This value determines the size of the initial vector that is used for
180 remembering named groups during the pre-compile. It is allocated on the stack,
181 but if it is too small, it is expanded, in a similar way to the workspace. The
182 value is the number of slots in the list. */
183
184 #define NAMED_GROUP_LIST_SIZE 20
185
186 /* The pre-compiling pass over the pattern creates a parsed pattern in a vector
187 of uint32_t. For short patterns this lives on the stack, with this size. Heap
188 memory is used for longer patterns. */
189
190 #define PARSED_PATTERN_DEFAULT_SIZE 1024
191
192 /* Maximum length value to check against when making sure that the variable
193 that holds the compiled pattern length does not overflow. We make it a bit less
194 than INT_MAX to allow for adding in group terminating code units, so that we
195 don't have to check them every time. */
196
197 #define OFLOW_MAX (INT_MAX - 20)
198
199 /* Code values for parsed patterns, which are stored in a vector of 32-bit
200 unsigned ints. Values less than META_END are literal data values. The coding
201 for identifying the item is in the top 16-bits, leaving 16 bits for the
202 additional data that some of them need. The META_CODE, META_DATA, and META_DIFF
203 macros are used to manipulate parsed pattern elements.
204
205 NOTE: When these definitions are changed, the table of extra lengths for each
206 code (meta_extra_lengths, just below) must be updated to remain in step. */
207
208 #define META_END 0x80000000u /* End of pattern */
209
210 #define META_ALT 0x80010000u /* alternation */
211 #define META_ATOMIC 0x80020000u /* atomic group */
212 #define META_BACKREF 0x80030000u /* Back ref */
213 #define META_BACKREF_BYNAME 0x80040000u /* \k'name' */
214 #define META_BIGVALUE 0x80050000u /* Next is a literal > META_END */
215 #define META_CALLOUT_NUMBER 0x80060000u /* (?C with numerical argument */
216 #define META_CALLOUT_STRING 0x80070000u /* (?C with string argument */
217 #define META_CAPTURE 0x80080000u /* Capturing parenthesis */
218 #define META_CIRCUMFLEX 0x80090000u /* ^ metacharacter */
219 #define META_CLASS 0x800a0000u /* start non-empty class */
220 #define META_CLASS_EMPTY 0x800b0000u /* empty class */
221 #define META_CLASS_EMPTY_NOT 0x800c0000u /* negative empty class */
222 #define META_CLASS_END 0x800d0000u /* end of non-empty class */
223 #define META_CLASS_NOT 0x800e0000u /* start non-empty negative class */
224 #define META_COND_ASSERT 0x800f0000u /* (?(?assertion)... */
225 #define META_COND_DEFINE 0x80100000u /* (?(DEFINE)... */
226 #define META_COND_NAME 0x80110000u /* (?(<name>)... */
227 #define META_COND_NUMBER 0x80120000u /* (?(digits)... */
228 #define META_COND_RNAME 0x80130000u /* (?(R&name)... */
229 #define META_COND_RNUMBER 0x80140000u /* (?(Rdigits)... */
230 #define META_COND_VERSION 0x80150000u /* (?(VERSION<op>x.y)... */
231 #define META_DOLLAR 0x80160000u /* $ metacharacter */
232 #define META_DOT 0x80170000u /* . metacharacter */
233 #define META_ESCAPE 0x80180000u /* \d and friends */
234 #define META_KET 0x80190000u /* closing parenthesis */
235 #define META_NOCAPTURE 0x801a0000u /* no capture parens */
236 #define META_OPTIONS 0x801b0000u /* (?i) and friends */
237 #define META_POSIX 0x801c0000u /* POSIX class item */
238 #define META_POSIX_NEG 0x801d0000u /* negative POSIX class item */
239 #define META_RANGE_ESCAPED 0x801e0000u /* range with at least one escape */
240 #define META_RANGE_LITERAL 0x801f0000u /* range defined literally */
241 #define META_RECURSE 0x80200000u /* Recursion */
242 #define META_RECURSE_BYNAME 0x80210000u /* (?&name) */
243 #define META_SCRIPT_RUN 0x80220000u /* (*script_run:...) */
244
245 /* These must be kept together to make it easy to check that an assertion
246 is present where expected in a conditional group. */
247
248 #define META_LOOKAHEAD 0x80230000u /* (?= */
249 #define META_LOOKAHEADNOT 0x80240000u /* (?! */
250 #define META_LOOKBEHIND 0x80250000u /* (?<= */
251 #define META_LOOKBEHINDNOT 0x80260000u /* (?<! */
252
253 /* These must be kept in this order, with consecutive values, and the _ARG
254 versions of COMMIT, PRUNE, SKIP, and THEN immediately after their non-argument
255 versions. */
256
257 #define META_MARK 0x80270000u /* (*MARK) */
258 #define META_ACCEPT 0x80280000u /* (*ACCEPT) */
259 #define META_FAIL 0x80290000u /* (*FAIL) */
260 #define META_COMMIT 0x802a0000u /* These */
261 #define META_COMMIT_ARG 0x802b0000u /* pairs */
262 #define META_PRUNE 0x802c0000u /* must */
263 #define META_PRUNE_ARG 0x802d0000u /* be */
264 #define META_SKIP 0x802e0000u /* kept */
265 #define META_SKIP_ARG 0x802f0000u /* in */
266 #define META_THEN 0x80300000u /* this */
267 #define META_THEN_ARG 0x80310000u /* order */
268
269 /* These must be kept in groups of adjacent 3 values, and all together. */
270
271 #define META_ASTERISK 0x80320000u /* * */
272 #define META_ASTERISK_PLUS 0x80330000u /* *+ */
273 #define META_ASTERISK_QUERY 0x80340000u /* *? */
274 #define META_PLUS 0x80350000u /* + */
275 #define META_PLUS_PLUS 0x80360000u /* ++ */
276 #define META_PLUS_QUERY 0x80370000u /* +? */
277 #define META_QUERY 0x80380000u /* ? */
278 #define META_QUERY_PLUS 0x80390000u /* ?+ */
279 #define META_QUERY_QUERY 0x803a0000u /* ?? */
280 #define META_MINMAX 0x803b0000u /* {n,m} repeat */
281 #define META_MINMAX_PLUS 0x803c0000u /* {n,m}+ repeat */
282 #define META_MINMAX_QUERY 0x803d0000u /* {n,m}? repeat */
283
284 #define META_FIRST_QUANTIFIER META_ASTERISK
285 #define META_LAST_QUANTIFIER META_MINMAX_QUERY
286
287 /* This is a special "meta code" that is used only to distinguish (*asr: from
288 (*sr: in the table of alphabetic assertions. It is never stored in the parsed
289 pattern because (*asr: is turned into (*sr:(*atomic: at that stage. There is
290 therefore no need for it to have a length entry, so use a high value. */
291
292 #define META_ATOMIC_SCRIPT_RUN 0x8fff0000u
293
294 /* Table of extra lengths for each of the meta codes. Must be kept in step with
295 the definitions above. For some items these values are a basic length to which
296 a variable amount has to be added. */
297
298 static unsigned char meta_extra_lengths[] = {
299 0, /* META_END */
300 0, /* META_ALT */
301 0, /* META_ATOMIC */
302 0, /* META_BACKREF - more if group is >= 10 */
303 1+SIZEOFFSET, /* META_BACKREF_BYNAME */
304 1, /* META_BIGVALUE */
305 3, /* META_CALLOUT_NUMBER */
306 3+SIZEOFFSET, /* META_CALLOUT_STRING */
307 0, /* META_CAPTURE */
308 0, /* META_CIRCUMFLEX */
309 0, /* META_CLASS */
310 0, /* META_CLASS_EMPTY */
311 0, /* META_CLASS_EMPTY_NOT */
312 0, /* META_CLASS_END */
313 0, /* META_CLASS_NOT */
314 0, /* META_COND_ASSERT */
315 SIZEOFFSET, /* META_COND_DEFINE */
316 1+SIZEOFFSET, /* META_COND_NAME */
317 1+SIZEOFFSET, /* META_COND_NUMBER */
318 1+SIZEOFFSET, /* META_COND_RNAME */
319 1+SIZEOFFSET, /* META_COND_RNUMBER */
320 3, /* META_COND_VERSION */
321 0, /* META_DOLLAR */
322 0, /* META_DOT */
323 0, /* META_ESCAPE - more for ESC_P, ESC_p, ESC_g, ESC_k */
324 0, /* META_KET */
325 0, /* META_NOCAPTURE */
326 1, /* META_OPTIONS */
327 1, /* META_POSIX */
328 1, /* META_POSIX_NEG */
329 0, /* META_RANGE_ESCAPED */
330 0, /* META_RANGE_LITERAL */
331 SIZEOFFSET, /* META_RECURSE */
332 1+SIZEOFFSET, /* META_RECURSE_BYNAME */
333 0, /* META_SCRIPT_RUN */
334 0, /* META_LOOKAHEAD */
335 0, /* META_LOOKAHEADNOT */
336 SIZEOFFSET, /* META_LOOKBEHIND */
337 SIZEOFFSET, /* META_LOOKBEHINDNOT */
338 1, /* META_MARK - plus the string length */
339 0, /* META_ACCEPT */
340 0, /* META_FAIL */
341 0, /* META_COMMIT */
342 1, /* META_COMMIT_ARG - plus the string length */
343 0, /* META_PRUNE */
344 1, /* META_PRUNE_ARG - plus the string length */
345 0, /* META_SKIP */
346 1, /* META_SKIP_ARG - plus the string length */
347 0, /* META_THEN */
348 1, /* META_THEN_ARG - plus the string length */
349 0, /* META_ASTERISK */
350 0, /* META_ASTERISK_PLUS */
351 0, /* META_ASTERISK_QUERY */
352 0, /* META_PLUS */
353 0, /* META_PLUS_PLUS */
354 0, /* META_PLUS_QUERY */
355 0, /* META_QUERY */
356 0, /* META_QUERY_PLUS */
357 0, /* META_QUERY_QUERY */
358 2, /* META_MINMAX */
359 2, /* META_MINMAX_PLUS */
360 2 /* META_MINMAX_QUERY */
361 };
362
/* Types for skipping parts of a parsed pattern. The values are consecutive,
starting from zero, in the order given here. */

enum {
  PSKIP_ALT,
  PSKIP_CLASS,
  PSKIP_KET
};
366
/* Macro for setting individual bits in class bitmaps: bit number b is set in
the byte vector a. It took some experimenting to figure out how to stop gcc
5.3.0 from warning with -Wconversion. This version gets a warning:

  #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1u << ((b)&7))

Let's hope the apparently less efficient version isn't actually so bad if the
compiler is clever with identical subexpressions.

Both arguments are parenthesized in the expansion, so that a pointer expression
such as (map + n) can be given as the vector. Each argument is evaluated more
than once, so arguments with side effects must not be used. */

#define SETBIT(a,b) (a)[(b)/8] = (uint8_t)((a)[(b)/8] | (1u << ((b)&7)))
377
378 /* Private flags added to firstcu and reqcu. */
379
380 #define REQ_CASELESS (1u << 0) /* Indicates caselessness */
381 #define REQ_VARY (1u << 1) /* reqcu followed non-literal item */
382 /* Negative values for the firstcu and reqcu flags */
383 #define REQ_UNSET (-2) /* Not yet found anything */
384 #define REQ_NONE (-1) /* Found not fixed char */
385
386 /* These flags are used in the groupinfo vector. */
387
388 #define GI_SET_FIXED_LENGTH 0x80000000u
389 #define GI_NOT_FIXED_LENGTH 0x40000000u
390 #define GI_FIXED_LENGTH_MASK 0x0000ffffu
391
392 /* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
393 and is fast (a good compiler can turn it into a subtraction and unsigned
394 comparison). */
395
396 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
397
398 /* Table to identify hex digits. The tables in chartables are dependent on the
399 locale, and may mark arbitrary characters as digits. We want to recognize only
400 0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
401 costs 256 bytes, but it is a lot faster than doing character value tests (at
402 least in some simple cases I timed), and in some applications one wants PCRE2
403 to compile efficiently as well as match efficiently. The value in the table is
404 the binary hex digit value, or 0xff for non-hex digits. */
405
/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
UTF-8 mode. Each row of the table holds the entries for 16 consecutive code
points; the comment on a row gives the hexadecimal value of the first one. */

#ifndef EBCDIC
static const uint8_t xdigitab[] =
  {
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 00 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 10 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 20 */
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /* 30: 0 - 9 */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 40: A - F */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 50 */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 60: a - f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 70 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 80 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 90 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* A0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* B0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* C0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* D0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* E0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff  /* F0 */
  };

#else

/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode.
The layout is the same as for the table above: 16 entries per row. */

static const uint8_t xdigitab[] =
  {
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 00 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 10 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 20 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 30 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 40 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 50 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 60 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 70 */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 80: a - f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 90 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* A0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* B0 */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* C0: A - F */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* D0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* E0 */
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff  /* F0: 0 - 9 */
  };
#endif  /* EBCDIC */
484
485
486 /* Table for handling alphanumeric escaped characters. Positive returns are
487 simple data values; negative values are for special things like \d and so on.
488 Zero means further processing is needed (for things like \x), or the escape is
489 invalid. */
490
491 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
492 in UTF-8 mode. It runs from '0' to 'z'. */
493
494 #ifndef EBCDIC
/* First and last characters covered by the escapes[] table, and a macro for
converting a lower case ASCII letter to upper case. The argument of UPPER_CASE
is parenthesized in the expansion so that an expression can be passed. */

#define ESCAPES_FIRST       CHAR_0
#define ESCAPES_LAST        CHAR_z
#define UPPER_CASE(c)       ((c)-32)
498
499 static const short int escapes[] = {
500 0, 0,
501 0, 0,
502 0, 0,
503 0, 0,
504 0, 0,
505 CHAR_COLON, CHAR_SEMICOLON,
506 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
507 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
508 CHAR_COMMERCIAL_AT, -ESC_A,
509 -ESC_B, -ESC_C,
510 -ESC_D, -ESC_E,
511 0, -ESC_G,
512 -ESC_H, 0,
513 0, -ESC_K,
514 0, 0,
515 -ESC_N, 0,
516 -ESC_P, -ESC_Q,
517 -ESC_R, -ESC_S,
518 0, 0,
519 -ESC_V, -ESC_W,
520 -ESC_X, 0,
521 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
522 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
523 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
524 CHAR_GRAVE_ACCENT, CHAR_BEL,
525 -ESC_b, 0,
526 -ESC_d, CHAR_ESC,
527 CHAR_FF, 0,
528 -ESC_h, 0,
529 0, -ESC_k,
530 0, 0,
531 CHAR_LF, 0,
532 -ESC_p, 0,
533 CHAR_CR, -ESC_s,
534 CHAR_HT, 0,
535 -ESC_v, -ESC_w,
536 0, 0,
537 -ESC_z
538 };
539
540 #else
541
542 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
543 It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
544 is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
545 because it is defined as 'a', which of course picks up the ASCII value. */
546
/* First and last characters covered by the EBCDIC escapes[] table, and a macro
for converting a lower case letter to upper case. The argument of UPPER_CASE is
parenthesized in the expansions so that an expression can be passed. */

#if 'a' == 0x81                    /* Check for a real EBCDIC environment */
#define ESCAPES_FIRST       CHAR_a
#define ESCAPES_LAST        CHAR_9
#define UPPER_CASE(c)       ((c)+64)
#else                              /* Testing in an ASCII environment */
#define ESCAPES_FIRST  ((unsigned char)'\x81')   /* EBCDIC 'a' */
#define ESCAPES_LAST   ((unsigned char)'\xf9')   /* EBCDIC '9' */
#define UPPER_CASE(c)       ((c)-32)
#endif
556
557 static const short int escapes[] = {
558 /* 80 */ CHAR_BEL, -ESC_b, 0, -ESC_d, CHAR_ESC, CHAR_FF, 0,
559 /* 88 */ -ESC_h, 0, 0, '{', 0, 0, 0, 0,
560 /* 90 */ 0, 0, -ESC_k, 0, 0, CHAR_LF, 0, -ESC_p,
561 /* 98 */ 0, CHAR_CR, 0, '}', 0, 0, 0, 0,
562 /* A0 */ 0, '~', -ESC_s, CHAR_HT, 0, -ESC_v, -ESC_w, 0,
563 /* A8 */ 0, -ESC_z, 0, 0, 0, '[', 0, 0,
564 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
565 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
566 /* C0 */ '{', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G,
567 /* C8 */ -ESC_H, 0, 0, 0, 0, 0, 0, 0,
568 /* D0 */ '}', 0, -ESC_K, 0, 0, -ESC_N, 0, -ESC_P,
569 /* D8 */ -ESC_Q, -ESC_R, 0, 0, 0, 0, 0, 0,
570 /* E0 */ '\\', 0, -ESC_S, 0, 0, -ESC_V, -ESC_W, -ESC_X,
571 /* E8 */ 0, -ESC_Z, 0, 0, 0, 0, 0, 0,
572 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
573 /* F8 */ 0, 0
574 };
575
576 /* We also need a table of characters that may follow \c in an EBCDIC
577 environment for characters 0-31. */
578
579 static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
580
581 #endif /* EBCDIC */
582
583
584 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
585 searched linearly. Put all the names into a single string, in order to reduce
586 the number of relocations when a shared library is dynamically linked. The
587 string is built from string macros so that it works in UTF-8 mode on EBCDIC
588 platforms. */
589
590 typedef struct verbitem {
591 unsigned int len; /* Length of verb name */
592 uint32_t meta; /* Base META_ code */
593 int has_arg; /* Argument requirement */
594 } verbitem;
595
596 static const char verbnames[] =
597 "\0" /* Empty name is a shorthand for MARK */
598 STRING_MARK0
599 STRING_ACCEPT0
600 STRING_F0
601 STRING_FAIL0
602 STRING_COMMIT0
603 STRING_PRUNE0
604 STRING_SKIP0
605 STRING_THEN;
606
607 static const verbitem verbs[] = {
608 { 0, META_MARK, +1 }, /* > 0 => must have an argument */
609 { 4, META_MARK, +1 },
610 { 6, META_ACCEPT, -1 }, /* < 0 => Optional argument, convert to pre-MARK */
611 { 1, META_FAIL, -1 },
612 { 4, META_FAIL, -1 },
613 { 6, META_COMMIT, 0 },
614 { 5, META_PRUNE, 0 }, /* Optional argument; bump META code if found */
615 { 4, META_SKIP, 0 },
616 { 4, META_THEN, 0 }
617 };
618
619 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
620
621 /* Verb opcodes, indexed by their META code offset from META_MARK. */
622
623 static const uint32_t verbops[] = {
624 OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
625 OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
626
627 /* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */
628
629 typedef struct alasitem {
630 unsigned int len; /* Length of name */
631 uint32_t meta; /* Base META_ code */
632 } alasitem;
633
634 static const char alasnames[] =
635 STRING_pla0
636 STRING_plb0
637 STRING_nla0
638 STRING_nlb0
639 STRING_positive_lookahead0
640 STRING_positive_lookbehind0
641 STRING_negative_lookahead0
642 STRING_negative_lookbehind0
643 STRING_atomic0
644 STRING_sr0
645 STRING_asr0
646 STRING_script_run0
647 STRING_atomic_script_run;
648
649 static const alasitem alasmeta[] = {
650 { 3, META_LOOKAHEAD },
651 { 3, META_LOOKBEHIND },
652 { 3, META_LOOKAHEADNOT },
653 { 3, META_LOOKBEHINDNOT },
654 { 18, META_LOOKAHEAD },
655 { 19, META_LOOKBEHIND },
656 { 18, META_LOOKAHEADNOT },
657 { 19, META_LOOKBEHINDNOT },
658 { 6, META_ATOMIC },
659 { 2, META_SCRIPT_RUN }, /* sr = script run */
660 { 3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
661 { 10, META_SCRIPT_RUN }, /* script run */
662 { 17, META_ATOMIC_SCRIPT_RUN } /* atomic script run */
663 };
664
665 static const int alascount = sizeof(alasmeta)/sizeof(alasitem);
666
667 /* Offsets from OP_STAR for case-independent and negative repeat opcodes. */
668
669 static uint32_t chartypeoffset[] = {
670 OP_STAR - OP_STAR, OP_STARI - OP_STAR,
671 OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
672
673 /* Tables of names of POSIX character classes and their lengths. The names are
674 now all in a single string, to reduce the number of relocations when a shared
675 library is dynamically loaded. The list of lengths is terminated by a zero
676 length entry. The first three must be alpha, lower, upper, as this is assumed
677 for handling case independence. The indices for graph, print, and punct are
678 needed, so identify them. */
679
680 static const char posix_names[] =
681 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
682 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
683 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
684 STRING_word0 STRING_xdigit;
685
/* The lengths of the POSIX class names, in the same order as the names
themselves. The list is terminated by a zero length entry. */

static const uint8_t posix_name_lengths[] = {
  5,     /* alpha */
  5,     /* lower */
  5,     /* upper */
  5,     /* alnum */
  5,     /* ascii */
  5,     /* blank */
  5,     /* cntrl */
  5,     /* digit */
  5,     /* graph */
  5,     /* print */
  5,     /* punct */
  5,     /* space */
  4,     /* word */
  6,     /* xdigit */
  0 };   /* End of list */
688
689 #define PC_GRAPH 8
690 #define PC_PRINT 9
691 #define PC_PUNCT 10
692
693 /* Table of class bit maps for each POSIX class. Each class is formed from a
694 base map, with an optional addition or removal of another map. Then, for some
695 classes, there is some additional tweaking: for [:blank:] the vertical space
696 characters are removed, and for [:alpha:] and [:alnum:] the underscore
697 character is removed. The triples in the table consist of the base map offset,
698 second map offset or -1 if no second map, and a non-negative value for map
699 addition or a negative value for map subtraction (if there are two maps). The
700 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
701 remove vertical space characters, 2 => remove underscore. */
702
703 static const int posix_class_maps[] = {
704 cbit_word, cbit_digit, -2, /* alpha */
705 cbit_lower, -1, 0, /* lower */
706 cbit_upper, -1, 0, /* upper */
707 cbit_word, -1, 2, /* alnum - word without underscore */
708 cbit_print, cbit_cntrl, 0, /* ascii */
709 cbit_space, -1, 1, /* blank - a GNU extension */
710 cbit_cntrl, -1, 0, /* cntrl */
711 cbit_digit, -1, 0, /* digit */
712 cbit_graph, -1, 0, /* graph */
713 cbit_print, -1, 0, /* print */
714 cbit_punct, -1, 0, /* punct */
715 cbit_space, -1, 0, /* space */
716 cbit_word, -1, 0, /* word - a Perl extension */
717 cbit_xdigit,-1, 0 /* xdigit */
718 };
719
720 #ifdef SUPPORT_UNICODE
721
722 /* The POSIX class Unicode property substitutes that are used in UCP mode must
723 be in the order of the POSIX class names, defined above. */
724
725 static int posix_substitutes[] = {
726 PT_GC, ucp_L, /* alpha */
727 PT_PC, ucp_Ll, /* lower */
728 PT_PC, ucp_Lu, /* upper */
729 PT_ALNUM, 0, /* alnum */
730 -1, 0, /* ascii, treat as non-UCP */
731 -1, 1, /* blank, treat as \h */
732 PT_PC, ucp_Cc, /* cntrl */
733 PT_PC, ucp_Nd, /* digit */
734 PT_PXGRAPH, 0, /* graph */
735 PT_PXPRINT, 0, /* print */
736 PT_PXPUNCT, 0, /* punct */
737 PT_PXSPACE, 0, /* space */ /* Xps is POSIX space, but from 8.34 */
738 PT_WORD, 0, /* word */ /* Perl and POSIX space are the same */
739 -1, 0 /* xdigit, treat as non-UCP */
740 };
741 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / (2*sizeof(uint32_t)))
742 #endif /* SUPPORT_UNICODE */
743
744 /* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset
745 are allowed. */
746
747 #define PUBLIC_LITERAL_COMPILE_OPTIONS \
748 (PCRE2_ANCHORED|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_ENDANCHORED| \
749 PCRE2_FIRSTLINE|PCRE2_LITERAL|PCRE2_NO_START_OPTIMIZE| \
750 PCRE2_NO_UTF_CHECK|PCRE2_USE_OFFSET_LIMIT|PCRE2_UTF)
751
752 #define PUBLIC_COMPILE_OPTIONS \
753 (PUBLIC_LITERAL_COMPILE_OPTIONS| \
754 PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \
755 PCRE2_ALT_VERBNAMES|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL|PCRE2_DUPNAMES| \
756 PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MATCH_UNSET_BACKREF| \
757 PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \
758 PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \
759 PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY)
760
761 #define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
762 (PCRE2_EXTRA_MATCH_LINE|PCRE2_EXTRA_MATCH_WORD)
763
764 #define PUBLIC_COMPILE_EXTRA_OPTIONS \
765 (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
766 PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
767 PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX)
768
769 /* Compile time error code numbers. They are given names so that they can more
770 easily be tracked. When a new number is added, the tables called eint1 and
771 eint2 in pcre2posix.c may need to be updated, and a new error text must be
772 added to compile_error_texts in pcre2_error.c. */
773
/* The enumerators are consecutive, so ERRn has the value
COMPILE_ERROR_BASE + n. */

enum { ERR0 = COMPILE_ERROR_BASE,
       ERR1,  ERR2,  ERR3,  ERR4,  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,  ERR10,
       ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
       ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
       ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
       ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
       ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
       ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
       ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
       ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
       ERR91, ERR92, ERR93, ERR94, ERR95, ERR96 };
785
786 /* This is a table of start-of-pattern options such as (*UTF) and settings such
787 as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
788 compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
789 generic and always supported. */
790
/* Kinds of start-of-pattern item. One of these is stored in the "type" field
of each pso_list entry and says how that entry's "value" field is to be
interpreted. */

enum { PSO_OPT,     /* Value is an option bit */
       PSO_FLG,     /* Value is a flag bit */
       PSO_NL,      /* Value is a newline type */
       PSO_BSR,     /* Value is a \R type */
       PSO_LIMH,    /* Read integer value for heap limit */
       PSO_LIMM,    /* Read integer value for match limit */
       PSO_LIMD };  /* Read integer value for depth limit */
798
/* One entry in the table of start-of-pattern items. */

typedef struct pso {
  const uint8_t *name;   /* Text of the item, e.g. the UTF) or LIMIT_HEAP= string */
  uint16_t length;       /* Length of name; judging by the table values this
                            counts the terminating ')' or '=' as well */
  uint16_t type;         /* One of the PSO_xxx values */
  uint32_t value;        /* Option bit, flag bit, newline or \R type; zero for
                            the PSO_LIMx types, whose value is read from the
                            pattern */
} pso;
805
806 /* NB: STRING_UTFn_RIGHTPAR contains the length as well */
807
/* Each entry gives name, length, type, value. The first entry has only three
visible initializers because its name macro supplies the length as well. */

static pso pso_list[] = {
  { (uint8_t *)STRING_UTFn_RIGHTPAR,                  PSO_OPT, PCRE2_UTF },
  { (uint8_t *)STRING_UTF_RIGHTPAR,                4, PSO_OPT, PCRE2_UTF },
  { (uint8_t *)STRING_UCP_RIGHTPAR,                4, PSO_OPT, PCRE2_UCP },
  { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR,           9, PSO_FLG, PCRE2_NOTEMPTY_SET },
  { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR,  17, PSO_FLG, PCRE2_NE_ATST_SET },
  { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR,   16, PSO_OPT, PCRE2_NO_AUTO_POSSESS },
  { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
  { (uint8_t *)STRING_NO_JIT_RIGHTPAR,             7, PSO_FLG, PCRE2_NOJIT },
  { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR,      13, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
  { (uint8_t *)STRING_LIMIT_HEAP_EQ,              11, PSO_LIMH, 0 },
  { (uint8_t *)STRING_LIMIT_MATCH_EQ,             12, PSO_LIMM, 0 },
  { (uint8_t *)STRING_LIMIT_DEPTH_EQ,             12, PSO_LIMD, 0 },
  { (uint8_t *)STRING_LIMIT_RECURSION_EQ,         16, PSO_LIMD, 0 },  /* Same type as LIMIT_DEPTH */
  { (uint8_t *)STRING_CR_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_CR },
  { (uint8_t *)STRING_LF_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_LF },
  { (uint8_t *)STRING_CRLF_RIGHTPAR,               5, PSO_NL,  PCRE2_NEWLINE_CRLF },
  { (uint8_t *)STRING_ANY_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_ANY },
  { (uint8_t *)STRING_NUL_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_NUL },
  { (uint8_t *)STRING_ANYCRLF_RIGHTPAR,            8, PSO_NL,  PCRE2_NEWLINE_ANYCRLF },
  { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_ANYCRLF },
  { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_UNICODE }
};
831
832 /* This table is used when converting repeating opcodes into possessified
833 versions as a result of an explicit possessive quantifier such as ++. A zero
834 value means there is no possessified version - in those cases the item in
835 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
836 because all relevant opcodes are less than that. */
837
/* The layout comments below indicate that the table is indexed by opcode
value: the first 32 entries cover opcodes 0 - 31, and every later entry is
labelled with the opcode(s) it stands for. The entries are positional, so the
order here has to follow the order of the opcode definitions. */

static const uint8_t opcode_possessify[] = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */

  0,                       /* NOTI */
  OP_POSSTAR, 0,           /* STAR, MINSTAR */
  OP_POSPLUS, 0,           /* PLUS, MINPLUS */
  OP_POSQUERY, 0,          /* QUERY, MINQUERY */
  OP_POSUPTO, 0,           /* UPTO, MINUPTO */
  0,                       /* EXACT */
  0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */

  OP_POSSTARI, 0,          /* STARI, MINSTARI */
  OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
  OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
  OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
  0,                       /* EXACTI */
  0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */

  OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
  OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
  OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
  OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
  0,                       /* NOTEXACT */
  0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */

  OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
  OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
  OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
  OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
  0,                       /* NOTEXACTI */
  0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */

  OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
  OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
  OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
  OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
  0,                       /* TYPEEXACT */
  0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */

  OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
  OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
  OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
  OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
  0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */

  0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
  0, 0,                    /* REF, REFI */
  0, 0,                    /* DNREF, DNREFI */
  0, 0                     /* RECURSE, CALLOUT */
};
889
890
891 #ifdef DEBUG_SHOW_PARSED
892 /*************************************************
893 * Show the parsed pattern for debugging *
894 *************************************************/
895
896 /* For debugging the pre-scan, this code, which outputs the parsed data vector,
897 can be enabled. */
898
/* Write the contents of cb->parsed_pattern to stderr, one line per item. Each
line starts with the item's index in the vector and its raw value in hex.
Values below META_END are shown as a character when they are printable ASCII;
anything else is decoded according to its META code, consuming whatever
following data words that code carries. Output stops at META_END or at the
first unrecognized META code.

The conversion specifiers match the argument types: PCRE2_SIZE offsets use
%zu, uint32_t values use %u, and characters are cast to int for %c.

Argument:   cb   the compile block holding the parsed pattern
Returns:    nothing
*/

static void show_parsed(compile_block *cb)
{
uint32_t *pptr = cb->parsed_pattern;

for (;;)
  {
  int max, min;
  PCRE2_SIZE offset;
  uint32_t i;
  uint32_t length;
  uint32_t meta_arg = META_DATA(*pptr);

  fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);

  /* A literal code point: show it only if it is printable ASCII. */

  if (*pptr < META_END)
    {
    if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", (int)*pptr);
    pptr++;
    }

  else switch (META_CODE(*pptr++))
    {
    default:
    fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
    return;

    case META_END:
    fprintf(stderr, "META_END\n");
    return;

    case META_CAPTURE:
    fprintf(stderr, "META_CAPTURE %u", meta_arg);
    break;

    case META_RECURSE:
    GETOFFSET(offset, pptr);
    fprintf(stderr, "META_RECURSE %u %zu", meta_arg, offset);
    break;

    /* References to groups 1-9 take their offset from the compile block
    instead of from the parsed vector. */

    case META_BACKREF:
    if (meta_arg < 10)
      offset = cb->small_ref_offset[meta_arg];
    else
      GETOFFSET(offset, pptr);
    fprintf(stderr, "META_BACKREF %u %zu", meta_arg, offset);
    break;

    case META_ESCAPE:
    if (meta_arg == ESC_P || meta_arg == ESC_p)
      {
      uint32_t ptype = *pptr >> 16;
      uint32_t pvalue = *pptr++ & 0xffff;
      fprintf(stderr, "META \\%c %u %u", (meta_arg == ESC_P)? 'P':'p',
        ptype, pvalue);
      }
    else
      {
      uint32_t cc;
      /* There's just one escape we might have here that isn't negated in the
      escapes table. */
      if (meta_arg == ESC_g) cc = CHAR_g;
      else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
        {
        if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
        }
      if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
      fprintf(stderr, "META \\%c", (int)cc);
      }
    break;

    case META_MINMAX:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}", min, max);
    else
      fprintf(stderr, "META {%d,}", min);
    break;

    case META_MINMAX_QUERY:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}?", min, max);
    else
      fprintf(stderr, "META {%d,}?", min);
    break;

    case META_MINMAX_PLUS:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}+", min, max);
    else
      fprintf(stderr, "META {%d,}+", min);
    break;

    case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
    case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
    case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
    case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
    case META_DOT: fprintf(stderr, "META_DOT"); break;
    case META_ASTERISK: fprintf(stderr, "META *"); break;
    case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
    case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
    case META_PLUS: fprintf(stderr, "META +"); break;
    case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
    case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
    case META_QUERY: fprintf(stderr, "META ?"); break;
    case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
    case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;

    case META_ATOMIC: fprintf(stderr, "META (?>"); break;
    case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
    case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
    case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
    case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
    case META_KET: fprintf(stderr, "META )"); break;
    case META_ALT: fprintf(stderr, "META | %u", meta_arg); break;

    case META_CLASS: fprintf(stderr, "META ["); break;
    case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
    case META_CLASS_END: fprintf(stderr, "META ]"); break;
    case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
    case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;

    case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
    case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;

    case META_POSIX: fprintf(stderr, "META_POSIX %u", *pptr++); break;
    case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %u", *pptr++); break;

    case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
    case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
    case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
    case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
    case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
    case META_THEN: fprintf(stderr, "META (*THEN)"); break;

    case META_OPTIONS: fprintf(stderr, "META_OPTIONS 0x%02x", *pptr++); break;

    case META_LOOKBEHIND:
    fprintf(stderr, "META (?<= %u offset=", meta_arg);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_LOOKBEHINDNOT:
    fprintf(stderr, "META (?<! %u offset=", meta_arg);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* Three data words follow: the next-item offset and length, then the
    callout number. */

    case META_CALLOUT_NUMBER:
    fprintf(stderr, "META (?C%u) next=%u/%u", pptr[2], pptr[0],
       pptr[1]);
    pptr += 3;
    break;

    case META_CALLOUT_STRING:
      {
      uint32_t patoffset = *pptr++;    /* Offset of next pattern item */
      uint32_t patlength = *pptr++;    /* Length of next pattern item */
      fprintf(stderr, "META (?Cstring) length=%u offset=", *pptr++);
      GETOFFSET(offset, pptr);
      fprintf(stderr, "%zu next=%u/%u", offset, patoffset, patlength);
      }
    break;

    case META_RECURSE_BYNAME:
    fprintf(stderr, "META (?(&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_BACKREF_BYNAME:
    fprintf(stderr, "META_BACKREF_BYNAME length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* Here the offset comes first and the group number follows it, so the
    number is read by index before the offset is consumed. */

    case META_COND_NUMBER:
    fprintf(stderr, "META_COND_NUMBER %u offset=", pptr[SIZEOFFSET]);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    pptr++;
    break;

    case META_COND_DEFINE:
    fprintf(stderr, "META (?(DEFINE) offset=");
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_VERSION:
    fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
    fprintf(stderr, "%u.", *pptr++);
    fprintf(stderr, "%u)", *pptr++);
    break;

    case META_COND_NAME:
    fprintf(stderr, "META (?(<name>) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_RNAME:
    fprintf(stderr, "META (?(R&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* This is kept as a name, because it might be. */

    case META_COND_RNUMBER:
    fprintf(stderr, "META (?(Rnumber) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* The verbs that take an argument share the code that prints it: a length
    word followed by that many code units. */

    case META_MARK:
    fprintf(stderr, "META (*MARK:");
    goto SHOWARG;

    case META_COMMIT_ARG:
    fprintf(stderr, "META (*COMMIT:");
    goto SHOWARG;

    case META_PRUNE_ARG:
    fprintf(stderr, "META (*PRUNE:");
    goto SHOWARG;

    case META_SKIP_ARG:
    fprintf(stderr, "META (*SKIP:");
    goto SHOWARG;

    case META_THEN_ARG:
    fprintf(stderr, "META (*THEN:");
    SHOWARG:
    length = *pptr++;
    for (i = 0; i < length; i++)
      {
      uint32_t cc = *pptr++;
      if (cc > 32 && cc < 128) fprintf(stderr, "%c", (int)cc);
        else fprintf(stderr, "\\x{%x}", cc);
      }
    fprintf(stderr, ") length=%u", length);
    break;
    }
  fprintf(stderr, "\n");
  }
}
1152 #endif /* DEBUG_SHOW_PARSED */
1153
1154
1155
1156 /*************************************************
1157 * Copy compiled code *
1158 *************************************************/
1159
1160 /* Compiled JIT code cannot be copied, so the new compiled block has no
1161 associated JIT data. */
1162
1163 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy(const pcre2_code * code)1164 pcre2_code_copy(const pcre2_code *code)
1165 {
1166 PCRE2_SIZE* ref_count;
1167 pcre2_code *newcode;
1168
1169 if (code == NULL) return NULL;
1170 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1171 if (newcode == NULL) return NULL;
1172 memcpy(newcode, code, code->blocksize);
1173 newcode->executable_jit = NULL;
1174
1175 /* If the code is one that has been deserialized, increment the reference count
1176 in the decoded tables. */
1177
1178 if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1179 {
1180 ref_count = (PCRE2_SIZE *)(code->tables + tables_length);
1181 (*ref_count)++;
1182 }
1183
1184 return newcode;
1185 }
1186
1187
1188
1189 /*************************************************
1190 * Copy compiled code and character tables *
1191 *************************************************/
1192
1193 /* Compiled JIT code cannot be copied, so the new compiled block has no
1194 associated JIT data. This version of code_copy also makes a separate copy of
1195 the character tables. */
1196
1197 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy_with_tables(const pcre2_code * code)1198 pcre2_code_copy_with_tables(const pcre2_code *code)
1199 {
1200 PCRE2_SIZE* ref_count;
1201 pcre2_code *newcode;
1202 uint8_t *newtables;
1203
1204 if (code == NULL) return NULL;
1205 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1206 if (newcode == NULL) return NULL;
1207 memcpy(newcode, code, code->blocksize);
1208 newcode->executable_jit = NULL;
1209
1210 newtables = code->memctl.malloc(tables_length + sizeof(PCRE2_SIZE),
1211 code->memctl.memory_data);
1212 if (newtables == NULL)
1213 {
1214 code->memctl.free((void *)newcode, code->memctl.memory_data);
1215 return NULL;
1216 }
1217 memcpy(newtables, code->tables, tables_length);
1218 ref_count = (PCRE2_SIZE *)(newtables + tables_length);
1219 *ref_count = 1;
1220
1221 newcode->tables = newtables;
1222 newcode->flags |= PCRE2_DEREF_TABLES;
1223 return newcode;
1224 }
1225
1226
1227
1228 /*************************************************
1229 * Free compiled code *
1230 *************************************************/
1231
1232 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_code_free(pcre2_code * code)1233 pcre2_code_free(pcre2_code *code)
1234 {
1235 PCRE2_SIZE* ref_count;
1236
1237 if (code != NULL)
1238 {
1239 if (code->executable_jit != NULL)
1240 PRIV(jit_free)(code->executable_jit, &code->memctl);
1241
1242 if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1243 {
1244 /* Decoded tables belong to the codes after deserialization, and they must
1245 be freed when there are no more references to them. The *ref_count should
1246 always be > 0. */
1247
1248 ref_count = (PCRE2_SIZE *)(code->tables + tables_length);
1249 if (*ref_count > 0)
1250 {
1251 (*ref_count)--;
1252 if (*ref_count == 0)
1253 code->memctl.free((void *)code->tables, code->memctl.memory_data);
1254 }
1255 }
1256
1257 code->memctl.free(code, code->memctl.memory_data);
1258 }
1259 }
1260
1261
1262
1263 /*************************************************
1264 * Read a number, possibly signed *
1265 *************************************************/
1266
1267 /* This function is used to read numbers in the pattern. The initial pointer
1268 must be the sign or first digit of the number. When relative values (introduced
1269 by + or -) are allowed, they are relative group numbers, and the result must be
1270 greater than zero.
1271
1272 Arguments:
1273 ptrptr points to the character pointer variable
1274 ptrend points to the end of the input string
1275 allow_sign if < 0, sign not allowed; if >= 0, sign is relative to this
1276 max_value the largest number allowed
1277 max_error the error to give for an over-large number
1278 intptr where to put the result
  errorcodeptr  where to put an error code
1280
1281 Returns: TRUE - a number was read
1282 FALSE - errorcode == 0 => no number was found
1283 errorcode != 0 => an error occurred
1284 */
1285
1286 static BOOL
read_number(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,int32_t allow_sign,uint32_t max_value,uint32_t max_error,int * intptr,int * errorcodeptr)1287 read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1288 uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
1289 {
1290 int sign = 0;
1291 uint32_t n = 0;
1292 PCRE2_SPTR ptr = *ptrptr;
1293 BOOL yield = FALSE;
1294
1295 *errorcodeptr = 0;
1296
1297 if (allow_sign >= 0 && ptr < ptrend)
1298 {
1299 if (*ptr == CHAR_PLUS)
1300 {
1301 sign = +1;
1302 max_value -= allow_sign;
1303 ptr++;
1304 }
1305 else if (*ptr == CHAR_MINUS)
1306 {
1307 sign = -1;
1308 ptr++;
1309 }
1310 }
1311
1312 if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
1313 while (ptr < ptrend && IS_DIGIT(*ptr))
1314 {
1315 n = n * 10 + *ptr++ - CHAR_0;
1316 if (n > max_value)
1317 {
1318 *errorcodeptr = max_error;
1319 goto EXIT;
1320 }
1321 }
1322
1323 if (allow_sign >= 0 && sign != 0)
1324 {
1325 if (n == 0)
1326 {
1327 *errorcodeptr = ERR26; /* +0 and -0 are not allowed */
1328 goto EXIT;
1329 }
1330
1331 if (sign > 0) n += allow_sign;
1332 else if ((int)n > allow_sign)
1333 {
1334 *errorcodeptr = ERR15; /* Non-existent subpattern */
1335 goto EXIT;
1336 }
1337 else n = allow_sign + 1 - n;
1338 }
1339
1340 yield = TRUE;
1341
1342 EXIT:
1343 *intptr = n;
1344 *ptrptr = ptr;
1345 return yield;
1346 }
1347
1348
1349
1350 /*************************************************
1351 * Read repeat counts *
1352 *************************************************/
1353
1354 /* Read an item of the form {n,m} and return the values if non-NULL pointers
1355 are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1356 larger value is used for "unlimited". We have to use signed arguments for
1357 read_number() because it is capable of returning a signed value.
1358
1359 Arguments:
  ptrptr         points to pointer to character after '{'
  ptrend         pointer to end of input
  minp           if not NULL, pointer to uint32_t for min
  maxp           if not NULL, pointer to uint32_t for max; this is set to
                   REPEAT_UNLIMITED if no max is given
1365 errorcodeptr points to error code variable
1366
1367 Returns: FALSE if not a repeat quantifier, errorcode set zero
1368 FALSE on error, with errorcode set non-zero
1369 TRUE on success, with pointer updated to point after '}'
1370 */
1371
1372 static BOOL
read_repeat_counts(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,uint32_t * minp,uint32_t * maxp,int * errorcodeptr)1373 read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
1374 uint32_t *maxp, int *errorcodeptr)
1375 {
1376 PCRE2_SPTR p = *ptrptr;
1377 BOOL yield = FALSE;
1378 int32_t min = 0;
1379 int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
1380
1381 /* NB read_number() initializes the error code to zero. The only error is for a
1382 number that is too big. */
1383
1384 if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1385 goto EXIT;
1386
1387 if (p >= ptrend) goto EXIT;
1388
1389 if (*p == CHAR_RIGHT_CURLY_BRACKET)
1390 {
1391 p++;
1392 max = min;
1393 }
1394
1395 else
1396 {
1397 if (*p++ != CHAR_COMMA || p >= ptrend) goto EXIT;
1398 if (*p != CHAR_RIGHT_CURLY_BRACKET)
1399 {
1400 if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max,
1401 errorcodeptr) || p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1402 goto EXIT;
1403 if (max < min)
1404 {
1405 *errorcodeptr = ERR4;
1406 goto EXIT;
1407 }
1408 }
1409 p++;
1410 }
1411
1412 yield = TRUE;
1413 if (minp != NULL) *minp = (uint32_t)min;
1414 if (maxp != NULL) *maxp = (uint32_t)max;
1415
1416 /* Update the pattern pointer on success, or after an error, but not when
1417 the result is "not a repeat quantifier". */
1418
1419 EXIT:
1420 if (yield || *errorcodeptr != 0) *ptrptr = p;
1421 return yield;
1422
1423
1424
1425 }
1426
1427
1428
1429 /*************************************************
1430 * Handle escapes *
1431 *************************************************/
1432
1433 /* This function is called when a \ has been encountered. It either returns a
1434 positive value for a simple escape such as \d, or 0 for a data character, which
1435 is placed in chptr. A backreference to group n is returned as negative n. On
1436 entry, ptr is pointing at the character after \. On exit, it points after the
1437 final code unit of the escape sequence.
1438
1439 This function is also called from pcre2_substitute() to handle escape sequences
1440 in replacement strings. In this case, the cb argument is NULL, and in the case
1441 of escapes that have further processing, only sequences that define a data
1442 character are recognised. The isclass argument is not relevant; the options
1443 argument is the final value of the compiled pattern's options.
1444
1445 Arguments:
1446 ptrptr points to the input position pointer
1447 ptrend points to the end of the input
1448 chptr points to a returned data character
1449 errorcodeptr points to the errorcode variable (containing zero)
  options        the current options bits
  extra_options  the current extra options bits
1451 isclass TRUE if inside a character class
1452 cb compile data block or NULL when called from pcre2_substitute()
1453
1454 Returns: zero => a data character
1455 positive => a special escape sequence
1456 negative => a numerical back reference
1457 on error, errorcodeptr is set non-zero
1458 */
1459
1460 int
PRIV(check_escape)1461 PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1462 int *errorcodeptr, uint32_t options, uint32_t extra_options, BOOL isclass,
1463 compile_block *cb)
1464 {
1465 BOOL utf = (options & PCRE2_UTF) != 0;
1466 PCRE2_SPTR ptr = *ptrptr;
1467 uint32_t c, cc;
1468 int escape = 0;
1469 int i;
1470
1471 /* If backslash is at the end of the string, it's an error. */
1472
1473 if (ptr >= ptrend)
1474 {
1475 *errorcodeptr = ERR1;
1476 return 0;
1477 }
1478
1479 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
1480 *errorcodeptr = 0; /* Be optimistic */
1481
1482 /* Non-alphanumerics are literals, so we just leave the value in c. An initial
1483 value test saves a memory lookup for code points outside the alphanumeric
1484 range. */
1485
1486 if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {} /* Definitely literal */
1487
1488 /* Otherwise, do a table lookup. Non-zero values need little processing here. A
1489 positive value is a literal value for something like \n. A negative value is
1490 the negation of one of the ESC_ macros that is passed back for handling by the
1491 calling function. Some extra checking is needed for \N because only \N{U+dddd}
1492 is supported. If the value is zero, further processing is handled below. */
1493
1494 else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1495 {
1496 if (i > 0)
1497 {
1498 c = (uint32_t)i;
1499 if (c == CHAR_CR && (extra_options & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
1500 c = CHAR_LF;
1501 }
1502 else /* Negative table entry */
1503 {
1504 escape = -i; /* Else return a special escape */
1505 if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
1506 cb->external_flags |= PCRE2_HASBKPORX; /* Note \P, \p, or \X */
1507
1508 /* Perl supports \N{name} for character names and \N{U+dddd} for numerical
1509 Unicode code points, as well as plain \N for "not newline". PCRE does not
1510 support \N{name}. However, it does support quantification such as \N{2,3},
1511 so if \N{ is not followed by U+dddd we check for a quantifier. */
1512
1513 if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1514 {
1515 PCRE2_SPTR p = ptr + 1;
1516
1517 /* \N{U+ can be handled by the \x{ code. However, this construction is
1518 not valid in EBCDIC environments because it specifies a Unicode
1519 character, not a codepoint in the local code. For example \N{U+0041}
1520 must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1521 casing semantics for the entire pattern, so allow it only in UTF (i.e.
1522 Unicode) mode. */
1523
1524 if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
1525 {
1526 #ifdef EBCDIC
1527 *errorcodeptr = ERR93;
1528 #else
1529 if (utf)
1530 {
1531 ptr = p + 1;
1532 escape = 0; /* Not a fancy escape after all */
1533 goto COME_FROM_NU;
1534 }
1535 else *errorcodeptr = ERR93;
1536 #endif
1537 }
1538
1539 /* Give an error if what follows is not a quantifier, but don't override
1540 an error set by the quantifier reader (e.g. number overflow). */
1541
1542 else
1543 {
1544 if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1545 *errorcodeptr == 0)
1546 *errorcodeptr = ERR37;
1547 }
1548 }
1549 }
1550 }
1551
1552 /* Escapes that need further processing, including those that are unknown, have
1553 a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
1554 \o, and \x are recognized (\u and \U can never appear as they are used for case
1555 forcing). */
1556
1557 else
1558 {
1559 int s;
1560 PCRE2_SPTR oldptr;
1561 BOOL overflow;
1562 BOOL alt_bsux =
1563 ((options & PCRE2_ALT_BSUX) | (extra_options & PCRE2_EXTRA_ALT_BSUX)) != 0;
1564
1565 /* Filter calls from pcre2_substitute(). */
1566
1567 if (cb == NULL)
1568 {
1569 if (c != CHAR_c && c != CHAR_o && c != CHAR_x)
1570 {
1571 *errorcodeptr = ERR3;
1572 return 0;
1573 }
1574 alt_bsux = FALSE; /* Do not modify \x handling */
1575 }
1576
1577 switch (c)
1578 {
1579 /* A number of Perl escapes are not handled by PCRE. We give an explicit
1580 error. */
1581
1582 case CHAR_F:
1583 case CHAR_l:
1584 case CHAR_L:
1585 *errorcodeptr = ERR37;
1586 break;
1587
1588 /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
1589 is set. Otherwise, \u must be followed by exactly four hex digits or, if
1590 PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
1591 Otherwise it is a lowercase u letter. This gives some compatibility with
1592 ECMAScript (aka JavaScript). */
1593
1594 case CHAR_u:
1595 if (!alt_bsux) *errorcodeptr = ERR37; else
1596 {
1597 uint32_t xc;
1598
1599 if (ptr >= ptrend) break;
1600 if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
1601 (extra_options & PCRE2_EXTRA_ALT_BSUX) != 0)
1602 {
1603 PCRE2_SPTR hptr = ptr + 1;
1604 cc = 0;
1605
1606 while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
1607 {
1608 if ((cc & 0xf0000000) != 0) /* Test for 32-bit overflow */
1609 {
1610 *errorcodeptr = ERR77;
1611 ptr = hptr; /* Show where */
1612 break; /* *hptr != } will cause another break below */
1613 }
1614 cc = (cc << 4) | xc;
1615 hptr++;
1616 }
1617
1618 if (hptr == ptr + 1 || /* No hex digits */
1619 hptr >= ptrend || /* Hit end of input */
1620 *hptr != CHAR_RIGHT_CURLY_BRACKET) /* No } terminator */
1621 break; /* Hex escape not recognized */
1622
1623 c = cc; /* Accept the code point */
1624 ptr = hptr + 1;
1625 }
1626
1627 else /* Must be exactly 4 hex digits */
1628 {
1629 if (ptrend - ptr < 4) break; /* Less than 4 chars */
1630 if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
1631 if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
1632 cc = (cc << 4) | xc;
1633 if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */
1634 cc = (cc << 4) | xc;
1635 if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */
1636 c = (cc << 4) | xc;
1637 ptr += 4;
1638 }
1639
1640 if (utf)
1641 {
1642 if (c > 0x10ffffU) *errorcodeptr = ERR77;
1643 else
1644 if (c >= 0xd800 && c <= 0xdfff &&
1645 (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1646 *errorcodeptr = ERR73;
1647 }
1648 else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1649 }
1650 break;
1651
1652 /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
1653 in which case it is an upper case letter. */
1654
1655 case CHAR_U:
1656 if (!alt_bsux) *errorcodeptr = ERR37;
1657 break;
1658
1659 /* In a character class, \g is just a literal "g". Outside a character
1660 class, \g must be followed by one of a number of specific things:
1661
1662 (1) A number, either plain or braced. If positive, it is an absolute
1663 backreference. If negative, it is a relative backreference. This is a Perl
1664 5.10 feature.
1665
1666 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1667 is part of Perl's movement towards a unified syntax for back references. As
1668 this is synonymous with \k{name}, we fudge it up by pretending it really
1669 was \k{name}.
1670
1671 (3) For Oniguruma compatibility we also support \g followed by a name or a
1672 number either in angle brackets or in single quotes. However, these are
1673 (possibly recursive) subroutine calls, _not_ backreferences. We return
1674 the ESC_g code.
1675
1676 Summary: Return a negative number for a numerical back reference, ESC_k for
1677 a named back reference, and ESC_g for a named or numbered subroutine call.
1678 */
1679
1680 case CHAR_g:
1681 if (isclass) break;
1682
1683 if (ptr >= ptrend)
1684 {
1685 *errorcodeptr = ERR57;
1686 break;
1687 }
1688
1689 if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
1690 {
1691 escape = ESC_g;
1692 break;
1693 }
1694
1695 /* If there is a brace delimiter, try to read a numerical reference. If
1696 there isn't one, assume we have a name and treat it as \k. */
1697
1698 if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1699 {
1700 PCRE2_SPTR p = ptr + 1;
1701 if (!read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1702 errorcodeptr))
1703 {
1704 if (*errorcodeptr == 0) escape = ESC_k; /* No number found */
1705 break;
1706 }
1707 if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1708 {
1709 *errorcodeptr = ERR57;
1710 break;
1711 }
1712 ptr = p + 1;
1713 }
1714
1715 /* Read an undelimited number */
1716
1717 else
1718 {
1719 if (!read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1720 errorcodeptr))
1721 {
1722 if (*errorcodeptr == 0) *errorcodeptr = ERR57; /* No number found */
1723 break;
1724 }
1725 }
1726
1727 if (s <= 0)
1728 {
1729 *errorcodeptr = ERR15;
1730 break;
1731 }
1732
1733 escape = -s;
1734 break;
1735
1736 /* The handling of escape sequences consisting of a string of digits
1737 starting with one that is not zero is not straightforward. Perl has changed
1738 over the years. Nowadays \g{} for backreferences and \o{} for octal are
1739 recommended to avoid the ambiguities in the old syntax.
1740
1741 Outside a character class, the digits are read as a decimal number. If the
1742 number is less than 10, or if there are that many previous extracting left
1743 brackets, it is a back reference. Otherwise, up to three octal digits are
1744 read to form an escaped character code. Thus \123 is likely to be octal 123
1745 (cf \0123, which is octal 012 followed by the literal 3).
1746
1747 Inside a character class, \ followed by a digit is always either a literal
1748 8 or 9 or an octal number. */
1749
1750 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1751 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1752
1753 if (!isclass)
1754 {
1755 oldptr = ptr;
1756 ptr--; /* Back to the digit */
1757 if (!read_number(&ptr, ptrend, -1, INT_MAX/10 - 1, ERR61, &s,
1758 errorcodeptr))
1759 break;
1760
1761 /* \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1762 are octal escapes if there are not that many previous captures. */
1763
1764 if (s < 10 || oldptr[-1] >= CHAR_8 || s <= (int)cb->bracount)
1765 {
1766 if (s > (int)MAX_GROUP_NUMBER) *errorcodeptr = ERR61;
1767 else escape = -s; /* Indicates a back reference */
1768 break;
1769 }
1770 ptr = oldptr; /* Put the pointer back and fall through */
1771 }
1772
1773 /* Handle a digit following \ when the number is not a back reference, or
1774 we are within a character class. If the first digit is 8 or 9, Perl used to
1775 generate a binary zero and then treat the digit as a following literal. At
1776 least by Perl 5.18 this changed so as not to insert the binary zero. */
1777
1778 if (c >= CHAR_8) break;
1779
1780 /* Fall through */
1781
1782 /* \0 always starts an octal number, but we may drop through to here with a
1783 larger first octal digit. The original code used just to take the least
1784 significant 8 bits of octal numbers (I think this is what early Perls used
1785 to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1786 but no more than 3 octal digits. */
1787
1788 case CHAR_0:
1789 c -= CHAR_0;
1790 while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1791 c = c * 8 + *ptr++ - CHAR_0;
1792 #if PCRE2_CODE_UNIT_WIDTH == 8
1793 if (!utf && c > 0xff) *errorcodeptr = ERR51;
1794 #endif
1795 break;
1796
1797 /* \o is a relatively new Perl feature, supporting a more general way of
1798 specifying character codes in octal. The only supported form is \o{ddd}. */
1799
1800 case CHAR_o:
1801 if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET)
1802 {
1803 ptr--;
1804 *errorcodeptr = ERR55;
1805 }
1806 else if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1807 *errorcodeptr = ERR78;
1808 else
1809 {
1810 c = 0;
1811 overflow = FALSE;
1812 while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1813 {
1814 cc = *ptr++;
1815 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1816 #if PCRE2_CODE_UNIT_WIDTH == 32
1817 if (c >= 0x20000000l) { overflow = TRUE; break; }
1818 #endif
1819 c = (c << 3) + (cc - CHAR_0);
1820 #if PCRE2_CODE_UNIT_WIDTH == 8
1821 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1822 #elif PCRE2_CODE_UNIT_WIDTH == 16
1823 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1824 #elif PCRE2_CODE_UNIT_WIDTH == 32
1825 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1826 #endif
1827 }
1828 if (overflow)
1829 {
1830 while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1831 *errorcodeptr = ERR34;
1832 }
1833 else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1834 {
1835 if (utf && c >= 0xd800 && c <= 0xdfff &&
1836 (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1837 {
1838 ptr--;
1839 *errorcodeptr = ERR73;
1840 }
1841 }
1842 else
1843 {
1844 ptr--;
1845 *errorcodeptr = ERR64;
1846 }
1847 }
1848 break;
1849
1850 /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
1851 by two hexadecimal digits. Otherwise it is a lowercase x letter. */
1852
1853 case CHAR_x:
1854 if (alt_bsux)
1855 {
1856 uint32_t xc;
1857 if (ptrend - ptr < 2) break; /* Less than 2 characters */
1858 if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
1859 if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
1860 c = (cc << 4) | xc;
1861 ptr += 2;
1862 }
1863
1864 /* Handle \x in Perl's style. \x{ddd} is a character code which can be
1865 greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
1866 digits. If not, { used to be treated as a data character. However, Perl
1867 seems to read hex digits up to the first non-such, and ignore the rest, so
1868 that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1869 now gives an error. */
1870
1871 else
1872 {
1873 if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1874 {
1875 #ifndef EBCDIC
1876 COME_FROM_NU:
1877 #endif
1878 if (++ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1879 {
1880 *errorcodeptr = ERR78;
1881 break;
1882 }
1883 c = 0;
1884 overflow = FALSE;
1885
1886 while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
1887 {
1888 ptr++;
1889 if (c == 0 && cc == 0) continue; /* Leading zeroes */
1890 #if PCRE2_CODE_UNIT_WIDTH == 32
1891 if (c >= 0x10000000l) { overflow = TRUE; break; }
1892 #endif
1893 c = (c << 4) | cc;
1894 if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
1895 {
1896 overflow = TRUE;
1897 break;
1898 }
1899 }
1900
1901 if (overflow)
1902 {
1903 while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
1904 *errorcodeptr = ERR34;
1905 }
1906 else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1907 {
1908 if (utf && c >= 0xd800 && c <= 0xdfff &&
1909 (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1910 {
1911 ptr--;
1912 *errorcodeptr = ERR73;
1913 }
1914 }
1915
1916 /* If the sequence of hex digits does not end with '}', give an error.
1917 We used just to recognize this construct and fall through to the normal
1918 \x handling, but nowadays Perl gives an error, which seems much more
1919 sensible, so we do too. */
1920
1921 else
1922 {
1923 ptr--;
1924 *errorcodeptr = ERR67;
1925 }
1926 } /* End of \x{} processing */
1927
/* Read up to two hex digits after \x */
1929
1930 else
1931 {
1932 c = 0;
1933 if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */
1934 ptr++;
1935 c = cc;
1936 if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */
1937 ptr++;
1938 c = (c << 4) | cc;
1939 } /* End of \xdd handling */
1940 } /* End of Perl-style \x handling */
1941 break;
1942
1943 /* The handling of \c is different in ASCII and EBCDIC environments. In an
1944 ASCII (or Unicode) environment, an error is given if the character
1945 following \c is not a printable ASCII character. Otherwise, the following
1946 character is upper-cased if it is a letter, and after that the 0x40 bit is
1947 flipped. The result is the value of the escape.
1948
1949 In an EBCDIC environment the handling of \c is compatible with the
1950 specification in the perlebcdic document. The following character must be
1951 a letter or one of small number of special characters. These provide a
1952 means of defining the character values 0-31.
1953
1954 For testing the EBCDIC handling of \c in an ASCII environment, recognize
1955 the EBCDIC value of 'c' explicitly. */
1956
1957 #if defined EBCDIC && 'a' != 0x81
1958 case 0x83:
1959 #else
1960 case CHAR_c:
1961 #endif
1962 if (ptr >= ptrend)
1963 {
1964 *errorcodeptr = ERR2;
1965 break;
1966 }
1967 c = *ptr;
1968 if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
1969
1970 /* Handle \c in an ASCII/Unicode environment. */
1971
1972 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1973 if (c < 32 || c > 126) /* Excludes all non-printable ASCII */
1974 {
1975 *errorcodeptr = ERR68;
1976 break;
1977 }
1978 c ^= 0x40;
1979
1980 /* Handle \c in an EBCDIC environment. The special case \c? is converted to
1981 255 (0xff) or 95 (0x5f) if other characters suggest we are using the
1982 POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)
1983 The other valid sequences correspond to a list of specific characters. */
1984
1985 #else
1986 if (c == CHAR_QUESTION_MARK)
1987 c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
1988 else
1989 {
1990 for (i = 0; i < 32; i++)
1991 {
1992 if (c == ebcdic_escape_c[i]) break;
1993 }
1994 if (i < 32) c = i; else *errorcodeptr = ERR68;
1995 }
1996 #endif /* EBCDIC */
1997
1998 ptr++;
1999 break;
2000
2001 /* Any other alphanumeric following \ is an error. Perl gives an error only
2002 if in warning mode, but PCRE doesn't have a warning mode. */
2003
2004 default:
2005 *errorcodeptr = ERR3;
2006 *ptrptr = ptr - 1; /* Point to the character at fault */
2007 return 0;
2008 }
2009 }
2010
2011 /* Set the pointer to the next character before returning. */
2012
2013 *ptrptr = ptr;
2014 *chptr = c;
2015 return escape;
2016 }
2017
2018
2019
2020 #ifdef SUPPORT_UNICODE
2021 /*************************************************
2022 * Handle \P and \p *
2023 *************************************************/
2024
2025 /* This function is called after \P or \p has been encountered, provided that
2026 PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
2027 contents of ptrptr are pointing after the P or p. On exit, it is left pointing
2028 after the final code unit of the escape sequence.
2029
2030 Arguments:
2031 ptrptr the pattern position pointer
2032 negptr a boolean that is set TRUE for negation else FALSE
2033 ptypeptr an unsigned int that is set to the type value
2034 pdataptr an unsigned int that is set to the detailed property value
2035 errorcodeptr the error code variable
2036 cb the compile data
2037
2038 Returns: TRUE if the type value was found, or FALSE for an invalid type
2039 */
2040
2041 static BOOL
get_ucp(PCRE2_SPTR * ptrptr,BOOL * negptr,uint16_t * ptypeptr,uint16_t * pdataptr,int * errorcodeptr,compile_block * cb)2042 get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
2043 uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
2044 {
2045 PCRE2_UCHAR c;
2046 PCRE2_SIZE i, bot, top;
2047 PCRE2_SPTR ptr = *ptrptr;
2048 PCRE2_UCHAR name[32];
2049
2050 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2051 c = *ptr++;
2052 *negptr = FALSE;
2053
2054 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
2055 negation. */
2056
2057 if (c == CHAR_LEFT_CURLY_BRACKET)
2058 {
2059 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2060 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
2061 {
2062 *negptr = TRUE;
2063 ptr++;
2064 }
2065 for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
2066 {
2067 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2068 c = *ptr++;
2069 if (c == CHAR_NUL) goto ERROR_RETURN;
2070 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
2071 name[i] = c;
2072 }
2073 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
2074 name[i] = 0;
2075 }
2076
2077 /* Otherwise there is just one following character, which must be an ASCII
2078 letter. */
2079
2080 else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
2081 {
2082 name[0] = c;
2083 name[1] = 0;
2084 }
2085 else goto ERROR_RETURN;
2086
2087 *ptrptr = ptr;
2088
2089 /* Search for a recognized property name using binary chop. */
2090
2091 bot = 0;
2092 top = PRIV(utt_size);
2093
2094 while (bot < top)
2095 {
2096 int r;
2097 i = (bot + top) >> 1;
2098 r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
2099 if (r == 0)
2100 {
2101 *ptypeptr = PRIV(utt)[i].type;
2102 *pdataptr = PRIV(utt)[i].value;
2103 return TRUE;
2104 }
2105 if (r > 0) bot = i + 1; else top = i;
2106 }
2107 *errorcodeptr = ERR47; /* Unrecognized name */
2108 return FALSE;
2109
2110 ERROR_RETURN: /* Malformed \P or \p */
2111 *errorcodeptr = ERR46;
2112 *ptrptr = ptr;
2113 return FALSE;
2114 }
2115 #endif
2116
2117
2118
2119 /*************************************************
2120 * Check for POSIX class syntax *
2121 *************************************************/
2122
2123 /* This function is called when the sequence "[:" or "[." or "[=" is
2124 encountered in a character class. It checks whether this is followed by a
2125 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2126 reach an unescaped ']' without the special preceding character, return FALSE.
2127
2128 Originally, this function only recognized a sequence of letters between the
2129 terminators, but it seems that Perl recognizes any sequence of characters,
2130 though of course unknown POSIX names are subsequently rejected. Perl gives an
2131 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2132 didn't consider this to be a POSIX class. Likewise for [:1234:].
2133
2134 The problem in trying to be exactly like Perl is in the handling of escapes. We
2135 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2136 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2137 below handles the special cases \\ and \], but does not try to do any other
2138 escape processing. This makes it different from Perl for cases such as
2139 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2140 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2141 when Perl does, I think.
2142
2143 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2144 It seems that the appearance of a nested POSIX class supersedes an apparent
2145 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2146 a digit. This is handled by returning FALSE if the start of a new group with
2147 the same terminator is encountered, since the next closing sequence must close
2148 the nested group, not the outer one.
2149
2150 In Perl, unescaped square brackets may also appear as part of class names. For
2151 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2152 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2153 seem right at all. PCRE does not allow closing square brackets in POSIX class
2154 names.
2155
2156 Arguments:
2157 ptr pointer to the character after the initial [ (colon, dot, equals)
2158 ptrend pointer to the end of the pattern
2159 endptr where to return a pointer to the terminating ':', '.', or '='
2160
2161 Returns: TRUE or FALSE
2162 */
2163
2164 static BOOL
check_posix_syntax(PCRE2_SPTR ptr,PCRE2_SPTR ptrend,PCRE2_SPTR * endptr)2165 check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
2166 {
2167 PCRE2_UCHAR terminator; /* Don't combine these lines; the Solaris cc */
2168 terminator = *ptr++; /* compiler warns about "non-constant" initializer. */
2169
2170 for (; ptrend - ptr >= 2; ptr++)
2171 {
2172 if (*ptr == CHAR_BACKSLASH &&
2173 (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2174 ptr++;
2175
2176 else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2177 *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2178
2179 else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2180 {
2181 *endptr = ptr;
2182 return TRUE;
2183 }
2184 }
2185
2186 return FALSE;
2187 }
2188
2189
2190
2191 /*************************************************
2192 * Check POSIX class name *
2193 *************************************************/
2194
2195 /* This function is called to check the name given in a POSIX-style class entry
2196 such as [:alnum:].
2197
2198 Arguments:
2199 ptr points to the first letter
2200 len the length of the name
2201
2202 Returns: a value representing the name, or -1 if unknown
2203 */
2204
2205 static int
check_posix_name(PCRE2_SPTR ptr,int len)2206 check_posix_name(PCRE2_SPTR ptr, int len)
2207 {
2208 const char *pn = posix_names;
2209 int yield = 0;
2210 while (posix_name_lengths[yield] != 0)
2211 {
2212 if (len == posix_name_lengths[yield] &&
2213 PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2214 pn += posix_name_lengths[yield] + 1;
2215 yield++;
2216 }
2217 return -1;
2218 }
2219
2220
2221
2222 /*************************************************
2223 * Read a subpattern or VERB name *
2224 *************************************************/
2225
2226 /* This function is called from parse_regex() below whenever it needs to read
2227 the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial
2228 pointer must be to the character before the name. If that character is '*' we
2229 are reading a verb or alpha assertion name. The pointer is updated to point
after the name, for a VERB or alpha assertion name, or after the name's
2231 terminator for a subpattern name. Returning both the offset and the name
2232 pointer is redundant information, but some callers use one and some the other,
2233 so it is simplest just to return both.
2234
2235 Arguments:
2236 ptrptr points to the character pointer variable
2237 ptrend points to the end of the input string
2238 utf true if the input is UTF-encoded
2239 terminator the terminator of a subpattern name must be this
2240 offsetptr where to put the offset from the start of the pattern
2241 nameptr where to put a pointer to the name in the input
2242 namelenptr where to put the length of the name
errorcodeptr where to put an error code
2244 cb pointer to the compile data block
2245
2246 Returns: TRUE if a name was read
2247 FALSE otherwise, with error code set
2248 */
2249
2250 static BOOL
read_name(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,BOOL utf,uint32_t terminator,PCRE2_SIZE * offsetptr,PCRE2_SPTR * nameptr,uint32_t * namelenptr,int * errorcodeptr,compile_block * cb)2251 read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
2252 PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
2253 int *errorcodeptr, compile_block *cb)
2254 {
2255 PCRE2_SPTR ptr = *ptrptr;
2256 BOOL is_group = (*ptr != CHAR_ASTERISK);
2257
2258 if (++ptr >= ptrend) /* No characters in name */
2259 {
2260 *errorcodeptr = is_group? ERR62: /* Subpattern name expected */
2261 ERR60; /* Verb not recognized or malformed */
2262 goto FAILED;
2263 }
2264
2265 *nameptr = ptr;
2266 *offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
2267
2268 /* In UTF mode, a group name may contain letters and decimal digits as defined
2269 by Unicode properties, and underscores, but must not start with a digit. */
2270
2271 #ifdef SUPPORT_UNICODE
2272 if (utf && is_group)
2273 {
2274 uint32_t c, type;
2275
2276 GETCHAR(c, ptr);
2277 type = UCD_CHARTYPE(c);
2278
2279 if (type == ucp_Nd)
2280 {
2281 *errorcodeptr = ERR44;
2282 goto FAILED;
2283 }
2284
2285 for(;;)
2286 {
2287 if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
2288 c != CHAR_UNDERSCORE) break;
2289 ptr++;
2290 FORWARDCHARTEST(ptr, ptrend);
2291 if (ptr >= ptrend) break;
2292 GETCHAR(c, ptr);
2293 type = UCD_CHARTYPE(c);
2294 }
2295 }
2296 else
2297 #else
2298 (void)utf; /* Avoid compiler warning */
2299 #endif /* SUPPORT_UNICODE */
2300
2301 /* Handle non-group names and group names in non-UTF modes. A group name must
2302 not start with a digit. If either of the others start with a digit it just
2303 won't be recognized. */
2304
2305 {
2306 if (is_group && IS_DIGIT(*ptr))
2307 {
2308 *errorcodeptr = ERR44;
2309 goto FAILED;
2310 }
2311
2312 while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
2313 {
2314 ptr++;
2315 }
2316 }
2317
2318 /* Check name length */
2319
2320 if (ptr > *nameptr + MAX_NAME_SIZE)
2321 {
2322 *errorcodeptr = ERR48;
2323 goto FAILED;
2324 }
2325 *namelenptr = ptr - *nameptr;
2326
2327 /* Subpattern names must not be empty, and their terminator is checked here.
2328 (What follows a verb or alpha assertion name is checked separately.) */
2329
2330 if (is_group)
2331 {
2332 if (ptr == *nameptr)
2333 {
2334 *errorcodeptr = ERR62; /* Subpattern name expected */
2335 goto FAILED;
2336 }
2337 if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
2338 {
2339 *errorcodeptr = ERR42;
2340 goto FAILED;
2341 }
2342 ptr++;
2343 }
2344
2345 *ptrptr = ptr;
2346 return TRUE;
2347
2348 FAILED:
2349 *ptrptr = ptr;
2350 return FALSE;
2351 }
2352
2353
2354
2355 /*************************************************
2356 * Manage callouts at start of cycle *
2357 *************************************************/
2358
2359 /* At the start of a new item in parse_regex() we are able to record the
2360 details of the previous item in a prior callout, and also to set up an
2361 automatic callout if enabled. Avoid having two adjacent automatic callouts,
2362 which would otherwise happen for items such as \Q that contribute nothing to
2363 the parsed pattern.
2364
2365 Arguments:
2366 ptr current pattern pointer
2367 pcalloutptr points to a pointer to previous callout, or NULL
2368 auto_callout TRUE if auto_callouts are enabled
2369 parsed_pattern the parsed pattern pointer
2370 cb compile block
2371
2372 Returns: possibly updated parsed_pattern pointer.
2373 */
2374
2375 static uint32_t *
manage_callouts(PCRE2_SPTR ptr,uint32_t ** pcalloutptr,BOOL auto_callout,uint32_t * parsed_pattern,compile_block * cb)2376 manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2377 uint32_t *parsed_pattern, compile_block *cb)
2378 {
2379 uint32_t *previous_callout = *pcalloutptr;
2380
2381 if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
2382 cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
2383
2384 if (!auto_callout) previous_callout = NULL; else
2385 {
2386 if (previous_callout == NULL ||
2387 previous_callout != parsed_pattern - 4 ||
2388 previous_callout[3] != 255)
2389 {
2390 previous_callout = parsed_pattern; /* Set up new automatic callout */
2391 parsed_pattern += 4;
2392 previous_callout[0] = META_CALLOUT_NUMBER;
2393 previous_callout[2] = 0;
2394 previous_callout[3] = 255;
2395 }
2396 previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
2397 }
2398
2399 *pcalloutptr = previous_callout;
2400 return parsed_pattern;
2401 }
2402
2403
2404
2405 /*************************************************
2406 * Parse regex and identify named groups *
2407 *************************************************/
2408
2409 /* This function is called first of all. It scans the pattern and does two
2410 things: (1) It identifies capturing groups and makes a table of named capturing
2411 groups so that information about them is fully available to both the compiling
2412 scans. (2) It writes a parsed version of the pattern with comments omitted and
2413 escapes processed into the parsed_pattern vector.
2414
2415 Arguments:
2416 ptr points to the start of the pattern
2417 options compiling dynamic options (may change during the scan)
2418 has_lookbehind points to a boolean, set TRUE if a lookbehind is found
2419 cb pointer to the compile data block
2420
2421 Returns: zero on success or a non-zero error code, with the
2422 error offset placed in the cb field
2423 */
2424
2425 /* A structure and some flags for dealing with nested groups. */
2426
/* One saved record per nested group. The field notes are read from the names
only; confirm them against the uses in parse_regex(). */

struct nest_save {
  uint16_t nest_depth;    /* presumably the nesting depth of the record */
  uint16_t reset_group;   /* presumably a saved group number - TODO confirm */
  uint16_t max_group;     /* presumably the highest group number - TODO confirm */
  uint16_t flags;         /* presumably a combination of the NSF_ bits below */
  uint32_t options;       /* presumably the saved option bits - TODO confirm */
};

typedef struct nest_save nest_save;

/* Flag bits for dealing with nested groups. */

#define NSF_RESET       0x0001u
#define NSF_CONDASSERT  0x0002u
#define NSF_ATOMICSR    0x0004u
2438
2439 /* Options that are changeable within the pattern must be tracked during
2440 parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,
2441 but all must be tracked so that META_OPTIONS items set the correct values for
2442 the main compiling phase. */
2443
#define PARSE_TRACKED_OPTIONS \
  ( PCRE2_CASELESS        | \
    PCRE2_DOTALL          | \
    PCRE2_DUPNAMES        | \
    PCRE2_EXTENDED        | \
    PCRE2_EXTENDED_MORE   | \
    PCRE2_MULTILINE       | \
    PCRE2_NO_AUTO_CAPTURE | \
    PCRE2_UNGREEDY )
2447
2448 /* States used for analyzing ranges in character classes. The two OK values
2449 must be last. */
2450
enum {
  RANGE_NO         = 0,
  RANGE_STARTED    = 1,
  RANGE_OK_ESCAPED = 2,   /* The two OK states come last, in this order */
  RANGE_OK_LITERAL = 3
};
2452
/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
the storing of literal values in the parsed pattern: it stores c at *p,
advances p, and sets the okquantifier variable of the enclosing function to
TRUE. In 32-bit mode a value >= META_END is preceded by META_BIGVALUE.

Both variants expand to a single brace-enclosed block, so the macro behaves as
one statement in every code unit width (for example as the body of a braceless
"if"), and the arguments are parenthesized so that expression arguments are
evaluated as a unit. Note that c is evaluated more than once in 32-bit mode,
so it must not have side effects. */

#if PCRE2_CODE_UNIT_WIDTH == 32
#define PARSED_LITERAL(c, p) \
  { \
  if ((c) >= META_END) *(p)++ = META_BIGVALUE; \
  *(p)++ = (c); \
  okquantifier = TRUE; \
  }
#else
#define PARSED_LITERAL(c, p) \
  { \
  *(p)++ = (c); \
  okquantifier = TRUE; \
  }
#endif
2466
2467 /* Here's the actual function. */
2468
parse_regex(PCRE2_SPTR ptr,uint32_t options,BOOL * has_lookbehind,compile_block * cb)2469 static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind,
2470 compile_block *cb)
2471 {
2472 uint32_t c;
2473 uint32_t delimiter;
2474 uint32_t namelen;
2475 uint32_t class_range_state;
2476 uint32_t *verblengthptr = NULL; /* Value avoids compiler warning */
2477 uint32_t *previous_callout = NULL;
2478 uint32_t *parsed_pattern = cb->parsed_pattern;
2479 uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
2480 uint32_t meta_quantifier = 0;
2481 uint32_t add_after_mark = 0;
2482 uint32_t extra_options = cb->cx->extra_options;
2483 uint16_t nest_depth = 0;
2484 int after_manual_callout = 0;
2485 int expect_cond_assert = 0;
2486 int errorcode = 0;
2487 int escape;
2488 int i;
2489 BOOL inescq = FALSE;
2490 BOOL inverbname = FALSE;
2491 BOOL utf = (options & PCRE2_UTF) != 0;
2492 BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
2493 BOOL isdupname;
2494 BOOL negate_class;
2495 BOOL okquantifier = FALSE;
2496 PCRE2_SPTR thisptr;
2497 PCRE2_SPTR name;
2498 PCRE2_SPTR ptrend = cb->end_pattern;
2499 PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */
2500 named_group *ng;
2501 nest_save *top_nest, *end_nests;
2502
2503 /* Insert leading items for word and line matching (features provided for the
2504 benefit of pcre2grep). */
2505
2506 if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
2507 {
2508 *parsed_pattern++ = META_CIRCUMFLEX;
2509 *parsed_pattern++ = META_NOCAPTURE;
2510 }
2511 else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
2512 {
2513 *parsed_pattern++ = META_ESCAPE + ESC_b;
2514 *parsed_pattern++ = META_NOCAPTURE;
2515 }
2516
2517 /* If the pattern is actually a literal string, process it separately to avoid
2518 cluttering up the main loop. */
2519
2520 if ((options & PCRE2_LITERAL) != 0)
2521 {
2522 while (ptr < ptrend)
2523 {
2524 if (parsed_pattern >= parsed_pattern_end)
2525 {
2526 errorcode = ERR63; /* Internal error (parsed pattern overflow) */
2527 goto FAILED;
2528 }
2529 thisptr = ptr;
2530 GETCHARINCTEST(c, ptr);
2531 if (auto_callout)
2532 parsed_pattern = manage_callouts(thisptr, &previous_callout,
2533 auto_callout, parsed_pattern, cb);
2534 PARSED_LITERAL(c, parsed_pattern);
2535 }
2536 goto PARSED_END;
2537 }
2538
2539 /* Process a real regex which may contain meta-characters. */
2540
2541 top_nest = NULL;
2542 end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
2543
2544 /* The size of the nest_save structure might not be a factor of the size of the
2545 workspace. Therefore we must round down end_nests so as to correctly avoid
2546 creating a nest_save that spans the end of the workspace. */
2547
2548 end_nests = (nest_save *)((char *)end_nests -
2549 ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
2550
2551 /* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
2552
2553 if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
2554
2555 /* Now scan the pattern */
2556
2557 while (ptr < ptrend)
2558 {
2559 int prev_expect_cond_assert;
2560 uint32_t min_repeat, max_repeat;
2561 uint32_t set, unset, *optset;
2562 uint32_t terminator;
2563 uint32_t prev_meta_quantifier;
2564 BOOL prev_okquantifier;
2565 PCRE2_SPTR tempptr;
2566 PCRE2_SIZE offset;
2567
2568 if (parsed_pattern >= parsed_pattern_end)
2569 {
2570 errorcode = ERR63; /* Internal error (parsed pattern overflow) */
2571 goto FAILED;
2572 }
2573
2574 if (nest_depth > cb->cx->parens_nest_limit)
2575 {
2576 errorcode = ERR19;
2577 goto FAILED; /* Parentheses too deeply nested */
2578 }
2579
2580 /* Get next input character, save its position for callout handling. */
2581
2582 thisptr = ptr;
2583 GETCHARINCTEST(c, ptr);
2584
2585 /* Copy quoted literals until \E, allowing for the possibility of automatic
2586 callouts, except when processing a (*VERB) "name". */
2587
2588 if (inescq)
2589 {
2590 if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
2591 {
2592 inescq = FALSE;
2593 ptr++; /* Skip E */
2594 }
2595 else
2596 {
2597 if (expect_cond_assert > 0) /* A literal is not allowed if we are */
2598 { /* expecting a conditional assertion, */
2599 ptr--; /* but an empty \Q\E sequence is OK. */
2600 errorcode = ERR28;
2601 goto FAILED;
2602 }
2603 if (!inverbname && after_manual_callout-- <= 0)
2604 parsed_pattern = manage_callouts(thisptr, &previous_callout,
2605 auto_callout, parsed_pattern, cb);
2606 PARSED_LITERAL(c, parsed_pattern);
2607 meta_quantifier = 0;
2608 }
2609 continue; /* Next character */
2610 }
2611
2612 /* If we are processing the "name" part of a (*VERB:NAME) item, all
2613 characters up to the closing parenthesis are literals except when
2614 PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
2615 and \E and escaped characters are allowed (no character types such as \d). If
2616 PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
2617 this by not entering the special (*VERB:NAME) processing - they are then
2618 picked up below. Note that c is a character, not a code unit, so we must not
2619 use MAX_255 to test its size because MAX_255 tests code units and is assumed
2620 TRUE in 8-bit mode. */
2621
2622 if (inverbname &&
2623 (
2624 /* EITHER: not both options set */
2625 ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
2626 (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
2627 #ifdef SUPPORT_UNICODE
2628 /* OR: character > 255 AND not Unicode Pattern White Space */
2629 (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
2630 #endif
2631 /* OR: not a # comment or isspace() white space */
2632 (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
2633 #ifdef SUPPORT_UNICODE
2634 /* and not CHAR_NEL when Unicode is supported */
2635 && c != CHAR_NEL
2636 #endif
2637 )))
2638 {
2639 PCRE2_SIZE verbnamelength;
2640
2641 switch(c)
2642 {
2643 default:
2644 PARSED_LITERAL(c, parsed_pattern);
2645 break;
2646
2647 case CHAR_RIGHT_PARENTHESIS:
2648 inverbname = FALSE;
2649 okquantifier = FALSE; /* Was probably set by literals */
2650 /* This is the length in characters */
2651 verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
2652 /* But the limit on the length is in code units */
2653 if (ptr - verbnamestart - 1 > (int)MAX_MARK)
2654 {
2655 ptr--;
2656 errorcode = ERR76;
2657 goto FAILED;
2658 }
2659 *verblengthptr = (uint32_t)verbnamelength;
2660
2661 /* If this name was on a verb such as (*ACCEPT) which does not continue,
2662 a (*MARK) was generated for the name. We now add the original verb as the
2663 next item. */
2664
2665 if (add_after_mark != 0)
2666 {
2667 *parsed_pattern++ = add_after_mark;
2668 add_after_mark = 0;
2669 }
2670 break;
2671
2672 case CHAR_BACKSLASH:
2673 if ((options & PCRE2_ALT_VERBNAMES) != 0)
2674 {
2675 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
2676 cb->cx->extra_options, FALSE, cb);
2677 if (errorcode != 0) goto FAILED;
2678 }
2679 else escape = 0; /* Treat all as literal */
2680
2681 switch(escape)
2682 {
2683 case 0:
2684 PARSED_LITERAL(c, parsed_pattern);
2685 break;
2686
2687 case ESC_Q:
2688 inescq = TRUE;
2689 break;
2690
2691 case ESC_E: /* Ignore */
2692 break;
2693
2694 default:
2695 errorcode = ERR40; /* Invalid in verb name */
2696 goto FAILED;
2697 }
2698 }
2699 continue; /* Next character in pattern */
2700 }
2701
2702 /* Not a verb name character. At this point we must process everything that
2703 must not change the quantification state. This is mainly comments, but we
2704 handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
2705 A+, as in Perl. An isolated \E is ignored. */
2706
2707 if (c == CHAR_BACKSLASH && ptr < ptrend)
2708 {
2709 if (*ptr == CHAR_Q || *ptr == CHAR_E)
2710 {
2711 inescq = *ptr == CHAR_Q;
2712 ptr++;
2713 continue;
2714 }
2715 }
2716
2717 /* Skip over whitespace and # comments in extended mode. Note that c is a
2718 character, not a code unit, so we must not use MAX_255 to test its size
2719 because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
2720 whitespace characters are those designated as "Pattern White Space" by
2721 Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
2722 U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
2723 subset of space characters that match \h and \v. */
2724
2725 if ((options & PCRE2_EXTENDED) != 0)
2726 {
2727 if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
2728 #ifdef SUPPORT_UNICODE
2729 if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
2730 #endif
2731 if (c == CHAR_NUMBER_SIGN)
2732 {
2733 while (ptr < ptrend)
2734 {
2735 if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */
2736 { /* IS_NEWLINE sets cb->nllen. */
2737 ptr += cb->nllen;
2738 break;
2739 }
2740 ptr++;
2741 #ifdef SUPPORT_UNICODE
2742 if (utf) FORWARDCHARTEST(ptr, ptrend);
2743 #endif
2744 }
2745 continue; /* Next character in pattern */
2746 }
2747 }
2748
2749 /* Skip over bracketed comments */
2750
2751 if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
2752 ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
2753 {
2754 while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
2755 if (ptr >= ptrend)
2756 {
2757 errorcode = ERR18; /* A special error for missing ) in a comment */
2758 goto FAILED; /* to make it easier to debug. */
2759 }
2760 ptr++;
2761 continue; /* Next character in pattern */
2762 }
2763
2764 /* If the next item is not a quantifier, fill in length of any previous
2765 callout and create an auto callout if required. */
2766
2767 if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
2768 (c != CHAR_LEFT_CURLY_BRACKET ||
2769 (tempptr = ptr,
2770 !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
2771 {
2772 if (after_manual_callout-- <= 0)
2773 parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
2774 parsed_pattern, cb);
2775 }
2776
2777 /* If expect_cond_assert is 2, we have just passed (?( and are expecting an
2778 assertion, possibly preceded by a callout. If the value is 1, we have just
2779 had the callout and expect an assertion. There must be at least 3 more
2780 characters in all cases. When expect_cond_assert is 2, we know that the
2781 current character is an opening parenthesis, as otherwise we wouldn't be
2782 here. However, when it is 1, we need to check, and it's easiest just to check
2783 always. Note that expect_cond_assert may be negative, since all callouts just
2784 decrement it. */
2785
2786 if (expect_cond_assert > 0)
2787 {
2788 BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
2789 (ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK);
2790 if (ok)
2791 {
2792 if (ptr[0] == CHAR_ASTERISK) /* New alpha assertion format, possibly */
2793 {
2794 ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;
2795 }
2796 else switch(ptr[1]) /* Traditional symbolic format */
2797 {
2798 case CHAR_C:
2799 ok = expect_cond_assert == 2;
2800 break;
2801
2802 case CHAR_EQUALS_SIGN:
2803 case CHAR_EXCLAMATION_MARK:
2804 break;
2805
2806 case CHAR_LESS_THAN_SIGN:
2807 ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
2808 break;
2809
2810 default:
2811 ok = FALSE;
2812 }
2813 }
2814
2815 if (!ok)
2816 {
2817 ptr--; /* Adjust error offset */
2818 errorcode = ERR28;
2819 goto FAILED;
2820 }
2821 }
2822
2823 /* Remember whether we are expecting a conditional assertion, and set the
2824 default for this item. */
2825
2826 prev_expect_cond_assert = expect_cond_assert;
2827 expect_cond_assert = 0;
2828
2829 /* Remember quantification status for the previous significant item, then set
2830 default for this item. */
2831
2832 prev_okquantifier = okquantifier;
2833 prev_meta_quantifier = meta_quantifier;
2834 okquantifier = FALSE;
2835 meta_quantifier = 0;
2836
2837 /* If the previous significant item was a quantifier, adjust the parsed code
2838 if there is a following modifier. The base meta value is always followed by
2839 the PLUS and QUERY values, in that order. We do this here rather than after
2840 reading a quantifier so that intervening comments and /x whitespace can be
2841 ignored without having to replicate code. */
2842
2843 if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
2844 {
2845 parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
2846 prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
2847 0x00020000u : 0x00010000u);
2848 continue; /* Next character in pattern */
2849 }
2850
2851
2852 /* Process the next item in the main part of a pattern. */
2853
2854 switch(c)
2855 {
2856 default: /* Non-special character */
2857 PARSED_LITERAL(c, parsed_pattern);
2858 break;
2859
2860
2861 /* ---- Escape sequence ---- */
2862
2863 case CHAR_BACKSLASH:
2864 tempptr = ptr;
2865 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
2866 cb->cx->extra_options, FALSE, cb);
2867 if (errorcode != 0)
2868 {
2869 ESCAPE_FAILED:
2870 if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
2871 goto FAILED;
2872 ptr = tempptr;
2873 if (ptr >= ptrend) c = CHAR_BACKSLASH; else
2874 {
2875 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
2876 }
2877 escape = 0; /* Treat as literal character */
2878 }
2879
2880 /* The escape was a data escape or literal character. */
2881
2882 if (escape == 0)
2883 {
2884 PARSED_LITERAL(c, parsed_pattern);
2885 }
2886
2887 /* The escape was a back (or forward) reference. We keep the offset in
2888 order to give a more useful diagnostic for a bad forward reference. For
2889 references to groups numbered less than 10 we can't use more than two items
2890 in parsed_pattern because they may be just two characters in the input (and
2891 in a 64-bit world an offset may need two elements). So for them, the offset
2892 of the first occurrence is held in a special vector. */
2893
2894 else if (escape < 0)
2895 {
2896 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1);
2897 escape = -escape;
2898 *parsed_pattern++ = META_BACKREF | (uint32_t)escape;
2899 if (escape < 10)
2900 {
2901 if (cb->small_ref_offset[escape] == PCRE2_UNSET)
2902 cb->small_ref_offset[escape] = offset;
2903 }
2904 else
2905 {
2906 PUTOFFSET(offset, parsed_pattern);
2907 }
2908 okquantifier = TRUE;
2909 }
2910
2911 /* The escape was a character class such as \d etc. or other special
2912 escape indicator such as \A or \X. Most of them generate just a single
2913 parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
2914 value. They are supported only when Unicode is available. The type and
2915 value are packed into a single 32-bit value so that the whole sequence
2916 uses only two elements in the parsed_vector. This is because the same
2917 coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
2918 set.
2919
2920 There are also some cases where the escape sequence is followed by a name:
2921 \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
2922 and \g'name' are subroutine calls by name; \g{name} is a synonym for
2923 \k{name}. Note that \g<number> and \g'number' are handled by check_escape()
2924 and returned as a negative value (handled above). A name is coded as an
2925 offset into the pattern and a length. */
2926
2927 else switch (escape)
2928 {
2929 case ESC_C:
2930 #ifdef NEVER_BACKSLASH_C
2931 errorcode = ERR85;
2932 goto ESCAPE_FAILED;
2933 #else
2934 if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
2935 {
2936 errorcode = ERR83;
2937 goto ESCAPE_FAILED;
2938 }
2939 #endif
2940 okquantifier = TRUE;
2941 *parsed_pattern++ = META_ESCAPE + escape;
2942 break;
2943
2944 case ESC_X:
2945 #ifndef SUPPORT_UNICODE
2946 errorcode = ERR45; /* Supported only with Unicode support */
2947 goto ESCAPE_FAILED;
2948 #endif
2949 case ESC_H:
2950 case ESC_h:
2951 case ESC_N:
2952 case ESC_R:
2953 case ESC_V:
2954 case ESC_v:
2955 okquantifier = TRUE;
2956 *parsed_pattern++ = META_ESCAPE + escape;
2957 break;
2958
2959 default: /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
2960 *parsed_pattern++ = META_ESCAPE + escape;
2961 break;
2962
2963 /* Escapes that change in UCP mode. Note that PCRE2_UCP will never be set
2964 without Unicode support because it is checked when pcre2_compile() is
2965 called. */
2966
2967 case ESC_d:
2968 case ESC_D:
2969 case ESC_s:
2970 case ESC_S:
2971 case ESC_w:
2972 case ESC_W:
2973 okquantifier = TRUE;
2974 if ((options & PCRE2_UCP) == 0)
2975 {
2976 *parsed_pattern++ = META_ESCAPE + escape;
2977 }
2978 else
2979 {
2980 *parsed_pattern++ = META_ESCAPE +
2981 ((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
2982 ESC_p : ESC_P);
2983 switch(escape)
2984 {
2985 case ESC_d:
2986 case ESC_D:
2987 *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
2988 break;
2989
2990 case ESC_s:
2991 case ESC_S:
2992 *parsed_pattern++ = PT_SPACE << 16;
2993 break;
2994
2995 case ESC_w:
2996 case ESC_W:
2997 *parsed_pattern++ = PT_WORD << 16;
2998 break;
2999 }
3000 }
3001 break;
3002
3003 /* Unicode property matching */
3004
3005 case ESC_P:
3006 case ESC_p:
3007 #ifdef SUPPORT_UNICODE
3008 {
3009 BOOL negated;
3010 uint16_t ptype = 0, pdata = 0;
3011 if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3012 goto ESCAPE_FAILED;
3013 if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3014 *parsed_pattern++ = META_ESCAPE + escape;
3015 *parsed_pattern++ = (ptype << 16) | pdata;
3016 okquantifier = TRUE;
3017 }
3018 #else
3019 errorcode = ERR45;
3020 goto ESCAPE_FAILED;
3021 #endif
3022 break; /* End \P and \p */
3023
3024 /* When \g is used with quotes or angle brackets as delimiters, it is a
3025 numerical or named subroutine call, and control comes here. When used
3026 with brace delimiters it is a numerical back reference and does not come
3027 here because check_escape() returns it directly as a reference. \k is
3028 always a named back reference. */
3029
3030 case ESC_g:
3031 case ESC_k:
3032 if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
3033 *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
3034 {
3035 errorcode = (escape == ESC_g)? ERR57 : ERR69;
3036 goto ESCAPE_FAILED;
3037 }
3038 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
3039 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
3040 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
3041
3042 /* For a non-braced \g, check for a numerical recursion. */
3043
3044 if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
3045 {
3046 PCRE2_SPTR p = ptr + 1;
3047
3048 if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3049 &errorcode))
3050 {
3051 if (p >= ptrend || *p != terminator)
3052 {
3053 errorcode = ERR57;
3054 goto ESCAPE_FAILED;
3055 }
3056 ptr = p;
3057 goto SET_RECURSION;
3058 }
3059 if (errorcode != 0) goto ESCAPE_FAILED;
3060 }
3061
3062 /* Not a numerical recursion */
3063
3064 if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
3065 &errorcode, cb)) goto ESCAPE_FAILED;
3066
3067 /* \k and \g when used with braces are back references, whereas \g used
3068 with quotes or angle brackets is a recursion */
3069
3070 *parsed_pattern++ =
3071 (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
3072 META_BACKREF_BYNAME : META_RECURSE_BYNAME;
3073 *parsed_pattern++ = namelen;
3074
3075 PUTOFFSET(offset, parsed_pattern);
3076 okquantifier = TRUE;
3077 break; /* End special escape processing */
3078 }
3079 break; /* End escape sequence processing */
3080
3081
3082 /* ---- Single-character special items ---- */
3083
3084 case CHAR_CIRCUMFLEX_ACCENT:
3085 *parsed_pattern++ = META_CIRCUMFLEX;
3086 break;
3087
3088 case CHAR_DOLLAR_SIGN:
3089 *parsed_pattern++ = META_DOLLAR;
3090 break;
3091
3092 case CHAR_DOT:
3093 *parsed_pattern++ = META_DOT;
3094 okquantifier = TRUE;
3095 break;
3096
3097
3098 /* ---- Single-character quantifiers ---- */
3099
3100 case CHAR_ASTERISK:
3101 meta_quantifier = META_ASTERISK;
3102 goto CHECK_QUANTIFIER;
3103
3104 case CHAR_PLUS:
3105 meta_quantifier = META_PLUS;
3106 goto CHECK_QUANTIFIER;
3107
3108 case CHAR_QUESTION_MARK:
3109 meta_quantifier = META_QUERY;
3110 goto CHECK_QUANTIFIER;
3111
3112
3113 /* ---- Potential {n,m} quantifier ---- */
3114
3115 case CHAR_LEFT_CURLY_BRACKET:
3116 if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
3117 &errorcode))
3118 {
3119 if (errorcode != 0) goto FAILED; /* Error in quantifier. */
3120 PARSED_LITERAL(c, parsed_pattern); /* Not a quantifier */
3121 break; /* No more quantifier processing */
3122 }
3123 meta_quantifier = META_MINMAX;
3124 /* Fall through */
3125
3126
3127 /* ---- Quantifier post-processing ---- */
3128
3129 /* Check that a quantifier is allowed after the previous item. */
3130
3131 CHECK_QUANTIFIER:
3132 if (!prev_okquantifier)
3133 {
3134 errorcode = ERR9;
3135 goto FAILED_BACK;
3136 }
3137
3138 /* Now we can put the quantifier into the parsed pattern vector. At this
3139 stage, we have only the basic quantifier. The check for a following + or ?
3140 modifier happens at the top of the loop, after any intervening comments
3141 have been removed. */
3142
3143 *parsed_pattern++ = meta_quantifier;
3144 if (c == CHAR_LEFT_CURLY_BRACKET)
3145 {
3146 *parsed_pattern++ = min_repeat;
3147 *parsed_pattern++ = max_repeat;
3148 }
3149 break;
3150
3151
3152 /* ---- Character class ---- */
3153
3154 case CHAR_LEFT_SQUARE_BRACKET:
3155 okquantifier = TRUE;
3156
3157 /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
3158 used for "start of word" and "end of word". As these are otherwise illegal
3159 sequences, we don't break anything by recognizing them. They are replaced
3160 by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3161 erroneous and are handled by the normal code below. */
3162
3163 if (ptrend - ptr >= 6 &&
3164 (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
3165 PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
3166 {
3167 *parsed_pattern++ = META_ESCAPE + ESC_b;
3168
3169 if (ptr[2] == CHAR_LESS_THAN_SIGN)
3170 {
3171 *parsed_pattern++ = META_LOOKAHEAD;
3172 }
3173 else
3174 {
3175 *parsed_pattern++ = META_LOOKBEHIND;
3176 *has_lookbehind = TRUE;
3177
3178 /* The offset is used only for the "non-fixed length" error; this won't
3179 occur here, so just store zero. */
3180
3181 PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
3182 }
3183
3184 if ((options & PCRE2_UCP) == 0)
3185 *parsed_pattern++ = META_ESCAPE + ESC_w;
3186 else
3187 {
3188 *parsed_pattern++ = META_ESCAPE + ESC_p;
3189 *parsed_pattern++ = PT_WORD << 16;
3190 }
3191 *parsed_pattern++ = META_KET;
3192 ptr += 6;
3193 break;
3194 }
3195
3196 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3197 they are encountered at the top level, so we'll do that too. */
3198
3199 if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3200 *ptr == CHAR_EQUALS_SIGN) &&
3201 check_posix_syntax(ptr, ptrend, &tempptr))
3202 {
3203 errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3204 goto FAILED;
3205 }
3206
3207 /* Process a regular character class. If the first character is '^', set
3208 the negation flag. If the first few characters (either before or after ^)
3209 are \Q\E or \E or space or tab in extended-more mode, we skip them too.
3210 This makes for compatibility with Perl. */
3211
3212 negate_class = FALSE;
3213 while (ptr < ptrend)
3214 {
3215 GETCHARINCTEST(c, ptr);
3216 if (c == CHAR_BACKSLASH)
3217 {
3218 if (ptr < ptrend && *ptr == CHAR_E) ptr++;
3219 else if (ptrend - ptr >= 3 &&
3220 PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3221 ptr += 3;
3222 else
3223 break;
3224 }
3225 else if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3226 (c == CHAR_SPACE || c == CHAR_HT)) /* Note: just these two */
3227 continue;
3228 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3229 negate_class = TRUE;
3230 else break;
3231 }
3232
3233 /* Now the real contents of the class; c has the first "real" character.
3234 Empty classes are permitted only if the option is set. */
3235
3236 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3237 (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
3238 {
3239 *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
3240 break; /* End of class processing */
3241 }
3242
3243 /* Process a non-empty class. */
3244
3245 *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
3246 class_range_state = RANGE_NO;
3247
3248 /* In an EBCDIC environment, Perl treats alphabetic ranges specially
3249 because there are holes in the encoding, and simply using the range A-Z
3250 (for example) would include the characters in the holes. This applies only
3251 to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3252 in this respect. In order to accommodate this, we keep track of whether
3253 character values are literal or not, and a state variable for handling
3254 ranges. */
3255
3256 /* Loop for the contents of the class */
3257
3258 for (;;)
3259 {
3260 BOOL char_is_literal = TRUE;
3261
3262 /* Inside \Q...\E everything is literal except \E */
3263
3264 if (inescq)
3265 {
3266 if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3267 {
3268 inescq = FALSE; /* Reset literal state */
3269 ptr++; /* Skip the 'E' */
3270 goto CLASS_CONTINUE;
3271 }
3272 goto CLASS_LITERAL;
3273 }
3274
3275 /* Skip over space and tab (only) in extended-more mode. */
3276
3277 if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3278 (c == CHAR_SPACE || c == CHAR_HT))
3279 goto CLASS_CONTINUE;
3280
3281 /* Handle POSIX class names. Perl allows a negation extension of the
3282 form [:^name:]. A square bracket that doesn't match the syntax is
3283 treated as a literal. We also recognize the POSIX constructions
3284 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3285 5.6 and 5.8 do. */
3286
3287 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3288 ptrend - ptr >= 3 &&
3289 (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3290 *ptr == CHAR_EQUALS_SIGN) &&
3291 check_posix_syntax(ptr, ptrend, &tempptr))
3292 {
3293 BOOL posix_negate = FALSE;
3294 int posix_class;
3295
3296 /* Perl treats a hyphen before a POSIX class as a literal, not the
3297 start of a range. However, it gives a warning in its warning mode. PCRE
3298 does not have a warning mode, so we give an error, because this is
3299 likely an error on the user's part. */
3300
3301 if (class_range_state == RANGE_STARTED)
3302 {
3303 errorcode = ERR50;
3304 goto FAILED;
3305 }
3306
3307 if (*ptr != CHAR_COLON)
3308 {
3309 errorcode = ERR13;
3310 goto FAILED_BACK;
3311 }
3312
3313 if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
3314 {
3315 posix_negate = TRUE;
3316 ptr++;
3317 }
3318
3319 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3320 if (posix_class < 0)
3321 {
3322 errorcode = ERR30;
3323 goto FAILED;
3324 }
3325 ptr = tempptr + 2;
3326
3327 /* Perl treats a hyphen after a POSIX class as a literal, not the
3328 start of a range. However, it gives a warning in its warning mode
3329 unless the hyphen is the last character in the class. PCRE does not
3330 have a warning mode, so we give an error, because this is likely an
3331 error on the user's part. */
3332
3333 if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3334 ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3335 {
3336 errorcode = ERR50;
3337 goto FAILED;
3338 }
3339
3340 /* Set "a hyphen is not the start of a range" for the -] case, and also
3341 in case the POSIX class is followed by \E or \Q\E (possibly repeated -
3342 fuzzers do that kind of thing) and *then* a hyphen. This causes that
3343 hyphen to be treated as a literal. I don't think it's worth setting up
3344 special apparatus to do otherwise. */
3345
3346 class_range_state = RANGE_NO;
3347
3348 /* When PCRE2_UCP is set, some of the POSIX classes are converted to
3349 use Unicode properties \p or \P or, in one case, \h or \H. The
3350 substitutes table has two values per class, containing the type and
3351 value of a \p or \P item. The special cases are specified with a
3352 negative type: a non-zero value causes \h or \H to be used, and a zero
3353 value falls through to behave like a non-UCP POSIX class. */
3354
3355 #ifdef SUPPORT_UNICODE
3356 if ((options & PCRE2_UCP) != 0)
3357 {
3358 int ptype = posix_substitutes[2*posix_class];
3359 int pvalue = posix_substitutes[2*posix_class + 1];
3360 if (ptype >= 0)
3361 {
3362 *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
3363 *parsed_pattern++ = (ptype << 16) | pvalue;
3364 goto CLASS_CONTINUE;
3365 }
3366
3367 if (pvalue != 0)
3368 {
3369 *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
3370 goto CLASS_CONTINUE;
3371 }
3372
3373 /* Fall through */
3374 }
3375 #endif /* SUPPORT_UNICODE */
3376
3377 /* Non-UCP POSIX class */
3378
3379 *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
3380 *parsed_pattern++ = posix_class;
3381 }
3382
3383 /* Handle potential start of range */
3384
3385 else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
3386 {
3387 *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
3388 META_RANGE_LITERAL : META_RANGE_ESCAPED;
3389 class_range_state = RANGE_STARTED;
3390 }
3391
3392 /* Handle a literal character */
3393
3394 else if (c != CHAR_BACKSLASH)
3395 {
3396 CLASS_LITERAL:
3397 if (class_range_state == RANGE_STARTED)
3398 {
3399 if (c == parsed_pattern[-2]) /* Optimize one-char range */
3400 parsed_pattern--;
3401 else if (parsed_pattern[-2] > c) /* Check range is in order */
3402 {
3403 errorcode = ERR8;
3404 goto FAILED_BACK;
3405 }
3406 else
3407 {
3408 if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
3409 parsed_pattern[-1] = META_RANGE_ESCAPED;
3410 PARSED_LITERAL(c, parsed_pattern);
3411 }
3412 class_range_state = RANGE_NO;
3413 }
3414 else /* Potential start of range */
3415 {
3416 class_range_state = char_is_literal?
3417 RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
3418 PARSED_LITERAL(c, parsed_pattern);
3419 }
3420 }
3421
3422 /* Handle escapes in a class */
3423
3424 else
3425 {
3426 tempptr = ptr;
3427 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3428 cb->cx->extra_options, TRUE, cb);
3429
3430 if (errorcode != 0)
3431 {
3432 if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3433 goto FAILED;
3434 ptr = tempptr;
3435 if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3436 {
3437 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
3438 }
3439 escape = 0; /* Treat as literal character */
3440 }
3441
3442 switch(escape)
3443 {
3444 case 0: /* Escaped character code point is in c */
3445 char_is_literal = FALSE;
3446 goto CLASS_LITERAL;
3447
3448 case ESC_b:
3449 c = CHAR_BS; /* \b is backspace in a class */
3450 char_is_literal = FALSE;
3451 goto CLASS_LITERAL;
3452
3453 case ESC_Q:
3454 inescq = TRUE; /* Enter literal mode */
3455 goto CLASS_CONTINUE;
3456
3457 case ESC_E: /* Ignore orphan \E */
3458 goto CLASS_CONTINUE;
3459
3460 case ESC_B: /* Always an error in a class */
3461 case ESC_R:
3462 case ESC_X:
3463 errorcode = ERR7;
3464 ptr--;
3465 goto FAILED;
3466 }
3467
3468 /* The second part of a range can be a single-character escape
3469 sequence (detected above), but not any of the other escapes. Perl
3470 treats a hyphen as a literal in such circumstances. However, in Perl's
3471 warning mode, a warning is given, so PCRE now faults it, as it is
3472 almost certainly a mistake on the user's part. */
3473
3474 if (class_range_state == RANGE_STARTED)
3475 {
3476 errorcode = ERR50;
3477 goto FAILED; /* Not CLASS_ESCAPE_FAILED; always an error */
3478 }
3479
3480 /* Of the remaining escapes, only those that define characters are
3481 allowed in a class. None may start a range. */
3482
3483 class_range_state = RANGE_NO;
3484 switch(escape)
3485 {
3486 case ESC_N:
3487 errorcode = ERR71;
3488 goto FAILED;
3489
3490 case ESC_H:
3491 case ESC_h:
3492 case ESC_V:
3493 case ESC_v:
3494 *parsed_pattern++ = META_ESCAPE + escape;
3495 break;
3496
3497 /* These escapes are converted to Unicode property tests when
3498 PCRE2_UCP is set. */
3499
3500 case ESC_d:
3501 case ESC_D:
3502 case ESC_s:
3503 case ESC_S:
3504 case ESC_w:
3505 case ESC_W:
3506 if ((options & PCRE2_UCP) == 0)
3507 {
3508 *parsed_pattern++ = META_ESCAPE + escape;
3509 }
3510 else
3511 {
3512 *parsed_pattern++ = META_ESCAPE +
3513 ((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
3514 ESC_p : ESC_P);
3515 switch(escape)
3516 {
3517 case ESC_d:
3518 case ESC_D:
3519 *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
3520 break;
3521
3522 case ESC_s:
3523 case ESC_S:
3524 *parsed_pattern++ = PT_SPACE << 16;
3525 break;
3526
3527 case ESC_w:
3528 case ESC_W:
3529 *parsed_pattern++ = PT_WORD << 16;
3530 break;
3531 }
3532 }
3533 break;
3534
3535 /* Explicit Unicode property matching */
3536
3537 case ESC_P:
3538 case ESC_p:
3539 #ifdef SUPPORT_UNICODE
3540 {
3541 BOOL negated;
3542 uint16_t ptype = 0, pdata = 0;
3543 if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3544 goto FAILED;
3545 if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3546 *parsed_pattern++ = META_ESCAPE + escape;
3547 *parsed_pattern++ = (ptype << 16) | pdata;
3548 }
3549 #else
3550 errorcode = ERR45;
3551 goto FAILED;
3552 #endif
3553 break; /* End \P and \p */
3554
3555 default: /* All others are not allowed in a class */
3556 errorcode = ERR7;
3557 ptr--;
3558 goto FAILED;
3559 }
3560
3561 /* Perl gives a warning unless a following hyphen is the last character
3562 in the class. PCRE throws an error. */
3563
3564 if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3565 ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3566 {
3567 errorcode = ERR50;
3568 goto FAILED;
3569 }
3570 }
3571
3572 /* Proceed to next thing in the class. */
3573
3574 CLASS_CONTINUE:
3575 if (ptr >= ptrend)
3576 {
3577 errorcode = ERR6; /* Missing terminating ']' */
3578 goto FAILED;
3579 }
3580 GETCHARINCTEST(c, ptr);
3581 if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
3582 } /* End of class-processing loop */
3583
3584 if (class_range_state == RANGE_STARTED)
3585 {
3586 parsed_pattern[-1] = CHAR_MINUS;
3587 class_range_state = RANGE_NO;
3588 }
3589
3590 *parsed_pattern++ = META_CLASS_END;
3591 break; /* End of character class */
3592
3593
3594 /* ---- Opening parenthesis ---- */
3595
3596 case CHAR_LEFT_PARENTHESIS:
3597 if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3598
3599 /* If ( is not followed by ? it is either a capture or a special verb or an
3600 alpha assertion. */
3601
3602 if (*ptr != CHAR_QUESTION_MARK)
3603 {
3604 const char *vn;
3605
3606 /* Handle capturing brackets (or non-capturing if auto-capture is turned
3607 off). */
3608
3609 if (*ptr != CHAR_ASTERISK)
3610 {
3611 nest_depth++;
3612 if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
3613 {
3614 cb->bracount++;
3615 *parsed_pattern++ = META_CAPTURE | cb->bracount;
3616 }
3617 else *parsed_pattern++ = META_NOCAPTURE;
3618 }
3619
3620 /* Do nothing for (* followed by end of pattern or ) so it gives a "bad
3621 quantifier" error rather than "(*MARK) must have an argument". */
3622
3623 else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)
3624 break;
3625
3626 /* Handle "alpha assertions" such as (*pla:...). Most of these are
3627 synonyms for the historical symbolic assertions, but the script run ones
3628 are new. They are distinguished by starting with a lower case letter.
3629 The test uses the ctypes table (ctype_lcletter) rather than comparing against
3630 the ends of the alphabet, so it does not depend on the character code. */
3631
3632 else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
3633 {
3634 uint32_t meta;
3635
3636 vn = alasnames;
3637 if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
3638 &errorcode, cb)) goto FAILED;
3639 if (ptr >= ptrend || *ptr != CHAR_COLON)
3640 {
3641 errorcode = ERR95; /* Malformed */
3642 goto FAILED;
3643 }
3644
3645 /* Scan the table of alpha assertion names */
3646
3647 for (i = 0; i < alascount; i++)
3648 {
3649 if (namelen == alasmeta[i].len &&
3650 PRIV(strncmp_c8)(name, vn, namelen) == 0)
3651 break;
3652 vn += alasmeta[i].len + 1;
3653 }
3654
3655 if (i >= alascount)
3656 {
3657 errorcode = ERR95; /* Alpha assertion not recognized */
3658 goto FAILED;
3659 }
3660
3661 /* Check for expecting an assertion condition. If so, only lookaround
3662 assertions are valid. */
3663
3664 meta = alasmeta[i].meta;
3665 if (prev_expect_cond_assert > 0 &&
3666 (meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))
3667 {
3668 errorcode = ERR28; /* Assertion expected */
3669 goto FAILED;
3670 }
3671
3672 /* The lookaround alphabetic synonyms can be almost entirely handled by
3673 jumping to the code that handles the traditional symbolic forms. */
3674
3675 switch(meta)
3676 {
3677 default:
3678 errorcode = ERR89; /* Unknown code; should never occur because */
3679 goto FAILED; /* the meta values come from a table above. */
3680
3681 case META_ATOMIC:
3682 goto ATOMIC_GROUP;
3683
3684 case META_LOOKAHEAD:
3685 goto POSITIVE_LOOK_AHEAD;
3686
3687 case META_LOOKAHEADNOT:
3688 goto NEGATIVE_LOOK_AHEAD;
3689
3690 case META_LOOKBEHIND:
3691 case META_LOOKBEHINDNOT:
3692 *parsed_pattern++ = meta;
3693 ptr--;
3694 goto POST_LOOKBEHIND;
3695
3696 /* The script run facilities are handled here. Unicode support is
3697 required (give an error if not, as this is a security issue). Always
3698 record a META_SCRIPT_RUN item. Then, for the atomic version, insert
3699 META_ATOMIC and remember that we need two META_KETs at the end. */
3700
3701 case META_SCRIPT_RUN:
3702 case META_ATOMIC_SCRIPT_RUN:
3703 #ifdef SUPPORT_UNICODE
3704 *parsed_pattern++ = META_SCRIPT_RUN;
3705 nest_depth++;
3706 ptr++;
3707 if (meta == META_ATOMIC_SCRIPT_RUN)
3708 {
3709 *parsed_pattern++ = META_ATOMIC;
3710 if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
3711 else if (++top_nest >= end_nests)
3712 {
3713 errorcode = ERR84;
3714 goto FAILED;
3715 }
3716 top_nest->nest_depth = nest_depth;
3717 top_nest->flags = NSF_ATOMICSR;
3718 top_nest->options = options & PARSE_TRACKED_OPTIONS;
3719 }
3720 break;
3721 #else /* SUPPORT_UNICODE */
3722 errorcode = ERR96;
3723 goto FAILED;
3724 #endif
3725 }
3726 }
3727
3728
3729 /* ---- Handle (*VERB) and (*VERB:NAME) ---- */
3730
3731 else
3732 {
3733 vn = verbnames;
3734 if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
3735 &errorcode, cb)) goto FAILED;
3736 if (ptr >= ptrend || (*ptr != CHAR_COLON &&
3737 *ptr != CHAR_RIGHT_PARENTHESIS))
3738 {
3739 errorcode = ERR60; /* Malformed */
3740 goto FAILED;
3741 }
3742
3743 /* Scan the table of verb names */
3744
3745 for (i = 0; i < verbcount; i++)
3746 {
3747 if (namelen == verbs[i].len &&
3748 PRIV(strncmp_c8)(name, vn, namelen) == 0)
3749 break;
3750 vn += verbs[i].len + 1;
3751 }
3752
3753 if (i >= verbcount)
3754 {
3755 errorcode = ERR60; /* Verb not recognized */
3756 goto FAILED;
3757 }
3758
3759 /* An empty argument is treated as no argument. */
3760
3761 if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
3762 ptr[1] == CHAR_RIGHT_PARENTHESIS)
3763 ptr++; /* Advance to the closing parens */
3764
3765 /* Check for mandatory non-empty argument; this is (*MARK) */
3766
3767 if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
3768 {
3769 errorcode = ERR66;
3770 goto FAILED;
3771 }
3772
3773 /* It appears that Perl allows any characters whatsoever, other than a
3774 closing parenthesis, to appear in arguments ("names"), so we no longer
3775 insist on letters, digits, and underscores. Perl does not, however, do
3776 any interpretation within arguments, and has no means of including a
3777 closing parenthesis. PCRE supports escape processing but only when it
3778 is requested by an option. We set inverbname TRUE here, and let the
3779 main loop take care of this so that escape and \x processing is done by
3780 the main code above. */
3781
3782 if (*ptr++ == CHAR_COLON) /* Skip past : or ) */
3783 {
3784 /* Some optional arguments can be treated as a preceding (*MARK) */
3785
3786 if (verbs[i].has_arg < 0)
3787 {
3788 add_after_mark = verbs[i].meta;
3789 *parsed_pattern++ = META_MARK;
3790 }
3791
3792 /* The remaining verbs with arguments (except *MARK) need a different
3793 opcode. */
3794
3795 else
3796 {
3797 *parsed_pattern++ = verbs[i].meta +
3798 ((verbs[i].meta != META_MARK)? 0x00010000u:0);
3799 }
3800
3801 /* Set up for reading the name in the main loop. */
3802
3803 verblengthptr = parsed_pattern++;
3804 verbnamestart = ptr;
3805 inverbname = TRUE;
3806 }
3807 else /* No verb "name" argument */
3808 {
3809 *parsed_pattern++ = verbs[i].meta;
3810 }
3811 } /* End of (*VERB) handling */
3812 break; /* Done with this parenthesis */
3813 } /* End of groups that don't start with (? */
3814
3815
3816 /* ---- Items starting (? ---- */
3817
3818 /* The type of item is determined by what follows (?. Handle (?| and option
3819 changes under "default" because both need a new block on the nest stack.
3820 Comments starting with (?# are handled above. Note that there is some
3821 ambiguity about the sequence (?- because if a digit follows it's a relative
3822 recursion or subroutine call whereas otherwise it's an option unsetting. */
3823
3824 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3825
3826 switch(*ptr)
3827 {
3828 default:
3829 if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
3830 goto RECURSION_BYNUMBER; /* The + case is handled by CHAR_PLUS */
3831
3832 /* We now have either (?| or a (possibly empty) option setting,
3833 optionally followed by a non-capturing group. */
3834
3835 nest_depth++;
3836 if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
3837 else if (++top_nest >= end_nests)
3838 {
3839 errorcode = ERR84;
3840 goto FAILED;
3841 }
3842 top_nest->nest_depth = nest_depth;
3843 top_nest->flags = 0;
3844 top_nest->options = options & PARSE_TRACKED_OPTIONS;
3845
3846 /* Start of non-capturing group that resets the capture count for each
3847 branch. */
3848
3849 if (*ptr == CHAR_VERTICAL_LINE)
3850 {
3851 top_nest->reset_group = (uint16_t)cb->bracount;
3852 top_nest->max_group = (uint16_t)cb->bracount;
3853 top_nest->flags |= NSF_RESET;
3854 cb->external_flags |= PCRE2_DUPCAPUSED;
3855 *parsed_pattern++ = META_NOCAPTURE;
3856 ptr++;
3857 }
3858
3859 /* Scan for options imnsxJU to be set or unset. */
3860
3861 else
3862 {
3863 BOOL hyphenok = TRUE;
3864 uint32_t oldoptions = options;
3865
3866 top_nest->reset_group = 0;
3867 top_nest->max_group = 0;
3868 set = unset = 0;
3869 optset = &set;
3870
3871 /* ^ at the start unsets imnsx and disables the subsequent use of - */
3872
3873 if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
3874 {
3875 options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
3876 PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
3877 hyphenok = FALSE;
3878 ptr++;
3879 }
3880
3881 while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
3882 *ptr != CHAR_COLON)
3883 {
3884 switch (*ptr++)
3885 {
3886 case CHAR_MINUS:
3887 if (!hyphenok)
3888 {
3889 errorcode = ERR94;
3890 ptr--; /* Correct the offset */
3891 goto FAILED;
3892 }
3893 optset = &unset;
3894 hyphenok = FALSE;
3895 break;
3896
3897 case CHAR_J: /* Record that it changed in the external options */
3898 *optset |= PCRE2_DUPNAMES;
3899 cb->external_flags |= PCRE2_JCHANGED;
3900 break;
3901
3902 case CHAR_i: *optset |= PCRE2_CASELESS; break;
3903 case CHAR_m: *optset |= PCRE2_MULTILINE; break;
3904 case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
3905 case CHAR_s: *optset |= PCRE2_DOTALL; break;
3906 case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
3907
3908 /* If x appears twice it sets the extended extended option. */
3909
3910 case CHAR_x:
3911 *optset |= PCRE2_EXTENDED;
3912 if (ptr < ptrend && *ptr == CHAR_x)
3913 {
3914 *optset |= PCRE2_EXTENDED_MORE;
3915 ptr++;
3916 }
3917 break;
3918
3919 default:
3920 errorcode = ERR11;
3921 ptr--; /* Correct the offset */
3922 goto FAILED;
3923 }
3924 }
3925
3926 /* If we are setting extended without extended-more, ensure that any
3927 existing extended-more gets unset. Also, unsetting extended must also
3928 unset extended-more. */
3929
3930 if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
3931 (unset & PCRE2_EXTENDED) != 0)
3932 unset |= PCRE2_EXTENDED_MORE;
3933
3934 options = (options | set) & (~unset);
3935
3936 /* If the options ended with ')' this is not the start of a nested
3937 group with option changes, so the options change at this level.
3938 In this case, if the previous level set up a nest block, discard the
3939 one we have just created. Otherwise adjust it for the previous level.
3940 If the options ended with ':' we are starting a non-capturing group,
3941 possibly with an options setting. */
3942
3943 if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3944 if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
3945 {
3946 nest_depth--; /* This is not a nested group after all. */
3947 if (top_nest > (nest_save *)(cb->start_workspace) &&
3948 (top_nest-1)->nest_depth == nest_depth) top_nest--;
3949 else top_nest->nest_depth = nest_depth;
3950 }
3951 else *parsed_pattern++ = META_NOCAPTURE;
3952
3953 /* If nothing changed, no need to record. */
3954
3955 if (options != oldoptions)
3956 {
3957 *parsed_pattern++ = META_OPTIONS;
3958 *parsed_pattern++ = options;
3959 }
3960 } /* End options processing */
3961 break; /* End default case after (? */
3962
3963
3964 /* ---- Python syntax support ---- */
3965
3966 case CHAR_P:
3967 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3968
3969 /* (?P<name> is the same as (?<name>, which defines a named group. */
3970
3971 if (*ptr == CHAR_LESS_THAN_SIGN)
3972 {
3973 terminator = CHAR_GREATER_THAN_SIGN;
3974 goto DEFINE_NAME;
3975 }
3976
3977 /* (?P>name) is the same as (?&name), which is a recursion or subroutine
3978 call. */
3979
3980 if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
3981
3982 /* (?P=name) is the same as \k<name>, a back reference by name. Anything
3983 else after (?P is an error. */
3984
3985 if (*ptr != CHAR_EQUALS_SIGN)
3986 {
3987 errorcode = ERR41;
3988 goto FAILED;
3989 }
3990 if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
3991 &namelen, &errorcode, cb)) goto FAILED;
3992 *parsed_pattern++ = META_BACKREF_BYNAME;
3993 *parsed_pattern++ = namelen;
3994 PUTOFFSET(offset, parsed_pattern);
3995 okquantifier = TRUE;
3996 break; /* End of (?P processing */
3997
3998
3999 /* ---- Recursion/subroutine calls by number ---- */
4000
4001 case CHAR_R:
4002 i = 0; /* (?R) == (?R0) */
4003 ptr++;
4004 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4005 {
4006 errorcode = ERR58;
4007 goto FAILED;
4008 }
4009 goto SET_RECURSION;
4010
4011 /* An item starting (?- followed by a digit comes here via the "default"
4012 case because (?- followed by a non-digit is an options setting. */
4013
4014 case CHAR_PLUS:
4015 if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1]))
4016 {
4017 errorcode = ERR29; /* Missing number */
4018 goto FAILED;
4019 }
4020 /* Fall through */
4021
4022 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
4023 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
4024 RECURSION_BYNUMBER:
4025 if (!read_number(&ptr, ptrend,
4026 (IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
4027 MAX_GROUP_NUMBER, ERR61,
4028 &i, &errorcode)) goto FAILED;
4029 if (i < 0) /* NB (?0) is permitted */
4030 {
4031 errorcode = ERR15; /* Unknown group */
4032 goto FAILED_BACK;
4033 }
4034 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4035 goto UNCLOSED_PARENTHESIS;
4036
4037 SET_RECURSION:
4038 *parsed_pattern++ = META_RECURSE | (uint32_t)i;
4039 offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4040 ptr++;
4041 PUTOFFSET(offset, parsed_pattern);
4042 okquantifier = TRUE;
4043 break; /* End of recursive call by number handling */
4044
4045
4046 /* ---- Recursion/subroutine calls by name ---- */
4047
4048 case CHAR_AMPERSAND:
4049 RECURSE_BY_NAME:
4050 if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4051 &namelen, &errorcode, cb)) goto FAILED;
4052 *parsed_pattern++ = META_RECURSE_BYNAME;
4053 *parsed_pattern++ = namelen;
4054 PUTOFFSET(offset, parsed_pattern);
4055 okquantifier = TRUE;
4056 break;
4057
4058 /* ---- Callout with numerical or string argument ---- */
4059
4060 case CHAR_C:
4061 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4062
4063 /* If the previous item was a condition starting (?(? an assertion,
4064 optionally preceded by a callout, is expected. This is checked later on,
4065 during actual compilation. However we need to identify this kind of
4066 assertion in this pass because it must not be qualified. The value of
4067 expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4068 for a callout - still leaving a positive value that identifies the
4069 assertion. Multiple callouts or any other items will make it zero or
4070 less, which doesn't matter because they will cause an error later. */
4071
4072 expect_cond_assert = prev_expect_cond_assert - 1;
4073
4074 /* If previous_callout is not NULL, it means this follows a previous
4075 callout. If it was a manual callout, do nothing; this means its "length
4076 of next pattern item" field will remain zero. If it was an automatic
4077 callout, abolish it. */
4078
4079 if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
4080 previous_callout == parsed_pattern - 4 &&
4081 parsed_pattern[-1] == 255)
4082 parsed_pattern = previous_callout;
4083
4084 /* Save for updating next pattern item length, and skip one item before
4085 completing. */
4086
4087 previous_callout = parsed_pattern;
4088 after_manual_callout = 1;
4089
4090 /* Handle a string argument; specific delimiter is required. */
4091
4092 if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
4093 {
4094 PCRE2_SIZE calloutlength;
4095 PCRE2_SPTR startptr = ptr;
4096
4097 delimiter = 0;
4098 for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
4099 {
4100 if (*ptr == PRIV(callout_start_delims)[i])
4101 {
4102 delimiter = PRIV(callout_end_delims)[i];
4103 break;
4104 }
4105 }
4106 if (delimiter == 0)
4107 {
4108 errorcode = ERR82;
4109 goto FAILED;
4110 }
4111
4112 *parsed_pattern = META_CALLOUT_STRING;
4113 parsed_pattern += 3; /* Skip pattern info */
4114
4115 for (;;)
4116 {
4117 if (++ptr >= ptrend)
4118 {
4119 errorcode = ERR81;
4120 ptr = startptr; /* To give a more useful message */
4121 goto FAILED;
4122 }
4123 if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
4124 break;
4125 }
4126
4127 calloutlength = (PCRE2_SIZE)(ptr - startptr);
4128 if (calloutlength > UINT32_MAX)
4129 {
4130 errorcode = ERR72;
4131 goto FAILED;
4132 }
4133 *parsed_pattern++ = (uint32_t)calloutlength;
4134 offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
4135 PUTOFFSET(offset, parsed_pattern);
4136 }
4137
4138 /* Handle a callout with an optional numerical argument, which must be
4139 less than or equal to 255. A missing argument gives 0. */
4140
4141 else
4142 {
4143 int n = 0;
4144 *parsed_pattern = META_CALLOUT_NUMBER; /* Numerical callout */
4145 parsed_pattern += 3; /* Skip pattern info */
4146 while (ptr < ptrend && IS_DIGIT(*ptr))
4147 {
4148 n = n * 10 + *ptr++ - CHAR_0;
4149 if (n > 255)
4150 {
4151 errorcode = ERR38;
4152 goto FAILED;
4153 }
4154 }
4155 *parsed_pattern++ = n;
4156 }
4157
4158 /* Both formats must have a closing parenthesis */
4159
4160 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4161 {
4162 errorcode = ERR39;
4163 goto FAILED;
4164 }
4165 ptr++;
4166
4167 /* Remember the offset to the next item in the pattern, and set a default
4168 length. This should get updated after the next item is read. */
4169
4170 previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
4171 previous_callout[2] = 0;
4172 break; /* End callout */
4173
4174
4175 /* ---- Conditional group ---- */
4176
4177 /* A condition can be an assertion, a number (referring to a numbered
4178 group's having been set), a name (referring to a named group), or 'R',
4179 referring to overall recursion. R<digits> and R&name are also permitted
4180 for recursion state tests. Numbers may be preceded by + or - to specify a
4181 relative group number.
4182
4183 There are several syntaxes for testing a named group: (?(name)) is used
4184 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4185
4186 There are two unfortunate ambiguities. 'R' can be the recursive thing or
4187 the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
4188 the Perl DEFINE feature or the Python named test. We look for a name
4189 first; if not found, we try the other case.
4190
4191 For compatibility with auto-callouts, we allow a callout to be specified
4192 before a condition that is an assertion. */
4193
4194 case CHAR_LEFT_PARENTHESIS:
4195 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4196 nest_depth++;
4197
4198 /* If the next character is ? or * there must be an assertion next
4199 (optionally preceded by a callout). We do not check this here, but
4200 instead we set expect_cond_assert to 2. If this is still greater than
4201 zero (callouts decrement it) when the next assertion is read, it will be
4202 marked as a condition that must not be repeated. A value greater than
4203 zero also causes checking that an assertion (possibly with callout)
4204 follows. */
4205
4206 if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK)
4207 {
4208 *parsed_pattern++ = META_COND_ASSERT;
4209 ptr--; /* Pull pointer back to the opening parenthesis. */
4210 expect_cond_assert = 2;
4211 break; /* End of conditional */
4212 }
4213
4214 /* Handle (?([+-]number)... */
4215
4216 if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
4217 &errorcode))
4218 {
4219 if (i <= 0)
4220 {
4221 errorcode = ERR15;
4222 goto FAILED;
4223 }
4224 *parsed_pattern++ = META_COND_NUMBER;
4225 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4226 PUTOFFSET(offset, parsed_pattern);
4227 *parsed_pattern++ = i;
4228 }
4229 else if (errorcode != 0) goto FAILED; /* Number too big */
4230
4231 /* No number found. Handle the special case (?(VERSION[>]=n.m)... */
4232
4233 else if (ptrend - ptr >= 10 &&
4234 PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
4235 ptr[7] != CHAR_RIGHT_PARENTHESIS)
4236 {
4237 uint32_t ge = 0;
4238 int major = 0;
4239 int minor = 0;
4240
4241 ptr += 7;
4242 if (*ptr == CHAR_GREATER_THAN_SIGN)
4243 {
4244 ge = 1;
4245 ptr++;
4246 }
4247
4248 /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
4249 references its argument twice. */
4250
4251 if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
4252 goto BAD_VERSION_CONDITION;
4253
4254 if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
4255 goto FAILED;
4256
4257 if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4258 if (*ptr == CHAR_DOT)
4259 {
4260 if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION;
4261 minor = (*ptr++ - CHAR_0) * 10;
4262 if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0;
4263 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4264 goto BAD_VERSION_CONDITION;
4265 }
4266
4267 *parsed_pattern++ = META_COND_VERSION;
4268 *parsed_pattern++ = ge;
4269 *parsed_pattern++ = major;
4270 *parsed_pattern++ = minor;
4271 }
4272
4273 /* All the remaining cases now require us to read a name. We cannot at
4274 this stage distinguish ambiguous cases such as (?(R12) which might be a
4275 recursion test by number or a name, because the named groups have not yet
4276 all been identified. Those cases are treated as names, but given a
4277 different META code. */
4278
4279 else
4280 {
4281 BOOL was_r_ampersand = FALSE;
4282
4283 if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
4284 {
4285 terminator = CHAR_RIGHT_PARENTHESIS;
4286 was_r_ampersand = TRUE;
4287 ptr++;
4288 }
4289 else if (*ptr == CHAR_LESS_THAN_SIGN)
4290 terminator = CHAR_GREATER_THAN_SIGN;
4291 else if (*ptr == CHAR_APOSTROPHE)
4292 terminator = CHAR_APOSTROPHE;
4293 else
4294 {
4295 terminator = CHAR_RIGHT_PARENTHESIS;
4296 ptr--; /* Point to char before name */
4297 }
4298 if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4299 &errorcode, cb)) goto FAILED;
4300
4301 /* Handle (?(R&name) */
4302
4303 if (was_r_ampersand)
4304 {
4305 *parsed_pattern = META_COND_RNAME;
4306 ptr--; /* Back to closing parens */
4307 }
4308
4309 /* Handle (?(name). If the name is "DEFINE" we identify it with a
4310 special code. Likewise if the name consists of R followed only by
4311 digits. Otherwise, handle it like a quoted name. */
4312
4313 else if (terminator == CHAR_RIGHT_PARENTHESIS)
4314 {
4315 if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
4316 *parsed_pattern = META_COND_DEFINE;
4317 else
4318 {
4319 for (i = 1; i < (int)namelen; i++)
4320 if (!IS_DIGIT(name[i])) break;
4321 *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
4322 META_COND_RNUMBER : META_COND_NAME;
4323 }
4324 ptr--; /* Back to closing parens */
4325 }
4326
4327 /* Handle (?('name') or (?(<name>) */
4328
4329 else *parsed_pattern = META_COND_NAME;
4330
4331 /* All these cases except DEFINE end with the name length and offset;
4332 DEFINE just has an offset (for the "too many branches" error). */
4333
4334 if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
4335 PUTOFFSET(offset, parsed_pattern);
4336 } /* End cases that read a name */
4337
4338 /* Check the closing parenthesis of the condition */
4339
4340 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4341 {
4342 errorcode = ERR24;
4343 goto FAILED;
4344 }
4345 ptr++;
4346 break; /* End of condition processing */
4347
4348
4349 /* ---- Atomic group ---- */
4350
4351 case CHAR_GREATER_THAN_SIGN:
4352 ATOMIC_GROUP: /* Come from (*atomic: */
4353 *parsed_pattern++ = META_ATOMIC;
4354 nest_depth++;
4355 ptr++;
4356 break;
4357
4358
4359 /* ---- Lookahead assertions ---- */
4360
4361 case CHAR_EQUALS_SIGN:
4362 POSITIVE_LOOK_AHEAD: /* Come from (*pla: */
4363 *parsed_pattern++ = META_LOOKAHEAD;
4364 ptr++;
4365 goto POST_ASSERTION;
4366
4367 case CHAR_EXCLAMATION_MARK:
4368 NEGATIVE_LOOK_AHEAD: /* Come from (*nla: */
4369 *parsed_pattern++ = META_LOOKAHEADNOT;
4370 ptr++;
4371 goto POST_ASSERTION;
4372
4373
4374 /* ---- Lookbehind assertions ---- */
4375
4376 /* (?< followed by = or ! is a lookbehind assertion. Otherwise (?< is the
4377 start of the name of a capturing group. */
4378
4379 case CHAR_LESS_THAN_SIGN:
4380 if (ptrend - ptr <= 1 ||
4381 (ptr[1] != CHAR_EQUALS_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK))
4382 {
4383 terminator = CHAR_GREATER_THAN_SIGN;
4384 goto DEFINE_NAME;
4385 }
4386 *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
4387 META_LOOKBEHIND : META_LOOKBEHINDNOT;
4388
4389 POST_LOOKBEHIND: /* Come from (*plb: and (*nlb: */
4390 *has_lookbehind = TRUE;
4391 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4392 PUTOFFSET(offset, parsed_pattern);
4393 ptr += 2;
4394 /* Fall through */
4395
4396 /* If the previous item was a condition starting (?(? an assertion,
4397 optionally preceded by a callout, is expected. This is checked later on,
4398 during actual compilation. However we need to identify this kind of
4399 assertion in this pass because it must not be qualified. The value of
4400 expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4401 for a callout - still leaving a positive value that identifies the
4402 assertion. Multiple callouts or any other items will make it zero or
4403 less, which doesn't matter because they will cause an error later. */
4404
4405 POST_ASSERTION:
4406 nest_depth++;
4407 if (prev_expect_cond_assert > 0)
4408 {
4409 if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4410 else if (++top_nest >= end_nests)
4411 {
4412 errorcode = ERR84;
4413 goto FAILED;
4414 }
4415 top_nest->nest_depth = nest_depth;
4416 top_nest->flags = NSF_CONDASSERT;
4417 top_nest->options = options & PARSE_TRACKED_OPTIONS;
4418 }
4419 break;
4420
4421
4422 /* ---- Define a named group ---- */
4423
4424 /* A named group may be defined as (?'name') or (?<name>). In the latter
4425 case we jump to DEFINE_NAME from the disambiguation of (?< above with the
4426 terminator set to '>'. */
4427
4428 case CHAR_APOSTROPHE:
4429 terminator = CHAR_APOSTROPHE; /* Terminator */
4430
4431 DEFINE_NAME:
4432 if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4433 &errorcode, cb)) goto FAILED;
4434
4435 /* We have a name for this capturing group. It is also assigned a number,
4436 which is its primary means of identification. */
4437
4438 cb->bracount++;
4439 *parsed_pattern++ = META_CAPTURE | cb->bracount;
4440 nest_depth++;
4441
4442 /* Check not too many names */
4443
4444 if (cb->names_found >= MAX_NAME_COUNT)
4445 {
4446 errorcode = ERR49;
4447 goto FAILED;
4448 }
4449
4450 /* Adjust the entry size to accommodate the longest name found. */
4451
4452 if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
4453 cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
4454
4455 /* Scan the list to check for duplicates. For duplicate names, if the
4456 number is the same, break the loop, which causes the name to be
4457 discarded; otherwise, if DUPNAMES is not set, give an error.
4458 If it is set, allow the name with a different number, but continue
4459 scanning in case this is a duplicate with the same number. For
4460 non-duplicate names, give an error if the number is duplicated. */
4461
4462 isdupname = FALSE;
4463 ng = cb->named_groups;
4464 for (i = 0; i < cb->names_found; i++, ng++)
4465 {
4466 if (namelen == ng->length &&
4467 PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
4468 {
4469 if (ng->number == cb->bracount) break;
4470 if ((options & PCRE2_DUPNAMES) == 0)
4471 {
4472 errorcode = ERR43;
4473 goto FAILED;
4474 }
4475 isdupname = ng->isdup = TRUE; /* Mark as a duplicate */
4476 cb->dupnames = TRUE; /* Duplicate names exist */
4477 }
4478 else if (ng->number == cb->bracount)
4479 {
4480 errorcode = ERR65;
4481 goto FAILED;
4482 }
4483 }
4484
4485 if (i < cb->names_found) break; /* Ignore duplicate with same number */
4486
4487 /* Increase the list size if necessary */
4488
4489 if (cb->names_found >= cb->named_group_list_size)
4490 {
4491 uint32_t newsize = cb->named_group_list_size * 2;
4492 named_group *newspace =
4493 cb->cx->memctl.malloc(newsize * sizeof(named_group),
4494 cb->cx->memctl.memory_data);
4495 if (newspace == NULL)
4496 {
4497 errorcode = ERR21;
4498 goto FAILED;
4499 }
4500
4501 memcpy(newspace, cb->named_groups,
4502 cb->named_group_list_size * sizeof(named_group));
4503 if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
4504 cb->cx->memctl.free((void *)cb->named_groups,
4505 cb->cx->memctl.memory_data);
4506 cb->named_groups = newspace;
4507 cb->named_group_list_size = newsize;
4508 }
4509
4510 /* Add this name to the list */
4511
4512 cb->named_groups[cb->names_found].name = name;
4513 cb->named_groups[cb->names_found].length = (uint16_t)namelen;
4514 cb->named_groups[cb->names_found].number = cb->bracount;
4515 cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
4516 cb->names_found++;
4517 break;
4518 } /* End of (? switch */
4519 break; /* End of ( handling */
4520
4521
4522 /* ---- Branch terminators ---- */
4523
4524 /* Alternation: reset the capture count if we are in a (?| group. */
4525
4526 case CHAR_VERTICAL_LINE:
4527 if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
4528 (top_nest->flags & NSF_RESET) != 0)
4529 {
4530 if (cb->bracount > top_nest->max_group)
4531 top_nest->max_group = (uint16_t)cb->bracount;
4532 cb->bracount = top_nest->reset_group;
4533 }
4534 *parsed_pattern++ = META_ALT;
4535 break;
4536
4537 /* End of group; reset the capture count to the maximum if we are in a (?|
4538 group and/or reset the options that are tracked during parsing. Disallow
4539 quantifier for a condition that is an assertion. */
4540
4541 case CHAR_RIGHT_PARENTHESIS:
4542 okquantifier = TRUE;
4543 if (top_nest != NULL && top_nest->nest_depth == nest_depth)
4544 {
4545 options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
4546 if ((top_nest->flags & NSF_RESET) != 0 &&
4547 top_nest->max_group > cb->bracount)
4548 cb->bracount = top_nest->max_group;
4549 if ((top_nest->flags & NSF_CONDASSERT) != 0)
4550 okquantifier = FALSE;
4551
4552 if ((top_nest->flags & NSF_ATOMICSR) != 0)
4553 {
4554 *parsed_pattern++ = META_KET;
4555 }
4556
4557
4558
4559 if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
4560 else top_nest--;
4561 }
4562 if (nest_depth == 0) /* Unmatched closing parenthesis */
4563 {
4564 errorcode = ERR22;
4565 goto FAILED_BACK;
4566 }
4567 nest_depth--;
4568 *parsed_pattern++ = META_KET;
4569 break;
4570 } /* End of switch on pattern character */
4571 } /* End of main character scan loop */
4572
4573 /* End of pattern reached. Check for missing ) at the end of a verb name. */
4574
4575 if (inverbname && ptr >= ptrend)
4576 {
4577 errorcode = ERR60;
4578 goto FAILED;
4579 }
4580
4581 /* Manage callout for the final item */
4582
4583 PARSED_END:
4584 parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
4585 parsed_pattern, cb);
4586
4587 /* Insert trailing items for word and line matching (features provided for the
4588 benefit of pcre2grep). */
4589
4590 if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
4591 {
4592 *parsed_pattern++ = META_KET;
4593 *parsed_pattern++ = META_DOLLAR;
4594 }
4595 else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
4596 {
4597 *parsed_pattern++ = META_KET;
4598 *parsed_pattern++ = META_ESCAPE + ESC_b;
4599 }
4600
4601 /* Terminate the parsed pattern, then return success if all groups are closed.
4602 Otherwise we have unclosed parentheses. */
4603
4604 if (parsed_pattern >= parsed_pattern_end)
4605 {
4606 errorcode = ERR63; /* Internal error (parsed pattern overflow) */
4607 goto FAILED;
4608 }
4609
4610 *parsed_pattern = META_END;
4611 if (nest_depth == 0) return 0;
4612
4613 UNCLOSED_PARENTHESIS:
4614 errorcode = ERR14;
4615
4616 /* Come here for all failures. */
4617
4618 FAILED:
4619 cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4620 return errorcode;
4621
4622 /* Some errors need to indicate the previous character. */
4623
4624 FAILED_BACK:
4625 ptr--;
4626 goto FAILED;
4627
4628 /* This failure happens several times. */
4629
4630 BAD_VERSION_CONDITION:
4631 errorcode = ERR79;
4632 goto FAILED;
4633 }
4634
4635
4636
4637 /*************************************************
4638 * Find first significant opcode *
4639 *************************************************/
4640
4641 /* This is called by several functions that scan a compiled expression looking
4642 for a fixed first character, or an anchoring opcode etc. It skips over things
4643 that do not influence this. For some calls, it makes sense to skip negative
4644 forward and all backward assertions, and also the \b assertion; for others it
4645 does not.
4646
4647 Arguments:
4648 code pointer to the start of the group
4649 skipassert TRUE if certain assertions are to be skipped
4650
4651 Returns: pointer to the first significant opcode
4652 */
4653
4654 static const PCRE2_UCHAR*
first_significant_code(PCRE2_SPTR code,BOOL skipassert)4655 first_significant_code(PCRE2_SPTR code, BOOL skipassert)
4656 {
4657 for (;;)
4658 {
4659 switch ((int)*code)
4660 {
4661 case OP_ASSERT_NOT:
4662 case OP_ASSERTBACK:
4663 case OP_ASSERTBACK_NOT:
4664 if (!skipassert) return code;
4665 do code += GET(code, 1); while (*code == OP_ALT);
4666 code += PRIV(OP_lengths)[*code];
4667 break;
4668
4669 case OP_WORD_BOUNDARY:
4670 case OP_NOT_WORD_BOUNDARY:
4671 if (!skipassert) return code;
4672 /* Fall through */
4673
4674 case OP_CALLOUT:
4675 case OP_CREF:
4676 case OP_DNCREF:
4677 case OP_RREF:
4678 case OP_DNRREF:
4679 case OP_FALSE:
4680 case OP_TRUE:
4681 code += PRIV(OP_lengths)[*code];
4682 break;
4683
4684 case OP_CALLOUT_STR:
4685 code += GET(code, 1 + 2*LINK_SIZE);
4686 break;
4687
4688 case OP_SKIPZERO:
4689 code += 2 + GET(code, 2) + LINK_SIZE;
4690 break;
4691
4692 case OP_COND:
4693 case OP_SCOND:
4694 if (code[1+LINK_SIZE] != OP_FALSE || /* Not DEFINE */
4695 code[GET(code, 1)] != OP_KET) /* More than one branch */
4696 return code;
4697 code += GET(code, 1) + 1 + LINK_SIZE;
4698 break;
4699
4700 case OP_MARK:
4701 case OP_COMMIT_ARG:
4702 case OP_PRUNE_ARG:
4703 case OP_SKIP_ARG:
4704 case OP_THEN_ARG:
4705 code += code[1] + PRIV(OP_lengths)[*code];
4706 break;
4707
4708 default:
4709 return code;
4710 }
4711 }
4712 /* Control never reaches here */
4713 }
4714
4715
4716
4717 #ifdef SUPPORT_UNICODE
4718 /*************************************************
4719 * Get othercase range *
4720 *************************************************/
4721
4722 /* This function is passed the start and end of a class range in UCP mode. It
4723 searches up the characters, looking for ranges of characters in the "other"
4724 case. Each call returns the next one, updating the start address. A character
4725 with multiple other cases is returned on its own with a special return value.
4726
4727 Arguments:
4728 cptr points to starting character value; updated
4729 d end value
4730 ocptr where to put start of othercase range
4731 odptr where to put end of othercase range
4732
4733 Yield: -1 when no more
4734 0 when a range is returned
4735 >0 the CASESET offset for char with multiple other cases
4736 in this case, ocptr contains the original
4737 */
4738
static int
get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
  uint32_t *odptr)
{
uint32_t ch = *cptr;     /* Current position in the input range */
uint32_t first_oc;       /* Other case of the character that starts a run */
uint32_t expected;       /* Other case the next character must have */

/* Phase 1: move forward over characters that are their own other case. The
scan stops in one of three ways: the input range is exhausted (yield -1); a
character with a caseless set is met (yield its set offset, handing back the
character itself in *ocptr and the remainder of the range in *cptr); or a
character with a single, different other case is met (fall out to phase 2). */

for (;;)
  {
  unsigned int caseset;

  if (ch > d) return -1;          /* Nothing more in the input range */

  caseset = UCD_CASESET(ch);
  if (caseset != 0)
    {
    *ocptr = ch;                  /* The character that has the set */
    *cptr = ch + 1;               /* Where the caller should resume */
    return (int)caseset;
    }

  first_oc = UCD_OTHERCASE(ch);
  if (first_oc != ch) break;      /* Start of an othercase run */
  ch++;
  }

/* Phase 2: ch has exactly one other case, first_oc. Extend the run for as
long as the following input characters have no caseless set and map to
consecutive other-case values. The run ends at the end of the input range or
at the first character that breaks either condition. */

*ocptr = first_oc;
expected = first_oc + 1;

ch++;
while (ch <= d && UCD_CASESET(ch) == 0 && UCD_OTHERCASE(ch) == expected)
  {
  expected++;
  ch++;
  }

*odptr = expected - 1;   /* Last value in the othercase run */
*cptr = ch;              /* Where the caller should resume */
return 0;
}
4779 #endif /* SUPPORT_UNICODE */
4780
4781
4782
4783 /*************************************************
4784 * Add a character or range to a class (internal) *
4785 *************************************************/
4786
4787 /* This function packages up the logic of adding a character or range of
4788 characters to a class. The character values in the arguments will be within the
4789 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4790 called only from within the "add to class" group of functions, some of which
4791 are recursive and mutually recursive. The external entry point is
4792 add_to_class().
4793
4794 Arguments:
4795 classbits the bit map for characters < 256
4796 uchardptr points to the pointer for extra data
4797 options the options word
4798 cb compile data
4799 start start of range character
4800 end end of range character
4801
4802 Returns: the number of < 256 characters added
4803 the pointer to extra data is updated
4804 */
4805
4806 static unsigned int
add_to_class_internal(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,uint32_t start,uint32_t end)4807 add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
4808 uint32_t options, compile_block *cb, uint32_t start, uint32_t end)
4809 {
4810 uint32_t c;
4811 uint32_t classbits_end = (end <= 0xff ? end : 0xff);
4812 unsigned int n8 = 0;
4813
4814 /* If caseless matching is required, scan the range and process alternate
4815 cases. In Unicode, there are 8-bit characters that have alternate cases that
4816 are greater than 255 and vice-versa. Sometimes we can just extend the original
4817 range. */
4818
4819 if ((options & PCRE2_CASELESS) != 0)
4820 {
4821 #ifdef SUPPORT_UNICODE
4822 if ((options & PCRE2_UTF) != 0)
4823 {
4824 int rc;
4825 uint32_t oc, od;
4826
4827 options &= ~PCRE2_CASELESS; /* Remove for recursive calls */
4828 c = start;
4829
4830 while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
4831 {
4832 /* Handle a single character that has more than one other case. */
4833
4834 if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr, options, cb,
4835 PRIV(ucd_caseless_sets) + rc, oc);
4836
4837 /* Do nothing if the other case range is within the original range. */
4838
4839 else if (oc >= cb->class_range_start && od <= cb->class_range_end) continue;
4840
4841 /* Extend the original range if there is overlap, noting that if oc < c, we
4842 can't have od > end because a subrange is always shorter than the basic
4843 range. Otherwise, use a recursive call to add the additional range. */
4844
4845 else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
4846 else if (od > end && oc <= end + 1)
4847 {
4848 end = od; /* Extend upwards */
4849 if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
4850 }
4851 else n8 += add_to_class_internal(classbits, uchardptr, options, cb, oc, od);
4852 }
4853 }
4854 else
4855 #endif /* SUPPORT_UNICODE */
4856
4857 /* Not UTF mode */
4858
4859 for (c = start; c <= classbits_end; c++)
4860 {
4861 SETBIT(classbits, cb->fcc[c]);
4862 n8++;
4863 }
4864 }
4865
4866 /* Now handle the originally supplied range. Adjust the final value according
4867 to the bit length - this means that the same lists of (e.g.) horizontal spaces
4868 can be used in all cases. */
4869
4870 if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
4871 end = MAX_NON_UTF_CHAR;
4872
4873 if (start > cb->class_range_start && end < cb->class_range_end) return n8;
4874
4875 /* Use the bitmap for characters < 256. Otherwise use extra data.*/
4876
4877 for (c = start; c <= classbits_end; c++)
4878 {
4879 /* Regardless of start, c will always be <= 255. */
4880 SETBIT(classbits, c);
4881 n8++;
4882 }
4883
4884 #ifdef SUPPORT_WIDE_CHARS
4885 if (start <= 0xff) start = 0xff + 1;
4886
4887 if (end >= start)
4888 {
4889 PCRE2_UCHAR *uchardata = *uchardptr;
4890
4891 #ifdef SUPPORT_UNICODE
4892 if ((options & PCRE2_UTF) != 0)
4893 {
4894 if (start < end)
4895 {
4896 *uchardata++ = XCL_RANGE;
4897 uchardata += PRIV(ord2utf)(start, uchardata);
4898 uchardata += PRIV(ord2utf)(end, uchardata);
4899 }
4900 else if (start == end)
4901 {
4902 *uchardata++ = XCL_SINGLE;
4903 uchardata += PRIV(ord2utf)(start, uchardata);
4904 }
4905 }
4906 else
4907 #endif /* SUPPORT_UNICODE */
4908
4909 /* Without UTF support, character values are constrained by the bit length,
4910 and can only be > 256 for 16-bit and 32-bit libraries. */
4911
4912 #if PCRE2_CODE_UNIT_WIDTH == 8
4913 {}
4914 #else
4915 if (start < end)
4916 {
4917 *uchardata++ = XCL_RANGE;
4918 *uchardata++ = start;
4919 *uchardata++ = end;
4920 }
4921 else if (start == end)
4922 {
4923 *uchardata++ = XCL_SINGLE;
4924 *uchardata++ = start;
4925 }
4926 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
4927 *uchardptr = uchardata; /* Updata extra data pointer */
4928 }
4929 #else /* SUPPORT_WIDE_CHARS */
4930 (void)uchardptr; /* Avoid compiler warning */
4931 #endif /* SUPPORT_WIDE_CHARS */
4932
4933 return n8; /* Number of 8-bit characters */
4934 }
4935
4936
4937
4938 #ifdef SUPPORT_UNICODE
4939 /*************************************************
4940 * Add a list of characters to a class (internal) *
4941 *************************************************/
4942
4943 /* This function is used for adding a list of case-equivalent characters to a
4944 class when in UTF mode. This function is called only from within
4945 add_to_class_internal(), with which it is mutually recursive.
4946
4947 Arguments:
4948 classbits the bit map for characters < 256
4949 uchardptr points to the pointer for extra data
4950 options the options word
4951 cb contains pointers to tables etc.
4952 p points to row of 32-bit values, terminated by NOTACHAR
4953 except character to omit; this is used when adding lists of
4954 case-equivalent characters to avoid including the one we
4955 already know about
4956
4957 Returns: the number of < 256 characters added
4958 the pointer to extra data is updated
4959 */
4960
4961 static unsigned int
add_list_to_class_internal(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p,unsigned int except)4962 add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
4963 uint32_t options, compile_block *cb, const uint32_t *p, unsigned int except)
4964 {
4965 unsigned int n8 = 0;
4966 while (p[0] < NOTACHAR)
4967 {
4968 unsigned int n = 0;
4969 if (p[0] != except)
4970 {
4971 while(p[n+1] == p[0] + n + 1) n++;
4972 n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
4973 }
4974 p += n + 1;
4975 }
4976 return n8;
4977 }
4978 #endif
4979
4980
4981
4982 /*************************************************
4983 * External entry point for add range to class *
4984 *************************************************/
4985
4986 /* This function sets the overall range so that the internal functions can try
4987 to avoid duplication when handling case-independence.
4988
4989 Arguments:
4990 classbits the bit map for characters < 256
4991 uchardptr points to the pointer for extra data
4992 options the options word
4993 cb compile data
4994 start start of range character
4995 end end of range character
4996
4997 Returns: the number of < 256 characters added
4998 the pointer to extra data is updated
4999 */
5000
5001 static unsigned int
add_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,uint32_t start,uint32_t end)5002 add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5003 compile_block *cb, uint32_t start, uint32_t end)
5004 {
5005 cb->class_range_start = start;
5006 cb->class_range_end = end;
5007 return add_to_class_internal(classbits, uchardptr, options, cb, start, end);
5008 }
5009
5010
5011 /*************************************************
5012 * External entry point for add list to class *
5013 *************************************************/
5014
5015 /* This function is used for adding a list of horizontal or vertical whitespace
5016 characters to a class. The list must be in order so that ranges of characters
5017 can be detected and handled appropriately. This function sets the overall range
5018 so that the internal functions can try to avoid duplication when handling
5019 case-independence.
5020
5021 Arguments:
5022 classbits the bit map for characters < 256
5023 uchardptr points to the pointer for extra data
5024 options the options word
5025 cb contains pointers to tables etc.
5026 p points to row of 32-bit values, terminated by NOTACHAR
5027 except character to omit; this is used when adding lists of
5028 case-equivalent characters to avoid including the one we
5029 already know about
5030
5031 Returns: the number of < 256 characters added
5032 the pointer to extra data is updated
5033 */
5034
5035 static unsigned int
add_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p,unsigned int except)5036 add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5037 compile_block *cb, const uint32_t *p, unsigned int except)
5038 {
5039 unsigned int n8 = 0;
5040 while (p[0] < NOTACHAR)
5041 {
5042 unsigned int n = 0;
5043 if (p[0] != except)
5044 {
5045 while(p[n+1] == p[0] + n + 1) n++;
5046 cb->class_range_start = p[0];
5047 cb->class_range_end = p[n];
5048 n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
5049 }
5050 p += n + 1;
5051 }
5052 return n8;
5053 }
5054
5055
5056
5057 /*************************************************
5058 * Add characters not in a list to a class *
5059 *************************************************/
5060
5061 /* This function is used for adding the complement of a list of horizontal or
5062 vertical whitespace to a class. The list must be in order.
5063
5064 Arguments:
5065 classbits the bit map for characters < 256
5066 uchardptr points to the pointer for extra data
5067 options the options word
5068 cb contains pointers to tables etc.
5069 p points to row of 32-bit values, terminated by NOTACHAR
5070
5071 Returns: the number of < 256 characters added
5072 the pointer to extra data is updated
5073 */
5074
5075 static unsigned int
add_not_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p)5076 add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5077 uint32_t options, compile_block *cb, const uint32_t *p)
5078 {
5079 BOOL utf = (options & PCRE2_UTF) != 0;
5080 unsigned int n8 = 0;
5081 if (p[0] > 0)
5082 n8 += add_to_class(classbits, uchardptr, options, cb, 0, p[0] - 1);
5083 while (p[0] < NOTACHAR)
5084 {
5085 while (p[1] == p[0] + 1) p++;
5086 n8 += add_to_class(classbits, uchardptr, options, cb, p[0] + 1,
5087 (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
5088 p++;
5089 }
5090 return n8;
5091 }
5092
5093
5094
5095 /*************************************************
5096 * Find details of duplicate group names *
5097 *************************************************/
5098
5099 /* This is called from compile_branch() when it needs to know the index and
5100 count of duplicates in the names table when processing named backreferences,
5101 either directly, or as conditions.
5102
5103 Arguments:
5104 name points to the name
5105 length the length of the name
5106 indexptr where to put the index
5107 countptr where to put the count of duplicates
5108 errorcodeptr where to put an error code
5109 cb the compile block
5110
5111 Returns: TRUE if OK, FALSE if not, error code set
5112 */
5113
5114 static BOOL
find_dupname_details(PCRE2_SPTR name,uint32_t length,int * indexptr,int * countptr,int * errorcodeptr,compile_block * cb)5115 find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr,
5116 int *countptr, int *errorcodeptr, compile_block *cb)
5117 {
5118 uint32_t i, groupnumber;
5119 int count;
5120 PCRE2_UCHAR *slot = cb->name_table;
5121
5122 /* Find the first entry in the table */
5123
5124 for (i = 0; i < cb->names_found; i++)
5125 {
5126 if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 &&
5127 slot[IMM2_SIZE+length] == 0) break;
5128 slot += cb->name_entry_size;
5129 }
5130
5131 /* This should not occur, because this function is called only when we know we
5132 have duplicate names. Give an internal error. */
5133
5134 if (i >= cb->names_found)
5135 {
5136 *errorcodeptr = ERR53;
5137 cb->erroroffset = name - cb->start_pattern;
5138 return FALSE;
5139 }
5140
5141 /* Record the index and then see how many duplicates there are, updating the
5142 backref map and maximum back reference as we do. */
5143
5144 *indexptr = i;
5145 count = 0;
5146
5147 for (;;)
5148 {
5149 count++;
5150 groupnumber = GET2(slot,0);
5151 cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
5152 if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
5153 if (++i >= cb->names_found) break;
5154 slot += cb->name_entry_size;
5155 if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 ||
5156 (slot+IMM2_SIZE)[length] != 0) break;
5157 }
5158
5159 *countptr = count;
5160 return TRUE;
5161 }
5162
5163
5164
5165 /*************************************************
5166 * Compile one branch *
5167 *************************************************/
5168
/* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
5170 the options are changed during the branch, the pointer is used to change the
5171 external options bits. This function is used during the pre-compile phase when
5172 we are trying to find out the amount of memory needed, as well as during the
5173 real compile phase. The value of lengthptr distinguishes the two phases.
5174
5175 Arguments:
5176 optionsptr pointer to the option bits
5177 codeptr points to the pointer to the current code point
5178 pptrptr points to the current parsed pattern pointer
5179 errorcodeptr points to error code variable
5180 firstcuptr place to put the first required code unit
5181 firstcuflagsptr place to put the first code unit flags, or a negative number
5182 reqcuptr place to put the last required code unit
5183 reqcuflagsptr place to put the last required code unit flags, or a negative number
5184 bcptr points to current branch chain
5185 cb contains pointers to tables etc.
5186 lengthptr NULL during the real compile phase
5187 points to length accumulator during pre-compile phase
5188
5189 Returns: 0 There's been an error, *errorcodeptr is non-zero
5190 +1 Success, this branch must match at least one character
5191 -1 Success, this branch may match an empty string
5192 */
5193
5194 static int
compile_branch(uint32_t * optionsptr,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t * firstcuptr,int32_t * firstcuflagsptr,uint32_t * reqcuptr,int32_t * reqcuflagsptr,branch_chain * bcptr,compile_block * cb,PCRE2_SIZE * lengthptr)5195 compile_branch(uint32_t *optionsptr, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
5196 int *errorcodeptr, uint32_t *firstcuptr, int32_t *firstcuflagsptr,
5197 uint32_t *reqcuptr, int32_t *reqcuflagsptr, branch_chain *bcptr,
5198 compile_block *cb, PCRE2_SIZE *lengthptr)
5199 {
5200 int bravalue = 0;
5201 int okreturn = -1;
5202 int group_return = 0;
5203 uint32_t repeat_min = 0, repeat_max = 0; /* To please picky compilers */
5204 uint32_t greedy_default, greedy_non_default;
5205 uint32_t repeat_type, op_type;
5206 uint32_t options = *optionsptr; /* May change dynamically */
5207 uint32_t firstcu, reqcu;
5208 uint32_t zeroreqcu, zerofirstcu;
5209 uint32_t escape;
5210 uint32_t *pptr = *pptrptr;
5211 uint32_t meta, meta_arg;
5212 int32_t firstcuflags, reqcuflags;
5213 int32_t zeroreqcuflags, zerofirstcuflags;
5214 int32_t req_caseopt, reqvary, tempreqvary;
5215 PCRE2_SIZE offset = 0;
5216 PCRE2_SIZE length_prevgroup = 0;
5217 PCRE2_UCHAR *code = *codeptr;
5218 PCRE2_UCHAR *last_code = code;
5219 PCRE2_UCHAR *orig_code = code;
5220 PCRE2_UCHAR *tempcode;
5221 PCRE2_UCHAR *previous = NULL;
5222 PCRE2_UCHAR op_previous;
5223 BOOL groupsetfirstcu = FALSE;
5224 BOOL matched_char = FALSE;
5225 BOOL previous_matched_char = FALSE;
5226 const uint8_t *cbits = cb->cbits;
5227 uint8_t classbits[32];
5228
5229 /* We can fish out the UTF setting once and for all into a BOOL, but we must
5230 not do this for other options (e.g. PCRE2_EXTENDED) because they may change
5231 dynamically as we process the pattern. */
5232
5233 #ifdef SUPPORT_UNICODE
5234 BOOL utf = (options & PCRE2_UTF) != 0;
5235 #else /* No UTF support */
5236 BOOL utf = FALSE;
5237 #endif
5238
5239 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
5240 class_uchardata always so that it can be passed to add_to_class() always,
5241 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
5242 alternative calls for the different cases. */
5243
5244 PCRE2_UCHAR *class_uchardata;
5245 #ifdef SUPPORT_WIDE_CHARS
5246 BOOL xclass;
5247 PCRE2_UCHAR *class_uchardata_base;
5248 #endif
5249
5250 /* Set up the default and non-default settings for greediness */
5251
5252 greedy_default = ((options & PCRE2_UNGREEDY) != 0);
5253 greedy_non_default = greedy_default ^ 1;
5254
5255 /* Initialize no first unit, no required unit. REQ_UNSET means "no char
5256 matching encountered yet". It gets changed to REQ_NONE if we hit something that
5257 matches a non-fixed first unit; reqcu just remains unset if we never find one.
5258
5259 When we hit a repeat whose minimum is zero, we may have to adjust these values
5260 to take the zero repeat into account. This is implemented by setting them to
5261 zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
5262 item types that can be repeated set these backoff variables appropriately. */
5263
5264 firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
5265 firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
5266
5267 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
5268 according to the current setting of the caseless flag. The REQ_CASELESS value
5269 leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables
5270 to record the case status of the value. This is used only for ASCII characters.
5271 */
5272
5273 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS:0;
5274
5275 /* Switch on next META item until the end of the branch */
5276
5277 for (;; pptr++)
5278 {
5279 #ifdef SUPPORT_WIDE_CHARS
5280 BOOL xclass_has_prop;
5281 #endif
5282 BOOL negate_class;
5283 BOOL should_flip_negation;
5284 BOOL match_all_or_no_wide_chars;
5285 BOOL possessive_quantifier;
5286 BOOL note_group_empty;
5287 int class_has_8bitchar;
5288 int i;
5289 uint32_t mclength;
5290 uint32_t skipunits;
5291 uint32_t subreqcu, subfirstcu;
5292 uint32_t groupnumber;
5293 uint32_t verbarglen, verbculen;
5294 int32_t subreqcuflags, subfirstcuflags; /* Must be signed */
5295 open_capitem *oc;
5296 PCRE2_UCHAR mcbuffer[8];
5297
5298 /* Get next META item in the pattern and its potential argument. */
5299
5300 meta = META_CODE(*pptr);
5301 meta_arg = META_DATA(*pptr);
5302
5303 /* If we are in the pre-compile phase, accumulate the length used for the
5304 previous cycle of this loop, unless the next item is a quantifier. */
5305
5306 if (lengthptr != NULL)
5307 {
5308 if (code > cb->start_workspace + cb->workspace_size -
5309 WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
5310 {
5311 *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)?
5312 ERR52 : ERR86;
5313 return 0;
5314 }
5315
5316 /* There is at least one situation where code goes backwards: this is the
5317 case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
5318 is processed, the whole class is eliminated. However, it is created first,
5319 so we have to allow memory for it. Therefore, don't ever reduce the length
5320 at this point. */
5321
5322 if (code < last_code) code = last_code;
5323
5324 /* If the next thing is not a quantifier, we add the length of the previous
5325 item into the total, and reset the code pointer to the start of the
5326 workspace. Otherwise leave the previous item available to be quantified. */
5327
5328 if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5329 {
5330 if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
5331 {
5332 *errorcodeptr = ERR20; /* Integer overflow */
5333 return 0;
5334 }
5335 *lengthptr += (PCRE2_SIZE)(code - orig_code);
5336 if (*lengthptr > MAX_PATTERN_SIZE)
5337 {
5338 *errorcodeptr = ERR20; /* Pattern is too large */
5339 return 0;
5340 }
5341 code = orig_code;
5342 }
5343
5344 /* Remember where this code item starts so we can catch the "backwards"
5345 case above next time round. */
5346
5347 last_code = code;
5348 }
5349
5350 /* Process the next parsed pattern item. If it is not a quantifier, remember
5351 where it starts so that it can be quantified when a quantifier follows.
5352 Checking for the legality of quantifiers happens in parse_regex(), except for
5353 a quantifier after an assertion that is a condition. */
5354
5355 if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5356 {
5357 previous = code;
5358 if (matched_char) okreturn = 1;
5359 }
5360
5361 previous_matched_char = matched_char;
5362 matched_char = FALSE;
5363 note_group_empty = FALSE;
5364 skipunits = 0; /* Default value for most subgroups */
5365
5366 switch(meta)
5367 {
5368 /* ===================================================================*/
5369 /* The branch terminates at pattern end or | or ) */
5370
5371 case META_END:
5372 case META_ALT:
5373 case META_KET:
5374 *firstcuptr = firstcu;
5375 *firstcuflagsptr = firstcuflags;
5376 *reqcuptr = reqcu;
5377 *reqcuflagsptr = reqcuflags;
5378 *codeptr = code;
5379 *pptrptr = pptr;
5380 return okreturn;
5381
5382
5383 /* ===================================================================*/
5384 /* Handle single-character metacharacters. In multiline mode, ^ disables
5385 the setting of any following char as a first character. */
5386
5387 case META_CIRCUMFLEX:
5388 if ((options & PCRE2_MULTILINE) != 0)
5389 {
5390 if (firstcuflags == REQ_UNSET)
5391 zerofirstcuflags = firstcuflags = REQ_NONE;
5392 *code++ = OP_CIRCM;
5393 }
5394 else *code++ = OP_CIRC;
5395 break;
5396
5397 case META_DOLLAR:
5398 *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
5399 break;
5400
5401 /* There can never be a first char if '.' is first, whatever happens about
5402 repeats. The value of reqcu doesn't change either. */
5403
5404 case META_DOT:
5405 matched_char = TRUE;
5406 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5407 zerofirstcu = firstcu;
5408 zerofirstcuflags = firstcuflags;
5409 zeroreqcu = reqcu;
5410 zeroreqcuflags = reqcuflags;
5411 *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
5412 break;
5413
5414
5415 /* ===================================================================*/
5416 /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
5417 Otherwise, an initial ']' is taken as a data character. When empty classes
5418 are allowed, [] must always fail, so generate OP_FAIL, whereas [^] must
5419 match any character, so generate OP_ALLANY. */
5420
5421 case META_CLASS_EMPTY:
5422 case META_CLASS_EMPTY_NOT:
5423 matched_char = TRUE;
5424 *code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL;
5425 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5426 zerofirstcu = firstcu;
5427 zerofirstcuflags = firstcuflags;
5428 break;
5429
5430
5431 /* ===================================================================*/
5432 /* Non-empty character class. If the included characters are all < 256, we
5433 build a 32-byte bitmap of the permitted characters, except in the special
5434 case where there is only one such character. For negated classes, we build
5435 the map as usual, then invert it at the end. However, we use a different
5436 opcode so that data characters > 255 can be handled correctly.
5437
5438 If the class contains characters outside the 0-255 range, a different
5439 opcode is compiled. It may optionally have a bit map for characters < 256,
5440 but those above are are explicitly listed afterwards. A flag code unit
5441 tells whether the bitmap is present, and whether this is a negated class or
5442 not. */
5443
5444 case META_CLASS_NOT:
5445 case META_CLASS:
5446 matched_char = TRUE;
5447 negate_class = meta == META_CLASS_NOT;
5448
5449 /* We can optimize the case of a single character in a class by generating
5450 OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
5451 negative. In the negative case there can be no first char if this item is
5452 first, whatever repeat count may follow. In the case of reqcu, save the
5453 previous value for reinstating. */
5454
5455 /* NOTE: at present this optimization is not effective if the only
5456 character in a class in 32-bit, non-UCP mode has its top bit set. */
5457
5458 if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
5459 {
5460 #ifdef SUPPORT_UNICODE
5461 uint32_t d;
5462 #endif
5463 uint32_t c = pptr[1];
5464
5465 pptr += 2; /* Move on to class end */
5466 if (meta == META_CLASS) /* A positive one-char class can be */
5467 { /* handled as a normal literal character. */
5468 meta = c; /* Set up the character */
5469 goto NORMAL_CHAR_SET;
5470 }
5471
5472 /* Handle a negative one-character class */
5473
5474 zeroreqcu = reqcu;
5475 zeroreqcuflags = reqcuflags;
5476 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5477 zerofirstcu = firstcu;
5478 zerofirstcuflags = firstcuflags;
5479
5480 /* For caseless UTF mode, check whether this character has more than
5481 one other case. If so, generate a special OP_NOTPROP item instead of
5482 OP_NOTI. */
5483
5484 #ifdef SUPPORT_UNICODE
5485 if (utf && (options & PCRE2_CASELESS) != 0 &&
5486 (d = UCD_CASESET(c)) != 0)
5487 {
5488 *code++ = OP_NOTPROP;
5489 *code++ = PT_CLIST;
5490 *code++ = d;
5491 break; /* We are finished with this class */
5492 }
5493 #endif
5494 /* Char has only one other case, or UCP not available */
5495
5496 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
5497 code += PUTCHAR(c, code);
5498 break; /* We are finished with this class */
5499 } /* End of 1-char optimization */
5500
5501 /* Handle character classes that contain more than just one literal
5502 character. */
5503
5504 /* If a non-extended class contains a negative special such as \S, we need
5505 to flip the negation flag at the end, so that support for characters > 255
5506 works correctly (they are all included in the class). An extended class may
5507 need to insert specific matching or non-matching code for wide characters.
5508 */
5509
5510 should_flip_negation = match_all_or_no_wide_chars = FALSE;
5511
5512 /* Extended class (xclass) will be used when characters > 255
5513 might match. */
5514
5515 #ifdef SUPPORT_WIDE_CHARS
5516 xclass = FALSE;
5517 class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
5518 class_uchardata_base = class_uchardata; /* Save the start */
5519 #endif
5520
5521 /* For optimization purposes, we track some properties of the class:
5522 class_has_8bitchar will be non-zero if the class contains at least one
5523 character with a code point less than 256; xclass_has_prop will be TRUE if
5524 Unicode property checks are present in the class. */
5525
5526 class_has_8bitchar = 0;
5527 #ifdef SUPPORT_WIDE_CHARS
5528 xclass_has_prop = FALSE;
5529 #endif
5530
5531 /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
5532 in a temporary bit of memory, in case the class contains fewer than two
5533 8-bit characters because in that case the compiled code doesn't use the bit
5534 map. */
5535
5536 memset(classbits, 0, 32 * sizeof(uint8_t));
5537
5538 /* Process items until META_CLASS_END is reached. */
5539
5540 while ((meta = *(++pptr)) != META_CLASS_END)
5541 {
5542 /* Handle POSIX classes such as [:alpha:] etc. */
5543
5544 if (meta == META_POSIX || meta == META_POSIX_NEG)
5545 {
5546 BOOL local_negate = (meta == META_POSIX_NEG);
5547 int posix_class = *(++pptr);
5548 int taboffset, tabopt;
5549 uint8_t pbits[32];
5550
5551 should_flip_negation = local_negate; /* Note negative special */
5552
5553 /* If matching is caseless, upper and lower are converted to alpha.
5554 This relies on the fact that the class table starts with alpha,
5555 lower, upper as the first 3 entries. */
5556
5557 if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
5558 posix_class = 0;
5559
5560 /* When PCRE2_UCP is set, some of the POSIX classes are converted to
5561 different escape sequences that use Unicode properties \p or \P.
5562 Others that are not available via \p or \P have to generate
5563 XCL_PROP/XCL_NOTPROP directly, which is done here. */
5564
5565 #ifdef SUPPORT_UNICODE
5566 if ((options & PCRE2_UCP) != 0) switch(posix_class)
5567 {
5568 case PC_GRAPH:
5569 case PC_PRINT:
5570 case PC_PUNCT:
5571 *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
5572 *class_uchardata++ = (PCRE2_UCHAR)
5573 ((posix_class == PC_GRAPH)? PT_PXGRAPH :
5574 (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT);
5575 *class_uchardata++ = 0;
5576 xclass_has_prop = TRUE;
5577 goto CONTINUE_CLASS;
5578
5579 /* For the other POSIX classes (ascii, xdigit) we are going to
5580 fall through to the non-UCP case and build a bit map for
5581 characters with code points less than 256. However, if we are in
5582 a negated POSIX class, characters with code points greater than
5583 255 must either all match or all not match, depending on whether
5584 the whole class is not or is negated. For example, for
5585 [[:^ascii:]... they must all match, whereas for [^[:^xdigit:]...
5586 they must not.
5587
5588 In the special case where there are no xclass items, this is
5589 automatically handled by the use of OP_CLASS or OP_NCLASS, but an
5590 explicit range is needed for OP_XCLASS. Setting a flag here
5591 causes the range to be generated later when it is known that
5592 OP_XCLASS is required. In the 8-bit library this is relevant only in
5593 utf mode, since no wide characters can exist otherwise. */
5594
5595 default:
5596 #if PCRE2_CODE_UNIT_WIDTH == 8
5597 if (utf)
5598 #endif
5599 match_all_or_no_wide_chars |= local_negate;
5600 break;
5601 }
5602 #endif /* SUPPORT_UNICODE */
5603
5604 /* In the non-UCP case, or when UCP makes no difference, we build the
5605 bit map for the POSIX class in a chunk of local store because we may
5606 be adding and subtracting from it, and we don't want to subtract bits
5607 that may be in the main map already. At the end we or the result into
5608 the bit map that is being built. */
5609
5610 posix_class *= 3;
5611
5612 /* Copy in the first table (always present) */
5613
5614 memcpy(pbits, cbits + posix_class_maps[posix_class],
5615 32 * sizeof(uint8_t));
5616
5617 /* If there is a second table, add or remove it as required. */
5618
5619 taboffset = posix_class_maps[posix_class + 1];
5620 tabopt = posix_class_maps[posix_class + 2];
5621
5622 if (taboffset >= 0)
5623 {
5624 if (tabopt >= 0)
5625 for (i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset];
5626 else
5627 for (i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset];
5628 }
5629
5630 /* Now see if we need to remove any special characters. An option
5631 value of 1 removes vertical space and 2 removes underscore. */
5632
5633 if (tabopt < 0) tabopt = -tabopt;
5634 if (tabopt == 1) pbits[1] &= ~0x3c;
5635 else if (tabopt == 2) pbits[11] &= 0x7f;
5636
5637 /* Add the POSIX table or its complement into the main table that is
5638 being built and we are done. */
5639
5640 if (local_negate)
5641 for (i = 0; i < 32; i++) classbits[i] |= ~pbits[i];
5642 else
5643 for (i = 0; i < 32; i++) classbits[i] |= pbits[i];
5644
5645 /* Every class contains at least one < 256 character. */
5646
5647 class_has_8bitchar = 1;
5648 goto CONTINUE_CLASS; /* End of POSIX handling */
5649 }
5650
5651 /* Other than POSIX classes, the only items we should encounter are
5652 \d-type escapes and literal characters (possibly as ranges). */
5653
5654 if (meta == META_BIGVALUE)
5655 {
5656 meta = *(++pptr);
5657 goto CLASS_LITERAL;
5658 }
5659
5660 /* Any other non-literal must be an escape */
5661
5662 if (meta >= META_END)
5663 {
5664 if (META_CODE(meta) != META_ESCAPE)
5665 {
5666 #ifdef DEBUG_SHOW_PARSED
5667 fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x "
5668 "in character class\n", meta);
5669 #endif
5670 *errorcodeptr = ERR89; /* Internal error - unrecognized. */
5671 return 0;
5672 }
5673 escape = META_DATA(meta);
5674
5675 /* Every class contains at least one < 256 character. */
5676
5677 class_has_8bitchar++;
5678
5679 switch(escape)
5680 {
5681 case ESC_d:
5682 for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
5683 break;
5684
5685 case ESC_D:
5686 should_flip_negation = TRUE;
5687 for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_digit];
5688 break;
5689
5690 case ESC_w:
5691 for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
5692 break;
5693
5694 case ESC_W:
5695 should_flip_negation = TRUE;
5696 for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_word];
5697 break;
5698
5699 /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5700 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5701 previously set by something earlier in the character class.
5702 Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5703 we could just adjust the appropriate bit. From PCRE 8.34 we no
5704 longer treat \s and \S specially. */
5705
5706 case ESC_s:
5707 for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
5708 break;
5709
5710 case ESC_S:
5711 should_flip_negation = TRUE;
5712 for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_space];
5713 break;
5714
5715 /* When adding the horizontal or vertical space lists to a class, or
5716 their complements, disable PCRE2_CASELESS, because it just wastes
5717 time, and in the "not-x" UTF cases can create unwanted duplicates in
5718 the XCLASS list (provoked by characters that have more than one other
5719 case and by both cases being in the same "not-x" sublist). */
5720
5721 case ESC_h:
5722 (void)add_list_to_class(classbits, &class_uchardata,
5723 options & ~PCRE2_CASELESS, cb, PRIV(hspace_list), NOTACHAR);
5724 break;
5725
5726 case ESC_H:
5727 (void)add_not_list_to_class(classbits, &class_uchardata,
5728 options & ~PCRE2_CASELESS, cb, PRIV(hspace_list));
5729 break;
5730
5731 case ESC_v:
5732 (void)add_list_to_class(classbits, &class_uchardata,
5733 options & ~PCRE2_CASELESS, cb, PRIV(vspace_list), NOTACHAR);
5734 break;
5735
5736 case ESC_V:
5737 (void)add_not_list_to_class(classbits, &class_uchardata,
5738 options & ~PCRE2_CASELESS, cb, PRIV(vspace_list));
5739 break;
5740
5741 /* If Unicode is not supported, \P and \p are not allowed and are
5742 faulted at parse time, so will never appear here. */
5743
5744 #ifdef SUPPORT_UNICODE
5745 case ESC_p:
5746 case ESC_P:
5747 {
5748 uint32_t ptype = *(++pptr) >> 16;
5749 uint32_t pdata = *pptr & 0xffff;
5750 *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
5751 *class_uchardata++ = ptype;
5752 *class_uchardata++ = pdata;
5753 xclass_has_prop = TRUE;
5754 class_has_8bitchar--; /* Undo! */
5755 }
5756 break;
5757 #endif
5758 }
5759
5760 goto CONTINUE_CLASS;
5761 } /* End handling \d-type escapes */
5762
5763 /* A literal character may be followed by a range meta. At parse time
5764 there are checks for out-of-order characters, for ranges where the two
5765 characters are equal, and for hyphens that cannot indicate a range. At
5766 this point, therefore, no checking is needed. */
5767
5768 else
5769 {
5770 uint32_t c, d;
5771
5772 CLASS_LITERAL:
5773 c = d = meta;
5774
5775 /* Remember if \r or \n were explicitly used */
5776
5777 if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
5778
5779 /* Process a character range */
5780
5781 if (pptr[1] == META_RANGE_LITERAL || pptr[1] == META_RANGE_ESCAPED)
5782 {
5783 #ifdef EBCDIC
5784 BOOL range_is_literal = (pptr[1] == META_RANGE_LITERAL);
5785 #endif
5786 pptr += 2;
5787 d = *pptr;
5788 if (d == META_BIGVALUE) d = *(++pptr);
5789
5790 /* Remember an explicit \r or \n, and add the range to the class. */
5791
5792 if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
5793
5794 /* In an EBCDIC environment, Perl treats alphabetic ranges specially
5795 because there are holes in the encoding, and simply using the range
5796 A-Z (for example) would include the characters in the holes. This
5797 applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
5798
5799 #ifdef EBCDIC
5800 if (range_is_literal &&
5801 (cb->ctypes[c] & ctype_letter) != 0 &&
5802 (cb->ctypes[d] & ctype_letter) != 0 &&
5803 (c <= CHAR_z) == (d <= CHAR_z))
5804 {
5805 uint32_t uc = (d <= CHAR_z)? 0 : 64;
5806 uint32_t C = c - uc;
5807 uint32_t D = d - uc;
5808
5809 if (C <= CHAR_i)
5810 {
5811 class_has_8bitchar +=
5812 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
5813 ((D < CHAR_i)? D : CHAR_i) + uc);
5814 C = CHAR_j;
5815 }
5816
5817 if (C <= D && C <= CHAR_r)
5818 {
5819 class_has_8bitchar +=
5820 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
5821 ((D < CHAR_r)? D : CHAR_r) + uc);
5822 C = CHAR_s;
5823 }
5824
5825 if (C <= D)
5826 {
5827 class_has_8bitchar +=
5828 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
5829 D + uc);
5830 }
5831 }
5832 else
5833 #endif
5834 /* Not an EBCDIC special range */
5835
5836 class_has_8bitchar +=
5837 add_to_class(classbits, &class_uchardata, options, cb, c, d);
5838 goto CONTINUE_CLASS; /* Go get the next char in the class */
5839 } /* End of range handling */
5840
5841
5842 /* Handle a single character. */
5843
5844 class_has_8bitchar +=
5845 add_to_class(classbits, &class_uchardata, options, cb, meta, meta);
5846 }
5847
5848 /* Continue to the next item in the class. */
5849
5850 CONTINUE_CLASS:
5851
5852 #ifdef SUPPORT_WIDE_CHARS
5853 /* If any wide characters or Unicode properties have been encountered,
5854 set xclass = TRUE. Then, in the pre-compile phase, accumulate the length
5855 of the extra data and reset the pointer. This is so that very large
5856 classes that contain a zillion wide characters or Unicode property tests
5857 do not overwrite the workspace (which is on the stack). */
5858
5859 if (class_uchardata > class_uchardata_base)
5860 {
5861 xclass = TRUE;
5862 if (lengthptr != NULL)
5863 {
5864 *lengthptr += class_uchardata - class_uchardata_base;
5865 class_uchardata = class_uchardata_base;
5866 }
5867 }
5868 #endif
5869
5870 continue; /* Needed to avoid error when not supporting wide chars */
5871 } /* End of main class-processing loop */
5872
5873 /* If this class is the first thing in the branch, there can be no first
5874 char setting, whatever the repeat count. Any reqcu setting must remain
5875 unchanged after any kind of repeat. */
5876
5877 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5878 zerofirstcu = firstcu;
5879 zerofirstcuflags = firstcuflags;
5880 zeroreqcu = reqcu;
5881 zeroreqcuflags = reqcuflags;
5882
5883 /* If there are characters with values > 255, or Unicode property settings
5884 (\p or \P), we have to compile an extended class, with its own opcode,
5885 unless there were no property settings and there was a negated special such
5886 as \S in the class, and PCRE2_UCP is not set, because in that case all
5887 characters > 255 are in or not in the class, so any that were explicitly
5888 given as well can be ignored.
5889
5890 In the UCP case, if certain negated POSIX classes ([:^ascii:] or
5891 [:^xdigit:]) were present in a class, we either have to match or not match
5892 all wide characters (depending on whether the whole class is or is not
5893 negated). This requirement is indicated by match_all_or_no_wide_chars being
5894 true. We do this by including an explicit range, which works in both cases.
5895 This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
5896 cannot be any wide characters in 8-bit non-UTF mode.
5897
5898 When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit
5899 class where \S etc is present without PCRE2_UCP, causing an extended class
5900 to be compiled, we make sure that all characters > 255 are included by
5901 forcing match_all_or_no_wide_chars to be true.
5902
5903 If, when generating an xclass, there are no characters < 256, we can omit
5904 the bitmap in the actual compiled code. */
5905
5906 #ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */
5907 if (xclass && (
5908 #ifdef SUPPORT_UNICODE
5909 (options & PCRE2_UCP) != 0 ||
5910 #endif
5911 xclass_has_prop || !should_flip_negation))
5912 {
5913 if (match_all_or_no_wide_chars || (
5914 #if PCRE2_CODE_UNIT_WIDTH == 8
5915 utf &&
5916 #endif
5917 should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0))
5918 {
5919 *class_uchardata++ = XCL_RANGE;
5920 if (utf) /* Will always be utf in the 8-bit library */
5921 {
5922 class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
5923 class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
5924 }
5925 else /* Can only happen for the 16-bit & 32-bit libraries */
5926 {
5927 #if PCRE2_CODE_UNIT_WIDTH == 16
5928 *class_uchardata++ = 0x100;
5929 *class_uchardata++ = 0xffffu;
5930 #elif PCRE2_CODE_UNIT_WIDTH == 32
5931 *class_uchardata++ = 0x100;
5932 *class_uchardata++ = 0xffffffffu;
5933 #endif
5934 }
5935 }
5936 *class_uchardata++ = XCL_END; /* Marks the end of extra data */
5937 *code++ = OP_XCLASS;
5938 code += LINK_SIZE;
5939 *code = negate_class? XCL_NOT:0;
5940 if (xclass_has_prop) *code |= XCL_HASPROP;
5941
5942 /* If the map is required, move up the extra data to make room for it;
5943 otherwise just move the code pointer to the end of the extra data. */
5944
5945 if (class_has_8bitchar > 0)
5946 {
5947 *code++ |= XCL_MAP;
5948 (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
5949 CU2BYTES(class_uchardata - code));
5950 if (negate_class && !xclass_has_prop)
5951 {
5952 /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
5953 for (i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
5954 }
5955 memcpy(code, classbits, 32);
5956 code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
5957 }
5958 else code = class_uchardata;
5959
5960 /* Now fill in the complete length of the item */
5961
5962 PUT(previous, 1, (int)(code - previous));
5963 break; /* End of class handling */
5964 }
5965 #endif /* SUPPORT_WIDE_CHARS */
5966
5967 /* If there are no characters > 255, or they are all to be included or
5968 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5969 whole class was negated and whether there were negative specials such as \S
5970 (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5971 negating it if necessary. */
5972
5973 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5974 if (lengthptr == NULL) /* Save time in the pre-compile phase */
5975 {
5976 if (negate_class)
5977 {
5978 /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
5979 for (i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
5980 }
5981 memcpy(code, classbits, 32);
5982 }
5983 code += 32 / sizeof(PCRE2_UCHAR);
5984 break; /* End of class processing */
5985
5986
5987 /* ===================================================================*/
5988 /* Deal with (*VERB)s. */
5989
5990 /* Check for open captures before ACCEPT and close those that are within
5991 the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
5992 assertion. In the first pass, just accumulate the length required;
5993 otherwise hitting (*ACCEPT) inside many nested parentheses can cause
5994 workspace overflow. Do not set firstcu after *ACCEPT. */
5995
5996 case META_ACCEPT:
5997 cb->had_accept = TRUE;
5998 for (oc = cb->open_caps;
5999 oc != NULL && oc->assert_depth >= cb->assert_depth;
6000 oc = oc->next)
6001 {
6002 if (lengthptr != NULL)
6003 {
6004 *lengthptr += CU2BYTES(1) + IMM2_SIZE;
6005 }
6006 else
6007 {
6008 *code++ = OP_CLOSE;
6009 PUT2INC(code, 0, oc->number);
6010 }
6011 }
6012 *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6013 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6014 break;
6015
6016 case META_PRUNE:
6017 case META_SKIP:
6018 cb->had_pruneorskip = TRUE;
6019 /* Fall through */
6020 case META_COMMIT:
6021 case META_FAIL:
6022 *code++ = verbops[(meta - META_MARK) >> 16];
6023 break;
6024
6025 case META_THEN:
6026 cb->external_flags |= PCRE2_HASTHEN;
6027 *code++ = OP_THEN;
6028 break;
6029
6030 /* Handle verbs with arguments. Arguments can be very long, especially in
6031 16- and 32-bit modes, and can overflow the workspace in the first pass.
6032 However, the argument length is constrained to be small enough to fit in
6033 one code unit. This check happens in parse_regex(). In the first pass,
6034 instead of putting the argument into memory, we just update the length
6035 counter and set up an empty argument. */
6036
6037 case META_THEN_ARG:
6038 cb->external_flags |= PCRE2_HASTHEN;
6039 goto VERB_ARG;
6040
6041 case META_PRUNE_ARG:
6042 case META_SKIP_ARG:
6043 cb->had_pruneorskip = TRUE;
6044 /* Fall through */
6045 case META_MARK:
6046 case META_COMMIT_ARG:
6047 VERB_ARG:
6048 *code++ = verbops[(meta - META_MARK) >> 16];
6049 /* The length is in characters. */
6050 verbarglen = *(++pptr);
6051 verbculen = 0;
6052 tempcode = code++;
6053 for (i = 0; i < (int)verbarglen; i++)
6054 {
6055 meta = *(++pptr);
6056 #ifdef SUPPORT_UNICODE
6057 if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
6058 #endif
6059 {
6060 mclength = 1;
6061 mcbuffer[0] = meta;
6062 }
6063 if (lengthptr != NULL) *lengthptr += mclength; else
6064 {
6065 memcpy(code, mcbuffer, CU2BYTES(mclength));
6066 code += mclength;
6067 verbculen += mclength;
6068 }
6069 }
6070
6071 *tempcode = verbculen; /* Fill in the code unit length */
6072 *code++ = 0; /* Terminating zero */
6073 break;
6074
6075
6076 /* ===================================================================*/
6077 /* Handle options change. The new setting must be passed back for use in
6078 subsequent branches. Reset the greedy defaults and the case value for
6079 firstcu and reqcu. */
6080
6081 case META_OPTIONS:
6082 *optionsptr = options = *(++pptr);
6083 greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6084 greedy_non_default = greedy_default ^ 1;
6085 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6086 break;
6087
6088
6089 /* ===================================================================*/
6090 /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
6091 because it could be a numerical check on recursion, or a name check on a
6092 group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
6093 we can handle it either way. We first try for a name; if not found, process
6094 the number. */
6095
6096 case META_COND_RNUMBER: /* (?(Rdigits) */
6097 case META_COND_NAME: /* (?(name) or (?'name') or ?(<name>) */
6098 case META_COND_RNAME: /* (?(R&name) - test for recursion */
6099 bravalue = OP_COND;
6100 {
6101 int count, index;
6102 PCRE2_SPTR name;
6103 named_group *ng = cb->named_groups;
6104 uint32_t length = *(++pptr);
6105
6106 GETPLUSOFFSET(offset, pptr);
6107 name = cb->start_pattern + offset;
6108
6109 /* In the first pass, the names generated in the pre-pass are available,
6110 but the main name table has not yet been created. Scan the list of names
6111 generated in the pre-pass in order to get a number and whether or not
6112 this name is duplicated. If it is not duplicated, we can handle it as a
6113 numerical group. */
6114
6115 for (i = 0; i < cb->names_found; i++, ng++)
6116 {
6117 if (length == ng->length &&
6118 PRIV(strncmp)(name, ng->name, length) == 0)
6119 {
6120 if (!ng->isdup)
6121 {
6122 code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6123 PUT2(code, 2+LINK_SIZE, ng->number);
6124 if (ng->number > cb->top_backref) cb->top_backref = ng->number;
6125 skipunits = 1+IMM2_SIZE;
6126 goto GROUP_PROCESS_NOTE_EMPTY;
6127 }
6128 break; /* Found a duplicated name */
6129 }
6130 }
6131
6132 /* If the name was not found we have a bad reference, unless we are
6133 dealing with R<digits>, which is treated as a recursion test by number.
6134 */
6135
6136 if (i >= cb->names_found)
6137 {
6138 groupnumber = 0;
6139 if (meta == META_COND_RNUMBER)
6140 {
6141 for (i = 1; i < (int)length; i++)
6142 {
6143 groupnumber = groupnumber * 10 + name[i] - CHAR_0;
6144 if (groupnumber > MAX_GROUP_NUMBER)
6145 {
6146 *errorcodeptr = ERR61;
6147 cb->erroroffset = offset + i;
6148 return 0;
6149 }
6150 }
6151 }
6152
6153 if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
6154 {
6155 *errorcodeptr = ERR15;
6156 cb->erroroffset = offset;
6157 return 0;
6158 }
6159
6160 /* (?(Rdigits) is treated as a recursion test by number. A value of
6161 zero (which is the result of both (?(R) and (?(R0)) means "any", and is
6162 translated into RREF_ANY (which is 0xffff). */
6163
6164 if (groupnumber == 0) groupnumber = RREF_ANY;
6165 code[1+LINK_SIZE] = OP_RREF;
6166 PUT2(code, 2+LINK_SIZE, groupnumber);
6167 skipunits = 1+IMM2_SIZE;
6168 goto GROUP_PROCESS_NOTE_EMPTY;
6169 }
6170
6171 /* A duplicated name was found. Note that if an R<digits> name is found
6172 (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6173
6174 code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6175
6176 /* We have a duplicated name. In the compile pass we have to search the
6177 main table in order to get the index and count values. */
6178
6179 count = 0; /* Values for first pass (avoids compiler warning) */
6180 index = 0;
6181 if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6182 &count, errorcodeptr, cb)) return 0;
6183
6184 /* Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and
6185 insert appropriate data values. */
6186
6187 code[1+LINK_SIZE]++;
6188 skipunits = 1+2*IMM2_SIZE;
6189 PUT2(code, 2+LINK_SIZE, index);
6190 PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6191 }
6192 goto GROUP_PROCESS_NOTE_EMPTY;
6193
6194 /* The DEFINE condition is always false. Its internal groups may never
6195 be called, so matched_char must remain false, hence the jump to
6196 GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
6197
6198 case META_COND_DEFINE:
6199 bravalue = OP_COND;
6200 GETPLUSOFFSET(offset, pptr);
6201 code[1+LINK_SIZE] = OP_DEFINE;
6202 skipunits = 1;
6203 goto GROUP_PROCESS;
6204
6205 /* Conditional test of a group's being set. */
6206
6207 case META_COND_NUMBER:
6208 bravalue = OP_COND;
6209 GETPLUSOFFSET(offset, pptr);
6210 groupnumber = *(++pptr);
6211 if (groupnumber > cb->bracount)
6212 {
6213 *errorcodeptr = ERR15;
6214 cb->erroroffset = offset;
6215 return 0;
6216 }
6217 if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
6218 offset -= 2; /* Point at initial ( for too many branches error */
6219 code[1+LINK_SIZE] = OP_CREF;
6220 skipunits = 1+IMM2_SIZE;
6221 PUT2(code, 2+LINK_SIZE, groupnumber);
6222 goto GROUP_PROCESS_NOTE_EMPTY;
6223
6224 /* Test for the PCRE2 version. */
6225
6226 case META_COND_VERSION:
6227 bravalue = OP_COND;
6228 if (pptr[1] > 0)
6229 code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
6230 (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
6231 OP_TRUE : OP_FALSE;
6232 else
6233 code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
6234 OP_TRUE : OP_FALSE;
6235 skipunits = 1;
6236 pptr += 3;
6237 goto GROUP_PROCESS_NOTE_EMPTY;
6238
6239 /* The condition is an assertion, possibly preceded by a callout. */
6240
6241 case META_COND_ASSERT:
6242 bravalue = OP_COND;
6243 goto GROUP_PROCESS_NOTE_EMPTY;
6244
6245
6246 /* ===================================================================*/
6247 /* Handle all kinds of nested bracketed groups. The non-capturing,
6248 non-conditional cases are here; others come to GROUP_PROCESS via goto. */
6249
6250 case META_LOOKAHEAD:
6251 bravalue = OP_ASSERT;
6252 cb->assert_depth += 1;
6253 goto GROUP_PROCESS;
6254
6255 /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6256 thing to do, but Perl allows all assertions to be quantified, and when
6257 they contain capturing parentheses there may be a potential use for
6258 this feature. Not that that applies to a quantified (?!) but we allow
6259 it for uniformity. */
6260
6261 case META_LOOKAHEADNOT:
6262 if (pptr[1] == META_KET &&
6263 (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
6264 {
6265 *code++ = OP_FAIL;
6266 pptr++;
6267 }
6268 else
6269 {
6270 bravalue = OP_ASSERT_NOT;
6271 cb->assert_depth += 1;
6272 goto GROUP_PROCESS;
6273 }
6274 break;
6275
6276 case META_LOOKBEHIND:
6277 bravalue = OP_ASSERTBACK;
6278 cb->assert_depth += 1;
6279 goto GROUP_PROCESS;
6280
6281 case META_LOOKBEHINDNOT:
6282 bravalue = OP_ASSERTBACK_NOT;
6283 cb->assert_depth += 1;
6284 goto GROUP_PROCESS;
6285
6286 case META_ATOMIC:
6287 bravalue = OP_ONCE;
6288 goto GROUP_PROCESS_NOTE_EMPTY;
6289
6290 case META_SCRIPT_RUN:
6291 bravalue = OP_SCRIPT_RUN;
6292 goto GROUP_PROCESS_NOTE_EMPTY;
6293
6294 case META_NOCAPTURE:
6295 bravalue = OP_BRA;
6296 /* Fall through */
6297
6298 /* Process nested bracketed regex. The nesting depth is maintained for the
6299 benefit of the stackguard function. The test for too deep nesting is now
6300 done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6301 others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6302 note of whether or not they may match an empty string. */
6303
6304 GROUP_PROCESS_NOTE_EMPTY:
6305 note_group_empty = TRUE;
6306
6307 GROUP_PROCESS:
6308 cb->parens_depth += 1;
6309 *code = bravalue;
6310 pptr++;
6311 tempcode = code;
6312 tempreqvary = cb->req_varyopt; /* Save value before group */
6313 length_prevgroup = 0; /* Initialize for pre-compile phase */
6314
6315 if ((group_return =
6316 compile_regex(
6317 options, /* The option state */
6318 &tempcode, /* Where to put code (updated) */
6319 &pptr, /* Input pointer (updated) */
6320 errorcodeptr, /* Where to put an error message */
6321 skipunits, /* Skip over bracket number */
6322 &subfirstcu, /* For possible first char */
6323 &subfirstcuflags,
6324 &subreqcu, /* For possible last char */
6325 &subreqcuflags,
6326 bcptr, /* Current branch chain */
6327 cb, /* Compile data block */
6328 (lengthptr == NULL)? NULL : /* Actual compile phase */
6329 &length_prevgroup /* Pre-compile phase */
6330 )) == 0)
6331 return 0; /* Error */
6332
6333 cb->parens_depth -= 1;
6334
6335 /* If that was a non-conditional significant group (not an assertion, not a
6336 DEFINE) that matches at least one character, then the current item matches
6337 a character. Conditionals are handled below. */
6338
6339 if (note_group_empty && bravalue != OP_COND && group_return > 0)
6340 matched_char = TRUE;
6341
6342 /* If we've just compiled an assertion, pop the assert depth. */
6343
6344 if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
6345 cb->assert_depth -= 1;
6346
6347 /* At the end of compiling, code is still pointing to the start of the
6348 group, while tempcode has been updated to point past the end of the group.
6349 The parsed pattern pointer (pptr) is on the closing META_KET.
6350
6351 If this is a conditional bracket, check that there are no more than
6352 two branches in the group, or just one if it's a DEFINE group. We do this
6353 in the real compile phase, not in the pre-pass, where the whole group may
6354 not be available. */
6355
6356 if (bravalue == OP_COND && lengthptr == NULL)
6357 {
6358 PCRE2_UCHAR *tc = code;
6359 int condcount = 0;
6360
6361 do {
6362 condcount++;
6363 tc += GET(tc,1);
6364 }
6365 while (*tc != OP_KET);
6366
6367 /* A DEFINE group is never obeyed inline (the "condition" is always
6368 false). It must have only one branch. Having checked this, change the
6369 opcode to OP_FALSE. */
6370
6371 if (code[LINK_SIZE+1] == OP_DEFINE)
6372 {
6373 if (condcount > 1)
6374 {
6375 cb->erroroffset = offset;
6376 *errorcodeptr = ERR54;
6377 return 0;
6378 }
6379 code[LINK_SIZE+1] = OP_FALSE;
6380 bravalue = OP_DEFINE; /* A flag to suppress char handling below */
6381 }
6382
6383 /* A "normal" conditional group. If there is just one branch, we must not
6384 make use of its firstcu or reqcu, because this is equivalent to an
6385 empty second branch. Also, it may match an empty string. If there are two
6386 branches, this item must match a character if the group must. */
6387
6388 else
6389 {
6390 if (condcount > 2)
6391 {
6392 cb->erroroffset = offset;
6393 *errorcodeptr = ERR27;
6394 return 0;
6395 }
6396 if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
6397 else if (group_return > 0) matched_char = TRUE;
6398 }
6399 }
6400
6401 /* In the pre-compile phase, update the length by the length of the group,
6402 less the brackets at either end. Then reduce the compiled code to just a
6403 set of non-capturing brackets so that it doesn't use much memory if it is
6404 duplicated by a quantifier. */
6405
6406 if (lengthptr != NULL)
6407 {
6408 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6409 {
6410 *errorcodeptr = ERR20;
6411 return 0;
6412 }
6413 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6414 code++; /* This already contains bravalue */
6415 PUTINC(code, 0, 1 + LINK_SIZE);
6416 *code++ = OP_KET;
6417 PUTINC(code, 0, 1 + LINK_SIZE);
6418 break; /* No need to waste time with special character handling */
6419 }
6420
6421 /* Otherwise update the main code pointer to the end of the group. */
6422
6423 code = tempcode;
6424
6425 /* For a DEFINE group, required and first character settings are not
6426 relevant. */
6427
6428 if (bravalue == OP_DEFINE) break;
6429
6430 /* Handle updating of the required and first code units for other types of
6431 group. Update for normal brackets of all kinds, and conditions with two
6432 branches (see code above). If the bracket is followed by a quantifier with
6433 zero repeat, we have to back off. Hence the definition of zeroreqcu and
6434 zerofirstcu outside the main loop so that they can be accessed for the back
6435 off. */
6436
6437 zeroreqcu = reqcu;
6438 zeroreqcuflags = reqcuflags;
6439 zerofirstcu = firstcu;
6440 zerofirstcuflags = firstcuflags;
6441 groupsetfirstcu = FALSE;
6442
6443 if (bravalue >= OP_ONCE) /* Not an assertion */
6444 {
6445 /* If we have not yet set a firstcu in this branch, take it from the
6446 subpattern, remembering that it was set here so that a repeat of more
6447 than one can replicate it as reqcu if necessary. If the subpattern has
6448 no firstcu, set "none" for the whole branch. In both cases, a zero
6449 repeat forces firstcu to "none". */
6450
6451 if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
6452 {
6453 if (subfirstcuflags >= 0)
6454 {
6455 firstcu = subfirstcu;
6456 firstcuflags = subfirstcuflags;
6457 groupsetfirstcu = TRUE;
6458 }
6459 else firstcuflags = REQ_NONE;
6460 zerofirstcuflags = REQ_NONE;
6461 }
6462
6463 /* If firstcu was previously set, convert the subpattern's firstcu
6464 into reqcu if there wasn't one, using the vary flag that was in
6465 existence beforehand. */
6466
6467 else if (subfirstcuflags >= 0 && subreqcuflags < 0)
6468 {
6469 subreqcu = subfirstcu;
6470 subreqcuflags = subfirstcuflags | tempreqvary;
6471 }
6472
6473 /* If the subpattern set a required code unit (or set a first code unit
6474 that isn't really the first code unit - see above), set it. */
6475
6476 if (subreqcuflags >= 0)
6477 {
6478 reqcu = subreqcu;
6479 reqcuflags = subreqcuflags;
6480 }
6481 }
6482
6483 /* For a forward assertion, we take the reqcu, if set, provided that the
6484 group has also set a firstcu. This can be helpful if the pattern that
6485 follows the assertion doesn't set a different char. For example, it's
6486 useful for /(?=abcde).+/. We can't set firstcu for an assertion, however,
6487 because it leads to an incorrect effect for patterns such as /(?=a)a.+/ when
6488 the "real" "a" would then become a reqcu instead of a firstcu. This is
6489 overcome by a scan at the end if there's no firstcu, looking for an
6490 asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
6491 we must only take the reqcu when the group also set a firstcu. Otherwise,
6492 in that example, 'X' ends up set for both. */
6493
6494 else if (bravalue == OP_ASSERT && subreqcuflags >= 0 &&
6495 subfirstcuflags >= 0)
6496 {
6497 reqcu = subreqcu;
6498 reqcuflags = subreqcuflags;
6499 }
6500
6501 break; /* End of nested group handling */
6502
6503
6504 /* ===================================================================*/
6505 /* Handle named backreferences and recursions. */
6506
6507 case META_BACKREF_BYNAME:
6508 case META_RECURSE_BYNAME:
6509 {
6510 int count, index;
6511 PCRE2_SPTR name;
6512 BOOL is_dupname = FALSE;
6513 named_group *ng = cb->named_groups;
6514 uint32_t length = *(++pptr);
6515
6516 GETPLUSOFFSET(offset, pptr);
6517 name = cb->start_pattern + offset;
6518
6519 /* In the first pass, the names generated in the pre-pass are available,
6520 but the main name table has not yet been created. Scan the list of names
6521 generated in the pre-pass in order to get a number and whether or not
6522 this name is duplicated. */
6523
6524 groupnumber = 0;
6525 for (i = 0; i < cb->names_found; i++, ng++)
6526 {
6527 if (length == ng->length &&
6528 PRIV(strncmp)(name, ng->name, length) == 0)
6529 {
6530 is_dupname = ng->isdup;
6531 groupnumber = ng->number;
6532
6533 /* For a recursion, that's all that is needed. We can now go to
6534 the code that handles numerical recursion, applying it to the first
6535 group with the given name. */
6536
6537 if (meta == META_RECURSE_BYNAME)
6538 {
6539 meta_arg = groupnumber;
6540 goto HANDLE_NUMERICAL_RECURSION;
6541 }
6542
6543 /* For a back reference, update the back reference map and the
6544 maximum back reference. Then, for each group, we must check to
6545 see if it is recursive, that is, it is inside the group that it
6546 references. A flag is set so that the group can be made atomic.
6547 */
6548
6549 cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
6550 if (groupnumber > cb->top_backref)
6551 cb->top_backref = groupnumber;
6552
6553 for (oc = cb->open_caps; oc != NULL; oc = oc->next)
6554 {
6555 if (oc->number == groupnumber)
6556 {
6557 oc->flag = TRUE;
6558 break;
6559 }
6560 }
6561 }
6562 }
6563
6564 /* If the name was not found we have a bad reference. */
6565
6566 if (groupnumber == 0)
6567 {
6568 *errorcodeptr = ERR15;
6569 cb->erroroffset = offset;
6570 return 0;
6571 }
6572
6573 /* If a back reference name is not duplicated, we can handle it as
6574 a numerical reference. */
6575
6576 if (!is_dupname)
6577 {
6578 meta_arg = groupnumber;
6579 goto HANDLE_SINGLE_REFERENCE;
6580 }
6581
6582 /* If a back reference name is duplicated, we generate a different
6583 opcode to a numerical back reference. In the second pass we must
6584 search for the index and count in the final name table. */
6585
6586 count = 0; /* Values for first pass (avoids compiler warning) */
6587 index = 0;
6588 if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6589 &count, errorcodeptr, cb)) return 0;
6590
6591 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6592 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
6593 PUT2INC(code, 0, index);
6594 PUT2INC(code, 0, count);
6595 }
6596 break;
6597
6598
6599 /* ===================================================================*/
6600 /* Handle a numerical callout. */
6601
6602 case META_CALLOUT_NUMBER:
6603 code[0] = OP_CALLOUT;
6604 PUT(code, 1, pptr[1]); /* Offset to next pattern item */
6605 PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */
6606 code[1 + 2*LINK_SIZE] = pptr[3];
6607 pptr += 3;
6608 code += PRIV(OP_lengths)[OP_CALLOUT];
6609 break;
6610
6611
6612 /* ===================================================================*/
6613 /* Handle a callout with a string argument. In the pre-pass we just compute
6614 the length without generating anything. The length in pptr[3] includes both
6615 delimiters; in the actual compile only the first one is copied, but a
6616 terminating zero is added. Any doubled delimiters within the string make
6617 this an overestimate, but it is not worth bothering about. */
6618
6619 case META_CALLOUT_STRING:
6620 if (lengthptr != NULL)
6621 {
6622 *lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
6623 pptr += 3;
6624 SKIPOFFSET(pptr);
6625 }
6626
6627 /* In the real compile we can copy the string. The starting delimiter is
6628 included so that the client can discover it if they want. We also pass the
6629 start offset to help a script language give better error messages. */
6630
6631 else
6632 {
6633 PCRE2_SPTR pp;
6634 uint32_t delimiter;
6635 uint32_t length = pptr[3];
6636 PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
6637
6638 code[0] = OP_CALLOUT_STR;
6639 PUT(code, 1, pptr[1]); /* Offset to next pattern item */
6640 PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */
6641
6642 pptr += 3;
6643 GETPLUSOFFSET(offset, pptr); /* Offset to string in pattern */
6644 pp = cb->start_pattern + offset;
6645 delimiter = *callout_string++ = *pp++;
6646 if (delimiter == CHAR_LEFT_CURLY_BRACKET)
6647 delimiter = CHAR_RIGHT_CURLY_BRACKET;
6648 PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1)); /* One after delimiter */
6649
6650 /* The syntax of the pattern was checked in the parsing scan. The length
6651 includes both delimiters, but we have passed the opening one just above,
6652 so we reduce length before testing it. The test is for > 1 because we do
6653 not want to copy the final delimiter. This also ensures that pp[1] is
6654 accessible. */
6655
6656 while (--length > 1)
6657 {
6658 if (*pp == delimiter && pp[1] == delimiter)
6659 {
6660 *callout_string++ = delimiter;
6661 pp += 2;
6662 length--;
6663 }
6664 else *callout_string++ = *pp++;
6665 }
6666 *callout_string++ = CHAR_NUL;
6667
6668 /* Set the length of the entire item, then advance to its end. */
6669
6670 PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
6671 code = callout_string;
6672 }
6673 break;
6674
6675
6676 /* ===================================================================*/
6677 /* Handle repetition. The different types are all sorted out in the parsing
6678 pass. */
6679
6680 case META_MINMAX_PLUS:
6681 case META_MINMAX_QUERY:
6682 case META_MINMAX:
6683 repeat_min = *(++pptr);
6684 repeat_max = *(++pptr);
6685 goto REPEAT;
6686
6687 case META_ASTERISK:
6688 case META_ASTERISK_PLUS:
6689 case META_ASTERISK_QUERY:
6690 repeat_min = 0;
6691 repeat_max = REPEAT_UNLIMITED;
6692 goto REPEAT;
6693
6694 case META_PLUS:
6695 case META_PLUS_PLUS:
6696 case META_PLUS_QUERY:
6697 repeat_min = 1;
6698 repeat_max = REPEAT_UNLIMITED;
6699 goto REPEAT;
6700
6701 case META_QUERY:
6702 case META_QUERY_PLUS:
6703 case META_QUERY_QUERY:
6704 repeat_min = 0;
6705 repeat_max = 1;
6706
6707 REPEAT:
6708 if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
6709
6710 /* Remember whether this is a variable length repeat, and default to
6711 single-char opcodes. */
6712
6713 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
6714 op_type = 0;
6715
6716 /* If the repeat is {1} we can ignore it. */
6717
6718 if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
6719
6720 /* Adjust first and required code units for a zero repeat. */
6721
6722 if (repeat_min == 0)
6723 {
6724 firstcu = zerofirstcu;
6725 firstcuflags = zerofirstcuflags;
6726 reqcu = zeroreqcu;
6727 reqcuflags = zeroreqcuflags;
6728 }
6729
6730 /* Note the greediness and possessiveness. */
6731
6732 switch (meta)
6733 {
6734 case META_MINMAX_PLUS:
6735 case META_ASTERISK_PLUS:
6736 case META_PLUS_PLUS:
6737 case META_QUERY_PLUS:
6738 repeat_type = 0; /* Force greedy */
6739 possessive_quantifier = TRUE;
6740 break;
6741
6742 case META_MINMAX_QUERY:
6743 case META_ASTERISK_QUERY:
6744 case META_PLUS_QUERY:
6745 case META_QUERY_QUERY:
6746 repeat_type = greedy_non_default;
6747 possessive_quantifier = FALSE;
6748 break;
6749
6750 default:
6751 repeat_type = greedy_default;
6752 possessive_quantifier = FALSE;
6753 break;
6754 }
6755
6756 /* Save start of previous item, in case we have to move it up in order to
6757 insert something before it, and remember what it was. */
6758
6759 tempcode = previous;
6760 op_previous = *previous;
6761
6762 /* Now handle repetition for the different types of item. */
6763
6764 switch (op_previous)
6765 {
6766 /* If previous was a character or negated character match, abolish the
6767 item and generate a repeat item instead. If a char item has a minimum of
6768 more than one, ensure that it is set in reqcu - it might not be if a
6769 sequence such as x{3} is the first thing in a branch because the x will
6770 have gone into firstcu instead. */
6771
6772 case OP_CHAR:
6773 case OP_CHARI:
6774 case OP_NOT:
6775 case OP_NOTI:
6776 op_type = chartypeoffset[op_previous - OP_CHAR];
6777
6778 /* Deal with UTF characters that take up more than one code unit. */
6779
6780 #ifdef MAYBE_UTF_MULTI
6781 if (utf && NOT_FIRSTCU(code[-1]))
6782 {
6783 PCRE2_UCHAR *lastchar = code - 1;
6784 BACKCHAR(lastchar);
6785 mclength = (uint32_t)(code - lastchar); /* Length of UTF character */
6786 memcpy(mcbuffer, lastchar, CU2BYTES(mclength)); /* Save the char */
6787 }
6788 else
6789 #endif /* MAYBE_UTF_MULTI */
6790
6791 /* Handle the case of a single code unit - either with no UTF support, or
6792 with UTF disabled, or for a single-code-unit UTF character. */
6793 {
6794 mcbuffer[0] = code[-1];
6795 mclength = 1;
6796 if (op_previous <= OP_CHARI && repeat_min > 1)
6797 {
6798 reqcu = mcbuffer[0];
6799 reqcuflags = req_caseopt | cb->req_varyopt;
6800 }
6801 }
6802 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
6803
6804 /* If previous was a character class or a back reference, we put the
6805 repeat stuff after it, but just skip the item if the repeat was {0,0}. */
6806
6807 #ifdef SUPPORT_WIDE_CHARS
6808 case OP_XCLASS:
6809 #endif
6810 case OP_CLASS:
6811 case OP_NCLASS:
6812 case OP_REF:
6813 case OP_REFI:
6814 case OP_DNREF:
6815 case OP_DNREFI:
6816
6817 if (repeat_max == 0)
6818 {
6819 code = previous;
6820 goto END_REPEAT;
6821 }
6822
6823 if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
6824 *code++ = OP_CRSTAR + repeat_type;
6825 else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
6826 *code++ = OP_CRPLUS + repeat_type;
6827 else if (repeat_min == 0 && repeat_max == 1)
6828 *code++ = OP_CRQUERY + repeat_type;
6829 else
6830 {
6831 *code++ = OP_CRRANGE + repeat_type;
6832 PUT2INC(code, 0, repeat_min);
6833 if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0; /* 2-byte encoding for max */
6834 PUT2INC(code, 0, repeat_max);
6835 }
6836 break;
6837
6838 /* If previous is OP_FAIL, it was generated by an empty class []
6839 (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be
6840 generated, that is by (*FAIL) or (?!), disallow a quantifier at parse
6841 time. We can just ignore this repeat. */
6842
6843 case OP_FAIL:
6844 goto END_REPEAT;
6845
6846 /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
6847 because pcre2_match() could not handle backtracking into recursively
6848 called groups. Now that this backtracking is available, we no longer need
6849 to do this. However, we still need to replicate recursions as we do for
6850 groups so as to have independent backtracking points. We can replicate
6851 for the minimum number of repeats directly. For optional repeats we now
6852 wrap the recursion in OP_BRA brackets and make use of the bracket
6853 repetition. */
6854
6855 case OP_RECURSE:
6856
6857 /* Generate unwrapped repeats for a non-zero minimum, except when the
6858 minimum is 1 and the maximum unlimited, because that can be handled with
6859 OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
6860 minimum, we just need to generate the appropriate additional copies.
6861 Otherwise we need to generate one more, to simulate the situation when
6862 the minimum is zero. */
6863
6864 if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
6865 {
6866 int replicate = repeat_min;
6867 if (repeat_min == repeat_max) replicate--;
6868
6869 /* In the pre-compile phase, we don't actually do the replication. We
6870 just adjust the length as if we had. Do some paranoid checks for
6871 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
6872 integer type when available, otherwise double. */
6873
6874 if (lengthptr != NULL)
6875 {
6876 PCRE2_SIZE delta = replicate*(1 + LINK_SIZE);
6877 if ((INT64_OR_DOUBLE)replicate*
6878 (INT64_OR_DOUBLE)(1 + LINK_SIZE) >
6879 (INT64_OR_DOUBLE)INT_MAX ||
6880 OFLOW_MAX - *lengthptr < delta)
6881 {
6882 *errorcodeptr = ERR20;
6883 return 0;
6884 }
6885 *lengthptr += delta;
6886 }
6887
6888 else for (i = 0; i < replicate; i++)
6889 {
6890 memcpy(code, previous, CU2BYTES(1 + LINK_SIZE));
6891 previous = code;
6892 code += 1 + LINK_SIZE;
6893 }
6894
6895 /* If the number of repeats is fixed, we are done. Otherwise, adjust
6896 the counts and fall through. */
6897
6898 if (repeat_min == repeat_max) break;
6899 if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
6900 repeat_min = 0;
6901 }
6902
6903 /* Wrap the recursion call in OP_BRA brackets. */
6904
6905 (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));
6906 op_previous = *previous = OP_BRA;
6907 PUT(previous, 1, 2 + 2*LINK_SIZE);
6908 previous[2 + 2*LINK_SIZE] = OP_KET;
6909 PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
6910 code += 2 + 2 * LINK_SIZE;
6911 length_prevgroup = 3 + 3*LINK_SIZE;
6912 group_return = -1; /* Set "may match empty string" */
6913
6914 /* Now treat as a repeated OP_BRA. */
6915 /* Fall through */
6916
6917 /* If previous was a bracket group, we may have to replicate it in
6918 certain cases. Note that at this point we can encounter only the "basic"
6919 bracket opcodes such as BRA and CBRA, as this is the place where they get
6920 converted into the more special varieties such as BRAPOS and SBRA.
6921 Originally, PCRE did not allow repetition of assertions, but now it does,
6922 for Perl compatibility. */
6923
6924 case OP_ASSERT:
6925 case OP_ASSERT_NOT:
6926 case OP_ASSERTBACK:
6927 case OP_ASSERTBACK_NOT:
6928 case OP_ONCE:
6929 case OP_SCRIPT_RUN:
6930 case OP_BRA:
6931 case OP_CBRA:
6932 case OP_COND:
6933 {
6934 int len = (int)(code - previous);
6935 PCRE2_UCHAR *bralink = NULL;
6936 PCRE2_UCHAR *brazeroptr = NULL;
6937
6938 /* Repeating a DEFINE group (or any group where the condition is always
6939 FALSE and there is only one branch) is pointless, but Perl allows the
6940 syntax, so we just ignore the repeat. */
6941
6942 if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
6943 previous[GET(previous, 1)] != OP_ALT)
6944 goto END_REPEAT;
6945
6946 /* There is no sense in actually repeating assertions. The only
6947 potential use of repetition is in cases when the assertion is optional.
6948 Therefore, if the minimum is greater than zero, just ignore the repeat.
6949 If the maximum is not zero or one, set it to 1. */
6950
6951 if (op_previous < OP_ONCE) /* Assertion */
6952 {
6953 if (repeat_min > 0) goto END_REPEAT;
6954 if (repeat_max > 1) repeat_max = 1;
6955 }
6956
6957 /* The case of a zero minimum is special because of the need to stick
6958 OP_BRAZERO in front of it, and because the group appears once in the
6959 data, whereas in other cases it appears the minimum number of times. For
6960 this reason, it is simplest to treat this case separately, as otherwise
6961 the code gets far too messy. There are several special subcases when the
6962 minimum is zero. */
6963
6964 if (repeat_min == 0)
6965 {
6966 /* If the maximum is also zero, we used to just omit the group from
6967 the output altogether, like this:
6968
6969 ** if (repeat_max == 0)
6970 ** {
6971 ** code = previous;
6972 ** goto END_REPEAT;
6973 ** }
6974
6975 However, that fails when a group or a subgroup within it is
6976 referenced as a subroutine from elsewhere in the pattern, so now we
6977 stick in OP_SKIPZERO in front of it so that it is skipped on
6978 execution. As we don't have a list of which groups are referenced, we
6979 cannot do this selectively.
6980
6981 If the maximum is 1 or unlimited, we just have to stick in the
6982 BRAZERO and do no more at this point. */
6983
6984 if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
6985 {
6986 (void)memmove(previous + 1, previous, CU2BYTES(len));
6987 code++;
6988 if (repeat_max == 0)
6989 {
6990 *previous++ = OP_SKIPZERO;
6991 goto END_REPEAT;
6992 }
6993 brazeroptr = previous; /* Save for possessive optimizing */
6994 *previous++ = OP_BRAZERO + repeat_type;
6995 }
6996
6997 /* If the maximum is greater than 1 and limited, we have to replicate
6998 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
6999 The first one has to be handled carefully because it's the original
7000 copy, which has to be moved up. The remainder can be handled by code
7001 that is common with the non-zero minimum case below. We have to
7002 adjust the value of repeat_max, since one less copy is required. */
7003
7004 else
7005 {
7006 int linkoffset;
7007 (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
7008 code += 2 + LINK_SIZE;
7009 *previous++ = OP_BRAZERO + repeat_type;
7010 *previous++ = OP_BRA;
7011
7012 /* We chain together the bracket link offset fields that have to be
7013 filled in later when the ends of the brackets are reached. */
7014
7015 linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
7016 bralink = previous;
7017 PUTINC(previous, 0, linkoffset);
7018 }
7019
7020 if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
7021 }
7022
7023 /* If the minimum is greater than zero, replicate the group as many
7024 times as necessary, and adjust the maximum to the number of subsequent
7025 copies that we need. */
7026
7027 else
7028 {
7029 if (repeat_min > 1)
7030 {
7031 /* In the pre-compile phase, we don't actually do the replication.
7032 We just adjust the length as if we had. Do some paranoid checks for
7033 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
7034 integer type when available, otherwise double. */
7035
7036 if (lengthptr != NULL)
7037 {
7038 PCRE2_SIZE delta = (repeat_min - 1)*length_prevgroup;
7039 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
7040 (INT64_OR_DOUBLE)length_prevgroup >
7041 (INT64_OR_DOUBLE)INT_MAX ||
7042 OFLOW_MAX - *lengthptr < delta)
7043 {
7044 *errorcodeptr = ERR20;
7045 return 0;
7046 }
7047 *lengthptr += delta;
7048 }
7049
7050 /* This is compiling for real. If there is a set first code unit
7051 for the group, and we have not yet set a "required code unit", set
7052 it. */
7053
7054 else
7055 {
7056 if (groupsetfirstcu && reqcuflags < 0)
7057 {
7058 reqcu = firstcu;
7059 reqcuflags = firstcuflags;
7060 }
7061 for (i = 1; (uint32_t)i < repeat_min; i++)
7062 {
7063 memcpy(code, previous, CU2BYTES(len));
7064 code += len;
7065 }
7066 }
7067 }
7068
7069 if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7070 }
7071
7072 /* This code is common to both the zero and non-zero minimum cases. If
7073 the maximum is limited, it replicates the group in a nested fashion,
7074 remembering the bracket starts on a stack. In the case of a zero
7075 minimum, the first one was set up above. In all cases the repeat_max
7076 now specifies the number of additional copies needed. Again, we must
7077 remember to replicate entries on the forward reference list. */
7078
7079 if (repeat_max != REPEAT_UNLIMITED)
7080 {
7081 /* In the pre-compile phase, we don't actually do the replication. We
7082 just adjust the length as if we had. For each repetition we must add
7083 1 to the length for BRAZERO and for all but the last repetition we
7084 must add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
7085 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type
7086 is a 64-bit integer type when available, otherwise double. */
7087
7088 if (lengthptr != NULL && repeat_max > 0)
7089 {
7090 PCRE2_SIZE delta = repeat_max*(length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
7091 2 - 2*LINK_SIZE; /* Last one doesn't nest */
7092 if ((INT64_OR_DOUBLE)repeat_max *
7093 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
7094 > (INT64_OR_DOUBLE)INT_MAX ||
7095 OFLOW_MAX - *lengthptr < delta)
7096 {
7097 *errorcodeptr = ERR20;
7098 return 0;
7099 }
7100 *lengthptr += delta;
7101 }
7102
7103 /* This is compiling for real */
7104
7105 else for (i = repeat_max - 1; i >= 0; i--)
7106 {
7107 *code++ = OP_BRAZERO + repeat_type;
7108
7109 /* All but the final copy start a new nesting, maintaining the
7110 chain of brackets outstanding. */
7111
7112 if (i != 0)
7113 {
7114 int linkoffset;
7115 *code++ = OP_BRA;
7116 linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
7117 bralink = code;
7118 PUTINC(code, 0, linkoffset);
7119 }
7120
7121 memcpy(code, previous, CU2BYTES(len));
7122 code += len;
7123 }
7124
7125 /* Now chain through the pending brackets, and fill in their length
7126 fields (which are holding the chain links pro tem). */
7127
7128 while (bralink != NULL)
7129 {
7130 int oldlinkoffset;
7131 int linkoffset = (int)(code - bralink + 1);
7132 PCRE2_UCHAR *bra = code - linkoffset;
7133 oldlinkoffset = GET(bra, 1);
7134 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
7135 *code++ = OP_KET;
7136 PUTINC(code, 0, linkoffset);
7137 PUT(bra, 1, linkoffset);
7138 }
7139 }
7140
7141 /* If the maximum is unlimited, set a repeater in the final copy. For
7142 SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
7143 possessively repeated ONCE brackets can be converted into non-capturing
7144 brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
7145 saves having to deal with possessive ONCEs specially.
7146
7147 Otherwise, when we are doing the actual compile phase, check to see
7148 whether this group is one that could match an empty string. If so,
7149 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
7150 that runtime checking can be done. [This check is also applied to ONCE
7151 and SCRIPT_RUN groups at runtime, but in a different way.]
7152
7153 Then, if the quantifier was possessive and the bracket is not a
7154 conditional, we convert the BRA code to the POS form, and the KET code to
7155 KETRPOS. (It turns out to be convenient at runtime to detect this kind of
7156 subpattern at both the start and at the end.) The use of special opcodes
7157 makes it possible to reduce greatly the stack usage in pcre2_match(). If
7158 the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
7159
7160 Then, if the minimum number of matches is 1 or 0, cancel the possessive
7161 flag so that the default action below, of wrapping everything inside
7162 atomic brackets, does not happen. When the minimum is greater than 1,
7163 there will be earlier copies of the group, and so we still have to wrap
7164 the whole thing. */
7165
7166 else
7167 {
7168 PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
7169 PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
7170
7171 /* Convert possessive ONCE brackets to non-capturing */
7172
7173 if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
7174
7175 /* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
7176 to do is to set the KET. */
7177
7178 if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
7179 *ketcode = OP_KETRMAX + repeat_type;
7180
7181 /* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
7182 (which have been converted to non-capturing above). */
7183
7184 else
7185 {
7186 /* In the compile phase, adjust the opcode if the group can match
7187 an empty string. For a conditional group with only one branch, the
7188 value of group_return will not show "could be empty", so we must
7189 check that separately. */
7190
7191 if (lengthptr == NULL)
7192 {
7193 if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
7194 if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
7195 *bracode = OP_SCOND;
7196 }
7197
7198 /* Handle possessive quantifiers. */
7199
7200 if (possessive_quantifier)
7201 {
7202 /* For COND brackets, we wrap the whole thing in a possessively
7203 repeated non-capturing bracket, because we have not invented POS
7204 versions of the COND opcodes. */
7205
7206 if (*bracode == OP_COND || *bracode == OP_SCOND)
7207 {
7208 int nlen = (int)(code - bracode);
7209 (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
7210 code += 1 + LINK_SIZE;
7211 nlen += 1 + LINK_SIZE;
7212 *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
7213 *code++ = OP_KETRPOS;
7214 PUTINC(code, 0, nlen);
7215 PUT(bracode, 1, nlen);
7216 }
7217
7218 /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
7219
7220 else
7221 {
7222 *bracode += 1; /* Switch to xxxPOS opcodes */
7223 *ketcode = OP_KETRPOS;
7224 }
7225
7226 /* If the minimum is zero, mark it as possessive, then unset the
7227 possessive flag when the minimum is 0 or 1. */
7228
7229 if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
7230 if (repeat_min < 2) possessive_quantifier = FALSE;
7231 }
7232
7233 /* Non-possessive quantifier */
7234
7235 else *ketcode = OP_KETRMAX + repeat_type;
7236 }
7237 }
7238 }
7239 break;
7240
7241 /* If previous was a character type match (\d or similar), abolish it and
7242 create a suitable repeat item. The code is shared with single-character
7243 repeats by setting op_type to add a suitable offset into repeat_type.
7244 Note that the Unicode property types will be present only when
7245 SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
7246 here because it just makes it horribly messy. */
7247
7248 default:
7249 if (op_previous >= OP_EODN) /* Not a character type - internal error */
7250 {
7251 *errorcodeptr = ERR10;
7252 return 0;
7253 }
7254 else
7255 {
7256 int prop_type, prop_value;
7257 PCRE2_UCHAR *oldcode;
7258
7259 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
7260 mclength = 0; /* Not a character */
7261
7262 if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
7263 {
7264 prop_type = previous[1];
7265 prop_value = previous[2];
7266 }
7267 else
7268 {
7269 /* Come here from just above with a character in mcbuffer/mclength. */
7270 OUTPUT_SINGLE_REPEAT:
7271 prop_type = prop_value = -1;
7272 }
7273
7274 /* At this point, if prop_type == prop_value == -1 we either have a
7275 character in mcbuffer when mclength is greater than zero, or we have
7276 mclength zero, in which case there is a non-property character type in
7277 op_previous. If prop_type/value are not negative, we have a property
7278 character type in op_previous. */
7279
7280 oldcode = code; /* Save where we were */
7281 code = previous; /* Usually overwrite previous item */
7282
7283 /* If the maximum is zero then the minimum must also be zero; Perl allows
7284 this case, so we do too - by simply omitting the item altogether. */
7285
7286 if (repeat_max == 0) goto END_REPEAT;
7287
7288 /* Combine the op_type with the repeat_type */
7289
7290 repeat_type += op_type;
7291
7292 /* A minimum of zero is handled either as the special case * or ?, or as
7293 an UPTO, with the maximum given. */
7294
7295 if (repeat_min == 0)
7296 {
7297 if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7298 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
7299 else
7300 {
7301 *code++ = OP_UPTO + repeat_type;
7302 PUT2INC(code, 0, repeat_max);
7303 }
7304 }
7305
7306 /* A repeat minimum of 1 is optimized into some special cases. If the
7307 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7308 left in place and, if the maximum is greater than 1, we use OP_UPTO with
7309 one less than the maximum. */
7310
7311 else if (repeat_min == 1)
7312 {
7313 if (repeat_max == REPEAT_UNLIMITED)
7314 *code++ = OP_PLUS + repeat_type;
7315 else
7316 {
7317 code = oldcode; /* Leave previous item in place */
7318 if (repeat_max == 1) goto END_REPEAT;
7319 *code++ = OP_UPTO + repeat_type;
7320 PUT2INC(code, 0, repeat_max - 1);
7321 }
7322 }
7323
7324 /* The case {n,n} is just an EXACT, while the general case {n,m} is
7325 handled as an EXACT followed by an UPTO or STAR or QUERY. */
7326
7327 else
7328 {
7329 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
7330 PUT2INC(code, 0, repeat_min);
7331
7332 /* Unless repeat_max equals repeat_min, fill in the data for EXACT,
7333 and then generate the second opcode. For a repeated Unicode property
7334 match, there are two extra values that define the required property,
7335 and mclength is set zero to indicate this. */
7336
7337 if (repeat_max != repeat_min)
7338 {
7339 if (mclength > 0)
7340 {
7341 memcpy(code, mcbuffer, CU2BYTES(mclength));
7342 code += mclength;
7343 }
7344 else
7345 {
7346 *code++ = op_previous;
7347 if (prop_type >= 0)
7348 {
7349 *code++ = prop_type;
7350 *code++ = prop_value;
7351 }
7352 }
7353
7354 /* Now set up the following opcode */
7355
7356 if (repeat_max == REPEAT_UNLIMITED)
7357 *code++ = OP_STAR + repeat_type;
7358 else
7359 {
7360 repeat_max -= repeat_min;
7361 if (repeat_max == 1)
7362 {
7363 *code++ = OP_QUERY + repeat_type;
7364 }
7365 else
7366 {
7367 *code++ = OP_UPTO + repeat_type;
7368 PUT2INC(code, 0, repeat_max);
7369 }
7370 }
7371 }
7372 }
7373
7374 /* Fill in the character or character type for the final opcode. */
7375
7376 if (mclength > 0)
7377 {
7378 memcpy(code, mcbuffer, CU2BYTES(mclength));
7379 code += mclength;
7380 }
7381 else
7382 {
7383 *code++ = op_previous;
7384 if (prop_type >= 0)
7385 {
7386 *code++ = prop_type;
7387 *code++ = prop_value;
7388 }
7389 }
7390 }
7391 break;
7392 } /* End of switch on different op_previous values */
7393
7394
7395 /* If the character following a repeat is '+', possessive_quantifier is
7396 TRUE. For some opcodes, there are special alternative opcodes for this
7397 case. For anything else, we wrap the entire repeated item inside OP_ONCE
7398 brackets. Logically, the '+' notation is just syntactic sugar, taken from
7399 Sun's Java package, but the special opcodes can optimize it.
7400
7401 Some (but not all) possessively repeated subpatterns have already been
7402 completely handled in the code just above. For them, possessive_quantifier
7403 is always FALSE at this stage. Note that the repeated item starts at
7404 tempcode, not at previous, which might be the first part of a string whose
7405 (former) last char we repeated. */
7406
7407 if (possessive_quantifier)
7408 {
7409 int len;
7410
7411 /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
7412 However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
7413 {5,}, or {5,10}). We skip over an EXACT item; if the length of what
7414 remains is greater than zero, there's a further opcode that can be
7415 handled. If not, do nothing, leaving the EXACT alone. */
7416
7417 switch(*tempcode)
7418 {
7419 case OP_TYPEEXACT:
7420 tempcode += PRIV(OP_lengths)[*tempcode] +
7421 ((tempcode[1 + IMM2_SIZE] == OP_PROP
7422 || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
7423 break;
7424
7425 /* CHAR opcodes are used for exacts whose count is 1. */
7426
7427 case OP_CHAR:
7428 case OP_CHARI:
7429 case OP_NOT:
7430 case OP_NOTI:
7431 case OP_EXACT:
7432 case OP_EXACTI:
7433 case OP_NOTEXACT:
7434 case OP_NOTEXACTI:
7435 tempcode += PRIV(OP_lengths)[*tempcode];
7436 #ifdef SUPPORT_UNICODE
7437 if (utf && HAS_EXTRALEN(tempcode[-1]))
7438 tempcode += GET_EXTRALEN(tempcode[-1]);
7439 #endif
7440 break;
7441
7442 /* For the class opcodes, the repeat operator appears at the end;
7443 adjust tempcode to point to it. */
7444
7445 case OP_CLASS:
7446 case OP_NCLASS:
7447 tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
7448 break;
7449
7450 #ifdef SUPPORT_WIDE_CHARS
7451 case OP_XCLASS:
7452 tempcode += GET(tempcode, 1);
7453 break;
7454 #endif
7455 }
7456
7457 /* If tempcode is equal to code (which points to the end of the repeated
7458 item), it means we have skipped an EXACT item but there is no following
7459 QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
7460 all other cases, tempcode will be pointing to the repeat opcode, and will
7461 be less than code, so the value of len will be greater than 0. */
7462
7463 len = (int)(code - tempcode);
7464 if (len > 0)
7465 {
7466 unsigned int repcode = *tempcode;
7467
7468 /* There is a table for possessifying opcodes, all of which are less
7469 than OP_CALLOUT. A zero entry means there is no possessified version.
7470 */
7471
7472 if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
7473 *tempcode = opcode_possessify[repcode];
7474
7475 /* For opcode without a special possessified version, wrap the item in
7476 ONCE brackets. */
7477
7478 else
7479 {
7480 (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
7481 code += 1 + LINK_SIZE;
7482 len += 1 + LINK_SIZE;
7483 tempcode[0] = OP_ONCE;
7484 *code++ = OP_KET;
7485 PUTINC(code, 0, len);
7486 PUT(tempcode, 1, len);
7487 }
7488 }
7489 }
7490
7491 /* We set the "follows varying string" flag for subsequently encountered
7492 reqcus if it isn't already set and we have just passed a varying length
7493 item. */
7494
7495 END_REPEAT:
7496 cb->req_varyopt |= reqvary;
7497 break;
7498
7499
7500 /* ===================================================================*/
7501 /* Handle a 32-bit data character with a value greater than META_END. */
7502
7503 case META_BIGVALUE:
7504 pptr++;
7505 goto NORMAL_CHAR;
7506
7507
7508 /* ===============================================================*/
7509 /* Handle a back reference by number, which is the meta argument. The
7510 pattern offsets for back references to group numbers less than 10 are held
7511 in a special vector, to avoid using more than two parsed pattern elements
7512 in 64-bit environments. We only need the offset to the first occurrence,
7513 because if that doesn't fail, subsequent ones will also be OK. */
7514
7515 case META_BACKREF:
7516 if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
7517 else GETPLUSOFFSET(offset, pptr);
7518
7519 if (meta_arg > cb->bracount)
7520 {
7521 cb->erroroffset = offset;
7522 *errorcodeptr = ERR15; /* Non-existent subpattern */
7523 return 0;
7524 }
7525
7526 /* Come here from named backref handling when the reference is to a
7527 single group (that is, not to a duplicated name). The back reference
7528 data will have already been updated. We must disable firstcu if not
7529 set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
7530 later. */
7531
7532 HANDLE_SINGLE_REFERENCE:
7533 if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
7534 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
7535 PUT2INC(code, 0, meta_arg);
7536
7537 /* Update the map of back references, and keep the highest one. We
7538 could do this in parse_regex() for numerical back references, but not
7539 for named back references, because we don't know the numbers to which
7540 named back references refer. So we do it all in this function. */
7541
7542 cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
7543 if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
7544
7545 /* Check to see if this back reference is recursive, that is, it
7546 is inside the group that it references. A flag is set so that the
7547 group can be made atomic. */
7548
7549 for (oc = cb->open_caps; oc != NULL; oc = oc->next)
7550 {
7551 if (oc->number == meta_arg)
7552 {
7553 oc->flag = TRUE;
7554 break;
7555 }
7556 }
7557 break;
7558
7559
7560 /* ===============================================================*/
7561 /* Handle recursion by inserting the number of the called group (which is
7562 the meta argument) after OP_RECURSE. At the end of compiling the pattern is
7563 scanned and these numbers are replaced by offsets within the pattern. It is
7564 done like this to avoid problems with forward references and adjusting
7565 offsets when groups are duplicated and moved (as discovered in previous
7566 implementations). Note that a recursion does not have a set first
7567 character. */
7568
7569 case META_RECURSE:
7570 GETPLUSOFFSET(offset, pptr);
7571 if (meta_arg > cb->bracount)
7572 {
7573 cb->erroroffset = offset;
7574 *errorcodeptr = ERR15; /* Non-existent subpattern */
7575 return 0;
7576 }
7577 HANDLE_NUMERICAL_RECURSION:
7578 *code = OP_RECURSE;
7579 PUT(code, 1, meta_arg);
7580 code += 1 + LINK_SIZE;
7581 groupsetfirstcu = FALSE;
7582 cb->had_recurse = TRUE;
7583 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7584 zerofirstcu = firstcu;
7585 zerofirstcuflags = firstcuflags;
7586 break;
7587
7588
7589 /* ===============================================================*/
7590 /* Handle capturing parentheses; the number is the meta argument. */
7591
7592 case META_CAPTURE:
7593 bravalue = OP_CBRA;
7594 skipunits = IMM2_SIZE;
7595 PUT2(code, 1+LINK_SIZE, meta_arg);
7596 cb->lastcapture = meta_arg;
7597 goto GROUP_PROCESS_NOTE_EMPTY;
7598
7599
7600 /* ===============================================================*/
7601 /* Handle escape sequence items. For ones like \d, the ESC_values are
7602 arranged to be the same as the corresponding OP_values in the default case
7603 when PCRE2_UCP is not set (which is the only case in which they will appear
7604 here).
7605
7606 Note: \Q and \E are never seen here, as they were dealt with in
7607 parse_pattern(). Neither are numerical back references or recursions, which
7608 were turned into META_BACKREF or META_RECURSE items, respectively. \k and
7609 \g, when followed by names, are turned into META_BACKREF_BYNAME or
7610 META_RECURSE_BYNAME. */
7611
7612 case META_ESCAPE:
7613
7614 /* We can test for escape sequences that consume a character because their
7615 values lie between ESC_b and ESC_Z; this may have to change if any new ones
7616 are ever created. For these sequences, we disable the setting of a first
7617 character if it hasn't already been set. */
7618
7619 if (meta_arg > ESC_b && meta_arg < ESC_Z)
7620 {
7621 matched_char = TRUE;
7622 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7623 }
7624
7625 /* Set values to reset to if this is followed by a zero repeat. */
7626
7627 zerofirstcu = firstcu;
7628 zerofirstcuflags = firstcuflags;
7629 zeroreqcu = reqcu;
7630 zeroreqcuflags = reqcuflags;
7631
7632 /* If Unicode is not supported, \P and \p are not allowed and are
7633 faulted at parse time, so will never appear here. */
7634
7635 #ifdef SUPPORT_UNICODE
7636 if (meta_arg == ESC_P || meta_arg == ESC_p)
7637 {
7638 uint32_t ptype = *(++pptr) >> 16;
7639 uint32_t pdata = *pptr & 0xffff;
7640
7641 /* The special case of \p{Any} is compiled to OP_ALLANY so as to benefit
7642 from the auto-anchoring code. */
7643
7644 if (meta_arg == ESC_p && ptype == PT_ANY)
7645 {
7646 *code++ = OP_ALLANY;
7647 }
7648 else
7649 {
7650 *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
7651 *code++ = ptype;
7652 *code++ = pdata;
7653 }
7654 break; /* End META_ESCAPE */
7655 }
7656 #endif
7657
7658 /* For the rest (including \X when Unicode is supported - if not it's
7659 faulted at parse time), the OP value is the escape value when PCRE2_UCP is
7660 not set; if it is set, these escapes do not show up here because they are
7661 converted into Unicode property tests in parse_regex(). Note that \b and \B
7662 do a one-character lookbehind, and \A also behaves as if it does. */
7663
7664 if (meta_arg == ESC_C) cb->external_flags |= PCRE2_HASBKC; /* Record */
7665 if ((meta_arg == ESC_b || meta_arg == ESC_B || meta_arg == ESC_A) &&
7666 cb->max_lookbehind == 0)
7667 cb->max_lookbehind = 1;
7668
7669 /* In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
7670 instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds. */
7671
7672 #if PCRE2_CODE_UNIT_WIDTH == 32
7673 *code++ = (meta_arg == ESC_C)? OP_ALLANY : meta_arg;
7674 #else
7675 *code++ = (!utf && meta_arg == ESC_C)? OP_ALLANY : meta_arg;
7676 #endif
7677 break; /* End META_ESCAPE */
7678
7679
7680 /* ===================================================================*/
7681 /* Handle an unrecognized meta value. A parsed pattern value less than
7682 META_END is a literal. Otherwise we have a problem. */
7683
7684 default:
7685 if (meta >= META_END)
7686 {
7687 #ifdef DEBUG_SHOW_PARSED
7688 fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x\n", *pptr);
7689 #endif
7690 *errorcodeptr = ERR89; /* Internal error - unrecognized. */
7691 return 0;
7692 }
7693
7694 /* Handle a literal character. We come here by goto in the case of a
7695 32-bit, non-UTF character whose value is greater than META_END. */
7696
7697 NORMAL_CHAR:
7698 meta = *pptr; /* Get the full 32 bits */
7699 NORMAL_CHAR_SET: /* Character is already in meta */
7700 matched_char = TRUE;
7701
7702 /* For caseless UTF mode, check whether this character has more than one
7703 other case. If so, generate a special OP_PROP item instead of OP_CHARI. */
7704
7705 #ifdef SUPPORT_UNICODE
7706 if (utf && (options & PCRE2_CASELESS) != 0)
7707 {
7708 uint32_t caseset = UCD_CASESET(meta);
7709 if (caseset != 0)
7710 {
7711 *code++ = OP_PROP;
7712 *code++ = PT_CLIST;
7713 *code++ = caseset;
7714 if (firstcuflags == REQ_UNSET)
7715 firstcuflags = zerofirstcuflags = REQ_NONE;
7716 break; /* End handling this meta item */
7717 }
7718 }
7719 #endif
7720
7721 /* Caseful matches, or not one of the multicase characters. Get the
7722 character's code units into mcbuffer, with the length in mclength. When not
7723 in UTF mode, the length is always 1. */
7724
7725 #ifdef SUPPORT_UNICODE
7726 if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
7727 #endif
7728 {
7729 mclength = 1;
7730 mcbuffer[0] = meta;
7731 }
7732
7733 /* Generate the appropriate code */
7734
7735 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
7736 memcpy(code, mcbuffer, CU2BYTES(mclength));
7737 code += mclength;
7738
7739 /* Remember if \r or \n were seen */
7740
7741 if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
7742 cb->external_flags |= PCRE2_HASCRORLF;
7743
7744 /* Set the first and required code units appropriately. If no previous
7745 first code unit, set it from this character, but revert to none on a zero
7746 repeat. Otherwise, leave the firstcu value alone, and don't change it on
7747 a zero repeat. */
7748
7749 if (firstcuflags == REQ_UNSET)
7750 {
7751 zerofirstcuflags = REQ_NONE;
7752 zeroreqcu = reqcu;
7753 zeroreqcuflags = reqcuflags;
7754
7755 /* If the character is more than one code unit long, we can set firstcu
7756 only if it is not to be matched caselessly. */
7757
7758 if (mclength == 1 || req_caseopt == 0)
7759 {
7760 firstcu = mcbuffer[0];
7761 firstcuflags = req_caseopt;
7762 if (mclength != 1)
7763 {
7764 reqcu = code[-1];
7765 reqcuflags = cb->req_varyopt;
7766 }
7767 }
7768 else firstcuflags = reqcuflags = REQ_NONE;
7769 }
7770
7771 /* firstcu was previously set; we can set reqcu only if the length is
7772 1 or the matching is caseful. */
7773
7774 else
7775 {
7776 zerofirstcu = firstcu;
7777 zerofirstcuflags = firstcuflags;
7778 zeroreqcu = reqcu;
7779 zeroreqcuflags = reqcuflags;
7780 if (mclength == 1 || req_caseopt == 0)
7781 {
7782 reqcu = code[-1];
7783 reqcuflags = req_caseopt | cb->req_varyopt;
7784 }
7785 }
7786 break; /* End default meta handling */
7787 } /* End of big switch */
7788 } /* End of big loop */
7789
7790 /* Control never reaches here. */
7791 }
7792
7793
7794
7795 /*************************************************
7796 * Compile regex: a sequence of alternatives *
7797 *************************************************/
7798
7799 /* On entry, pptr is pointing past the bracket meta, but on return it points to
7800 the closing bracket or META_END. The code variable is pointing at the code unit
7801 into which the BRA operator has been stored. This function is used during the
7802 pre-compile phase when we are trying to find out the amount of memory needed,
7803 as well as during the real compile phase. The value of lengthptr distinguishes
7804 the two phases.
7805
7806 Arguments:
7807 options option bits, including any changes for this subpattern
7808 codeptr -> the address of the current code pointer
7809 pptrptr -> the address of the current parsed pattern pointer
7810 errorcodeptr -> pointer to error code variable
7811 skipunits skip this many code units at start (for brackets and OP_COND)
7812 firstcuptr place to put the first required code unit
7813 firstcuflagsptr place to put the first code unit flags, or a negative number
7814 reqcuptr place to put the last required code unit
7815 reqcuflagsptr place to put the last required code unit flags, or a negative number
7816 bcptr pointer to the chain of currently open branches
7817 cb points to the data block with tables pointers etc.
7818 lengthptr NULL during the real compile phase
7819 points to length accumulator during pre-compile phase
7820
7821 Returns: 0 There has been an error
7822 +1 Success, this group must match at least one character
7823 -1 Success, this group may match an empty string
7824 */
7825
static int
compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
  int *errorcodeptr, uint32_t skipunits, uint32_t *firstcuptr,
  int32_t *firstcuflagsptr, uint32_t *reqcuptr,int32_t *reqcuflagsptr,
  branch_chain *bcptr, compile_block *cb, PCRE2_SIZE *lengthptr)
{
PCRE2_UCHAR *code = *codeptr;
PCRE2_UCHAR *last_branch = code;    /* Start of the most recent BRA/ALT item */
PCRE2_UCHAR *start_bracket = code;  /* The opening bracket, for the KET offset */
BOOL lookbehind;
open_capitem capitem;
int capnumber = 0;                  /* Stays zero unless this group is OP_CBRA */
int okreturn = 1;                   /* Becomes -1 if any branch may match empty */
uint32_t *pptr = *pptrptr;
uint32_t firstcu, reqcu;
uint32_t lookbehindlength;
int32_t firstcuflags, reqcuflags;
uint32_t branchfirstcu, branchreqcu;
int32_t branchfirstcuflags, branchreqcuflags;
PCRE2_SIZE length;
branch_chain bc;

/* If set, call the external function that checks for stack availability. A
non-zero result from the guard is reported as ERR33. */

if (cb->cx->stack_guard != NULL &&
    cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
  {
  *errorcodeptr= ERR33;
  return 0;
  }

/* Miscellaneous initialization. The local branch_chain item is linked to the
caller's chain and is what gets passed down to compile_branch(). */

bc.outer = bcptr;
bc.current_branch = code;

firstcu = reqcu = 0;
firstcuflags = reqcuflags = REQ_UNSET;

/* Accumulate the length for use in the pre-compile phase. Start with the
length of the BRA and KET and any extra code units that are required at the
beginning. We accumulate in a local variable to save frequent testing of
lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
start and end of each alternative, because compiled items are discarded during
the pre-compile phase so that the workspace is not exceeded. */

length = 2 + 2*LINK_SIZE + skipunits;

/* Remember if this is a lookbehind assertion, and if it is, save its length
and skip over the pattern offset. The length is taken from the data part of
the parsed item that immediately precedes pptr. */

lookbehind = *code == OP_ASSERTBACK || *code == OP_ASSERTBACK_NOT;
if (lookbehind)
  {
  lookbehindlength = META_DATA(pptr[-1]);
  pptr += SIZEOFFSET;
  }
else lookbehindlength = 0;

/* If this is a capturing subpattern, add to the chain of open capturing items
so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
need be tested here; changing this opcode to one of its variants, e.g.
OP_SCBRAPOS, happens later, after the group has been compiled. The chain item
lives in this function's stack frame and is unlinked again before a
successful return. */

if (*code == OP_CBRA)
  {
  capnumber = GET2(code, 1 + LINK_SIZE);
  capitem.number = capnumber;
  capitem.next = cb->open_caps;
  capitem.flag = FALSE;
  capitem.assert_depth = cb->assert_depth;
  cb->open_caps = &capitem;
  }

/* Offset is set zero to mark that this bracket is still open */

PUT(code, 1, 0);
code += 1 + LINK_SIZE + skipunits;

/* Loop for each alternative branch */

for (;;)
  {
  int branch_return;

  /* Insert OP_REVERSE if this is a lookbehind assertion. Nothing is inserted
  when the recorded length for this branch is zero. */

  if (lookbehind && lookbehindlength > 0)
    {
    *code++ = OP_REVERSE;
    PUTINC(code, 0, lookbehindlength);
    length += 1 + LINK_SIZE;
    }

  /* Now compile the branch; in the pre-compile phase its length gets added
  into the length. A zero return means an error, which is passed straight
  back to the caller. */

  if ((branch_return =
        compile_branch(&options, &code, &pptr, errorcodeptr, &branchfirstcu,
          &branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc,
          cb, (lengthptr == NULL)? NULL : &length)) == 0)
    return 0;

  /* If a branch can match an empty string, so can the whole group. */

  if (branch_return < 0) okreturn = -1;

  /* In the real compile phase, there is some post-processing to be done. */

  if (lengthptr == NULL)
    {
    /* If this is the first branch, the firstcu and reqcu values for the
    branch become the values for the regex. The test works because
    last_branch still points at the opening bracket until an OP_ALT has been
    inserted. */

    if (*last_branch != OP_ALT)
      {
      firstcu = branchfirstcu;
      firstcuflags = branchfirstcuflags;
      reqcu = branchreqcu;
      reqcuflags = branchreqcuflags;
      }

    /* If this is not the first branch, the first char and reqcu have to
    match the values from all the previous branches, except that if the
    previous value for reqcu didn't have REQ_VARY set, it can still match,
    and we set REQ_VARY for the regex. */

    else
      {
      /* If we previously had a firstcu, but it doesn't match the new branch,
      we have to abandon the firstcu for the regex, but if there was
      previously no reqcu, it takes on the value of the old firstcu. */

      if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
        {
        if (firstcuflags >= 0)
          {
          if (reqcuflags < 0)
            {
            reqcu = firstcu;
            reqcuflags = firstcuflags;
            }
          }
        firstcuflags = REQ_NONE;
        }

      /* If we (now or from before) have no firstcu, a firstcu from the
      branch becomes a reqcu if there isn't a branch reqcu. */

      if (firstcuflags < 0 && branchfirstcuflags >= 0 &&
          branchreqcuflags < 0)
        {
        branchreqcu = branchfirstcu;
        branchreqcuflags = branchfirstcuflags;
        }

      /* Now ensure that the reqcus match */

      if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
          reqcu != branchreqcu)
        reqcuflags = REQ_NONE;
      else
        {
        reqcu = branchreqcu;
        reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY */
        }
      }
    }

  /* Handle reaching the end of the expression, either ')' or end of pattern.
  In the real compile phase, go back through the alternative branches and
  reverse the chain of offsets, with the field in the BRA item now becoming an
  offset to the first alternative. If there are no alternatives, it points to
  the end of the group. The length in the terminating ket is always the length
  of the whole bracketed item. Return leaving the pointer at the terminating
  char. */

  if (META_CODE(*pptr) != META_ALT)
    {
    if (lengthptr == NULL)
      {
      /* Each item's link field currently holds the distance back to the
      previous branch (zero in the opening bracket). Walk backwards,
      replacing each with the forward distance to the next item; the zero
      read from the opening bracket ends the loop. */

      PCRE2_SIZE branch_length = code - last_branch;
      do
        {
        PCRE2_SIZE prev_length = GET(last_branch, 1);
        PUT(last_branch, 1, branch_length);
        branch_length = prev_length;
        last_branch -= branch_length;
        }
      while (branch_length > 0);
      }

    /* Fill in the ket */

    *code = OP_KET;
    PUT(code, 1, (int)(code - start_bracket));
    code += 1 + LINK_SIZE;

    /* If it was a capturing subpattern, check to see if it contained any
    recursive back references. If so, we must wrap it in atomic brackets. In
    any event, remove the block from the chain. */

    if (capnumber > 0)
      {
      if (cb->open_caps->flag)
        {
        /* Shift the whole compiled group up by 1 + LINK_SIZE code units to
        make room for OP_ONCE and its link, then add a second OP_KET after
        the moved group. The extra units are added to the length total. */

        (void)memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
          CU2BYTES(code - start_bracket));
        *start_bracket = OP_ONCE;
        code += 1 + LINK_SIZE;
        PUT(start_bracket, 1, (int)(code - start_bracket));
        *code = OP_KET;
        PUT(code, 1, (int)(code - start_bracket));
        code += 1 + LINK_SIZE;
        length += 2 + 2*LINK_SIZE;
        }
      cb->open_caps = cb->open_caps->next;
      }

    /* Set values to pass back */

    *codeptr = code;
    *pptrptr = pptr;
    *firstcuptr = firstcu;
    *firstcuflagsptr = firstcuflags;
    *reqcuptr = reqcu;
    *reqcuflagsptr = reqcuflags;
    if (lengthptr != NULL)
      {
      /* Fail with ERR20 rather than let the accumulated length exceed
      OFLOW_MAX. */

      if (OFLOW_MAX - *lengthptr < length)
        {
        *errorcodeptr = ERR20;
        return 0;
        }
      *lengthptr += length;
      }
    return okreturn;
    }

  /* Another branch follows. In the pre-compile phase, we can move the code
  pointer back to where it was for the start of the first branch. (That is,
  pretend that each branch is the only one.)

  In the real compile phase, insert an ALT node. Its length field points back
  to the previous branch while the bracket remains open. At the end the chain
  is reversed. It's done like this so that the start of the bracket has a
  zero offset until it is closed, making it possible to detect recursion. */

  if (lengthptr != NULL)
    {
    code = *codeptr + 1 + LINK_SIZE + skipunits;
    length += 1 + LINK_SIZE;
    }
  else
    {
    *code = OP_ALT;
    PUT(code, 1, (int)(code - last_branch));
    bc.current_branch = last_branch = code;
    code += 1 + LINK_SIZE;
    }

  /* Set the lookbehind length (if not in a lookbehind the value will be zero)
  and then advance past the vertical bar. */

  lookbehindlength = META_DATA(*pptr);
  pptr++;
  }
/* Control never reaches here */
}
8095
8096
8097
8098 /*************************************************
8099 * Check for anchored pattern *
8100 *************************************************/
8101
8102 /* Try to find out if this is an anchored regular expression. Consider each
8103 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8104 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8105 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8106 be found, because ^ generates OP_CIRCM in that mode.
8107
8108 We can also consider a regex to be anchored if OP_SOM starts all its branches.
8109 This is the code for \G, which means "match at start of match position, taking
8110 into account the match offset".
8111
8112 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8113 because that will try the rest of the pattern at all possible matching points,
8114 so there is no point trying again.... er ....
8115
8116 .... except when the .* appears inside capturing parentheses, and there is a
8117 subsequent back reference to those parentheses. We haven't enough information
8118 to catch that case precisely.
8119
8120 At first, the best we could do was to detect when .* was in capturing brackets
8121 and the highest back reference was greater than or equal to that level.
8122 However, by keeping a bitmap of the first 31 back references, we can catch some
8123 of the more common cases more precisely.
8124
8125 ... A second exception is when the .* appears inside an atomic group, because
8126 this prevents the number of characters it matches from being adjusted.
8127
8128 Arguments:
8129 code points to start of the compiled pattern
8130 bracket_map a bitmap of which brackets we are inside while testing; this
8131 handles up to substring 31; after that we just have to take
8132 the less precise approach
8133 cb points to the compile data block
8134 atomcount atomic group level
8135 inassert TRUE if in an assertion
8136
8137 Returns: TRUE or FALSE
8138 */
8139
8140 static BOOL
is_anchored(PCRE2_SPTR code,unsigned int bracket_map,compile_block * cb,int atomcount,BOOL inassert)8141 is_anchored(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8142 int atomcount, BOOL inassert)
8143 {
8144 do {
8145 PCRE2_SPTR scode = first_significant_code(
8146 code + PRIV(OP_lengths)[*code], FALSE);
8147 int op = *scode;
8148
8149 /* Non-capturing brackets */
8150
8151 if (op == OP_BRA || op == OP_BRAPOS ||
8152 op == OP_SBRA || op == OP_SBRAPOS)
8153 {
8154 if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8155 return FALSE;
8156 }
8157
8158 /* Capturing brackets */
8159
8160 else if (op == OP_CBRA || op == OP_CBRAPOS ||
8161 op == OP_SCBRA || op == OP_SCBRAPOS)
8162 {
8163 int n = GET2(scode, 1+LINK_SIZE);
8164 int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8165 if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
8166 }
8167
8168 /* Positive forward assertion */
8169
8170 else if (op == OP_ASSERT)
8171 {
8172 if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8173 }
8174
8175 /* Condition. If there is no second branch, it can't be anchored. */
8176
8177 else if (op == OP_COND || op == OP_SCOND)
8178 {
8179 if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8180 if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8181 return FALSE;
8182 }
8183
8184 /* Atomic groups */
8185
8186 else if (op == OP_ONCE)
8187 {
8188 if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert))
8189 return FALSE;
8190 }
8191
8192 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8193 it isn't in brackets that are or may be referenced or inside an atomic
8194 group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
8195 because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
8196 with the subject "aab", which matches "b", i.e. not at the start of a line.
8197 There is also an option that disables auto-anchoring. */
8198
8199 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8200 op == OP_TYPEPOSSTAR))
8201 {
8202 if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
8203 atomcount > 0 || cb->had_pruneorskip || inassert ||
8204 (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8205 return FALSE;
8206 }
8207
8208 /* Check for explicit anchoring */
8209
8210 else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8211
8212 code += GET(code, 1);
8213 }
8214 while (*code == OP_ALT); /* Loop for each alternative */
8215 return TRUE;
8216 }
8217
8218
8219
8220 /*************************************************
8221 * Check for starting with ^ or .* *
8222 *************************************************/
8223
8224 /* This is called to find out if every branch starts with ^ or .* so that
8225 "first char" processing can be done to speed things up in multiline
8226 matching and for non-DOTALL patterns that start with .* (which must start at
8227 the beginning or after \n). As in the case of is_anchored() (see above), we
8228 have to take account of back references to capturing brackets that contain .*
8229 because in that case we can't make the assumption. Also, the appearance of .*
8230 inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8231 or *SKIP does not count, because once again the assumption no longer holds.
8232
8233 Arguments:
8234 code points to start of the compiled pattern or a group
8235 bracket_map a bitmap of which brackets we are inside while testing; this
8236 handles up to substring 31; after that we just have to take
8237 the less precise approach
8238 cb points to the compile data
8239 atomcount atomic group level
8240 inassert TRUE if in an assertion
8241
8242 Returns: TRUE or FALSE
8243 */
8244
8245 static BOOL
is_startline(PCRE2_SPTR code,unsigned int bracket_map,compile_block * cb,int atomcount,BOOL inassert)8246 is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8247 int atomcount, BOOL inassert)
8248 {
8249 do {
8250 PCRE2_SPTR scode = first_significant_code(
8251 code + PRIV(OP_lengths)[*code], FALSE);
8252 int op = *scode;
8253
8254 /* If we are at the start of a conditional assertion group, *both* the
8255 conditional assertion *and* what follows the condition must satisfy the test
8256 for start of line. Other kinds of condition fail. Note that there may be an
8257 auto-callout at the start of a condition. */
8258
8259 if (op == OP_COND)
8260 {
8261 scode += 1 + LINK_SIZE;
8262
8263 if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8264 else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
8265
8266 switch (*scode)
8267 {
8268 case OP_CREF:
8269 case OP_DNCREF:
8270 case OP_RREF:
8271 case OP_DNRREF:
8272 case OP_FAIL:
8273 case OP_FALSE:
8274 case OP_TRUE:
8275 return FALSE;
8276
8277 default: /* Assertion */
8278 if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8279 do scode += GET(scode, 1); while (*scode == OP_ALT);
8280 scode += 1 + LINK_SIZE;
8281 break;
8282 }
8283 scode = first_significant_code(scode, FALSE);
8284 op = *scode;
8285 }
8286
8287 /* Non-capturing brackets */
8288
8289 if (op == OP_BRA || op == OP_BRAPOS ||
8290 op == OP_SBRA || op == OP_SBRAPOS)
8291 {
8292 if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
8293 return FALSE;
8294 }
8295
8296 /* Capturing brackets */
8297
8298 else if (op == OP_CBRA || op == OP_CBRAPOS ||
8299 op == OP_SCBRA || op == OP_SCBRAPOS)
8300 {
8301 int n = GET2(scode, 1+LINK_SIZE);
8302 int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8303 if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
8304 }
8305
8306 /* Positive forward assertions */
8307
8308 else if (op == OP_ASSERT)
8309 {
8310 if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
8311 return FALSE;
8312 }
8313
8314 /* Atomic brackets */
8315
8316 else if (op == OP_ONCE)
8317 {
8318 if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
8319 return FALSE;
8320 }
8321
8322 /* .* means "start at start or after \n" if it isn't in atomic brackets or
8323 brackets that may be referenced or an assertion, and as long as the pattern
8324 does not contain *PRUNE or *SKIP, because these break the feature. Consider,
8325 for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
8326 i.e. not at the start of a line. There is also an option that disables this
8327 optimization. */
8328
8329 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8330 {
8331 if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
8332 atomcount > 0 || cb->had_pruneorskip || inassert ||
8333 (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8334 return FALSE;
8335 }
8336
8337 /* Check for explicit circumflex; anything else gives a FALSE result. Note
8338 in particular that this includes atomic brackets OP_ONCE because the number
8339 of characters matched by .* cannot be adjusted inside them. */
8340
8341 else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8342
8343 /* Move on to the next alternative */
8344
8345 code += GET(code, 1);
8346 }
8347 while (*code == OP_ALT); /* Loop for each alternative */
8348 return TRUE;
8349 }
8350
8351
8352
8353 /*************************************************
8354 * Scan compiled regex for recursion reference *
8355 *************************************************/
8356
8357 /* This function scans through a compiled pattern until it finds an instance of
8358 OP_RECURSE.
8359
8360 Arguments:
8361 code points to start of expression
8362 utf TRUE in UTF mode
8363
8364 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
8365 */
8366
8367 static PCRE2_SPTR
find_recurse(PCRE2_SPTR code,BOOL utf)8368 find_recurse(PCRE2_SPTR code, BOOL utf)
8369 {
8370 for (;;)
8371 {
8372 PCRE2_UCHAR c = *code;
8373 if (c == OP_END) return NULL;
8374 if (c == OP_RECURSE) return code;
8375
8376 /* XCLASS is used for classes that cannot be represented just by a bit map.
8377 This includes negated single high-valued characters. CALLOUT_STR is used for
8378 callouts with string arguments. In both cases the length in the table is
8379 zero; the actual length is stored in the compiled code. */
8380
8381 if (c == OP_XCLASS) code += GET(code, 1);
8382 else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
8383
8384 /* Otherwise, we can get the item's length from the table, except that for
8385 repeated character types, we have to test for \p and \P, which have an extra
8386 two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
8387 we must add in its length. */
8388
8389 else
8390 {
8391 switch(c)
8392 {
8393 case OP_TYPESTAR:
8394 case OP_TYPEMINSTAR:
8395 case OP_TYPEPLUS:
8396 case OP_TYPEMINPLUS:
8397 case OP_TYPEQUERY:
8398 case OP_TYPEMINQUERY:
8399 case OP_TYPEPOSSTAR:
8400 case OP_TYPEPOSPLUS:
8401 case OP_TYPEPOSQUERY:
8402 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
8403 break;
8404
8405 case OP_TYPEPOSUPTO:
8406 case OP_TYPEUPTO:
8407 case OP_TYPEMINUPTO:
8408 case OP_TYPEEXACT:
8409 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
8410 code += 2;
8411 break;
8412
8413 case OP_MARK:
8414 case OP_COMMIT_ARG:
8415 case OP_PRUNE_ARG:
8416 case OP_SKIP_ARG:
8417 case OP_THEN_ARG:
8418 code += code[1];
8419 break;
8420 }
8421
8422 /* Add in the fixed length from the table */
8423
8424 code += PRIV(OP_lengths)[c];
8425
8426 /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
8427 be followed by a multi-unit character. The length in the table is a
8428 minimum, so we have to arrange to skip the extra units. */
8429
8430 #ifdef MAYBE_UTF_MULTI
8431 if (utf) switch(c)
8432 {
8433 case OP_CHAR:
8434 case OP_CHARI:
8435 case OP_NOT:
8436 case OP_NOTI:
8437 case OP_EXACT:
8438 case OP_EXACTI:
8439 case OP_NOTEXACT:
8440 case OP_NOTEXACTI:
8441 case OP_UPTO:
8442 case OP_UPTOI:
8443 case OP_NOTUPTO:
8444 case OP_NOTUPTOI:
8445 case OP_MINUPTO:
8446 case OP_MINUPTOI:
8447 case OP_NOTMINUPTO:
8448 case OP_NOTMINUPTOI:
8449 case OP_POSUPTO:
8450 case OP_POSUPTOI:
8451 case OP_NOTPOSUPTO:
8452 case OP_NOTPOSUPTOI:
8453 case OP_STAR:
8454 case OP_STARI:
8455 case OP_NOTSTAR:
8456 case OP_NOTSTARI:
8457 case OP_MINSTAR:
8458 case OP_MINSTARI:
8459 case OP_NOTMINSTAR:
8460 case OP_NOTMINSTARI:
8461 case OP_POSSTAR:
8462 case OP_POSSTARI:
8463 case OP_NOTPOSSTAR:
8464 case OP_NOTPOSSTARI:
8465 case OP_PLUS:
8466 case OP_PLUSI:
8467 case OP_NOTPLUS:
8468 case OP_NOTPLUSI:
8469 case OP_MINPLUS:
8470 case OP_MINPLUSI:
8471 case OP_NOTMINPLUS:
8472 case OP_NOTMINPLUSI:
8473 case OP_POSPLUS:
8474 case OP_POSPLUSI:
8475 case OP_NOTPOSPLUS:
8476 case OP_NOTPOSPLUSI:
8477 case OP_QUERY:
8478 case OP_QUERYI:
8479 case OP_NOTQUERY:
8480 case OP_NOTQUERYI:
8481 case OP_MINQUERY:
8482 case OP_MINQUERYI:
8483 case OP_NOTMINQUERY:
8484 case OP_NOTMINQUERYI:
8485 case OP_POSQUERY:
8486 case OP_POSQUERYI:
8487 case OP_NOTPOSQUERY:
8488 case OP_NOTPOSQUERYI:
8489 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
8490 break;
8491 }
8492 #else
8493 (void)(utf); /* Keep compiler happy by referencing function argument */
8494 #endif /* MAYBE_UTF_MULTI */
8495 }
8496 }
8497 }
8498
8499
8500
8501 /*************************************************
8502 * Check for asserted fixed first code unit *
8503 *************************************************/
8504
8505 /* During compilation, the "first code unit" settings from forward assertions
8506 are discarded, because they can cause conflicts with actual literals that
8507 follow. However, if we end up without a first code unit setting for an
8508 unanchored pattern, it is worth scanning the regex to see if there is an
8509 initial asserted first code unit. If all branches start with the same asserted
8510 code unit, or with a non-conditional bracket all of whose alternatives start
8511 with the same asserted code unit (recurse ad lib), then we return that code
8512 unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
8513 REQ_NONE in the flags.
8514
8515 Arguments:
8516 code points to start of compiled pattern
8517 flags points to the first code unit flags
8518 inassert non-zero if in an assertion
8519
8520 Returns: the fixed first code unit, or 0 with REQ_NONE in flags
8521 */
8522
8523 static uint32_t
find_firstassertedcu(PCRE2_SPTR code,int32_t * flags,uint32_t inassert)8524 find_firstassertedcu(PCRE2_SPTR code, int32_t *flags, uint32_t inassert)
8525 {
8526 uint32_t c = 0;
8527 int cflags = REQ_NONE;
8528
8529 *flags = REQ_NONE;
8530 do {
8531 uint32_t d;
8532 int dflags;
8533 int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
8534 *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
8535 PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
8536 PCRE2_UCHAR op = *scode;
8537
8538 switch(op)
8539 {
8540 default:
8541 return 0;
8542
8543 case OP_BRA:
8544 case OP_BRAPOS:
8545 case OP_CBRA:
8546 case OP_SCBRA:
8547 case OP_CBRAPOS:
8548 case OP_SCBRAPOS:
8549 case OP_ASSERT:
8550 case OP_ONCE:
8551 case OP_SCRIPT_RUN:
8552 d = find_firstassertedcu(scode, &dflags, inassert + ((op==OP_ASSERT)?1:0));
8553 if (dflags < 0)
8554 return 0;
8555 if (cflags < 0) { c = d; cflags = dflags; }
8556 else if (c != d || cflags != dflags) return 0;
8557 break;
8558
8559 case OP_EXACT:
8560 scode += IMM2_SIZE;
8561 /* Fall through */
8562
8563 case OP_CHAR:
8564 case OP_PLUS:
8565 case OP_MINPLUS:
8566 case OP_POSPLUS:
8567 if (inassert == 0) return 0;
8568 if (cflags < 0) { c = scode[1]; cflags = 0; }
8569 else if (c != scode[1]) return 0;
8570 break;
8571
8572 case OP_EXACTI:
8573 scode += IMM2_SIZE;
8574 /* Fall through */
8575
8576 case OP_CHARI:
8577 case OP_PLUSI:
8578 case OP_MINPLUSI:
8579 case OP_POSPLUSI:
8580 if (inassert == 0) return 0;
8581 if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
8582 else if (c != scode[1]) return 0;
8583 break;
8584 }
8585
8586 code += GET(code, 1);
8587 }
8588 while (*code == OP_ALT);
8589
8590 *flags = cflags;
8591 return c;
8592 }
8593
8594
8595
8596 /*************************************************
8597 * Add an entry to the name/number table *
8598 *************************************************/
8599
8600 /* This function is called between compiling passes to add an entry to the
8601 name/number table, maintaining alphabetical order. Checking for permitted
8602 and forbidden duplicates has already been done.
8603
8604 Arguments:
8605 cb the compile data block
8606 name the name to add
8607 length the length of the name
8608 groupno the group number
8609 tablecount the count of names in the table so far
8610
8611 Returns: nothing
8612 */
8613
8614 static void
add_name_to_table(compile_block * cb,PCRE2_SPTR name,int length,unsigned int groupno,uint32_t tablecount)8615 add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
8616 unsigned int groupno, uint32_t tablecount)
8617 {
8618 uint32_t i;
8619 PCRE2_UCHAR *slot = cb->name_table;
8620
8621 for (i = 0; i < tablecount; i++)
8622 {
8623 int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
8624 if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8625 crc = -1; /* Current name is a substring */
8626
8627 /* Make space in the table and break the loop for an earlier name. For a
8628 duplicate or later name, carry on. We do this for duplicates so that in the
8629 simple case (when ?(| is not used) they are in order of their numbers. In all
8630 cases they are in the order in which they appear in the pattern. */
8631
8632 if (crc < 0)
8633 {
8634 (void)memmove(slot + cb->name_entry_size, slot,
8635 CU2BYTES((tablecount - i) * cb->name_entry_size));
8636 break;
8637 }
8638
8639 /* Continue the loop for a later or duplicate name */
8640
8641 slot += cb->name_entry_size;
8642 }
8643
8644 PUT2(slot, 0, groupno);
8645 memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
8646
8647 /* Add a terminating zero and fill the rest of the slot with zeroes so that
8648 the memory is all initialized. Otherwise valgrind moans about uninitialized
8649 memory when saving serialized compiled patterns. */
8650
8651 memset(slot + IMM2_SIZE + length, 0,
8652 CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
8653 }
8654
8655
8656
8657 /*************************************************
8658 * Skip in parsed pattern *
8659 *************************************************/
8660
8661 /* This function is called to skip parts of the parsed pattern when finding the
8662 length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
8663 the end of the branch, it is called to skip over an internal lookaround, and it
8664 is also called to skip to the end of a class, during which it will never
8665 encounter nested groups (but there's no need to have special code for that).
8666
8667 When called to find the end of a branch or group, pptr must point to the first
8668 meta code inside the branch, not the branch-starting code. In other cases it
8669 can point to the item that causes the function to be called.
8670
8671 Arguments:
8672 pptr current pointer to skip from
8673 skiptype PSKIP_CLASS when skipping to end of class
8674 PSKIP_ALT when META_ALT ends the skip
8675 PSKIP_KET when only META_KET ends the skip
8676
8677 Returns: new value of pptr
8678 NULL if META_END is reached - should never occur
8679 or for an unknown meta value - likewise
8680 */
8681
8682 static uint32_t *
parsed_skip(uint32_t * pptr,uint32_t skiptype)8683 parsed_skip(uint32_t *pptr, uint32_t skiptype)
8684 {
8685 uint32_t nestlevel = 0;
8686
8687 for (;; pptr++)
8688 {
8689 uint32_t meta = META_CODE(*pptr);
8690
8691 switch(meta)
8692 {
8693 default: /* Just skip over most items */
8694 if (meta < META_END) continue; /* Literal */
8695 break;
8696
8697 /* This should never occur. */
8698
8699 case META_END:
8700 return NULL;
8701
8702 /* The data for these items is variable in length. */
8703
8704 case META_BACKREF: /* Offset is present only if group >= 10 */
8705 if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
8706 break;
8707
8708 case META_ESCAPE: /* A few escapes are followed by data items. */
8709 switch (META_DATA(*pptr))
8710 {
8711 case ESC_P:
8712 case ESC_p:
8713 pptr += 1;
8714 break;
8715
8716 case ESC_g:
8717 case ESC_k:
8718 pptr += 1 + SIZEOFFSET;
8719 break;
8720 }
8721 break;
8722
8723 case META_MARK: /* Add the length of the name. */
8724 case META_COMMIT_ARG:
8725 case META_PRUNE_ARG:
8726 case META_SKIP_ARG:
8727 case META_THEN_ARG:
8728 pptr += pptr[1];
8729 break;
8730
8731 /* These are the "active" items in this loop. */
8732
8733 case META_CLASS_END:
8734 if (skiptype == PSKIP_CLASS) return pptr;
8735 break;
8736
8737 case META_ATOMIC:
8738 case META_CAPTURE:
8739 case META_COND_ASSERT:
8740 case META_COND_DEFINE:
8741 case META_COND_NAME:
8742 case META_COND_NUMBER:
8743 case META_COND_RNAME:
8744 case META_COND_RNUMBER:
8745 case META_COND_VERSION:
8746 case META_LOOKAHEAD:
8747 case META_LOOKAHEADNOT:
8748 case META_LOOKBEHIND:
8749 case META_LOOKBEHINDNOT:
8750 case META_NOCAPTURE:
8751 case META_SCRIPT_RUN:
8752 nestlevel++;
8753 break;
8754
8755 case META_ALT:
8756 if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
8757 break;
8758
8759 case META_KET:
8760 if (nestlevel == 0) return pptr;
8761 nestlevel--;
8762 break;
8763 }
8764
8765 /* The extra data item length for each meta is in a table. */
8766
8767 meta = (meta >> 16) & 0x7fff;
8768 if (meta >= sizeof(meta_extra_lengths)) return NULL;
8769 pptr += meta_extra_lengths[meta];
8770 }
8771 /* Control never reaches here */
8772 return pptr;
8773 }
8774
8775
8776
8777 /*************************************************
8778 * Find length of a parsed group *
8779 *************************************************/
8780
8781 /* This is called for nested groups within a branch of a lookbehind whose
8782 length is being computed. If all the branches in the nested group have the same
8783 length, that is OK. On entry, the pointer must be at the first element after
8784 the group initializing code. On exit it points to OP_KET. Caching is used to
8785 improve processing speed when the same capturing group occurs many times.
8786
8787 Arguments:
8788 pptrptr pointer to pointer in the parsed pattern
8789 isinline FALSE if a reference or recursion; TRUE for inline group
8790 errcodeptr pointer to the errorcode
8791 lcptr pointer to the loop counter
8792 group number of captured group or -1 for a non-capturing group
8793 recurses chain of recurse_check to catch mutual recursion
8794 cb pointer to the compile data
8795
8796 Returns: the group length or a negative number
8797 */
8798
8799 static int
get_grouplength(uint32_t ** pptrptr,BOOL isinline,int * errcodeptr,int * lcptr,int group,parsed_recurse_check * recurses,compile_block * cb)8800 get_grouplength(uint32_t **pptrptr, BOOL isinline, int *errcodeptr, int *lcptr,
8801 int group, parsed_recurse_check *recurses, compile_block *cb)
8802 {
8803 int branchlength;
8804 int grouplength = -1;
8805
8806 /* The cache can be used only if there is no possibility of there being two
8807 groups with the same number. We do not need to set the end pointer for a group
8808 that is being processed as a back reference or recursion, but we must do so for
8809 an inline group. */
8810
8811 if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
8812 {
8813 uint32_t groupinfo = cb->groupinfo[group];
8814 if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
8815 if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
8816 {
8817 if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
8818 return groupinfo & GI_FIXED_LENGTH_MASK;
8819 }
8820 }
8821
8822 /* Scan the group. In this case we find the end pointer of necessity. */
8823
8824 for(;;)
8825 {
8826 branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
8827 if (branchlength < 0) goto ISNOTFIXED;
8828 if (grouplength == -1) grouplength = branchlength;
8829 else if (grouplength != branchlength) goto ISNOTFIXED;
8830 if (**pptrptr == META_KET) break;
8831 *pptrptr += 1; /* Skip META_ALT */
8832 }
8833
8834 if (group > 0)
8835 cb->groupinfo[group] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
8836 return grouplength;
8837
8838 ISNOTFIXED:
8839 if (group > 0) cb->groupinfo[group] |= GI_NOT_FIXED_LENGTH;
8840 return -1;
8841 }
8842
8843
8844
8845 /*************************************************
8846 * Find length of a parsed branch *
8847 *************************************************/
8848
8849 /* Return a fixed length for a branch in a lookbehind, giving an error if the
8850 length is not fixed. If any lookbehinds are encountered on the way, they get
8851 their length set. On entry, *pptrptr points to the first element inside the
8852 branch. On exit it is set to point to the ALT or KET.
8853
8854 Arguments:
8855 pptrptr pointer to pointer in the parsed pattern
8856 errcodeptr pointer to error code
8857 lcptr pointer to loop counter
8858 recurses chain of recurse_check to catch mutual recursion
8859 cb pointer to compile block
8860
8861 Returns: the length, or a negative value on error
8862 */
8863
8864 static int
get_branchlength(uint32_t ** pptrptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)8865 get_branchlength(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
8866 parsed_recurse_check *recurses, compile_block *cb)
8867 {
8868 int branchlength = 0;
8869 int grouplength;
8870 uint32_t lastitemlength = 0;
8871 uint32_t *pptr = *pptrptr;
8872 PCRE2_SIZE offset;
8873 parsed_recurse_check this_recurse;
8874
8875 /* A large and/or complex regex can take too long to process. This can happen
8876 more often when (?| groups are present in the pattern because their length
8877 cannot be cached. */
8878
8879 if ((*lcptr)++ > 2000)
8880 {
8881 *errcodeptr = ERR35; /* Lookbehind is too complicated */
8882 return -1;
8883 }
8884
8885 /* Scan the branch, accumulating the length. */
8886
8887 for (;; pptr++)
8888 {
8889 parsed_recurse_check *r;
8890 uint32_t *gptr, *gptrend;
8891 uint32_t escape;
8892 uint32_t group = 0;
8893 uint32_t itemlength = 0;
8894
8895 if (*pptr < META_END)
8896 {
8897 itemlength = 1;
8898 }
8899
8900 else switch (META_CODE(*pptr))
8901 {
8902 case META_KET:
8903 case META_ALT:
8904 goto EXIT;
8905
8906 /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
8907 actual termination. */
8908
8909 case META_ACCEPT:
8910 case META_FAIL:
8911 pptr = parsed_skip(pptr, PSKIP_ALT);
8912 if (pptr == NULL) goto PARSED_SKIP_FAILED;
8913 goto EXIT;
8914
8915 case META_MARK:
8916 case META_COMMIT_ARG:
8917 case META_PRUNE_ARG:
8918 case META_SKIP_ARG:
8919 case META_THEN_ARG:
8920 pptr += pptr[1] + 1;
8921 break;
8922
8923 case META_CIRCUMFLEX:
8924 case META_COMMIT:
8925 case META_DOLLAR:
8926 case META_PRUNE:
8927 case META_SKIP:
8928 case META_THEN:
8929 break;
8930
8931 case META_OPTIONS:
8932 pptr += 1;
8933 break;
8934
8935 case META_BIGVALUE:
8936 itemlength = 1;
8937 pptr += 1;
8938 break;
8939
8940 case META_CLASS:
8941 case META_CLASS_NOT:
8942 itemlength = 1;
8943 pptr = parsed_skip(pptr, PSKIP_CLASS);
8944 if (pptr == NULL) goto PARSED_SKIP_FAILED;
8945 break;
8946
8947 case META_CLASS_EMPTY_NOT:
8948 case META_DOT:
8949 itemlength = 1;
8950 break;
8951
8952 case META_CALLOUT_NUMBER:
8953 pptr += 3;
8954 break;
8955
8956 case META_CALLOUT_STRING:
8957 pptr += 3 + SIZEOFFSET;
8958 break;
8959
8960 /* Only some escapes consume a character. Of those, \R and \X are never
8961 allowed because they might match more than character. \C is allowed only in
8962 32-bit and non-UTF 8/16-bit modes. */
8963
8964 case META_ESCAPE:
8965 escape = META_DATA(*pptr);
8966 if (escape == ESC_R || escape == ESC_X) return -1;
8967 if (escape > ESC_b && escape < ESC_Z)
8968 {
8969 #if PCRE2_CODE_UNIT_WIDTH != 32
8970 if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
8971 {
8972 *errcodeptr = ERR36;
8973 return -1;
8974 }
8975 #endif
8976 itemlength = 1;
8977 if (escape == ESC_p || escape == ESC_P) pptr++; /* Skip prop data */
8978 }
8979 break;
8980
8981 /* Lookaheads can be ignored, but we must start the skip inside the group
8982 so that it isn't treated as a group within the branch. */
8983
8984 case META_LOOKAHEAD:
8985 case META_LOOKAHEADNOT:
8986 pptr = parsed_skip(pptr + 1, PSKIP_KET);
8987 if (pptr == NULL) goto PARSED_SKIP_FAILED;
8988
8989 /* Also ignore any qualifiers that follow a lookahead assertion. */
8990
8991 switch (pptr[1])
8992 {
8993 case META_ASTERISK:
8994 case META_ASTERISK_PLUS:
8995 case META_ASTERISK_QUERY:
8996 case META_PLUS:
8997 case META_PLUS_PLUS:
8998 case META_PLUS_QUERY:
8999 case META_QUERY:
9000 case META_QUERY_PLUS:
9001 case META_QUERY_QUERY:
9002 pptr++;
9003 break;
9004
9005 case META_MINMAX:
9006 case META_MINMAX_PLUS:
9007 case META_MINMAX_QUERY:
9008 pptr += 3;
9009 break;
9010
9011 default:
9012 break;
9013 }
9014 break;
9015
9016 /* Lookbehinds can be ignored, but must themselves be checked. */
9017
9018 case META_LOOKBEHIND:
9019 case META_LOOKBEHINDNOT:
9020 if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
9021 return -1;
9022 break;
9023
9024 /* Back references and recursions are handled by very similar code. At this
9025 stage, the names generated in the parsing pass are available, but the main
9026 name table has not yet been created. So for the named varieties, scan the
9027 list of names in order to get the number of the first one in the pattern,
9028 and whether or not this name is duplicated. */
9029
9030 case META_BACKREF_BYNAME:
9031 if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
9032 goto ISNOTFIXED;
9033 /* Fall through */
9034
9035 case META_RECURSE_BYNAME:
9036 {
9037 int i;
9038 PCRE2_SPTR name;
9039 BOOL is_dupname = FALSE;
9040 named_group *ng = cb->named_groups;
9041 uint32_t meta_code = META_CODE(*pptr);
9042 uint32_t length = *(++pptr);
9043
9044 GETPLUSOFFSET(offset, pptr);
9045 name = cb->start_pattern + offset;
9046 for (i = 0; i < cb->names_found; i++, ng++)
9047 {
9048 if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0)
9049 {
9050 group = ng->number;
9051 is_dupname = ng->isdup;
9052 break;
9053 }
9054 }
9055
9056 if (group == 0)
9057 {
9058 *errcodeptr = ERR15; /* Non-existent subpattern */
9059 cb->erroroffset = offset;
9060 return -1;
9061 }
9062
9063 /* A numerical back reference can be fixed length if duplicate capturing
9064 groups are not being used. A non-duplicate named back reference can also
9065 be handled. */
9066
9067 if (meta_code == META_RECURSE_BYNAME ||
9068 (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
9069 goto RECURSE_OR_BACKREF_LENGTH; /* Handle as a numbered version. */
9070 }
9071 goto ISNOTFIXED; /* Duplicate name or number */
9072
9073 /* The offset values for back references < 10 are in a separate vector
9074 because otherwise they would use more than two parsed pattern elements on
9075 64-bit systems. */
9076
9077 case META_BACKREF:
9078 if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
9079 (cb->external_flags & PCRE2_DUPCAPUSED) != 0)
9080 goto ISNOTFIXED;
9081 group = META_DATA(*pptr);
9082 if (group < 10)
9083 {
9084 offset = cb->small_ref_offset[group];
9085 goto RECURSE_OR_BACKREF_LENGTH;
9086 }
9087
9088 /* Fall through */
9089 /* For groups >= 10 - picking up group twice does no harm. */
9090
9091 /* A true recursion implies not fixed length, but a subroutine call may
9092 be OK. Back reference "recursions" are also failed. */
9093
9094 case META_RECURSE:
9095 group = META_DATA(*pptr);
9096 GETPLUSOFFSET(offset, pptr);
9097
9098 RECURSE_OR_BACKREF_LENGTH:
9099 if (group > cb->bracount)
9100 {
9101 cb->erroroffset = offset;
9102 *errcodeptr = ERR15; /* Non-existent subpattern */
9103 return -1;
9104 }
9105 if (group == 0) goto ISNOTFIXED; /* Local recursion */
9106 for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
9107 {
9108 if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
9109 else if (*gptr == (META_CAPTURE | group)) break;
9110 }
9111
9112 /* We must start the search for the end of the group at the first meta code
9113 inside the group. Otherwise it will be treated as an enclosed group. */
9114
9115 gptrend = parsed_skip(gptr + 1, PSKIP_KET);
9116 if (gptrend == NULL) goto PARSED_SKIP_FAILED;
9117 if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED; /* Local recursion */
9118 for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
9119 if (r != NULL) goto ISNOTFIXED; /* Mutual recursion */
9120 this_recurse.prev = recurses;
9121 this_recurse.groupptr = gptr;
9122
9123 /* We do not need to know the position of the end of the group, that is,
9124 gptr is not used after the call to get_grouplength(). Setting the second
9125 argument FALSE stops it scanning for the end when the length can be found
9126 in the cache. */
9127
9128 gptr++;
9129 grouplength = get_grouplength(&gptr, FALSE, errcodeptr, lcptr, group,
9130 &this_recurse, cb);
9131 if (grouplength < 0)
9132 {
9133 if (*errcodeptr == 0) goto ISNOTFIXED;
9134 return -1; /* Error already set */
9135 }
9136 itemlength = grouplength;
9137 break;
9138
9139 /* Check nested groups - advance past the initial data for each type and
9140 then seek a fixed length with get_grouplength(). */
9141
9142 case META_COND_NAME:
9143 case META_COND_NUMBER:
9144 case META_COND_RNAME:
9145 case META_COND_RNUMBER:
9146 case META_COND_DEFINE:
9147 pptr += 2 + SIZEOFFSET;
9148 goto CHECK_GROUP;
9149
9150 case META_COND_ASSERT:
9151 pptr += 1;
9152 goto CHECK_GROUP;
9153
9154 case META_COND_VERSION:
9155 pptr += 4;
9156 goto CHECK_GROUP;
9157
9158 case META_CAPTURE:
9159 group = META_DATA(*pptr);
9160 /* Fall through */
9161
9162 case META_ATOMIC:
9163 case META_NOCAPTURE:
9164 case META_SCRIPT_RUN:
9165 pptr++;
9166 CHECK_GROUP:
9167 grouplength = get_grouplength(&pptr, TRUE, errcodeptr, lcptr, group,
9168 recurses, cb);
9169 if (grouplength < 0) return -1;
9170 itemlength = grouplength;
9171 break;
9172
9173 /* Exact repetition is OK; variable repetition is not. A repetition of zero
9174 must subtract the length that has already been added. */
9175
9176 case META_MINMAX:
9177 case META_MINMAX_PLUS:
9178 case META_MINMAX_QUERY:
9179 if (pptr[1] == pptr[2])
9180 {
9181 if (pptr[1] == 0) branchlength -= lastitemlength;
9182 else itemlength = (pptr[1] - 1) * lastitemlength;
9183 pptr += 2;
9184 break;
9185 }
9186 /* Fall through */
9187
9188 /* Any other item means this branch does not have a fixed length. */
9189
9190 default:
9191 ISNOTFIXED:
9192 *errcodeptr = ERR25; /* Not fixed length */
9193 return -1;
9194 }
9195
9196 /* Add the item length to the branchlength, and save it for use if the next
9197 thing is a quantifier. */
9198
9199 branchlength += itemlength;
9200 lastitemlength = itemlength;
9201
9202 /* Ensure that the length does not overflow the limit. */
9203
9204 if (branchlength > LOOKBEHIND_MAX)
9205 {
9206 *errcodeptr = ERR87;
9207 return -1;
9208 }
9209 }
9210
9211 EXIT:
9212 *pptrptr = pptr;
9213 if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
9214 return branchlength;
9215
9216 PARSED_SKIP_FAILED:
9217 *errcodeptr = ERR90;
9218 return -1;
9219 }
9220
9221
9222
9223 /*************************************************
9224 * Set lengths in a lookbehind *
9225 *************************************************/
9226
9227 /* This function is called for each lookbehind, to set the lengths in its
9228 branches. An error occurs if any branch does not have a fixed length that is
9229 less than the maximum (65535). On exit, the pointer must be left on the final
9230 ket.
9231
9232 Arguments:
9233 pptrptr pointer to pointer in the parsed pattern
9234 errcodeptr pointer to error code
9235 lcptr pointer to loop counter
9236 recurses chain of recurse_check to catch mutual recursion
9237 cb pointer to compile block
9238
9239 Returns: TRUE if all is well
9240 FALSE otherwise, with error code and offset set
9241 */
9242
9243 static BOOL
set_lookbehind_lengths(uint32_t ** pptrptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)9244 set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9245 parsed_recurse_check *recurses, compile_block *cb)
9246 {
9247 PCRE2_SIZE offset;
9248 int branchlength;
9249 uint32_t *bptr = *pptrptr;
9250
9251 READPLUSOFFSET(offset, bptr); /* Offset for error messages */
9252 *pptrptr += SIZEOFFSET;
9253
9254 do
9255 {
9256 *pptrptr += 1;
9257 branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
9258 if (branchlength < 0)
9259 {
9260 /* The errorcode and offset may already be set from a nested lookbehind. */
9261 if (*errcodeptr == 0) *errcodeptr = ERR25;
9262 if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
9263 return FALSE;
9264 }
9265 *bptr |= branchlength; /* branchlength never more than 65535 */
9266 bptr = *pptrptr;
9267 }
9268 while (*bptr == META_ALT);
9269
9270 return TRUE;
9271 }
9272
9273
9274
9275 /*************************************************
9276 * Check parsed pattern lookbehinds *
9277 *************************************************/
9278
9279 /* This function is called at the end of parsing a pattern if any lookbehinds
9280 were encountered. It scans the parsed pattern for them, calling
9281 set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
9282 the error offset is marked unset. The enables the functions above not to
9283 override settings from deeper nestings.
9284
9285 Arguments cb points to the compile block
9286 Returns: 0 on success, or an errorcode (cb->erroroffset will be set)
9287 */
9288
9289 static int
check_lookbehinds(compile_block * cb)9290 check_lookbehinds(compile_block *cb)
9291 {
9292 uint32_t *pptr;
9293 int errorcode = 0;
9294 int loopcount = 0;
9295
9296 cb->erroroffset = PCRE2_UNSET;
9297
9298 for (pptr = cb->parsed_pattern; *pptr != META_END; pptr++)
9299 {
9300 if (*pptr < META_END) continue; /* Literal */
9301
9302 switch (META_CODE(*pptr))
9303 {
9304 default:
9305 return ERR70; /* Unrecognized meta code */
9306
9307 case META_ESCAPE:
9308 if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9309 pptr += 1;
9310 break;
9311
9312 case META_ACCEPT:
9313 case META_ALT:
9314 case META_ASTERISK:
9315 case META_ASTERISK_PLUS:
9316 case META_ASTERISK_QUERY:
9317 case META_ATOMIC:
9318 case META_BACKREF:
9319 case META_CAPTURE:
9320 case META_CIRCUMFLEX:
9321 case META_CLASS:
9322 case META_CLASS_EMPTY:
9323 case META_CLASS_EMPTY_NOT:
9324 case META_CLASS_END:
9325 case META_CLASS_NOT:
9326 case META_COMMIT:
9327 case META_COND_ASSERT:
9328 case META_DOLLAR:
9329 case META_DOT:
9330 case META_FAIL:
9331 case META_KET:
9332 case META_LOOKAHEAD:
9333 case META_LOOKAHEADNOT:
9334 case META_NOCAPTURE:
9335 case META_PLUS:
9336 case META_PLUS_PLUS:
9337 case META_PLUS_QUERY:
9338 case META_PRUNE:
9339 case META_QUERY:
9340 case META_QUERY_PLUS:
9341 case META_QUERY_QUERY:
9342 case META_RANGE_ESCAPED:
9343 case META_RANGE_LITERAL:
9344 case META_SCRIPT_RUN:
9345 case META_SKIP:
9346 case META_THEN:
9347 break;
9348
9349 case META_RECURSE:
9350 pptr += SIZEOFFSET;
9351 break;
9352
9353 case META_BACKREF_BYNAME:
9354 case META_COND_DEFINE:
9355 case META_COND_NAME:
9356 case META_COND_NUMBER:
9357 case META_COND_RNAME:
9358 case META_COND_RNUMBER:
9359 case META_RECURSE_BYNAME:
9360 pptr += 1 + SIZEOFFSET;
9361 break;
9362
9363 case META_CALLOUT_STRING:
9364 pptr += 3 + SIZEOFFSET;
9365 break;
9366
9367 case META_BIGVALUE:
9368 case META_OPTIONS:
9369 case META_POSIX:
9370 case META_POSIX_NEG:
9371 pptr += 1;
9372 break;
9373
9374 case META_MINMAX:
9375 case META_MINMAX_QUERY:
9376 case META_MINMAX_PLUS:
9377 pptr += 2;
9378 break;
9379
9380 case META_CALLOUT_NUMBER:
9381 case META_COND_VERSION:
9382 pptr += 3;
9383 break;
9384
9385 case META_MARK:
9386 case META_COMMIT_ARG:
9387 case META_PRUNE_ARG:
9388 case META_SKIP_ARG:
9389 case META_THEN_ARG:
9390 pptr += 1 + pptr[1];
9391 break;
9392
9393 case META_LOOKBEHIND:
9394 case META_LOOKBEHINDNOT:
9395 if (!set_lookbehind_lengths(&pptr, &errorcode, &loopcount, NULL, cb))
9396 return errorcode;
9397 break;
9398 }
9399 }
9400
9401 return 0;
9402 }
9403
9404
9405
9406 /*************************************************
9407 * External function to compile a pattern *
9408 *************************************************/
9409
9410 /* This function reads a regular expression in the form of a string and returns
9411 a pointer to a block of store holding a compiled version of the expression.
9412
9413 Arguments:
9414 pattern the regular expression
9415 patlen the length of the pattern, or PCRE2_ZERO_TERMINATED
9416 options option bits
9417 errorptr pointer to errorcode
9418 erroroffset pointer to error offset
9419 ccontext points to a compile context or is NULL
9420
9421 Returns: pointer to compiled data block, or NULL on error,
9422 with errorcode and erroroffset set
9423 */
9424
9425 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_compile(PCRE2_SPTR pattern,PCRE2_SIZE patlen,uint32_t options,int * errorptr,PCRE2_SIZE * erroroffset,pcre2_compile_context * ccontext)9426 pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
9427 int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
9428 {
9429 BOOL utf; /* Set TRUE for UTF mode */
9430 BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */
9431 BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */
9432 pcre2_real_code *re = NULL; /* What we will return */
9433 compile_block cb; /* "Static" compile-time data */
9434 const uint8_t *tables; /* Char tables base pointer */
9435
9436 PCRE2_UCHAR *code; /* Current pointer in compiled code */
9437 PCRE2_SPTR codestart; /* Start of compiled code */
9438 PCRE2_SPTR ptr; /* Current pointer in pattern */
9439 uint32_t *pptr; /* Current pointer in parsed pattern */
9440
9441 PCRE2_SIZE length = 1; /* Allow for final END opcode */
9442 PCRE2_SIZE usedlength; /* Actual length used */
9443 PCRE2_SIZE re_blocksize; /* Size of memory block */
9444 PCRE2_SIZE big32count = 0; /* 32-bit literals >= 0x80000000 */
9445 PCRE2_SIZE parsed_size_needed; /* Needed for parsed pattern */
9446
9447 int32_t firstcuflags, reqcuflags; /* Type of first/req code unit */
9448 uint32_t firstcu, reqcu; /* Value of first/req code unit */
9449 uint32_t setflags = 0; /* NL and BSR set flags */
9450
9451 uint32_t skipatstart; /* When checking (*UTF) etc */
9452 uint32_t limit_heap = UINT32_MAX;
9453 uint32_t limit_match = UINT32_MAX; /* Unset match limits */
9454 uint32_t limit_depth = UINT32_MAX;
9455
9456 int newline = 0; /* Unset; can be set by the pattern */
9457 int bsr = 0; /* Unset; can be set by the pattern */
9458 int errorcode = 0; /* Initialize to avoid compiler warn */
9459 int regexrc; /* Return from compile */
9460
9461 uint32_t i; /* Local loop counter */
9462
9463 /* Comments at the head of this file explain about these variables. */
9464
9465 uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
9466 uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
9467 named_group named_groups[NAMED_GROUP_LIST_SIZE];
9468
9469 /* The workspace is used in different ways in the different compiling phases.
9470 It needs to be 16-bit aligned for the preliminary parsing scan. */
9471
9472 uint32_t c16workspace[C16_WORK_SIZE];
9473 PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
9474
9475
9476 /* -------------- Check arguments and set up the pattern ----------------- */
9477
9478 /* There must be error code and offset pointers. */
9479
9480 if (errorptr == NULL || erroroffset == NULL) return NULL;
9481 *errorptr = ERR0;
9482 *erroroffset = 0;
9483
9484 /* There must be a pattern! */
9485
9486 if (pattern == NULL)
9487 {
9488 *errorptr = ERR16;
9489 return NULL;
9490 }
9491
9492 /* A NULL compile context means "use a default context" */
9493
9494 if (ccontext == NULL)
9495 ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
9496
9497 /* Check that all undefined public option bits are zero. */
9498
9499 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
9500 (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
9501 {
9502 *errorptr = ERR17;
9503 return NULL;
9504 }
9505
9506 if ((options & PCRE2_LITERAL) != 0 &&
9507 ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
9508 (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
9509 {
9510 *errorptr = ERR92;
9511 return NULL;
9512 }
9513
9514 /* A zero-terminated pattern is indicated by the special length value
9515 PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
9516
9517 if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
9518 patlen = PRIV(strlen)(pattern);
9519
9520 if (patlen > ccontext->max_pattern_length)
9521 {
9522 *errorptr = ERR88;
9523 return NULL;
9524 }
9525
9526 /* From here on, all returns from this function should end up going via the
9527 EXIT label. */
9528
9529
9530 /* ------------ Initialize the "static" compile data -------------- */
9531
9532 tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
9533
9534 cb.lcc = tables + lcc_offset; /* Individual */
9535 cb.fcc = tables + fcc_offset; /* character */
9536 cb.cbits = tables + cbits_offset; /* tables */
9537 cb.ctypes = tables + ctypes_offset;
9538
9539 cb.assert_depth = 0;
9540 cb.bracount = 0;
9541 cb.cx = ccontext;
9542 cb.dupnames = FALSE;
9543 cb.end_pattern = pattern + patlen;
9544 cb.erroroffset = 0;
9545 cb.external_flags = 0;
9546 cb.external_options = options;
9547 cb.groupinfo = stack_groupinfo;
9548 cb.had_recurse = FALSE;
9549 cb.lastcapture = 0;
9550 cb.max_lookbehind = 0;
9551 cb.name_entry_size = 0;
9552 cb.name_table = NULL;
9553 cb.named_groups = named_groups;
9554 cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
9555 cb.names_found = 0;
9556 cb.open_caps = NULL;
9557 cb.parens_depth = 0;
9558 cb.parsed_pattern = stack_parsed_pattern;
9559 cb.req_varyopt = 0;
9560 cb.start_code = cworkspace;
9561 cb.start_pattern = pattern;
9562 cb.start_workspace = cworkspace;
9563 cb.workspace_size = COMPILE_WORK_SIZE;
9564
9565 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
9566 references to help in deciding whether (.*) can be treated as anchored or not.
9567 */
9568
9569 cb.top_backref = 0;
9570 cb.backref_map = 0;
9571
9572 /* Escape sequences \1 to \9 are always back references, but as they are only
9573 two characters long, only two elements can be used in the parsed_pattern
9574 vector. The first contains the reference, and we'd like to use the second to
9575 record the offset in the pattern, so that forward references to non-existent
9576 groups can be diagnosed later with an offset. However, on 64-bit systems,
9577 PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
9578 occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
9579 references have enough space for the offset to be put into the parsed pattern.
9580 */
9581
9582 for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
9583
9584
9585 /* --------------- Start looking at the pattern --------------- */
9586
9587 /* Unless PCRE2_LITERAL is set, check for global one-time option settings at
9588 the start of the pattern, and remember the offset to the actual regex. With
9589 valgrind support, make the terminator of a zero-terminated pattern
9590 inaccessible. This catches bugs that would otherwise only show up for
9591 non-zero-terminated patterns. */
9592
9593 #ifdef SUPPORT_VALGRIND
9594 if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
9595 #endif
9596
9597 ptr = pattern;
9598 skipatstart = 0;
9599
9600 if ((options & PCRE2_LITERAL) == 0)
9601 {
9602 while (patlen - skipatstart >= 2 &&
9603 ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
9604 ptr[skipatstart+1] == CHAR_ASTERISK)
9605 {
9606 for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
9607 {
9608 uint32_t c, pp;
9609 pso *p = pso_list + i;
9610
9611 if (patlen - skipatstart - 2 >= p->length &&
9612 PRIV(strncmp_c8)(ptr + skipatstart + 2, (char *)(p->name),
9613 p->length) == 0)
9614 {
9615 skipatstart += p->length + 2;
9616 switch(p->type)
9617 {
9618 case PSO_OPT:
9619 cb.external_options |= p->value;
9620 break;
9621
9622 case PSO_FLG:
9623 setflags |= p->value;
9624 break;
9625
9626 case PSO_NL:
9627 newline = p->value;
9628 setflags |= PCRE2_NL_SET;
9629 break;
9630
9631 case PSO_BSR:
9632 bsr = p->value;
9633 setflags |= PCRE2_BSR_SET;
9634 break;
9635
9636 case PSO_LIMM:
9637 case PSO_LIMD:
9638 case PSO_LIMH:
9639 c = 0;
9640 pp = skipatstart;
9641 if (!IS_DIGIT(ptr[pp]))
9642 {
9643 errorcode = ERR60;
9644 ptr += pp;
9645 goto HAD_EARLY_ERROR;
9646 }
9647 while (IS_DIGIT(ptr[pp]))
9648 {
9649 if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */
9650 c = c*10 + (ptr[pp++] - CHAR_0);
9651 }
9652 if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
9653 {
9654 errorcode = ERR60;
9655 ptr += pp;
9656 goto HAD_EARLY_ERROR;
9657 }
9658 if (p->type == PSO_LIMH) limit_heap = c;
9659 else if (p->type == PSO_LIMM) limit_match = c;
9660 else limit_depth = c;
9661 skipatstart += pp - skipatstart;
9662 break;
9663 }
9664 break; /* Out of the table scan loop */
9665 }
9666 }
9667 if (i >= sizeof(pso_list)/sizeof(pso)) break; /* Out of pso loop */
9668 }
9669 }
9670
9671 /* End of pattern-start options; advance to start of real regex. */
9672
9673 ptr += skipatstart;
9674
9675 /* Can't support UTF or UCP unless PCRE2 has been compiled with UTF support. */
9676
9677 #ifndef SUPPORT_UNICODE
9678 if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
9679 {
9680 errorcode = ERR32;
9681 goto HAD_EARLY_ERROR;
9682 }
9683 #endif
9684
9685 /* Check UTF. We have the original options in 'options', with that value as
9686 modified by (*UTF) etc in cb.external_options. The extra option
9687 PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
9688 surrogate code points cannot be represented in UTF-16. */
9689
9690 utf = (cb.external_options & PCRE2_UTF) != 0;
9691 if (utf)
9692 {
9693 if ((options & PCRE2_NEVER_UTF) != 0)
9694 {
9695 errorcode = ERR74;
9696 goto HAD_EARLY_ERROR;
9697 }
9698 if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
9699 (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
9700 goto HAD_ERROR; /* Offset was set by valid_utf() */
9701
9702 #if PCRE2_CODE_UNIT_WIDTH == 16
9703 if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
9704 {
9705 errorcode = ERR91;
9706 goto HAD_EARLY_ERROR;
9707 }
9708 #endif
9709 }
9710
9711 /* Check UCP lockout. */
9712
9713 if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) ==
9714 (PCRE2_UCP|PCRE2_NEVER_UCP))
9715 {
9716 errorcode = ERR75;
9717 goto HAD_EARLY_ERROR;
9718 }
9719
9720 /* Process the BSR setting. */
9721
9722 if (bsr == 0) bsr = ccontext->bsr_convention;
9723
9724 /* Process the newline setting. */
9725
9726 if (newline == 0) newline = ccontext->newline_convention;
9727 cb.nltype = NLTYPE_FIXED;
9728 switch(newline)
9729 {
9730 case PCRE2_NEWLINE_CR:
9731 cb.nllen = 1;
9732 cb.nl[0] = CHAR_CR;
9733 break;
9734
9735 case PCRE2_NEWLINE_LF:
9736 cb.nllen = 1;
9737 cb.nl[0] = CHAR_NL;
9738 break;
9739
9740 case PCRE2_NEWLINE_NUL:
9741 cb.nllen = 1;
9742 cb.nl[0] = CHAR_NUL;
9743 break;
9744
9745 case PCRE2_NEWLINE_CRLF:
9746 cb.nllen = 2;
9747 cb.nl[0] = CHAR_CR;
9748 cb.nl[1] = CHAR_NL;
9749 break;
9750
9751 case PCRE2_NEWLINE_ANY:
9752 cb.nltype = NLTYPE_ANY;
9753 break;
9754
9755 case PCRE2_NEWLINE_ANYCRLF:
9756 cb.nltype = NLTYPE_ANYCRLF;
9757 break;
9758
9759 default:
9760 errorcode = ERR56;
9761 goto HAD_EARLY_ERROR;
9762 }
9763
9764 /* Pre-scan the pattern to do two things: (1) Discover the named groups and
9765 their numerical equivalents, so that this information is always available for
9766 the remaining processing. (2) At the same time, parse the pattern and put a
9767 processed version into the parsed_pattern vector. This has escapes interpreted
9768 and comments removed (amongst other things).
9769
9770 In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned
9771 32-bit ints in the parsed pattern is bounded by the length of the pattern plus
9772 one (for the terminator) plus four if PCRE2_EXTRA_MATCH_WORD or
9773 PCRE2_EXTRA_MATCH_LINE is set. The exception is 32-bit, non-UTF mode, when
9774 literal characters greater than META_END (0x80000000) have to be coded as two
9775 units. In this case, therefore, we scan the pattern to check for such values. */
9776
9777 #if PCRE2_CODE_UNIT_WIDTH == 32
9778 if (!utf)
9779 {
9780 PCRE2_SPTR p;
9781 for (p = ptr; p < cb.end_pattern; p++) if (*p >= META_END) big32count++;
9782 }
9783 #endif
9784
9785 /* Ensure that the parsed pattern buffer is big enough. When PCRE2_AUTO_CALLOUT
9786 is set we have to assume a numerical callout (4 elements) for each character
9787 plus one at the end. This is overkill, but memory is plentiful these days. For
9788 many smaller patterns the vector on the stack (which was set up above) can be
9789 used. */
9790
9791 parsed_size_needed = patlen - skipatstart + big32count;
9792
9793 if ((ccontext->extra_options &
9794 (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
9795 parsed_size_needed += 4;
9796
9797 if ((options & PCRE2_AUTO_CALLOUT) != 0)
9798 parsed_size_needed = (parsed_size_needed + 1) * 5;
9799
9800 if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE)
9801 {
9802 uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
9803 (parsed_size_needed + 1) * sizeof(uint32_t), ccontext->memctl.memory_data);
9804 if (heap_parsed_pattern == NULL)
9805 {
9806 *errorptr = ERR21;
9807 goto EXIT;
9808 }
9809 cb.parsed_pattern = heap_parsed_pattern;
9810 }
9811 cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed + 1;
9812
9813 /* Do the parsing scan. */
9814
9815 errorcode = parse_regex(ptr, cb.external_options, &has_lookbehind, &cb);
9816 if (errorcode != 0) goto HAD_CB_ERROR;
9817
9818 /* Workspace is needed to remember information about numbered groups: whether a
9819 group can match an empty string and what its fixed length is. This is done to
9820 avoid the possibility of recursive references causing very long compile times
9821 when checking these features. Unnumbered groups do not have this exposure since
9822 they cannot be referenced. We use an indexed vector for this purpose. If there
9823 are sufficiently few groups, the default vector on the stack, as set up above,
9824 can be used. Otherwise we have to get/free a special vector. The vector must be
9825 initialized to zero. */
9826
9827 if (cb.bracount >= GROUPINFO_DEFAULT_SIZE)
9828 {
9829 cb.groupinfo = ccontext->memctl.malloc(
9830 (cb.bracount + 1)*sizeof(uint32_t), ccontext->memctl.memory_data);
9831 if (cb.groupinfo == NULL)
9832 {
9833 errorcode = ERR21;
9834 cb.erroroffset = 0;
9835 goto HAD_CB_ERROR;
9836 }
9837 }
9838 memset(cb.groupinfo, 0, (cb.bracount + 1) * sizeof(uint32_t));
9839
9840 /* If there were any lookbehinds, scan the parsed pattern to figure out their
9841 lengths. */
9842
9843 if (has_lookbehind)
9844 {
9845 errorcode = check_lookbehinds(&cb);
9846 if (errorcode != 0) goto HAD_CB_ERROR;
9847 }
9848
9849 /* For debugging, there is a function that shows the parsed data vector. */
9850
9851 #ifdef DEBUG_SHOW_PARSED
9852 fprintf(stderr, "+++ Pre-scan complete:\n");
9853 show_parsed(&cb);
9854 #endif
9855
9856 /* For debugging capturing information this code can be enabled. */
9857
9858 #ifdef DEBUG_SHOW_CAPTURES
9859 {
9860 named_group *ng = cb.named_groups;
9861 fprintf(stderr, "+++Captures: %d\n", cb.bracount);
9862 for (i = 0; i < cb.names_found; i++, ng++)
9863 {
9864 fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
9865 }
9866 }
9867 #endif
9868
9869 /* Pretend to compile the pattern while actually just accumulating the amount
9870 of memory required in the 'length' variable. This behaviour is triggered by
9871 passing a non-NULL final argument to compile_regex(). We pass a block of
9872 workspace (cworkspace) for it to compile parts of the pattern into; the
9873 compiled code is discarded when it is no longer needed, so hopefully this
9874 workspace will never overflow, though there is a test for its doing so.
9875
9876 On error, errorcode will be set non-zero, so we don't need to look at the
9877 result of the function. The initial options have been put into the cb block,
9878 but we still have to pass a separate options variable (the first argument)
9879 because the options may change as the pattern is processed. */
9880
9881 cb.erroroffset = patlen; /* For any subsequent errors that do not set it */
9882 pptr = cb.parsed_pattern;
9883 code = cworkspace;
9884 *code = OP_BRA;
9885
9886 (void)compile_regex(cb.external_options, &code, &pptr, &errorcode, 0, &firstcu,
9887 &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, &length);
9888
9889 if (errorcode != 0) goto HAD_CB_ERROR; /* Offset is in cb.erroroffset */
9890
9891 /* This should be caught in compile_regex(), but just in case... */
9892
9893 if (length > MAX_PATTERN_SIZE)
9894 {
9895 errorcode = ERR20;
9896 goto HAD_CB_ERROR;
9897 }
9898
9899 /* Compute the size of, and then get and initialize, the data block for storing
9900 the compiled pattern and names table. Integer overflow should no longer be
9901 possible because nowadays we limit the maximum value of cb.names_found and
9902 cb.name_entry_size. */
9903
9904 re_blocksize = sizeof(pcre2_real_code) +
9905 CU2BYTES(length +
9906 (PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
9907 re = (pcre2_real_code *)
9908 ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
9909 if (re == NULL)
9910 {
9911 errorcode = ERR21;
9912 goto HAD_CB_ERROR;
9913 }
9914
9915 /* The compiler may put padding at the end of the pcre2_real_code structure in
9916 order to round it up to a multiple of 4 or 8 bytes. This means that when a
9917 compiled pattern is copied (for example, when serialized) undefined bytes are
9918 read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
9919 write to the last 8 bytes of the structure before setting the fields. */
9920
9921 memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
9922 re->memctl = ccontext->memctl;
9923 re->tables = tables;
9924 re->executable_jit = NULL;
9925 memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
9926 re->blocksize = re_blocksize;
9927 re->magic_number = MAGIC_NUMBER;
9928 re->compile_options = options;
9929 re->overall_options = cb.external_options;
9930 re->extra_options = ccontext->extra_options;
9931 re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
9932 re->limit_heap = limit_heap;
9933 re->limit_match = limit_match;
9934 re->limit_depth = limit_depth;
9935 re->first_codeunit = 0;
9936 re->last_codeunit = 0;
9937 re->bsr_convention = bsr;
9938 re->newline_convention = newline;
9939 re->max_lookbehind = 0;
9940 re->minlength = 0;
9941 re->top_bracket = 0;
9942 re->top_backref = 0;
9943 re->name_entry_size = cb.name_entry_size;
9944 re->name_count = cb.names_found;
9945
9946 /* The basic block is immediately followed by the name table, and the compiled
9947 code follows after that. */
9948
9949 codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) +
9950 re->name_entry_size * re->name_count;
9951
9952 /* Update the compile data block for the actual compile. The starting points of
9953 the name/number translation table and of the code are passed around in the
9954 compile data block. The start/end pattern and initial options are already set
9955 from the pre-compile phase, as is the name_entry_size field. */
9956
9957 cb.parens_depth = 0;
9958 cb.assert_depth = 0;
9959 cb.lastcapture = 0;
9960 cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
9961 cb.start_code = codestart;
9962 cb.req_varyopt = 0;
9963 cb.had_accept = FALSE;
9964 cb.had_pruneorskip = FALSE;
9965 cb.open_caps = NULL;
9966
9967 /* If any named groups were found, create the name/number table from the list
9968 created in the pre-pass. */
9969
9970 if (cb.names_found > 0)
9971 {
9972 named_group *ng = cb.named_groups;
9973 for (i = 0; i < cb.names_found; i++, ng++)
9974 add_name_to_table(&cb, ng->name, ng->length, ng->number, i);
9975 }
9976
9977 /* Set up a starting, non-extracting bracket, then compile the expression. On
9978 error, errorcode will be set non-zero, so we don't need to look at the result
9979 of the function here. */
9980
9981 pptr = cb.parsed_pattern;
9982 code = (PCRE2_UCHAR *)codestart;
9983 *code = OP_BRA;
9984 regexrc = compile_regex(re->overall_options, &code, &pptr, &errorcode, 0,
9985 &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL);
9986 if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
9987 re->top_bracket = cb.bracount;
9988 re->top_backref = cb.top_backref;
9989 re->max_lookbehind = cb.max_lookbehind;
9990
9991 if (cb.had_accept)
9992 {
9993 reqcu = 0; /* Must disable after (*ACCEPT) */
9994 reqcuflags = REQ_NONE;
9995 }
9996
9997 /* Fill in the final opcode and check for disastrous overflow. If no overflow,
9998 but the estimated length exceeds the really used length, adjust the value of
9999 re->blocksize, and if valgrind support is configured, mark the extra allocated
10000 memory as unaddressable, so that any out-of-bound reads can be detected. */
10001
10002 *code++ = OP_END;
10003 usedlength = code - codestart;
10004 if (usedlength > length) errorcode = ERR23; else
10005 {
10006 re->blocksize -= CU2BYTES(length - usedlength);
10007 #ifdef SUPPORT_VALGRIND
10008 VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
10009 #endif
10010 }
10011
10012 /* Scan the pattern for recursion/subroutine calls and convert the group
10013 numbers into offsets. Maintain a small cache so that repeated groups containing
10014 recursions are efficiently handled. */
10015
10016 #define RSCAN_CACHE_SIZE 8
10017
10018 if (errorcode == 0 && cb.had_recurse)
10019 {
10020 PCRE2_UCHAR *rcode;
10021 PCRE2_SPTR rgroup;
10022 unsigned int ccount = 0;
10023 int start = RSCAN_CACHE_SIZE;
10024 recurse_cache rc[RSCAN_CACHE_SIZE];
10025
10026 for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf);
10027 rcode != NULL;
10028 rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf))
10029 {
10030 int p, groupnumber;
10031
10032 groupnumber = (int)GET(rcode, 1);
10033 if (groupnumber == 0) rgroup = codestart; else
10034 {
10035 PCRE2_SPTR search_from = codestart;
10036 rgroup = NULL;
10037 for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
10038 {
10039 if (groupnumber == rc[p].groupnumber)
10040 {
10041 rgroup = rc[p].group;
10042 break;
10043 }
10044
10045 /* Group n+1 must always start to the right of group n, so we can save
10046 search time below when the new group number is greater than any of the
10047 previously found groups. */
10048
10049 if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
10050 }
10051
10052 if (rgroup == NULL)
10053 {
10054 rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
10055 if (rgroup == NULL)
10056 {
10057 errorcode = ERR53;
10058 break;
10059 }
10060 if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
10061 rc[start].groupnumber = groupnumber;
10062 rc[start].group = rgroup;
10063 if (ccount < RSCAN_CACHE_SIZE) ccount++;
10064 }
10065 }
10066
10067 PUT(rcode, 1, rgroup - codestart);
10068 }
10069 }
10070
10071 /* In rare debugging situations we sometimes need to look at the compiled code
10072 at this stage. */
10073
10074 #ifdef DEBUG_CALL_PRINTINT
10075 pcre2_printint(re, stderr, TRUE);
10076 fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
10077 #endif
10078
10079 /* Unless disabled, check whether any single character iterators can be
10080 auto-possessified. The function overwrites the appropriate opcode values, so
10081 the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
10082 used in this code because at least one compiler gives a warning about loss of
10083 "const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
10084 function call. */
10085
10086 if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
10087 {
10088 PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
10089 if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80;
10090 }
10091
10092 /* Failed to compile, or error while post-processing. */
10093
10094 if (errorcode != 0) goto HAD_CB_ERROR;
10095
10096 /* Successful compile. If the anchored option was not passed, set it if
10097 we can determine that the pattern is anchored by virtue of ^ characters or \A
10098 or anything else, such as starting with non-atomic .* when DOTALL is set and
10099 there are no occurrences of *PRUNE or *SKIP (though there is an option to
10100 disable this case). */
10101
10102 if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10103 is_anchored(codestart, 0, &cb, 0, FALSE))
10104 re->overall_options |= PCRE2_ANCHORED;
10105
10106 /* Set up the first code unit or startline flag, the required code unit, and
10107 then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE
10108 is set, as the data it would create will not be used. Note that a first code
10109 unit (but not the startline flag) is useful for anchored patterns because it
10110 can still give a quick "no match" and also avoid searching for a last code
10111 unit. */
10112
10113 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
10114 {
10115 /* If we do not have a first code unit, see if there is one that is asserted
10116 (these are not saved during the compile because they can cause conflicts with
10117 actual literals that follow). */
10118
10119 if (firstcuflags < 0)
10120 firstcu = find_firstassertedcu(codestart, &firstcuflags, 0);
10121
10122 /* Save the data for a first code unit. */
10123
10124 if (firstcuflags >= 0)
10125 {
10126 re->first_codeunit = firstcu;
10127 re->flags |= PCRE2_FIRSTSET;
10128
10129 /* Handle caseless first code units. */
10130
10131 if ((firstcuflags & REQ_CASELESS) != 0)
10132 {
10133 if (firstcu < 128 || (!utf && firstcu < 255))
10134 {
10135 if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
10136 }
10137
10138 /* The first code unit is >= 128 in UTF mode, or >= 255 otherwise. In
10139 8-bit UTF mode, codepoints in the range 128-255 are introductory code
10140 points and cannot have another case. In 16-bit and 32-bit modes, we can
10141 check wide characters when UTF (and therefore UCP) is supported. */
10142
10143 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
10144 else if (firstcu <= MAX_UTF_CODE_POINT &&
10145 UCD_OTHERCASE(firstcu) != firstcu)
10146 re->flags |= PCRE2_FIRSTCASELESS;
10147 #endif
10148 }
10149 }
10150
10151 /* When there is no first code unit, for non-anchored patterns, see if we can
10152 set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
10153 branches start with ^ and also when all branches start with non-atomic .* for
10154 non-DOTALL matches when *PRUNE and *SKIP are not present. (There is an option
10155 that disables this case.) */
10156
10157 else if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10158 is_startline(codestart, 0, &cb, 0, FALSE))
10159 re->flags |= PCRE2_STARTLINE;
10160
10161 /* Handle the "required code unit", if one is set. In the case of an anchored
10162 pattern, do this only if it follows a variable length item in the pattern. */
10163
10164 if (reqcuflags >= 0 &&
10165 ((re->overall_options & PCRE2_ANCHORED) == 0 ||
10166 (reqcuflags & REQ_VARY) != 0))
10167 {
10168 re->last_codeunit = reqcu;
10169 re->flags |= PCRE2_LASTSET;
10170
10171 /* Handle caseless required code units as for first code units (above). */
10172
10173 if ((reqcuflags & REQ_CASELESS) != 0)
10174 {
10175 if (reqcu < 128 || (!utf && reqcu < 255))
10176 {
10177 if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
10178 }
10179 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
10180 else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu)
10181 re->flags |= PCRE2_LASTCASELESS;
10182 #endif
10183 }
10184 }
10185
10186 /* Finally, study the compiled pattern to set up information such as a bitmap
10187 of starting code units and a minimum matching length. */
10188
10189 if (PRIV(study)(re) != 0)
10190 {
10191 errorcode = ERR31;
10192 goto HAD_CB_ERROR;
10193 }
10194 } /* End of start-of-match optimizations. */
10195
10196 /* Control ends up here in all cases. When running under valgrind, make a
10197 pattern's terminating zero defined again. If memory was obtained for the parsed
10198 version of the pattern, free it before returning. Also free the list of named
10199 groups if a larger one had to be obtained, and likewise the group information
10200 vector. */
10201
10202 EXIT:
10203 #ifdef SUPPORT_VALGRIND
10204 if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
10205 #endif
10206 if (cb.parsed_pattern != stack_parsed_pattern)
10207 ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
10208 if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
10209 ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
10210 if (cb.groupinfo != stack_groupinfo)
10211 ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
10212 return re; /* Will be NULL after an error */
10213
10214 /* Errors discovered in parse_regex() set the offset value in the compile
10215 block. Errors discovered before it is called must compute it from the ptr
10216 value. After parse_regex() is called, the offset in the compile block is set to
10217 the end of the pattern, but certain errors in compile_regex() may reset it if
10218 an offset is available in the parsed pattern. */
10219
10220 HAD_CB_ERROR:
10221 ptr = pattern + cb.erroroffset;
10222
10223 HAD_EARLY_ERROR:
10224 *erroroffset = ptr - pattern;
10225
10226 HAD_ERROR:
10227 *errorptr = errorcode;
10228 pcre2_code_free(re);
10229 re = NULL;
10230 goto EXIT;
10231 }
10232
10233 /* End of pcre2_compile.c */
10234