1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2018 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45
46 #define NLBLOCK cb /* Block containing newline information */
47 #define PSSTART start_pattern /* Field containing processed string start */
48 #define PSEND end_pattern /* Field containing processed string end */
49
50 #include "pcre2_internal.h"
51
52 /* In rare error cases debugging might require calling pcre2_printint(). */
53
54 #if 0
55 #ifdef EBCDIC
56 #define PRINTABLE(c) ((c) >= 64 && (c) < 255)
57 #else
58 #define PRINTABLE(c) ((c) >= 32 && (c) < 127)
59 #endif
60 #include "pcre2_printint.c"
61 #define DEBUG_CALL_PRINTINT
62 #endif
63
64 /* Other debugging code can be enabled by these defines. */
65
66 /* #define DEBUG_SHOW_CAPTURES */
67 /* #define DEBUG_SHOW_PARSED */
68
/* Helpers whose definition depends on the code unit width. They are selected
once here so that the code using them needs no #if tests of its own.

XDIGIT(c) looks c up in xdigitab, which has one entry per 8-bit value. In the
16-bit and 32-bit libraries c is first checked with MAX_255() (supplied by the
internal header); when that check fails the result is 0xff, the table's own
"not a hex digit" value.

STRING_UTFn_RIGHTPAR expands to TWO comma-separated items, a string and its
length, so that it can be dropped straight into an initializer list. */

#if PCRE2_CODE_UNIT_WIDTH == 8
#define XDIGIT(c) xdigitab[c]
#else  /* Either 16-bit or 32-bit */
#define XDIGIT(c) (MAX_255(c)? xdigitab[c] : 0xff)
#endif

#if PCRE2_CODE_UNIT_WIDTH == 8
#define STRING_UTFn_RIGHTPAR STRING_UTF8_RIGHTPAR, 5
#elif PCRE2_CODE_UNIT_WIDTH == 16
#define STRING_UTFn_RIGHTPAR STRING_UTF16_RIGHTPAR, 6
#else  /* 32-bit */
#define STRING_UTFn_RIGHTPAR STRING_UTF32_RIGHTPAR, 6
#endif
86
/* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which
consists of uint32_t elements. If a uint32_t can hold a PCRE2_SIZE, one element
is used. Otherwise the value is split over two elements, most significant half
first (i.e. a 64-bit world is assumed). SIZEOFFSET is the number of elements
that one stored value occupies.

  PUTOFFSET(s,p)       store s at p and advance p past it
  GETOFFSET(s,p)       load s from p and advance p past it
  GETPLUSOFFSET(s,p)   load s from the element(s) after p, leaving p on the
                         last element that was read
  READPLUSOFFSET(s,p)  load s from the element(s) after p; p is unchanged
  SKIPOFFSET(p)        advance p past one stored value

Every argument is parenthesized in the expansions so that expressions such as
"a | b" or "ptr + 1" can be passed safely. In the two-element forms p is
evaluated more than once, and in PUTOFFSET so is s, so arguments must not have
side effects. The two-element forms are brace-enclosed blocks, not expressions,
and can be used only where a statement is allowed. */

#if PCRE2_SIZE_MAX <= UINT32_MAX
#define PUTOFFSET(s,p) (*(p)++ = (s))
#define GETOFFSET(s,p) ((s) = *(p)++)
#define GETPLUSOFFSET(s,p) ((s) = *(++(p)))
#define READPLUSOFFSET(s,p) ((s) = (p)[1])
#define SKIPOFFSET(p) ((p)++)
#define SIZEOFFSET 1
#else
#define PUTOFFSET(s,p) \
  { *(p)++ = (uint32_t)((s) >> 32); *(p)++ = (uint32_t)((s) & 0xffffffff); }
#define GETOFFSET(s,p) \
  { (s) = ((PCRE2_SIZE)(p)[0] << 32) | (PCRE2_SIZE)(p)[1]; (p) += 2; }
#define GETPLUSOFFSET(s,p) \
  { (s) = ((PCRE2_SIZE)(p)[1] << 32) | (PCRE2_SIZE)(p)[2]; (p) += 2; }
#define READPLUSOFFSET(s,p) \
  { (s) = ((PCRE2_SIZE)(p)[1] << 32) | (PCRE2_SIZE)(p)[2]; }
#define SKIPOFFSET(p) ((p) += 2)
#define SIZEOFFSET 2
#endif
110
/* Macros for manipulating elements of the parsed pattern vector.

  META_CODE(x)    the top 16 bits of x, left in place
  META_DATA(x)    the low 16 bits of x
  META_DIFF(x,y)  x minus y, shifted down by 16 bits

The arguments are parenthesized in the expansions so that any expression can be
passed, whatever the precedence of its operators. */

#define META_CODE(x)   ((x) & 0xffff0000u)
#define META_DATA(x)   ((x) & 0x0000ffffu)
#define META_DIFF(x,y) (((x)-(y))>>16)
116
117 /* Function definitions to allow mutual recursion */
118
119 #ifdef SUPPORT_UNICODE
120 static unsigned int
121 add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t,
122 compile_block *, const uint32_t *, unsigned int);
123 #endif
124
125 static int
126 compile_regex(uint32_t, PCRE2_UCHAR **, uint32_t **, int *, uint32_t,
127 uint32_t *, int32_t *, uint32_t *, int32_t *, branch_chain *,
128 compile_block *, PCRE2_SIZE *);
129
130 static int
131 get_branchlength(uint32_t **, int *, int *, parsed_recurse_check *,
132 compile_block *);
133
134 static BOOL
135 set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
136 compile_block *);
137
138
139
140 /*************************************************
141 * Code parameters and static tables *
142 *************************************************/
143
/* The largest group number and the largest repeat count. REPEAT_UNLIMITED is
the value just above the largest repeat count. */

#define MAX_GROUP_NUMBER   65535u
#define MAX_REPEAT_COUNT   65535u
#define REPEAT_UNLIMITED   (MAX_REPEAT_COUNT+1)

/* COMPILE_WORK_SIZE is the size, in code units, of a workspace on the stack.
How it is used depends on the pattern scan:

  . The parsing and group-identifying pre-scan uses it to handle nesting, for
    which it must be 16-bit aligned. C16_WORK_SIZE is the same space expressed
    as a number of elements of a 16-bit vector.

  . The first compiling phase, which determines how much memory is required,
    partly compiles the regex into this space. The compiled parts are
    discarded as soon as they can be, so hopefully there is never an overrun.
    Pathological patterns can nevertheless cause one, and the code checks for
    it. The length of compiled items varies with LINK_SIZE, which is why the
    workspace size depends on it.

  . The real compile phase does not currently use this workspace. */

#define COMPILE_WORK_SIZE  (3000*LINK_SIZE)   /* Size in code units */

#define C16_WORK_SIZE \
  ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))

/* Information about the size of capturing groups is cached in a uint32_t
vector to improve performance. This is the size of the default vector, which is
created on the stack. */

#define GROUPINFO_DEFAULT_SIZE  256

/* The overrun tests compare against a slightly smaller size, so that an
overrun is detected before anything runs off the end of the data block. */

#define WORK_SIZE_SAFETY_MARGIN  (100)

/* The number of slots in the initial list that remembers named groups during
the pre-compile. The list is allocated on the stack; if it turns out to be too
small it is expanded, in a similar way to the workspace. */

#define NAMED_GROUP_LIST_SIZE  20

/* The pre-compiling pass over the pattern creates a parsed pattern in a vector
of uint32_t. For short patterns the vector lives on the stack and has this
size; heap memory is used for longer patterns. */

#define PARSED_PATTERN_DEFAULT_SIZE  1024

/* The maximum against which the variable holding the compiled pattern length
is checked to make sure it does not overflow. It is a bit less than INT_MAX to
leave room for adding in group terminating code units, so that those additions
do not have to be checked every time. */

#define OFLOW_MAX  (INT_MAX - 20)
198
/* Meta codes for the parsed pattern, which is a vector of 32-bit unsigned
ints. An element whose value is less than META_END is a literal data value. For
the rest, the top 16 bits identify the item, which leaves the low 16 bits for
the additional data that some items need. Elements are taken apart with the
META_CODE, META_DATA, and META_DIFF macros.

NOTE: The table of extra lengths (meta_extra_lengths, just below) has one entry
for each code, in the same order. Whenever these definitions are changed, that
table must be updated to remain in step. */

#define META_END               0x80000000u  /* End of pattern */

#define META_ALT               0x80010000u  /* alternation */
#define META_ATOMIC            0x80020000u  /* atomic group */
#define META_BACKREF           0x80030000u  /* Back ref */
#define META_BACKREF_BYNAME    0x80040000u  /* \k'name' */
#define META_BIGVALUE          0x80050000u  /* Next is a literal > META_END */
#define META_CALLOUT_NUMBER    0x80060000u  /* (?C with numerical argument */
#define META_CALLOUT_STRING    0x80070000u  /* (?C with string argument */
#define META_CAPTURE           0x80080000u  /* Capturing parenthesis */
#define META_CIRCUMFLEX        0x80090000u  /* ^ metacharacter */
#define META_CLASS             0x800a0000u  /* start non-empty class */
#define META_CLASS_EMPTY       0x800b0000u  /* empty class */
#define META_CLASS_EMPTY_NOT   0x800c0000u  /* negative empty class */
#define META_CLASS_END         0x800d0000u  /* end of non-empty class */
#define META_CLASS_NOT         0x800e0000u  /* start non-empty negative class */
#define META_COND_ASSERT       0x800f0000u  /* (?(?assertion)... */
#define META_COND_DEFINE       0x80100000u  /* (?(DEFINE)... */
#define META_COND_NAME         0x80110000u  /* (?(<name>)... */
#define META_COND_NUMBER       0x80120000u  /* (?(digits)... */
#define META_COND_RNAME        0x80130000u  /* (?(R&name)... */
#define META_COND_RNUMBER      0x80140000u  /* (?(Rdigits)... */
#define META_COND_VERSION      0x80150000u  /* (?(VERSION<op>x.y)... */
#define META_DOLLAR            0x80160000u  /* $ metacharacter */
#define META_DOT               0x80170000u  /* . metacharacter */
#define META_ESCAPE            0x80180000u  /* \d and friends */
#define META_KET               0x80190000u  /* closing parenthesis */
#define META_NOCAPTURE         0x801a0000u  /* no capture parens */
#define META_OPTIONS           0x801b0000u  /* (?i) and friends */
#define META_POSIX             0x801c0000u  /* POSIX class item */
#define META_POSIX_NEG         0x801d0000u  /* negative POSIX class item */
#define META_RANGE_ESCAPED     0x801e0000u  /* range with at least one escape */
#define META_RANGE_LITERAL     0x801f0000u  /* range defined literally */
#define META_RECURSE           0x80200000u  /* Recursion */
#define META_RECURSE_BYNAME    0x80210000u  /* (?&name) */

/* The four assertion codes are adjacent so that it is easy to check that an
assertion is present where one is expected in a conditional group. They must be
kept together. */

#define META_LOOKAHEAD         0x80220000u  /* (?= */
#define META_LOOKAHEADNOT      0x80230000u  /* (?! */
#define META_LOOKBEHIND        0x80240000u  /* (?<= */
#define META_LOOKBEHINDNOT     0x80250000u  /* (?<! */

/* The verb codes must stay in this order, with consecutive values, and each
_ARG version of COMMIT, PRUNE, SKIP, and THEN must come immediately after its
non-argument version. */

#define META_MARK              0x80260000u  /* (*MARK) */
#define META_ACCEPT            0x80270000u  /* (*ACCEPT) */
#define META_FAIL              0x80280000u  /* (*FAIL) */
#define META_COMMIT            0x80290000u  /* These */
#define META_COMMIT_ARG        0x802a0000u  /*   pairs */
#define META_PRUNE             0x802b0000u  /*     must */
#define META_PRUNE_ARG         0x802c0000u  /*       be */
#define META_SKIP              0x802d0000u  /*     kept */
#define META_SKIP_ARG          0x802e0000u  /*   in */
#define META_THEN              0x802f0000u  /* this */
#define META_THEN_ARG          0x80300000u  /*   order */

/* The quantifier codes must be kept all together, in groups of 3 adjacent
values. */

#define META_ASTERISK          0x80310000u  /* *  */
#define META_ASTERISK_PLUS     0x80320000u  /* *+ */
#define META_ASTERISK_QUERY    0x80330000u  /* *? */
#define META_PLUS              0x80340000u  /* +  */
#define META_PLUS_PLUS         0x80350000u  /* ++ */
#define META_PLUS_QUERY        0x80360000u  /* +? */
#define META_QUERY             0x80370000u  /* ?  */
#define META_QUERY_PLUS        0x80380000u  /* ?+ */
#define META_QUERY_QUERY       0x80390000u  /* ?? */
#define META_MINMAX            0x803a0000u  /* {n,m}  repeat */
#define META_MINMAX_PLUS       0x803b0000u  /* {n,m}+ repeat */
#define META_MINMAX_QUERY      0x803c0000u  /* {n,m}? repeat */

#define META_FIRST_QUANTIFIER  META_ASTERISK
#define META_LAST_QUANTIFIER   META_MINMAX_QUERY
285
286 /* Table of extra lengths for each of the meta codes. Must be kept in step with
287 the definitions above. For some items these values are a basic length to which
288 a variable amount has to be added. */
289
290 static unsigned char meta_extra_lengths[] = {
291 0, /* META_END */
292 0, /* META_ALT */
293 0, /* META_ATOMIC */
294 0, /* META_BACKREF - more if group is >= 10 */
295 1+SIZEOFFSET, /* META_BACKREF_BYNAME */
296 1, /* META_BIGVALUE */
297 3, /* META_CALLOUT_NUMBER */
298 3+SIZEOFFSET, /* META_CALLOUT_STRING */
299 0, /* META_CAPTURE */
300 0, /* META_CIRCUMFLEX */
301 0, /* META_CLASS */
302 0, /* META_CLASS_EMPTY */
303 0, /* META_CLASS_EMPTY_NOT */
304 0, /* META_CLASS_END */
305 0, /* META_CLASS_NOT */
306 0, /* META_COND_ASSERT */
307 SIZEOFFSET, /* META_COND_DEFINE */
308 1+SIZEOFFSET, /* META_COND_NAME */
309 1+SIZEOFFSET, /* META_COND_NUMBER */
310 1+SIZEOFFSET, /* META_COND_RNAME */
311 1+SIZEOFFSET, /* META_COND_RNUMBER */
312 3, /* META_COND_VERSION */
313 0, /* META_DOLLAR */
314 0, /* META_DOT */
315 0, /* META_ESCAPE - more for ESC_P, ESC_p, ESC_g, ESC_k */
316 0, /* META_KET */
317 0, /* META_NOCAPTURE */
318 1, /* META_OPTIONS */
319 1, /* META_POSIX */
320 1, /* META_POSIX_NEG */
321 0, /* META_RANGE_ESCAPED */
322 0, /* META_RANGE_LITERAL */
323 SIZEOFFSET, /* META_RECURSE */
324 1+SIZEOFFSET, /* META_RECURSE_BYNAME */
325 0, /* META_LOOKAHEAD */
326 0, /* META_LOOKAHEADNOT */
327 SIZEOFFSET, /* META_LOOKBEHIND */
328 SIZEOFFSET, /* META_LOOKBEHINDNOT */
329 1, /* META_MARK - plus the string length */
330 0, /* META_ACCEPT */
331 0, /* META_FAIL */
332 0, /* META_COMMIT */
333 1, /* META_COMMIT_ARG - plus the string length */
334 0, /* META_PRUNE */
335 1, /* META_PRUNE_ARG - plus the string length */
336 0, /* META_SKIP */
337 1, /* META_SKIP_ARG - plus the string length */
338 0, /* META_THEN */
339 1, /* META_THEN_ARG - plus the string length */
340 0, /* META_ASTERISK */
341 0, /* META_ASTERISK_PLUS */
342 0, /* META_ASTERISK_QUERY */
343 0, /* META_PLUS */
344 0, /* META_PLUS_PLUS */
345 0, /* META_PLUS_QUERY */
346 0, /* META_QUERY */
347 0, /* META_QUERY_PLUS */
348 0, /* META_QUERY_QUERY */
349 2, /* META_MINMAX */
350 2, /* META_MINMAX_PLUS */
351 2 /* META_MINMAX_QUERY */
352 };
353
/* The kinds of skip that can be requested when skipping over parts of a parsed
pattern. */

enum {
  PSKIP_ALT = 0,
  PSKIP_CLASS = 1,
  PSKIP_KET = 2
};
357
/* Macro for setting an individual bit in a class bitmap: bit number b of the
uint8_t vector a. It took some experimenting to figure out how to stop gcc
5.3.0 from warning with -Wconversion. This version gets a warning:

  #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1 << ((b)&7))

Let's hope the apparently less efficient version below isn't actually so bad if
the compiler is clever with identical subexpressions.

Both arguments are parenthesized in the expansion, so a may be an expression
such as "ptr + 32". Each argument is evaluated more than once, so neither may
have side effects. */

#define SETBIT(a,b) \
  ((a)[(b)/8] = (uint8_t)((a)[(b)/8] | (1 << ((b)&7))))
368
/* Private flag bits that are added to firstcu and reqcu. */

#define REQ_CASELESS  0x01  /* Indicates caselessness */
#define REQ_VARY      0x02  /* reqcu followed non-literal item */

/* Negative values that the firstcu and reqcu flags can take instead. */

#define REQ_UNSET     (-2)  /* Not yet found anything */
#define REQ_NONE      (-1)  /* Found not fixed char */
376
/* Flag bits and the length mask for entries in the groupinfo vector. */

#define GI_SET_FIXED_LENGTH   (1u << 31)
#define GI_NOT_FIXED_LENGTH   (1u << 30)
#define GI_FIXED_LENGTH_MASK  0xffffu
382
/* Test for a decimal digit. A simple range check works for both ASCII/Unicode
and EBCDIC, and it is fast: a good compiler can turn it into a subtraction and
an unsigned comparison. The argument is evaluated twice. */

#define IS_DIGIT(x) (CHAR_0 <= (x) && (x) <= CHAR_9)
388
/* Table to identify hex digits. The tables in chartables are dependent on the
locale, and may mark arbitrary characters as digits. We want to recognize only
0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
costs 256 bytes, but it is a lot faster than doing character value tests (at
least in some simple cases I timed), and in some applications one wants PCRE2
to compile efficiently as well as match efficiently. The value in the table is
the binary hex digit value, or 0xff for non-hex digits. */
396
397 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
398 UTF-8 mode. */
399
400 #ifndef EBCDIC
/* Hex digit values indexed by character code. Each row holds eight entries and
is labelled with the code of its first entry; 0xff means "not a hex digit". */

static const uint8_t xdigitab[] =
  {
  /* 0x00 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x08 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x10 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x18 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x20 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x28 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x30 */ 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,  /* '0' - '7' */
  /* 0x38 */ 0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff,  /* '8', '9' */
  /* 0x40 */ 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,  /* 'A' - 'F' */
  /* 0x48 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x50 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x58 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x60 */ 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,  /* 'a' - 'f' */
  /* 0x68 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x70 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x78 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x80 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x88 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x90 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x98 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xa0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xa8 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xb0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xb8 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xc0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xc8 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xd0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xd8 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xe0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xe8 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xf0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xf8 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
  };
435
436 #else
437
/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode.
The table is indexed by EBCDIC character code. Each row holds eight entries and
is labelled with the code of its first entry; 0xff means "not a hex digit". */

static const uint8_t xdigitab[] =
  {
  /* 0x00 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x08 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x10 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x18 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x20 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x28 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x30 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x38 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x40 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x48 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x50 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x58 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x60 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x68 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x70 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x78 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x80 */ 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,  /* a - f */
  /* 0x88 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x90 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x98 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xa0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xa8 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xb0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xb8 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xc0 */ 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,  /* A - F */
  /* 0xc8 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xd0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xd8 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xe0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xe8 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xf0 */ 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,  /* 0 - 7 */
  /* 0xf8 */ 0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff   /* 8, 9 */
  };
474 #endif /* EBCDIC */
475
476
477 /* Table for handling alphanumeric escaped characters. Positive returns are
478 simple data values; negative values are for special things like \d and so on.
479 Zero means further processing is needed (for things like \x), or the escape is
480 invalid. */
481
482 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
483 in UTF-8 mode. It runs from '0' to 'z'. */
484
485 #ifndef EBCDIC
/* ESCAPES_FIRST and ESCAPES_LAST are the first and last characters that have
an entry in the escapes table. UPPER_CASE(c) subtracts 32 from c; its argument
is parenthesized in the expansion so that any expression can be passed. */

#define ESCAPES_FIRST       CHAR_0
#define ESCAPES_LAST        CHAR_z
#define UPPER_CASE(c)       ((c)-32)
489
490 static const short int escapes[] = {
491 0, 0,
492 0, 0,
493 0, 0,
494 0, 0,
495 0, 0,
496 CHAR_COLON, CHAR_SEMICOLON,
497 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
498 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
499 CHAR_COMMERCIAL_AT, -ESC_A,
500 -ESC_B, -ESC_C,
501 -ESC_D, -ESC_E,
502 0, -ESC_G,
503 -ESC_H, 0,
504 0, -ESC_K,
505 0, 0,
506 -ESC_N, 0,
507 -ESC_P, -ESC_Q,
508 -ESC_R, -ESC_S,
509 0, 0,
510 -ESC_V, -ESC_W,
511 -ESC_X, 0,
512 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
513 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
514 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
515 CHAR_GRAVE_ACCENT, CHAR_BEL,
516 -ESC_b, 0,
517 -ESC_d, CHAR_ESC,
518 CHAR_FF, 0,
519 -ESC_h, 0,
520 0, -ESC_k,
521 0, 0,
522 CHAR_LF, 0,
523 -ESC_p, 0,
524 CHAR_CR, -ESC_s,
525 CHAR_HT, 0,
526 -ESC_v, -ESC_w,
527 0, 0,
528 -ESC_z
529 };
530
531 #else
532
533 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
534 It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
535 is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
536 because it is defined as 'a', which of course picks up the ASCII value. */
537
/* ESCAPES_FIRST and ESCAPES_LAST are the first and last characters that have
an entry in the escapes table. When this code is compiled on an ASCII system
for testing, the EBCDIC code values are given explicitly. The argument of
UPPER_CASE() is parenthesized in the expansion so that any expression can be
passed. */

#if 'a' == 0x81                    /* Check for a real EBCDIC environment */
#define ESCAPES_FIRST       CHAR_a
#define ESCAPES_LAST        CHAR_9
#define UPPER_CASE(c)       ((c)+64)
#else                              /* Testing in an ASCII environment */
#define ESCAPES_FIRST  ((unsigned char)'\x81')   /* EBCDIC 'a' */
#define ESCAPES_LAST   ((unsigned char)'\xf9')   /* EBCDIC '9' */
#define UPPER_CASE(c)  ((c)-32)
#endif
547
548 static const short int escapes[] = {
549 /* 80 */ CHAR_BEL, -ESC_b, 0, -ESC_d, CHAR_ESC, CHAR_FF, 0,
550 /* 88 */ -ESC_h, 0, 0, '{', 0, 0, 0, 0,
551 /* 90 */ 0, 0, -ESC_k, 0, 0, CHAR_LF, 0, -ESC_p,
552 /* 98 */ 0, CHAR_CR, 0, '}', 0, 0, 0, 0,
553 /* A0 */ 0, '~', -ESC_s, CHAR_HT, 0, -ESC_v, -ESC_w, 0,
554 /* A8 */ 0, -ESC_z, 0, 0, 0, '[', 0, 0,
555 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
556 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
557 /* C0 */ '{', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G,
558 /* C8 */ -ESC_H, 0, 0, 0, 0, 0, 0, 0,
559 /* D0 */ '}', 0, -ESC_K, 0, 0, -ESC_N, 0, -ESC_P,
560 /* D8 */ -ESC_Q, -ESC_R, 0, 0, 0, 0, 0, 0,
561 /* E0 */ '\\', 0, -ESC_S, 0, 0, -ESC_V, -ESC_W, -ESC_X,
562 /* E8 */ 0, -ESC_Z, 0, 0, 0, 0, 0, 0,
563 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
564 /* F8 */ 0, 0
565 };
566
/* In an EBCDIC environment a table is also needed of the characters that may
follow \c, one for each of the characters 0-31, in that order. */

static unsigned char ebcdic_escape_c[] =
  "@ABCDEFGHIJKLMNO"       /*  0 - 15 */
  "PQRSTUVWXYZ[\\]^_";     /* 16 - 31 */
571
572 #endif /* EBCDIC */
573
574
575 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
576 searched linearly. Put all the names into a single string, in order to reduce
577 the number of relocations when a shared library is dynamically linked. The
578 string is built from string macros so that it works in UTF-8 mode on EBCDIC
579 platforms. */
580
/* Description of one verb name. The has_arg field is a three-way indicator:
the table of verbs documents what positive, negative, and zero values mean. */

typedef struct verbitem {
  /* Length of verb name */
  unsigned len;
  /* Base META_ code */
  uint32_t meta;
  /* Argument requirement */
  int has_arg;
} verbitem;
586
587 static const char verbnames[] =
588 "\0" /* Empty name is a shorthand for MARK */
589 STRING_MARK0
590 STRING_ACCEPT0
591 STRING_F0
592 STRING_FAIL0
593 STRING_COMMIT0
594 STRING_PRUNE0
595 STRING_SKIP0
596 STRING_THEN;
597
598 static const verbitem verbs[] = {
599 { 0, META_MARK, +1 }, /* > 0 => must have an argument */
600 { 4, META_MARK, +1 },
601 { 6, META_ACCEPT, -1 }, /* < 0 => Optional argument, convert to pre-MARK */
602 { 1, META_FAIL, -1 },
603 { 4, META_FAIL, -1 },
604 { 6, META_COMMIT, 0 },
605 { 5, META_PRUNE, 0 }, /* Optional argument; bump META code if found */
606 { 4, META_SKIP, 0 },
607 { 4, META_THEN, 0 }
608 };
609
610 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
611
612 /* Verb opcodes, indexed by their META code offset from META_MARK. */
613
614 static const uint32_t verbops[] = {
615 OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
616 OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
617
618 /* Offsets from OP_STAR for case-independent and negative repeat opcodes. */
619
620 static uint32_t chartypeoffset[] = {
621 OP_STAR - OP_STAR, OP_STARI - OP_STAR,
622 OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
623
624 /* Tables of names of POSIX character classes and their lengths. The names are
625 now all in a single string, to reduce the number of relocations when a shared
626 library is dynamically loaded. The list of lengths is terminated by a zero
627 length entry. The first three must be alpha, lower, upper, as this is assumed
628 for handling case independence. The indices for graph, print, and punct are
629 needed, so identify them. */
630
631 static const char posix_names[] =
632 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
633 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
634 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
635 STRING_word0 STRING_xdigit;
636
/* The length of each POSIX class name, in the order in which the names appear
in posix_names. A zero entry terminates the list. */

static const uint8_t posix_name_lengths[] = {
  5, 5, 5, 5,    /* alpha, lower, upper, alnum */
  5, 5, 5, 5,    /* ascii, blank, cntrl, digit */
  5, 5, 5, 5,    /* graph, print, punct, space */
  4, 6,          /* word, xdigit */
  0              /* terminator */
};
639
/* Positions of graph, print, and punct in the list of POSIX class names,
counting from zero. */

#define PC_GRAPH   8
#define PC_PRINT   9
#define PC_PUNCT  10
643
644 /* Table of class bit maps for each POSIX class. Each class is formed from a
645 base map, with an optional addition or removal of another map. Then, for some
646 classes, there is some additional tweaking: for [:blank:] the vertical space
647 characters are removed, and for [:alpha:] and [:alnum:] the underscore
648 character is removed. The triples in the table consist of the base map offset,
649 second map offset or -1 if no second map, and a non-negative value for map
650 addition or a negative value for map subtraction (if there are two maps). The
651 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
652 remove vertical space characters, 2 => remove underscore. */
653
654 static const int posix_class_maps[] = {
655 cbit_word, cbit_digit, -2, /* alpha */
656 cbit_lower, -1, 0, /* lower */
657 cbit_upper, -1, 0, /* upper */
658 cbit_word, -1, 2, /* alnum - word without underscore */
659 cbit_print, cbit_cntrl, 0, /* ascii */
660 cbit_space, -1, 1, /* blank - a GNU extension */
661 cbit_cntrl, -1, 0, /* cntrl */
662 cbit_digit, -1, 0, /* digit */
663 cbit_graph, -1, 0, /* graph */
664 cbit_print, -1, 0, /* print */
665 cbit_punct, -1, 0, /* punct */
666 cbit_space, -1, 0, /* space */
667 cbit_word, -1, 0, /* word - a Perl extension */
668 cbit_xdigit,-1, 0 /* xdigit */
669 };
670
#ifdef SUPPORT_UNICODE

/* The Unicode property substitutes for the POSIX classes that are used in UCP
mode. There is one pair of values for each class, and the pairs must be in the
order of the POSIX class names, defined above. Xps is POSIX space, but from
8.34 Perl space and POSIX space are the same. */

static int posix_substitutes[] = {
  PT_GC, ucp_L,     /* alpha */
  PT_PC, ucp_Ll,    /* lower */
  PT_PC, ucp_Lu,    /* upper */
  PT_ALNUM, 0,      /* alnum */
  -1, 0,            /* ascii, treat as non-UCP */
  -1, 1,            /* blank, treat as \h */
  PT_PC, ucp_Cc,    /* cntrl */
  PT_PC, ucp_Nd,    /* digit */
  PT_PXGRAPH, 0,    /* graph */
  PT_PXPRINT, 0,    /* print */
  PT_PXPUNCT, 0,    /* punct */
  PT_PXSPACE, 0,    /* space */
  PT_WORD, 0,       /* word */
  -1, 0             /* xdigit, treat as non-UCP */
};

/* The number of pairs in the table. The divisor is derived from the table's
own element type, so the count stays right whatever the size of an int. */

#define POSIX_SUBSIZE \
  (sizeof(posix_substitutes) / (2*sizeof(posix_substitutes[0])))
#endif  /* SUPPORT_UNICODE */
694
/* Masks for checking option settings. The _LITERAL_ masks contain the subset
of options that are allowed when PCRE2_LITERAL is set; each full mask is its
_LITERAL_ mask plus the options that are allowed only without PCRE2_LITERAL.
There is one pair of masks for the main options and one for the extra
options. */

#define PUBLIC_LITERAL_COMPILE_OPTIONS \
  (PCRE2_ANCHORED          | \
   PCRE2_AUTO_CALLOUT      | \
   PCRE2_CASELESS          | \
   PCRE2_ENDANCHORED       | \
   PCRE2_FIRSTLINE         | \
   PCRE2_LITERAL           | \
   PCRE2_NO_START_OPTIMIZE | \
   PCRE2_NO_UTF_CHECK      | \
   PCRE2_USE_OFFSET_LIMIT  | \
   PCRE2_UTF)

#define PUBLIC_COMPILE_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_OPTIONS | \
   PCRE2_ALLOW_EMPTY_CLASS        | \
   PCRE2_ALT_BSUX                 | \
   PCRE2_ALT_CIRCUMFLEX           | \
   PCRE2_ALT_VERBNAMES            | \
   PCRE2_DOLLAR_ENDONLY           | \
   PCRE2_DOTALL                   | \
   PCRE2_DUPNAMES                 | \
   PCRE2_EXTENDED                 | \
   PCRE2_EXTENDED_MORE            | \
   PCRE2_MATCH_UNSET_BACKREF      | \
   PCRE2_MULTILINE                | \
   PCRE2_NEVER_BACKSLASH_C        | \
   PCRE2_NEVER_UCP                | \
   PCRE2_NEVER_UTF                | \
   PCRE2_NO_AUTO_CAPTURE          | \
   PCRE2_NO_AUTO_POSSESS          | \
   PCRE2_NO_DOTSTAR_ANCHOR        | \
   PCRE2_UCP                      | \
   PCRE2_UNGREEDY)

#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
  (PCRE2_EXTRA_MATCH_LINE | \
   PCRE2_EXTRA_MATCH_WORD)

#define PUBLIC_COMPILE_EXTRA_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS | \
   PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES  | \
   PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL)
718
719 /* Compile time error code numbers. They are given names so that they can more
720 easily be tracked. When a new number is added, the tables called eint1 and
721 eint2 in pcre2posix.c may need to be updated, and a new error text must be
722 added to compile_error_texts in pcre2_error.c. */
723
724 enum { ERR0 = COMPILE_ERROR_BASE,
725 ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10,
726 ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
727 ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
728 ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
729 ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
730 ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
731 ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
732 ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
733 ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
734 ERR91, ERR92, ERR93, ERR94 };
735
/* Start-of-pattern options such as (*UTF), and settings such as
(*LIMIT_MATCH=nnnn) and (*CRLF), are held in a table. (*UTF) is generic and
always supported; for completeness and backward compatibility (*UTFn) is also
supported in the relevant libraries. These are the types of table entry. */

enum {
  PSO_OPT  = 0,   /* Value is an option bit */
  PSO_FLG  = 1,   /* Value is a flag bit */
  PSO_NL   = 2,   /* Value is a newline type */
  PSO_BSR  = 3,   /* Value is a \R type */
  PSO_LIMH = 4,   /* Read integer value for heap limit */
  PSO_LIMM = 5,   /* Read integer value for match limit */
  PSO_LIMD = 6    /* Read integer value for depth limit */
};
748
/* One start-of-pattern item: its name, the length of the name, the type of
the item (one of the PSO_ values), and a value whose interpretation depends on
the type. */

typedef struct pso {
  const uint8_t  *name;
  uint16_t        length;
  uint16_t        type;
  uint32_t        value;
} pso;
755
756 /* NB: STRING_UTFn_RIGHTPAR contains the length as well */
757
758 static pso pso_list[] = {
759 { (uint8_t *)STRING_UTFn_RIGHTPAR, PSO_OPT, PCRE2_UTF },
760 { (uint8_t *)STRING_UTF_RIGHTPAR, 4, PSO_OPT, PCRE2_UTF },
761 { (uint8_t *)STRING_UCP_RIGHTPAR, 4, PSO_OPT, PCRE2_UCP },
762 { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR, 9, PSO_FLG, PCRE2_NOTEMPTY_SET },
763 { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR, 17, PSO_FLG, PCRE2_NE_ATST_SET },
764 { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR, 16, PSO_OPT, PCRE2_NO_AUTO_POSSESS },
765 { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
766 { (uint8_t *)STRING_NO_JIT_RIGHTPAR, 7, PSO_FLG, PCRE2_NOJIT },
767 { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR, 13, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
768 { (uint8_t *)STRING_LIMIT_HEAP_EQ, 11, PSO_LIMH, 0 },
769 { (uint8_t *)STRING_LIMIT_MATCH_EQ, 12, PSO_LIMM, 0 },
770 { (uint8_t *)STRING_LIMIT_DEPTH_EQ, 12, PSO_LIMD, 0 },
771 { (uint8_t *)STRING_LIMIT_RECURSION_EQ, 16, PSO_LIMD, 0 },
772 { (uint8_t *)STRING_CR_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_CR },
773 { (uint8_t *)STRING_LF_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_LF },
774 { (uint8_t *)STRING_CRLF_RIGHTPAR, 5, PSO_NL, PCRE2_NEWLINE_CRLF },
775 { (uint8_t *)STRING_ANY_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_ANY },
776 { (uint8_t *)STRING_NUL_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_NUL },
777 { (uint8_t *)STRING_ANYCRLF_RIGHTPAR, 8, PSO_NL, PCRE2_NEWLINE_ANYCRLF },
778 { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_ANYCRLF },
779 { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_UNICODE }
780 };
781
782 /* This table is used when converting repeating opcodes into possessified
783 versions as a result of an explicit possessive quantifier such as ++. A zero
784 value means there is no possessified version - in those cases the item in
785 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
786 because all relevant opcodes are less than that. */
787
788 static const uint8_t opcode_possessify[] = {
789 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */
790 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */
791
792 0, /* NOTI */
793 OP_POSSTAR, 0, /* STAR, MINSTAR */
794 OP_POSPLUS, 0, /* PLUS, MINPLUS */
795 OP_POSQUERY, 0, /* QUERY, MINQUERY */
796 OP_POSUPTO, 0, /* UPTO, MINUPTO */
797 0, /* EXACT */
798 0, 0, 0, 0, /* POS{STAR,PLUS,QUERY,UPTO} */
799
800 OP_POSSTARI, 0, /* STARI, MINSTARI */
801 OP_POSPLUSI, 0, /* PLUSI, MINPLUSI */
802 OP_POSQUERYI, 0, /* QUERYI, MINQUERYI */
803 OP_POSUPTOI, 0, /* UPTOI, MINUPTOI */
804 0, /* EXACTI */
805 0, 0, 0, 0, /* POS{STARI,PLUSI,QUERYI,UPTOI} */
806
807 OP_NOTPOSSTAR, 0, /* NOTSTAR, NOTMINSTAR */
808 OP_NOTPOSPLUS, 0, /* NOTPLUS, NOTMINPLUS */
809 OP_NOTPOSQUERY, 0, /* NOTQUERY, NOTMINQUERY */
810 OP_NOTPOSUPTO, 0, /* NOTUPTO, NOTMINUPTO */
811 0, /* NOTEXACT */
812 0, 0, 0, 0, /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
813
814 OP_NOTPOSSTARI, 0, /* NOTSTARI, NOTMINSTARI */
815 OP_NOTPOSPLUSI, 0, /* NOTPLUSI, NOTMINPLUSI */
816 OP_NOTPOSQUERYI, 0, /* NOTQUERYI, NOTMINQUERYI */
817 OP_NOTPOSUPTOI, 0, /* NOTUPTOI, NOTMINUPTOI */
818 0, /* NOTEXACTI */
819 0, 0, 0, 0, /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
820
821 OP_TYPEPOSSTAR, 0, /* TYPESTAR, TYPEMINSTAR */
822 OP_TYPEPOSPLUS, 0, /* TYPEPLUS, TYPEMINPLUS */
823 OP_TYPEPOSQUERY, 0, /* TYPEQUERY, TYPEMINQUERY */
824 OP_TYPEPOSUPTO, 0, /* TYPEUPTO, TYPEMINUPTO */
825 0, /* TYPEEXACT */
826 0, 0, 0, 0, /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
827
828 OP_CRPOSSTAR, 0, /* CRSTAR, CRMINSTAR */
829 OP_CRPOSPLUS, 0, /* CRPLUS, CRMINPLUS */
830 OP_CRPOSQUERY, 0, /* CRQUERY, CRMINQUERY */
831 OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */
832 0, 0, 0, 0, /* CRPOS{STAR,PLUS,QUERY,RANGE} */
833
834 0, 0, 0, /* CLASS, NCLASS, XCLASS */
835 0, 0, /* REF, REFI */
836 0, 0, /* DNREF, DNREFI */
837 0, 0 /* RECURSE, CALLOUT */
838 };
839
840
841 #ifdef DEBUG_SHOW_PARSED
842 /*************************************************
843 * Show the parsed pattern for debugging *
844 *************************************************/
845
846 /* For debugging the pre-scan, this code, which outputs the parsed data vector,
847 can be enabled. */
848
static void show_parsed(compile_block *cb)
{
/* Debugging aid: walks cb->parsed_pattern from its start and writes one line
per item to stderr. The walk stops after printing META_END, or as soon as an
unrecognized META code is met. Nothing in cb is modified.

The conversion specifiers are chosen to match the argument types: %u for
uint32_t values, %zu for PCRE2_SIZE offsets, and an explicit (int) cast for
%c, which requires an int argument. */

uint32_t *pptr = cb->parsed_pattern;

for (;;)
  {
  int max, min;
  PCRE2_SIZE offset;
  uint32_t i;
  uint32_t length;
  uint32_t meta_arg = META_DATA(*pptr);

  /* Index of this item in the vector, followed by its raw value in hex. */

  fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);

  /* Values below META_END are not META items; show them as a character when
  they lie in the printable ASCII range. */

  if (*pptr < META_END)
    {
    if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", (int)*pptr);
    pptr++;
    }

  else switch (META_CODE(*pptr++))
    {
    default:
    fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
    return;

    case META_END:
    fprintf(stderr, "META_END\n");
    return;

    case META_CAPTURE:
    fprintf(stderr, "META_CAPTURE %u", meta_arg);
    break;

    case META_RECURSE:
    GETOFFSET(offset, pptr);
    fprintf(stderr, "META_RECURSE %u %zu", meta_arg, offset);
    break;

    /* For group numbers below 10 the offset is taken from a table in cb
    instead of being read from the parsed vector. */

    case META_BACKREF:
    if (meta_arg < 10)
      {
      offset = cb->small_ref_offset[meta_arg];
      }
    else
      {
      GETOFFSET(offset, pptr);
      }
    fprintf(stderr, "META_BACKREF %u %zu", meta_arg, offset);
    break;

    case META_ESCAPE:
    if (meta_arg == ESC_P || meta_arg == ESC_p)
      {
      /* The next item packs a type in its top 16 bits and a value in its
      bottom 16 bits. */

      uint32_t ptype = *pptr >> 16;
      uint32_t pvalue = *pptr++ & 0xffff;
      fprintf(stderr, "META \\%c %u %u", (meta_arg == ESC_P)? 'P':'p',
        ptype, pvalue);
      }
    else
      {
      uint32_t cc;

      /* There's just one escape we might have here that isn't negated in the
      escapes table. For the others, search the table for the letter whose
      negated entry equals this escape; show '?' if none is found. */

      if (meta_arg == ESC_g) cc = CHAR_g;
      else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
        {
        if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
        }
      if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
      fprintf(stderr, "META \\%c", (int)cc);
      }
    break;

    /* The three {min,max} forms are each followed by two data items. */

    case META_MINMAX:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}", min, max);
    else
      fprintf(stderr, "META {%d,}", min);
    break;

    case META_MINMAX_QUERY:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}?", min, max);
    else
      fprintf(stderr, "META {%d,}?", min);
    break;

    case META_MINMAX_PLUS:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}+", min, max);
    else
      fprintf(stderr, "META {%d,}+", min);
    break;

    case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
    case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
    case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
    case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
    case META_DOT: fprintf(stderr, "META_DOT"); break;
    case META_ASTERISK: fprintf(stderr, "META *"); break;
    case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
    case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
    case META_PLUS: fprintf(stderr, "META +"); break;
    case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
    case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
    case META_QUERY: fprintf(stderr, "META ?"); break;
    case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
    case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;

    case META_ATOMIC: fprintf(stderr, "META (?>"); break;
    case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
    case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
    case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
    case META_KET: fprintf(stderr, "META )"); break;
    case META_ALT: fprintf(stderr, "META | %u", meta_arg); break;

    case META_CLASS: fprintf(stderr, "META ["); break;
    case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
    case META_CLASS_END: fprintf(stderr, "META ]"); break;
    case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
    case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;

    case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
    case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;

    case META_POSIX: fprintf(stderr, "META_POSIX %u", *pptr++); break;
    case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %u", *pptr++); break;

    case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
    case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
    case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
    case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
    case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
    case META_THEN: fprintf(stderr, "META (*THEN)"); break;

    case META_OPTIONS: fprintf(stderr, "META_OPTIONS 0x%02x", *pptr++); break;

    case META_LOOKBEHIND:
    fprintf(stderr, "META (?<= %u offset=", meta_arg);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_LOOKBEHINDNOT:
    fprintf(stderr, "META (?<! %u offset=", meta_arg);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* Three data items follow: the first two are shown as next=a/b and the
    third as the callout number. */

    case META_CALLOUT_NUMBER:
    fprintf(stderr, "META (?C%u) next=%u/%u", pptr[2], pptr[0],
       pptr[1]);
    pptr += 3;
    break;

    case META_CALLOUT_STRING:
      {
      uint32_t patoffset = *pptr++;    /* Offset of next pattern item */
      uint32_t patlength = *pptr++;    /* Length of next pattern item */
      fprintf(stderr, "META (?Cstring) length=%u offset=", *pptr++);
      GETOFFSET(offset, pptr);
      fprintf(stderr, "%zu next=%u/%u", offset, patoffset, patlength);
      }
    break;

    case META_RECURSE_BYNAME:
    fprintf(stderr, "META (?(&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_BACKREF_BYNAME:
    fprintf(stderr, "META_BACKREF_BYNAME length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* Here the offset comes first; the number is the item after it, so it
    is read ahead via SIZEOFFSET and skipped once the offset has been read. */

    case META_COND_NUMBER:
    fprintf(stderr, "META_COND_NUMBER %u offset=", pptr[SIZEOFFSET]);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    pptr++;
    break;

    case META_COND_DEFINE:
    fprintf(stderr, "META (?(DEFINE) offset=");
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* Three data items: a comparison selector (0 shown as "=", anything else
    as ">="), then the two numbers printed either side of the dot. */

    case META_COND_VERSION:
    fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
    fprintf(stderr, "%u.", *pptr++);
    fprintf(stderr, "%u)", *pptr++);
    break;

    case META_COND_NAME:
    fprintf(stderr, "META (?(<name>) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_RNAME:
    fprintf(stderr, "META (?(R&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* This is kept as a name, because it might be. */

    case META_COND_RNUMBER:
    fprintf(stderr, "META (?(Rnumber) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* The verbs that take an argument share the code at SHOWARG, which
    prints the argument characters followed by the closing parenthesis. */

    case META_MARK:
    fprintf(stderr, "META (*MARK:");
    goto SHOWARG;

    case META_COMMIT_ARG:
    fprintf(stderr, "META (*COMMIT:");
    goto SHOWARG;

    case META_PRUNE_ARG:
    fprintf(stderr, "META (*PRUNE:");
    goto SHOWARG;

    case META_SKIP_ARG:
    fprintf(stderr, "META (*SKIP:");
    goto SHOWARG;

    case META_THEN_ARG:
    fprintf(stderr, "META (*THEN:");
    SHOWARG:
    length = *pptr++;
    for (i = 0; i < length; i++)
      {
      uint32_t cc = *pptr++;
      if (cc > 32 && cc < 128) fprintf(stderr, "%c", (int)cc);
        else fprintf(stderr, "\\x{%x}", cc);
      }
    fprintf(stderr, ") length=%u", length);
    break;
    }

  fprintf(stderr, "\n");
  }
}
1101 #endif /* DEBUG_SHOW_PARSED */
1102
1103
1104
1105 /*************************************************
1106 * Copy compiled code *
1107 *************************************************/
1108
1109 /* Compiled JIT code cannot be copied, so the new compiled block has no
1110 associated JIT data. */
1111
1112 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy(const pcre2_code * code)1113 pcre2_code_copy(const pcre2_code *code)
1114 {
1115 PCRE2_SIZE* ref_count;
1116 pcre2_code *newcode;
1117
1118 if (code == NULL) return NULL;
1119 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1120 if (newcode == NULL) return NULL;
1121 memcpy(newcode, code, code->blocksize);
1122 newcode->executable_jit = NULL;
1123
1124 /* If the code is one that has been deserialized, increment the reference count
1125 in the decoded tables. */
1126
1127 if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1128 {
1129 ref_count = (PCRE2_SIZE *)(code->tables + tables_length);
1130 (*ref_count)++;
1131 }
1132
1133 return newcode;
1134 }
1135
1136
1137
1138 /*************************************************
1139 * Copy compiled code and character tables *
1140 *************************************************/
1141
1142 /* Compiled JIT code cannot be copied, so the new compiled block has no
1143 associated JIT data. This version of code_copy also makes a separate copy of
1144 the character tables. */
1145
1146 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy_with_tables(const pcre2_code * code)1147 pcre2_code_copy_with_tables(const pcre2_code *code)
1148 {
1149 PCRE2_SIZE* ref_count;
1150 pcre2_code *newcode;
1151 uint8_t *newtables;
1152
1153 if (code == NULL) return NULL;
1154 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1155 if (newcode == NULL) return NULL;
1156 memcpy(newcode, code, code->blocksize);
1157 newcode->executable_jit = NULL;
1158
1159 newtables = code->memctl.malloc(tables_length + sizeof(PCRE2_SIZE),
1160 code->memctl.memory_data);
1161 if (newtables == NULL)
1162 {
1163 code->memctl.free((void *)newcode, code->memctl.memory_data);
1164 return NULL;
1165 }
1166 memcpy(newtables, code->tables, tables_length);
1167 ref_count = (PCRE2_SIZE *)(newtables + tables_length);
1168 *ref_count = 1;
1169
1170 newcode->tables = newtables;
1171 newcode->flags |= PCRE2_DEREF_TABLES;
1172 return newcode;
1173 }
1174
1175
1176
1177 /*************************************************
1178 * Free compiled code *
1179 *************************************************/
1180
1181 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_code_free(pcre2_code * code)1182 pcre2_code_free(pcre2_code *code)
1183 {
1184 PCRE2_SIZE* ref_count;
1185
1186 if (code != NULL)
1187 {
1188 if (code->executable_jit != NULL)
1189 PRIV(jit_free)(code->executable_jit, &code->memctl);
1190
1191 if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1192 {
1193 /* Decoded tables belong to the codes after deserialization, and they must
1194 be freed when there are no more reference to them. The *ref_count should
1195 always be > 0. */
1196
1197 ref_count = (PCRE2_SIZE *)(code->tables + tables_length);
1198 if (*ref_count > 0)
1199 {
1200 (*ref_count)--;
1201 if (*ref_count == 0)
1202 code->memctl.free((void *)code->tables, code->memctl.memory_data);
1203 }
1204 }
1205
1206 code->memctl.free(code, code->memctl.memory_data);
1207 }
1208 }
1209
1210
1211
1212 /*************************************************
1213 * Read a number, possibly signed *
1214 *************************************************/
1215
1216 /* This function is used to read numbers in the pattern. The initial pointer
1217 must be the sign or first digit of the number. When relative values (introduced
1218 by + or -) are allowed, they are relative group numbers, and the result must be
1219 greater than zero.
1220
1221 Arguments:
1222 ptrptr points to the character pointer variable
1223 ptrend points to the end of the input string
1224 allow_sign if < 0, sign not allowed; if >= 0, sign is relative to this
1225 max_value the largest number allowed
1226 max_error the error to give for an over-large number
1227 intptr where to put the result
  errorcodeptr  where to put an error code
1229
1230 Returns: TRUE - a number was read
1231 FALSE - errorcode == 0 => no number was found
1232 errorcode != 0 => an error occurred
1233 */
1234
1235 static BOOL
read_number(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,int32_t allow_sign,uint32_t max_value,uint32_t max_error,int * intptr,int * errorcodeptr)1236 read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1237 uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
1238 {
1239 int sign = 0;
1240 uint32_t n = 0;
1241 PCRE2_SPTR ptr = *ptrptr;
1242 BOOL yield = FALSE;
1243
1244 *errorcodeptr = 0;
1245
1246 if (allow_sign >= 0 && ptr < ptrend)
1247 {
1248 if (*ptr == CHAR_PLUS)
1249 {
1250 sign = +1;
1251 max_value -= allow_sign;
1252 ptr++;
1253 }
1254 else if (*ptr == CHAR_MINUS)
1255 {
1256 sign = -1;
1257 ptr++;
1258 }
1259 }
1260
1261 if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
1262 while (ptr < ptrend && IS_DIGIT(*ptr))
1263 {
1264 n = n * 10 + *ptr++ - CHAR_0;
1265 if (n > max_value)
1266 {
1267 *errorcodeptr = max_error;
1268 goto EXIT;
1269 }
1270 }
1271
1272 if (allow_sign >= 0 && sign != 0)
1273 {
1274 if (n == 0)
1275 {
1276 *errorcodeptr = ERR26; /* +0 and -0 are not allowed */
1277 goto EXIT;
1278 }
1279
1280 if (sign > 0) n += allow_sign;
1281 else if ((int)n > allow_sign)
1282 {
1283 *errorcodeptr = ERR15; /* Non-existent subpattern */
1284 goto EXIT;
1285 }
1286 else n = allow_sign + 1 - n;
1287 }
1288
1289 yield = TRUE;
1290
1291 EXIT:
1292 *intptr = n;
1293 *ptrptr = ptr;
1294 return yield;
1295 }
1296
1297
1298
1299 /*************************************************
1300 * Read repeat counts *
1301 *************************************************/
1302
1303 /* Read an item of the form {n,m} and return the values if non-NULL pointers
1304 are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1305 larger value is used for "unlimited". We have to use signed arguments for
1306 read_number() because it is capable of returning a signed value.
1307
1308 Arguments:
  ptrptr         points to pointer to character after '{'
  ptrend         pointer to end of input
  minp           if not NULL, pointer to uint32_t for min
  maxp           if not NULL, pointer to uint32_t for max; set to
                   REPEAT_UNLIMITED if no max is given
1314 errorcodeptr points to error code variable
1315
1316 Returns: FALSE if not a repeat quantifier, errorcode set zero
1317 FALSE on error, with errorcode set non-zero
1318 TRUE on success, with pointer updated to point after '}'
1319 */
1320
1321 static BOOL
read_repeat_counts(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,uint32_t * minp,uint32_t * maxp,int * errorcodeptr)1322 read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
1323 uint32_t *maxp, int *errorcodeptr)
1324 {
1325 PCRE2_SPTR p = *ptrptr;
1326 BOOL yield = FALSE;
1327 int32_t min = 0;
1328 int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
1329
1330 /* NB read_number() initializes the error code to zero. The only error is for a
1331 number that is too big. */
1332
1333 if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1334 goto EXIT;
1335
1336 if (p >= ptrend) goto EXIT;
1337
1338 if (*p == CHAR_RIGHT_CURLY_BRACKET)
1339 {
1340 p++;
1341 max = min;
1342 }
1343
1344 else
1345 {
1346 if (*p++ != CHAR_COMMA || p >= ptrend) goto EXIT;
1347 if (*p != CHAR_RIGHT_CURLY_BRACKET)
1348 {
1349 if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max,
1350 errorcodeptr) || p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1351 goto EXIT;
1352 if (max < min)
1353 {
1354 *errorcodeptr = ERR4;
1355 goto EXIT;
1356 }
1357 }
1358 p++;
1359 }
1360
1361 yield = TRUE;
1362 if (minp != NULL) *minp = (uint32_t)min;
1363 if (maxp != NULL) *maxp = (uint32_t)max;
1364
1365 /* Update the pattern pointer on success, or after an error, but not when
1366 the result is "not a repeat quantifier". */
1367
1368 EXIT:
1369 if (yield || *errorcodeptr != 0) *ptrptr = p;
1370 return yield;
1371
1372
1373
1374 }
1375
1376
1377
1378 /*************************************************
1379 * Handle escapes *
1380 *************************************************/
1381
1382 /* This function is called when a \ has been encountered. It either returns a
1383 positive value for a simple escape such as \d, or 0 for a data character, which
1384 is placed in chptr. A backreference to group n is returned as negative n. On
1385 entry, ptr is pointing at the character after \. On exit, it points after the
1386 final code unit of the escape sequence.
1387
1388 This function is also called from pcre2_substitute() to handle escape sequences
1389 in replacement strings. In this case, the cb argument is NULL, and in the case
1390 of escapes that have further processing, only sequences that define a data
1391 character are recognised. The isclass argument is not relevant; the options
1392 argument is the final value of the compiled pattern's options.
1393
1394 Arguments:
1395 ptrptr points to the input position pointer
1396 ptrend points to the end of the input
1397 chptr points to a returned data character
1398 errorcodeptr points to the errorcode variable (containing zero)
1399 options the current options bits
1400 isclass TRUE if inside a character class
1401 cb compile data block
1402
1403 Returns: zero => a data character
1404 positive => a special escape sequence
1405 negative => a numerical back reference
1406 on error, errorcodeptr is set non-zero
1407 */
1408
1409 int
PRIV(check_escape)1410 PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1411 int *errorcodeptr, uint32_t options, BOOL isclass, compile_block *cb)
1412 {
1413 BOOL utf = (options & PCRE2_UTF) != 0;
1414 PCRE2_SPTR ptr = *ptrptr;
1415 uint32_t c, cc;
1416 int escape = 0;
1417 int i;
1418
1419 /* If backslash is at the end of the string, it's an error. */
1420
1421 if (ptr >= ptrend)
1422 {
1423 *errorcodeptr = ERR1;
1424 return 0;
1425 }
1426
1427 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
1428 *errorcodeptr = 0; /* Be optimistic */
1429
1430 /* Non-alphanumerics are literals, so we just leave the value in c. An initial
1431 value test saves a memory lookup for code points outside the alphanumeric
1432 range. Otherwise, do a table lookup. A non-zero result is something that can be
1433 returned immediately. Otherwise further processing is required. */
1434
1435 if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {} /* Definitely literal */
1436
1437 else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1438 {
1439 if (i > 0) c = (uint32_t)i; else /* Positive is a data character */
1440 {
1441 escape = -i; /* Else return a special escape */
1442 if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
1443 cb->external_flags |= PCRE2_HASBKPORX; /* Note \P, \p, or \X */
1444
1445 /* Perl supports \N{name} for character names and \N{U+dddd} for numerical
1446 Unicode code points, as well as plain \N for "not newline". PCRE does not
1447 support \N{name}. However, it does support quantification such as \N{2,3},
1448 so if \N{ is not followed by U+dddd we check for a quantifier. */
1449
1450 if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1451 {
1452 PCRE2_SPTR p = ptr + 1;
1453
1454 /* \N{U+ can be handled by the \x{ code. However, this construction is
1455 not valid in EBCDIC environments because it specifies a Unicode
1456 character, not a codepoint in the local code. For example \N{U+0041}
1457 must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1458 casing semantics for the entire pattern, so allow it only in UTF (i.e.
1459 Unicode) mode. */
1460
1461 if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
1462 {
1463 #ifdef EBCDIC
1464 *errorcodeptr = ERR93;
1465 #else
1466 if (utf)
1467 {
1468 ptr = p + 1;
1469 escape = 0; /* Not a fancy escape after all */
1470 goto COME_FROM_NU;
1471 }
1472 else *errorcodeptr = ERR93;
1473 #endif
1474 }
1475
1476 /* Give an error if what follows is not a quantifier, but don't override
1477 an error set by the quantifier reader (e.g. number overflow). */
1478
1479 else
1480 {
1481 if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1482 *errorcodeptr == 0)
1483 *errorcodeptr = ERR37;
1484 }
1485 }
1486 }
1487 }
1488
1489 /* Escapes that need further processing, including those that are unknown.
1490 When called from pcre2_substitute(), only \c, \o, and \x are recognized (and \u
1491 when BSUX is set). */
1492
1493 else
1494 {
1495 PCRE2_SPTR oldptr;
1496 BOOL overflow;
1497 int s;
1498
1499 /* Filter calls from pcre2_substitute(). */
1500
1501 if (cb == NULL && c != CHAR_c && c != CHAR_o && c != CHAR_x &&
1502 (c != CHAR_u || (options & PCRE2_ALT_BSUX) != 0))
1503 {
1504 *errorcodeptr = ERR3;
1505 return 0;
1506 }
1507
1508 switch (c)
1509 {
1510 /* A number of Perl escapes are not handled by PCRE. We give an explicit
1511 error. */
1512
1513 case CHAR_F:
1514 case CHAR_l:
1515 case CHAR_L:
1516 *errorcodeptr = ERR37;
1517 break;
1518
1519 /* \u is unrecognized when PCRE2_ALT_BSUX is not set. When it is treated
1520 specially, \u must be followed by four hex digits. Otherwise it is a
1521 lowercase u letter. */
1522
1523 case CHAR_u:
1524 if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37; else
1525 {
1526 uint32_t xc;
1527 if (ptrend - ptr < 4) break; /* Less than 4 chars */
1528 if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
1529 if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
1530 cc = (cc << 4) | xc;
1531 if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */
1532 cc = (cc << 4) | xc;
1533 if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */
1534 c = (cc << 4) | xc;
1535 ptr += 4;
1536 if (utf)
1537 {
1538 if (c > 0x10ffffU) *errorcodeptr = ERR77;
1539 else
1540 if (c >= 0xd800 && c <= 0xdfff &&
1541 (cb->cx->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1542 *errorcodeptr = ERR73;
1543 }
1544 else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1545 }
1546 break;
1547
1548 /* \U is unrecognized unless PCRE2_ALT_BSUX is set, in which case it is an
1549 upper case letter. */
1550
1551 case CHAR_U:
1552 if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37;
1553 break;
1554
1555 /* In a character class, \g is just a literal "g". Outside a character
1556 class, \g must be followed by one of a number of specific things:
1557
1558 (1) A number, either plain or braced. If positive, it is an absolute
1559 backreference. If negative, it is a relative backreference. This is a Perl
1560 5.10 feature.
1561
1562 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1563 is part of Perl's movement towards a unified syntax for back references. As
1564 this is synonymous with \k{name}, we fudge it up by pretending it really
1565 was \k{name}.
1566
1567 (3) For Oniguruma compatibility we also support \g followed by a name or a
1568 number either in angle brackets or in single quotes. However, these are
1569 (possibly recursive) subroutine calls, _not_ backreferences. We return
1570 the ESC_g code.
1571
1572 Summary: Return a negative number for a numerical back reference, ESC_k for
1573 a named back reference, and ESC_g for a named or numbered subroutine call.
1574 */
1575
1576 case CHAR_g:
1577 if (isclass) break;
1578
1579 if (ptr >= ptrend)
1580 {
1581 *errorcodeptr = ERR57;
1582 break;
1583 }
1584
1585 if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
1586 {
1587 escape = ESC_g;
1588 break;
1589 }
1590
1591 /* If there is a brace delimiter, try to read a numerical reference. If
1592 there isn't one, assume we have a name and treat it as \k. */
1593
1594 if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1595 {
1596 PCRE2_SPTR p = ptr + 1;
1597 if (!read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1598 errorcodeptr))
1599 {
1600 if (*errorcodeptr == 0) escape = ESC_k; /* No number found */
1601 break;
1602 }
1603 if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1604 {
1605 *errorcodeptr = ERR57;
1606 break;
1607 }
1608 ptr = p + 1;
1609 }
1610
1611 /* Read an undelimited number */
1612
1613 else
1614 {
1615 if (!read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1616 errorcodeptr))
1617 {
1618 if (*errorcodeptr == 0) *errorcodeptr = ERR57; /* No number found */
1619 break;
1620 }
1621 }
1622
1623 if (s <= 0)
1624 {
1625 *errorcodeptr = ERR15;
1626 break;
1627 }
1628
1629 escape = -s;
1630 break;
1631
1632 /* The handling of escape sequences consisting of a string of digits
1633 starting with one that is not zero is not straightforward. Perl has changed
1634 over the years. Nowadays \g{} for backreferences and \o{} for octal are
1635 recommended to avoid the ambiguities in the old syntax.
1636
1637 Outside a character class, the digits are read as a decimal number. If the
1638 number is less than 10, or if there are that many previous extracting left
1639 brackets, it is a back reference. Otherwise, up to three octal digits are
1640 read to form an escaped character code. Thus \123 is likely to be octal 123
1641 (cf \0123, which is octal 012 followed by the literal 3).
1642
1643 Inside a character class, \ followed by a digit is always either a literal
1644 8 or 9 or an octal number. */
1645
1646 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1647 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1648
1649 if (!isclass)
1650 {
1651 oldptr = ptr;
1652 ptr--; /* Back to the digit */
1653 if (!read_number(&ptr, ptrend, -1, INT_MAX/10 - 1, ERR61, &s,
1654 errorcodeptr))
1655 break;
1656
1657 /* \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1658 are octal escapes if there are not that many previous captures. */
1659
1660 if (s < 10 || oldptr[-1] >= CHAR_8 || s <= (int)cb->bracount)
1661 {
1662 if (s > (int)MAX_GROUP_NUMBER) *errorcodeptr = ERR61;
1663 else escape = -s; /* Indicates a back reference */
1664 break;
1665 }
1666 ptr = oldptr; /* Put the pointer back and fall through */
1667 }
1668
1669 /* Handle a digit following \ when the number is not a back reference, or
1670 we are within a character class. If the first digit is 8 or 9, Perl used to
1671 generate a binary zero and then treat the digit as a following literal. At
1672 least by Perl 5.18 this changed so as not to insert the binary zero. */
1673
1674 if (c >= CHAR_8) break;
1675
1676 /* Fall through */
1677
1678 /* \0 always starts an octal number, but we may drop through to here with a
1679 larger first octal digit. The original code used just to take the least
1680 significant 8 bits of octal numbers (I think this is what early Perls used
1681 to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1682 but no more than 3 octal digits. */
1683
1684 case CHAR_0:
1685 c -= CHAR_0;
1686 while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1687 c = c * 8 + *ptr++ - CHAR_0;
1688 #if PCRE2_CODE_UNIT_WIDTH == 8
1689 if (!utf && c > 0xff) *errorcodeptr = ERR51;
1690 #endif
1691 break;
1692
1693 /* \o is a relatively new Perl feature, supporting a more general way of
1694 specifying character codes in octal. The only supported form is \o{ddd}. */
1695
1696 case CHAR_o:
1697 if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET)
1698 {
1699 ptr--;
1700 *errorcodeptr = ERR55;
1701 }
1702 else if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1703 *errorcodeptr = ERR78;
1704 else
1705 {
1706 c = 0;
1707 overflow = FALSE;
1708 while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1709 {
1710 cc = *ptr++;
1711 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1712 #if PCRE2_CODE_UNIT_WIDTH == 32
1713 if (c >= 0x20000000l) { overflow = TRUE; break; }
1714 #endif
1715 c = (c << 3) + (cc - CHAR_0);
1716 #if PCRE2_CODE_UNIT_WIDTH == 8
1717 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1718 #elif PCRE2_CODE_UNIT_WIDTH == 16
1719 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1720 #elif PCRE2_CODE_UNIT_WIDTH == 32
1721 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1722 #endif
1723 }
1724 if (overflow)
1725 {
1726 while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1727 *errorcodeptr = ERR34;
1728 }
1729 else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1730 {
1731 if (utf && c >= 0xd800 && c <= 0xdfff && (cb == NULL ||
1732 (cb->cx->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0))
1733 {
1734 ptr--;
1735 *errorcodeptr = ERR73;
1736 }
1737 }
1738 else
1739 {
1740 ptr--;
1741 *errorcodeptr = ERR64;
1742 }
1743 }
1744 break;
1745
1746 /* \x is complicated. When PCRE2_ALT_BSUX is set, \x must be followed by
1747 two hexadecimal digits. Otherwise it is a lowercase x letter. */
1748
1749 case CHAR_x:
1750 if ((options & PCRE2_ALT_BSUX) != 0)
1751 {
1752 uint32_t xc;
1753 if (ptrend - ptr < 2) break; /* Less than 2 characters */
1754 if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
1755 if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
1756 c = (cc << 4) | xc;
1757 ptr += 2;
1758 } /* End PCRE2_ALT_BSUX handling */
1759
1760 /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1761 greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
1762 digits. If not, { used to be treated as a data character. However, Perl
1763 seems to read hex digits up to the first non-such, and ignore the rest, so
1764 that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1765 now gives an error. */
1766
1767 else
1768 {
1769 if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1770 {
1771 #ifndef EBCDIC
1772 COME_FROM_NU:
1773 #endif
1774 if (++ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1775 {
1776 *errorcodeptr = ERR78;
1777 break;
1778 }
1779 c = 0;
1780 overflow = FALSE;
1781
1782 while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
1783 {
1784 ptr++;
1785 if (c == 0 && cc == 0) continue; /* Leading zeroes */
1786 #if PCRE2_CODE_UNIT_WIDTH == 32
1787 if (c >= 0x10000000l) { overflow = TRUE; break; }
1788 #endif
1789 c = (c << 4) | cc;
1790 if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
1791 {
1792 overflow = TRUE;
1793 break;
1794 }
1795 }
1796
1797 if (overflow)
1798 {
1799 while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
1800 *errorcodeptr = ERR34;
1801 }
1802 else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1803 {
1804 if (utf && c >= 0xd800 && c <= 0xdfff && (cb == NULL ||
1805 (cb->cx->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0))
1806 {
1807 ptr--;
1808 *errorcodeptr = ERR73;
1809 }
1810 }
1811
1812 /* If the sequence of hex digits does not end with '}', give an error.
1813 We used just to recognize this construct and fall through to the normal
1814 \x handling, but nowadays Perl gives an error, which seems much more
1815 sensible, so we do too. */
1816
1817 else
1818 {
1819 ptr--;
1820 *errorcodeptr = ERR67;
1821 }
1822 } /* End of \x{} processing */
1823
1824 /* Read up to two hex digits after \x */
1825
1826 else
1827 {
1828 c = 0;
1829 if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */
1830 ptr++;
1831 c = cc;
1832 if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */
1833 ptr++;
1834 c = (c << 4) | cc;
1835 } /* End of \xdd handling */
1836 } /* End of Perl-style \x handling */
1837 break;
1838
1839 /* The handling of \c is different in ASCII and EBCDIC environments. In an
1840 ASCII (or Unicode) environment, an error is given if the character
1841 following \c is not a printable ASCII character. Otherwise, the following
1842 character is upper-cased if it is a letter, and after that the 0x40 bit is
1843 flipped. The result is the value of the escape.
1844
1845 In an EBCDIC environment the handling of \c is compatible with the
1846 specification in the perlebcdic document. The following character must be
1847 a letter or one of small number of special characters. These provide a
1848 means of defining the character values 0-31.
1849
1850 For testing the EBCDIC handling of \c in an ASCII environment, recognize
1851 the EBCDIC value of 'c' explicitly. */
1852
1853 #if defined EBCDIC && 'a' != 0x81
1854 case 0x83:
1855 #else
1856 case CHAR_c:
1857 #endif
1858 if (ptr >= ptrend)
1859 {
1860 *errorcodeptr = ERR2;
1861 break;
1862 }
1863 c = *ptr;
1864 if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
1865
1866 /* Handle \c in an ASCII/Unicode environment. */
1867
1868 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1869 if (c < 32 || c > 126) /* Excludes all non-printable ASCII */
1870 {
1871 *errorcodeptr = ERR68;
1872 break;
1873 }
1874 c ^= 0x40;
1875
1876 /* Handle \c in an EBCDIC environment. The special case \c? is converted to
1877 255 (0xff) or 95 (0x5f) if other characters suggest we are using the POSIX-BC
1878 encoding. (This is the way Perl indicates that it handles \c?.) The other
1879 valid sequences correspond to a list of specific characters. */
1880
1881 #else
1882 if (c == CHAR_QUESTION_MARK)
1883 c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
1884 else
1885 {
1886 for (i = 0; i < 32; i++)
1887 {
1888 if (c == ebcdic_escape_c[i]) break;
1889 }
1890 if (i < 32) c = i; else *errorcodeptr = ERR68;
1891 }
1892 #endif /* EBCDIC */
1893
1894 ptr++;
1895 break;
1896
1897 /* Any other alphanumeric following \ is an error. Perl gives an error only
1898 if in warning mode, but PCRE doesn't have a warning mode. */
1899
1900 default:
1901 *errorcodeptr = ERR3;
1902 *ptrptr = ptr - 1; /* Point to the character at fault */
1903 return 0;
1904 }
1905 }
1906
1907 /* Set the pointer to the next character before returning. */
1908
1909 *ptrptr = ptr;
1910 *chptr = c;
1911 return escape;
1912 }
1913
1914
1915
1916 #ifdef SUPPORT_UNICODE
1917 /*************************************************
1918 * Handle \P and \p *
1919 *************************************************/
1920
1921 /* This function is called after \P or \p has been encountered, provided that
1922 PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
1923 contents of ptrptr are pointing after the P or p. On exit, it is left pointing
1924 after the final code unit of the escape sequence.
1925
1926 Arguments:
1927 ptrptr the pattern position pointer
1928 negptr a boolean that is set TRUE for negation else FALSE
1929 ptypeptr an unsigned int that is set to the type value
1930 pdataptr an unsigned int that is set to the detailed property value
1931 errorcodeptr the error code variable
1932 cb the compile data
1933
1934 Returns: TRUE if the type value was found, or FALSE for an invalid type
1935 */
1936
1937 static BOOL
get_ucp(PCRE2_SPTR * ptrptr,BOOL * negptr,uint16_t * ptypeptr,uint16_t * pdataptr,int * errorcodeptr,compile_block * cb)1938 get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
1939 uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
1940 {
1941 PCRE2_UCHAR c;
1942 PCRE2_SIZE i, bot, top;
1943 PCRE2_SPTR ptr = *ptrptr;
1944 PCRE2_UCHAR name[32];
1945
1946 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
1947 c = *ptr++;
1948 *negptr = FALSE;
1949
1950 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1951 negation. */
1952
1953 if (c == CHAR_LEFT_CURLY_BRACKET)
1954 {
1955 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
1956 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
1957 {
1958 *negptr = TRUE;
1959 ptr++;
1960 }
1961 for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
1962 {
1963 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
1964 c = *ptr++;
1965 if (c == CHAR_NUL) goto ERROR_RETURN;
1966 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1967 name[i] = c;
1968 }
1969 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1970 name[i] = 0;
1971 }
1972
1973 /* Otherwise there is just one following character, which must be an ASCII
1974 letter. */
1975
1976 else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
1977 {
1978 name[0] = c;
1979 name[1] = 0;
1980 }
1981 else goto ERROR_RETURN;
1982
1983 *ptrptr = ptr;
1984
1985 /* Search for a recognized property name using binary chop. */
1986
1987 bot = 0;
1988 top = PRIV(utt_size);
1989
1990 while (bot < top)
1991 {
1992 int r;
1993 i = (bot + top) >> 1;
1994 r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1995 if (r == 0)
1996 {
1997 *ptypeptr = PRIV(utt)[i].type;
1998 *pdataptr = PRIV(utt)[i].value;
1999 return TRUE;
2000 }
2001 if (r > 0) bot = i + 1; else top = i;
2002 }
2003 *errorcodeptr = ERR47; /* Unrecognized name */
2004 return FALSE;
2005
2006 ERROR_RETURN: /* Malformed \P or \p */
2007 *errorcodeptr = ERR46;
2008 *ptrptr = ptr;
2009 return FALSE;
2010 }
2011 #endif
2012
2013
2014
2015 /*************************************************
2016 * Check for POSIX class syntax *
2017 *************************************************/
2018
2019 /* This function is called when the sequence "[:" or "[." or "[=" is
2020 encountered in a character class. It checks whether this is followed by a
2021 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2022 reach an unescaped ']' without the special preceding character, return FALSE.
2023
2024 Originally, this function only recognized a sequence of letters between the
2025 terminators, but it seems that Perl recognizes any sequence of characters,
2026 though of course unknown POSIX names are subsequently rejected. Perl gives an
2027 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2028 didn't consider this to be a POSIX class. Likewise for [:1234:].
2029
2030 The problem in trying to be exactly like Perl is in the handling of escapes. We
2031 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2032 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2033 below handles the special cases \\ and \], but does not try to do any other
2034 escape processing. This makes it different from Perl for cases such as
2035 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2036 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2037 when Perl does, I think.
2038
2039 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2040 It seems that the appearance of a nested POSIX class supersedes an apparent
2041 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2042 a digit. This is handled by returning FALSE if the start of a new group with
2043 the same terminator is encountered, since the next closing sequence must close
2044 the nested group, not the outer one.
2045
2046 In Perl, unescaped square brackets may also appear as part of class names. For
2047 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2048 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2049 seem right at all. PCRE does not allow closing square brackets in POSIX class
2050 names.
2051
2052 Arguments:
2053 ptr pointer to the character after the initial [ (colon, dot, equals)
2054 ptrend pointer to the end of the pattern
2055 endptr where to return a pointer to the terminating ':', '.', or '='
2056
2057 Returns: TRUE or FALSE
2058 */
2059
2060 static BOOL
check_posix_syntax(PCRE2_SPTR ptr,PCRE2_SPTR ptrend,PCRE2_SPTR * endptr)2061 check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
2062 {
2063 PCRE2_UCHAR terminator; /* Don't combine these lines; the Solaris cc */
2064 terminator = *ptr++; /* compiler warns about "non-constant" initializer. */
2065
2066 for (; ptrend - ptr >= 2; ptr++)
2067 {
2068 if (*ptr == CHAR_BACKSLASH &&
2069 (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2070 ptr++;
2071
2072 else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2073 *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2074
2075 else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2076 {
2077 *endptr = ptr;
2078 return TRUE;
2079 }
2080 }
2081
2082 return FALSE;
2083 }
2084
2085
2086
2087 /*************************************************
2088 * Check POSIX class name *
2089 *************************************************/
2090
2091 /* This function is called to check the name given in a POSIX-style class entry
2092 such as [:alnum:].
2093
2094 Arguments:
2095 ptr points to the first letter
2096 len the length of the name
2097
2098 Returns: a value representing the name, or -1 if unknown
2099 */
2100
2101 static int
check_posix_name(PCRE2_SPTR ptr,int len)2102 check_posix_name(PCRE2_SPTR ptr, int len)
2103 {
2104 const char *pn = posix_names;
2105 int yield = 0;
2106 while (posix_name_lengths[yield] != 0)
2107 {
2108 if (len == posix_name_lengths[yield] &&
2109 PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2110 pn += posix_name_lengths[yield] + 1;
2111 yield++;
2112 }
2113 return -1;
2114 }
2115
2116
2117
2118 /*************************************************
2119 * Read a subpattern or VERB name *
2120 *************************************************/
2121
2122 /* This function is called from parse_regex() below whenever it needs to read
2123 the name of a subpattern or a (*VERB). The initial pointer must be to the
2124 character before the name. If that character is '*' we are reading a verb name.
2125 The pointer is updated to point after the name, for a VERB, or after the name's
2126 terminator for a subpattern name. Returning both the offset and the name
2127 pointer is redundant information, but some callers use one and some the other,
2128 so it is simplest just to return both.
2129
2130 Arguments:
2131 ptrptr points to the character pointer variable
2132 ptrend points to the end of the input string
2133 terminator the terminator of a subpattern name must be this
2134 offsetptr where to put the offset from the start of the pattern
2135 nameptr where to put a pointer to the name in the input
2136 namelenptr where to put the length of the name
2137 errorcodeptr where to put an error code
2138 cb pointer to the compile data block
2139
2140 Returns: TRUE if a name was read
2141 FALSE otherwise, with error code set
2142 */
2143
2144 static BOOL
read_name(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,uint32_t terminator,PCRE2_SIZE * offsetptr,PCRE2_SPTR * nameptr,uint32_t * namelenptr,int * errorcodeptr,compile_block * cb)2145 read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t terminator,
2146 PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
2147 int *errorcodeptr, compile_block *cb)
2148 {
2149 PCRE2_SPTR ptr = *ptrptr;
2150 BOOL is_verb = (*ptr == CHAR_ASTERISK);
2151 uint32_t namelen = 0;
2152 uint32_t ctype = is_verb? ctype_letter : ctype_word;
2153
2154 if (++ptr >= ptrend)
2155 {
2156 *errorcodeptr = is_verb? ERR60: /* Verb not recognized or malformed */
2157 ERR62; /* Subpattern name expected */
2158 goto FAILED;
2159 }
2160
2161 *nameptr = ptr;
2162 *offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
2163
2164 if (IS_DIGIT(*ptr))
2165 {
2166 *errorcodeptr = ERR44; /* Group name must not start with digit */
2167 goto FAILED;
2168 }
2169
2170 while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype) != 0)
2171 {
2172 ptr++;
2173 namelen++;
2174 if (namelen > MAX_NAME_SIZE)
2175 {
2176 *errorcodeptr = ERR48;
2177 goto FAILED;
2178 }
2179 }
2180
2181 /* Subpattern names must not be empty, and their terminator is checked here.
2182 (What follows a verb name is checked separately.) */
2183
2184 if (!is_verb)
2185 {
2186 if (namelen == 0)
2187 {
2188 *errorcodeptr = ERR62; /* Subpattern name expected */
2189 goto FAILED;
2190 }
2191 if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
2192 {
2193 *errorcodeptr = ERR42;
2194 goto FAILED;
2195 }
2196 ptr++;
2197 }
2198
2199 *namelenptr = namelen;
2200 *ptrptr = ptr;
2201 return TRUE;
2202
2203 FAILED:
2204 *ptrptr = ptr;
2205 return FALSE;
2206 }
2207
2208
2209
2210 /*************************************************
2211 * Manage callouts at start of cycle *
2212 *************************************************/
2213
2214 /* At the start of a new item in parse_regex() we are able to record the
2215 details of the previous item in a prior callout, and also to set up an
2216 automatic callout if enabled. Avoid having two adjacent automatic callouts,
2217 which would otherwise happen for items such as \Q that contribute nothing to
2218 the parsed pattern.
2219
2220 Arguments:
2221 ptr current pattern pointer
2222 pcalloutptr points to a pointer to previous callout, or NULL
2223 auto_callout TRUE if auto_callouts are enabled
2224 parsed_pattern the parsed pattern pointer
2225 cb compile block
2226
2227 Returns: possibly updated parsed_pattern pointer.
2228 */
2229
2230 static uint32_t *
manage_callouts(PCRE2_SPTR ptr,uint32_t ** pcalloutptr,BOOL auto_callout,uint32_t * parsed_pattern,compile_block * cb)2231 manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2232 uint32_t *parsed_pattern, compile_block *cb)
2233 {
2234 uint32_t *previous_callout = *pcalloutptr;
2235
2236 if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
2237 cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
2238
2239 if (!auto_callout) previous_callout = NULL; else
2240 {
2241 if (previous_callout == NULL ||
2242 previous_callout != parsed_pattern - 4 ||
2243 previous_callout[3] != 255)
2244 {
2245 previous_callout = parsed_pattern; /* Set up new automatic callout */
2246 parsed_pattern += 4;
2247 previous_callout[0] = META_CALLOUT_NUMBER;
2248 previous_callout[2] = 0;
2249 previous_callout[3] = 255;
2250 }
2251 previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
2252 }
2253
2254 *pcalloutptr = previous_callout;
2255 return parsed_pattern;
2256 }
2257
2258
2259
2260 /*************************************************
2261 * Parse regex and identify named groups *
2262 *************************************************/
2263
2264 /* This function is called first of all. It scans the pattern and does two
2265 things: (1) It identifies capturing groups and makes a table of named capturing
2266 groups so that information about them is fully available to both the compiling
2267 scans. (2) It writes a parsed version of the pattern with comments omitted and
2268 escapes processed into the parsed_pattern vector.
2269
2270 Arguments:
2271 ptr points to the start of the pattern
2272 options compiling dynamic options (may change during the scan)
2273 has_lookbehind points to a boolean, set TRUE if a lookbehind is found
2274 cb pointer to the compile data block
2275
2276 Returns: zero on success or a non-zero error code, with the
2277 error offset placed in the cb field
2278 */
2279
2280 /* A structure and some flags for dealing with nested groups. */
2281
typedef struct nest_save {
  uint16_t  nest_depth;
  uint16_t  reset_group;
  uint16_t  max_group;
  uint16_t  flags;
  uint32_t  options;
} nest_save;

#define NSF_RESET          0x0001u
#define NSF_CONDASSERT     0x0002u

/* Options that are changeable within the pattern must be tracked during
parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,
but all must be tracked so that META_OPTIONS items set the correct values for
the main compiling phase. */

#define PARSE_TRACKED_OPTIONS (PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_DUPNAMES| \
  PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| \
  PCRE2_UNGREEDY)

/* States used for analyzing ranges in character classes. The two OK values
must be last. */

enum { RANGE_NO, RANGE_STARTED, RANGE_OK_ESCAPED, RANGE_OK_LITERAL };

/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
the storing of literal values in the parsed pattern. It stores through (and
advances) the pointer p, and sets the variable okquantifier, which must be in
scope at the point of use.

Both variants are a single braced compound statement, so that a use such as

  if (cond) PARSED_LITERAL(c, p);

guards the whole expansion. Previously the non-32-bit variant was two bare
statements, which would have left "okquantifier = TRUE" outside such an "if".
The arguments are parenthesized; c is evaluated twice in the 32-bit variant, so
it should not have side effects. */

#if PCRE2_CODE_UNIT_WIDTH == 32
#define PARSED_LITERAL(c, p) \
  { \
  if ((c) >= META_END) *(p)++ = META_BIGVALUE; \
  *(p)++ = (c); \
  okquantifier = TRUE; \
  }
#else
#define PARSED_LITERAL(c, p) \
  { \
  *(p)++ = (c); \
  okquantifier = TRUE; \
  }
#endif
2320
2321 /* Here's the actual function. */
2322
parse_regex(PCRE2_SPTR ptr,uint32_t options,BOOL * has_lookbehind,compile_block * cb)2323 static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind,
2324 compile_block *cb)
2325 {
2326 uint32_t c;
2327 uint32_t delimiter;
2328 uint32_t namelen;
2329 uint32_t class_range_state;
2330 uint32_t *verblengthptr = NULL; /* Value avoids compiler warning */
2331 uint32_t *previous_callout = NULL;
2332 uint32_t *parsed_pattern = cb->parsed_pattern;
2333 uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
2334 uint32_t meta_quantifier = 0;
2335 uint32_t add_after_mark = 0;
2336 uint16_t nest_depth = 0;
2337 int after_manual_callout = 0;
2338 int expect_cond_assert = 0;
2339 int errorcode = 0;
2340 int escape;
2341 int i;
2342 BOOL inescq = FALSE;
2343 BOOL inverbname = FALSE;
2344 BOOL utf = (options & PCRE2_UTF) != 0;
2345 BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
2346 BOOL isdupname;
2347 BOOL negate_class;
2348 BOOL okquantifier = FALSE;
2349 PCRE2_SPTR thisptr;
2350 PCRE2_SPTR name;
2351 PCRE2_SPTR ptrend = cb->end_pattern;
2352 PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */
2353 named_group *ng;
2354 nest_save *top_nest, *end_nests;
2355
2356 /* Insert leading items for word and line matching (features provided for the
2357 benefit of pcre2grep). */
2358
2359 if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
2360 {
2361 *parsed_pattern++ = META_CIRCUMFLEX;
2362 *parsed_pattern++ = META_NOCAPTURE;
2363 }
2364 else if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
2365 {
2366 *parsed_pattern++ = META_ESCAPE + ESC_b;
2367 *parsed_pattern++ = META_NOCAPTURE;
2368 }
2369
2370 /* If the pattern is actually a literal string, process it separately to avoid
2371 cluttering up the main loop. */
2372
2373 if ((options & PCRE2_LITERAL) != 0)
2374 {
2375 while (ptr < ptrend)
2376 {
2377 if (parsed_pattern >= parsed_pattern_end)
2378 {
2379 errorcode = ERR63; /* Internal error (parsed pattern overflow) */
2380 goto FAILED;
2381 }
2382 thisptr = ptr;
2383 GETCHARINCTEST(c, ptr);
2384 if (auto_callout)
2385 parsed_pattern = manage_callouts(thisptr, &previous_callout,
2386 auto_callout, parsed_pattern, cb);
2387 PARSED_LITERAL(c, parsed_pattern);
2388 }
2389 goto PARSED_END;
2390 }
2391
2392 /* Process a real regex which may contain meta-characters. */
2393
2394 top_nest = NULL;
2395 end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
2396
2397 /* The size of the nest_save structure might not be a factor of the size of the
2398 workspace. Therefore we must round down end_nests so as to correctly avoid
2399 creating a nest_save that spans the end of the workspace. */
2400
2401 end_nests = (nest_save *)((char *)end_nests -
2402 ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
2403
2404 /* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
2405
2406 if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
2407
2408 /* Now scan the pattern */
2409
2410 while (ptr < ptrend)
2411 {
2412 int prev_expect_cond_assert;
2413 uint32_t min_repeat, max_repeat;
2414 uint32_t set, unset, *optset;
2415 uint32_t terminator;
2416 uint32_t prev_meta_quantifier;
2417 BOOL prev_okquantifier;
2418 PCRE2_SPTR tempptr;
2419 PCRE2_SIZE offset;
2420
2421 if (parsed_pattern >= parsed_pattern_end)
2422 {
2423 errorcode = ERR63; /* Internal error (parsed pattern overflow) */
2424 goto FAILED;
2425 }
2426
2427 if (nest_depth > cb->cx->parens_nest_limit)
2428 {
2429 errorcode = ERR19;
2430 goto FAILED; /* Parentheses too deeply nested */
2431 }
2432
2433 /* Get next input character, save its position for callout handling. */
2434
2435 thisptr = ptr;
2436 GETCHARINCTEST(c, ptr);
2437
2438 /* Copy quoted literals until \E, allowing for the possibility of automatic
2439 callouts, except when processing a (*VERB) "name". */
2440
2441 if (inescq)
2442 {
2443 if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
2444 {
2445 inescq = FALSE;
2446 ptr++; /* Skip E */
2447 }
2448 else
2449 {
2450 if (expect_cond_assert > 0) /* A literal is not allowed if we are */
2451 { /* expecting a conditional assertion, */
2452 ptr--; /* but an empty \Q\E sequence is OK. */
2453 errorcode = ERR28;
2454 goto FAILED;
2455 }
2456 if (!inverbname && after_manual_callout-- <= 0)
2457 parsed_pattern = manage_callouts(thisptr, &previous_callout,
2458 auto_callout, parsed_pattern, cb);
2459 PARSED_LITERAL(c, parsed_pattern);
2460 meta_quantifier = 0;
2461 }
2462 continue; /* Next character */
2463 }
2464
2465 /* If we are processing the "name" part of a (*VERB:NAME) item, all
2466 characters up to the closing parenthesis are literals except when
2467 PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
2468 and \E and escaped characters are allowed (no character types such as \d). If
2469 PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
2470 this by not entering the special (*VERB:NAME) processing - they are then
2471 picked up below. Note that c is a character, not a code unit, so we must not
2472 use MAX_255 to test its size because MAX_255 tests code units and is assumed
2473 TRUE in 8-bit mode. */
2474
2475 if (inverbname &&
2476 (
2477 /* EITHER: not both options set */
2478 ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
2479 (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
2480 #ifdef SUPPORT_UNICODE
2481 /* OR: character > 255 AND not Unicode Pattern White Space */
2482 (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
2483 #endif
2484 /* OR: not a # comment or isspace() white space */
2485 (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
2486 #ifdef SUPPORT_UNICODE
2487 /* and not CHAR_NEL when Unicode is supported */
2488 && c != CHAR_NEL
2489 #endif
2490 )))
2491 {
2492 PCRE2_SIZE verbnamelength;
2493
2494 switch(c)
2495 {
2496 default:
2497 PARSED_LITERAL(c, parsed_pattern);
2498 break;
2499
2500 case CHAR_RIGHT_PARENTHESIS:
2501 inverbname = FALSE;
2502 okquantifier = FALSE; /* Was probably set by literals */
2503 /* This is the length in characters */
2504 verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
2505 /* But the limit on the length is in code units */
2506 if (ptr - verbnamestart - 1 > (int)MAX_MARK)
2507 {
2508 ptr--;
2509 errorcode = ERR76;
2510 goto FAILED;
2511 }
2512 *verblengthptr = (uint32_t)verbnamelength;
2513
2514 /* If this name was on a verb such as (*ACCEPT) which does not continue,
2515 a (*MARK) was generated for the name. We now add the original verb as the
2516 next item. */
2517
2518 if (add_after_mark != 0)
2519 {
2520 *parsed_pattern++ = add_after_mark;
2521 add_after_mark = 0;
2522 }
2523 break;
2524
2525 case CHAR_BACKSLASH:
2526 if ((options & PCRE2_ALT_VERBNAMES) != 0)
2527 {
2528 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
2529 FALSE, cb);
2530 if (errorcode != 0) goto FAILED;
2531 }
2532 else escape = 0; /* Treat all as literal */
2533
2534 switch(escape)
2535 {
2536 case 0:
2537 PARSED_LITERAL(c, parsed_pattern);
2538 break;
2539
2540 case ESC_Q:
2541 inescq = TRUE;
2542 break;
2543
2544 case ESC_E: /* Ignore */
2545 break;
2546
2547 default:
2548 errorcode = ERR40; /* Invalid in verb name */
2549 goto FAILED;
2550 }
2551 }
2552 continue; /* Next character in pattern */
2553 }
2554
2555 /* Not a verb name character. At this point we must process everything that
2556 must not change the quantification state. This is mainly comments, but we
2557 handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
2558 A+, as in Perl. An isolated \E is ignored. */
2559
2560 if (c == CHAR_BACKSLASH && ptr < ptrend)
2561 {
2562 if (*ptr == CHAR_Q || *ptr == CHAR_E)
2563 {
2564 inescq = *ptr == CHAR_Q;
2565 ptr++;
2566 continue;
2567 }
2568 }
2569
2570 /* Skip over whitespace and # comments in extended mode. Note that c is a
2571 character, not a code unit, so we must not use MAX_255 to test its size
2572 because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
2573 whitespace characters are those designated as "Pattern White Space" by
2574 Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
2575 U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
2576 subset of space characters that match \h and \v. */
2577
2578 if ((options & PCRE2_EXTENDED) != 0)
2579 {
2580 if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
2581 #ifdef SUPPORT_UNICODE
2582 if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
2583 #endif
2584 if (c == CHAR_NUMBER_SIGN)
2585 {
2586 while (ptr < ptrend)
2587 {
2588 if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */
2589 { /* IS_NEWLINE sets cb->nllen. */
2590 ptr += cb->nllen;
2591 break;
2592 }
2593 ptr++;
2594 #ifdef SUPPORT_UNICODE
2595 if (utf) FORWARDCHARTEST(ptr, ptrend);
2596 #endif
2597 }
2598 continue; /* Next character in pattern */
2599 }
2600 }
2601
2602 /* Skip over bracketed comments */
2603
2604 if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
2605 ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
2606 {
2607 while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
2608 if (ptr >= ptrend)
2609 {
2610 errorcode = ERR18; /* A special error for missing ) in a comment */
2611 goto FAILED; /* to make it easier to debug. */
2612 }
2613 ptr++;
2614 continue; /* Next character in pattern */
2615 }
2616
2617 /* If the next item is not a quantifier, fill in length of any previous
2618 callout and create an auto callout if required. */
2619
2620 if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
2621 (c != CHAR_LEFT_CURLY_BRACKET ||
2622 (tempptr = ptr,
2623 !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
2624 {
2625 if (after_manual_callout-- <= 0)
2626 parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
2627 parsed_pattern, cb);
2628 }
2629
2630 /* If expect_cond_assert is 2, we have just passed (?( and are expecting an
2631 assertion, possibly preceded by a callout. If the value is 1, we have just
2632 had the callout and expect an assertion. There must be at least 3 more
2633 characters in all cases. When expect_cond_assert is 2, we know that the
2634 current character is an opening parenthesis, as otherwise we wouldn't be
2635 here. However, when it is 1, we need to check, and it's easiest just to check
2636 always. Note that expect_cond_assert may be negative, since all callouts just
2637 decrement it. */
2638
2639 if (expect_cond_assert > 0)
2640 {
2641 BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
2642 ptr[0] == CHAR_QUESTION_MARK;
2643 if (ok) switch(ptr[1])
2644 {
2645 case CHAR_C:
2646 ok = expect_cond_assert == 2;
2647 break;
2648
2649 case CHAR_EQUALS_SIGN:
2650 case CHAR_EXCLAMATION_MARK:
2651 break;
2652
2653 case CHAR_LESS_THAN_SIGN:
2654 ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
2655 break;
2656
2657 default:
2658 ok = FALSE;
2659 }
2660
2661 if (!ok)
2662 {
2663 ptr--; /* Adjust error offset */
2664 errorcode = ERR28;
2665 goto FAILED;
2666 }
2667 }
2668
2669 /* Remember whether we are expecting a conditional assertion, and set the
2670 default for this item. */
2671
2672 prev_expect_cond_assert = expect_cond_assert;
2673 expect_cond_assert = 0;
2674
2675 /* Remember quantification status for the previous significant item, then set
2676 default for this item. */
2677
2678 prev_okquantifier = okquantifier;
2679 prev_meta_quantifier = meta_quantifier;
2680 okquantifier = FALSE;
2681 meta_quantifier = 0;
2682
2683 /* If the previous significant item was a quantifier, adjust the parsed code
2684 if there is a following modifier. The base meta value is always followed by
2685 the PLUS and QUERY values, in that order. We do this here rather than after
2686 reading a quantifier so that intervening comments and /x whitespace can be
2687 ignored without having to replicate code. */
2688
2689 if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
2690 {
2691 parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
2692 prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
2693 0x00020000u : 0x00010000u);
2694 continue; /* Next character in pattern */
2695 }
2696
2697
2698 /* Process the next item in the main part of a pattern. */
2699
2700 switch(c)
2701 {
2702 default: /* Non-special character */
2703 PARSED_LITERAL(c, parsed_pattern);
2704 break;
2705
2706
2707 /* ---- Escape sequence ---- */
2708
2709 case CHAR_BACKSLASH:
2710 tempptr = ptr;
2711 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
2712 FALSE, cb);
2713 if (errorcode != 0)
2714 {
2715 ESCAPE_FAILED:
2716 if ((cb->cx->extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
2717 goto FAILED;
2718 ptr = tempptr;
2719 if (ptr >= ptrend) c = CHAR_BACKSLASH; else
2720 {
2721 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
2722 }
2723 escape = 0; /* Treat as literal character */
2724 }
2725
2726 /* The escape was a data escape or literal character. */
2727
2728 if (escape == 0)
2729 {
2730 PARSED_LITERAL(c, parsed_pattern);
2731 }
2732
2733 /* The escape was a back (or forward) reference. We keep the offset in
2734 order to give a more useful diagnostic for a bad forward reference. For
2735 references to groups numbered less than 10 we can't use more than two items
2736 in parsed_pattern because they may be just two characters in the input (and
2737 in a 64-bit world an offset may need two elements). So for them, the offset
2738 of the first occurrence is held in a special vector. */
2739
2740 else if (escape < 0)
2741 {
2742 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1);
2743 escape = -escape;
2744 *parsed_pattern++ = META_BACKREF | (uint32_t)escape;
2745 if (escape < 10)
2746 {
2747 if (cb->small_ref_offset[escape] == PCRE2_UNSET)
2748 cb->small_ref_offset[escape] = offset;
2749 }
2750 else
2751 {
2752 PUTOFFSET(offset, parsed_pattern);
2753 }
2754 okquantifier = TRUE;
2755 }
2756
2757 /* The escape was a character class such as \d etc. or other special
2758 escape indicator such as \A or \X. Most of them generate just a single
2759 parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
2760 value. They are supported only when Unicode is available. The type and
2761 value are packed into a single 32-bit value so that the whole sequence
2762 uses only two elements in the parsed_vector. This is because the same
2763 coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
2764 set.
2765
2766 There are also some cases where the escape sequence is followed by a name:
2767 \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
2768 and \g'name' are subroutine calls by name; \g{name} is a synonym for
2769 \k{name}. Note that \g<number> and \g'number' are handled by check_escape()
2770 and returned as a negative value (handled above). A name is coded as an
2771 offset into the pattern and a length. */
2772
2773 else switch (escape)
2774 {
2775 case ESC_C:
2776 #ifdef NEVER_BACKSLASH_C
2777 errorcode = ERR85;
2778 goto ESCAPE_FAILED;
2779 #else
2780 if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
2781 {
2782 errorcode = ERR83;
2783 goto ESCAPE_FAILED;
2784 }
2785 #endif
2786 okquantifier = TRUE;
2787 *parsed_pattern++ = META_ESCAPE + escape;
2788 break;
2789
2790 case ESC_X:
2791 #ifndef SUPPORT_UNICODE
2792 errorcode = ERR45; /* Supported only with Unicode support */
2793 goto ESCAPE_FAILED;
2794 #endif
2795 case ESC_H:
2796 case ESC_h:
2797 case ESC_N:
2798 case ESC_R:
2799 case ESC_V:
2800 case ESC_v:
2801 okquantifier = TRUE;
2802 *parsed_pattern++ = META_ESCAPE + escape;
2803 break;
2804
2805 default: /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
2806 *parsed_pattern++ = META_ESCAPE + escape;
2807 break;
2808
2809 /* Escapes that change in UCP mode. Note that PCRE2_UCP will never be set
2810 without Unicode support because it is checked when pcre2_compile() is
2811 called. */
2812
2813 case ESC_d:
2814 case ESC_D:
2815 case ESC_s:
2816 case ESC_S:
2817 case ESC_w:
2818 case ESC_W:
2819 okquantifier = TRUE;
2820 if ((options & PCRE2_UCP) == 0)
2821 {
2822 *parsed_pattern++ = META_ESCAPE + escape;
2823 }
2824 else
2825 {
2826 *parsed_pattern++ = META_ESCAPE +
2827 ((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
2828 ESC_p : ESC_P);
2829 switch(escape)
2830 {
2831 case ESC_d:
2832 case ESC_D:
2833 *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
2834 break;
2835
2836 case ESC_s:
2837 case ESC_S:
2838 *parsed_pattern++ = PT_SPACE << 16;
2839 break;
2840
2841 case ESC_w:
2842 case ESC_W:
2843 *parsed_pattern++ = PT_WORD << 16;
2844 break;
2845 }
2846 }
2847 break;
2848
2849 /* Unicode property matching */
2850
2851 case ESC_P:
2852 case ESC_p:
2853 #ifdef SUPPORT_UNICODE
2854 {
2855 BOOL negated;
2856 uint16_t ptype = 0, pdata = 0;
2857 if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
2858 goto ESCAPE_FAILED;
2859 if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
2860 *parsed_pattern++ = META_ESCAPE + escape;
2861 *parsed_pattern++ = (ptype << 16) | pdata;
2862 okquantifier = TRUE;
2863 }
2864 #else
2865 errorcode = ERR45;
2866 goto ESCAPE_FAILED;
2867 #endif
2868 break; /* End \P and \p */
2869
2870 /* When \g is used with quotes or angle brackets as delimiters, it is a
2871 numerical or named subroutine call, and control comes here. When used
2872 with brace delimiters it is a numerical back reference and does not come
2873 here because check_escape() returns it directly as a reference. \k is
2874 always a named back reference. */
2875
2876 case ESC_g:
2877 case ESC_k:
2878 if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
2879 *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
2880 {
2881 errorcode = (escape == ESC_g)? ERR57 : ERR69;
2882 goto ESCAPE_FAILED;
2883 }
2884 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
2885 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
2886 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
2887
2888 /* For a non-braced \g, check for a numerical recursion. */
2889
2890 if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
2891 {
2892 PCRE2_SPTR p = ptr + 1;
2893
2894 if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
2895 &errorcode))
2896 {
2897 if (p >= ptrend || *p != terminator)
2898 {
2899 errorcode = ERR57;
2900 goto ESCAPE_FAILED;
2901 }
2902 ptr = p;
2903 goto SET_RECURSION;
2904 }
2905 if (errorcode != 0) goto ESCAPE_FAILED;
2906 }
2907
2908 /* Not a numerical recursion */
2909
2910 if (!read_name(&ptr, ptrend, terminator, &offset, &name, &namelen,
2911 &errorcode, cb)) goto ESCAPE_FAILED;
2912
2913 /* \k and \g when used with braces are back references, whereas \g used
2914 with quotes or angle brackets is a recursion */
2915
2916 *parsed_pattern++ =
2917 (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
2918 META_BACKREF_BYNAME : META_RECURSE_BYNAME;
2919 *parsed_pattern++ = namelen;
2920
2921 PUTOFFSET(offset, parsed_pattern);
2922 okquantifier = TRUE;
2923 break; /* End special escape processing */
2924 }
2925 break; /* End escape sequence processing */
2926
2927
2928 /* ---- Single-character special items ---- */
2929
2930 case CHAR_CIRCUMFLEX_ACCENT:
2931 *parsed_pattern++ = META_CIRCUMFLEX;
2932 break;
2933
2934 case CHAR_DOLLAR_SIGN:
2935 *parsed_pattern++ = META_DOLLAR;
2936 break;
2937
2938 case CHAR_DOT:
2939 *parsed_pattern++ = META_DOT;
2940 okquantifier = TRUE;
2941 break;
2942
2943
2944 /* ---- Single-character quantifiers ---- */
2945
2946 case CHAR_ASTERISK:
2947 meta_quantifier = META_ASTERISK;
2948 goto CHECK_QUANTIFIER;
2949
2950 case CHAR_PLUS:
2951 meta_quantifier = META_PLUS;
2952 goto CHECK_QUANTIFIER;
2953
2954 case CHAR_QUESTION_MARK:
2955 meta_quantifier = META_QUERY;
2956 goto CHECK_QUANTIFIER;
2957
2958
2959 /* ---- Potential {n,m} quantifier ---- */
2960
2961 case CHAR_LEFT_CURLY_BRACKET:
2962 if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
2963 &errorcode))
2964 {
2965 if (errorcode != 0) goto FAILED; /* Error in quantifier. */
2966 PARSED_LITERAL(c, parsed_pattern); /* Not a quantifier */
2967 break; /* No more quantifier processing */
2968 }
2969 meta_quantifier = META_MINMAX;
2970 /* Fall through */
2971
2972
2973 /* ---- Quantifier post-processing ---- */
2974
2975 /* Check that a quantifier is allowed after the previous item. */
2976
2977 CHECK_QUANTIFIER:
2978 if (!prev_okquantifier)
2979 {
2980 errorcode = ERR9;
2981 goto FAILED_BACK;
2982 }
2983
2984 /* Now we can put the quantifier into the parsed pattern vector. At this
2985 stage, we have only the basic quantifier. The check for a following + or ?
2986 modifier happens at the top of the loop, after any intervening comments
2987 have been removed. */
2988
2989 *parsed_pattern++ = meta_quantifier;
2990 if (c == CHAR_LEFT_CURLY_BRACKET)
2991 {
2992 *parsed_pattern++ = min_repeat;
2993 *parsed_pattern++ = max_repeat;
2994 }
2995 break;
2996
2997
2998 /* ---- Character class ---- */
2999
3000 case CHAR_LEFT_SQUARE_BRACKET:
3001 okquantifier = TRUE;
3002
3003 /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
3004 used for "start of word" and "end of word". As these are otherwise illegal
3005 sequences, we don't break anything by recognizing them. They are replaced
3006 by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3007 erroneous and are handled by the normal code below. */
3008
3009 if (ptrend - ptr >= 6 &&
3010 (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
3011 PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
3012 {
3013 *parsed_pattern++ = META_ESCAPE + ESC_b;
3014
3015 if (ptr[2] == CHAR_LESS_THAN_SIGN)
3016 {
3017 *parsed_pattern++ = META_LOOKAHEAD;
3018 }
3019 else
3020 {
3021 *parsed_pattern++ = META_LOOKBEHIND;
3022 *has_lookbehind = TRUE;
3023
3024 /* The offset is used only for the "non-fixed length" error; this won't
3025 occur here, so just store zero. */
3026
3027 PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
3028 }
3029
3030 if ((options & PCRE2_UCP) == 0)
3031 *parsed_pattern++ = META_ESCAPE + ESC_w;
3032 else
3033 {
3034 *parsed_pattern++ = META_ESCAPE + ESC_p;
3035 *parsed_pattern++ = PT_WORD << 16;
3036 }
3037 *parsed_pattern++ = META_KET;
3038 ptr += 6;
3039 break;
3040 }
3041
3042 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3043 they are encountered at the top level, so we'll do that too. */
3044
3045 if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3046 *ptr == CHAR_EQUALS_SIGN) &&
3047 check_posix_syntax(ptr, ptrend, &tempptr))
3048 {
3049 errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3050 goto FAILED;
3051 }
3052
3053 /* Process a regular character class. If the first character is '^', set
3054 the negation flag. If the first few characters (either before or after ^)
3055 are \Q\E or \E or space or tab in extended-more mode, we skip them too.
3056 This makes for compatibility with Perl. */
3057
3058 negate_class = FALSE;
3059 while (ptr < ptrend)
3060 {
3061 GETCHARINCTEST(c, ptr);
3062 if (c == CHAR_BACKSLASH)
3063 {
3064 if (ptr < ptrend && *ptr == CHAR_E) ptr++;
3065 else if (ptrend - ptr >= 3 &&
3066 PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3067 ptr += 3;
3068 else
3069 break;
3070 }
3071 else if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3072 (c == CHAR_SPACE || c == CHAR_HT)) /* Note: just these two */
3073 continue;
3074 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3075 negate_class = TRUE;
3076 else break;
3077 }
3078
3079 /* Now the real contents of the class; c has the first "real" character.
3080 Empty classes are permitted only if the option is set. */
3081
3082 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3083 (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
3084 {
3085 *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
3086 break; /* End of class processing */
3087 }
3088
3089 /* Process a non-empty class. */
3090
3091 *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
3092 class_range_state = RANGE_NO;
3093
3094 /* In an EBCDIC environment, Perl treats alphabetic ranges specially
3095 because there are holes in the encoding, and simply using the range A-Z
3096 (for example) would include the characters in the holes. This applies only
3097 to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3098 in this respect. In order to accommodate this, we keep track of whether
3099 character values are literal or not, and a state variable for handling
3100 ranges. */
3101
3102 /* Loop for the contents of the class */
3103
3104 for (;;)
3105 {
3106 BOOL char_is_literal = TRUE;
3107
3108 /* Inside \Q...\E everything is literal except \E */
3109
3110 if (inescq)
3111 {
3112 if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3113 {
3114 inescq = FALSE; /* Reset literal state */
3115 ptr++; /* Skip the 'E' */
3116 goto CLASS_CONTINUE;
3117 }
3118 goto CLASS_LITERAL;
3119 }
3120
3121 /* Skip over space and tab (only) in extended-more mode. */
3122
3123 if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3124 (c == CHAR_SPACE || c == CHAR_HT))
3125 goto CLASS_CONTINUE;
3126
3127 /* Handle POSIX class names. Perl allows a negation extension of the
3128 form [:^name:]. A square bracket that doesn't match the syntax is
3129 treated as a literal. We also recognize the POSIX constructions
3130 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3131 5.6 and 5.8 do. */
3132
3133 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3134 ptrend - ptr >= 3 &&
3135 (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3136 *ptr == CHAR_EQUALS_SIGN) &&
3137 check_posix_syntax(ptr, ptrend, &tempptr))
3138 {
3139 BOOL posix_negate = FALSE;
3140 int posix_class;
3141
3142 /* Perl treats a hyphen before a POSIX class as a literal, not the
3143 start of a range. However, it gives a warning in its warning mode. PCRE
3144 does not have a warning mode, so we give an error, because this is
3145 likely an error on the user's part. */
3146
3147 if (class_range_state == RANGE_STARTED)
3148 {
3149 errorcode = ERR50;
3150 goto FAILED;
3151 }
3152
3153 if (*ptr != CHAR_COLON)
3154 {
3155 errorcode = ERR13;
3156 goto FAILED_BACK;
3157 }
3158
3159 if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
3160 {
3161 posix_negate = TRUE;
3162 ptr++;
3163 }
3164
3165 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3166 if (posix_class < 0)
3167 {
3168 errorcode = ERR30;
3169 goto FAILED;
3170 }
3171 ptr = tempptr + 2;
3172
3173 /* Perl treats a hyphen after a POSIX class as a literal, not the
3174 start of a range. However, it gives a warning in its warning mode
3175 unless the hyphen is the last character in the class. PCRE does not
3176 have a warning mode, so we give an error, because this is likely an
3177 error on the user's part. */
3178
3179 if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3180 ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3181 {
3182 errorcode = ERR50;
3183 goto FAILED;
3184 }
3185
3186 /* Set "a hyphen is not the start of a range" for the -] case, and also
3187 in case the POSIX class is followed by \E or \Q\E (possibly repeated -
3188 fuzzers do that kind of thing) and *then* a hyphen. This causes that
3189 hyphen to be treated as a literal. I don't think it's worth setting up
3190 special apparatus to do otherwise. */
3191
3192 class_range_state = RANGE_NO;
3193
3194 /* When PCRE2_UCP is set, some of the POSIX classes are converted to
3195 use Unicode properties \p or \P or, in one case, \h or \H. The
3196 substitutes table has two values per class, containing the type and
3197 value of a \p or \P item. The special cases are specified with a
3198 negative type: a non-zero value causes \h or \H to be used, and a zero
3199 value falls through to behave like a non-UCP POSIX class. */
3200
3201 #ifdef SUPPORT_UNICODE
3202 if ((options & PCRE2_UCP) != 0)
3203 {
3204 int ptype = posix_substitutes[2*posix_class];
3205 int pvalue = posix_substitutes[2*posix_class + 1];
3206 if (ptype >= 0)
3207 {
3208 *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
3209 *parsed_pattern++ = (ptype << 16) | pvalue;
3210 goto CLASS_CONTINUE;
3211 }
3212
3213 if (pvalue != 0)
3214 {
3215 *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
3216 goto CLASS_CONTINUE;
3217 }
3218
3219 /* Fall through */
3220 }
3221 #endif /* SUPPORT_UNICODE */
3222
3223 /* Non-UCP POSIX class */
3224
3225 *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
3226 *parsed_pattern++ = posix_class;
3227 }
3228
3229 /* Handle potential start of range */
3230
3231 else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
3232 {
3233 *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
3234 META_RANGE_LITERAL : META_RANGE_ESCAPED;
3235 class_range_state = RANGE_STARTED;
3236 }
3237
3238 /* Handle a literal character */
3239
3240 else if (c != CHAR_BACKSLASH)
3241 {
3242 CLASS_LITERAL:
3243 if (class_range_state == RANGE_STARTED)
3244 {
3245 if (c == parsed_pattern[-2]) /* Optimize one-char range */
3246 parsed_pattern--;
3247 else if (parsed_pattern[-2] > c) /* Check range is in order */
3248 {
3249 errorcode = ERR8;
3250 goto FAILED_BACK;
3251 }
3252 else
3253 {
3254 if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
3255 parsed_pattern[-1] = META_RANGE_ESCAPED;
3256 PARSED_LITERAL(c, parsed_pattern);
3257 }
3258 class_range_state = RANGE_NO;
3259 }
3260 else /* Potential start of range */
3261 {
3262 class_range_state = char_is_literal?
3263 RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
3264 PARSED_LITERAL(c, parsed_pattern);
3265 }
3266 }
3267
3268 /* Handle escapes in a class */
3269
3270 else
3271 {
3272 tempptr = ptr;
3273 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode,
3274 options, TRUE, cb);
3275 if (errorcode != 0)
3276 {
3277 CLASS_ESCAPE_FAILED:
3278 if ((cb->cx->extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3279 goto FAILED;
3280 ptr = tempptr;
3281 if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3282 {
3283 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
3284 }
3285 escape = 0; /* Treat as literal character */
3286 }
3287
3288 if (escape == 0) /* Escaped character code point is in c */
3289 {
3290 char_is_literal = FALSE;
3291 goto CLASS_LITERAL;
3292 }
3293
3294 /* These three escapes do not alter the class range state. */
3295
3296 if (escape == ESC_b)
3297 {
3298 c = CHAR_BS; /* \b is backspace in a class */
3299 char_is_literal = FALSE;
3300 goto CLASS_LITERAL;
3301 }
3302
3303 else if (escape == ESC_Q)
3304 {
3305 inescq = TRUE; /* Enter literal mode */
3306 goto CLASS_CONTINUE;
3307 }
3308
3309 else if (escape == ESC_E) /* Ignore orphan \E */
3310 goto CLASS_CONTINUE;
3311
3312 /* The second part of a range can be a single-character escape
3313 sequence (detected above), but not any of the other escapes. Perl
3314 treats a hyphen as a literal in such circumstances. However, in Perl's
3315 warning mode, a warning is given, so PCRE now faults it, as it is
3316 almost certainly a mistake on the user's part. */
3317
3318 if (class_range_state == RANGE_STARTED)
3319 {
3320 errorcode = ERR50;
3321 goto CLASS_ESCAPE_FAILED;
3322 }
3323
3324 /* Of the remaining escapes, only those that define characters are
3325 allowed in a class. None may start a range. */
3326
3327 class_range_state = RANGE_NO;
3328 switch(escape)
3329 {
3330 case ESC_N:
3331 errorcode = ERR71; /* Not supported in a class */
3332 goto CLASS_ESCAPE_FAILED;
3333
3334 case ESC_H:
3335 case ESC_h:
3336 case ESC_V:
3337 case ESC_v:
3338 *parsed_pattern++ = META_ESCAPE + escape;
3339 break;
3340
3341 /* These escapes are converted to Unicode property tests when
3342 PCRE2_UCP is set. */
3343
3344 case ESC_d:
3345 case ESC_D:
3346 case ESC_s:
3347 case ESC_S:
3348 case ESC_w:
3349 case ESC_W:
3350 if ((options & PCRE2_UCP) == 0)
3351 {
3352 *parsed_pattern++ = META_ESCAPE + escape;
3353 }
3354 else
3355 {
3356 *parsed_pattern++ = META_ESCAPE +
3357 ((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
3358 ESC_p : ESC_P);
3359 switch(escape)
3360 {
3361 case ESC_d:
3362 case ESC_D:
3363 *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
3364 break;
3365
3366 case ESC_s:
3367 case ESC_S:
3368 *parsed_pattern++ = PT_SPACE << 16;
3369 break;
3370
3371 case ESC_w:
3372 case ESC_W:
3373 *parsed_pattern++ = PT_WORD << 16;
3374 break;
3375 }
3376 }
3377 break;
3378
3379 /* Explicit Unicode property matching */
3380
3381 case ESC_P:
3382 case ESC_p:
3383 #ifdef SUPPORT_UNICODE
3384 {
3385 BOOL negated;
3386 uint16_t ptype = 0, pdata = 0;
3387 if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3388 goto FAILED;
3389 if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3390 *parsed_pattern++ = META_ESCAPE + escape;
3391 *parsed_pattern++ = (ptype << 16) | pdata;
3392 }
3393 #else
3394 errorcode = ERR45;
3395 goto CLASS_ESCAPE_FAILED;
3396 #endif
3397 break; /* End \P and \p */
3398
3399 default: /* All others are not allowed in a class */
3400 errorcode = ERR7;
3401 ptr--;
3402 goto CLASS_ESCAPE_FAILED;
3403 }
3404
3405 /* Perl gives a warning unless a following hyphen is the last character
3406 in the class. PCRE throws an error. */
3407
3408 if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3409 ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3410 {
3411 errorcode = ERR50;
3412 goto FAILED;
3413 }
3414 }
3415
3416 /* Proceed to next thing in the class. */
3417
3418 CLASS_CONTINUE:
3419 if (ptr >= ptrend)
3420 {
3421 errorcode = ERR6; /* Missing terminating ']' */
3422 goto FAILED;
3423 }
3424 GETCHARINCTEST(c, ptr);
3425 if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
3426 } /* End of class-processing loop */
3427
3428 if (class_range_state == RANGE_STARTED)
3429 {
3430 parsed_pattern[-1] = CHAR_MINUS;
3431 class_range_state = RANGE_NO;
3432 }
3433
3434 *parsed_pattern++ = META_CLASS_END;
3435 break; /* End of character class */
3436
3437
3438 /* ---- Opening parenthesis ---- */
3439
3440 case CHAR_LEFT_PARENTHESIS:
3441 if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3442
3443 /* If ( is not followed by ? it is either a capture or a special verb. */
3444
3445 if (*ptr != CHAR_QUESTION_MARK)
3446 {
3447 const char *vn;
3448
3449 /* Handle capturing brackets (or non-capturing if auto-capture is turned
3450 off). */
3451
3452 if (*ptr != CHAR_ASTERISK)
3453 {
3454 nest_depth++;
3455 if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
3456 {
3457 cb->bracount++;
3458 *parsed_pattern++ = META_CAPTURE | cb->bracount;
3459 }
3460 else *parsed_pattern++ = META_NOCAPTURE;
3461 }
3462
3463
3464 /* ---- Handle (*VERB) and (*VERB:NAME) ---- */
3465
3466 /* Do nothing for (*) so it gives a "bad quantifier" error rather than
3467 "(*MARK) must have an argument". */
3468
3469 else if (ptrend - ptr > 1 && ptr[1] != CHAR_RIGHT_PARENTHESIS)
3470 {
3471 vn = verbnames;
3472 if (!read_name(&ptr, ptrend, 0, &offset, &name, &namelen, &errorcode,
3473 cb)) goto FAILED;
3474 if (ptr >= ptrend || (*ptr != CHAR_COLON &&
3475 *ptr != CHAR_RIGHT_PARENTHESIS))
3476 {
3477 errorcode = ERR60; /* Malformed */
3478 goto FAILED;
3479 }
3480
3481 /* Scan the table of verb names */
3482
3483 for (i = 0; i < verbcount; i++)
3484 {
3485 if (namelen == verbs[i].len &&
3486 PRIV(strncmp_c8)(name, vn, namelen) == 0)
3487 break;
3488 vn += verbs[i].len + 1;
3489 }
3490
3491 if (i >= verbcount)
3492 {
3493 errorcode = ERR60; /* Verb not recognized */
3494 goto FAILED;
3495 }
3496
3497 /* An empty argument is treated as no argument. */
3498
3499 if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
3500 ptr[1] == CHAR_RIGHT_PARENTHESIS)
3501 ptr++; /* Advance to the closing parens */
3502
3503 /* Check for mandatory non-empty argument; this is (*MARK) */
3504
3505 if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
3506 {
3507 errorcode = ERR66;
3508 goto FAILED;
3509 }
3510
3511 /* It appears that Perl allows any characters whatsoever, other than a
3512 closing parenthesis, to appear in arguments ("names"), so we no longer
3513 insist on letters, digits, and underscores. Perl does not, however, do
3514 any interpretation within arguments, and has no means of including a
3515 closing parenthesis. PCRE supports escape processing but only when it
3516 is requested by an option. We set inverbname TRUE here, and let the
3517 main loop take care of this so that escape and \x processing is done by
3518 the main code above. */
3519
3520 if (*ptr++ == CHAR_COLON) /* Skip past : or ) */
3521 {
3522 /* Some optional arguments can be treated as a preceding (*MARK) */
3523
3524 if (verbs[i].has_arg < 0)
3525 {
3526 add_after_mark = verbs[i].meta;
3527 *parsed_pattern++ = META_MARK;
3528 }
3529
3530 /* The remaining verbs with arguments (except *MARK) need a different
3531 opcode. */
3532
3533 else
3534 {
3535 *parsed_pattern++ = verbs[i].meta +
3536 ((verbs[i].meta != META_MARK)? 0x00010000u:0);
3537 }
3538
3539 /* Set up for reading the name in the main loop. */
3540
3541 verblengthptr = parsed_pattern++;
3542 verbnamestart = ptr;
3543 inverbname = TRUE;
3544 }
3545 else /* No verb "name" argument */
3546 {
3547 *parsed_pattern++ = verbs[i].meta;
3548 }
3549 } /* End of (*VERB) handling */
3550 break; /* Done with this parenthesis */
3551 } /* End of groups that don't start with (? */
3552
3553
3554 /* ---- Items starting (? ---- */
3555
3556 /* The type of item is determined by what follows (?. Handle (?| and option
3557 changes under "default" because both need a new block on the nest stack.
3558 Comments starting with (?# are handled above. Note that there is some
3559 ambiguity about the sequence (?- because if a digit follows it's a relative
3560 recursion or subroutine call whereas otherwise it's an option unsetting. */
3561
3562 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3563
3564 switch(*ptr)
3565 {
3566 default:
3567 if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
3568 goto RECURSION_BYNUMBER; /* The + case is handled by CHAR_PLUS */
3569
3570 /* We now have either (?| or a (possibly empty) option setting,
3571 optionally followed by a non-capturing group. */
3572
3573 nest_depth++;
3574 if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
3575 else if (++top_nest >= end_nests)
3576 {
3577 errorcode = ERR84;
3578 goto FAILED;
3579 }
3580 top_nest->nest_depth = nest_depth;
3581 top_nest->flags = 0;
3582 top_nest->options = options & PARSE_TRACKED_OPTIONS;
3583
3584 /* Start of non-capturing group that resets the capture count for each
3585 branch. */
3586
3587 if (*ptr == CHAR_VERTICAL_LINE)
3588 {
3589 top_nest->reset_group = (uint16_t)cb->bracount;
3590 top_nest->max_group = (uint16_t)cb->bracount;
3591 top_nest->flags |= NSF_RESET;
3592 cb->external_flags |= PCRE2_DUPCAPUSED;
3593 *parsed_pattern++ = META_NOCAPTURE;
3594 ptr++;
3595 }
3596
3597 /* Scan for options imnsxJU to be set or unset. */
3598
3599 else
3600 {
3601 BOOL hyphenok = TRUE;
3602 uint32_t oldoptions = options;
3603
3604 top_nest->reset_group = 0;
3605 top_nest->max_group = 0;
3606 set = unset = 0;
3607 optset = &set;
3608
3609 /* ^ at the start unsets imnsx and disables the subsequent use of - */
3610
3611 if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
3612 {
3613 options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
3614 PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
3615 hyphenok = FALSE;
3616 ptr++;
3617 }
3618
3619 while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
3620 *ptr != CHAR_COLON)
3621 {
3622 switch (*ptr++)
3623 {
3624 case CHAR_MINUS:
3625 if (!hyphenok)
3626 {
3627 errorcode = ERR94;
3628 ptr--; /* Correct the offset */
3629 goto FAILED;
3630 }
3631 optset = &unset;
3632 hyphenok = FALSE;
3633 break;
3634
3635 case CHAR_J: /* Record that it changed in the external options */
3636 *optset |= PCRE2_DUPNAMES;
3637 cb->external_flags |= PCRE2_JCHANGED;
3638 break;
3639
3640 case CHAR_i: *optset |= PCRE2_CASELESS; break;
3641 case CHAR_m: *optset |= PCRE2_MULTILINE; break;
3642 case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
3643 case CHAR_s: *optset |= PCRE2_DOTALL; break;
3644 case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
3645
3646 /* If x appears twice it also sets the "extended more" option. */
3647
3648 case CHAR_x:
3649 *optset |= PCRE2_EXTENDED;
3650 if (ptr < ptrend && *ptr == CHAR_x)
3651 {
3652 *optset |= PCRE2_EXTENDED_MORE;
3653 ptr++;
3654 }
3655 break;
3656
3657 default:
3658 errorcode = ERR11;
3659 ptr--; /* Correct the offset */
3660 goto FAILED;
3661 }
3662 }
3663
3664 /* If we are setting extended without extended-more, ensure that any
3665 existing extended-more gets unset. Also, unsetting extended must also
3666 unset extended-more. */
3667
3668 if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
3669 (unset & PCRE2_EXTENDED) != 0)
3670 unset |= PCRE2_EXTENDED_MORE;
3671
3672 options = (options | set) & (~unset);
3673
3674 /* If the options ended with ')' this is not the start of a nested
3675 group with option changes, so the options change at this level.
3676 In this case, if the previous level set up a nest block, discard the
3677 one we have just created. Otherwise adjust it for the previous level.
3678 If the options ended with ':' we are starting a non-capturing group,
3679 possibly with an options setting. */
3680
3681 if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3682 if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
3683 {
3684 nest_depth--; /* This is not a nested group after all. */
3685 if (top_nest > (nest_save *)(cb->start_workspace) &&
3686 (top_nest-1)->nest_depth == nest_depth) top_nest--;
3687 else top_nest->nest_depth = nest_depth;
3688 }
3689 else *parsed_pattern++ = META_NOCAPTURE;
3690
3691 /* If nothing changed, no need to record. */
3692
3693 if (options != oldoptions)
3694 {
3695 *parsed_pattern++ = META_OPTIONS;
3696 *parsed_pattern++ = options;
3697 }
3698 } /* End options processing */
3699 break; /* End default case after (? */
3700
3701
3702 /* ---- Python syntax support ---- */
3703
3704 case CHAR_P:
3705 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3706
3707 /* (?P<name> is the same as (?<name>, which defines a named group. */
3708
3709 if (*ptr == CHAR_LESS_THAN_SIGN)
3710 {
3711 terminator = CHAR_GREATER_THAN_SIGN;
3712 goto DEFINE_NAME;
3713 }
3714
3715 /* (?P>name) is the same as (?&name), which is a recursion or subroutine
3716 call. */
3717
3718 if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
3719
3720 /* (?P=name) is the same as \k<name>, a back reference by name. Anything
3721 else after (?P is an error. */
3722
3723 if (*ptr != CHAR_EQUALS_SIGN)
3724 {
3725 errorcode = ERR41;
3726 goto FAILED;
3727 }
3728 if (!read_name(&ptr, ptrend, CHAR_RIGHT_PARENTHESIS, &offset, &name,
3729 &namelen, &errorcode, cb)) goto FAILED;
3730 *parsed_pattern++ = META_BACKREF_BYNAME;
3731 *parsed_pattern++ = namelen;
3732 PUTOFFSET(offset, parsed_pattern);
3733 okquantifier = TRUE;
3734 break; /* End of (?P processing */
3735
3736
3737 /* ---- Recursion/subroutine calls by number ---- */
3738
3739 case CHAR_R:
3740 i = 0; /* (?R) == (?R0) */
3741 ptr++;
3742 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
3743 {
3744 errorcode = ERR58;
3745 goto FAILED;
3746 }
3747 goto SET_RECURSION;
3748
3749 /* An item starting (?- followed by a digit comes here via the "default"
3750 case because (?- followed by a non-digit is an options setting. */
3751
3752 case CHAR_PLUS:
3753 if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1]))
3754 {
3755 errorcode = ERR29; /* Missing number */
3756 goto FAILED;
3757 }
3758 /* Fall through */
3759
3760 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
3761 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
3762 RECURSION_BYNUMBER:
3763 if (!read_number(&ptr, ptrend,
3764 (IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
3765 MAX_GROUP_NUMBER, ERR61,
3766 &i, &errorcode)) goto FAILED;
3767 if (i < 0) /* NB (?0) is permitted */
3768 {
3769 errorcode = ERR15; /* Unknown group */
3770 goto FAILED_BACK;
3771 }
3772 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
3773 goto UNCLOSED_PARENTHESIS;
3774
3775 SET_RECURSION:
3776 *parsed_pattern++ = META_RECURSE | (uint32_t)i;
3777 offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
3778 ptr++;
3779 PUTOFFSET(offset, parsed_pattern);
3780 okquantifier = TRUE;
3781 break; /* End of recursive call by number handling */
3782
3783
3784 /* ---- Recursion/subroutine calls by name ---- */
3785
3786 case CHAR_AMPERSAND:
3787 RECURSE_BY_NAME:
3788 if (!read_name(&ptr, ptrend, CHAR_RIGHT_PARENTHESIS, &offset, &name,
3789 &namelen, &errorcode, cb)) goto FAILED;
3790 *parsed_pattern++ = META_RECURSE_BYNAME;
3791 *parsed_pattern++ = namelen;
3792 PUTOFFSET(offset, parsed_pattern);
3793 okquantifier = TRUE;
3794 break;
3795
3796 /* ---- Callout with numerical or string argument ---- */
3797
3798 case CHAR_C:
3799 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3800
3801 /* If the previous item was a condition starting (?(? an assertion,
3802 optionally preceded by a callout, is expected. This is checked later on,
3803 during actual compilation. However we need to identify this kind of
3804 assertion in this pass because it must not be qualified. The value of
3805 expect_cond_assert is set to 2 after (?(? is processed. We decrement it
3806 for a callout - still leaving a positive value that identifies the
3807 assertion. Multiple callouts or any other items will make it zero or
3808 less, which doesn't matter because they will cause an error later. */
3809
3810 expect_cond_assert = prev_expect_cond_assert - 1;
3811
3812 /* If previous_callout is not NULL, it means this follows a previous
3813 callout. If it was a manual callout, do nothing; this means its "length
3814 of next pattern item" field will remain zero. If it was an automatic
3815 callout, abolish it. */
3816
3817 if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
3818 previous_callout == parsed_pattern - 4 &&
3819 parsed_pattern[-1] == 255)
3820 parsed_pattern = previous_callout;
3821
3822 /* Save for updating next pattern item length, and skip one item before
3823 completing. */
3824
3825 previous_callout = parsed_pattern;
3826 after_manual_callout = 1;
3827
3828 /* Handle a string argument; specific delimiter is required. */
3829
3830 if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
3831 {
3832 PCRE2_SIZE calloutlength;
3833 PCRE2_SPTR startptr = ptr;
3834
3835 delimiter = 0;
3836 for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
3837 {
3838 if (*ptr == PRIV(callout_start_delims)[i])
3839 {
3840 delimiter = PRIV(callout_end_delims)[i];
3841 break;
3842 }
3843 }
3844 if (delimiter == 0)
3845 {
3846 errorcode = ERR82;
3847 goto FAILED;
3848 }
3849
3850 *parsed_pattern = META_CALLOUT_STRING;
3851 parsed_pattern += 3; /* Skip pattern info */
3852
3853 for (;;)
3854 {
3855 if (++ptr >= ptrend)
3856 {
3857 errorcode = ERR81;
3858 ptr = startptr; /* To give a more useful message */
3859 goto FAILED;
3860 }
3861 if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
3862 break;
3863 }
3864
3865 calloutlength = (PCRE2_SIZE)(ptr - startptr);
3866 if (calloutlength > UINT32_MAX)
3867 {
3868 errorcode = ERR72;
3869 goto FAILED;
3870 }
3871 *parsed_pattern++ = (uint32_t)calloutlength;
3872 offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
3873 PUTOFFSET(offset, parsed_pattern);
3874 }
3875
3876 /* Handle a callout with an optional numerical argument, which must be
3877 less than or equal to 255. A missing argument gives 0. */
3878
3879 else
3880 {
3881 int n = 0;
3882 *parsed_pattern = META_CALLOUT_NUMBER; /* Numerical callout */
3883 parsed_pattern += 3; /* Skip pattern info */
3884 while (ptr < ptrend && IS_DIGIT(*ptr))
3885 {
3886 n = n * 10 + *ptr++ - CHAR_0;
3887 if (n > 255)
3888 {
3889 errorcode = ERR38;
3890 goto FAILED;
3891 }
3892 }
3893 *parsed_pattern++ = n;
3894 }
3895
3896 /* Both formats must have a closing parenthesis */
3897
3898 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
3899 {
3900 errorcode = ERR39;
3901 goto FAILED;
3902 }
3903 ptr++;
3904
3905 /* Remember the offset to the next item in the pattern, and set a default
3906 length. This should get updated after the next item is read. */
3907
3908 previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
3909 previous_callout[2] = 0;
3910 break; /* End callout */
3911
3912
3913 /* ---- Conditional group ---- */
3914
3915 /* A condition can be an assertion, a number (referring to a numbered
3916 group's having been set), a name (referring to a named group), or 'R',
3917 referring to overall recursion. R<digits> and R&name are also permitted
3918 for recursion state tests. Numbers may be preceded by + or - to specify a
3919 relative group number.
3920
3921 There are several syntaxes for testing a named group: (?(name)) is used
3922 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3923
3924 There are two unfortunate ambiguities. 'R' can be the recursive thing or
3925 the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
3926 the Perl DEFINE feature or the Python named test. We look for a name
3927 first; if not found, we try the other case.
3928
3929 For compatibility with auto-callouts, we allow a callout to be specified
3930 before a condition that is an assertion. */
3931
3932 case CHAR_LEFT_PARENTHESIS:
3933 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3934 nest_depth++;
3935
3936 /* If the next character is ? there must be an assertion next (optionally
3937 preceded by a callout). We do not check this here, but instead we set
3938 expect_cond_assert to 2. If this is still greater than zero (callouts
3939 decrement it) when the next assertion is read, it will be marked as a
3940 condition that must not be repeated. A value greater than zero also
3941 causes checking that an assertion (possibly with callout) follows. */
3942
3943 if (*ptr == CHAR_QUESTION_MARK)
3944 {
3945 *parsed_pattern++ = META_COND_ASSERT;
3946 ptr--; /* Pull pointer back to the opening parenthesis. */
3947 expect_cond_assert = 2;
3948 break; /* End of conditional */
3949 }
3950
3951 /* Handle (?([+-]number)... */
3952
3953 if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3954 &errorcode))
3955 {
3956 if (i <= 0)
3957 {
3958 errorcode = ERR15;
3959 goto FAILED;
3960 }
3961 *parsed_pattern++ = META_COND_NUMBER;
3962 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
3963 PUTOFFSET(offset, parsed_pattern);
3964 *parsed_pattern++ = i;
3965 }
3966 else if (errorcode != 0) goto FAILED; /* Number too big */
3967
3968 /* No number found. Handle the special case (?(VERSION[>]=n.m)... */
3969
3970 else if (ptrend - ptr >= 10 &&
3971 PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
3972 ptr[7] != CHAR_RIGHT_PARENTHESIS)
3973 {
3974 uint32_t ge = 0;
3975 int major = 0;
3976 int minor = 0;
3977
3978 ptr += 7;
3979 if (*ptr == CHAR_GREATER_THAN_SIGN)
3980 {
3981 ge = 1;
3982 ptr++;
3983 }
3984
3985 /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
3986 references its argument twice. */
3987
3988 if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
3989 goto BAD_VERSION_CONDITION;
3990
3991 if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
3992 goto FAILED;
3993
3994 if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
3995 if (*ptr == CHAR_DOT)
3996 {
3997 if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION;
3998 minor = (*ptr++ - CHAR_0) * 10;
3999 if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0;
4000 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4001 goto BAD_VERSION_CONDITION;
4002 }
4003
4004 *parsed_pattern++ = META_COND_VERSION;
4005 *parsed_pattern++ = ge;
4006 *parsed_pattern++ = major;
4007 *parsed_pattern++ = minor;
4008 }
4009
4010 /* All the remaining cases now require us to read a name. We cannot at
4011 this stage distinguish ambiguous cases such as (?(R12) which might be a
4012 recursion test by number or a name, because the named groups have not yet
4013 all been identified. Those cases are treated as names, but given a
4014 different META code. */
4015
4016 else
4017 {
4018 BOOL was_r_ampersand = FALSE;
4019
4020 if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
4021 {
4022 terminator = CHAR_RIGHT_PARENTHESIS;
4023 was_r_ampersand = TRUE;
4024 ptr++;
4025 }
4026 else if (*ptr == CHAR_LESS_THAN_SIGN)
4027 terminator = CHAR_GREATER_THAN_SIGN;
4028 else if (*ptr == CHAR_APOSTROPHE)
4029 terminator = CHAR_APOSTROPHE;
4030 else
4031 {
4032 terminator = CHAR_RIGHT_PARENTHESIS;
4033 ptr--; /* Point to char before name */
4034 }
4035 if (!read_name(&ptr, ptrend, terminator, &offset, &name, &namelen,
4036 &errorcode, cb)) goto FAILED;
4037
4038 /* Handle (?(R&name) */
4039
4040 if (was_r_ampersand)
4041 {
4042 *parsed_pattern = META_COND_RNAME;
4043 ptr--; /* Back to closing parens */
4044 }
4045
4046 /* Handle (?(name). If the name is "DEFINE" we identify it with a
4047 special code. Likewise if the name consists of R followed only by
4048 digits. Otherwise, handle it like a quoted name. */
4049
4050 else if (terminator == CHAR_RIGHT_PARENTHESIS)
4051 {
4052 if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
4053 *parsed_pattern = META_COND_DEFINE;
4054 else
4055 {
4056 for (i = 1; i < (int)namelen; i++)
4057 if (!IS_DIGIT(name[i])) break;
4058 *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
4059 META_COND_RNUMBER : META_COND_NAME;
4060 }
4061 ptr--; /* Back to closing parens */
4062 }
4063
4064 /* Handle (?('name') or (?(<name>) */
4065
4066 else *parsed_pattern = META_COND_NAME;
4067
4068 /* All these cases except DEFINE end with the name length and offset;
4069 DEFINE just has an offset (for the "too many branches" error). */
4070
4071 if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
4072 PUTOFFSET(offset, parsed_pattern);
4073 } /* End cases that read a name */
4074
4075 /* Check the closing parenthesis of the condition */
4076
4077 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4078 {
4079 errorcode = ERR24;
4080 goto FAILED;
4081 }
4082 ptr++;
4083 break; /* End of condition processing */
4084
4085
4086 /* ---- Atomic group ---- */
4087
4088 case CHAR_GREATER_THAN_SIGN:
4089 *parsed_pattern++ = META_ATOMIC;
4090 nest_depth++;
4091 ptr++;
4092 break;
4093
4094
4095 /* ---- Lookahead assertions ---- */
4096
4097 case CHAR_EQUALS_SIGN:
4098 *parsed_pattern++ = META_LOOKAHEAD;
4099 ptr++;
4100 goto POST_ASSERTION;
4101
4102 case CHAR_EXCLAMATION_MARK:
4103 *parsed_pattern++ = META_LOOKAHEADNOT;
4104 ptr++;
4105 goto POST_ASSERTION;
4106
4107
4108 /* ---- Lookbehind assertions ---- */
4109
4110 /* (?< followed by = or ! is a lookbehind assertion. Otherwise (?< is the
4111 start of the name of a capturing group. */
4112
4113 case CHAR_LESS_THAN_SIGN:
4114 if (ptrend - ptr <= 1 ||
4115 (ptr[1] != CHAR_EQUALS_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK))
4116 {
4117 terminator = CHAR_GREATER_THAN_SIGN;
4118 goto DEFINE_NAME;
4119 }
4120 *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
4121 META_LOOKBEHIND : META_LOOKBEHINDNOT;
4122 *has_lookbehind = TRUE;
4123 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4124 PUTOFFSET(offset, parsed_pattern);
4125 ptr += 2;
4126 /* Fall through */
4127
4128 /* If the previous item was a condition starting (?(? an assertion,
4129 optionally preceded by a callout, is expected. This is checked later on,
4130 during actual compilation. However we need to identify this kind of
4131 assertion in this pass because it must not be qualified. The value of
4132 expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4133 for a callout - still leaving a positive value that identifies the
4134 assertion. Multiple callouts or any other items will make it zero or
4135 less, which doesn't matter because they will cause an error later. */
4136
4137 POST_ASSERTION:
4138 nest_depth++;
4139 if (prev_expect_cond_assert > 0)
4140 {
4141 if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4142 else if (++top_nest >= end_nests)
4143 {
4144 errorcode = ERR84;
4145 goto FAILED;
4146 }
4147 top_nest->nest_depth = nest_depth;
4148 top_nest->flags = NSF_CONDASSERT;
4149 top_nest->options = options & PARSE_TRACKED_OPTIONS;
4150 }
4151 break;
4152
4153
4154 /* ---- Define a named group ---- */
4155
4156 /* A named group may be defined as (?'name') or (?<name>). In the latter
4157 case we jump to DEFINE_NAME from the disambiguation of (?< above with the
4158 terminator set to '>'. */
4159
4160 case CHAR_APOSTROPHE:
4161 terminator = CHAR_APOSTROPHE; /* Terminator */
4162
4163 DEFINE_NAME:
4164 if (!read_name(&ptr, ptrend, terminator, &offset, &name, &namelen,
4165 &errorcode, cb)) goto FAILED;
4166
4167 /* We have a name for this capturing group. It is also assigned a number,
4168 which is its primary means of identification. */
4169
4170 cb->bracount++;
4171 *parsed_pattern++ = META_CAPTURE | cb->bracount;
4172 nest_depth++;
4173
4174 /* Check not too many names */
4175
4176 if (cb->names_found >= MAX_NAME_COUNT)
4177 {
4178 errorcode = ERR49;
4179 goto FAILED;
4180 }
4181
4182 /* Adjust the entry size to accommodate the longest name found. */
4183
4184 if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
4185 cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
4186
4187 /* Scan the list to check for duplicates. For duplicate names, if the
4188 number is the same, break the loop, which causes the name to be
4189 discarded; otherwise, if DUPNAMES is not set, give an error.
4190 If it is set, allow the name with a different number, but continue
4191 scanning in case this is a duplicate with the same number. For
4192 non-duplicate names, give an error if the number is duplicated. */
4193
4194 isdupname = FALSE;
4195 ng = cb->named_groups;
4196 for (i = 0; i < cb->names_found; i++, ng++)
4197 {
4198 if (namelen == ng->length &&
4199 PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
4200 {
4201 if (ng->number == cb->bracount) break;
4202 if ((options & PCRE2_DUPNAMES) == 0)
4203 {
4204 errorcode = ERR43;
4205 goto FAILED;
4206 }
4207 isdupname = ng->isdup = TRUE; /* Mark as a duplicate */
4208 cb->dupnames = TRUE; /* Duplicate names exist */
4209 }
4210 else if (ng->number == cb->bracount)
4211 {
4212 errorcode = ERR65;
4213 goto FAILED;
4214 }
4215 }
4216
4217 if (i < cb->names_found) break; /* Ignore duplicate with same number */
4218
4219 /* Increase the list size if necessary */
4220
4221 if (cb->names_found >= cb->named_group_list_size)
4222 {
4223 uint32_t newsize = cb->named_group_list_size * 2;
4224 named_group *newspace =
4225 cb->cx->memctl.malloc(newsize * sizeof(named_group),
4226 cb->cx->memctl.memory_data);
4227 if (newspace == NULL)
4228 {
4229 errorcode = ERR21;
4230 goto FAILED;
4231 }
4232
4233 memcpy(newspace, cb->named_groups,
4234 cb->named_group_list_size * sizeof(named_group));
4235 if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
4236 cb->cx->memctl.free((void *)cb->named_groups,
4237 cb->cx->memctl.memory_data);
4238 cb->named_groups = newspace;
4239 cb->named_group_list_size = newsize;
4240 }
4241
4242 /* Add this name to the list */
4243
4244 cb->named_groups[cb->names_found].name = name;
4245 cb->named_groups[cb->names_found].length = (uint16_t)namelen;
4246 cb->named_groups[cb->names_found].number = cb->bracount;
4247 cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
4248 cb->names_found++;
4249 break;
4250 } /* End of (? switch */
4251 break; /* End of ( handling */
4252
4253
4254 /* ---- Branch terminators ---- */
4255
4256 /* Alternation: reset the capture count if we are in a (?| group. */
4257
4258 case CHAR_VERTICAL_LINE:
4259 if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
4260 (top_nest->flags & NSF_RESET) != 0)
4261 {
4262 if (cb->bracount > top_nest->max_group)
4263 top_nest->max_group = (uint16_t)cb->bracount;
4264 cb->bracount = top_nest->reset_group;
4265 }
4266 *parsed_pattern++ = META_ALT;
4267 break;
4268
4269 /* End of group; reset the capture count to the maximum if we are in a (?|
4270 group and/or reset the options that are tracked during parsing. Disallow
4271 quantifier for a condition that is an assertion. */
4272
4273 case CHAR_RIGHT_PARENTHESIS:
4274 okquantifier = TRUE;
4275 if (top_nest != NULL && top_nest->nest_depth == nest_depth)
4276 {
4277 options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
4278 if ((top_nest->flags & NSF_RESET) != 0 &&
4279 top_nest->max_group > cb->bracount)
4280 cb->bracount = top_nest->max_group;
4281 if ((top_nest->flags & NSF_CONDASSERT) != 0)
4282 okquantifier = FALSE;
4283 if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
4284 else top_nest--;
4285 }
4286 if (nest_depth == 0) /* Unmatched closing parenthesis */
4287 {
4288 errorcode = ERR22;
4289 goto FAILED_BACK;
4290 }
4291 nest_depth--;
4292 *parsed_pattern++ = META_KET;
4293 break;
4294 } /* End of switch on pattern character */
4295 } /* End of main character scan loop */
4296
4297 /* End of pattern reached. Check for missing ) at the end of a verb name. */
4298
4299 if (inverbname && ptr >= ptrend)
4300 {
4301 errorcode = ERR60;
4302 goto FAILED;
4303 }
4304
4305 /* Manage callout for the final item */
4306
4307 PARSED_END:
4308 parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
4309 parsed_pattern, cb);
4310
4311 /* Insert trailing items for word and line matching (features provided for the
4312 benefit of pcre2grep). */
4313
4314 if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
4315 {
4316 *parsed_pattern++ = META_KET;
4317 *parsed_pattern++ = META_DOLLAR;
4318 }
4319 else if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
4320 {
4321 *parsed_pattern++ = META_KET;
4322 *parsed_pattern++ = META_ESCAPE + ESC_b;
4323 }
4324
4325 /* Terminate the parsed pattern, then return success if all groups are closed.
4326 Otherwise we have unclosed parentheses. */
4327
4328 if (parsed_pattern >= parsed_pattern_end)
4329 {
4330 errorcode = ERR63; /* Internal error (parsed pattern overflow) */
4331 goto FAILED;
4332 }
4333
4334 *parsed_pattern = META_END;
4335 if (nest_depth == 0) return 0;
4336
4337 UNCLOSED_PARENTHESIS:
4338 errorcode = ERR14;
4339
4340 /* Come here for all failures. */
4341
4342 FAILED:
4343 cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4344 return errorcode;
4345
4346 /* Some errors need to indicate the previous character. */
4347
4348 FAILED_BACK:
4349 ptr--;
4350 goto FAILED;
4351
4352 /* This failure happens several times. */
4353
4354 BAD_VERSION_CONDITION:
4355 errorcode = ERR79;
4356 goto FAILED;
4357 }
4358
4359
4360
4361 /*************************************************
4362 * Find first significant opcode *
4363 *************************************************/
4364
4365 /* This is called by several functions that scan a compiled expression looking
4366 for a fixed first character, or an anchoring opcode etc. It skips over things
4367 that do not influence this. For some calls, it makes sense to skip negative
4368 forward and all backward assertions, and also the \b assertion; for others it
4369 does not.
4370
4371 Arguments:
4372 code pointer to the start of the group
4373 skipassert TRUE if certain assertions are to be skipped
4374
4375 Returns: pointer to the first significant opcode
4376 */
4377
4378 static const PCRE2_UCHAR*
first_significant_code(PCRE2_SPTR code,BOOL skipassert)4379 first_significant_code(PCRE2_SPTR code, BOOL skipassert)
4380 {
4381 for (;;)
4382 {
4383 switch ((int)*code)
4384 {
4385 case OP_ASSERT_NOT:
4386 case OP_ASSERTBACK:
4387 case OP_ASSERTBACK_NOT:
4388 if (!skipassert) return code;
4389 do code += GET(code, 1); while (*code == OP_ALT);
4390 code += PRIV(OP_lengths)[*code];
4391 break;
4392
4393 case OP_WORD_BOUNDARY:
4394 case OP_NOT_WORD_BOUNDARY:
4395 if (!skipassert) return code;
4396 /* Fall through */
4397
4398 case OP_CALLOUT:
4399 case OP_CREF:
4400 case OP_DNCREF:
4401 case OP_RREF:
4402 case OP_DNRREF:
4403 case OP_FALSE:
4404 case OP_TRUE:
4405 code += PRIV(OP_lengths)[*code];
4406 break;
4407
4408 case OP_CALLOUT_STR:
4409 code += GET(code, 1 + 2*LINK_SIZE);
4410 break;
4411
4412 case OP_SKIPZERO:
4413 code += 2 + GET(code, 2) + LINK_SIZE;
4414 break;
4415
4416 case OP_COND:
4417 case OP_SCOND:
4418 if (code[1+LINK_SIZE] != OP_FALSE || /* Not DEFINE */
4419 code[GET(code, 1)] != OP_KET) /* More than one branch */
4420 return code;
4421 code += GET(code, 1) + 1 + LINK_SIZE;
4422 break;
4423
4424 default:
4425 return code;
4426 }
4427 }
4428 /* Control never reaches here */
4429 }
4430
4431
4432
4433 #ifdef SUPPORT_UNICODE
4434 /*************************************************
4435 * Get othercase range *
4436 *************************************************/
4437
4438 /* This function is passed the start and end of a class range in UCP mode. It
4439 searches up the characters, looking for ranges of characters in the "other"
4440 case. Each call returns the next one, updating the start address. A character
4441 with multiple other cases is returned on its own with a special return value.
4442
4443 Arguments:
4444 cptr points to starting character value; updated
4445 d end value
4446 ocptr where to put start of othercase range
4447 odptr where to put end of othercase range
4448
4449 Yield: -1 when no more
4450 0 when a range is returned
4451 >0 the CASESET offset for char with multiple other cases
4452 in this case, ocptr contains the original
4453 */
4454
static int
get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
  uint32_t *odptr)
{
uint32_t ch = *cptr;
uint32_t other = 0;
uint32_t expected;

/* Scan forward for the first character that has any other case. A character
that belongs to a caseless set (more than one other case) is handed back on its
own, with the set offset as the return value. */

while (ch <= d)
  {
  unsigned int caseset = UCD_CASESET(ch);
  if (caseset != 0)
    {
    *ocptr = ch;        /* Character that has the set */
    *cptr = ch + 1;     /* Rest of input range */
    return (int)caseset;
    }
  other = UCD_OTHERCASE(ch);
  if (other != ch) break;
  ch++;
  }

/* The whole input range was scanned without finding anything. */

if (ch > d) return -1;

/* "ch" has exactly one other case. The othercase range continues for as long
as successive input characters have no caseless set and map to successive
othercase values; it stops at the end of the input range at the latest. */

*ocptr = other;
expected = other + 1;
ch++;

while (ch <= d && UCD_CASESET(ch) == 0 && UCD_OTHERCASE(ch) == expected)
  {
  expected++;
  ch++;
  }

*odptr = expected - 1;  /* End of othercase range */
*cptr = ch;             /* Rest of input range */
return 0;
}
4495 #endif /* SUPPORT_UNICODE */
4496
4497
4498
4499 /*************************************************
4500 * Add a character or range to a class (internal) *
4501 *************************************************/
4502
4503 /* This function packages up the logic of adding a character or range of
4504 characters to a class. The character values in the arguments will be within the
4505 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4506 called only from within the "add to class" group of functions, some of which
4507 are recursive and mutually recursive. The external entry point is
4508 add_to_class().
4509
4510 Arguments:
4511 classbits the bit map for characters < 256
4512 uchardptr points to the pointer for extra data
4513 options the options word
4514 cb compile data
4515 start start of range character
4516 end end of range character
4517
4518 Returns: the number of < 256 characters added
4519 the pointer to extra data is updated
4520 */
4521
static unsigned int
add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
  uint32_t options, compile_block *cb, uint32_t start, uint32_t end)
{
uint32_t c;
/* Upper limit for the bitmap loops below: "end" clamped to 0xff. */
uint32_t classbits_end = (end <= 0xff ? end : 0xff);
unsigned int n8 = 0;

/* If caseless matching is required, scan the range and process alternate
cases. In Unicode, there are 8-bit characters that have alternate cases that
are greater than 255 and vice-versa. Sometimes we can just extend the original
range. */

if ((options & PCRE2_CASELESS) != 0)
  {
#ifdef SUPPORT_UNICODE
  if ((options & PCRE2_UTF) != 0)
    {
    int rc;
    uint32_t oc, od;

    /* The caseless bit is cleared in the local copy of the options, so the
    recursive calls below add their ranges without doing any further case
    processing. */

    options &= ~PCRE2_CASELESS;   /* Remove for recursive calls */
    c = start;

    /* Each call yields the next othercase range (oc..od) or a caseless set
    offset (rc > 0), and advances c; a negative value ends the scan. */

    while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
      {
      /* Handle a single character that has more than one other case. */

      if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr, options, cb,
        PRIV(ucd_caseless_sets) + rc, oc);

      /* Do nothing if the other case range is within the original range. */

      else if (oc >= cb->class_range_start && od <= cb->class_range_end) continue;

      /* Extend the original range if there is overlap, noting that if oc < c, we
      can't have od > end because a subrange is always shorter than the basic
      range. Otherwise, use a recursive call to add the additional range. */

      else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
      else if (od > end && oc <= end + 1)
        {
        end = od;       /* Extend upwards */
        /* Keep the bitmap limit in step with the extended end, still capped
        at 0xff. */
        if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
        }
      else n8 += add_to_class_internal(classbits, uchardptr, options, cb, oc, od);
      }
    }
  else
#endif  /* SUPPORT_UNICODE */

  /* Not UTF mode. When SUPPORT_UNICODE is defined, this loop is the "else"
  branch of the UTF test above; otherwise it runs for every caseless call. For
  each character of the range that is below 256, the bit for its entry in
  cb->fcc is set (presumably the case-flipping table - confirm against the
  definition of compile_block). */

  for (c = start; c <= classbits_end; c++)
    {
    SETBIT(classbits, cb->fcc[c]);
    n8++;
    }
  }

/* Now handle the originally supplied range. Adjust the final value according
to the bit length - this means that the same lists of (e.g.) horizontal spaces
can be used in all cases. */

if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
  end = MAX_NON_UTF_CHAR;

/* Nothing more is added when this range lies strictly inside the overall range
that the external entry points recorded in cb->class_range_start and
cb->class_range_end; only the count accumulated so far is returned. */

if (start > cb->class_range_start && end < cb->class_range_end) return n8;

/* Use the bitmap for characters < 256. Otherwise use extra data.*/

for (c = start; c <= classbits_end; c++)
  {
  /* Regardless of start, c will always be <= 255. */
  SETBIT(classbits, c);
  n8++;
  }

#ifdef SUPPORT_WIDE_CHARS
/* Characters up to 0xff have been dealt with by the bitmap loop, so only the
part of the range above 0xff (if any) is written to the extra data. */

if (start <= 0xff) start = 0xff + 1;

if (end >= start)
  {
  PCRE2_UCHAR *uchardata = *uchardptr;

#ifdef SUPPORT_UNICODE
  /* In UTF mode the character values are written using PRIV(ord2utf), whose
  return value is used to advance the output pointer. */

  if ((options & PCRE2_UTF) != 0)
    {
    if (start < end)
      {
      *uchardata++ = XCL_RANGE;
      uchardata += PRIV(ord2utf)(start, uchardata);
      uchardata += PRIV(ord2utf)(end, uchardata);
      }
    else if (start == end)
      {
      *uchardata++ = XCL_SINGLE;
      uchardata += PRIV(ord2utf)(start, uchardata);
      }
    }
  else
#endif  /* SUPPORT_UNICODE */

  /* Without UTF support, character values are constrained by the bit length,
  and can only be > 256 for 16-bit and 32-bit libraries. In the 8-bit library
  the empty block below serves as the (possibly "else") statement, so nothing
  is written. */

#if PCRE2_CODE_UNIT_WIDTH == 8
    {}
#else
  if (start < end)
    {
    *uchardata++ = XCL_RANGE;
    *uchardata++ = start;
    *uchardata++ = end;
    }
  else if (start == end)
    {
    *uchardata++ = XCL_SINGLE;
    *uchardata++ = start;
    }
#endif  /* PCRE2_CODE_UNIT_WIDTH == 8 */
  *uchardptr = uchardata;   /* Update extra data pointer */
  }
#else  /* SUPPORT_WIDE_CHARS */
  (void)uchardptr;          /* Avoid compiler warning */
#endif /* SUPPORT_WIDE_CHARS */

return n8;    /* Number of 8-bit characters */
}
4651
4652
4653
4654 #ifdef SUPPORT_UNICODE
4655 /*************************************************
4656 * Add a list of characters to a class (internal) *
4657 *************************************************/
4658
4659 /* This function is used for adding a list of case-equivalent characters to a
4660 class when in UTF mode. This function is called only from within
4661 add_to_class_internal(), with which it is mutually recursive.
4662
4663 Arguments:
4664 classbits the bit map for characters < 256
4665 uchardptr points to the pointer for extra data
4666 options the options word
4667 cb contains pointers to tables etc.
4668 p points to row of 32-bit values, terminated by NOTACHAR
4669 except character to omit; this is used when adding lists of
4670 case-equivalent characters to avoid including the one we
4671 already know about
4672
4673 Returns: the number of < 256 characters added
4674 the pointer to extra data is updated
4675 */
4676
4677 static unsigned int
add_list_to_class_internal(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p,unsigned int except)4678 add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
4679 uint32_t options, compile_block *cb, const uint32_t *p, unsigned int except)
4680 {
4681 unsigned int n8 = 0;
4682 while (p[0] < NOTACHAR)
4683 {
4684 unsigned int n = 0;
4685 if (p[0] != except)
4686 {
4687 while(p[n+1] == p[0] + n + 1) n++;
4688 n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
4689 }
4690 p += n + 1;
4691 }
4692 return n8;
4693 }
4694 #endif
4695
4696
4697
4698 /*************************************************
4699 * External entry point for add range to class *
4700 *************************************************/
4701
4702 /* This function sets the overall range so that the internal functions can try
4703 to avoid duplication when handling case-independence.
4704
4705 Arguments:
4706 classbits the bit map for characters < 256
4707 uchardptr points to the pointer for extra data
4708 options the options word
4709 cb compile data
4710 start start of range character
4711 end end of range character
4712
4713 Returns: the number of < 256 characters added
4714 the pointer to extra data is updated
4715 */
4716
4717 static unsigned int
add_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,uint32_t start,uint32_t end)4718 add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
4719 compile_block *cb, uint32_t start, uint32_t end)
4720 {
4721 cb->class_range_start = start;
4722 cb->class_range_end = end;
4723 return add_to_class_internal(classbits, uchardptr, options, cb, start, end);
4724 }
4725
4726
4727 /*************************************************
4728 * External entry point for add list to class *
4729 *************************************************/
4730
4731 /* This function is used for adding a list of horizontal or vertical whitespace
4732 characters to a class. The list must be in order so that ranges of characters
4733 can be detected and handled appropriately. This function sets the overall range
4734 so that the internal functions can try to avoid duplication when handling
4735 case-independence.
4736
4737 Arguments:
4738 classbits the bit map for characters < 256
4739 uchardptr points to the pointer for extra data
4740 options the options word
4741 cb contains pointers to tables etc.
4742 p points to row of 32-bit values, terminated by NOTACHAR
4743 except character to omit; this is used when adding lists of
4744 case-equivalent characters to avoid including the one we
4745 already know about
4746
4747 Returns: the number of < 256 characters added
4748 the pointer to extra data is updated
4749 */
4750
4751 static unsigned int
add_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p,unsigned int except)4752 add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
4753 compile_block *cb, const uint32_t *p, unsigned int except)
4754 {
4755 unsigned int n8 = 0;
4756 while (p[0] < NOTACHAR)
4757 {
4758 unsigned int n = 0;
4759 if (p[0] != except)
4760 {
4761 while(p[n+1] == p[0] + n + 1) n++;
4762 cb->class_range_start = p[0];
4763 cb->class_range_end = p[n];
4764 n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
4765 }
4766 p += n + 1;
4767 }
4768 return n8;
4769 }
4770
4771
4772
4773 /*************************************************
4774 * Add characters not in a list to a class *
4775 *************************************************/
4776
4777 /* This function is used for adding the complement of a list of horizontal or
4778 vertical whitespace to a class. The list must be in order.
4779
4780 Arguments:
4781 classbits the bit map for characters < 256
4782 uchardptr points to the pointer for extra data
4783 options the options word
4784 cb contains pointers to tables etc.
4785 p points to row of 32-bit values, terminated by NOTACHAR
4786
4787 Returns: the number of < 256 characters added
4788 the pointer to extra data is updated
4789 */
4790
4791 static unsigned int
add_not_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p)4792 add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
4793 uint32_t options, compile_block *cb, const uint32_t *p)
4794 {
4795 BOOL utf = (options & PCRE2_UTF) != 0;
4796 unsigned int n8 = 0;
4797 if (p[0] > 0)
4798 n8 += add_to_class(classbits, uchardptr, options, cb, 0, p[0] - 1);
4799 while (p[0] < NOTACHAR)
4800 {
4801 while (p[1] == p[0] + 1) p++;
4802 n8 += add_to_class(classbits, uchardptr, options, cb, p[0] + 1,
4803 (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
4804 p++;
4805 }
4806 return n8;
4807 }
4808
4809
4810
4811 /*************************************************
4812 * Find details of duplicate group names *
4813 *************************************************/
4814
4815 /* This is called from compile_branch() when it needs to know the index and
4816 count of duplicates in the names table when processing named backreferences,
4817 either directly, or as conditions.
4818
4819 Arguments:
4820 name points to the name
4821 length the length of the name
4822 indexptr where to put the index
4823 countptr where to put the count of duplicates
4824 errorcodeptr where to put an error code
4825 cb the compile block
4826
4827 Returns: TRUE if OK, FALSE if not, error code set
4828 */
4829
4830 static BOOL
find_dupname_details(PCRE2_SPTR name,uint32_t length,int * indexptr,int * countptr,int * errorcodeptr,compile_block * cb)4831 find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr,
4832 int *countptr, int *errorcodeptr, compile_block *cb)
4833 {
4834 uint32_t i, groupnumber;
4835 int count;
4836 PCRE2_UCHAR *slot = cb->name_table;
4837
4838 /* Find the first entry in the table */
4839
4840 for (i = 0; i < cb->names_found; i++)
4841 {
4842 if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 &&
4843 slot[IMM2_SIZE+length] == 0) break;
4844 slot += cb->name_entry_size;
4845 }
4846
4847 /* This should not occur, because this function is called only when we know we
4848 have duplicate names. Give an internal error. */
4849
4850 if (i >= cb->names_found)
4851 {
4852 *errorcodeptr = ERR53;
4853 cb->erroroffset = name - cb->start_pattern;
4854 return FALSE;
4855 }
4856
4857 /* Record the index and then see how many duplicates there are, updating the
4858 backref map and maximum back reference as we do. */
4859
4860 *indexptr = i;
4861 count = 0;
4862
4863 for (;;)
4864 {
4865 count++;
4866 groupnumber = GET2(slot,0);
4867 cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
4868 if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
4869 if (++i >= cb->names_found) break;
4870 slot += cb->name_entry_size;
4871 if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 ||
4872 (slot+IMM2_SIZE)[length] != 0) break;
4873 }
4874
4875 *countptr = count;
4876 return TRUE;
4877 }
4878
4879
4880
4881 /*************************************************
4882 * Compile one branch *
4883 *************************************************/
4884
4885 /* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
4886 the options are changed during the branch, the pointer is used to change the
4887 external options bits. This function is used during the pre-compile phase when
4888 we are trying to find out the amount of memory needed, as well as during the
4889 real compile phase. The value of lengthptr distinguishes the two phases.
4890
4891 Arguments:
4892 optionsptr pointer to the option bits
4893 codeptr points to the pointer to the current code point
4894 pptrptr points to the current parsed pattern pointer
4895 errorcodeptr points to error code variable
4896 firstcuptr place to put the first required code unit
4897 firstcuflagsptr place to put the first code unit flags, or a negative number
4898 reqcuptr place to put the last required code unit
4899 reqcuflagsptr place to put the last required code unit flags, or a negative number
4900 bcptr points to current branch chain
4901 cb contains pointers to tables etc.
4902 lengthptr NULL during the real compile phase
4903 points to length accumulator during pre-compile phase
4904
4905 Returns: 0 There's been an error, *errorcodeptr is non-zero
4906 +1 Success, this branch must match at least one character
4907 -1 Success, this branch may match an empty string
4908 */
4909
4910 static int
compile_branch(uint32_t * optionsptr,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t * firstcuptr,int32_t * firstcuflagsptr,uint32_t * reqcuptr,int32_t * reqcuflagsptr,branch_chain * bcptr,compile_block * cb,PCRE2_SIZE * lengthptr)4911 compile_branch(uint32_t *optionsptr, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
4912 int *errorcodeptr, uint32_t *firstcuptr, int32_t *firstcuflagsptr,
4913 uint32_t *reqcuptr, int32_t *reqcuflagsptr, branch_chain *bcptr,
4914 compile_block *cb, PCRE2_SIZE *lengthptr)
4915 {
4916 int bravalue = 0;
4917 int okreturn = -1;
4918 int group_return = 0;
4919 uint32_t repeat_min = 0, repeat_max = 0; /* To please picky compilers */
4920 uint32_t greedy_default, greedy_non_default;
4921 uint32_t repeat_type, op_type;
4922 uint32_t options = *optionsptr; /* May change dynamically */
4923 uint32_t firstcu, reqcu;
4924 uint32_t zeroreqcu, zerofirstcu;
4925 uint32_t escape;
4926 uint32_t *pptr = *pptrptr;
4927 uint32_t meta, meta_arg;
4928 int32_t firstcuflags, reqcuflags;
4929 int32_t zeroreqcuflags, zerofirstcuflags;
4930 int32_t req_caseopt, reqvary, tempreqvary;
4931 PCRE2_SIZE offset = 0;
4932 PCRE2_SIZE length_prevgroup = 0;
4933 PCRE2_UCHAR *code = *codeptr;
4934 PCRE2_UCHAR *last_code = code;
4935 PCRE2_UCHAR *orig_code = code;
4936 PCRE2_UCHAR *tempcode;
4937 PCRE2_UCHAR *previous = NULL;
4938 PCRE2_UCHAR op_previous;
4939 BOOL groupsetfirstcu = FALSE;
4940 BOOL matched_char = FALSE;
4941 BOOL previous_matched_char = FALSE;
4942 const uint8_t *cbits = cb->cbits;
4943 uint8_t classbits[32];
4944
4945 /* We can fish out the UTF setting once and for all into a BOOL, but we must
4946 not do this for other options (e.g. PCRE2_EXTENDED) because they may change
4947 dynamically as we process the pattern. */
4948
4949 #ifdef SUPPORT_UNICODE
4950 BOOL utf = (options & PCRE2_UTF) != 0;
4951 #else /* No UTF support */
4952 BOOL utf = FALSE;
4953 #endif
4954
4955 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
4956 class_uchardata always so that it can be passed to add_to_class() always,
4957 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
4958 alternative calls for the different cases. */
4959
4960 PCRE2_UCHAR *class_uchardata;
4961 #ifdef SUPPORT_WIDE_CHARS
4962 BOOL xclass;
4963 PCRE2_UCHAR *class_uchardata_base;
4964 #endif
4965
4966 /* Set up the default and non-default settings for greediness */
4967
4968 greedy_default = ((options & PCRE2_UNGREEDY) != 0);
4969 greedy_non_default = greedy_default ^ 1;
4970
4971 /* Initialize no first unit, no required unit. REQ_UNSET means "no char
4972 matching encountered yet". It gets changed to REQ_NONE if we hit something that
4973 matches a non-fixed first unit; reqcu just remains unset if we never find one.
4974
4975 When we hit a repeat whose minimum is zero, we may have to adjust these values
4976 to take the zero repeat into account. This is implemented by setting them to
4977 zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
4978 item types that can be repeated set these backoff variables appropriately. */
4979
4980 firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
4981 firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
4982
4983 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
4984 according to the current setting of the caseless flag. The REQ_CASELESS value
4985 leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables
4986 to record the case status of the value. This is used only for ASCII characters.
4987 */
4988
4989 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS:0;
4990
4991 /* Switch on next META item until the end of the branch */
4992
4993 for (;; pptr++)
4994 {
4995 #ifdef SUPPORT_WIDE_CHARS
4996 BOOL xclass_has_prop;
4997 #endif
4998 BOOL negate_class;
4999 BOOL should_flip_negation;
5000 BOOL match_all_or_no_wide_chars;
5001 BOOL possessive_quantifier;
5002 BOOL note_group_empty;
5003 int class_has_8bitchar;
5004 int i;
5005 uint32_t mclength;
5006 uint32_t skipunits;
5007 uint32_t subreqcu, subfirstcu;
5008 uint32_t groupnumber;
5009 uint32_t verbarglen, verbculen;
5010 int32_t subreqcuflags, subfirstcuflags; /* Must be signed */
5011 open_capitem *oc;
5012 PCRE2_UCHAR mcbuffer[8];
5013
5014 /* Get next META item in the pattern and its potential argument. */
5015
5016 meta = META_CODE(*pptr);
5017 meta_arg = META_DATA(*pptr);
5018
5019 /* If we are in the pre-compile phase, accumulate the length used for the
5020 previous cycle of this loop, unless the next item is a quantifier. */
5021
5022 if (lengthptr != NULL)
5023 {
5024 if (code > cb->start_workspace + cb->workspace_size -
5025 WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
5026 {
5027 *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)?
5028 ERR52 : ERR86;
5029 return 0;
5030 }
5031
5032 /* There is at least one situation where code goes backwards: this is the
5033 case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
5034 is processed, the whole class is eliminated. However, it is created first,
5035 so we have to allow memory for it. Therefore, don't ever reduce the length
5036 at this point. */
5037
5038 if (code < last_code) code = last_code;
5039
5040 /* If the next thing is not a quantifier, we add the length of the previous
5041 item into the total, and reset the code pointer to the start of the
5042 workspace. Otherwise leave the previous item available to be quantified. */
5043
5044 if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5045 {
5046 if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
5047 {
5048 *errorcodeptr = ERR20; /* Integer overflow */
5049 return 0;
5050 }
5051 *lengthptr += (PCRE2_SIZE)(code - orig_code);
5052 if (*lengthptr > MAX_PATTERN_SIZE)
5053 {
5054 *errorcodeptr = ERR20; /* Pattern is too large */
5055 return 0;
5056 }
5057 code = orig_code;
5058 }
5059
5060 /* Remember where this code item starts so we can catch the "backwards"
5061 case above next time round. */
5062
5063 last_code = code;
5064 }
5065
5066 /* Process the next parsed pattern item. If it is not a quantifier, remember
5067 where it starts so that it can be quantified when a quantifier follows.
5068 Checking for the legality of quantifiers happens in parse_regex(), except for
5069 a quantifier after an assertion that is a condition. */
5070
5071 if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5072 {
5073 previous = code;
5074 if (matched_char) okreturn = 1;
5075 }
5076
5077 previous_matched_char = matched_char;
5078 matched_char = FALSE;
5079 note_group_empty = FALSE;
5080 skipunits = 0; /* Default value for most subgroups */
5081
5082 switch(meta)
5083 {
5084 /* ===================================================================*/
5085 /* The branch terminates at pattern end or | or ) */
5086
5087 case META_END:
5088 case META_ALT:
5089 case META_KET:
5090 *firstcuptr = firstcu;
5091 *firstcuflagsptr = firstcuflags;
5092 *reqcuptr = reqcu;
5093 *reqcuflagsptr = reqcuflags;
5094 *codeptr = code;
5095 *pptrptr = pptr;
5096 return okreturn;
5097
5098
5099 /* ===================================================================*/
5100 /* Handle single-character metacharacters. In multiline mode, ^ disables
5101 the setting of any following char as a first character. */
5102
5103 case META_CIRCUMFLEX:
5104 if ((options & PCRE2_MULTILINE) != 0)
5105 {
5106 if (firstcuflags == REQ_UNSET)
5107 zerofirstcuflags = firstcuflags = REQ_NONE;
5108 *code++ = OP_CIRCM;
5109 }
5110 else *code++ = OP_CIRC;
5111 break;
5112
5113 case META_DOLLAR:
5114 *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
5115 break;
5116
5117 /* There can never be a first char if '.' is first, whatever happens about
5118 repeats. The value of reqcu doesn't change either. */
5119
5120 case META_DOT:
5121 matched_char = TRUE;
5122 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5123 zerofirstcu = firstcu;
5124 zerofirstcuflags = firstcuflags;
5125 zeroreqcu = reqcu;
5126 zeroreqcuflags = reqcuflags;
5127 *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
5128 break;
5129
5130
5131 /* ===================================================================*/
5132 /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
5133 Otherwise, an initial ']' is taken as a data character. When empty classes
5134 are allowed, [] must always fail, so generate OP_FAIL, whereas [^] must
5135 match any character, so generate OP_ALLANY. */
5136
5137 case META_CLASS_EMPTY:
5138 case META_CLASS_EMPTY_NOT:
5139 matched_char = TRUE;
5140 *code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL;
5141 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5142 zerofirstcu = firstcu;
5143 zerofirstcuflags = firstcuflags;
5144 break;
5145
5146
5147 /* ===================================================================*/
5148 /* Non-empty character class. If the included characters are all < 256, we
5149 build a 32-byte bitmap of the permitted characters, except in the special
5150 case where there is only one such character. For negated classes, we build
5151 the map as usual, then invert it at the end. However, we use a different
5152 opcode so that data characters > 255 can be handled correctly.
5153
5154 If the class contains characters outside the 0-255 range, a different
5155 opcode is compiled. It may optionally have a bit map for characters < 256,
5156 but those above are explicitly listed afterwards. A flag code unit
5157 tells whether the bitmap is present, and whether this is a negated class or
5158 not. */
5159
5160 case META_CLASS_NOT:
5161 case META_CLASS:
5162 matched_char = TRUE;
5163 negate_class = meta == META_CLASS_NOT;
5164
5165 /* We can optimize the case of a single character in a class by generating
5166 OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
5167 negative. In the negative case there can be no first char if this item is
5168 first, whatever repeat count may follow. In the case of reqcu, save the
5169 previous value for reinstating. */
5170
5171 /* NOTE: at present this optimization is not effective if the only
5172 character in a class in 32-bit, non-UCP mode has its top bit set. */
5173
5174 if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
5175 {
5176 #ifdef SUPPORT_UNICODE
5177 uint32_t d;
5178 #endif
5179 uint32_t c = pptr[1];
5180
5181 pptr += 2; /* Move on to class end */
5182 if (meta == META_CLASS) /* A positive one-char class can be */
5183 { /* handled as a normal literal character. */
5184 meta = c; /* Set up the character */
5185 goto NORMAL_CHAR_SET;
5186 }
5187
5188 /* Handle a negative one-character class */
5189
5190 zeroreqcu = reqcu;
5191 zeroreqcuflags = reqcuflags;
5192 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5193 zerofirstcu = firstcu;
5194 zerofirstcuflags = firstcuflags;
5195
5196 /* For caseless UTF mode, check whether this character has more than
5197 one other case. If so, generate a special OP_NOTPROP item instead of
5198 OP_NOTI. */
5199
5200 #ifdef SUPPORT_UNICODE
5201 if (utf && (options & PCRE2_CASELESS) != 0 &&
5202 (d = UCD_CASESET(c)) != 0)
5203 {
5204 *code++ = OP_NOTPROP;
5205 *code++ = PT_CLIST;
5206 *code++ = d;
5207 break; /* We are finished with this class */
5208 }
5209 #endif
5210 /* Char has only one other case, or UCP not available */
5211
5212 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
5213 code += PUTCHAR(c, code);
5214 break; /* We are finished with this class */
5215 } /* End of 1-char optimization */
5216
5217 /* Handle character classes that contain more than just one literal
5218 character. */
5219
5220 /* If a non-extended class contains a negative special such as \S, we need
5221 to flip the negation flag at the end, so that support for characters > 255
5222 works correctly (they are all included in the class). An extended class may
5223 need to insert specific matching or non-matching code for wide characters.
5224 */
5225
5226 should_flip_negation = match_all_or_no_wide_chars = FALSE;
5227
5228 /* Extended class (xclass) will be used when characters > 255
5229 might match. */
5230
5231 #ifdef SUPPORT_WIDE_CHARS
5232 xclass = FALSE;
5233 class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
5234 class_uchardata_base = class_uchardata; /* Save the start */
5235 #endif
5236
5237 /* For optimization purposes, we track some properties of the class:
5238 class_has_8bitchar will be non-zero if the class contains at least one
5239 character with a code point less than 256; xclass_has_prop will be TRUE if
5240 Unicode property checks are present in the class. */
5241
5242 class_has_8bitchar = 0;
5243 #ifdef SUPPORT_WIDE_CHARS
5244 xclass_has_prop = FALSE;
5245 #endif
5246
5247 /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
5248 in a temporary bit of memory, in case the class contains fewer than two
5249 8-bit characters because in that case the compiled code doesn't use the bit
5250 map. */
5251
5252 memset(classbits, 0, 32 * sizeof(uint8_t));
5253
5254 /* Process items until META_CLASS_END is reached. */
5255
5256 while ((meta = *(++pptr)) != META_CLASS_END)
5257 {
5258 /* Handle POSIX classes such as [:alpha:] etc. */
5259
5260 if (meta == META_POSIX || meta == META_POSIX_NEG)
5261 {
5262 BOOL local_negate = (meta == META_POSIX_NEG);
5263 int posix_class = *(++pptr);
5264 int taboffset, tabopt;
5265 uint8_t pbits[32];
5266
5267 should_flip_negation = local_negate; /* Note negative special */
5268
5269 /* If matching is caseless, upper and lower are converted to alpha.
5270 This relies on the fact that the class table starts with alpha,
5271 lower, upper as the first 3 entries. */
5272
5273 if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
5274 posix_class = 0;
5275
5276 /* When PCRE2_UCP is set, some of the POSIX classes are converted to
5277 different escape sequences that use Unicode properties \p or \P.
5278 Others that are not available via \p or \P have to generate
5279 XCL_PROP/XCL_NOTPROP directly, which is done here. */
5280
5281 #ifdef SUPPORT_UNICODE
5282 if ((options & PCRE2_UCP) != 0) switch(posix_class)
5283 {
5284 case PC_GRAPH:
5285 case PC_PRINT:
5286 case PC_PUNCT:
5287 *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
5288 *class_uchardata++ = (PCRE2_UCHAR)
5289 ((posix_class == PC_GRAPH)? PT_PXGRAPH :
5290 (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT);
5291 *class_uchardata++ = 0;
5292 xclass_has_prop = TRUE;
5293 goto CONTINUE_CLASS;
5294
5295 /* For the other POSIX classes (ascii, xdigit) we are going to
5296 fall through to the non-UCP case and build a bit map for
5297 characters with code points less than 256. However, if we are in
5298 a negated POSIX class, characters with code points greater than
5299 255 must either all match or all not match, depending on whether
5300 the whole class is not or is negated. For example, for
5301 [[:^ascii:]... they must all match, whereas for [^[:^xdigit:]...
5302 they must not.
5303
5304 In the special case where there are no xclass items, this is
5305 automatically handled by the use of OP_CLASS or OP_NCLASS, but an
5306 explicit range is needed for OP_XCLASS. Setting a flag here
5307 causes the range to be generated later when it is known that
5308 OP_XCLASS is required. In the 8-bit library this is relevant only in
5309 utf mode, since no wide characters can exist otherwise. */
5310
5311 default:
5312 #if PCRE2_CODE_UNIT_WIDTH == 8
5313 if (utf)
5314 #endif
5315 match_all_or_no_wide_chars |= local_negate;
5316 break;
5317 }
5318 #endif /* SUPPORT_UNICODE */
5319
5320 /* In the non-UCP case, or when UCP makes no difference, we build the
5321 bit map for the POSIX class in a chunk of local store because we may
5322 be adding and subtracting from it, and we don't want to subtract bits
5323 that may be in the main map already. At the end we or the result into
5324 the bit map that is being built. */
5325
5326 posix_class *= 3;
5327
5328 /* Copy in the first table (always present) */
5329
5330 memcpy(pbits, cbits + posix_class_maps[posix_class],
5331 32 * sizeof(uint8_t));
5332
5333 /* If there is a second table, add or remove it as required. */
5334
5335 taboffset = posix_class_maps[posix_class + 1];
5336 tabopt = posix_class_maps[posix_class + 2];
5337
5338 if (taboffset >= 0)
5339 {
5340 if (tabopt >= 0)
5341 for (i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset];
5342 else
5343 for (i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset];
5344 }
5345
5346 /* Now see if we need to remove any special characters. An option
5347 value of 1 removes vertical space and 2 removes underscore. */
5348
5349 if (tabopt < 0) tabopt = -tabopt;
5350 if (tabopt == 1) pbits[1] &= ~0x3c;
5351 else if (tabopt == 2) pbits[11] &= 0x7f;
5352
5353 /* Add the POSIX table or its complement into the main table that is
5354 being built and we are done. */
5355
5356 if (local_negate)
5357 for (i = 0; i < 32; i++) classbits[i] |= ~pbits[i];
5358 else
5359 for (i = 0; i < 32; i++) classbits[i] |= pbits[i];
5360
5361 /* Every class contains at least one < 256 character. */
5362
5363 class_has_8bitchar = 1;
5364 goto CONTINUE_CLASS; /* End of POSIX handling */
5365 }
5366
5367 /* Other than POSIX classes, the only items we should encounter are
5368 \d-type escapes and literal characters (possibly as ranges). */
5369
5370 if (meta == META_BIGVALUE)
5371 {
5372 meta = *(++pptr);
5373 goto CLASS_LITERAL;
5374 }
5375
5376 /* Any other non-literal must be an escape */
5377
5378 if (meta >= META_END)
5379 {
5380 if (META_CODE(meta) != META_ESCAPE)
5381 {
5382 #ifdef DEBUG_SHOW_PARSED
5383 fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x "
5384 "in character class\n", meta);
5385 #endif
5386 *errorcodeptr = ERR89; /* Internal error - unrecognized. */
5387 return 0;
5388 }
5389 escape = META_DATA(meta);
5390
5391 /* Every class contains at least one < 256 character. */
5392
5393 class_has_8bitchar++;
5394
5395 switch(escape)
5396 {
5397 case ESC_d:
5398 for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
5399 break;
5400
5401 case ESC_D:
5402 should_flip_negation = TRUE;
5403 for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_digit];
5404 break;
5405
5406 case ESC_w:
5407 for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
5408 break;
5409
5410 case ESC_W:
5411 should_flip_negation = TRUE;
5412 for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_word];
5413 break;
5414
5415 /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5416 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5417 previously set by something earlier in the character class.
5418 Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5419 we could just adjust the appropriate bit. From PCRE 8.34 we no
5420 longer treat \s and \S specially. */
5421
5422 case ESC_s:
5423 for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
5424 break;
5425
5426 case ESC_S:
5427 should_flip_negation = TRUE;
5428 for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_space];
5429 break;
5430
5431 /* When adding the horizontal or vertical space lists to a class, or
5432 their complements, disable PCRE2_CASELESS, because it just wastes
5433 time, and in the "not-x" UTF cases can create unwanted duplicates in
5434 the XCLASS list (provoked by characters that have more than one other
5435 case and by both cases being in the same "not-x" sublist). */
5436
5437 case ESC_h:
5438 (void)add_list_to_class(classbits, &class_uchardata,
5439 options & ~PCRE2_CASELESS, cb, PRIV(hspace_list), NOTACHAR);
5440 break;
5441
5442 case ESC_H:
5443 (void)add_not_list_to_class(classbits, &class_uchardata,
5444 options & ~PCRE2_CASELESS, cb, PRIV(hspace_list));
5445 break;
5446
5447 case ESC_v:
5448 (void)add_list_to_class(classbits, &class_uchardata,
5449 options & ~PCRE2_CASELESS, cb, PRIV(vspace_list), NOTACHAR);
5450 break;
5451
5452 case ESC_V:
5453 (void)add_not_list_to_class(classbits, &class_uchardata,
5454 options & ~PCRE2_CASELESS, cb, PRIV(vspace_list));
5455 break;
5456
5457 /* If Unicode is not supported, \P and \p are not allowed and are
5458 faulted at parse time, so will never appear here. */
5459
5460 #ifdef SUPPORT_UNICODE
5461 case ESC_p:
5462 case ESC_P:
5463 {
5464 uint32_t ptype = *(++pptr) >> 16;
5465 uint32_t pdata = *pptr & 0xffff;
5466 *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
5467 *class_uchardata++ = ptype;
5468 *class_uchardata++ = pdata;
5469 xclass_has_prop = TRUE;
5470 class_has_8bitchar--; /* Undo! */
5471 }
5472 break;
5473 #endif
5474 }
5475
5476 goto CONTINUE_CLASS;
5477 } /* End handling \d-type escapes */
5478
5479 /* A literal character may be followed by a range meta. At parse time
5480 there are checks for out-of-order characters, for ranges where the two
5481 characters are equal, and for hyphens that cannot indicate a range. At
5482 this point, therefore, no checking is needed. */
5483
5484 else
5485 {
5486 uint32_t c, d;
5487
5488 CLASS_LITERAL:
5489 c = d = meta;
5490
5491 /* Remember if \r or \n were explicitly used */
5492
5493 if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
5494
5495 /* Process a character range */
5496
5497 if (pptr[1] == META_RANGE_LITERAL || pptr[1] == META_RANGE_ESCAPED)
5498 {
5499 #ifdef EBCDIC
5500 BOOL range_is_literal = (pptr[1] == META_RANGE_LITERAL);
5501 #endif
5502 pptr += 2;
5503 d = *pptr;
5504 if (d == META_BIGVALUE) d = *(++pptr);
5505
5506 /* Remember an explicit \r or \n, and add the range to the class. */
5507
5508 if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
5509
5510 /* In an EBCDIC environment, Perl treats alphabetic ranges specially
5511 because there are holes in the encoding, and simply using the range
5512 A-Z (for example) would include the characters in the holes. This
5513 applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
5514
5515 #ifdef EBCDIC
5516 if (range_is_literal &&
5517 (cb->ctypes[c] & ctype_letter) != 0 &&
5518 (cb->ctypes[d] & ctype_letter) != 0 &&
5519 (d <= CHAR_z) == (d <= CHAR_z))
5520 {
5521 uint32_t uc = (d <= CHAR_z)? 0 : 64;
5522 uint32_t C = d - uc;
5523 uint32_t D = d - uc;
5524
5525 if (C <= CHAR_i)
5526 {
5527 class_has_8bitchar +=
5528 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
5529 ((D < CHAR_i)? D : CHAR_i) + uc);
5530 C = CHAR_j;
5531 }
5532
5533 if (C <= D && C <= CHAR_r)
5534 {
5535 class_has_8bitchar +=
5536 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
5537 ((D < CHAR_r)? D : CHAR_r) + uc);
5538 C = CHAR_s;
5539 }
5540
5541 if (C <= D)
5542 {
5543 class_has_8bitchar +=
5544 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
5545 D + uc);
5546 }
5547 }
5548 else
5549 #endif
5550 /* Not an EBCDIC special range */
5551
5552 class_has_8bitchar +=
5553 add_to_class(classbits, &class_uchardata, options, cb, c, d);
5554 goto CONTINUE_CLASS; /* Go get the next char in the class */
5555 } /* End of range handling */
5556
5557
5558 /* Handle a single character. */
5559
5560 class_has_8bitchar +=
5561 add_to_class(classbits, &class_uchardata, options, cb, meta, meta);
5562 }
5563
5564 /* Continue to the next item in the class. */
5565
5566 CONTINUE_CLASS:
5567
5568 #ifdef SUPPORT_WIDE_CHARS
5569 /* If any wide characters or Unicode properties have been encountered,
5570 set xclass = TRUE. Then, in the pre-compile phase, accumulate the length
5571 of the extra data and reset the pointer. This is so that very large
5572 classes that contain a zillion wide characters or Unicode property tests
5573 do not overwrite the workspace (which is on the stack). */
5574
5575 if (class_uchardata > class_uchardata_base)
5576 {
5577 xclass = TRUE;
5578 if (lengthptr != NULL)
5579 {
5580 *lengthptr += class_uchardata - class_uchardata_base;
5581 class_uchardata = class_uchardata_base;
5582 }
5583 }
5584 #endif
5585
5586 continue; /* Needed to avoid error when not supporting wide chars */
5587 } /* End of main class-processing loop */
5588
5589 /* If this class is the first thing in the branch, there can be no first
5590 char setting, whatever the repeat count. Any reqcu setting must remain
5591 unchanged after any kind of repeat. */
5592
5593 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5594 zerofirstcu = firstcu;
5595 zerofirstcuflags = firstcuflags;
5596 zeroreqcu = reqcu;
5597 zeroreqcuflags = reqcuflags;
5598
5599 /* If there are characters with values > 255, or Unicode property settings
5600 (\p or \P), we have to compile an extended class, with its own opcode,
5601 unless there were no property settings and there was a negated special such
5602 as \S in the class, and PCRE2_UCP is not set, because in that case all
5603 characters > 255 are in or not in the class, so any that were explicitly
5604 given as well can be ignored.
5605
5606 In the UCP case, if certain negated POSIX classes ([:^ascii:] or
5607 [:^xdigit:]) were present in a class, we either have to match or not match
5608 all wide characters (depending on whether the whole class is or is not
5609 negated). This requirement is indicated by match_all_or_no_wide_chars being
5610 true. We do this by including an explicit range, which works in both cases.
5611 This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
5612 cannot be any wide characters in 8-bit non-UTF mode.
5613
5614 When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit
5615 class where \S etc is present without PCRE2_UCP, causing an extended class
5616 to be compiled, we make sure that all characters > 255 are included by
5617 forcing match_all_or_no_wide_chars to be true.
5618
5619 If, when generating an xclass, there are no characters < 256, we can omit
5620 the bitmap in the actual compiled code. */
5621
5622 #ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */
5623 if (xclass && (
5624 #ifdef SUPPORT_UNICODE
5625 (options & PCRE2_UCP) != 0 ||
5626 #endif
5627 xclass_has_prop || !should_flip_negation))
5628 {
5629 if (match_all_or_no_wide_chars || (
5630 #if PCRE2_CODE_UNIT_WIDTH == 8
5631 utf &&
5632 #endif
5633 should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0))
5634 {
5635 *class_uchardata++ = XCL_RANGE;
5636 if (utf) /* Will always be utf in the 8-bit library */
5637 {
5638 class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
5639 class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
5640 }
5641 else /* Can only happen for the 16-bit & 32-bit libraries */
5642 {
5643 #if PCRE2_CODE_UNIT_WIDTH == 16
5644 *class_uchardata++ = 0x100;
5645 *class_uchardata++ = 0xffffu;
5646 #elif PCRE2_CODE_UNIT_WIDTH == 32
5647 *class_uchardata++ = 0x100;
5648 *class_uchardata++ = 0xffffffffu;
5649 #endif
5650 }
5651 }
5652 *class_uchardata++ = XCL_END; /* Marks the end of extra data */
5653 *code++ = OP_XCLASS;
5654 code += LINK_SIZE;
5655 *code = negate_class? XCL_NOT:0;
5656 if (xclass_has_prop) *code |= XCL_HASPROP;
5657
5658 /* If the map is required, move up the extra data to make room for it;
5659 otherwise just move the code pointer to the end of the extra data. */
5660
5661 if (class_has_8bitchar > 0)
5662 {
5663 *code++ |= XCL_MAP;
5664 (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
5665 CU2BYTES(class_uchardata - code));
5666 if (negate_class && !xclass_has_prop)
5667 for (i = 0; i < 32; i++) classbits[i] = ~classbits[i];
5668 memcpy(code, classbits, 32);
5669 code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
5670 }
5671 else code = class_uchardata;
5672
5673 /* Now fill in the complete length of the item */
5674
5675 PUT(previous, 1, (int)(code - previous));
5676 break; /* End of class handling */
5677 }
5678 #endif /* SUPPORT_WIDE_CHARS */
5679
5680 /* If there are no characters > 255, or they are all to be included or
5681 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5682 whole class was negated and whether there were negative specials such as \S
5683 (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5684 negating it if necessary. */
5685
5686 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5687 if (lengthptr == NULL) /* Save time in the pre-compile phase */
5688 {
5689 if (negate_class)
5690 for (i = 0; i < 32; i++) classbits[i] = ~classbits[i];
5691 memcpy(code, classbits, 32);
5692 }
5693 code += 32 / sizeof(PCRE2_UCHAR);
5694 break; /* End of class processing */
5695
5696
5697 /* ===================================================================*/
5698 /* Deal with (*VERB)s. */
5699
5700 /* Check for open captures before ACCEPT and close those that are within
5701 the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
5702 assertion. In the first pass, just accumulate the length required;
5703 otherwise hitting (*ACCEPT) inside many nested parentheses can cause
5704 workspace overflow. Do not set firstcu after *ACCEPT. */
5705
5706 case META_ACCEPT:
5707 cb->had_accept = TRUE;
5708 for (oc = cb->open_caps;
5709 oc != NULL && oc->assert_depth >= cb->assert_depth;
5710 oc = oc->next)
5711 {
5712 if (lengthptr != NULL)
5713 {
5714 *lengthptr += CU2BYTES(1) + IMM2_SIZE;
5715 }
5716 else
5717 {
5718 *code++ = OP_CLOSE;
5719 PUT2INC(code, 0, oc->number);
5720 }
5721 }
5722 *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
5723 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5724 break;
5725
5726 case META_PRUNE:
5727 case META_SKIP:
5728 cb->had_pruneorskip = TRUE;
5729 /* Fall through */
5730 case META_COMMIT:
5731 case META_FAIL:
5732 *code++ = verbops[(meta - META_MARK) >> 16];
5733 break;
5734
5735 case META_THEN:
5736 cb->external_flags |= PCRE2_HASTHEN;
5737 *code++ = OP_THEN;
5738 break;
5739
5740 /* Handle verbs with arguments. Arguments can be very long, especially in
5741 16- and 32-bit modes, and can overflow the workspace in the first pass.
5742 However, the argument length is constrained to be small enough to fit in
5743 one code unit. This check happens in parse_regex(). In the first pass,
5744 instead of putting the argument into memory, we just update the length
5745 counter and set up an empty argument. */
5746
5747 case META_THEN_ARG:
5748 cb->external_flags |= PCRE2_HASTHEN;
5749 goto VERB_ARG;
5750
5751 case META_PRUNE_ARG:
5752 case META_SKIP_ARG:
5753 cb->had_pruneorskip = TRUE;
5754 /* Fall through */
5755 case META_MARK:
5756 case META_COMMIT_ARG:
5757 VERB_ARG:
5758 *code++ = verbops[(meta - META_MARK) >> 16];
5759 /* The length is in characters. */
5760 verbarglen = *(++pptr);
5761 verbculen = 0;
5762 tempcode = code++;
5763 for (i = 0; i < (int)verbarglen; i++)
5764 {
5765 meta = *(++pptr);
5766 #ifdef SUPPORT_UNICODE
5767 if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
5768 #endif
5769 {
5770 mclength = 1;
5771 mcbuffer[0] = meta;
5772 }
5773 if (lengthptr != NULL) *lengthptr += mclength; else
5774 {
5775 memcpy(code, mcbuffer, CU2BYTES(mclength));
5776 code += mclength;
5777 verbculen += mclength;
5778 }
5779 }
5780
5781 *tempcode = verbculen; /* Fill in the code unit length */
5782 *code++ = 0; /* Terminating zero */
5783 break;
5784
5785
5786 /* ===================================================================*/
5787 /* Handle options change. The new setting must be passed back for use in
5788 subsequent branches. Reset the greedy defaults and the case value for
5789 firstcu and reqcu. */
5790
5791 case META_OPTIONS:
5792 *optionsptr = options = *(++pptr);
5793 greedy_default = ((options & PCRE2_UNGREEDY) != 0);
5794 greedy_non_default = greedy_default ^ 1;
5795 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
5796 break;
5797
5798
5799 /* ===================================================================*/
5800 /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
5801 because it could be a numerical check on recursion, or a name check on a
5802 group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
5803 we can handle it either way. We first try for a name; if not found, process
5804 the number. */
5805
5806 case META_COND_RNUMBER: /* (?(Rdigits) */
5807 case META_COND_NAME: /* (?(name) or (?('name') or (?(<name>) */
5808 case META_COND_RNAME: /* (?(R&name) - test for recursion */
5809 bravalue = OP_COND;
5810 {
5811 int count, index;
5812 PCRE2_SPTR name;
5813 named_group *ng = cb->named_groups;
5814 uint32_t length = *(++pptr);
5815
5816 GETPLUSOFFSET(offset, pptr);
5817 name = cb->start_pattern + offset;
5818
5819 /* In the first pass, the names generated in the pre-pass are available,
5820 but the main name table has not yet been created. Scan the list of names
5821 generated in the pre-pass in order to get a number and whether or not
5822 this name is duplicated. If it is not duplicated, we can handle it as a
5823 numerical group. */
5824
5825 for (i = 0; i < cb->names_found; i++, ng++)
5826 {
5827 if (length == ng->length &&
5828 PRIV(strncmp)(name, ng->name, length) == 0)
5829 {
5830 if (!ng->isdup)
5831 {
5832 code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
5833 PUT2(code, 2+LINK_SIZE, ng->number);
5834 if (ng->number > cb->top_backref) cb->top_backref = ng->number;
5835 skipunits = 1+IMM2_SIZE;
5836 goto GROUP_PROCESS_NOTE_EMPTY;
5837 }
5838 break; /* Found a duplicated name */
5839 }
5840 }
5841
5842 /* If the name was not found we have a bad reference, unless we are
5843 dealing with R<digits>, which is treated as a recursion test by number.
5844 */
5845
5846 if (i >= cb->names_found)
5847 {
5848 groupnumber = 0;
5849 if (meta == META_COND_RNUMBER)
5850 {
5851 for (i = 1; i < (int)length; i++)
5852 {
5853 groupnumber = groupnumber * 10 + name[i] - CHAR_0;
5854 if (groupnumber > MAX_GROUP_NUMBER)
5855 {
5856 *errorcodeptr = ERR61;
5857 cb->erroroffset = offset + i;
5858 return 0;
5859 }
5860 }
5861 }
5862
5863 if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
5864 {
5865 *errorcodeptr = ERR15;
5866 cb->erroroffset = offset;
5867 return 0;
5868 }
5869
5870 /* (?(Rdigits) treated as a recursion test by number. A value of
5871 zero (which is the result of both (?(R) and (?(R0)) means "any", and is
5872 translated into RREF_ANY (which is 0xffff). */
5873
5874 if (groupnumber == 0) groupnumber = RREF_ANY;
5875 code[1+LINK_SIZE] = OP_RREF;
5876 PUT2(code, 2+LINK_SIZE, groupnumber);
5877 skipunits = 1+IMM2_SIZE;
5878 goto GROUP_PROCESS_NOTE_EMPTY;
5879 }
5880
5881 /* A duplicated name was found. Note that if an R<digits> name is found
5882 (META_COND_RNUMBER), it is a reference test, not a recursion test. */
5883
5884 code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
5885
5886 /* We have a duplicated name. In the compile pass we have to search the
5887 main table in order to get the index and count values. */
5888
5889 count = 0; /* Values for first pass (avoids compiler warning) */
5890 index = 0;
5891 if (lengthptr == NULL && !find_dupname_details(name, length, &index,
5892 &count, errorcodeptr, cb)) return 0;
5893
5894 /* Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and
5895 insert appropriate data values. */
5896
5897 code[1+LINK_SIZE]++;
5898 skipunits = 1+2*IMM2_SIZE;
5899 PUT2(code, 2+LINK_SIZE, index);
5900 PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
5901 }
5902 goto GROUP_PROCESS_NOTE_EMPTY;
5903
5904 /* The DEFINE condition is always false. Its internal groups may never
5905 be called, so matched_char must remain false, hence the jump to
5906 GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
5907
5908 case META_COND_DEFINE:
5909 bravalue = OP_COND;
5910 GETPLUSOFFSET(offset, pptr);
5911 code[1+LINK_SIZE] = OP_DEFINE;
5912 skipunits = 1;
5913 goto GROUP_PROCESS;
5914
5915 /* Conditional test of a group's being set. */
5916
5917 case META_COND_NUMBER:
5918 bravalue = OP_COND;
5919 GETPLUSOFFSET(offset, pptr);
5920 groupnumber = *(++pptr);
5921 if (groupnumber > cb->bracount)
5922 {
5923 *errorcodeptr = ERR15;
5924 cb->erroroffset = offset;
5925 return 0;
5926 }
5927 if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
5928 offset -= 2; /* Point at initial ( for too many branches error */
5929 code[1+LINK_SIZE] = OP_CREF;
5930 skipunits = 1+IMM2_SIZE;
5931 PUT2(code, 2+LINK_SIZE, groupnumber);
5932 goto GROUP_PROCESS_NOTE_EMPTY;
5933
5934 /* Test for the PCRE2 version. */
5935
5936 case META_COND_VERSION:
5937 bravalue = OP_COND;
5938 if (pptr[1] > 0)
5939 code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
5940 (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
5941 OP_TRUE : OP_FALSE;
5942 else
5943 code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
5944 OP_TRUE : OP_FALSE;
5945 skipunits = 1;
5946 pptr += 3;
5947 goto GROUP_PROCESS_NOTE_EMPTY;
5948
5949 /* The condition is an assertion, possibly preceded by a callout. */
5950
5951 case META_COND_ASSERT:
5952 bravalue = OP_COND;
5953 goto GROUP_PROCESS_NOTE_EMPTY;
5954
5955
5956 /* ===================================================================*/
5957 /* Handle all kinds of nested bracketed groups. The non-capturing,
5958 non-conditional cases are here; others come to GROUP_PROCESS via goto. */
5959
5960 case META_LOOKAHEAD:
5961 bravalue = OP_ASSERT;
5962 cb->assert_depth += 1;
5963 goto GROUP_PROCESS;
5964
5965 /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
5966 thing to do, but Perl allows all assertions to be quantified, and when
5967 they contain capturing parentheses there may be a potential use for
5968 this feature. Not that that applies to a quantified (?!) but we allow
5969 it for uniformity. */
5970
5971 case META_LOOKAHEADNOT:
5972 if (pptr[1] == META_KET &&
5973 (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
5974 {
5975 *code++ = OP_FAIL;
5976 pptr++;
5977 }
5978 else
5979 {
5980 bravalue = OP_ASSERT_NOT;
5981 cb->assert_depth += 1;
5982 goto GROUP_PROCESS;
5983 }
5984 break;
5985
5986 case META_LOOKBEHIND:
5987 bravalue = OP_ASSERTBACK;
5988 cb->assert_depth += 1;
5989 goto GROUP_PROCESS;
5990
5991 case META_LOOKBEHINDNOT:
5992 bravalue = OP_ASSERTBACK_NOT;
5993 cb->assert_depth += 1;
5994 goto GROUP_PROCESS;
5995
5996 case META_ATOMIC:
5997 bravalue = OP_ONCE;
5998 goto GROUP_PROCESS_NOTE_EMPTY;
5999
6000 case META_NOCAPTURE:
6001 bravalue = OP_BRA;
6002 /* Fall through */
6003
6004 /* Process nested bracketed regex. The nesting depth is maintained for the
6005 benefit of the stackguard function. The test for too deep nesting is now
6006 done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6007 others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6008 note of whether or not they may match an empty string. */
6009
6010 GROUP_PROCESS_NOTE_EMPTY:
6011 note_group_empty = TRUE;
6012
6013 GROUP_PROCESS:
6014 cb->parens_depth += 1;
6015 *code = bravalue;
6016 pptr++;
6017 tempcode = code;
6018 tempreqvary = cb->req_varyopt; /* Save value before group */
6019 length_prevgroup = 0; /* Initialize for pre-compile phase */
6020
6021 if ((group_return =
6022 compile_regex(
6023 options, /* The option state */
6024 &tempcode, /* Where to put code (updated) */
6025 &pptr, /* Input pointer (updated) */
6026 errorcodeptr, /* Where to put an error message */
6027 skipunits, /* Skip over bracket number */
6028 &subfirstcu, /* For possible first char */
6029 &subfirstcuflags,
6030 &subreqcu, /* For possible last char */
6031 &subreqcuflags,
6032 bcptr, /* Current branch chain */
6033 cb, /* Compile data block */
6034 (lengthptr == NULL)? NULL : /* Actual compile phase */
6035 &length_prevgroup /* Pre-compile phase */
6036 )) == 0)
6037 return 0; /* Error */
6038
6039 cb->parens_depth -= 1;
6040
6041 /* If that was a non-conditional significant group (not an assertion, not a
6042 DEFINE) that matches at least one character, then the current item matches
6043 a character. Conditionals are handled below. */
6044
6045 if (note_group_empty && bravalue != OP_COND && group_return > 0)
6046 matched_char = TRUE;
6047
6048 /* If we've just compiled an assertion, pop the assert depth. */
6049
6050 if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
6051 cb->assert_depth -= 1;
6052
6053 /* At the end of compiling, code is still pointing to the start of the
6054 group, while tempcode has been updated to point past the end of the group.
6055 The parsed pattern pointer (pptr) is on the closing META_KET.
6056
6057 If this is a conditional bracket, check that there are no more than
6058 two branches in the group, or just one if it's a DEFINE group. We do this
6059 in the real compile phase, not in the pre-pass, where the whole group may
6060 not be available. */
6061
6062 if (bravalue == OP_COND && lengthptr == NULL)
6063 {
6064 PCRE2_UCHAR *tc = code;
6065 int condcount = 0;
6066
6067 do {
6068 condcount++;
6069 tc += GET(tc,1);
6070 }
6071 while (*tc != OP_KET);
6072
6073 /* A DEFINE group is never obeyed inline (the "condition" is always
6074 false). It must have only one branch. Having checked this, change the
6075 opcode to OP_FALSE. */
6076
6077 if (code[LINK_SIZE+1] == OP_DEFINE)
6078 {
6079 if (condcount > 1)
6080 {
6081 cb->erroroffset = offset;
6082 *errorcodeptr = ERR54;
6083 return 0;
6084 }
6085 code[LINK_SIZE+1] = OP_FALSE;
6086 bravalue = OP_DEFINE; /* A flag to suppress char handling below */
6087 }
6088
6089 /* A "normal" conditional group. If there is just one branch, we must not
6090 make use of its firstcu or reqcu, because this is equivalent to an
6091 empty second branch. Also, it may match an empty string. If there are two
6092 branches, this item must match a character if the group must. */
6093
6094 else
6095 {
6096 if (condcount > 2)
6097 {
6098 cb->erroroffset = offset;
6099 *errorcodeptr = ERR27;
6100 return 0;
6101 }
6102 if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
6103 else if (group_return > 0) matched_char = TRUE;
6104 }
6105 }
6106
6107 /* In the pre-compile phase, update the length by the length of the group,
6108 less the brackets at either end. Then reduce the compiled code to just a
6109 set of non-capturing brackets so that it doesn't use much memory if it is
6110 duplicated by a quantifier.*/
6111
6112 if (lengthptr != NULL)
6113 {
6114 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6115 {
6116 *errorcodeptr = ERR20;
6117 return 0;
6118 }
6119 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6120 code++; /* This already contains bravalue */
6121 PUTINC(code, 0, 1 + LINK_SIZE);
6122 *code++ = OP_KET;
6123 PUTINC(code, 0, 1 + LINK_SIZE);
6124 break; /* No need to waste time with special character handling */
6125 }
6126
6127 /* Otherwise update the main code pointer to the end of the group. */
6128
6129 code = tempcode;
6130
6131 /* For a DEFINE group, required and first character settings are not
6132 relevant. */
6133
6134 if (bravalue == OP_DEFINE) break;
6135
6136 /* Handle updating of the required and first code units for other types of
6137 group. Update for normal brackets of all kinds, and conditions with two
6138 branches (see code above). If the bracket is followed by a quantifier with
6139 zero repeat, we have to back off. Hence the definition of zeroreqcu and
6140 zerofirstcu outside the main loop so that they can be accessed for the back
6141 off. */
6142
6143 zeroreqcu = reqcu;
6144 zeroreqcuflags = reqcuflags;
6145 zerofirstcu = firstcu;
6146 zerofirstcuflags = firstcuflags;
6147 groupsetfirstcu = FALSE;
6148
6149 if (bravalue >= OP_ONCE) /* Not an assertion */
6150 {
6151 /* If we have not yet set a firstcu in this branch, take it from the
6152 subpattern, remembering that it was set here so that a repeat of more
6153 than one can replicate it as reqcu if necessary. If the subpattern has
6154 no firstcu, set "none" for the whole branch. In both cases, a zero
6155 repeat forces firstcu to "none". */
6156
6157 if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
6158 {
6159 if (subfirstcuflags >= 0)
6160 {
6161 firstcu = subfirstcu;
6162 firstcuflags = subfirstcuflags;
6163 groupsetfirstcu = TRUE;
6164 }
6165 else firstcuflags = REQ_NONE;
6166 zerofirstcuflags = REQ_NONE;
6167 }
6168
6169 /* If firstcu was previously set, convert the subpattern's firstcu
6170 into reqcu if there wasn't one, using the vary flag that was in
6171 existence beforehand. */
6172
6173 else if (subfirstcuflags >= 0 && subreqcuflags < 0)
6174 {
6175 subreqcu = subfirstcu;
6176 subreqcuflags = subfirstcuflags | tempreqvary;
6177 }
6178
6179 /* If the subpattern set a required code unit (or set a first code unit
6180 that isn't really the first code unit - see above), set it. */
6181
6182 if (subreqcuflags >= 0)
6183 {
6184 reqcu = subreqcu;
6185 reqcuflags = subreqcuflags;
6186 }
6187 }
6188
6189 /* For a forward assertion, we take the reqcu, if set, provided that the
6190 group has also set a firstcu. This can be helpful if the pattern that
6191 follows the assertion doesn't set a different char. For example, it's
6192 useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
6193 because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
6194 the "real" "a" would then become a reqcu instead of a firstcu. This is
6195 overcome by a scan at the end if there's no firstcu, looking for an
6196 asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
6197 we must only take the reqcu when the group also set a firstcu. Otherwise,
6198 in that example, 'X' ends up set for both. */
6199
6200 else if (bravalue == OP_ASSERT && subreqcuflags >= 0 &&
6201 subfirstcuflags >= 0)
6202 {
6203 reqcu = subreqcu;
6204 reqcuflags = subreqcuflags;
6205 }
6206
6207 break; /* End of nested group handling */
6208
6209
6210 /* ===================================================================*/
6211 /* Handle named backreferences and recursions. */
6212
6213 case META_BACKREF_BYNAME:
6214 case META_RECURSE_BYNAME:
6215 {
6216 int count, index;
6217 PCRE2_SPTR name;
6218 BOOL is_dupname = FALSE;
6219 named_group *ng = cb->named_groups;
6220 uint32_t length = *(++pptr);
6221
6222 GETPLUSOFFSET(offset, pptr);
6223 name = cb->start_pattern + offset;
6224
6225 /* In the first pass, the names generated in the pre-pass are available,
6226 but the main name table has not yet been created. Scan the list of names
6227 generated in the pre-pass in order to get a number and whether or not
6228 this name is duplicated. */
6229
6230 groupnumber = 0;
6231 for (i = 0; i < cb->names_found; i++, ng++)
6232 {
6233 if (length == ng->length &&
6234 PRIV(strncmp)(name, ng->name, length) == 0)
6235 {
6236 is_dupname = ng->isdup;
6237 groupnumber = ng->number;
6238
6239 /* For a recursion, that's all that is needed. We can now go to
6240 the code above that handles numerical recursion, applying it to
6241 the first group with the given name. */
6242
6243 if (meta == META_RECURSE_BYNAME)
6244 {
6245 meta_arg = groupnumber;
6246 goto HANDLE_NUMERICAL_RECURSION;
6247 }
6248
6249 /* For a back reference, update the back reference map and the
6250 maximum back reference. Then, for each group, we must check to
6251 see if it is recursive, that is, it is inside the group that it
6252 references. A flag is set so that the group can be made atomic.
6253 */
6254
6255 cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
6256 if (groupnumber > cb->top_backref)
6257 cb->top_backref = groupnumber;
6258
6259 for (oc = cb->open_caps; oc != NULL; oc = oc->next)
6260 {
6261 if (oc->number == groupnumber)
6262 {
6263 oc->flag = TRUE;
6264 break;
6265 }
6266 }
6267 }
6268 }
6269
6270 /* If the name was not found we have a bad reference. */
6271
6272 if (groupnumber == 0)
6273 {
6274 *errorcodeptr = ERR15;
6275 cb->erroroffset = offset;
6276 return 0;
6277 }
6278
6279 /* If a back reference name is not duplicated, we can handle it as
6280 a numerical reference. */
6281
6282 if (!is_dupname)
6283 {
6284 meta_arg = groupnumber;
6285 goto HANDLE_SINGLE_REFERENCE;
6286 }
6287
6288 /* If a back reference name is duplicated, we generate a different
6289 opcode to a numerical back reference. In the second pass we must
6290 search for the index and count in the final name table. */
6291
6292 count = 0; /* Values for first pass (avoids compiler warning) */
6293 index = 0;
6294 if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6295 &count, errorcodeptr, cb)) return 0;
6296
6297 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6298 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
6299 PUT2INC(code, 0, index);
6300 PUT2INC(code, 0, count);
6301 }
6302 break;
6303
6304
6305 /* ===================================================================*/
6306 /* Handle a numerical callout. */
6307
6308 case META_CALLOUT_NUMBER:
6309 code[0] = OP_CALLOUT;
6310 PUT(code, 1, pptr[1]); /* Offset to next pattern item */
6311 PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */
6312 code[1 + 2*LINK_SIZE] = pptr[3];
6313 pptr += 3;
6314 code += PRIV(OP_lengths)[OP_CALLOUT];
6315 break;
6316
6317
6318 /* ===================================================================*/
6319 /* Handle a callout with a string argument. In the pre-pass we just compute
6320 the length without generating anything. The length in pptr[3] includes both
6321 delimiters; in the actual compile only the first one is copied, but a
6322 terminating zero is added. Any doubled delimiters within the string make
6323 this an overestimate, but it is not worth bothering about. */
6324
6325 case META_CALLOUT_STRING:
6326 if (lengthptr != NULL)
6327 {
6328 *lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
6329 pptr += 3;
6330 SKIPOFFSET(pptr);
6331 }
6332
6333 /* In the real compile we can copy the string. The starting delimiter is
6334 included so that the client can discover it if they want. We also pass the
6335 start offset to help a script language give better error messages. */
6336
6337 else
6338 {
6339 PCRE2_SPTR pp;
6340 uint32_t delimiter;
6341 uint32_t length = pptr[3];
6342 PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
6343
6344 code[0] = OP_CALLOUT_STR;
6345 PUT(code, 1, pptr[1]); /* Offset to next pattern item */
6346 PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */
6347
6348 pptr += 3;
6349 GETPLUSOFFSET(offset, pptr); /* Offset to string in pattern */
6350 pp = cb->start_pattern + offset;
6351 delimiter = *callout_string++ = *pp++;
6352 if (delimiter == CHAR_LEFT_CURLY_BRACKET)
6353 delimiter = CHAR_RIGHT_CURLY_BRACKET;
6354 PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1)); /* One after delimiter */
6355
6356 /* The syntax of the pattern was checked in the parsing scan. The length
6357 includes both delimiters, but we have passed the opening one just above,
6358 so we reduce length before testing it. The test is for > 1 because we do
6359 not want to copy the final delimiter. This also ensures that pp[1] is
6360 accessible. */
6361
6362 while (--length > 1)
6363 {
6364 if (*pp == delimiter && pp[1] == delimiter)
6365 {
6366 *callout_string++ = delimiter;
6367 pp += 2;
6368 length--;
6369 }
6370 else *callout_string++ = *pp++;
6371 }
6372 *callout_string++ = CHAR_NUL;
6373
6374 /* Set the length of the entire item, then advance to its end. */
6375
6376 PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
6377 code = callout_string;
6378 }
6379 break;
6380
6381
6382 /* ===================================================================*/
6383 /* Handle repetition. The different types are all sorted out in the parsing
6384 pass. */
6385
6386 case META_MINMAX_PLUS:
6387 case META_MINMAX_QUERY:
6388 case META_MINMAX:
6389 repeat_min = *(++pptr);
6390 repeat_max = *(++pptr);
6391 goto REPEAT;
6392
6393 case META_ASTERISK:
6394 case META_ASTERISK_PLUS:
6395 case META_ASTERISK_QUERY:
6396 repeat_min = 0;
6397 repeat_max = REPEAT_UNLIMITED;
6398 goto REPEAT;
6399
6400 case META_PLUS:
6401 case META_PLUS_PLUS:
6402 case META_PLUS_QUERY:
6403 repeat_min = 1;
6404 repeat_max = REPEAT_UNLIMITED;
6405 goto REPEAT;
6406
6407 case META_QUERY:
6408 case META_QUERY_PLUS:
6409 case META_QUERY_QUERY:
6410 repeat_min = 0;
6411 repeat_max = 1;
6412
6413 REPEAT:
6414 if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
6415
6416 /* Remember whether this is a variable length repeat, and default to
6417 single-char opcodes. */
6418
6419 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
6420 op_type = 0;
6421
6422 /* If the repeat is {1} we can ignore it. */
6423
6424 if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
6425
6426 /* Adjust first and required code units for a zero repeat. */
6427
6428 if (repeat_min == 0)
6429 {
6430 firstcu = zerofirstcu;
6431 firstcuflags = zerofirstcuflags;
6432 reqcu = zeroreqcu;
6433 reqcuflags = zeroreqcuflags;
6434 }
6435
6436 /* Note the greediness and possessiveness. */
6437
6438 switch (meta)
6439 {
6440 case META_MINMAX_PLUS:
6441 case META_ASTERISK_PLUS:
6442 case META_PLUS_PLUS:
6443 case META_QUERY_PLUS:
6444 repeat_type = 0; /* Force greedy */
6445 possessive_quantifier = TRUE;
6446 break;
6447
6448 case META_MINMAX_QUERY:
6449 case META_ASTERISK_QUERY:
6450 case META_PLUS_QUERY:
6451 case META_QUERY_QUERY:
6452 repeat_type = greedy_non_default;
6453 possessive_quantifier = FALSE;
6454 break;
6455
6456 default:
6457 repeat_type = greedy_default;
6458 possessive_quantifier = FALSE;
6459 break;
6460 }
6461
6462 /* Save start of previous item, in case we have to move it up in order to
6463 insert something before it, and remember what it was. */
6464
6465 tempcode = previous;
6466 op_previous = *previous;
6467
6468 /* Now handle repetition for the different types of item. */
6469
6470 switch (op_previous)
6471 {
6472 /* If previous was a character or negated character match, abolish the
6473 item and generate a repeat item instead. If a char item has a minimum of
6474 more than one, ensure that it is set in reqcu - it might not be if a
6475 sequence such as x{3} is the first thing in a branch because the x will
6476 have gone into firstcu instead. */
6477
6478 case OP_CHAR:
6479 case OP_CHARI:
6480 case OP_NOT:
6481 case OP_NOTI:
6482 op_type = chartypeoffset[op_previous - OP_CHAR];
6483
6484 /* Deal with UTF characters that take up more than one code unit. */
6485
6486 #ifdef MAYBE_UTF_MULTI
6487 if (utf && NOT_FIRSTCU(code[-1]))
6488 {
6489 PCRE2_UCHAR *lastchar = code - 1;
6490 BACKCHAR(lastchar);
6491 mclength = (uint32_t)(code - lastchar); /* Length of UTF character */
6492 memcpy(mcbuffer, lastchar, CU2BYTES(mclength)); /* Save the char */
6493 }
6494 else
6495 #endif /* MAYBE_UTF_MULTI */
6496
6497 /* Handle the case of a single code unit - either with no UTF support, or
6498 with UTF disabled, or for a single-code-unit UTF character. */
6499 {
6500 mcbuffer[0] = code[-1];
6501 mclength = 1;
6502 if (op_previous <= OP_CHARI && repeat_min > 1)
6503 {
6504 reqcu = mcbuffer[0];
6505 reqcuflags = req_caseopt | cb->req_varyopt;
6506 }
6507 }
6508 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
6509
6510 /* If previous was a character class or a back reference, we put the
6511 repeat stuff after it, but just skip the item if the repeat was {0,0}. */
6512
6513 #ifdef SUPPORT_WIDE_CHARS
6514 case OP_XCLASS:
6515 #endif
6516 case OP_CLASS:
6517 case OP_NCLASS:
6518 case OP_REF:
6519 case OP_REFI:
6520 case OP_DNREF:
6521 case OP_DNREFI:
6522
6523 if (repeat_max == 0)
6524 {
6525 code = previous;
6526 goto END_REPEAT;
6527 }
6528
6529 if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
6530 *code++ = OP_CRSTAR + repeat_type;
6531 else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
6532 *code++ = OP_CRPLUS + repeat_type;
6533 else if (repeat_min == 0 && repeat_max == 1)
6534 *code++ = OP_CRQUERY + repeat_type;
6535 else
6536 {
6537 *code++ = OP_CRRANGE + repeat_type;
6538 PUT2INC(code, 0, repeat_min);
6539 if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0; /* 2-byte encoding for max */
6540 PUT2INC(code, 0, repeat_max);
6541 }
6542 break;
6543
6544 /* If previous is OP_FAIL, it was generated by an empty class []
6545 (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be
6546 generated, that is by (*FAIL) or (?!), disallow a quantifier at parse
6547 time. We can just ignore this repeat. */
6548
6549 case OP_FAIL:
6550 goto END_REPEAT;
6551
6552 /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
6553 because pcre2_match() could not handle backtracking into recursively
6554 called groups. Now that this backtracking is available, we no longer need
6555 to do this. However, we still need to replicate recursions as we do for
6556 groups so as to have independent backtracking points. We can replicate
6557 for the minimum number of repeats directly. For optional repeats we now
6558 wrap the recursion in OP_BRA brackets and make use of the bracket
6559 repetition. */
6560
6561 case OP_RECURSE:
6562
6563 /* Generate unwrapped repeats for a non-zero minimum, except when the
6564 minimum is 1 and the maximum unlimited, because that can be handled with
6565 OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
6566 minimum, we just need to generate the appropriate additional copies.
6567 Otherwise we need to generate one more, to simulate the situation when
6568 the minimum is zero. */
6569
6570 if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
6571 {
6572 int replicate = repeat_min;
6573 if (repeat_min == repeat_max) replicate--;
6574
6575 /* In the pre-compile phase, we don't actually do the replication. We
6576 just adjust the length as if we had. Do some paranoid checks for
6577 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
6578 integer type when available, otherwise double. */
6579
6580 if (lengthptr != NULL)
6581 {
6582 PCRE2_SIZE delta = replicate*(1 + LINK_SIZE);
6583 if ((INT64_OR_DOUBLE)replicate*
6584 (INT64_OR_DOUBLE)(1 + LINK_SIZE) >
6585 (INT64_OR_DOUBLE)INT_MAX ||
6586 OFLOW_MAX - *lengthptr < delta)
6587 {
6588 *errorcodeptr = ERR20;
6589 return 0;
6590 }
6591 *lengthptr += delta;
6592 }
6593
6594 else for (i = 0; i < replicate; i++)
6595 {
6596 memcpy(code, previous, CU2BYTES(1 + LINK_SIZE));
6597 previous = code;
6598 code += 1 + LINK_SIZE;
6599 }
6600
6601 /* If the number of repeats is fixed, we are done. Otherwise, adjust
6602 the counts and fall through. */
6603
6604 if (repeat_min == repeat_max) break;
6605 if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
6606 repeat_min = 0;
6607 }
6608
6609 /* Wrap the recursion call in OP_BRA brackets. */
6610
6611 (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));
6612 op_previous = *previous = OP_BRA;
6613 PUT(previous, 1, 2 + 2*LINK_SIZE);
6614 previous[2 + 2*LINK_SIZE] = OP_KET;
6615 PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
6616 code += 2 + 2 * LINK_SIZE;
6617 length_prevgroup = 3 + 3*LINK_SIZE;
6618 group_return = -1; /* Set "may match empty string" */
6619
6620 /* Now treat as a repeated OP_BRA. */
6621 /* Fall through */
6622
6623 /* If previous was a bracket group, we may have to replicate it in
6624 certain cases. Note that at this point we can encounter only the "basic"
6625 bracket opcodes such as BRA and CBRA, as this is the place where they get
6626 converted into the more special varieties such as BRAPOS and SBRA.
6627 Originally, PCRE did not allow repetition of assertions, but now it does,
6628 for Perl compatibility. */
6629
6630 case OP_ASSERT:
6631 case OP_ASSERT_NOT:
6632 case OP_ASSERTBACK:
6633 case OP_ASSERTBACK_NOT:
6634 case OP_ONCE:
6635 case OP_BRA:
6636 case OP_CBRA:
6637 case OP_COND:
6638 {
6639 int len = (int)(code - previous);
6640 PCRE2_UCHAR *bralink = NULL;
6641 PCRE2_UCHAR *brazeroptr = NULL;
6642
6643 /* Repeating a DEFINE group (or any group where the condition is always
6644 FALSE and there is only one branch) is pointless, but Perl allows the
6645 syntax, so we just ignore the repeat. */
6646
6647 if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
6648 previous[GET(previous, 1)] != OP_ALT)
6649 goto END_REPEAT;
6650
6651 /* There is no sense in actually repeating assertions. The only
6652 potential use of repetition is in cases when the assertion is optional.
6653 Therefore, if the minimum is greater than zero, just ignore the repeat.
6654 If the maximum is not zero or one, set it to 1. */
6655
6656 if (op_previous < OP_ONCE) /* Assertion */
6657 {
6658 if (repeat_min > 0) goto END_REPEAT;
6659 if (repeat_max > 1) repeat_max = 1;
6660 }
6661
6662 /* The case of a zero minimum is special because of the need to stick
6663 OP_BRAZERO in front of it, and because the group appears once in the
6664 data, whereas in other cases it appears the minimum number of times. For
6665 this reason, it is simplest to treat this case separately, as otherwise
6666 the code gets far too messy. There are several special subcases when the
6667 minimum is zero. */
6668
6669 if (repeat_min == 0)
6670 {
6671 /* If the maximum is also zero, we used to just omit the group from
6672 the output altogether, like this:
6673
6674 ** if (repeat_max == 0)
6675 ** {
6676 ** code = previous;
6677 ** goto END_REPEAT;
6678 ** }
6679
6680 However, that fails when a group or a subgroup within it is
6681 referenced as a subroutine from elsewhere in the pattern, so now we
6682 stick in OP_SKIPZERO in front of it so that it is skipped on
6683 execution. As we don't have a list of which groups are referenced, we
6684 cannot do this selectively.
6685
6686 If the maximum is 1 or unlimited, we just have to stick in the
6687 BRAZERO and do no more at this point. */
6688
6689 if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
6690 {
6691 (void)memmove(previous + 1, previous, CU2BYTES(len));
6692 code++;
6693 if (repeat_max == 0)
6694 {
6695 *previous++ = OP_SKIPZERO;
6696 goto END_REPEAT;
6697 }
6698 brazeroptr = previous; /* Save for possessive optimizing */
6699 *previous++ = OP_BRAZERO + repeat_type;
6700 }
6701
6702 /* If the maximum is greater than 1 and limited, we have to replicate
6703 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
6704 The first one has to be handled carefully because it's the original
6705 copy, which has to be moved up. The remainder can be handled by code
6706 that is common with the non-zero minimum case below. We have to
6707 adjust the value of repeat_max, since one less copy is required. */
6708
6709 else
6710 {
6711 int linkoffset;
6712 (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
6713 code += 2 + LINK_SIZE;
6714 *previous++ = OP_BRAZERO + repeat_type;
6715 *previous++ = OP_BRA;
6716
6717 /* We chain together the bracket link offset fields that have to be
6718 filled in later when the ends of the brackets are reached. */
6719
6720 linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
6721 bralink = previous;
6722 PUTINC(previous, 0, linkoffset);
6723 }
6724
6725 if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
6726 }
6727
6728 /* If the minimum is greater than zero, replicate the group as many
6729 times as necessary, and adjust the maximum to the number of subsequent
6730 copies that we need. */
6731
6732 else
6733 {
6734 if (repeat_min > 1)
6735 {
6736 /* In the pre-compile phase, we don't actually do the replication.
6737 We just adjust the length as if we had. Do some paranoid checks for
6738 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
6739 integer type when available, otherwise double. */
6740
6741 if (lengthptr != NULL)
6742 {
6743 PCRE2_SIZE delta = (repeat_min - 1)*length_prevgroup;
6744 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
6745 (INT64_OR_DOUBLE)length_prevgroup >
6746 (INT64_OR_DOUBLE)INT_MAX ||
6747 OFLOW_MAX - *lengthptr < delta)
6748 {
6749 *errorcodeptr = ERR20;
6750 return 0;
6751 }
6752 *lengthptr += delta;
6753 }
6754
6755 /* This is compiling for real. If there is a set first code unit
6756 for the group, and we have not yet set a "required code unit", set
6757 it. */
6758
6759 else
6760 {
6761 if (groupsetfirstcu && reqcuflags < 0)
6762 {
6763 reqcu = firstcu;
6764 reqcuflags = firstcuflags;
6765 }
6766 for (i = 1; (uint32_t)i < repeat_min; i++)
6767 {
6768 memcpy(code, previous, CU2BYTES(len));
6769 code += len;
6770 }
6771 }
6772 }
6773
6774 if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
6775 }
6776
6777 /* This code is common to both the zero and non-zero minimum cases. If
6778 the maximum is limited, it replicates the group in a nested fashion,
6779 remembering the bracket starts on a stack. In the case of a zero
6780 minimum, the first one was set up above. In all cases the repeat_max
6781 now specifies the number of additional copies needed. Again, we must
6782 remember to replicate entries on the forward reference list. */
6783
6784 if (repeat_max != REPEAT_UNLIMITED)
6785 {
6786 /* In the pre-compile phase, we don't actually do the replication. We
6787 just adjust the length as if we had. For each repetition we must add
6788 1 to the length for BRAZERO and for all but the last repetition we
6789 must add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
6790 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type
6791 is a 64-bit integer type when available, otherwise double. */
6792
6793 if (lengthptr != NULL && repeat_max > 0)
6794 {
6795 PCRE2_SIZE delta = repeat_max*(length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
6796 2 - 2*LINK_SIZE; /* Last one doesn't nest */
6797 if ((INT64_OR_DOUBLE)repeat_max *
6798 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
6799 > (INT64_OR_DOUBLE)INT_MAX ||
6800 OFLOW_MAX - *lengthptr < delta)
6801 {
6802 *errorcodeptr = ERR20;
6803 return 0;
6804 }
6805 *lengthptr += delta;
6806 }
6807
6808 /* This is compiling for real */
6809
6810 else for (i = repeat_max - 1; i >= 0; i--)
6811 {
6812 *code++ = OP_BRAZERO + repeat_type;
6813
6814 /* All but the final copy start a new nesting, maintaining the
6815 chain of brackets outstanding. */
6816
6817 if (i != 0)
6818 {
6819 int linkoffset;
6820 *code++ = OP_BRA;
6821 linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
6822 bralink = code;
6823 PUTINC(code, 0, linkoffset);
6824 }
6825
6826 memcpy(code, previous, CU2BYTES(len));
6827 code += len;
6828 }
6829
6830 /* Now chain through the pending brackets, and fill in their length
6831 fields (which are holding the chain links pro tem). */
6832
6833 while (bralink != NULL)
6834 {
6835 int oldlinkoffset;
6836 int linkoffset = (int)(code - bralink + 1);
6837 PCRE2_UCHAR *bra = code - linkoffset;
6838 oldlinkoffset = GET(bra, 1);
6839 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
6840 *code++ = OP_KET;
6841 PUTINC(code, 0, linkoffset);
6842 PUT(bra, 1, linkoffset);
6843 }
6844 }
6845
6846 /* If the maximum is unlimited, set a repeater in the final copy. For
6847 ONCE brackets, that's all we need to do. However, possessively repeated
6848 ONCE brackets can be converted into non-capturing brackets, as the
6849 behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
6850 deal with possessive ONCEs specially.
6851
6852 Otherwise, when we are doing the actual compile phase, check to see
6853 whether this group is one that could match an empty string. If so,
6854 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
6855 that runtime checking can be done. [This check is also applied to ONCE
6856 groups at runtime, but in a different way.]
6857
6858 Then, if the quantifier was possessive and the bracket is not a
6859 conditional, we convert the BRA code to the POS form, and the KET code to
6860 KETRPOS. (It turns out to be convenient at runtime to detect this kind of
6861 subpattern at both the start and at the end.) The use of special opcodes
6862 makes it possible to reduce greatly the stack usage in pcre2_match(). If
6863 the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
6864
6865 Then, if the minimum number of matches is 1 or 0, cancel the possessive
6866 flag so that the default action below, of wrapping everything inside
6867 atomic brackets, does not happen. When the minimum is greater than 1,
6868 there will be earlier copies of the group, and so we still have to wrap
6869 the whole thing. */
6870
6871 else
6872 {
6873 PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
6874 PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
6875
6876 /* Convert possessive ONCE brackets to non-capturing */
6877
6878 if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
6879
6880 /* For non-possessive ONCE brackets, all we need to do is to
6881 set the KET. */
6882
6883 if (*bracode == OP_ONCE) *ketcode = OP_KETRMAX + repeat_type;
6884
6885 /* Handle non-ONCE brackets and possessive ONCEs (which have been
6886 converted to non-capturing above). */
6887
6888 else
6889 {
6890 /* In the compile phase, adjust the opcode if the group can match
6891 an empty string. For a conditional group with only one branch, the
6892 value of group_return will not show "could be empty", so we must
6893 check that separately. */
6894
6895 if (lengthptr == NULL)
6896 {
6897 if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
6898 if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
6899 *bracode = OP_SCOND;
6900 }
6901
6902 /* Handle possessive quantifiers. */
6903
6904 if (possessive_quantifier)
6905 {
6906 /* For COND brackets, we wrap the whole thing in a possessively
6907 repeated non-capturing bracket, because we have not invented POS
6908 versions of the COND opcodes. */
6909
6910 if (*bracode == OP_COND || *bracode == OP_SCOND)
6911 {
6912 int nlen = (int)(code - bracode);
6913 (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
6914 code += 1 + LINK_SIZE;
6915 nlen += 1 + LINK_SIZE;
6916 *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
6917 *code++ = OP_KETRPOS;
6918 PUTINC(code, 0, nlen);
6919 PUT(bracode, 1, nlen);
6920 }
6921
6922 /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
6923
6924 else
6925 {
6926 *bracode += 1; /* Switch to xxxPOS opcodes */
6927 *ketcode = OP_KETRPOS;
6928 }
6929
6930 /* If the minimum is zero, mark it as possessive, then unset the
6931 possessive flag when the minimum is 0 or 1. */
6932
6933 if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
6934 if (repeat_min < 2) possessive_quantifier = FALSE;
6935 }
6936
6937 /* Non-possessive quantifier */
6938
6939 else *ketcode = OP_KETRMAX + repeat_type;
6940 }
6941 }
6942 }
6943 break;
6944
6945 /* If previous was a character type match (\d or similar), abolish it and
6946 create a suitable repeat item. The code is shared with single-character
6947 repeats by setting op_type to add a suitable offset into repeat_type.
6948 Note that the Unicode property types will be present only when
6949 SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
6950 here because it just makes it horribly messy. */
6951
6952 default:
6953 if (op_previous >= OP_EODN) /* Not a character type - internal error */
6954 {
6955 *errorcodeptr = ERR10;
6956 return 0;
6957 }
6958 else
6959 {
6960 int prop_type, prop_value;
6961 PCRE2_UCHAR *oldcode;
6962
6963 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
6964 mclength = 0; /* Not a character */
6965
6966 if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
6967 {
6968 prop_type = previous[1];
6969 prop_value = previous[2];
6970 }
6971 else
6972 {
6973 /* Come here from just above with a character in mcbuffer/mclength. */
6974 OUTPUT_SINGLE_REPEAT:
6975 prop_type = prop_value = -1;
6976 }
6977
6978 /* At this point, if prop_type == prop_value == -1 we either have a
6979 character in mcbuffer when mclength is greater than zero, or we have
6980 mclength zero, in which case there is a non-property character type in
6981 op_previous. If prop_type/value are not negative, we have a property
6982 character type in op_previous. */
6983
6984 oldcode = code; /* Save where we were */
6985 code = previous; /* Usually overwrite previous item */
6986
6987 /* If the maximum is zero then the minimum must also be zero; Perl allows
6988 this case, so we do too - by simply omitting the item altogether. */
6989
6990 if (repeat_max == 0) goto END_REPEAT;
6991
6992 /* Combine the op_type with the repeat_type */
6993
6994 repeat_type += op_type;
6995
6996 /* A minimum of zero is handled either as the special case * or ?, or as
6997 an UPTO, with the maximum given. */
6998
6999 if (repeat_min == 0)
7000 {
7001 if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7002 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
7003 else
7004 {
7005 *code++ = OP_UPTO + repeat_type;
7006 PUT2INC(code, 0, repeat_max);
7007 }
7008 }
7009
7010 /* A repeat minimum of 1 is optimized into some special cases. If the
7011 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7012 left in place and, if the maximum is greater than 1, we use OP_UPTO with
7013 one less than the maximum. */
7014
7015 else if (repeat_min == 1)
7016 {
7017 if (repeat_max == REPEAT_UNLIMITED)
7018 *code++ = OP_PLUS + repeat_type;
7019 else
7020 {
7021 code = oldcode; /* Leave previous item in place */
7022 if (repeat_max == 1) goto END_REPEAT;
7023 *code++ = OP_UPTO + repeat_type;
7024 PUT2INC(code, 0, repeat_max - 1);
7025 }
7026 }
7027
7028 /* The case {n,n} is just an EXACT, while the general case {n,m} is
7029 handled as an EXACT followed by an UPTO or STAR or QUERY. */
7030
7031 else
7032 {
7033 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
7034 PUT2INC(code, 0, repeat_min);
7035
7036 /* Unless repeat_max equals repeat_min, fill in the data for EXACT,
7037 and then generate the second opcode. For a repeated Unicode property
7038 match, there are two extra values that define the required property,
7039 and mclength is set zero to indicate this. */
7040
7041 if (repeat_max != repeat_min)
7042 {
7043 if (mclength > 0)
7044 {
7045 memcpy(code, mcbuffer, CU2BYTES(mclength));
7046 code += mclength;
7047 }
7048 else
7049 {
7050 *code++ = op_previous;
7051 if (prop_type >= 0)
7052 {
7053 *code++ = prop_type;
7054 *code++ = prop_value;
7055 }
7056 }
7057
7058 /* Now set up the following opcode */
7059
7060 if (repeat_max == REPEAT_UNLIMITED)
7061 *code++ = OP_STAR + repeat_type;
7062 else
7063 {
7064 repeat_max -= repeat_min;
7065 if (repeat_max == 1)
7066 {
7067 *code++ = OP_QUERY + repeat_type;
7068 }
7069 else
7070 {
7071 *code++ = OP_UPTO + repeat_type;
7072 PUT2INC(code, 0, repeat_max);
7073 }
7074 }
7075 }
7076 }
7077
7078 /* Fill in the character or character type for the final opcode. */
7079
7080 if (mclength > 0)
7081 {
7082 memcpy(code, mcbuffer, CU2BYTES(mclength));
7083 code += mclength;
7084 }
7085 else
7086 {
7087 *code++ = op_previous;
7088 if (prop_type >= 0)
7089 {
7090 *code++ = prop_type;
7091 *code++ = prop_value;
7092 }
7093 }
7094 }
7095 break;
7096 } /* End of switch on different op_previous values */
7097
7098
7099 /* If the character following a repeat is '+', possessive_quantifier is
7100 TRUE. For some opcodes, there are special alternative opcodes for this
7101 case. For anything else, we wrap the entire repeated item inside OP_ONCE
7102 brackets. Logically, the '+' notation is just syntactic sugar, taken from
7103 Sun's Java package, but the special opcodes can optimize it.
7104
7105 Some (but not all) possessively repeated subpatterns have already been
7106 completely handled in the code just above. For them, possessive_quantifier
7107 is always FALSE at this stage. Note that the repeated item starts at
7108 tempcode, not at previous, which might be the first part of a string whose
7109 (former) last char we repeated. */
7110
7111 if (possessive_quantifier)
7112 {
7113 int len;
7114
7115 /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
7116 However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
7117 {5,}, or {5,10}). We skip over an EXACT item; if the length of what
7118 remains is greater than zero, there's a further opcode that can be
7119 handled. If not, do nothing, leaving the EXACT alone. */
7120
7121 switch(*tempcode)
7122 {
7123 case OP_TYPEEXACT:
7124 tempcode += PRIV(OP_lengths)[*tempcode] +
7125 ((tempcode[1 + IMM2_SIZE] == OP_PROP
7126 || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
7127 break;
7128
7129 /* CHAR opcodes are used for exacts whose count is 1. */
7130
7131 case OP_CHAR:
7132 case OP_CHARI:
7133 case OP_NOT:
7134 case OP_NOTI:
7135 case OP_EXACT:
7136 case OP_EXACTI:
7137 case OP_NOTEXACT:
7138 case OP_NOTEXACTI:
7139 tempcode += PRIV(OP_lengths)[*tempcode];
7140 #ifdef SUPPORT_UNICODE
7141 if (utf && HAS_EXTRALEN(tempcode[-1]))
7142 tempcode += GET_EXTRALEN(tempcode[-1]);
7143 #endif
7144 break;
7145
7146 /* For the class opcodes, the repeat operator appears at the end;
7147 adjust tempcode to point to it. */
7148
7149 case OP_CLASS:
7150 case OP_NCLASS:
7151 tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
7152 break;
7153
7154 #ifdef SUPPORT_WIDE_CHARS
7155 case OP_XCLASS:
7156 tempcode += GET(tempcode, 1);
7157 break;
7158 #endif
7159 }
7160
7161 /* If tempcode is equal to code (which points to the end of the repeated
7162 item), it means we have skipped an EXACT item but there is no following
7163 QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
7164 all other cases, tempcode will be pointing to the repeat opcode, and will
7165 be less than code, so the value of len will be greater than 0. */
7166
7167 len = (int)(code - tempcode);
7168 if (len > 0)
7169 {
7170 unsigned int repcode = *tempcode;
7171
7172 /* There is a table for possessifying opcodes, all of which are less
7173 than OP_CALLOUT. A zero entry means there is no possessified version.
7174 */
7175
7176 if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
7177 *tempcode = opcode_possessify[repcode];
7178
7179 /* For opcode without a special possessified version, wrap the item in
7180 ONCE brackets. */
7181
7182 else
7183 {
7184 (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
7185 code += 1 + LINK_SIZE;
7186 len += 1 + LINK_SIZE;
7187 tempcode[0] = OP_ONCE;
7188 *code++ = OP_KET;
7189 PUTINC(code, 0, len);
7190 PUT(tempcode, 1, len);
7191 }
7192 }
7193 }
7194
7195 /* We set the "follows varying string" flag for subsequently encountered
7196 reqcus if it isn't already set and we have just passed a varying length
7197 item. */
7198
7199 END_REPEAT:
7200 cb->req_varyopt |= reqvary;
7201 break;
7202
7203
7204 /* ===================================================================*/
7205 /* Handle a 32-bit data character with a value greater than META_END. */
7206
7207 case META_BIGVALUE:
7208 pptr++;
7209 goto NORMAL_CHAR;
7210
7211
7212 /* ===============================================================*/
7213 /* Handle a back reference by number, which is the meta argument. The
7214 pattern offsets for back references to group numbers less than 10 are held
7215 in a special vector, to avoid using more than two parsed pattern elements
7216 in 64-bit environments. We only need the offset to the first occurrence,
7217 because if that doesn't fail, subsequent ones will also be OK. */
7218
7219 case META_BACKREF:
7220 if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
7221 else GETPLUSOFFSET(offset, pptr);
7222
7223 if (meta_arg > cb->bracount)
7224 {
7225 cb->erroroffset = offset;
7226 *errorcodeptr = ERR15; /* Non-existent subpattern */
7227 return 0;
7228 }
7229
7230 /* Come here from named backref handling when the reference is to a
7231 single group (that is, not to a duplicated name). The back reference
7232 data will have already been updated. We must disable firstcu if not
7233 set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
7234 later. */
7235
7236 HANDLE_SINGLE_REFERENCE:
7237 if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
7238 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
7239 PUT2INC(code, 0, meta_arg);
7240
7241 /* Update the map of back references, and keep the highest one. We
7242 could do this in parse_regex() for numerical back references, but not
7243 for named back references, because we don't know the numbers to which
7244 named back references refer. So we do it all in this function. */
7245
7246 cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
7247 if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
7248
7249 /* Check to see if this back reference is recursive, that is, it
7250 is inside the group that it references. A flag is set so that the
7251 group can be made atomic. */
7252
7253 for (oc = cb->open_caps; oc != NULL; oc = oc->next)
7254 {
7255 if (oc->number == meta_arg)
7256 {
7257 oc->flag = TRUE;
7258 break;
7259 }
7260 }
7261 break;
7262
7263
7264 /* ===============================================================*/
7265 /* Handle recursion by inserting the number of the called group (which is
7266 the meta argument) after OP_RECURSE. At the end of compiling, the pattern is
7267 scanned and these numbers are replaced by offsets within the pattern. It is
7268 done like this to avoid problems with forward references and adjusting
7269 offsets when groups are duplicated and moved (as discovered in previous
7270 implementations). Note that a recursion does not have a set first character
7271 (relevant if it is repeated, because it will then be wrapped with ONCE
7272 brackets). */
7273
7274 case META_RECURSE:
7275 GETPLUSOFFSET(offset, pptr);
7276 if (meta_arg > cb->bracount)
7277 {
7278 cb->erroroffset = offset;
7279 *errorcodeptr = ERR15; /* Non-existent subpattern */
7280 return 0;
7281 }
7282 HANDLE_NUMERICAL_RECURSION:
7283 *code = OP_RECURSE;
7284 PUT(code, 1, meta_arg);
7285 code += 1 + LINK_SIZE;
7286 groupsetfirstcu = FALSE;
7287 cb->had_recurse = TRUE;
7288 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7289 break;
7290
7291
7292 /* ===============================================================*/
7293 /* Handle capturing parentheses; the number is the meta argument. */
7294
7295 case META_CAPTURE:
7296 bravalue = OP_CBRA;
7297 skipunits = IMM2_SIZE;
7298 PUT2(code, 1+LINK_SIZE, meta_arg);
7299 cb->lastcapture = meta_arg;
7300 goto GROUP_PROCESS_NOTE_EMPTY;
7301
7302
7303 /* ===============================================================*/
7304 /* Handle escape sequence items. For ones like \d, the ESC_values are
7305 arranged to be the same as the corresponding OP_values in the default case
7306 when PCRE2_UCP is not set (which is the only case in which they will appear
7307 here).
7308
7309 Note: \Q and \E are never seen here, as they were dealt with in
7310 parse_regex(). Neither are numerical back references or recursions, which
7311 were turned into META_BACKREF or META_RECURSE items, respectively. \k and
7312 \g, when followed by names, are turned into META_BACKREF_BYNAME or
7313 META_RECURSE_BYNAME. */
7314
7315 case META_ESCAPE:
7316
7317 /* We can test for escape sequences that consume a character because their
7318 values lie between ESC_b and ESC_Z; this may have to change if any new ones
7319 are ever created. For these sequences, we disable the setting of a first
7320 character if it hasn't already been set. */
7321
7322 if (meta_arg > ESC_b && meta_arg < ESC_Z)
7323 {
7324 matched_char = TRUE;
7325 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7326 }
7327
7328 /* Set values to reset to if this is followed by a zero repeat. */
7329
7330 zerofirstcu = firstcu;
7331 zerofirstcuflags = firstcuflags;
7332 zeroreqcu = reqcu;
7333 zeroreqcuflags = reqcuflags;
7334
7335 /* If Unicode is not supported, \P and \p are not allowed and are
7336 faulted at parse time, so will never appear here. */
7337
7338 #ifdef SUPPORT_UNICODE
7339 if (meta_arg == ESC_P || meta_arg == ESC_p)
7340 {
7341 uint32_t ptype = *(++pptr) >> 16;
7342 uint32_t pdata = *pptr & 0xffff;
7343 *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
7344 *code++ = ptype;
7345 *code++ = pdata;
7346 break; /* End META_ESCAPE */
7347 }
7348 #endif
7349
7350 /* For the rest (including \X when Unicode is supported - if not it's
7351 faulted at parse time), the OP value is the escape value when PCRE2_UCP is
7352 not set; if it is set, these escapes do not show up here because they are
7353 converted into Unicode property tests in parse_regex(). Note that \b and \B
7354 do a one-character lookbehind, and \A also behaves as if it does. */
7355
7356 if (meta_arg == ESC_C) cb->external_flags |= PCRE2_HASBKC; /* Record */
7357 if ((meta_arg == ESC_b || meta_arg == ESC_B || meta_arg == ESC_A) &&
7358 cb->max_lookbehind == 0)
7359 cb->max_lookbehind = 1;
7360
7361 /* In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
7362 instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds. */
7363
7364 #if PCRE2_CODE_UNIT_WIDTH == 32
7365 *code++ = (meta_arg == ESC_C)? OP_ALLANY : meta_arg;
7366 #else
7367 *code++ = (!utf && meta_arg == ESC_C)? OP_ALLANY : meta_arg;
7368 #endif
7369 break; /* End META_ESCAPE */
7370
7371
7372 /* ===================================================================*/
7373 /* Handle an unrecognized meta value. A parsed pattern value less than
7374 META_END is a literal. Otherwise we have a problem. */
7375
7376 default:
7377 if (meta >= META_END)
7378 {
7379 #ifdef DEBUG_SHOW_PARSED
7380 fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x\n", *pptr);
7381 #endif
7382 *errorcodeptr = ERR89; /* Internal error - unrecognized. */
7383 return 0;
7384 }
7385
7386 /* Handle a literal character. We come here by goto in the case of a
7387 32-bit, non-UTF character whose value is greater than META_END. */
7388
7389 NORMAL_CHAR:
7390 meta = *pptr; /* Get the full 32 bits */
7391 NORMAL_CHAR_SET: /* Character is already in meta */
7392 matched_char = TRUE;
7393
7394 /* For caseless UTF mode, check whether this character has more than one
7395 other case. If so, generate a special OP_PROP item instead of OP_CHARI. */
7396
7397 #ifdef SUPPORT_UNICODE
7398 if (utf && (options & PCRE2_CASELESS) != 0)
7399 {
7400 uint32_t caseset = UCD_CASESET(meta);
7401 if (caseset != 0)
7402 {
7403 *code++ = OP_PROP;
7404 *code++ = PT_CLIST;
7405 *code++ = caseset;
7406 if (firstcuflags == REQ_UNSET)
7407 firstcuflags = zerofirstcuflags = REQ_NONE;
7408 break; /* End handling this meta item */
7409 }
7410 }
7411 #endif
7412
7413 /* Caseful matches, or not one of the multicase characters. Get the
7414 character's code units into mcbuffer, with the length in mclength. When not
7415 in UTF mode, the length is always 1. */
7416
7417 #ifdef SUPPORT_UNICODE
7418 if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
7419 #endif
7420 {
7421 mclength = 1;
7422 mcbuffer[0] = meta;
7423 }
7424
7425 /* Generate the appropriate code */
7426
7427 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
7428 memcpy(code, mcbuffer, CU2BYTES(mclength));
7429 code += mclength;
7430
7431 /* Remember if \r or \n were seen */
7432
7433 if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
7434 cb->external_flags |= PCRE2_HASCRORLF;
7435
7436 /* Set the first and required code units appropriately. If no previous
7437 first code unit, set it from this character, but revert to none on a zero
7438 repeat. Otherwise, leave the firstcu value alone, and don't change it on
7439 a zero repeat. */
7440
7441 if (firstcuflags == REQ_UNSET)
7442 {
7443 zerofirstcuflags = REQ_NONE;
7444 zeroreqcu = reqcu;
7445 zeroreqcuflags = reqcuflags;
7446
7447 /* If the character is more than one code unit long, we can set firstcu
7448 only if it is not to be matched caselessly. */
7449
7450 if (mclength == 1 || req_caseopt == 0)
7451 {
7452 firstcu = mcbuffer[0];
7453 firstcuflags = req_caseopt;
7454 if (mclength != 1)
7455 {
7456 reqcu = code[-1];
7457 reqcuflags = cb->req_varyopt;
7458 }
7459 }
7460 else firstcuflags = reqcuflags = REQ_NONE;
7461 }
7462
7463 /* firstcu was previously set; we can set reqcu only if the length is
7464 1 or the matching is caseful. */
7465
7466 else
7467 {
7468 zerofirstcu = firstcu;
7469 zerofirstcuflags = firstcuflags;
7470 zeroreqcu = reqcu;
7471 zeroreqcuflags = reqcuflags;
7472 if (mclength == 1 || req_caseopt == 0)
7473 {
7474 reqcu = code[-1];
7475 reqcuflags = req_caseopt | cb->req_varyopt;
7476 }
7477 }
7478 break; /* End default meta handling */
7479 } /* End of big switch */
7480 } /* End of big loop */
7481
7482 /* Control never reaches here. */
7483 }
7484
7485
7486
7487 /*************************************************
7488 * Compile regex: a sequence of alternatives *
7489 *************************************************/
7490
7491 /* On entry, pptr is pointing past the bracket meta, but on return it points to
7492 the closing bracket or META_END. The code variable is pointing at the code unit
7493 into which the BRA operator has been stored. This function is used during the
7494 pre-compile phase when we are trying to find out the amount of memory needed,
7495 as well as during the real compile phase. The value of lengthptr distinguishes
7496 the two phases.
7497
7498 Arguments:
7499 options option bits, including any changes for this subpattern
7500 codeptr -> the address of the current code pointer
7501 pptrptr -> the address of the current parsed pattern pointer
7502 errorcodeptr -> pointer to error code variable
7503 skipunits skip this many code units at start (for brackets and OP_COND)
7504 firstcuptr place to put the first required code unit
7505 firstcuflagsptr place to put the first code unit flags, or a negative number
7506 reqcuptr place to put the last required code unit
7507 reqcuflagsptr place to put the last required code unit flags, or a negative number
7508 bcptr pointer to the chain of currently open branches
7509 cb points to the data block with tables pointers etc.
7510 lengthptr NULL during the real compile phase
7511 points to length accumulator during pre-compile phase
7512
7513 Returns: 0 There has been an error
7514 +1 Success, this group must match at least one character
7515 -1 Success, this group may match an empty string
7516 */
7517
static int
compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
  int *errorcodeptr, uint32_t skipunits, uint32_t *firstcuptr,
  int32_t *firstcuflagsptr, uint32_t *reqcuptr,int32_t *reqcuflagsptr,
  branch_chain *bcptr, compile_block *cb, PCRE2_SIZE *lengthptr)
{
PCRE2_UCHAR *code = *codeptr;
PCRE2_UCHAR *last_branch = code;     /* Start of the most recent BRA/ALT */
PCRE2_UCHAR *start_bracket = code;   /* Start of the whole group */
BOOL lookbehind;
open_capitem capitem;                /* Linked into cb->open_caps below */
int capnumber = 0;                   /* Stays zero unless this is OP_CBRA */
int okreturn = 1;                    /* Becomes -1 if any branch returns < 0 */
uint32_t *pptr = *pptrptr;
uint32_t firstcu, reqcu;
uint32_t lookbehindlength;
int32_t firstcuflags, reqcuflags;
uint32_t branchfirstcu, branchreqcu;
int32_t branchfirstcuflags, branchreqcuflags;
PCRE2_SIZE length;                   /* Local accumulator for pre-compile */
branch_chain bc;

/* If set, call the external function that checks for stack availability. A
non-zero result from it is reported as ERR33. */

if (cb->cx->stack_guard != NULL &&
    cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
  {
  *errorcodeptr= ERR33;
  return 0;
  }

/* Miscellaneous initialization. The local branch_chain entry is linked to the
caller's chain and passed down to compile_branch(). */

bc.outer = bcptr;
bc.current_branch = code;

firstcu = reqcu = 0;
firstcuflags = reqcuflags = REQ_UNSET;

/* Accumulate the length for use in the pre-compile phase. Start with the
length of the BRA and KET and any extra code units that are required at the
beginning. We accumulate in a local variable to save frequent testing of
lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
start and end of each alternative, because compiled items are discarded during
the pre-compile phase so that the workspace is not exceeded. */

length = 2 + 2*LINK_SIZE + skipunits;

/* Remember if this is a lookbehind assertion, and if it is, save its length
and skip over the pattern offset. The length for the first branch is the data
part of the meta item that precedes pptr. */

lookbehind = *code == OP_ASSERTBACK || *code == OP_ASSERTBACK_NOT;
if (lookbehind)
  {
  lookbehindlength = META_DATA(pptr[-1]);
  pptr += SIZEOFFSET;
  }
else lookbehindlength = 0;

/* If this is a capturing subpattern, add to the chain of open capturing items
so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
need be tested here; changing this opcode to one of its variants, e.g.
OP_SCBRAPOS, happens later, after the group has been compiled.

NOTE(review): capitem is a local variable, and it is unlinked only on the
successful return path below. The error return after compile_branch() leaves
cb->open_caps pointing at it; presumably this is harmless because compilation
is abandoned at that point - confirm against the caller. */

if (*code == OP_CBRA)
  {
  capnumber = GET2(code, 1 + LINK_SIZE);
  capitem.number = capnumber;
  capitem.next = cb->open_caps;
  capitem.flag = FALSE;
  capitem.assert_depth = cb->assert_depth;
  cb->open_caps = &capitem;
  }

/* Offset is set zero to mark that this bracket is still open */

PUT(code, 1, 0);
code += 1 + LINK_SIZE + skipunits;

/* Loop for each alternative branch */

for (;;)
  {
  int branch_return;

  /* Insert OP_REVERSE if this is a lookbehind assertion. Nothing is inserted
  when the recorded length for this branch is zero. */

  if (lookbehind && lookbehindlength > 0)
    {
    *code++ = OP_REVERSE;
    PUTINC(code, 0, lookbehindlength);
    length += 1 + LINK_SIZE;
    }

  /* Now compile the branch; in the pre-compile phase its length gets added
  into the length. A zero result is passed straight back to our caller as a
  failure. */

  if ((branch_return =
        compile_branch(&options, &code, &pptr, errorcodeptr, &branchfirstcu,
          &branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc,
          cb, (lengthptr == NULL)? NULL : &length)) == 0)
    return 0;

  /* If a branch can match an empty string, so can the whole group. */

  if (branch_return < 0) okreturn = -1;

  /* In the real compile phase, there is some post-processing to be done. */

  if (lengthptr == NULL)
    {
    /* If this is the first branch, the firstcu and reqcu values for the
    branch become the values for the regex. The test works because last_branch
    still points at the opening bracket until the first OP_ALT is stored. */

    if (*last_branch != OP_ALT)
      {
      firstcu = branchfirstcu;
      firstcuflags = branchfirstcuflags;
      reqcu = branchreqcu;
      reqcuflags = branchreqcuflags;
      }

    /* If this is not the first branch, the first char and reqcu have to
    match the values from all the previous branches, except that if the
    previous value for reqcu didn't have REQ_VARY set, it can still match,
    and we set REQ_VARY for the regex. */

    else
      {
      /* If we previously had a firstcu, but it doesn't match the new branch,
      we have to abandon the firstcu for the regex, but if there was
      previously no reqcu, it takes on the value of the old firstcu. */

      if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
        {
        if (firstcuflags >= 0)
          {
          if (reqcuflags < 0)
            {
            reqcu = firstcu;
            reqcuflags = firstcuflags;
            }
          }
        firstcuflags = REQ_NONE;
        }

      /* If we (now or from before) have no firstcu, a firstcu from the
      branch becomes a reqcu if there isn't a branch reqcu. */

      if (firstcuflags < 0 && branchfirstcuflags >= 0 &&
          branchreqcuflags < 0)
        {
        branchreqcu = branchfirstcu;
        branchreqcuflags = branchfirstcuflags;
        }

      /* Now ensure that the reqcus match */

      if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
          reqcu != branchreqcu)
        reqcuflags = REQ_NONE;
      else
        {
        reqcu = branchreqcu;
        reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY */
        }
      }
    }

  /* Handle reaching the end of the expression, either ')' or end of pattern.
  In the real compile phase, go back through the alternative branches and
  reverse the chain of offsets, with the field in the BRA item now becoming an
  offset to the first alternative. If there are no alternatives, it points to
  the end of the group. The length in the terminating ket is always the length
  of the whole bracketed item. Return leaving the pointer at the terminating
  char. */

  if (META_CODE(*pptr) != META_ALT)
    {
    if (lengthptr == NULL)
      {
      /* At this point each OP_ALT holds the distance back to the previous
      branch start, and the opening bracket holds zero. Walk backwards,
      replacing each backward distance with the forward distance to the next
      item; the zero in the opening bracket terminates the loop. */

      PCRE2_SIZE branch_length = code - last_branch;
      do
        {
        PCRE2_SIZE prev_length = GET(last_branch, 1);
        PUT(last_branch, 1, branch_length);
        branch_length = prev_length;
        last_branch -= branch_length;
        }
      while (branch_length > 0);
      }

    /* Fill in the ket */

    *code = OP_KET;
    PUT(code, 1, (int)(code - start_bracket));
    code += 1 + LINK_SIZE;

    /* If it was a capturing subpattern, check to see if it contained any
    recursive back references. If so, we must wrap it in atomic brackets. In
    any event, remove the block from the chain. */

    if (capnumber > 0)
      {
      if (cb->open_caps->flag)
        {
        /* Shift the whole group up by the size of one bracket header, put
        OP_ONCE in the gap, and append a second OP_KET for the wrapper. */

        (void)memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
          CU2BYTES(code - start_bracket));
        *start_bracket = OP_ONCE;
        code += 1 + LINK_SIZE;
        PUT(start_bracket, 1, (int)(code - start_bracket));
        *code = OP_KET;
        PUT(code, 1, (int)(code - start_bracket));
        code += 1 + LINK_SIZE;
        length += 2 + 2*LINK_SIZE;
        }
      cb->open_caps = cb->open_caps->next;
      }

    /* Set values to pass back */

    *codeptr = code;
    *pptrptr = pptr;
    *firstcuptr = firstcu;
    *firstcuflagsptr = firstcuflags;
    *reqcuptr = reqcu;
    *reqcuflagsptr = reqcuflags;
    if (lengthptr != NULL)
      {
      /* Refuse to let the caller's accumulator pass OFLOW_MAX. */

      if (OFLOW_MAX - *lengthptr < length)
        {
        *errorcodeptr = ERR20;
        return 0;
        }
      *lengthptr += length;
      }
    return okreturn;
    }

  /* Another branch follows. In the pre-compile phase, we can move the code
  pointer back to where it was for the start of the first branch. (That is,
  pretend that each branch is the only one.)

  In the real compile phase, insert an ALT node. Its length field points back
  to the previous branch while the bracket remains open. At the end the chain
  is reversed. It's done like this so that the start of the bracket has a
  zero offset until it is closed, making it possible to detect recursion. */

  if (lengthptr != NULL)
    {
    code = *codeptr + 1 + LINK_SIZE + skipunits;
    length += 1 + LINK_SIZE;
    }
  else
    {
    *code = OP_ALT;
    PUT(code, 1, (int)(code - last_branch));
    bc.current_branch = last_branch = code;
    code += 1 + LINK_SIZE;
    }

  /* Set the lookbehind length (if not in a lookbehind the value will be zero)
  and then advance past the vertical bar. The length for this next branch is
  carried in the data part of the META_ALT item itself. */

  lookbehindlength = META_DATA(*pptr);
  pptr++;
  }
/* Control never reaches here */
}
7787
7788
7789
7790 /*************************************************
7791 * Check for anchored pattern *
7792 *************************************************/
7793
7794 /* Try to find out if this is an anchored regular expression. Consider each
7795 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
7796 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
7797 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
7798 be found, because ^ generates OP_CIRCM in that mode.
7799
7800 We can also consider a regex to be anchored if OP_SOM starts all its branches.
7801 This is the code for \G, which means "match at start of match position, taking
7802 into account the match offset".
7803
7804 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
7805 because that will try the rest of the pattern at all possible matching points,
7806 so there is no point trying again.... er ....
7807
7808 .... except when the .* appears inside capturing parentheses, and there is a
7809 subsequent back reference to those parentheses. We haven't enough information
7810 to catch that case precisely.
7811
7812 At first, the best we could do was to detect when .* was in capturing brackets
7813 and the highest back reference was greater than or equal to that level.
7814 However, by keeping a bitmap of the first 31 back references, we can catch some
7815 of the more common cases more precisely.
7816
7817 ... A second exception is when the .* appears inside an atomic group, because
7818 this prevents the number of characters it matches from being adjusted.
7819
7820 Arguments:
7821 code points to start of the compiled pattern
7822 bracket_map a bitmap of which brackets we are inside while testing; this
7823 handles up to substring 31; after that we just have to take
7824 the less precise approach
7825 cb points to the compile data block
7826 atomcount atomic group level
7827 inassert TRUE if in an assertion
7828
7829 Returns: TRUE or FALSE
7830 */
7831
7832 static BOOL
is_anchored(PCRE2_SPTR code,unsigned int bracket_map,compile_block * cb,int atomcount,BOOL inassert)7833 is_anchored(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
7834 int atomcount, BOOL inassert)
7835 {
7836 do {
7837 PCRE2_SPTR scode = first_significant_code(
7838 code + PRIV(OP_lengths)[*code], FALSE);
7839 int op = *scode;
7840
7841 /* Non-capturing brackets */
7842
7843 if (op == OP_BRA || op == OP_BRAPOS ||
7844 op == OP_SBRA || op == OP_SBRAPOS)
7845 {
7846 if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
7847 return FALSE;
7848 }
7849
7850 /* Capturing brackets */
7851
7852 else if (op == OP_CBRA || op == OP_CBRAPOS ||
7853 op == OP_SCBRA || op == OP_SCBRAPOS)
7854 {
7855 int n = GET2(scode, 1+LINK_SIZE);
7856 int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
7857 if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
7858 }
7859
7860 /* Positive forward assertion */
7861
7862 else if (op == OP_ASSERT)
7863 {
7864 if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
7865 }
7866
7867 /* Condition. If there is no second branch, it can't be anchored. */
7868
7869 else if (op == OP_COND || op == OP_SCOND)
7870 {
7871 if (scode[GET(scode,1)] != OP_ALT) return FALSE;
7872 if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
7873 return FALSE;
7874 }
7875
7876 /* Atomic groups */
7877
7878 else if (op == OP_ONCE)
7879 {
7880 if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert))
7881 return FALSE;
7882 }
7883
7884 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
7885 it isn't in brackets that are or may be referenced or inside an atomic
7886 group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
7887 because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
7888 with the subject "aab", which matches "b", i.e. not at the start of a line.
7889 There is also an option that disables auto-anchoring. */
7890
7891 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
7892 op == OP_TYPEPOSSTAR))
7893 {
7894 if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
7895 atomcount > 0 || cb->had_pruneorskip || inassert ||
7896 (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
7897 return FALSE;
7898 }
7899
7900 /* Check for explicit anchoring */
7901
7902 else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
7903
7904 code += GET(code, 1);
7905 }
7906 while (*code == OP_ALT); /* Loop for each alternative */
7907 return TRUE;
7908 }
7909
7910
7911
7912 /*************************************************
7913 * Check for starting with ^ or .* *
7914 *************************************************/
7915
7916 /* This is called to find out if every branch starts with ^ or .* so that
7917 "first char" processing can be done to speed things up in multiline
7918 matching and for non-DOTALL patterns that start with .* (which must start at
7919 the beginning or after \n). As in the case of is_anchored() (see above), we
7920 have to take account of back references to capturing brackets that contain .*
7921 because in that case we can't make the assumption. Also, the appearance of .*
7922 inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
7923 or *SKIP does not count, because once again the assumption no longer holds.
7924
7925 Arguments:
7926 code points to start of the compiled pattern or a group
7927 bracket_map a bitmap of which brackets we are inside while testing; this
7928 handles up to substring 31; after that we just have to take
7929 the less precise approach
7930 cb points to the compile data
7931 atomcount atomic group level
7932 inassert TRUE if in an assertion
7933
7934 Returns: TRUE or FALSE
7935 */
7936
7937 static BOOL
is_startline(PCRE2_SPTR code,unsigned int bracket_map,compile_block * cb,int atomcount,BOOL inassert)7938 is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
7939 int atomcount, BOOL inassert)
7940 {
7941 do {
7942 PCRE2_SPTR scode = first_significant_code(
7943 code + PRIV(OP_lengths)[*code], FALSE);
7944 int op = *scode;
7945
7946 /* If we are at the start of a conditional assertion group, *both* the
7947 conditional assertion *and* what follows the condition must satisfy the test
7948 for start of line. Other kinds of condition fail. Note that there may be an
7949 auto-callout at the start of a condition. */
7950
7951 if (op == OP_COND)
7952 {
7953 scode += 1 + LINK_SIZE;
7954
7955 if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
7956 else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
7957
7958 switch (*scode)
7959 {
7960 case OP_CREF:
7961 case OP_DNCREF:
7962 case OP_RREF:
7963 case OP_DNRREF:
7964 case OP_FAIL:
7965 case OP_FALSE:
7966 case OP_TRUE:
7967 return FALSE;
7968
7969 default: /* Assertion */
7970 if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
7971 do scode += GET(scode, 1); while (*scode == OP_ALT);
7972 scode += 1 + LINK_SIZE;
7973 break;
7974 }
7975 scode = first_significant_code(scode, FALSE);
7976 op = *scode;
7977 }
7978
7979 /* Non-capturing brackets */
7980
7981 if (op == OP_BRA || op == OP_BRAPOS ||
7982 op == OP_SBRA || op == OP_SBRAPOS)
7983 {
7984 if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
7985 return FALSE;
7986 }
7987
7988 /* Capturing brackets */
7989
7990 else if (op == OP_CBRA || op == OP_CBRAPOS ||
7991 op == OP_SCBRA || op == OP_SCBRAPOS)
7992 {
7993 int n = GET2(scode, 1+LINK_SIZE);
7994 int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
7995 if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
7996 }
7997
7998 /* Positive forward assertions */
7999
8000 else if (op == OP_ASSERT)
8001 {
8002 if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
8003 return FALSE;
8004 }
8005
8006 /* Atomic brackets */
8007
8008 else if (op == OP_ONCE)
8009 {
8010 if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
8011 return FALSE;
8012 }
8013
8014 /* .* means "start at start or after \n" if it isn't in atomic brackets or
8015 brackets that may be referenced or an assertion, and as long as the pattern
8016 does not contain *PRUNE or *SKIP, because these break the feature. Consider,
8017 for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
8018 i.e. not at the start of a line. There is also an option that disables this
8019 optimization. */
8020
8021 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8022 {
8023 if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
8024 atomcount > 0 || cb->had_pruneorskip || inassert ||
8025 (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8026 return FALSE;
8027 }
8028
8029 /* Check for explicit circumflex; anything else gives a FALSE result. Note
8030 in particular that this includes atomic brackets OP_ONCE because the number
8031 of characters matched by .* cannot be adjusted inside them. */
8032
8033 else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8034
8035 /* Move on to the next alternative */
8036
8037 code += GET(code, 1);
8038 }
8039 while (*code == OP_ALT); /* Loop for each alternative */
8040 return TRUE;
8041 }
8042
8043
8044
8045 /*************************************************
8046 * Scan compiled regex for recursion reference *
8047 *************************************************/
8048
8049 /* This function scans through a compiled pattern until it finds an instance of
8050 OP_RECURSE.
8051
8052 Arguments:
8053 code points to start of expression
8054 utf TRUE in UTF mode
8055
8056 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
8057 */
8058
8059 static PCRE2_SPTR
find_recurse(PCRE2_SPTR code,BOOL utf)8060 find_recurse(PCRE2_SPTR code, BOOL utf)
8061 {
8062 for (;;)
8063 {
8064 PCRE2_UCHAR c = *code;
8065 if (c == OP_END) return NULL;
8066 if (c == OP_RECURSE) return code;
8067
8068 /* XCLASS is used for classes that cannot be represented just by a bit map.
8069 This includes negated single high-valued characters. CALLOUT_STR is used for
8070 callouts with string arguments. In both cases the length in the table is
8071 zero; the actual length is stored in the compiled code. */
8072
8073 if (c == OP_XCLASS) code += GET(code, 1);
8074 else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
8075
8076 /* Otherwise, we can get the item's length from the table, except that for
8077 repeated character types, we have to test for \p and \P, which have an extra
8078 two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
8079 we must add in its length. */
8080
8081 else
8082 {
8083 switch(c)
8084 {
8085 case OP_TYPESTAR:
8086 case OP_TYPEMINSTAR:
8087 case OP_TYPEPLUS:
8088 case OP_TYPEMINPLUS:
8089 case OP_TYPEQUERY:
8090 case OP_TYPEMINQUERY:
8091 case OP_TYPEPOSSTAR:
8092 case OP_TYPEPOSPLUS:
8093 case OP_TYPEPOSQUERY:
8094 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
8095 break;
8096
8097 case OP_TYPEPOSUPTO:
8098 case OP_TYPEUPTO:
8099 case OP_TYPEMINUPTO:
8100 case OP_TYPEEXACT:
8101 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
8102 code += 2;
8103 break;
8104
8105 case OP_MARK:
8106 case OP_COMMIT_ARG:
8107 case OP_PRUNE_ARG:
8108 case OP_SKIP_ARG:
8109 case OP_THEN_ARG:
8110 code += code[1];
8111 break;
8112 }
8113
8114 /* Add in the fixed length from the table */
8115
8116 code += PRIV(OP_lengths)[c];
8117
8118 /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
8119 be followed by a multi-unit character. The length in the table is a
8120 minimum, so we have to arrange to skip the extra units. */
8121
8122 #ifdef MAYBE_UTF_MULTI
8123 if (utf) switch(c)
8124 {
8125 case OP_CHAR:
8126 case OP_CHARI:
8127 case OP_NOT:
8128 case OP_NOTI:
8129 case OP_EXACT:
8130 case OP_EXACTI:
8131 case OP_NOTEXACT:
8132 case OP_NOTEXACTI:
8133 case OP_UPTO:
8134 case OP_UPTOI:
8135 case OP_NOTUPTO:
8136 case OP_NOTUPTOI:
8137 case OP_MINUPTO:
8138 case OP_MINUPTOI:
8139 case OP_NOTMINUPTO:
8140 case OP_NOTMINUPTOI:
8141 case OP_POSUPTO:
8142 case OP_POSUPTOI:
8143 case OP_NOTPOSUPTO:
8144 case OP_NOTPOSUPTOI:
8145 case OP_STAR:
8146 case OP_STARI:
8147 case OP_NOTSTAR:
8148 case OP_NOTSTARI:
8149 case OP_MINSTAR:
8150 case OP_MINSTARI:
8151 case OP_NOTMINSTAR:
8152 case OP_NOTMINSTARI:
8153 case OP_POSSTAR:
8154 case OP_POSSTARI:
8155 case OP_NOTPOSSTAR:
8156 case OP_NOTPOSSTARI:
8157 case OP_PLUS:
8158 case OP_PLUSI:
8159 case OP_NOTPLUS:
8160 case OP_NOTPLUSI:
8161 case OP_MINPLUS:
8162 case OP_MINPLUSI:
8163 case OP_NOTMINPLUS:
8164 case OP_NOTMINPLUSI:
8165 case OP_POSPLUS:
8166 case OP_POSPLUSI:
8167 case OP_NOTPOSPLUS:
8168 case OP_NOTPOSPLUSI:
8169 case OP_QUERY:
8170 case OP_QUERYI:
8171 case OP_NOTQUERY:
8172 case OP_NOTQUERYI:
8173 case OP_MINQUERY:
8174 case OP_MINQUERYI:
8175 case OP_NOTMINQUERY:
8176 case OP_NOTMINQUERYI:
8177 case OP_POSQUERY:
8178 case OP_POSQUERYI:
8179 case OP_NOTPOSQUERY:
8180 case OP_NOTPOSQUERYI:
8181 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
8182 break;
8183 }
8184 #else
8185 (void)(utf); /* Keep compiler happy by referencing function argument */
8186 #endif /* MAYBE_UTF_MULTI */
8187 }
8188 }
8189 }
8190
8191
8192
8193 /*************************************************
8194 * Check for asserted fixed first code unit *
8195 *************************************************/
8196
8197 /* During compilation, the "first code unit" settings from forward assertions
8198 are discarded, because they can cause conflicts with actual literals that
8199 follow. However, if we end up without a first code unit setting for an
8200 unanchored pattern, it is worth scanning the regex to see if there is an
8201 initial asserted first code unit. If all branches start with the same asserted
8202 code unit, or with a non-conditional bracket all of whose alternatives start
8203 with the same asserted code unit (recurse ad lib), then we return that code
8204 unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
8205 REQ_NONE in the flags.
8206
8207 Arguments:
8208 code points to start of compiled pattern
8209 flags points to the first code unit flags
8210 inassert non-zero if in an assertion
8211
8212 Returns: the fixed first code unit, or 0 with REQ_NONE in flags
8213 */
8214
8215 static uint32_t
find_firstassertedcu(PCRE2_SPTR code,int32_t * flags,uint32_t inassert)8216 find_firstassertedcu(PCRE2_SPTR code, int32_t *flags, uint32_t inassert)
8217 {
8218 uint32_t c = 0;
8219 int cflags = REQ_NONE;
8220
8221 *flags = REQ_NONE;
8222 do {
8223 uint32_t d;
8224 int dflags;
8225 int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
8226 *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
8227 PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
8228 PCRE2_UCHAR op = *scode;
8229
8230 switch(op)
8231 {
8232 default:
8233 return 0;
8234
8235 case OP_BRA:
8236 case OP_BRAPOS:
8237 case OP_CBRA:
8238 case OP_SCBRA:
8239 case OP_CBRAPOS:
8240 case OP_SCBRAPOS:
8241 case OP_ASSERT:
8242 case OP_ONCE:
8243 d = find_firstassertedcu(scode, &dflags, inassert + ((op==OP_ASSERT)?1:0));
8244 if (dflags < 0)
8245 return 0;
8246 if (cflags < 0) { c = d; cflags = dflags; }
8247 else if (c != d || cflags != dflags) return 0;
8248 break;
8249
8250 case OP_EXACT:
8251 scode += IMM2_SIZE;
8252 /* Fall through */
8253
8254 case OP_CHAR:
8255 case OP_PLUS:
8256 case OP_MINPLUS:
8257 case OP_POSPLUS:
8258 if (inassert == 0) return 0;
8259 if (cflags < 0) { c = scode[1]; cflags = 0; }
8260 else if (c != scode[1]) return 0;
8261 break;
8262
8263 case OP_EXACTI:
8264 scode += IMM2_SIZE;
8265 /* Fall through */
8266
8267 case OP_CHARI:
8268 case OP_PLUSI:
8269 case OP_MINPLUSI:
8270 case OP_POSPLUSI:
8271 if (inassert == 0) return 0;
8272 if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
8273 else if (c != scode[1]) return 0;
8274 break;
8275 }
8276
8277 code += GET(code, 1);
8278 }
8279 while (*code == OP_ALT);
8280
8281 *flags = cflags;
8282 return c;
8283 }
8284
8285
8286
8287 /*************************************************
8288 * Add an entry to the name/number table *
8289 *************************************************/
8290
8291 /* This function is called between compiling passes to add an entry to the
8292 name/number table, maintaining alphabetical order. Checking for permitted
8293 and forbidden duplicates has already been done.
8294
8295 Arguments:
8296 cb the compile data block
8297 name the name to add
8298 length the length of the name
8299 groupno the group number
8300 tablecount the count of names in the table so far
8301
8302 Returns: nothing
8303 */
8304
8305 static void
add_name_to_table(compile_block * cb,PCRE2_SPTR name,int length,unsigned int groupno,uint32_t tablecount)8306 add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
8307 unsigned int groupno, uint32_t tablecount)
8308 {
8309 uint32_t i;
8310 PCRE2_UCHAR *slot = cb->name_table;
8311
8312 for (i = 0; i < tablecount; i++)
8313 {
8314 int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
8315 if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8316 crc = -1; /* Current name is a substring */
8317
8318 /* Make space in the table and break the loop for an earlier name. For a
8319 duplicate or later name, carry on. We do this for duplicates so that in the
8320 simple case (when ?(| is not used) they are in order of their numbers. In all
8321 cases they are in the order in which they appear in the pattern. */
8322
8323 if (crc < 0)
8324 {
8325 (void)memmove(slot + cb->name_entry_size, slot,
8326 CU2BYTES((tablecount - i) * cb->name_entry_size));
8327 break;
8328 }
8329
8330 /* Continue the loop for a later or duplicate name */
8331
8332 slot += cb->name_entry_size;
8333 }
8334
8335 PUT2(slot, 0, groupno);
8336 memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
8337
8338 /* Add a terminating zero and fill the rest of the slot with zeroes so that
8339 the memory is all initialized. Otherwise valgrind moans about uninitialized
8340 memory when saving serialized compiled patterns. */
8341
8342 memset(slot + IMM2_SIZE + length, 0,
8343 CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
8344 }
8345
8346
8347
8348 /*************************************************
8349 * Skip in parsed pattern *
8350 *************************************************/
8351
8352 /* This function is called to skip parts of the parsed pattern when finding the
8353 length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
8354 the end of the branch, it is called to skip over an internal lookaround, and it
8355 is also called to skip to the end of a class, during which it will never
8356 encounter nested groups (but there's no need to have special code for that).
8357
8358 When called to find the end of a branch or group, pptr must point to the first
8359 meta code inside the branch, not the branch-starting code. In other cases it
8360 can point to the item that causes the function to be called.
8361
8362 Arguments:
8363 pptr current pointer to skip from
8364 skiptype PSKIP_CLASS when skipping to end of class
8365 PSKIP_ALT when META_ALT ends the skip
8366 PSKIP_KET when only META_KET ends the skip
8367
8368 Returns: new value of pptr
8369 NULL if META_END is reached - should never occur
8370 or for an unknown meta value - likewise
8371 */
8372
8373 static uint32_t *
parsed_skip(uint32_t * pptr,uint32_t skiptype)8374 parsed_skip(uint32_t *pptr, uint32_t skiptype)
8375 {
8376 uint32_t nestlevel = 0;
8377
8378 for (;; pptr++)
8379 {
8380 uint32_t meta = META_CODE(*pptr);
8381
8382 switch(meta)
8383 {
8384 default: /* Just skip over most items */
8385 if (meta < META_END) continue; /* Literal */
8386 break;
8387
8388 /* This should never occur. */
8389
8390 case META_END:
8391 return NULL;
8392
8393 /* The data for these items is variable in length. */
8394
8395 case META_BACKREF: /* Offset is present only if group >= 10 */
8396 if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
8397 break;
8398
8399 case META_ESCAPE: /* A few escapes are followed by data items. */
8400 switch (META_DATA(*pptr))
8401 {
8402 case ESC_P:
8403 case ESC_p:
8404 pptr += 1;
8405 break;
8406
8407 case ESC_g:
8408 case ESC_k:
8409 pptr += 1 + SIZEOFFSET;
8410 break;
8411 }
8412 break;
8413
8414 case META_MARK: /* Add the length of the name. */
8415 case META_COMMIT_ARG:
8416 case META_PRUNE_ARG:
8417 case META_SKIP_ARG:
8418 case META_THEN_ARG:
8419 pptr += pptr[1];
8420 break;
8421
8422 /* These are the "active" items in this loop. */
8423
8424 case META_CLASS_END:
8425 if (skiptype == PSKIP_CLASS) return pptr;
8426 break;
8427
8428 case META_ATOMIC:
8429 case META_CAPTURE:
8430 case META_COND_ASSERT:
8431 case META_COND_DEFINE:
8432 case META_COND_NAME:
8433 case META_COND_NUMBER:
8434 case META_COND_RNAME:
8435 case META_COND_RNUMBER:
8436 case META_COND_VERSION:
8437 case META_LOOKAHEAD:
8438 case META_LOOKAHEADNOT:
8439 case META_LOOKBEHIND:
8440 case META_LOOKBEHINDNOT:
8441 case META_NOCAPTURE:
8442 nestlevel++;
8443 break;
8444
8445 case META_ALT:
8446 if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
8447 break;
8448
8449 case META_KET:
8450 if (nestlevel == 0) return pptr;
8451 nestlevel--;
8452 break;
8453 }
8454
8455 /* The extra data item length for each meta is in a table. */
8456
8457 meta = (meta >> 16) & 0x7fff;
8458 if (meta >= sizeof(meta_extra_lengths)) return NULL;
8459 pptr += meta_extra_lengths[meta];
8460 }
8461 /* Control never reaches here */
8462 return pptr;
8463 }
8464
8465
8466
8467 /*************************************************
8468 * Find length of a parsed group *
8469 *************************************************/
8470
8471 /* This is called for nested groups within a branch of a lookbehind whose
8472 length is being computed. If all the branches in the nested group have the same
8473 length, that is OK. On entry, the pointer must be at the first element after
the group initializing code. On exit it points to META_KET. Caching is used to
8475 improve processing speed when the same capturing group occurs many times.
8476
8477 Arguments:
8478 pptrptr pointer to pointer in the parsed pattern
8479 isinline FALSE if a reference or recursion; TRUE for inline group
8480 errcodeptr pointer to the errorcode
8481 lcptr pointer to the loop counter
8482 group number of captured group or -1 for a non-capturing group
8483 recurses chain of recurse_check to catch mutual recursion
8484 cb pointer to the compile data
8485
8486 Returns: the group length or a negative number
8487 */
8488
8489 static int
get_grouplength(uint32_t ** pptrptr,BOOL isinline,int * errcodeptr,int * lcptr,int group,parsed_recurse_check * recurses,compile_block * cb)8490 get_grouplength(uint32_t **pptrptr, BOOL isinline, int *errcodeptr, int *lcptr,
8491 int group, parsed_recurse_check *recurses, compile_block *cb)
8492 {
8493 int branchlength;
8494 int grouplength = -1;
8495
8496 /* The cache can be used only if there is no possibility of there being two
8497 groups with the same number. We do not need to set the end pointer for a group
8498 that is being processed as a back reference or recursion, but we must do so for
8499 an inline group. */
8500
8501 if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
8502 {
8503 uint32_t groupinfo = cb->groupinfo[group];
8504 if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
8505 if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
8506 {
8507 if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
8508 return groupinfo & GI_FIXED_LENGTH_MASK;
8509 }
8510 }
8511
8512 /* Scan the group. In this case we find the end pointer of necessity. */
8513
8514 for(;;)
8515 {
8516 branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
8517 if (branchlength < 0) goto ISNOTFIXED;
8518 if (grouplength == -1) grouplength = branchlength;
8519 else if (grouplength != branchlength) goto ISNOTFIXED;
8520 if (**pptrptr == META_KET) break;
8521 *pptrptr += 1; /* Skip META_ALT */
8522 }
8523
8524 if (group > 0)
8525 cb->groupinfo[group] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
8526 return grouplength;
8527
8528 ISNOTFIXED:
8529 if (group > 0) cb->groupinfo[group] |= GI_NOT_FIXED_LENGTH;
8530 return -1;
8531 }
8532
8533
8534
8535 /*************************************************
8536 * Find length of a parsed branch *
8537 *************************************************/
8538
8539 /* Return a fixed length for a branch in a lookbehind, giving an error if the
8540 length is not fixed. If any lookbehinds are encountered on the way, they get
8541 their length set. On entry, *pptrptr points to the first element inside the
8542 branch. On exit it is set to point to the ALT or KET.
8543
8544 Arguments:
8545 pptrptr pointer to pointer in the parsed pattern
8546 errcodeptr pointer to error code
8547 lcptr pointer to loop counter
8548 recurses chain of recurse_check to catch mutual recursion
8549 cb pointer to compile block
8550
8551 Returns: the length, or a negative value on error
8552 */
8553
8554 static int
get_branchlength(uint32_t ** pptrptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)8555 get_branchlength(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
8556 parsed_recurse_check *recurses, compile_block *cb)
8557 {
8558 int branchlength = 0;
8559 int grouplength;
8560 uint32_t lastitemlength = 0;
8561 uint32_t *pptr = *pptrptr;
8562 PCRE2_SIZE offset;
8563 parsed_recurse_check this_recurse;
8564
8565 /* A large and/or complex regex can take too long to process. This can happen
8566 more often when (?| groups are present in the pattern because their length
8567 cannot be cached. */
8568
8569 if ((*lcptr)++ > 2000)
8570 {
8571 *errcodeptr = ERR35; /* Lookbehind is too complicated */
8572 return -1;
8573 }
8574
8575 /* Scan the branch, accumulating the length. */
8576
8577 for (;; pptr++)
8578 {
8579 parsed_recurse_check *r;
8580 uint32_t *gptr, *gptrend;
8581 uint32_t escape;
8582 uint32_t group = 0;
8583 uint32_t itemlength = 0;
8584
8585 if (*pptr < META_END)
8586 {
8587 itemlength = 1;
8588 }
8589
8590 else switch (META_CODE(*pptr))
8591 {
8592 case META_KET:
8593 case META_ALT:
8594 goto EXIT;
8595
8596 /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
8597 actual termination. */
8598
8599 case META_ACCEPT:
8600 case META_FAIL:
8601 pptr = parsed_skip(pptr, PSKIP_ALT);
8602 if (pptr == NULL) goto PARSED_SKIP_FAILED;
8603 goto EXIT;
8604
8605 case META_MARK:
8606 case META_COMMIT_ARG:
8607 case META_PRUNE_ARG:
8608 case META_SKIP_ARG:
8609 case META_THEN_ARG:
8610 pptr += pptr[1] + 1;
8611 break;
8612
8613 case META_CIRCUMFLEX:
8614 case META_COMMIT:
8615 case META_DOLLAR:
8616 case META_PRUNE:
8617 case META_SKIP:
8618 case META_THEN:
8619 break;
8620
8621 case META_OPTIONS:
8622 pptr += 1;
8623 break;
8624
8625 case META_BIGVALUE:
8626 itemlength = 1;
8627 pptr += 1;
8628 break;
8629
8630 case META_CLASS:
8631 case META_CLASS_NOT:
8632 itemlength = 1;
8633 pptr = parsed_skip(pptr, PSKIP_CLASS);
8634 if (pptr == NULL) goto PARSED_SKIP_FAILED;
8635 break;
8636
8637 case META_CLASS_EMPTY_NOT:
8638 case META_DOT:
8639 itemlength = 1;
8640 break;
8641
8642 case META_CALLOUT_NUMBER:
8643 pptr += 3;
8644 break;
8645
8646 case META_CALLOUT_STRING:
8647 pptr += 3 + SIZEOFFSET;
8648 break;
8649
8650 /* Only some escapes consume a character. Of those, \R and \X are never
8651 allowed because they might match more than character. \C is allowed only in
8652 32-bit and non-UTF 8/16-bit modes. */
8653
8654 case META_ESCAPE:
8655 escape = META_DATA(*pptr);
8656 if (escape == ESC_R || escape == ESC_X) return -1;
8657 if (escape > ESC_b && escape < ESC_Z)
8658 {
8659 #if PCRE2_CODE_UNIT_WIDTH != 32
8660 if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
8661 {
8662 *errcodeptr = ERR36;
8663 return -1;
8664 }
8665 #endif
8666 itemlength = 1;
8667 if (escape == ESC_p || escape == ESC_P) pptr++; /* Skip prop data */
8668 }
8669 break;
8670
8671 /* Lookaheads can be ignored, but we must start the skip inside the group
8672 so that it isn't treated as a group within the branch. */
8673
8674 case META_LOOKAHEAD:
8675 case META_LOOKAHEADNOT:
8676 pptr = parsed_skip(pptr + 1, PSKIP_KET);
8677 if (pptr == NULL) goto PARSED_SKIP_FAILED;
8678
8679 /* Also ignore any qualifiers that follow a lookahead assertion. */
8680
8681 switch (pptr[1])
8682 {
8683 case META_ASTERISK:
8684 case META_ASTERISK_PLUS:
8685 case META_ASTERISK_QUERY:
8686 case META_PLUS:
8687 case META_PLUS_PLUS:
8688 case META_PLUS_QUERY:
8689 case META_QUERY:
8690 case META_QUERY_PLUS:
8691 case META_QUERY_QUERY:
8692 pptr++;
8693 break;
8694
8695 case META_MINMAX:
8696 case META_MINMAX_PLUS:
8697 case META_MINMAX_QUERY:
8698 pptr += 3;
8699 break;
8700
8701 default:
8702 break;
8703 }
8704 break;
8705
8706 /* Lookbehinds can be ignored, but must themselves be checked. */
8707
8708 case META_LOOKBEHIND:
8709 case META_LOOKBEHINDNOT:
8710 if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
8711 return -1;
8712 break;
8713
8714 /* Back references and recursions are handled by very similar code. At this
8715 stage, the names generated in the parsing pass are available, but the main
8716 name table has not yet been created. So for the named varieties, scan the
8717 list of names in order to get the number of the first one in the pattern,
8718 and whether or not this name is duplicated. */
8719
8720 case META_BACKREF_BYNAME:
8721 if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
8722 goto ISNOTFIXED;
8723 /* Fall through */
8724
8725 case META_RECURSE_BYNAME:
8726 {
8727 int i;
8728 PCRE2_SPTR name;
8729 BOOL is_dupname = FALSE;
8730 named_group *ng = cb->named_groups;
8731 uint32_t meta_code = META_CODE(*pptr);
8732 uint32_t length = *(++pptr);
8733
8734 GETPLUSOFFSET(offset, pptr);
8735 name = cb->start_pattern + offset;
8736 for (i = 0; i < cb->names_found; i++, ng++)
8737 {
8738 if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0)
8739 {
8740 group = ng->number;
8741 is_dupname = ng->isdup;
8742 break;
8743 }
8744 }
8745
8746 if (group == 0)
8747 {
8748 *errcodeptr = ERR15; /* Non-existent subpattern */
8749 cb->erroroffset = offset;
8750 return -1;
8751 }
8752
8753 /* A numerical back reference can be fixed length if duplicate capturing
8754 groups are not being used. A non-duplicate named back reference can also
8755 be handled. */
8756
8757 if (meta_code == META_RECURSE_BYNAME ||
8758 (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
8759 goto RECURSE_OR_BACKREF_LENGTH; /* Handle as a numbered version. */
8760 }
8761 goto ISNOTFIXED; /* Duplicate name or number */
8762
8763 /* The offset values for back references < 10 are in a separate vector
8764 because otherwise they would use more than two parsed pattern elements on
8765 64-bit systems. */
8766
8767 case META_BACKREF:
8768 if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
8769 (cb->external_flags & PCRE2_DUPCAPUSED) != 0)
8770 goto ISNOTFIXED;
8771 group = META_DATA(*pptr);
8772 if (group < 10)
8773 {
8774 offset = cb->small_ref_offset[group];
8775 goto RECURSE_OR_BACKREF_LENGTH;
8776 }
8777
8778 /* Fall through */
8779 /* For groups >= 10 - picking up group twice does no harm. */
8780
8781 /* A true recursion implies not fixed length, but a subroutine call may
8782 be OK. Back reference "recursions" are also failed. */
8783
8784 case META_RECURSE:
8785 group = META_DATA(*pptr);
8786 GETPLUSOFFSET(offset, pptr);
8787
8788 RECURSE_OR_BACKREF_LENGTH:
8789 if (group > cb->bracount)
8790 {
8791 cb->erroroffset = offset;
8792 *errcodeptr = ERR15; /* Non-existent subpattern */
8793 return -1;
8794 }
8795 if (group == 0) goto ISNOTFIXED; /* Local recursion */
8796 for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
8797 {
8798 if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
8799 else if (*gptr == (META_CAPTURE | group)) break;
8800 }
8801
8802 /* We must start the search for the end of the group at the first meta code
8803 inside the group. Otherwise it will be treated as an enclosed group. */
8804
8805 gptrend = parsed_skip(gptr + 1, PSKIP_KET);
8806 if (gptrend == NULL) goto PARSED_SKIP_FAILED;
8807 if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED; /* Local recursion */
8808 for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
8809 if (r != NULL) goto ISNOTFIXED; /* Mutual recursion */
8810 this_recurse.prev = recurses;
8811 this_recurse.groupptr = gptr;
8812
8813 /* We do not need to know the position of the end of the group, that is,
8814 gptr is not used after the call to get_grouplength(). Setting the second
8815 argument FALSE stops it scanning for the end when the length can be found
8816 in the cache. */
8817
8818 gptr++;
8819 grouplength = get_grouplength(&gptr, FALSE, errcodeptr, lcptr, group,
8820 &this_recurse, cb);
8821 if (grouplength < 0)
8822 {
8823 if (*errcodeptr == 0) goto ISNOTFIXED;
8824 return -1; /* Error already set */
8825 }
8826 itemlength = grouplength;
8827 break;
8828
8829 /* Check nested groups - advance past the initial data for each type and
8830 then seek a fixed length with get_grouplength(). */
8831
8832 case META_COND_NAME:
8833 case META_COND_NUMBER:
8834 case META_COND_RNAME:
8835 case META_COND_RNUMBER:
8836 case META_COND_DEFINE:
8837 pptr += 2 + SIZEOFFSET;
8838 goto CHECK_GROUP;
8839
8840 case META_COND_ASSERT:
8841 pptr += 1;
8842 goto CHECK_GROUP;
8843
8844 case META_COND_VERSION:
8845 pptr += 4;
8846 goto CHECK_GROUP;
8847
8848 case META_CAPTURE:
8849 group = META_DATA(*pptr);
8850 /* Fall through */
8851
8852 case META_ATOMIC:
8853 case META_NOCAPTURE:
8854 pptr++;
8855 CHECK_GROUP:
8856 grouplength = get_grouplength(&pptr, TRUE, errcodeptr, lcptr, group,
8857 recurses, cb);
8858 if (grouplength < 0) return -1;
8859 itemlength = grouplength;
8860 break;
8861
8862 /* Exact repetition is OK; variable repetition is not. A repetition of zero
8863 must subtract the length that has already been added. */
8864
8865 case META_MINMAX:
8866 case META_MINMAX_PLUS:
8867 case META_MINMAX_QUERY:
8868 if (pptr[1] == pptr[2])
8869 {
8870 if (pptr[1] == 0) branchlength -= lastitemlength;
8871 else itemlength = (pptr[1] - 1) * lastitemlength;
8872 pptr += 2;
8873 break;
8874 }
8875 /* Fall through */
8876
8877 /* Any other item means this branch does not have a fixed length. */
8878
8879 default:
8880 ISNOTFIXED:
8881 *errcodeptr = ERR25; /* Not fixed length */
8882 return -1;
8883 }
8884
8885 /* Add the item length to the branchlength, and save it for use if the next
8886 thing is a quantifier. */
8887
8888 branchlength += itemlength;
8889 lastitemlength = itemlength;
8890
8891 /* Ensure that the length does not overflow the limit. */
8892
8893 if (branchlength > LOOKBEHIND_MAX)
8894 {
8895 *errcodeptr = ERR87;
8896 return -1;
8897 }
8898 }
8899
8900 EXIT:
8901 *pptrptr = pptr;
8902 if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
8903 return branchlength;
8904
8905 PARSED_SKIP_FAILED:
8906 *errcodeptr = ERR90;
8907 return -1;
8908 }
8909
8910
8911
8912 /*************************************************
8913 * Set lengths in a lookbehind *
8914 *************************************************/
8915
8916 /* This function is called for each lookbehind, to set the lengths in its
8917 branches. An error occurs if any branch does not have a fixed length that is
8918 less than the maximum (65535). On exit, the pointer must be left on the final
8919 ket.
8920
8921 Arguments:
8922 pptrptr pointer to pointer in the parsed pattern
8923 errcodeptr pointer to error code
8924 lcptr pointer to loop counter
8925 recurses chain of recurse_check to catch mutual recursion
8926 cb pointer to compile block
8927
8928 Returns: TRUE if all is well
8929 FALSE otherwise, with error code and offset set
8930 */
8931
8932 static BOOL
set_lookbehind_lengths(uint32_t ** pptrptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)8933 set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
8934 parsed_recurse_check *recurses, compile_block *cb)
8935 {
8936 PCRE2_SIZE offset;
8937 int branchlength;
8938 uint32_t *bptr = *pptrptr;
8939
8940 READPLUSOFFSET(offset, bptr); /* Offset for error messages */
8941 *pptrptr += SIZEOFFSET;
8942
8943 do
8944 {
8945 *pptrptr += 1;
8946 branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
8947 if (branchlength < 0)
8948 {
8949 /* The errorcode and offset may already be set from a nested lookbehind. */
8950 if (*errcodeptr == 0) *errcodeptr = ERR25;
8951 if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
8952 return FALSE;
8953 }
8954 *bptr |= branchlength; /* branchlength never more than 65535 */
8955 bptr = *pptrptr;
8956 }
8957 while (*bptr == META_ALT);
8958
8959 return TRUE;
8960 }
8961
8962
8963
8964 /*************************************************
8965 * Check parsed pattern lookbehinds *
8966 *************************************************/
8967
8968 /* This function is called at the end of parsing a pattern if any lookbehinds
8969 were encountered. It scans the parsed pattern for them, calling
8970 set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
8971 the error offset is marked unset. The enables the functions above not to
8972 override settings from deeper nestings.
8973
8974 Arguments cb points to the compile block
8975 Returns: 0 on success, or an errorcode (cb->erroroffset will be set)
8976 */
8977
8978 static int
check_lookbehinds(compile_block * cb)8979 check_lookbehinds(compile_block *cb)
8980 {
8981 uint32_t *pptr;
8982 int errorcode = 0;
8983 int loopcount = 0;
8984
8985 cb->erroroffset = PCRE2_UNSET;
8986
8987 for (pptr = cb->parsed_pattern; *pptr != META_END; pptr++)
8988 {
8989 if (*pptr < META_END) continue; /* Literal */
8990
8991 switch (META_CODE(*pptr))
8992 {
8993 default:
8994 return ERR70; /* Unrecognized meta code */
8995
8996 case META_ESCAPE:
8997 if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
8998 pptr += 1;
8999 break;
9000
9001 case META_ACCEPT:
9002 case META_ALT:
9003 case META_ASTERISK:
9004 case META_ASTERISK_PLUS:
9005 case META_ASTERISK_QUERY:
9006 case META_ATOMIC:
9007 case META_BACKREF:
9008 case META_CAPTURE:
9009 case META_CIRCUMFLEX:
9010 case META_CLASS:
9011 case META_CLASS_EMPTY:
9012 case META_CLASS_EMPTY_NOT:
9013 case META_CLASS_END:
9014 case META_CLASS_NOT:
9015 case META_COMMIT:
9016 case META_COND_ASSERT:
9017 case META_DOLLAR:
9018 case META_DOT:
9019 case META_FAIL:
9020 case META_KET:
9021 case META_LOOKAHEAD:
9022 case META_LOOKAHEADNOT:
9023 case META_NOCAPTURE:
9024 case META_PLUS:
9025 case META_PLUS_PLUS:
9026 case META_PLUS_QUERY:
9027 case META_PRUNE:
9028 case META_QUERY:
9029 case META_QUERY_PLUS:
9030 case META_QUERY_QUERY:
9031 case META_RANGE_ESCAPED:
9032 case META_RANGE_LITERAL:
9033 case META_SKIP:
9034 case META_THEN:
9035 break;
9036
9037 case META_RECURSE:
9038 pptr += SIZEOFFSET;
9039 break;
9040
9041 case META_BACKREF_BYNAME:
9042 case META_COND_DEFINE:
9043 case META_COND_NAME:
9044 case META_COND_NUMBER:
9045 case META_COND_RNAME:
9046 case META_COND_RNUMBER:
9047 case META_RECURSE_BYNAME:
9048 pptr += 1 + SIZEOFFSET;
9049 break;
9050
9051 case META_CALLOUT_STRING:
9052 pptr += 3 + SIZEOFFSET;
9053 break;
9054
9055 case META_BIGVALUE:
9056 case META_OPTIONS:
9057 case META_POSIX:
9058 case META_POSIX_NEG:
9059 pptr += 1;
9060 break;
9061
9062 case META_MINMAX:
9063 case META_MINMAX_QUERY:
9064 case META_MINMAX_PLUS:
9065 pptr += 2;
9066 break;
9067
9068 case META_CALLOUT_NUMBER:
9069 case META_COND_VERSION:
9070 pptr += 3;
9071 break;
9072
9073 case META_MARK:
9074 case META_COMMIT_ARG:
9075 case META_PRUNE_ARG:
9076 case META_SKIP_ARG:
9077 case META_THEN_ARG:
9078 pptr += 1 + pptr[1];
9079 break;
9080
9081 case META_LOOKBEHIND:
9082 case META_LOOKBEHINDNOT:
9083 if (!set_lookbehind_lengths(&pptr, &errorcode, &loopcount, NULL, cb))
9084 return errorcode;
9085 break;
9086 }
9087 }
9088
9089 return 0;
9090 }
9091
9092
9093
9094 /*************************************************
9095 * External function to compile a pattern *
9096 *************************************************/
9097
9098 /* This function reads a regular expression in the form of a string and returns
9099 a pointer to a block of store holding a compiled version of the expression.
9100
9101 Arguments:
9102 pattern the regular expression
9103 patlen the length of the pattern, or PCRE2_ZERO_TERMINATED
9104 options option bits
9105 errorptr pointer to errorcode
9106 erroroffset pointer to error offset
9107 ccontext points to a compile context or is NULL
9108
9109 Returns: pointer to compiled data block, or NULL on error,
9110 with errorcode and erroroffset set
9111 */
9112
9113 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_compile(PCRE2_SPTR pattern,PCRE2_SIZE patlen,uint32_t options,int * errorptr,PCRE2_SIZE * erroroffset,pcre2_compile_context * ccontext)9114 pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
9115 int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
9116 {
9117 BOOL utf; /* Set TRUE for UTF mode */
9118 BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */
9119 BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */
9120 pcre2_real_code *re = NULL; /* What we will return */
9121 compile_block cb; /* "Static" compile-time data */
9122 const uint8_t *tables; /* Char tables base pointer */
9123
9124 PCRE2_UCHAR *code; /* Current pointer in compiled code */
9125 PCRE2_SPTR codestart; /* Start of compiled code */
9126 PCRE2_SPTR ptr; /* Current pointer in pattern */
9127 uint32_t *pptr; /* Current pointer in parsed pattern */
9128
9129 PCRE2_SIZE length = 1; /* Allow for final END opcode */
9130 PCRE2_SIZE usedlength; /* Actual length used */
9131 PCRE2_SIZE re_blocksize; /* Size of memory block */
9132 PCRE2_SIZE big32count = 0; /* 32-bit literals >= 0x80000000 */
9133 PCRE2_SIZE parsed_size_needed; /* Needed for parsed pattern */
9134
9135 int32_t firstcuflags, reqcuflags; /* Type of first/req code unit */
9136 uint32_t firstcu, reqcu; /* Value of first/req code unit */
9137 uint32_t setflags = 0; /* NL and BSR set flags */
9138
9139 uint32_t skipatstart; /* When checking (*UTF) etc */
9140 uint32_t limit_heap = UINT32_MAX;
9141 uint32_t limit_match = UINT32_MAX; /* Unset match limits */
9142 uint32_t limit_depth = UINT32_MAX;
9143
9144 int newline = 0; /* Unset; can be set by the pattern */
9145 int bsr = 0; /* Unset; can be set by the pattern */
9146 int errorcode = 0; /* Initialize to avoid compiler warn */
9147 int regexrc; /* Return from compile */
9148
9149 uint32_t i; /* Local loop counter */
9150
9151 /* Comments at the head of this file explain about these variables. */
9152
9153 uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
9154 uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
9155 named_group named_groups[NAMED_GROUP_LIST_SIZE];
9156
9157 /* The workspace is used in different ways in the different compiling phases.
9158 It needs to be 16-bit aligned for the preliminary parsing scan. */
9159
9160 uint32_t c16workspace[C16_WORK_SIZE];
9161 PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
9162
9163
9164 /* -------------- Check arguments and set up the pattern ----------------- */
9165
9166 /* There must be error code and offset pointers. */
9167
9168 if (errorptr == NULL || erroroffset == NULL) return NULL;
9169 *errorptr = ERR0;
9170 *erroroffset = 0;
9171
9172 /* There must be a pattern! */
9173
9174 if (pattern == NULL)
9175 {
9176 *errorptr = ERR16;
9177 return NULL;
9178 }
9179
9180 /* A NULL compile context means "use a default context" */
9181
9182 if (ccontext == NULL)
9183 ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
9184
9185 /* Check that all undefined public option bits are zero. */
9186
9187 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
9188 (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
9189 {
9190 *errorptr = ERR17;
9191 return NULL;
9192 }
9193
9194 if ((options & PCRE2_LITERAL) != 0 &&
9195 ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
9196 (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
9197 {
9198 *errorptr = ERR92;
9199 return NULL;
9200 }
9201
9202 /* A zero-terminated pattern is indicated by the special length value
9203 PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
9204
9205 if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
9206 patlen = PRIV(strlen)(pattern);
9207
9208 if (patlen > ccontext->max_pattern_length)
9209 {
9210 *errorptr = ERR88;
9211 return NULL;
9212 }
9213
9214 /* From here on, all returns from this function should end up going via the
9215 EXIT label. */
9216
9217
9218 /* ------------ Initialize the "static" compile data -------------- */
9219
9220 tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
9221
9222 cb.lcc = tables + lcc_offset; /* Individual */
9223 cb.fcc = tables + fcc_offset; /* character */
9224 cb.cbits = tables + cbits_offset; /* tables */
9225 cb.ctypes = tables + ctypes_offset;
9226
9227 cb.assert_depth = 0;
9228 cb.bracount = 0;
9229 cb.cx = ccontext;
9230 cb.dupnames = FALSE;
9231 cb.end_pattern = pattern + patlen;
9232 cb.erroroffset = 0;
9233 cb.external_flags = 0;
9234 cb.external_options = options;
9235 cb.groupinfo = stack_groupinfo;
9236 cb.had_recurse = FALSE;
9237 cb.lastcapture = 0;
9238 cb.max_lookbehind = 0;
9239 cb.name_entry_size = 0;
9240 cb.name_table = NULL;
9241 cb.named_groups = named_groups;
9242 cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
9243 cb.names_found = 0;
9244 cb.open_caps = NULL;
9245 cb.parens_depth = 0;
9246 cb.parsed_pattern = stack_parsed_pattern;
9247 cb.req_varyopt = 0;
9248 cb.start_code = cworkspace;
9249 cb.start_pattern = pattern;
9250 cb.start_workspace = cworkspace;
9251 cb.workspace_size = COMPILE_WORK_SIZE;
9252
9253 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
9254 references to help in deciding whether (.*) can be treated as anchored or not.
9255 */
9256
9257 cb.top_backref = 0;
9258 cb.backref_map = 0;
9259
9260 /* Escape sequences \1 to \9 are always back references, but as they are only
9261 two characters long, only two elements can be used in the parsed_pattern
9262 vector. The first contains the reference, and we'd like to use the second to
9263 record the offset in the pattern, so that forward references to non-existent
9264 groups can be diagnosed later with an offset. However, on 64-bit systems,
9265 PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
9266 occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
9267 references have enough space for the offset to be put into the parsed pattern.
9268 */
9269
9270 for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
9271
9272
9273 /* --------------- Start looking at the pattern --------------- */
9274
9275 /* Unless PCRE2_LITERAL is set, check for global one-time option settings at
9276 the start of the pattern, and remember the offset to the actual regex. With
9277 valgrind support, make the terminator of a zero-terminated pattern
9278 inaccessible. This catches bugs that would otherwise only show up for
9279 non-zero-terminated patterns. */
9280
9281 #ifdef SUPPORT_VALGRIND
9282 if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
9283 #endif
9284
9285 ptr = pattern;
9286 skipatstart = 0;
9287
9288 if ((options & PCRE2_LITERAL) == 0)
9289 {
9290 while (patlen - skipatstart >= 2 &&
9291 ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
9292 ptr[skipatstart+1] == CHAR_ASTERISK)
9293 {
9294 for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
9295 {
9296 uint32_t c, pp;
9297 pso *p = pso_list + i;
9298
9299 if (patlen - skipatstart - 2 >= p->length &&
9300 PRIV(strncmp_c8)(ptr + skipatstart + 2, (char *)(p->name),
9301 p->length) == 0)
9302 {
9303 skipatstart += p->length + 2;
9304 switch(p->type)
9305 {
9306 case PSO_OPT:
9307 cb.external_options |= p->value;
9308 break;
9309
9310 case PSO_FLG:
9311 setflags |= p->value;
9312 break;
9313
9314 case PSO_NL:
9315 newline = p->value;
9316 setflags |= PCRE2_NL_SET;
9317 break;
9318
9319 case PSO_BSR:
9320 bsr = p->value;
9321 setflags |= PCRE2_BSR_SET;
9322 break;
9323
9324 case PSO_LIMM:
9325 case PSO_LIMD:
9326 case PSO_LIMH:
9327 c = 0;
9328 pp = skipatstart;
9329 if (!IS_DIGIT(ptr[pp]))
9330 {
9331 errorcode = ERR60;
9332 ptr += pp;
9333 goto HAD_EARLY_ERROR;
9334 }
9335 while (IS_DIGIT(ptr[pp]))
9336 {
9337 if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */
9338 c = c*10 + (ptr[pp++] - CHAR_0);
9339 }
9340 if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
9341 {
9342 errorcode = ERR60;
9343 ptr += pp;
9344 goto HAD_EARLY_ERROR;
9345 }
9346 if (p->type == PSO_LIMH) limit_heap = c;
9347 else if (p->type == PSO_LIMM) limit_match = c;
9348 else limit_depth = c;
9349 skipatstart += pp - skipatstart;
9350 break;
9351 }
9352 break; /* Out of the table scan loop */
9353 }
9354 }
9355 if (i >= sizeof(pso_list)/sizeof(pso)) break; /* Out of pso loop */
9356 }
9357 }
9358
9359 /* End of pattern-start options; advance to start of real regex. */
9360
9361 ptr += skipatstart;
9362
9363 /* Can't support UTF or UCP unless PCRE2 has been compiled with UTF support. */
9364
9365 #ifndef SUPPORT_UNICODE
9366 if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
9367 {
9368 errorcode = ERR32;
9369 goto HAD_EARLY_ERROR;
9370 }
9371 #endif
9372
9373 /* Check UTF. We have the original options in 'options', with that value as
9374 modified by (*UTF) etc in cb->external_options. The extra option
9375 PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
9376 surrogate code points cannot be represented in UTF-16. */
9377
9378 utf = (cb.external_options & PCRE2_UTF) != 0;
9379 if (utf)
9380 {
9381 if ((options & PCRE2_NEVER_UTF) != 0)
9382 {
9383 errorcode = ERR74;
9384 goto HAD_EARLY_ERROR;
9385 }
9386 if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
9387 (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
9388 goto HAD_ERROR; /* Offset was set by valid_utf() */
9389
9390 #if PCRE2_CODE_UNIT_WIDTH == 16
9391 if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
9392 {
9393 errorcode = ERR91;
9394 goto HAD_EARLY_ERROR;
9395 }
9396 #endif
9397 }
9398
9399 /* Check UCP lockout. */
9400
9401 if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) ==
9402 (PCRE2_UCP|PCRE2_NEVER_UCP))
9403 {
9404 errorcode = ERR75;
9405 goto HAD_EARLY_ERROR;
9406 }
9407
9408 /* Process the BSR setting. */
9409
9410 if (bsr == 0) bsr = ccontext->bsr_convention;
9411
9412 /* Process the newline setting. */
9413
9414 if (newline == 0) newline = ccontext->newline_convention;
9415 cb.nltype = NLTYPE_FIXED;
9416 switch(newline)
9417 {
9418 case PCRE2_NEWLINE_CR:
9419 cb.nllen = 1;
9420 cb.nl[0] = CHAR_CR;
9421 break;
9422
9423 case PCRE2_NEWLINE_LF:
9424 cb.nllen = 1;
9425 cb.nl[0] = CHAR_NL;
9426 break;
9427
9428 case PCRE2_NEWLINE_NUL:
9429 cb.nllen = 1;
9430 cb.nl[0] = CHAR_NUL;
9431 break;
9432
9433 case PCRE2_NEWLINE_CRLF:
9434 cb.nllen = 2;
9435 cb.nl[0] = CHAR_CR;
9436 cb.nl[1] = CHAR_NL;
9437 break;
9438
9439 case PCRE2_NEWLINE_ANY:
9440 cb.nltype = NLTYPE_ANY;
9441 break;
9442
9443 case PCRE2_NEWLINE_ANYCRLF:
9444 cb.nltype = NLTYPE_ANYCRLF;
9445 break;
9446
9447 default:
9448 errorcode = ERR56;
9449 goto HAD_EARLY_ERROR;
9450 }
9451
9452 /* Pre-scan the pattern to do two things: (1) Discover the named groups and
9453 their numerical equivalents, so that this information is always available for
9454 the remaining processing. (2) At the same time, parse the pattern and put a
9455 processed version into the parsed_pattern vector. This has escapes interpreted
9456 and comments removed (amongst other things).
9457
9458 In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned
9459 32-bit ints in the parsed pattern is bounded by the length of the pattern plus
9460 one (for the terminator) plus four if PCRE2_EXTRA_WORD or PCRE2_EXTRA_LINE is
9461 set. The exceptional case is when running in 32-bit, non-UTF mode, when literal
9462 characters greater than META_END (0x80000000) have to be coded as two units. In
9463 this case, therefore, we scan the pattern to check for such values. */
9464
9465 #if PCRE2_CODE_UNIT_WIDTH == 32
9466 if (!utf)
9467 {
9468 PCRE2_SPTR p;
9469 for (p = ptr; p < cb.end_pattern; p++) if (*p >= META_END) big32count++;
9470 }
9471 #endif
9472
9473 /* Ensure that the parsed pattern buffer is big enough. When PCRE2_AUTO_CALLOUT
9474 is set we have to assume a numerical callout (4 elements) for each character
9475 plus one at the end. This is overkill, but memory is plentiful these days. For
9476 many smaller patterns the vector on the stack (which was set up above) can be
9477 used. */
9478
9479 parsed_size_needed = patlen - skipatstart + big32count;
9480
9481 if ((ccontext->extra_options &
9482 (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
9483 parsed_size_needed += 4;
9484
9485 if ((options & PCRE2_AUTO_CALLOUT) != 0)
9486 parsed_size_needed = (parsed_size_needed + 1) * 5;
9487
9488 if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE)
9489 {
9490 uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
9491 (parsed_size_needed + 1) * sizeof(uint32_t), ccontext->memctl.memory_data);
9492 if (heap_parsed_pattern == NULL)
9493 {
9494 *errorptr = ERR21;
9495 goto EXIT;
9496 }
9497 cb.parsed_pattern = heap_parsed_pattern;
9498 }
9499 cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed + 1;
9500
9501 /* Do the parsing scan. */
9502
9503 errorcode = parse_regex(ptr, cb.external_options, &has_lookbehind, &cb);
9504 if (errorcode != 0) goto HAD_CB_ERROR;
9505
9506 /* Workspace is needed to remember information about numbered groups: whether a
9507 group can match an empty string and what its fixed length is. This is done to
9508 avoid the possibility of recursive references causing very long compile times
9509 when checking these features. Unnumbered groups do not have this exposure since
9510 they cannot be referenced. We use an indexed vector for this purpose. If there
9511 are sufficiently few groups, the default vector on the stack, as set up above,
9512 can be used. Otherwise we have to get/free a special vector. The vector must be
9513 initialized to zero. */
9514
9515 if (cb.bracount >= GROUPINFO_DEFAULT_SIZE)
9516 {
9517 cb.groupinfo = ccontext->memctl.malloc(
9518 (cb.bracount + 1)*sizeof(uint32_t), ccontext->memctl.memory_data);
9519 if (cb.groupinfo == NULL)
9520 {
9521 errorcode = ERR21;
9522 cb.erroroffset = 0;
9523 goto HAD_CB_ERROR;
9524 }
9525 }
9526 memset(cb.groupinfo, 0, (cb.bracount + 1) * sizeof(uint32_t));
9527
9528 /* If there were any lookbehinds, scan the parsed pattern to figure out their
9529 lengths. */
9530
9531 if (has_lookbehind)
9532 {
9533 errorcode = check_lookbehinds(&cb);
9534 if (errorcode != 0) goto HAD_CB_ERROR;
9535 }
9536
9537 /* For debugging, there is a function that shows the parsed data vector. */
9538
9539 #ifdef DEBUG_SHOW_PARSED
9540 fprintf(stderr, "+++ Pre-scan complete:\n");
9541 show_parsed(&cb);
9542 #endif
9543
9544 /* For debugging capturing information this code can be enabled. */
9545
9546 #ifdef DEBUG_SHOW_CAPTURES
9547 {
9548 named_group *ng = cb.named_groups;
9549 fprintf(stderr, "+++Captures: %d\n", cb.bracount);
9550 for (i = 0; i < cb.names_found; i++, ng++)
9551 {
9552 fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
9553 }
9554 }
9555 #endif
9556
9557 /* Pretend to compile the pattern while actually just accumulating the amount
9558 of memory required in the 'length' variable. This behaviour is triggered by
9559 passing a non-NULL final argument to compile_regex(). We pass a block of
9560 workspace (cworkspace) for it to compile parts of the pattern into; the
9561 compiled code is discarded when it is no longer needed, so hopefully this
9562 workspace will never overflow, though there is a test for its doing so.
9563
9564 On error, errorcode will be set non-zero, so we don't need to look at the
9565 result of the function. The initial options have been put into the cb block,
9566 but we still have to pass a separate options variable (the first argument)
9567 because the options may change as the pattern is processed. */
9568
9569 cb.erroroffset = patlen; /* For any subsequent errors that do not set it */
9570 pptr = cb.parsed_pattern;
9571 code = cworkspace;
9572 *code = OP_BRA;
9573
9574 (void)compile_regex(cb.external_options, &code, &pptr, &errorcode, 0, &firstcu,
9575 &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, &length);
9576
9577 if (errorcode != 0) goto HAD_CB_ERROR; /* Offset is in cb.erroroffset */
9578
9579 /* This should be caught in compile_regex(), but just in case... */
9580
9581 if (length > MAX_PATTERN_SIZE)
9582 {
9583 errorcode = ERR20;
9584 goto HAD_CB_ERROR;
9585 }
9586
9587 /* Compute the size of, and then get and initialize, the data block for storing
9588 the compiled pattern and names table. Integer overflow should no longer be
9589 possible because nowadays we limit the maximum value of cb.names_found and
9590 cb.name_entry_size. */
9591
9592 re_blocksize = sizeof(pcre2_real_code) +
9593 CU2BYTES(length +
9594 (PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
9595 re = (pcre2_real_code *)
9596 ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
9597 if (re == NULL)
9598 {
9599 errorcode = ERR21;
9600 goto HAD_CB_ERROR;
9601 }
9602
9603 /* The compiler may put padding at the end of the pcre2_real_code structure in
9604 order to round it up to a multiple of 4 or 8 bytes. This means that when a
9605 compiled pattern is copied (for example, when serialized) undefined bytes are
9606 read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
9607 write to the last 8 bytes of the structure before setting the fields. */
9608
9609 memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
9610 re->memctl = ccontext->memctl;
9611 re->tables = tables;
9612 re->executable_jit = NULL;
9613 memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
9614 re->blocksize = re_blocksize;
9615 re->magic_number = MAGIC_NUMBER;
9616 re->compile_options = options;
9617 re->overall_options = cb.external_options;
9618 re->extra_options = ccontext->extra_options;
9619 re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
9620 re->limit_heap = limit_heap;
9621 re->limit_match = limit_match;
9622 re->limit_depth = limit_depth;
9623 re->first_codeunit = 0;
9624 re->last_codeunit = 0;
9625 re->bsr_convention = bsr;
9626 re->newline_convention = newline;
9627 re->max_lookbehind = 0;
9628 re->minlength = 0;
9629 re->top_bracket = 0;
9630 re->top_backref = 0;
9631 re->name_entry_size = cb.name_entry_size;
9632 re->name_count = cb.names_found;
9633
9634 /* The basic block is immediately followed by the name table, and the compiled
9635 code follows after that. */
9636
9637 codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) +
9638 re->name_entry_size * re->name_count;
9639
9640 /* Update the compile data block for the actual compile. The starting points of
9641 the name/number translation table and of the code are passed around in the
9642 compile data block. The start/end pattern and initial options are already set
9643 from the pre-compile phase, as is the name_entry_size field. */
9644
9645 cb.parens_depth = 0;
9646 cb.assert_depth = 0;
9647 cb.lastcapture = 0;
9648 cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
9649 cb.start_code = codestart;
9650 cb.req_varyopt = 0;
9651 cb.had_accept = FALSE;
9652 cb.had_pruneorskip = FALSE;
9653 cb.open_caps = NULL;
9654
9655 /* If any named groups were found, create the name/number table from the list
9656 created in the pre-pass. */
9657
9658 if (cb.names_found > 0)
9659 {
9660 named_group *ng = cb.named_groups;
9661 for (i = 0; i < cb.names_found; i++, ng++)
9662 add_name_to_table(&cb, ng->name, ng->length, ng->number, i);
9663 }
9664
9665 /* Set up a starting, non-extracting bracket, then compile the expression. On
9666 error, errorcode will be set non-zero, so we don't need to look at the result
9667 of the function here. */
9668
9669 pptr = cb.parsed_pattern;
9670 code = (PCRE2_UCHAR *)codestart;
9671 *code = OP_BRA;
9672 regexrc = compile_regex(re->overall_options, &code, &pptr, &errorcode, 0,
9673 &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL);
9674 if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
9675 re->top_bracket = cb.bracount;
9676 re->top_backref = cb.top_backref;
9677 re->max_lookbehind = cb.max_lookbehind;
9678
9679 if (cb.had_accept)
9680 {
9681 reqcu = 0; /* Must disable after (*ACCEPT) */
9682 reqcuflags = REQ_NONE;
9683 }
9684
9685 /* Fill in the final opcode and check for disastrous overflow. If no overflow,
9686 but the estimated length exceeds the really used length, adjust the value of
9687 re->blocksize, and if valgrind support is configured, mark the extra allocated
9688 memory as unaddressable, so that any out-of-bound reads can be detected. */
9689
9690 *code++ = OP_END;
9691 usedlength = code - codestart;
9692 if (usedlength > length) errorcode = ERR23; else
9693 {
9694 re->blocksize -= CU2BYTES(length - usedlength);
9695 #ifdef SUPPORT_VALGRIND
9696 VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
9697 #endif
9698 }
9699
9700 /* Scan the pattern for recursion/subroutine calls and convert the group
9701 numbers into offsets. Maintain a small cache so that repeated groups containing
9702 recursions are efficiently handled. */
9703
9704 #define RSCAN_CACHE_SIZE 8
9705
9706 if (errorcode == 0 && cb.had_recurse)
9707 {
9708 PCRE2_UCHAR *rcode;
9709 PCRE2_SPTR rgroup;
9710 unsigned int ccount = 0;
9711 int start = RSCAN_CACHE_SIZE;
9712 recurse_cache rc[RSCAN_CACHE_SIZE];
9713
9714 for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf);
9715 rcode != NULL;
9716 rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf))
9717 {
9718 int p, groupnumber;
9719
9720 groupnumber = (int)GET(rcode, 1);
9721 if (groupnumber == 0) rgroup = codestart; else
9722 {
9723 PCRE2_SPTR search_from = codestart;
9724 rgroup = NULL;
9725 for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
9726 {
9727 if (groupnumber == rc[p].groupnumber)
9728 {
9729 rgroup = rc[p].group;
9730 break;
9731 }
9732
9733 /* Group n+1 must always start to the right of group n, so we can save
9734 search time below when the new group number is greater than any of the
9735 previously found groups. */
9736
9737 if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
9738 }
9739
9740 if (rgroup == NULL)
9741 {
9742 rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
9743 if (rgroup == NULL)
9744 {
9745 errorcode = ERR53;
9746 break;
9747 }
9748 if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
9749 rc[start].groupnumber = groupnumber;
9750 rc[start].group = rgroup;
9751 if (ccount < RSCAN_CACHE_SIZE) ccount++;
9752 }
9753 }
9754
9755 PUT(rcode, 1, rgroup - codestart);
9756 }
9757 }
9758
9759 /* In rare debugging situations we sometimes need to look at the compiled code
9760 at this stage. */
9761
9762 #ifdef DEBUG_CALL_PRINTINT
9763 pcre2_printint(re, stderr, TRUE);
9764 fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
9765 #endif
9766
9767 /* Unless disabled, check whether any single character iterators can be
9768 auto-possessified. The function overwrites the appropriate opcode values, so
9769 the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
9770 used in this code because at least one compiler gives a warning about loss of
9771 "const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
9772 function call. */
9773
9774 if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
9775 {
9776 PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
9777 if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80;
9778 }
9779
9780 /* Failed to compile, or error while post-processing. */
9781
9782 if (errorcode != 0) goto HAD_CB_ERROR;
9783
9784 /* Successful compile. If the anchored option was not passed, set it if
9785 we can determine that the pattern is anchored by virtue of ^ characters or \A
9786 or anything else, such as starting with non-atomic .* when DOTALL is set and
9787 there are no occurrences of *PRUNE or *SKIP (though there is an option to
9788 disable this case). */
9789
9790 if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
9791 is_anchored(codestart, 0, &cb, 0, FALSE))
9792 re->overall_options |= PCRE2_ANCHORED;
9793
9794 /* Set up the first code unit or startline flag, the required code unit, and
9795 then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE
9796 is set, as the data it would create will not be used. Note that a first code
9797 unit (but not the startline flag) is useful for anchored patterns because it
9798 can still give a quick "no match" and also avoid searching for a last code
9799 unit. */
9800
9801 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
9802 {
9803 /* If we do not have a first code unit, see if there is one that is asserted
9804 (these are not saved during the compile because they can cause conflicts with
9805 actual literals that follow). */
9806
9807 if (firstcuflags < 0)
9808 firstcu = find_firstassertedcu(codestart, &firstcuflags, 0);
9809
9810 /* Save the data for a first code unit. */
9811
9812 if (firstcuflags >= 0)
9813 {
9814 re->first_codeunit = firstcu;
9815 re->flags |= PCRE2_FIRSTSET;
9816
9817 /* Handle caseless first code units. */
9818
9819 if ((firstcuflags & REQ_CASELESS) != 0)
9820 {
9821 if (firstcu < 128 || (!utf && firstcu < 255))
9822 {
9823 if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
9824 }
9825
9826 /* The first code unit is >= 128 in UTF mode, or >= 255 otherwise. In
9827 8-bit UTF mode, code units in the range 128-255 are introductory code
9828 units and cannot have another case. In 16-bit and 32-bit modes, we can
9829 check wide characters when UTF (and therefore UCP) is supported. */
9830
9831 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
9832 else if (firstcu <= MAX_UTF_CODE_POINT &&
9833 UCD_OTHERCASE(firstcu) != firstcu)
9834 re->flags |= PCRE2_FIRSTCASELESS;
9835 #endif
9836 }
9837 }
9838
9839 /* When there is no first code unit, for non-anchored patterns, see if we can
9840 set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
9841 branches start with ^ and also when all branches start with non-atomic .* for
9842 non-DOTALL matches when *PRUNE and *SKIP are not present. (There is an option
9843 that disables this case.) */
9844
9845 else if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
9846 is_startline(codestart, 0, &cb, 0, FALSE))
9847 re->flags |= PCRE2_STARTLINE;
9848
9849 /* Handle the "required code unit", if one is set. In the case of an anchored
9850 pattern, do this only if it follows a variable length item in the pattern. */
9851
9852 if (reqcuflags >= 0 &&
9853 ((re->overall_options & PCRE2_ANCHORED) == 0 ||
9854 (reqcuflags & REQ_VARY) != 0))
9855 {
9856 re->last_codeunit = reqcu;
9857 re->flags |= PCRE2_LASTSET;
9858
9859 /* Handle caseless required code units as for first code units (above). */
9860
9861 if ((reqcuflags & REQ_CASELESS) != 0)
9862 {
9863 if (reqcu < 128 || (!utf && reqcu < 255))
9864 {
9865 if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
9866 }
9867 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
9868 else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu)
9869 re->flags |= PCRE2_LASTCASELESS;
9870 #endif
9871 }
9872 }
9873
9874 /* Finally, study the compiled pattern to set up information such as a bitmap
9875 of starting code units and a minimum matching length. */
9876
9877 if (PRIV(study)(re) != 0)
9878 {
9879 errorcode = ERR31;
9880 goto HAD_CB_ERROR;
9881 }
9882 } /* End of start-of-match optimizations. */
9883
9884 /* Control ends up here in all cases. When running under valgrind, make a
9885 pattern's terminating zero defined again. If memory was obtained for the parsed
9886 version of the pattern, free it before returning. Also free the list of named
9887 groups if a larger one had to be obtained, and likewise the group information
9888 vector. */
9889
9890 EXIT:
9891 #ifdef SUPPORT_VALGRIND
9892 if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
9893 #endif
9894 if (cb.parsed_pattern != stack_parsed_pattern)
9895 ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
9896 if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
9897 ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
9898 if (cb.groupinfo != stack_groupinfo)
9899 ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
9900 return re; /* Will be NULL after an error */
9901
9902 /* Errors discovered in parse_regex() set the offset value in the compile
9903 block. Errors discovered before it is called must compute it from the ptr
9904 value. After parse_regex() is called, the offset in the compile block is set to
9905 the end of the pattern, but certain errors in compile_regex() may reset it if
9906 an offset is available in the parsed pattern. */
9907
9908 HAD_CB_ERROR:
9909 ptr = pattern + cb.erroroffset;
9910
9911 HAD_EARLY_ERROR:
9912 *erroroffset = ptr - pattern;
9913
9914 HAD_ERROR:
9915 *errorptr = errorcode;
9916 pcre2_code_free(re);
9917 re = NULL;
9918 goto EXIT;
9919 }
9920
9921 /* End of pcre2_compile.c */
9922