1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41 /* This module contains functions that scan a compiled pattern and change
42 repeats into possessive repeats where possible. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49
50 #include "pcre2_internal.h"
51
52
53 /*************************************************
54 * Tables for auto-possessification *
55 *************************************************/
56
57 /* This table is used to check whether auto-possessification is possible
58 between adjacent character-type opcodes. The left-hand (repeated) opcode is
59 used to select the row, and the right-hand opcode is use to select the column.
60 A value of 1 means that auto-possessification is OK. For example, the second
61 value in the first row means that \D+\d can be turned into \D++\d.
62
63 The Unicode property types (\P and \p) have to be present to fill out the table
64 because of what their opcode values are, but the table values should always be
65 zero because property types are handled separately in the code. The last four
66 columns apply to items that cannot be repeated, so there is no need to have
67 rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
68 *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
69
70 #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
71 #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
72
73 static const uint8_t autoposstab[APTROWS][APTCOLS] = {
74 /* \D \d \S \s \W \w . .+ \C \P \p \R \H \h \V \v \X \Z \z $ $M */
75 { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \D */
76 { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \d */
77 { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \S */
78 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \s */
79 { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \W */
80 { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \w */
81 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* . */
82 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* .+ */
83 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \C */
84 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \P */
85 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \p */
86 { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \R */
87 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \H */
88 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \h */
89 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \V */
90 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 }, /* \v */
91 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 } /* \X */
92 };
93
94 #ifdef SUPPORT_UNICODE
95 /* This table is used to check whether auto-possessification is possible
96 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
97 left-hand (repeated) opcode is used to select the row, and the right-hand
98 opcode is used to select the column. The values are as follows:
99
100 0 Always return FALSE (never auto-possessify)
101 1 Character groups are distinct (possessify if both are OP_PROP)
102 2 Check character categories in the same group (general or particular)
103 3 TRUE if the two opcodes are not the same (PROP vs NOTPROP)
104
105 4 Check left general category vs right particular category
106 5 Check right general category vs left particular category
107
108 6 Left alphanum vs right general category
109 7 Left space vs right general category
110 8 Left word vs right general category
111
112 9 Right alphanum vs left general category
113 10 Right space vs left general category
114 11 Right word vs left general category
115
116 12 Left alphanum vs right particular category
117 13 Left space vs right particular category
118 14 Left word vs right particular category
119
120 15 Right alphanum vs left particular category
121 16 Right space vs left particular category
122 17 Right word vs left particular category
123 */
124
125 static const uint8_t propposstab[PT_TABSIZE][PT_TABSIZE] = {
126 /* ANY LAMP GC PC SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
127 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_ANY */
128 { 0, 3, 0, 0, 0, 3, 1, 1, 0, 0, 0 }, /* PT_LAMP */
129 { 0, 0, 2, 4, 0, 9, 10, 10, 11, 0, 0 }, /* PT_GC */
130 { 0, 0, 5, 2, 0, 15, 16, 16, 17, 0, 0 }, /* PT_PC */
131 { 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, /* PT_SC */
132 { 0, 3, 6, 12, 0, 3, 1, 1, 0, 0, 0 }, /* PT_ALNUM */
133 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_SPACE */
134 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_PXSPACE */
135 { 0, 0, 8, 14, 0, 0, 1, 1, 3, 0, 0 }, /* PT_WORD */
136 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */
137 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 } /* PT_UCNC */
138 };
139
140 /* This table is used to check whether auto-possessification is possible
141 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
142 specifies a general category and the other specifies a particular category. The
143 row is selected by the general category and the column by the particular
144 category. The value is 1 if the particular category is not part of the general
145 category. */
146
147 static const uint8_t catposstab[7][30] = {
148 /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
149 { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* C */
150 { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* L */
151 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* M */
152 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */
153 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 }, /* P */
154 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 }, /* S */
155 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 } /* Z */
156 };
157
158 /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
159 a general or particular category. The properties in each row are those
160 that apply to the character set in question. Duplication means that a little
161 unnecessary work is done when checking, but this keeps things much simpler
162 because they can all use the same code. For more details see the comment where
163 this table is used.
164
165 Note: SPACE and PXSPACE used to be different because Perl excluded VT from
166 "space", but from Perl 5.18 it's included, so both categories are treated the
167 same here. */
168
169 static const uint8_t posspropstab[3][4] = {
170 { ucp_L, ucp_N, ucp_N, ucp_Nl }, /* ALNUM, 3rd and 4th values redundant */
171 { ucp_Z, ucp_Z, ucp_C, ucp_Cc }, /* SPACE and PXSPACE, 2nd value redundant */
172 { ucp_L, ucp_N, ucp_P, ucp_Po } /* WORD */
173 };
174 #endif /* SUPPORT_UNICODE */
175
176
177
178 #ifdef SUPPORT_UNICODE
179 /*************************************************
180 * Check a character and a property *
181 *************************************************/
182
183 /* This function is called by compare_opcodes() when a property item is
184 adjacent to a fixed character.
185
186 Arguments:
187 c the character
188 ptype the property type
189 pdata the data for the type
190 negated TRUE if it's a negated property (\P or \p{^)
191
192 Returns: TRUE if auto-possessifying is OK
193 */
194
195 static BOOL
check_char_prop(uint32_t c,unsigned int ptype,unsigned int pdata,BOOL negated)196 check_char_prop(uint32_t c, unsigned int ptype, unsigned int pdata,
197 BOOL negated)
198 {
199 const uint32_t *p;
200 const ucd_record *prop = GET_UCD(c);
201
202 switch(ptype)
203 {
204 case PT_LAMP:
205 return (prop->chartype == ucp_Lu ||
206 prop->chartype == ucp_Ll ||
207 prop->chartype == ucp_Lt) == negated;
208
209 case PT_GC:
210 return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
211
212 case PT_PC:
213 return (pdata == prop->chartype) == negated;
214
215 case PT_SC:
216 return (pdata == prop->script) == negated;
217
218 /* These are specials */
219
220 case PT_ALNUM:
221 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
222 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
223
224 /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
225 means that Perl space and POSIX space are now identical. PCRE was changed
226 at release 8.34. */
227
228 case PT_SPACE: /* Perl space */
229 case PT_PXSPACE: /* POSIX space */
230 switch(c)
231 {
232 HSPACE_CASES:
233 VSPACE_CASES:
234 return negated;
235
236 default:
237 return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
238 }
239 break; /* Control never reaches here */
240
241 case PT_WORD:
242 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
243 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
244 c == CHAR_UNDERSCORE) == negated;
245
246 case PT_CLIST:
247 p = PRIV(ucd_caseless_sets) + prop->caseset;
248 for (;;)
249 {
250 if (c < *p) return !negated;
251 if (c == *p++) return negated;
252 }
253 break; /* Control never reaches here */
254 }
255
256 return FALSE;
257 }
258 #endif /* SUPPORT_UNICODE */
259
260
261
262 /*************************************************
263 * Base opcode of repeated opcodes *
264 *************************************************/
265
266 /* Returns the base opcode for repeated single character type opcodes. If the
267 opcode is not a repeated character type, it returns with the original value.
268
269 Arguments: c opcode
270 Returns: base opcode for the type
271 */
272
273 static PCRE2_UCHAR
get_repeat_base(PCRE2_UCHAR c)274 get_repeat_base(PCRE2_UCHAR c)
275 {
276 return (c > OP_TYPEPOSUPTO)? c :
277 (c >= OP_TYPESTAR)? OP_TYPESTAR :
278 (c >= OP_NOTSTARI)? OP_NOTSTARI :
279 (c >= OP_NOTSTAR)? OP_NOTSTAR :
280 (c >= OP_STARI)? OP_STARI :
281 OP_STAR;
282 }
283
284
285 /*************************************************
286 * Fill the character property list *
287 *************************************************/
288
289 /* Checks whether the code points to an opcode that can take part in auto-
290 possessification, and if so, fills a list with its properties.
291
292 Arguments:
293 code points to start of expression
294 utf TRUE if in UTF mode
295 fcc points to the case-flipping table
296 list points to output list
297 list[0] will be filled with the opcode
298 list[1] will be non-zero if this opcode
299 can match an empty character string
300 list[2..7] depends on the opcode
301
302 Returns: points to the start of the next opcode if *code is accepted
303 NULL if *code is not accepted
304 */
305
306 static PCRE2_SPTR
get_chr_property_list(PCRE2_SPTR code,BOOL utf,const uint8_t * fcc,uint32_t * list)307 get_chr_property_list(PCRE2_SPTR code, BOOL utf, const uint8_t *fcc,
308 uint32_t *list)
309 {
310 PCRE2_UCHAR c = *code;
311 PCRE2_UCHAR base;
312 PCRE2_SPTR end;
313 uint32_t chr;
314
315 #ifdef SUPPORT_UNICODE
316 uint32_t *clist_dest;
317 const uint32_t *clist_src;
318 #else
319 (void)utf; /* Suppress "unused parameter" compiler warning */
320 #endif
321
322 list[0] = c;
323 list[1] = FALSE;
324 code++;
325
326 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
327 {
328 base = get_repeat_base(c);
329 c -= (base - OP_STAR);
330
331 if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
332 code += IMM2_SIZE;
333
334 list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT &&
335 c != OP_POSPLUS);
336
337 switch(base)
338 {
339 case OP_STAR:
340 list[0] = OP_CHAR;
341 break;
342
343 case OP_STARI:
344 list[0] = OP_CHARI;
345 break;
346
347 case OP_NOTSTAR:
348 list[0] = OP_NOT;
349 break;
350
351 case OP_NOTSTARI:
352 list[0] = OP_NOTI;
353 break;
354
355 case OP_TYPESTAR:
356 list[0] = *code;
357 code++;
358 break;
359 }
360 c = list[0];
361 }
362
363 switch(c)
364 {
365 case OP_NOT_DIGIT:
366 case OP_DIGIT:
367 case OP_NOT_WHITESPACE:
368 case OP_WHITESPACE:
369 case OP_NOT_WORDCHAR:
370 case OP_WORDCHAR:
371 case OP_ANY:
372 case OP_ALLANY:
373 case OP_ANYNL:
374 case OP_NOT_HSPACE:
375 case OP_HSPACE:
376 case OP_NOT_VSPACE:
377 case OP_VSPACE:
378 case OP_EXTUNI:
379 case OP_EODN:
380 case OP_EOD:
381 case OP_DOLL:
382 case OP_DOLLM:
383 return code;
384
385 case OP_CHAR:
386 case OP_NOT:
387 GETCHARINCTEST(chr, code);
388 list[2] = chr;
389 list[3] = NOTACHAR;
390 return code;
391
392 case OP_CHARI:
393 case OP_NOTI:
394 list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
395 GETCHARINCTEST(chr, code);
396 list[2] = chr;
397
398 #ifdef SUPPORT_UNICODE
399 if (chr < 128 || (chr < 256 && !utf))
400 list[3] = fcc[chr];
401 else
402 list[3] = UCD_OTHERCASE(chr);
403 #elif defined SUPPORT_WIDE_CHARS
404 list[3] = (chr < 256) ? fcc[chr] : chr;
405 #else
406 list[3] = fcc[chr];
407 #endif
408
409 /* The othercase might be the same value. */
410
411 if (chr == list[3])
412 list[3] = NOTACHAR;
413 else
414 list[4] = NOTACHAR;
415 return code;
416
417 #ifdef SUPPORT_UNICODE
418 case OP_PROP:
419 case OP_NOTPROP:
420 if (code[0] != PT_CLIST)
421 {
422 list[2] = code[0];
423 list[3] = code[1];
424 return code + 2;
425 }
426
427 /* Convert only if we have enough space. */
428
429 clist_src = PRIV(ucd_caseless_sets) + code[1];
430 clist_dest = list + 2;
431 code += 2;
432
433 do {
434 if (clist_dest >= list + 8)
435 {
436 /* Early return if there is not enough space. This should never
437 happen, since all clists are shorter than 5 character now. */
438 list[2] = code[0];
439 list[3] = code[1];
440 return code;
441 }
442 *clist_dest++ = *clist_src;
443 }
444 while(*clist_src++ != NOTACHAR);
445
446 /* All characters are stored. The terminating NOTACHAR is copied from the
447 clist itself. */
448
449 list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
450 return code;
451 #endif
452
453 case OP_NCLASS:
454 case OP_CLASS:
455 #ifdef SUPPORT_WIDE_CHARS
456 case OP_XCLASS:
457 if (c == OP_XCLASS)
458 end = code + GET(code, 0) - 1;
459 else
460 #endif
461 end = code + 32 / sizeof(PCRE2_UCHAR);
462
463 switch(*end)
464 {
465 case OP_CRSTAR:
466 case OP_CRMINSTAR:
467 case OP_CRQUERY:
468 case OP_CRMINQUERY:
469 case OP_CRPOSSTAR:
470 case OP_CRPOSQUERY:
471 list[1] = TRUE;
472 end++;
473 break;
474
475 case OP_CRPLUS:
476 case OP_CRMINPLUS:
477 case OP_CRPOSPLUS:
478 end++;
479 break;
480
481 case OP_CRRANGE:
482 case OP_CRMINRANGE:
483 case OP_CRPOSRANGE:
484 list[1] = (GET2(end, 1) == 0);
485 end += 1 + 2 * IMM2_SIZE;
486 break;
487 }
488 list[2] = (uint32_t)(end - code);
489 return end;
490 }
491 return NULL; /* Opcode not accepted */
492 }
493
494
495
496 /*************************************************
497 * Scan further character sets for match *
498 *************************************************/
499
500 /* Checks whether the base and the current opcode have a common character, in
501 which case the base cannot be possessified.
502
503 Arguments:
504 code points to the byte code
505 utf TRUE in UTF mode
506 cb compile data block
507 base_list the data list of the base opcode
508 base_end the end of the data list
509 rec_limit points to recursion depth counter
510
511 Returns: TRUE if the auto-possessification is possible
512 */
513
514 static BOOL
compare_opcodes(PCRE2_SPTR code,BOOL utf,const compile_block * cb,const uint32_t * base_list,PCRE2_SPTR base_end,int * rec_limit)515 compare_opcodes(PCRE2_SPTR code, BOOL utf, const compile_block *cb,
516 const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit)
517 {
518 PCRE2_UCHAR c;
519 uint32_t list[8];
520 const uint32_t *chr_ptr;
521 const uint32_t *ochr_ptr;
522 const uint32_t *list_ptr;
523 PCRE2_SPTR next_code;
524 #ifdef SUPPORT_WIDE_CHARS
525 PCRE2_SPTR xclass_flags;
526 #endif
527 const uint8_t *class_bitset;
528 const uint8_t *set1, *set2, *set_end;
529 uint32_t chr;
530 BOOL accepted, invert_bits;
531 BOOL entered_a_group = FALSE;
532
533 if (--(*rec_limit) <= 0) return FALSE; /* Recursion has gone too deep */
534
535 /* Note: the base_list[1] contains whether the current opcode has a greedy
536 (represented by a non-zero value) quantifier. This is a different from
537 other character type lists, which store here that the character iterator
538 matches to an empty string (also represented by a non-zero value). */
539
540 for(;;)
541 {
542 /* All operations move the code pointer forward.
543 Therefore infinite recursions are not possible. */
544
545 c = *code;
546
547 /* Skip over callouts */
548
549 if (c == OP_CALLOUT)
550 {
551 code += PRIV(OP_lengths)[c];
552 continue;
553 }
554
555 if (c == OP_CALLOUT_STR)
556 {
557 code += GET(code, 1 + 2*LINK_SIZE);
558 continue;
559 }
560
561 if (c == OP_ALT)
562 {
563 do code += GET(code, 1); while (*code == OP_ALT);
564 c = *code;
565 }
566
567 switch(c)
568 {
569 case OP_END:
570 case OP_KETRPOS:
571 /* TRUE only in greedy case. The non-greedy case could be replaced by
572 an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
573 uses more memory, which we cannot get at this stage.) */
574
575 return base_list[1] != 0;
576
577 case OP_KET:
578 /* If the bracket is capturing, and referenced by an OP_RECURSE, or
579 it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
580 cannot be converted to a possessive form. */
581
582 if (base_list[1] == 0) return FALSE;
583
584 switch(*(code - GET(code, 1)))
585 {
586 case OP_ASSERT:
587 case OP_ASSERT_NOT:
588 case OP_ASSERTBACK:
589 case OP_ASSERTBACK_NOT:
590 case OP_ONCE:
591 case OP_ONCE_NC:
592 /* Atomic sub-patterns and assertions can always auto-possessify their
593 last iterator. However, if the group was entered as a result of checking
594 a previous iterator, this is not possible. */
595
596 return !entered_a_group;
597 }
598
599 code += PRIV(OP_lengths)[c];
600 continue;
601
602 case OP_ONCE:
603 case OP_ONCE_NC:
604 case OP_BRA:
605 case OP_CBRA:
606 next_code = code + GET(code, 1);
607 code += PRIV(OP_lengths)[c];
608
609 while (*next_code == OP_ALT)
610 {
611 if (!compare_opcodes(code, utf, cb, base_list, base_end, rec_limit))
612 return FALSE;
613 code = next_code + 1 + LINK_SIZE;
614 next_code += GET(next_code, 1);
615 }
616
617 entered_a_group = TRUE;
618 continue;
619
620 case OP_BRAZERO:
621 case OP_BRAMINZERO:
622
623 next_code = code + 1;
624 if (*next_code != OP_BRA && *next_code != OP_CBRA
625 && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
626
627 do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
628
629 /* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */
630
631 next_code += 1 + LINK_SIZE;
632 if (!compare_opcodes(next_code, utf, cb, base_list, base_end, rec_limit))
633 return FALSE;
634
635 code += PRIV(OP_lengths)[c];
636 continue;
637
638 default:
639 break;
640 }
641
642 /* Check for a supported opcode, and load its properties. */
643
644 code = get_chr_property_list(code, utf, cb->fcc, list);
645 if (code == NULL) return FALSE; /* Unsupported */
646
647 /* If either opcode is a small character list, set pointers for comparing
648 characters from that list with another list, or with a property. */
649
650 if (base_list[0] == OP_CHAR)
651 {
652 chr_ptr = base_list + 2;
653 list_ptr = list;
654 }
655 else if (list[0] == OP_CHAR)
656 {
657 chr_ptr = list + 2;
658 list_ptr = base_list;
659 }
660
661 /* Character bitsets can also be compared to certain opcodes. */
662
663 else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
664 #if PCRE2_CODE_UNIT_WIDTH == 8
665 /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
666 || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
667 #endif
668 )
669 {
670 #if PCRE2_CODE_UNIT_WIDTH == 8
671 if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
672 #else
673 if (base_list[0] == OP_CLASS)
674 #endif
675 {
676 set1 = (uint8_t *)(base_end - base_list[2]);
677 list_ptr = list;
678 }
679 else
680 {
681 set1 = (uint8_t *)(code - list[2]);
682 list_ptr = base_list;
683 }
684
685 invert_bits = FALSE;
686 switch(list_ptr[0])
687 {
688 case OP_CLASS:
689 case OP_NCLASS:
690 set2 = (uint8_t *)
691 ((list_ptr == list ? code : base_end) - list_ptr[2]);
692 break;
693
694 #ifdef SUPPORT_WIDE_CHARS
695 case OP_XCLASS:
696 xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE;
697 if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
698 if ((*xclass_flags & XCL_MAP) == 0)
699 {
700 /* No bits are set for characters < 256. */
701 if (list[1] == 0) return TRUE;
702 /* Might be an empty repeat. */
703 continue;
704 }
705 set2 = (uint8_t *)(xclass_flags + 1);
706 break;
707 #endif
708
709 case OP_NOT_DIGIT:
710 invert_bits = TRUE;
711 /* Fall through */
712 case OP_DIGIT:
713 set2 = (uint8_t *)(cb->cbits + cbit_digit);
714 break;
715
716 case OP_NOT_WHITESPACE:
717 invert_bits = TRUE;
718 /* Fall through */
719 case OP_WHITESPACE:
720 set2 = (uint8_t *)(cb->cbits + cbit_space);
721 break;
722
723 case OP_NOT_WORDCHAR:
724 invert_bits = TRUE;
725 /* Fall through */
726 case OP_WORDCHAR:
727 set2 = (uint8_t *)(cb->cbits + cbit_word);
728 break;
729
730 default:
731 return FALSE;
732 }
733
734 /* Because the bit sets are unaligned bytes, we need to perform byte
735 comparison here. */
736
737 set_end = set1 + 32;
738 if (invert_bits)
739 {
740 do
741 {
742 if ((*set1++ & ~(*set2++)) != 0) return FALSE;
743 }
744 while (set1 < set_end);
745 }
746 else
747 {
748 do
749 {
750 if ((*set1++ & *set2++) != 0) return FALSE;
751 }
752 while (set1 < set_end);
753 }
754
755 if (list[1] == 0) return TRUE;
756 /* Might be an empty repeat. */
757 continue;
758 }
759
760 /* Some property combinations also acceptable. Unicode property opcodes are
761 processed specially; the rest can be handled with a lookup table. */
762
763 else
764 {
765 uint32_t leftop, rightop;
766
767 leftop = base_list[0];
768 rightop = list[0];
769
770 #ifdef SUPPORT_UNICODE
771 accepted = FALSE; /* Always set in non-unicode case. */
772 if (leftop == OP_PROP || leftop == OP_NOTPROP)
773 {
774 if (rightop == OP_EOD)
775 accepted = TRUE;
776 else if (rightop == OP_PROP || rightop == OP_NOTPROP)
777 {
778 int n;
779 const uint8_t *p;
780 BOOL same = leftop == rightop;
781 BOOL lisprop = leftop == OP_PROP;
782 BOOL risprop = rightop == OP_PROP;
783 BOOL bothprop = lisprop && risprop;
784
785 /* There's a table that specifies how each combination is to be
786 processed:
787 0 Always return FALSE (never auto-possessify)
788 1 Character groups are distinct (possessify if both are OP_PROP)
789 2 Check character categories in the same group (general or particular)
790 3 Return TRUE if the two opcodes are not the same
791 ... see comments below
792 */
793
794 n = propposstab[base_list[2]][list[2]];
795 switch(n)
796 {
797 case 0: break;
798 case 1: accepted = bothprop; break;
799 case 2: accepted = (base_list[3] == list[3]) != same; break;
800 case 3: accepted = !same; break;
801
802 case 4: /* Left general category, right particular category */
803 accepted = risprop && catposstab[base_list[3]][list[3]] == same;
804 break;
805
806 case 5: /* Right general category, left particular category */
807 accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
808 break;
809
810 /* This code is logically tricky. Think hard before fiddling with it.
811 The posspropstab table has four entries per row. Each row relates to
812 one of PCRE's special properties such as ALNUM or SPACE or WORD.
813 Only WORD actually needs all four entries, but using repeats for the
814 others means they can all use the same code below.
815
816 The first two entries in each row are Unicode general categories, and
817 apply always, because all the characters they include are part of the
818 PCRE character set. The third and fourth entries are a general and a
819 particular category, respectively, that include one or more relevant
820 characters. One or the other is used, depending on whether the check
821 is for a general or a particular category. However, in both cases the
822 category contains more characters than the specials that are defined
823 for the property being tested against. Therefore, it cannot be used
824 in a NOTPROP case.
825
826 Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
827 Underscore is covered by ucp_P or ucp_Po. */
828
829 case 6: /* Left alphanum vs right general category */
830 case 7: /* Left space vs right general category */
831 case 8: /* Left word vs right general category */
832 p = posspropstab[n-6];
833 accepted = risprop && lisprop ==
834 (list[3] != p[0] &&
835 list[3] != p[1] &&
836 (list[3] != p[2] || !lisprop));
837 break;
838
839 case 9: /* Right alphanum vs left general category */
840 case 10: /* Right space vs left general category */
841 case 11: /* Right word vs left general category */
842 p = posspropstab[n-9];
843 accepted = lisprop && risprop ==
844 (base_list[3] != p[0] &&
845 base_list[3] != p[1] &&
846 (base_list[3] != p[2] || !risprop));
847 break;
848
849 case 12: /* Left alphanum vs right particular category */
850 case 13: /* Left space vs right particular category */
851 case 14: /* Left word vs right particular category */
852 p = posspropstab[n-12];
853 accepted = risprop && lisprop ==
854 (catposstab[p[0]][list[3]] &&
855 catposstab[p[1]][list[3]] &&
856 (list[3] != p[3] || !lisprop));
857 break;
858
859 case 15: /* Right alphanum vs left particular category */
860 case 16: /* Right space vs left particular category */
861 case 17: /* Right word vs left particular category */
862 p = posspropstab[n-15];
863 accepted = lisprop && risprop ==
864 (catposstab[p[0]][base_list[3]] &&
865 catposstab[p[1]][base_list[3]] &&
866 (base_list[3] != p[3] || !risprop));
867 break;
868 }
869 }
870 }
871
872 else
873 #endif /* SUPPORT_UNICODE */
874
875 accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
876 rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
877 autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
878
879 if (!accepted) return FALSE;
880
881 if (list[1] == 0) return TRUE;
882 /* Might be an empty repeat. */
883 continue;
884 }
885
886 /* Control reaches here only if one of the items is a small character list.
887 All characters are checked against the other side. */
888
889 do
890 {
891 chr = *chr_ptr;
892
893 switch(list_ptr[0])
894 {
895 case OP_CHAR:
896 ochr_ptr = list_ptr + 2;
897 do
898 {
899 if (chr == *ochr_ptr) return FALSE;
900 ochr_ptr++;
901 }
902 while(*ochr_ptr != NOTACHAR);
903 break;
904
905 case OP_NOT:
906 ochr_ptr = list_ptr + 2;
907 do
908 {
909 if (chr == *ochr_ptr)
910 break;
911 ochr_ptr++;
912 }
913 while(*ochr_ptr != NOTACHAR);
914 if (*ochr_ptr == NOTACHAR) return FALSE; /* Not found */
915 break;
916
917 /* Note that OP_DIGIT etc. are generated only when PCRE2_UCP is *not*
918 set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
919
920 case OP_DIGIT:
921 if (chr < 256 && (cb->ctypes[chr] & ctype_digit) != 0) return FALSE;
922 break;
923
924 case OP_NOT_DIGIT:
925 if (chr > 255 || (cb->ctypes[chr] & ctype_digit) == 0) return FALSE;
926 break;
927
928 case OP_WHITESPACE:
929 if (chr < 256 && (cb->ctypes[chr] & ctype_space) != 0) return FALSE;
930 break;
931
932 case OP_NOT_WHITESPACE:
933 if (chr > 255 || (cb->ctypes[chr] & ctype_space) == 0) return FALSE;
934 break;
935
936 case OP_WORDCHAR:
937 if (chr < 255 && (cb->ctypes[chr] & ctype_word) != 0) return FALSE;
938 break;
939
940 case OP_NOT_WORDCHAR:
941 if (chr > 255 || (cb->ctypes[chr] & ctype_word) == 0) return FALSE;
942 break;
943
944 case OP_HSPACE:
945 switch(chr)
946 {
947 HSPACE_CASES: return FALSE;
948 default: break;
949 }
950 break;
951
952 case OP_NOT_HSPACE:
953 switch(chr)
954 {
955 HSPACE_CASES: break;
956 default: return FALSE;
957 }
958 break;
959
960 case OP_ANYNL:
961 case OP_VSPACE:
962 switch(chr)
963 {
964 VSPACE_CASES: return FALSE;
965 default: break;
966 }
967 break;
968
969 case OP_NOT_VSPACE:
970 switch(chr)
971 {
972 VSPACE_CASES: break;
973 default: return FALSE;
974 }
975 break;
976
977 case OP_DOLL:
978 case OP_EODN:
979 switch (chr)
980 {
981 case CHAR_CR:
982 case CHAR_LF:
983 case CHAR_VT:
984 case CHAR_FF:
985 case CHAR_NEL:
986 #ifndef EBCDIC
987 case 0x2028:
988 case 0x2029:
989 #endif /* Not EBCDIC */
990 return FALSE;
991 }
992 break;
993
994 case OP_EOD: /* Can always possessify before \z */
995 break;
996
997 #ifdef SUPPORT_UNICODE
998 case OP_PROP:
999 case OP_NOTPROP:
1000 if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
1001 list_ptr[0] == OP_NOTPROP))
1002 return FALSE;
1003 break;
1004 #endif
1005
1006 case OP_NCLASS:
1007 if (chr > 255) return FALSE;
1008 /* Fall through */
1009
1010 case OP_CLASS:
1011 if (chr > 255) break;
1012 class_bitset = (uint8_t *)
1013 ((list_ptr == list ? code : base_end) - list_ptr[2]);
1014 if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
1015 break;
1016
1017 #ifdef SUPPORT_WIDE_CHARS
1018 case OP_XCLASS:
1019 if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
1020 list_ptr[2] + LINK_SIZE, utf)) return FALSE;
1021 break;
1022 #endif
1023
1024 default:
1025 return FALSE;
1026 }
1027
1028 chr_ptr++;
1029 }
1030 while(*chr_ptr != NOTACHAR);
1031
1032 /* At least one character must be matched from this opcode. */
1033
1034 if (list[1] == 0) return TRUE;
1035 }
1036
1037 /* Control never reaches here. There used to be a fail-save return FALSE; here,
1038 but some compilers complain about an unreachable statement. */
1039 }
1040
1041
1042
1043 /*************************************************
1044 * Scan compiled regex for auto-possession *
1045 *************************************************/
1046
1047 /* Replaces single character iterations with their possessive alternatives
1048 if appropriate. This function modifies the compiled opcode! Hitting a
1049 non-existant opcode may indicate a bug in PCRE2, but it can also be caused if a
1050 bad UTF string was compiled with PCRE2_NO_UTF_CHECK.
1051
1052 Arguments:
1053 code points to start of the byte code
1054 utf TRUE in UTF mode
1055 cb compile data block
1056
1057 Returns: 0 for success
1058 -1 if a non-existant opcode is encountered
1059 */
1060
1061 int
PRIV(auto_possessify)1062 PRIV(auto_possessify)(PCRE2_UCHAR *code, BOOL utf, const compile_block *cb)
1063 {
1064 register PCRE2_UCHAR c;
1065 PCRE2_SPTR end;
1066 PCRE2_UCHAR *repeat_opcode;
1067 uint32_t list[8];
1068 int rec_limit;
1069
1070 for (;;)
1071 {
1072 c = *code;
1073
1074 if (c > OP_TABLE_LENGTH) return -1; /* Something gone wrong */
1075
1076 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
1077 {
1078 c -= get_repeat_base(c) - OP_STAR;
1079 end = (c <= OP_MINUPTO) ?
1080 get_chr_property_list(code, utf, cb->fcc, list) : NULL;
1081 list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
1082
1083 rec_limit = 1000;
1084 if (end != NULL && compare_opcodes(end, utf, cb, list, end, &rec_limit))
1085 {
1086 switch(c)
1087 {
1088 case OP_STAR:
1089 *code += OP_POSSTAR - OP_STAR;
1090 break;
1091
1092 case OP_MINSTAR:
1093 *code += OP_POSSTAR - OP_MINSTAR;
1094 break;
1095
1096 case OP_PLUS:
1097 *code += OP_POSPLUS - OP_PLUS;
1098 break;
1099
1100 case OP_MINPLUS:
1101 *code += OP_POSPLUS - OP_MINPLUS;
1102 break;
1103
1104 case OP_QUERY:
1105 *code += OP_POSQUERY - OP_QUERY;
1106 break;
1107
1108 case OP_MINQUERY:
1109 *code += OP_POSQUERY - OP_MINQUERY;
1110 break;
1111
1112 case OP_UPTO:
1113 *code += OP_POSUPTO - OP_UPTO;
1114 break;
1115
1116 case OP_MINUPTO:
1117 *code += OP_POSUPTO - OP_MINUPTO;
1118 break;
1119 }
1120 }
1121 c = *code;
1122 }
1123 else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
1124 {
1125 #ifdef SUPPORT_WIDE_CHARS
1126 if (c == OP_XCLASS)
1127 repeat_opcode = code + GET(code, 1);
1128 else
1129 #endif
1130 repeat_opcode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
1131
1132 c = *repeat_opcode;
1133 if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
1134 {
1135 /* end must not be NULL. */
1136 end = get_chr_property_list(code, utf, cb->fcc, list);
1137
1138 list[1] = (c & 1) == 0;
1139
1140 rec_limit = 1000;
1141 if (compare_opcodes(end, utf, cb, list, end, &rec_limit))
1142 {
1143 switch (c)
1144 {
1145 case OP_CRSTAR:
1146 case OP_CRMINSTAR:
1147 *repeat_opcode = OP_CRPOSSTAR;
1148 break;
1149
1150 case OP_CRPLUS:
1151 case OP_CRMINPLUS:
1152 *repeat_opcode = OP_CRPOSPLUS;
1153 break;
1154
1155 case OP_CRQUERY:
1156 case OP_CRMINQUERY:
1157 *repeat_opcode = OP_CRPOSQUERY;
1158 break;
1159
1160 case OP_CRRANGE:
1161 case OP_CRMINRANGE:
1162 *repeat_opcode = OP_CRPOSRANGE;
1163 break;
1164 }
1165 }
1166 }
1167 c = *code;
1168 }
1169
1170 switch(c)
1171 {
1172 case OP_END:
1173 return 0;
1174
1175 case OP_TYPESTAR:
1176 case OP_TYPEMINSTAR:
1177 case OP_TYPEPLUS:
1178 case OP_TYPEMINPLUS:
1179 case OP_TYPEQUERY:
1180 case OP_TYPEMINQUERY:
1181 case OP_TYPEPOSSTAR:
1182 case OP_TYPEPOSPLUS:
1183 case OP_TYPEPOSQUERY:
1184 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1185 break;
1186
1187 case OP_TYPEUPTO:
1188 case OP_TYPEMINUPTO:
1189 case OP_TYPEEXACT:
1190 case OP_TYPEPOSUPTO:
1191 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
1192 code += 2;
1193 break;
1194
1195 case OP_CALLOUT_STR:
1196 code += GET(code, 1 + 2*LINK_SIZE);
1197 break;
1198
1199 #ifdef SUPPORT_WIDE_CHARS
1200 case OP_XCLASS:
1201 code += GET(code, 1);
1202 break;
1203 #endif
1204
1205 case OP_MARK:
1206 case OP_PRUNE_ARG:
1207 case OP_SKIP_ARG:
1208 case OP_THEN_ARG:
1209 code += code[1];
1210 break;
1211 }
1212
1213 /* Add in the fixed length from the table */
1214
1215 code += PRIV(OP_lengths)[c];
1216
1217 /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be
1218 followed by a multi-byte character. The length in the table is a minimum, so
1219 we have to arrange to skip the extra code units. */
1220
1221 #ifdef MAYBE_UTF_MULTI
1222 if (utf) switch(c)
1223 {
1224 case OP_CHAR:
1225 case OP_CHARI:
1226 case OP_NOT:
1227 case OP_NOTI:
1228 case OP_STAR:
1229 case OP_MINSTAR:
1230 case OP_PLUS:
1231 case OP_MINPLUS:
1232 case OP_QUERY:
1233 case OP_MINQUERY:
1234 case OP_UPTO:
1235 case OP_MINUPTO:
1236 case OP_EXACT:
1237 case OP_POSSTAR:
1238 case OP_POSPLUS:
1239 case OP_POSQUERY:
1240 case OP_POSUPTO:
1241 case OP_STARI:
1242 case OP_MINSTARI:
1243 case OP_PLUSI:
1244 case OP_MINPLUSI:
1245 case OP_QUERYI:
1246 case OP_MINQUERYI:
1247 case OP_UPTOI:
1248 case OP_MINUPTOI:
1249 case OP_EXACTI:
1250 case OP_POSSTARI:
1251 case OP_POSPLUSI:
1252 case OP_POSQUERYI:
1253 case OP_POSUPTOI:
1254 case OP_NOTSTAR:
1255 case OP_NOTMINSTAR:
1256 case OP_NOTPLUS:
1257 case OP_NOTMINPLUS:
1258 case OP_NOTQUERY:
1259 case OP_NOTMINQUERY:
1260 case OP_NOTUPTO:
1261 case OP_NOTMINUPTO:
1262 case OP_NOTEXACT:
1263 case OP_NOTPOSSTAR:
1264 case OP_NOTPOSPLUS:
1265 case OP_NOTPOSQUERY:
1266 case OP_NOTPOSUPTO:
1267 case OP_NOTSTARI:
1268 case OP_NOTMINSTARI:
1269 case OP_NOTPLUSI:
1270 case OP_NOTMINPLUSI:
1271 case OP_NOTQUERYI:
1272 case OP_NOTMINQUERYI:
1273 case OP_NOTUPTOI:
1274 case OP_NOTMINUPTOI:
1275 case OP_NOTEXACTI:
1276 case OP_NOTPOSSTARI:
1277 case OP_NOTPOSPLUSI:
1278 case OP_NOTPOSQUERYI:
1279 case OP_NOTPOSUPTOI:
1280 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
1281 break;
1282 }
1283 #else
1284 (void)(utf); /* Keep compiler happy by referencing function argument */
1285 #endif /* SUPPORT_WIDE_CHARS */
1286 }
1287 }
1288
1289 /* End of pcre2_auto_possess.c */
1290