1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41 /* This module contains functions for scanning a compiled pattern and
42 collecting data (e.g. minimum matching length). */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49
50 #include "pcre2_internal.h"
51
52
53 /* Set a bit in the starting code unit bit map. */
54
55 #define SET_BIT(c) re->start_bitmap[(c)/8] |= (1 << ((c)&7))
56
57 /* Returns from set_start_bits() */
58
59 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
60
61
62 /*************************************************
63 * Find the minimum subject length for a group *
64 *************************************************/
65
66 /* Scan a parenthesized group and compute the minimum length of subject that
67 is needed to match it. This is a lower bound; it does not mean there is a
68 string of that length that matches. In UTF mode, the result is in characters
69 rather than code units. The field in a compiled pattern for storing the minimum
70 length is 16-bits long (on the grounds that anything longer than that is
71 pathological), so we give up when we reach that amount. This also means that
72 integer overflow for really crazy patterns cannot happen.
73
74 Arguments:
75 re compiled pattern block
76 code pointer to start of group (the bracket)
77 startcode pointer to start of the whole pattern's code
78 utf UTF flag
79 recurses chain of recurse_check to catch mutual recursion
80 countptr pointer to call count (to catch over complexity)
81
82 Returns: the minimum length
83 -1 \C in UTF-8 mode
84 or (*ACCEPT)
85 or pattern too complicated
86 or back reference to duplicate name/number
87 -2 internal error (missing capturing bracket)
88 -3 internal error (opcode not listed)
89 */
90
91 static int
find_minlength(const pcre2_real_code * re,PCRE2_SPTR code,PCRE2_SPTR startcode,BOOL utf,recurse_check * recurses,int * countptr)92 find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
93 PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr)
94 {
95 int length = -1;
96 int prev_cap_recno = -1;
97 int prev_cap_d = 0;
98 int prev_recurse_recno = -1;
99 int prev_recurse_d = 0;
100 uint32_t once_fudge = 0;
101 BOOL had_recurse = FALSE;
102 BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
103 recurse_check this_recurse;
104 register int branchlength = 0;
105 register PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
106
107 /* If this is a "could be empty" group, its minimum length is 0. */
108
109 if (*code >= OP_SBRA && *code <= OP_SCOND) return 0;
110
111 /* Skip over capturing bracket number */
112
113 if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE;
114
115 /* A large and/or complex regex can take too long to process. */
116
117 if ((*countptr)++ > 1000) return -1;
118
119 /* Scan along the opcodes for this branch. If we get to the end of the branch,
120 check the length against that of the other branches. If the accumulated length
121 passes 16-bits, stop. */
122
123 for (;;)
124 {
125 int d, min, recno;
126 PCRE2_UCHAR *cs, *ce;
127 register PCRE2_UCHAR op = *cc;
128
129 if (branchlength >= UINT16_MAX) return UINT16_MAX;
130
131 switch (op)
132 {
133 case OP_COND:
134 case OP_SCOND:
135
136 /* If there is only one branch in a condition, the implied branch has zero
137 length, so we don't add anything. This covers the DEFINE "condition"
138 automatically. If there are two branches we can treat it the same as any
139 other non-capturing subpattern. */
140
141 cs = cc + GET(cc, 1);
142 if (*cs != OP_ALT)
143 {
144 cc = cs + 1 + LINK_SIZE;
145 break;
146 }
147 goto PROCESS_NON_CAPTURE;
148
149 /* There's a special case of OP_ONCE, when it is wrapped round an
150 OP_RECURSE. We'd like to process the latter at this level so that
151 remembering the value works for repeated cases. So we do nothing, but
152 set a fudge value to skip over the OP_KET after the recurse. */
153
154 case OP_ONCE:
155 if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
156 {
157 once_fudge = 1 + LINK_SIZE;
158 cc += 1 + LINK_SIZE;
159 break;
160 }
161 /* Fall through */
162
163 case OP_ONCE_NC:
164 case OP_BRA:
165 case OP_SBRA:
166 case OP_BRAPOS:
167 case OP_SBRAPOS:
168 PROCESS_NON_CAPTURE:
169 d = find_minlength(re, cc, startcode, utf, recurses, countptr);
170 if (d < 0) return d;
171 branchlength += d;
172 do cc += GET(cc, 1); while (*cc == OP_ALT);
173 cc += 1 + LINK_SIZE;
174 break;
175
176 /* To save time for repeated capturing subpatterns, we remember the
177 length of the previous one. Unfortunately we can't do the same for
178 the unnumbered ones above. Nor can we do this if (?| is present in the
179 pattern because captures with the same number are not then identical. */
180
181 case OP_CBRA:
182 case OP_SCBRA:
183 case OP_CBRAPOS:
184 case OP_SCBRAPOS:
185 recno = dupcapused? prev_cap_recno - 1 : (int)GET2(cc, 1+LINK_SIZE);
186 if (recno != prev_cap_recno)
187 {
188 prev_cap_recno = recno;
189 prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr);
190 if (prev_cap_d < 0) return prev_cap_d;
191 }
192 branchlength += prev_cap_d;
193 do cc += GET(cc, 1); while (*cc == OP_ALT);
194 cc += 1 + LINK_SIZE;
195 break;
196
197 /* ACCEPT makes things far too complicated; we have to give up. */
198
199 case OP_ACCEPT:
200 case OP_ASSERT_ACCEPT:
201 return -1;
202
203 /* Reached end of a branch; if it's a ket it is the end of a nested
204 call. If it's ALT it is an alternation in a nested call. If it is END it's
205 the end of the outer call. All can be handled by the same code. If an
206 ACCEPT was previously encountered, use the length that was in force at that
207 time, and pass back the shortest ACCEPT length. */
208
209 case OP_ALT:
210 case OP_KET:
211 case OP_KETRMAX:
212 case OP_KETRMIN:
213 case OP_KETRPOS:
214 case OP_END:
215 if (length < 0 || (!had_recurse && branchlength < length))
216 length = branchlength;
217 if (op != OP_ALT) return length;
218 cc += 1 + LINK_SIZE;
219 branchlength = 0;
220 had_recurse = FALSE;
221 break;
222
223 /* Skip over assertive subpatterns */
224
225 case OP_ASSERT:
226 case OP_ASSERT_NOT:
227 case OP_ASSERTBACK:
228 case OP_ASSERTBACK_NOT:
229 do cc += GET(cc, 1); while (*cc == OP_ALT);
230 /* Fall through */
231
232 /* Skip over things that don't match chars */
233
234 case OP_REVERSE:
235 case OP_CREF:
236 case OP_DNCREF:
237 case OP_RREF:
238 case OP_DNRREF:
239 case OP_FALSE:
240 case OP_TRUE:
241 case OP_CALLOUT:
242 case OP_SOD:
243 case OP_SOM:
244 case OP_EOD:
245 case OP_EODN:
246 case OP_CIRC:
247 case OP_CIRCM:
248 case OP_DOLL:
249 case OP_DOLLM:
250 case OP_NOT_WORD_BOUNDARY:
251 case OP_WORD_BOUNDARY:
252 cc += PRIV(OP_lengths)[*cc];
253 break;
254
255 case OP_CALLOUT_STR:
256 cc += GET(cc, 1 + 2*LINK_SIZE);
257 break;
258
259 /* Skip over a subpattern that has a {0} or {0,x} quantifier */
260
261 case OP_BRAZERO:
262 case OP_BRAMINZERO:
263 case OP_BRAPOSZERO:
264 case OP_SKIPZERO:
265 cc += PRIV(OP_lengths)[*cc];
266 do cc += GET(cc, 1); while (*cc == OP_ALT);
267 cc += 1 + LINK_SIZE;
268 break;
269
270 /* Handle literal characters and + repetitions */
271
272 case OP_CHAR:
273 case OP_CHARI:
274 case OP_NOT:
275 case OP_NOTI:
276 case OP_PLUS:
277 case OP_PLUSI:
278 case OP_MINPLUS:
279 case OP_MINPLUSI:
280 case OP_POSPLUS:
281 case OP_POSPLUSI:
282 case OP_NOTPLUS:
283 case OP_NOTPLUSI:
284 case OP_NOTMINPLUS:
285 case OP_NOTMINPLUSI:
286 case OP_NOTPOSPLUS:
287 case OP_NOTPOSPLUSI:
288 branchlength++;
289 cc += 2;
290 #ifdef SUPPORT_UNICODE
291 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
292 #endif
293 break;
294
295 case OP_TYPEPLUS:
296 case OP_TYPEMINPLUS:
297 case OP_TYPEPOSPLUS:
298 branchlength++;
299 cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
300 break;
301
302 /* Handle exact repetitions. The count is already in characters, but we
303 may need to skip over a multibyte character in UTF mode. */
304
305 case OP_EXACT:
306 case OP_EXACTI:
307 case OP_NOTEXACT:
308 case OP_NOTEXACTI:
309 branchlength += GET2(cc,1);
310 cc += 2 + IMM2_SIZE;
311 #ifdef SUPPORT_UNICODE
312 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
313 #endif
314 break;
315
316 case OP_TYPEEXACT:
317 branchlength += GET2(cc,1);
318 cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
319 || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
320 break;
321
322 /* Handle single-char non-literal matchers */
323
324 case OP_PROP:
325 case OP_NOTPROP:
326 cc += 2;
327 /* Fall through */
328
329 case OP_NOT_DIGIT:
330 case OP_DIGIT:
331 case OP_NOT_WHITESPACE:
332 case OP_WHITESPACE:
333 case OP_NOT_WORDCHAR:
334 case OP_WORDCHAR:
335 case OP_ANY:
336 case OP_ALLANY:
337 case OP_EXTUNI:
338 case OP_HSPACE:
339 case OP_NOT_HSPACE:
340 case OP_VSPACE:
341 case OP_NOT_VSPACE:
342 branchlength++;
343 cc++;
344 break;
345
346 /* "Any newline" might match two characters, but it also might match just
347 one. */
348
349 case OP_ANYNL:
350 branchlength += 1;
351 cc++;
352 break;
353
354 /* The single-byte matcher means we can't proceed in UTF mode. (In
355 non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever
356 appear, but leave the code, just in case.) */
357
358 case OP_ANYBYTE:
359 #ifdef SUPPORT_UNICODE
360 if (utf) return -1;
361 #endif
362 branchlength++;
363 cc++;
364 break;
365
366 /* For repeated character types, we have to test for \p and \P, which have
367 an extra two bytes of parameters. */
368
369 case OP_TYPESTAR:
370 case OP_TYPEMINSTAR:
371 case OP_TYPEQUERY:
372 case OP_TYPEMINQUERY:
373 case OP_TYPEPOSSTAR:
374 case OP_TYPEPOSQUERY:
375 if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
376 cc += PRIV(OP_lengths)[op];
377 break;
378
379 case OP_TYPEUPTO:
380 case OP_TYPEMINUPTO:
381 case OP_TYPEPOSUPTO:
382 if (cc[1 + IMM2_SIZE] == OP_PROP
383 || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
384 cc += PRIV(OP_lengths)[op];
385 break;
386
387 /* Check a class for variable quantification */
388
389 case OP_CLASS:
390 case OP_NCLASS:
391 #ifdef SUPPORT_WIDE_CHARS
392 case OP_XCLASS:
393 /* The original code caused an unsigned overflow in 64 bit systems,
394 so now we use a conditional statement. */
395 if (op == OP_XCLASS)
396 cc += GET(cc, 1);
397 else
398 cc += PRIV(OP_lengths)[OP_CLASS];
399 #else
400 cc += PRIV(OP_lengths)[OP_CLASS];
401 #endif
402
403 switch (*cc)
404 {
405 case OP_CRPLUS:
406 case OP_CRMINPLUS:
407 case OP_CRPOSPLUS:
408 branchlength++;
409 /* Fall through */
410
411 case OP_CRSTAR:
412 case OP_CRMINSTAR:
413 case OP_CRQUERY:
414 case OP_CRMINQUERY:
415 case OP_CRPOSSTAR:
416 case OP_CRPOSQUERY:
417 cc++;
418 break;
419
420 case OP_CRRANGE:
421 case OP_CRMINRANGE:
422 case OP_CRPOSRANGE:
423 branchlength += GET2(cc,1);
424 cc += 1 + 2 * IMM2_SIZE;
425 break;
426
427 default:
428 branchlength++;
429 break;
430 }
431 break;
432
433 /* Backreferences and subroutine calls (OP_RECURSE) are treated in the same
434 way: we find the minimum length for the subpattern. A recursion
435 (backreference or subroutine) causes an a flag to be set that causes the
436 length of this branch to be ignored. The logic is that a recursion can only
437 make sense if there is another alternative that stops the recursing. That
438 will provide the minimum length (when no recursion happens).
439
440 If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
441 matches an empty string (by default it causes a matching failure), so in
442 that case we must set the minimum length to zero. */
443
444 /* Duplicate named pattern back reference. We cannot reliably find a length
445 for this if duplicate numbers are present in the pattern. */
446
447 case OP_DNREF:
448 case OP_DNREFI:
449 if (dupcapused) return -1;
450 if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
451 {
452 int count = GET2(cc, 1+IMM2_SIZE);
453 PCRE2_UCHAR *slot =
454 (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
455 GET2(cc, 1) * re->name_entry_size;
456
457 d = INT_MAX;
458
459 /* Scan all groups with the same name */
460
461 while (count-- > 0)
462 {
463 ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
464 if (cs == NULL) return -2;
465 do ce += GET(ce, 1); while (*ce == OP_ALT);
466 if (cc > cs && cc < ce) /* Simple recursion */
467 {
468 d = 0;
469 had_recurse = TRUE;
470 break;
471 }
472 else
473 {
474 recurse_check *r = recurses;
475 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
476 if (r != NULL) /* Mutual recursion */
477 {
478 d = 0;
479 had_recurse = TRUE;
480 break;
481 }
482 else
483 {
484 int dd;
485 this_recurse.prev = recurses;
486 this_recurse.group = cs;
487 dd = find_minlength(re, cs, startcode, utf, &this_recurse, countptr);
488 if (dd < d) d = dd;
489 }
490 }
491 slot += re->name_entry_size;
492 }
493 }
494 else d = 0;
495 cc += 1 + 2*IMM2_SIZE;
496 goto REPEAT_BACK_REFERENCE;
497
498 /* Single back reference. We cannot find a length for this if duplicate
499 numbers are present in the pattern. */
500
501 case OP_REF:
502 case OP_REFI:
503 if (dupcapused) return -1;
504 if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
505 {
506 ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
507 if (cs == NULL) return -2;
508 do ce += GET(ce, 1); while (*ce == OP_ALT);
509 if (cc > cs && cc < ce) /* Simple recursion */
510 {
511 d = 0;
512 had_recurse = TRUE;
513 }
514 else
515 {
516 recurse_check *r = recurses;
517 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
518 if (r != NULL) /* Mutual recursion */
519 {
520 d = 0;
521 had_recurse = TRUE;
522 }
523 else
524 {
525 this_recurse.prev = recurses;
526 this_recurse.group = cs;
527 d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr);
528 }
529 }
530 }
531 else d = 0;
532 cc += 1 + IMM2_SIZE;
533
534 /* Handle repeated back references */
535
536 REPEAT_BACK_REFERENCE:
537 switch (*cc)
538 {
539 case OP_CRSTAR:
540 case OP_CRMINSTAR:
541 case OP_CRQUERY:
542 case OP_CRMINQUERY:
543 case OP_CRPOSSTAR:
544 case OP_CRPOSQUERY:
545 min = 0;
546 cc++;
547 break;
548
549 case OP_CRPLUS:
550 case OP_CRMINPLUS:
551 case OP_CRPOSPLUS:
552 min = 1;
553 cc++;
554 break;
555
556 case OP_CRRANGE:
557 case OP_CRMINRANGE:
558 case OP_CRPOSRANGE:
559 min = GET2(cc, 1);
560 cc += 1 + 2 * IMM2_SIZE;
561 break;
562
563 default:
564 min = 1;
565 break;
566 }
567
568 /* Take care not to overflow: (1) min and d are ints, so check that their
569 product is not greater than INT_MAX. (2) branchlength is limited to
570 UINT16_MAX (checked at the top of the loop). */
571
572 if ((d > 0 && (INT_MAX/d) < min) || UINT16_MAX - branchlength < min*d)
573 branchlength = UINT16_MAX;
574 else branchlength += min * d;
575 break;
576
577 /* Recursion always refers to the first occurrence of a subpattern with a
578 given number. Therefore, we can always make use of caching, even when the
579 pattern contains multiple subpatterns with the same number. */
580
581 case OP_RECURSE:
582 cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
583 recno = GET2(cs, 1+LINK_SIZE);
584 if (recno == prev_recurse_recno)
585 {
586 branchlength += prev_recurse_d;
587 }
588 else
589 {
590 do ce += GET(ce, 1); while (*ce == OP_ALT);
591 if (cc > cs && cc < ce) /* Simple recursion */
592 had_recurse = TRUE;
593 else
594 {
595 recurse_check *r = recurses;
596 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
597 if (r != NULL) /* Mutual recursion */
598 had_recurse = TRUE;
599 else
600 {
601 this_recurse.prev = recurses;
602 this_recurse.group = cs;
603 prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
604 countptr);
605 if (prev_recurse_d < 0) return prev_recurse_d;
606 prev_recurse_recno = recno;
607 branchlength += prev_recurse_d;
608 }
609 }
610 }
611 cc += 1 + LINK_SIZE + once_fudge;
612 once_fudge = 0;
613 break;
614
615 /* Anything else does not or need not match a character. We can get the
616 item's length from the table, but for those that can match zero occurrences
617 of a character, we must take special action for UTF-8 characters. As it
618 happens, the "NOT" versions of these opcodes are used at present only for
619 ASCII characters, so they could be omitted from this list. However, in
620 future that may change, so we include them here so as not to leave a
621 gotcha for a future maintainer. */
622
623 case OP_UPTO:
624 case OP_UPTOI:
625 case OP_NOTUPTO:
626 case OP_NOTUPTOI:
627 case OP_MINUPTO:
628 case OP_MINUPTOI:
629 case OP_NOTMINUPTO:
630 case OP_NOTMINUPTOI:
631 case OP_POSUPTO:
632 case OP_POSUPTOI:
633 case OP_NOTPOSUPTO:
634 case OP_NOTPOSUPTOI:
635
636 case OP_STAR:
637 case OP_STARI:
638 case OP_NOTSTAR:
639 case OP_NOTSTARI:
640 case OP_MINSTAR:
641 case OP_MINSTARI:
642 case OP_NOTMINSTAR:
643 case OP_NOTMINSTARI:
644 case OP_POSSTAR:
645 case OP_POSSTARI:
646 case OP_NOTPOSSTAR:
647 case OP_NOTPOSSTARI:
648
649 case OP_QUERY:
650 case OP_QUERYI:
651 case OP_NOTQUERY:
652 case OP_NOTQUERYI:
653 case OP_MINQUERY:
654 case OP_MINQUERYI:
655 case OP_NOTMINQUERY:
656 case OP_NOTMINQUERYI:
657 case OP_POSQUERY:
658 case OP_POSQUERYI:
659 case OP_NOTPOSQUERY:
660 case OP_NOTPOSQUERYI:
661
662 cc += PRIV(OP_lengths)[op];
663 #ifdef SUPPORT_UNICODE
664 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
665 #endif
666 break;
667
668 /* Skip these, but we need to add in the name length. */
669
670 case OP_MARK:
671 case OP_PRUNE_ARG:
672 case OP_SKIP_ARG:
673 case OP_THEN_ARG:
674 cc += PRIV(OP_lengths)[op] + cc[1];
675 break;
676
677 /* The remaining opcodes are just skipped over. */
678
679 case OP_CLOSE:
680 case OP_COMMIT:
681 case OP_FAIL:
682 case OP_PRUNE:
683 case OP_SET_SOM:
684 case OP_SKIP:
685 case OP_THEN:
686 cc += PRIV(OP_lengths)[op];
687 break;
688
689 /* This should not occur: we list all opcodes explicitly so that when
690 new ones get added they are properly considered. */
691
692 default:
693 return -3;
694 }
695 }
696 /* Control never gets here */
697 }
698
699
700
701 /*************************************************
702 * Set a bit and maybe its alternate case *
703 *************************************************/
704
705 /* Given a character, set its first code unit's bit in the table, and also the
706 corresponding bit for the other version of a letter if we are caseless.
707
708 Arguments:
709 re points to the regex block
710 p points to the first code unit of the character
711 caseless TRUE if caseless
712 utf TRUE for UTF mode
713
714 Returns: pointer after the character
715 */
716
717 static PCRE2_SPTR
set_table_bit(pcre2_real_code * re,PCRE2_SPTR p,BOOL caseless,BOOL utf)718 set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf)
719 {
720 uint32_t c = *p++; /* First code unit */
721 (void)utf; /* Stop compiler warning when UTF not supported */
722
723 /* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
724 0xff. */
725
726 #if PCRE2_CODE_UNIT_WIDTH != 8
727 if (c > 0xff) SET_BIT(0xff); else
728 #endif
729
730 SET_BIT(c);
731
732 /* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find
733 the end of the character, even when caseless. */
734
735 #ifdef SUPPORT_UNICODE
736 if (utf)
737 {
738 #if PCRE2_CODE_UNIT_WIDTH == 8
739 if (c >= 0xc0) GETUTF8INC(c, p);
740 #elif PCRE2_CODE_UNIT_WIDTH == 16
741 if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p);
742 #endif
743 }
744 #endif /* SUPPORT_UNICODE */
745
746 /* If caseless, handle the other case of the character. */
747
748 if (caseless)
749 {
750 if (utf)
751 {
752 #if PCRE2_CODE_UNIT_WIDTH == 8
753 PCRE2_UCHAR buff[6];
754 c = UCD_OTHERCASE(c);
755 (void)PRIV(ord2utf)(c, buff);
756 SET_BIT(buff[0]);
757 #else /* 16-bit or 32-bit mode */
758 c = UCD_OTHERCASE(c);
759 if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
760 #endif
761 }
762
763 /* Not UTF */
764
765 else if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
766 }
767
768 return p;
769 }
770
771
772
773 /*************************************************
774 * Set bits for a positive character type *
775 *************************************************/
776
777 /* This function sets starting bits for a character type. In UTF-8 mode, we can
778 only do a direct setting for bytes less than 128, as otherwise there can be
779 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
780 environment, the tables will only recognize ASCII characters anyway, but in at
781 least one Windows environment, some higher bytes bits were set in the tables.
782 So we deal with that case by considering the UTF-8 encoding.
783
784 Arguments:
785 re the regex block
786 cbit type the type of character wanted
787 table_limit 32 for non-UTF-8; 16 for UTF-8
788
789 Returns: nothing
790 */
791
792 static void
set_type_bits(pcre2_real_code * re,int cbit_type,unsigned int table_limit)793 set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
794 {
795 register uint32_t c;
796 for (c = 0; c < table_limit; c++)
797 re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type];
798 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
799 if (table_limit == 32) return;
800 for (c = 128; c < 256; c++)
801 {
802 if ((re->tables[cbits_offset + c/8] & (1 << (c&7))) != 0)
803 {
804 PCRE2_UCHAR buff[6];
805 (void)PRIV(ord2utf)(c, buff);
806 SET_BIT(buff[0]);
807 }
808 }
809 #endif /* UTF-8 */
810 }
811
812
813 /*************************************************
814 * Set bits for a negative character type *
815 *************************************************/
816
817 /* This function sets starting bits for a negative character type such as \D.
818 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
819 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
820 Unlike in the positive case, where we can set appropriate starting bits for
821 specific high-valued UTF-8 characters, in this case we have to set the bits for
822 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
823 0xc0 (192) for simplicity.
824
825 Arguments:
826 re the regex block
827 cbit type the type of character wanted
828 table_limit 32 for non-UTF-8; 16 for UTF-8
829
830 Returns: nothing
831 */
832
833 static void
set_nottype_bits(pcre2_real_code * re,int cbit_type,unsigned int table_limit)834 set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
835 {
836 register uint32_t c;
837 for (c = 0; c < table_limit; c++)
838 re->start_bitmap[c] |= ~(re->tables[c+cbits_offset+cbit_type]);
839 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
840 if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
841 #endif
842 }
843
844
845
846 /*************************************************
847 * Create bitmap of starting bytes *
848 *************************************************/
849
850 /* This function scans a compiled unanchored expression recursively and
851 attempts to build a bitmap of the set of possible starting code units whose
852 values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause
853 the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode
854 we pass a value of 16 rather than 32 as the final argument. (See comments in
855 those functions for the reason.)
856
857 The SSB_CONTINUE return is useful for parenthesized groups in patterns such as
858 (a*)b where the group provides some optional starting code units but scanning
859 must continue at the outer level to find at least one mandatory code unit. At
860 the outermost level, this function fails unless the result is SSB_DONE.
861
862 Arguments:
863 re points to the compiled regex block
864 code points to an expression
865 utf TRUE if in UTF mode
866
867 Returns: SSB_FAIL => Failed to find any starting code units
868 SSB_DONE => Found mandatory starting code units
869 SSB_CONTINUE => Found optional starting code units
870 SSB_UNKNOWN => Hit an unrecognized opcode
871 */
872
873 static int
set_start_bits(pcre2_real_code * re,PCRE2_SPTR code,BOOL utf)874 set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf)
875 {
876 register uint32_t c;
877 int yield = SSB_DONE;
878
879 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
880 int table_limit = utf? 16:32;
881 #else
882 int table_limit = 32;
883 #endif
884
885 do
886 {
887 BOOL try_next = TRUE;
888 PCRE2_SPTR tcode = code + 1 + LINK_SIZE;
889
890 if (*code == OP_CBRA || *code == OP_SCBRA ||
891 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
892
893 while (try_next) /* Loop for items in this branch */
894 {
895 int rc;
896 uint8_t *classmap = NULL;
897
898 switch(*tcode)
899 {
900 /* If we reach something we don't understand, it means a new opcode has
901 been created that hasn't been added to this function. Hopefully this
902 problem will be discovered during testing. */
903
904 default:
905 return SSB_UNKNOWN;
906
907 /* Fail for a valid opcode that implies no starting bits. */
908
909 case OP_ACCEPT:
910 case OP_ASSERT_ACCEPT:
911 case OP_ALLANY:
912 case OP_ANY:
913 case OP_ANYBYTE:
914 case OP_CIRC:
915 case OP_CIRCM:
916 case OP_CLOSE:
917 case OP_COMMIT:
918 case OP_COND:
919 case OP_CREF:
920 case OP_FALSE:
921 case OP_TRUE:
922 case OP_DNCREF:
923 case OP_DNREF:
924 case OP_DNREFI:
925 case OP_DNRREF:
926 case OP_DOLL:
927 case OP_DOLLM:
928 case OP_END:
929 case OP_EOD:
930 case OP_EODN:
931 case OP_EXTUNI:
932 case OP_FAIL:
933 case OP_MARK:
934 case OP_NOT:
935 case OP_NOTEXACT:
936 case OP_NOTEXACTI:
937 case OP_NOTI:
938 case OP_NOTMINPLUS:
939 case OP_NOTMINPLUSI:
940 case OP_NOTMINQUERY:
941 case OP_NOTMINQUERYI:
942 case OP_NOTMINSTAR:
943 case OP_NOTMINSTARI:
944 case OP_NOTMINUPTO:
945 case OP_NOTMINUPTOI:
946 case OP_NOTPLUS:
947 case OP_NOTPLUSI:
948 case OP_NOTPOSPLUS:
949 case OP_NOTPOSPLUSI:
950 case OP_NOTPOSQUERY:
951 case OP_NOTPOSQUERYI:
952 case OP_NOTPOSSTAR:
953 case OP_NOTPOSSTARI:
954 case OP_NOTPOSUPTO:
955 case OP_NOTPOSUPTOI:
956 case OP_NOTPROP:
957 case OP_NOTQUERY:
958 case OP_NOTQUERYI:
959 case OP_NOTSTAR:
960 case OP_NOTSTARI:
961 case OP_NOTUPTO:
962 case OP_NOTUPTOI:
963 case OP_NOT_HSPACE:
964 case OP_NOT_VSPACE:
965 case OP_PRUNE:
966 case OP_PRUNE_ARG:
967 case OP_RECURSE:
968 case OP_REF:
969 case OP_REFI:
970 case OP_REVERSE:
971 case OP_RREF:
972 case OP_SCOND:
973 case OP_SET_SOM:
974 case OP_SKIP:
975 case OP_SKIP_ARG:
976 case OP_SOD:
977 case OP_SOM:
978 case OP_THEN:
979 case OP_THEN_ARG:
980 return SSB_FAIL;
981
982 /* A "real" property test implies no starting bits, but the fake property
983 PT_CLIST identifies a list of characters. These lists are short, as they
984 are used for characters with more than one "other case", so there is no
985 point in recognizing them for OP_NOTPROP. */
986
987 case OP_PROP:
988 if (tcode[1] != PT_CLIST) return SSB_FAIL;
989 {
990 const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2];
991 while ((c = *p++) < NOTACHAR)
992 {
993 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
994 if (utf)
995 {
996 PCRE2_UCHAR buff[6];
997 (void)PRIV(ord2utf)(c, buff);
998 c = buff[0];
999 }
1000 #endif
1001 if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
1002 }
1003 }
1004 try_next = FALSE;
1005 break;
1006
1007 /* We can ignore word boundary tests. */
1008
1009 case OP_WORD_BOUNDARY:
1010 case OP_NOT_WORD_BOUNDARY:
1011 tcode++;
1012 break;
1013
1014 /* If we hit a bracket or a positive lookahead assertion, recurse to set
1015 bits from within the subpattern. If it can't find anything, we have to
1016 give up. If it finds some mandatory character(s), we are done for this
1017 branch. Otherwise, carry on scanning after the subpattern. */
1018
1019 case OP_BRA:
1020 case OP_SBRA:
1021 case OP_CBRA:
1022 case OP_SCBRA:
1023 case OP_BRAPOS:
1024 case OP_SBRAPOS:
1025 case OP_CBRAPOS:
1026 case OP_SCBRAPOS:
1027 case OP_ONCE:
1028 case OP_ONCE_NC:
1029 case OP_ASSERT:
1030 rc = set_start_bits(re, tcode, utf);
1031 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
1032 if (rc == SSB_DONE) try_next = FALSE; else
1033 {
1034 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1035 tcode += 1 + LINK_SIZE;
1036 }
1037 break;
1038
1039 /* If we hit ALT or KET, it means we haven't found anything mandatory in
1040 this branch, though we might have found something optional. For ALT, we
1041 continue with the next alternative, but we have to arrange that the final
1042 result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
1043 return SSB_CONTINUE: if this is the top level, that indicates failure,
1044 but after a nested subpattern, it causes scanning to continue. */
1045
1046 case OP_ALT:
1047 yield = SSB_CONTINUE;
1048 try_next = FALSE;
1049 break;
1050
1051 case OP_KET:
1052 case OP_KETRMAX:
1053 case OP_KETRMIN:
1054 case OP_KETRPOS:
1055 return SSB_CONTINUE;
1056
1057 /* Skip over callout */
1058
1059 case OP_CALLOUT:
1060 tcode += PRIV(OP_lengths)[OP_CALLOUT];
1061 break;
1062
1063 case OP_CALLOUT_STR:
1064 tcode += GET(tcode, 1 + 2*LINK_SIZE);
1065 break;
1066
1067 /* Skip over lookbehind and negative lookahead assertions */
1068
1069 case OP_ASSERT_NOT:
1070 case OP_ASSERTBACK:
1071 case OP_ASSERTBACK_NOT:
1072 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1073 tcode += 1 + LINK_SIZE;
1074 break;
1075
1076 /* BRAZERO does the bracket, but carries on. */
1077
1078 case OP_BRAZERO:
1079 case OP_BRAMINZERO:
1080 case OP_BRAPOSZERO:
1081 rc = set_start_bits(re, ++tcode, utf);
1082 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
1083 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1084 tcode += 1 + LINK_SIZE;
1085 break;
1086
1087 /* SKIPZERO skips the bracket. */
1088
1089 case OP_SKIPZERO:
1090 tcode++;
1091 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1092 tcode += 1 + LINK_SIZE;
1093 break;
1094
1095 /* Single-char * or ? sets the bit and tries the next item */
1096
1097 case OP_STAR:
1098 case OP_MINSTAR:
1099 case OP_POSSTAR:
1100 case OP_QUERY:
1101 case OP_MINQUERY:
1102 case OP_POSQUERY:
1103 tcode = set_table_bit(re, tcode + 1, FALSE, utf);
1104 break;
1105
1106 case OP_STARI:
1107 case OP_MINSTARI:
1108 case OP_POSSTARI:
1109 case OP_QUERYI:
1110 case OP_MINQUERYI:
1111 case OP_POSQUERYI:
1112 tcode = set_table_bit(re, tcode + 1, TRUE, utf);
1113 break;
1114
1115 /* Single-char upto sets the bit and tries the next */
1116
1117 case OP_UPTO:
1118 case OP_MINUPTO:
1119 case OP_POSUPTO:
1120 tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf);
1121 break;
1122
1123 case OP_UPTOI:
1124 case OP_MINUPTOI:
1125 case OP_POSUPTOI:
1126 tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf);
1127 break;
1128
1129 /* At least one single char sets the bit and stops */
1130
1131 case OP_EXACT:
1132 tcode += IMM2_SIZE;
1133 /* Fall through */
1134 case OP_CHAR:
1135 case OP_PLUS:
1136 case OP_MINPLUS:
1137 case OP_POSPLUS:
1138 (void)set_table_bit(re, tcode + 1, FALSE, utf);
1139 try_next = FALSE;
1140 break;
1141
1142 case OP_EXACTI:
1143 tcode += IMM2_SIZE;
1144 /* Fall through */
1145 case OP_CHARI:
1146 case OP_PLUSI:
1147 case OP_MINPLUSI:
1148 case OP_POSPLUSI:
1149 (void)set_table_bit(re, tcode + 1, TRUE, utf);
1150 try_next = FALSE;
1151 break;
1152
1153 /* Special spacing and line-terminating items. These recognize specific
1154 lists of characters. The difference between VSPACE and ANYNL is that the
1155 latter can match the two-character CRLF sequence, but that is not
1156 relevant for finding the first character, so their code here is
1157 identical. */
1158
1159 case OP_HSPACE:
1160 SET_BIT(CHAR_HT);
1161 SET_BIT(CHAR_SPACE);
1162
1163 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1164 the bits for 0xA0 and for code units >= 255, independently of UTF. */
1165
1166 #if PCRE2_CODE_UNIT_WIDTH != 8
1167 SET_BIT(0xA0);
1168 SET_BIT(0xFF);
1169 #else
1170 /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1171 units of horizontal space characters. */
1172
1173 #ifdef SUPPORT_UNICODE
1174 if (utf)
1175 {
1176 SET_BIT(0xC2); /* For U+00A0 */
1177 SET_BIT(0xE1); /* For U+1680, U+180E */
1178 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1179 SET_BIT(0xE3); /* For U+3000 */
1180 }
1181 else
1182 #endif
1183 /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1184 the code is EBCDIC. */
1185 {
1186 #ifndef EBCDIC
1187 SET_BIT(0xA0);
1188 #endif /* Not EBCDIC */
1189 }
1190 #endif /* 8-bit support */
1191
1192 try_next = FALSE;
1193 break;
1194
1195 case OP_ANYNL:
1196 case OP_VSPACE:
1197 SET_BIT(CHAR_LF);
1198 SET_BIT(CHAR_VT);
1199 SET_BIT(CHAR_FF);
1200 SET_BIT(CHAR_CR);
1201
1202 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1203 the bits for NEL and for code units >= 255, independently of UTF. */
1204
1205 #if PCRE2_CODE_UNIT_WIDTH != 8
1206 SET_BIT(CHAR_NEL);
1207 SET_BIT(0xFF);
1208 #else
1209 /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1210 units of vertical space characters. */
1211
1212 #ifdef SUPPORT_UNICODE
1213 if (utf)
1214 {
1215 SET_BIT(0xC2); /* For U+0085 (NEL) */
1216 SET_BIT(0xE2); /* For U+2028, U+2029 */
1217 }
1218 else
1219 #endif
1220 /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1221 {
1222 SET_BIT(CHAR_NEL);
1223 }
1224 #endif /* 8-bit support */
1225
1226 try_next = FALSE;
1227 break;
1228
1229 /* Single character types set the bits and stop. Note that if PCRE2_UCP
1230 is set, we do not see these op codes because \d etc are converted to
1231 properties. Therefore, these apply in the case when only characters less
1232 than 256 are recognized to match the types. */
1233
1234 case OP_NOT_DIGIT:
1235 set_nottype_bits(re, cbit_digit, table_limit);
1236 try_next = FALSE;
1237 break;
1238
1239 case OP_DIGIT:
1240 set_type_bits(re, cbit_digit, table_limit);
1241 try_next = FALSE;
1242 break;
1243
1244 case OP_NOT_WHITESPACE:
1245 set_nottype_bits(re, cbit_space, table_limit);
1246 try_next = FALSE;
1247 break;
1248
1249 case OP_WHITESPACE:
1250 set_type_bits(re, cbit_space, table_limit);
1251 try_next = FALSE;
1252 break;
1253
1254 case OP_NOT_WORDCHAR:
1255 set_nottype_bits(re, cbit_word, table_limit);
1256 try_next = FALSE;
1257 break;
1258
1259 case OP_WORDCHAR:
1260 set_type_bits(re, cbit_word, table_limit);
1261 try_next = FALSE;
1262 break;
1263
1264 /* One or more character type fudges the pointer and restarts, knowing
1265 it will hit a single character type and stop there. */
1266
1267 case OP_TYPEPLUS:
1268 case OP_TYPEMINPLUS:
1269 case OP_TYPEPOSPLUS:
1270 tcode++;
1271 break;
1272
1273 case OP_TYPEEXACT:
1274 tcode += 1 + IMM2_SIZE;
1275 break;
1276
1277 /* Zero or more repeats of character types set the bits and then
1278 try again. */
1279
1280 case OP_TYPEUPTO:
1281 case OP_TYPEMINUPTO:
1282 case OP_TYPEPOSUPTO:
1283 tcode += IMM2_SIZE; /* Fall through */
1284
1285 case OP_TYPESTAR:
1286 case OP_TYPEMINSTAR:
1287 case OP_TYPEPOSSTAR:
1288 case OP_TYPEQUERY:
1289 case OP_TYPEMINQUERY:
1290 case OP_TYPEPOSQUERY:
1291 switch(tcode[1])
1292 {
1293 default:
1294 case OP_ANY:
1295 case OP_ALLANY:
1296 return SSB_FAIL;
1297
1298 case OP_HSPACE:
1299 SET_BIT(CHAR_HT);
1300 SET_BIT(CHAR_SPACE);
1301
1302 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1303 the bits for 0xA0 and for code units >= 255, independently of UTF. */
1304
1305 #if PCRE2_CODE_UNIT_WIDTH != 8
1306 SET_BIT(0xA0);
1307 SET_BIT(0xFF);
1308 #else
1309 /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1310 units of horizontal space characters. */
1311
1312 #ifdef SUPPORT_UNICODE
1313 if (utf)
1314 {
1315 SET_BIT(0xC2); /* For U+00A0 */
1316 SET_BIT(0xE1); /* For U+1680, U+180E */
1317 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1318 SET_BIT(0xE3); /* For U+3000 */
1319 }
1320 else
1321 #endif
1322 /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1323 the code is EBCDIC. */
1324 {
1325 #ifndef EBCDIC
1326 SET_BIT(0xA0);
1327 #endif /* Not EBCDIC */
1328 }
1329 #endif /* 8-bit support */
1330 break;
1331
1332 case OP_ANYNL:
1333 case OP_VSPACE:
1334 SET_BIT(CHAR_LF);
1335 SET_BIT(CHAR_VT);
1336 SET_BIT(CHAR_FF);
1337 SET_BIT(CHAR_CR);
1338
1339 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1340 the bits for NEL and for code units >= 255, independently of UTF. */
1341
1342 #if PCRE2_CODE_UNIT_WIDTH != 8
1343 SET_BIT(CHAR_NEL);
1344 SET_BIT(0xFF);
1345 #else
1346 /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1347 units of vertical space characters. */
1348
1349 #ifdef SUPPORT_UNICODE
1350 if (utf)
1351 {
1352 SET_BIT(0xC2); /* For U+0085 (NEL) */
1353 SET_BIT(0xE2); /* For U+2028, U+2029 */
1354 }
1355 else
1356 #endif
1357 /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1358 {
1359 SET_BIT(CHAR_NEL);
1360 }
1361 #endif /* 8-bit support */
1362 break;
1363
1364 case OP_NOT_DIGIT:
1365 set_nottype_bits(re, cbit_digit, table_limit);
1366 break;
1367
1368 case OP_DIGIT:
1369 set_type_bits(re, cbit_digit, table_limit);
1370 break;
1371
1372 case OP_NOT_WHITESPACE:
1373 set_nottype_bits(re, cbit_space, table_limit);
1374 break;
1375
1376 case OP_WHITESPACE:
1377 set_type_bits(re, cbit_space, table_limit);
1378 break;
1379
1380 case OP_NOT_WORDCHAR:
1381 set_nottype_bits(re, cbit_word, table_limit);
1382 break;
1383
1384 case OP_WORDCHAR:
1385 set_type_bits(re, cbit_word, table_limit);
1386 break;
1387 }
1388
1389 tcode += 2;
1390 break;
1391
1392 /* Extended class: if there are any property checks, or if this is a
1393 negative XCLASS without a map, give up. If there are no property checks,
1394 there must be wide characters on the XCLASS list, because otherwise an
1395 XCLASS would not have been created. This means that code points >= 255
1396 are always potential starters. */
1397
1398 #ifdef SUPPORT_WIDE_CHARS
1399 case OP_XCLASS:
1400 if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0 ||
1401 (tcode[1 + LINK_SIZE] & (XCL_MAP|XCL_NOT)) == XCL_NOT)
1402 return SSB_FAIL;
1403
1404 /* We have a positive XCLASS or a negative one without a map. Set up the
1405 map pointer if there is one, and fall through. */
1406
1407 classmap = ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0)? NULL :
1408 (uint8_t *)(tcode + 1 + LINK_SIZE + 1);
1409 #endif
1410
1411 /* Enter here for a negative non-XCLASS. In the 8-bit library, if we are
1412 in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter
1413 because it starts a character with a value > 255. In 8-bit non-UTF mode,
1414 there is no difference between CLASS and NCLASS. In all other wide
1415 character modes, set the 0xFF bit to indicate code units >= 255. */
1416
1417 case OP_NCLASS:
1418 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1419 if (utf)
1420 {
1421 re->start_bitmap[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
1422 memset(re->start_bitmap+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
1423 }
1424 #elif PCRE2_CODE_UNIT_WIDTH != 8
1425 SET_BIT(0xFF); /* For characters >= 255 */
1426 #endif
1427 /* Fall through */
1428
1429 /* Enter here for a positive non-XCLASS. If we have fallen through from
1430 an XCLASS, classmap will already be set; just advance the code pointer.
1431 Otherwise, set up classmap for a a non-XCLASS and advance past it. */
1432
1433 case OP_CLASS:
1434 if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else
1435 {
1436 classmap = (uint8_t *)(++tcode);
1437 tcode += 32 / sizeof(PCRE2_UCHAR);
1438 }
1439
1440 /* When wide characters are supported, classmap may be NULL. In UTF-8
1441 (sic) mode, the bits in a class bit map correspond to character values,
1442 not to byte values. However, the bit map we are constructing is for byte
1443 values. So we have to do a conversion for characters whose code point is
1444 greater than 127. In fact, there are only two possible starting bytes for
1445 characters in the range 128 - 255. */
1446
1447 if (classmap != NULL)
1448 {
1449 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1450 if (utf)
1451 {
1452 for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c];
1453 for (c = 128; c < 256; c++)
1454 {
1455 if ((classmap[c/8] & (1 << (c&7))) != 0)
1456 {
1457 int d = (c >> 6) | 0xc0; /* Set bit for this starter */
1458 re->start_bitmap[d/8] |= (1 << (d&7)); /* and then skip on to the */
1459 c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
1460 }
1461 }
1462 }
1463 else
1464 #endif
1465 /* In all modes except UTF-8, the two bit maps are compatible. */
1466
1467 {
1468 for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c];
1469 }
1470 }
1471
1472 /* Act on what follows the class. For a zero minimum repeat, continue;
1473 otherwise stop processing. */
1474
1475 switch (*tcode)
1476 {
1477 case OP_CRSTAR:
1478 case OP_CRMINSTAR:
1479 case OP_CRQUERY:
1480 case OP_CRMINQUERY:
1481 case OP_CRPOSSTAR:
1482 case OP_CRPOSQUERY:
1483 tcode++;
1484 break;
1485
1486 case OP_CRRANGE:
1487 case OP_CRMINRANGE:
1488 case OP_CRPOSRANGE:
1489 if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1490 else try_next = FALSE;
1491 break;
1492
1493 default:
1494 try_next = FALSE;
1495 break;
1496 }
1497 break; /* End of class handling case */
1498 } /* End of switch for opcodes */
1499 } /* End of try_next loop */
1500
1501 code += GET(code, 1); /* Advance to next branch */
1502 }
1503 while (*code == OP_ALT);
1504
1505 return yield;
1506 }
1507
1508
1509
1510 /*************************************************
1511 * Study a compiled expression *
1512 *************************************************/
1513
1514 /* This function is handed a compiled expression that it must study to produce
1515 information that will speed up the matching.
1516
1517 Argument: points to the compiled expression
1518 Returns: 0 normally; non-zero should never normally occur
1519 1 unknown opcode in set_start_bits
1520 2 missing capturing bracket
1521 3 unknown opcode in find_minlength
1522 */
1523
1524 int
PRIV(study)1525 PRIV(study)(pcre2_real_code *re)
1526 {
1527 int min;
1528 int count = 0;
1529 PCRE2_UCHAR *code;
1530 BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
1531
1532 /* Find start of compiled code */
1533
1534 code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
1535 re->name_entry_size * re->name_count;
1536
1537 /* For an anchored pattern, or an unanchored pattern that has a first code
1538 unit, or a multiline pattern that matches only at "line start", there is no
1539 point in seeking a list of starting code units. */
1540
1541 if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
1542 (re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
1543 {
1544 int rc = set_start_bits(re, code, utf);
1545 if (rc == SSB_UNKNOWN) return 1;
1546 if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET;
1547 }
1548
1549 /* Find the minimum length of subject string. If it can match an empty string,
1550 the minimum length is already known. */
1551
1552 if ((re->flags & PCRE2_MATCH_EMPTY) == 0)
1553 {
1554 switch(min = find_minlength(re, code, code, utf, NULL, &count))
1555 {
1556 case -1: /* \C in UTF mode or (*ACCEPT) or over-complex regex */
1557 break; /* Leave minlength unchanged (will be zero) */
1558
1559 case -2:
1560 return 2; /* missing capturing bracket */
1561
1562 case -3:
1563 return 3; /* unrecognized opcode */
1564
1565 default:
1566 if (min > UINT16_MAX) min = UINT16_MAX;
1567 re->minlength = min;
1568 break;
1569 }
1570 }
1571
1572 return 0;
1573 }
1574
1575 /* End of pcre2_study.c */
1576