1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2019 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41 /* This module contains functions for scanning a compiled pattern and
42 collecting data (e.g. minimum matching length). */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #include "pcre2_internal.h"
50
51 /* The maximum remembered capturing brackets minimum. */
52
53 #define MAX_CACHE_BACKREF 128
54
55 /* Set a bit in the starting code unit bit map. */
56
57 #define SET_BIT(c) re->start_bitmap[(c)/8] |= (1u << ((c)&7))
58
59 /* Returns from set_start_bits() */
60
61 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
62
63
64 /*************************************************
65 * Find the minimum subject length for a group *
66 *************************************************/
67
68 /* Scan a parenthesized group and compute the minimum length of subject that
69 is needed to match it. This is a lower bound; it does not mean there is a
70 string of that length that matches. In UTF mode, the result is in characters
71 rather than code units. The field in a compiled pattern for storing the minimum
72 length is 16-bits long (on the grounds that anything longer than that is
73 pathological), so we give up when we reach that amount. This also means that
74 integer overflow for really crazy patterns cannot happen.
75
76 Backreference minimum lengths are cached to speed up multiple references. This
77 function is called only when the highest back reference in the pattern is less
78 than or equal to MAX_CACHE_BACKREF, which is one less than the size of the
79 caching vector. The zeroth element contains the number of the highest set
80 value.
81
82 Arguments:
83 re compiled pattern block
84 code pointer to start of group (the bracket)
85 startcode pointer to start of the whole pattern's code
86 utf UTF flag
87 recurses chain of recurse_check to catch mutual recursion
88 countptr pointer to call count (to catch over complexity)
89 backref_cache vector for caching back references.
90
91 Returns: the minimum length
92 -1 \C in UTF-8 mode
93 or (*ACCEPT)
94 or pattern too complicated
95 or back reference to duplicate name/number
96 -2 internal error (missing capturing bracket)
97 -3 internal error (opcode not listed)
98 */
99
100 static int
find_minlength(const pcre2_real_code * re,PCRE2_SPTR code,PCRE2_SPTR startcode,BOOL utf,recurse_check * recurses,int * countptr,int * backref_cache)101 find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
102 PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr,
103 int *backref_cache)
104 {
105 int length = -1;
106 int prev_cap_recno = -1;
107 int prev_cap_d = 0;
108 int prev_recurse_recno = -1;
109 int prev_recurse_d = 0;
110 uint32_t once_fudge = 0;
111 BOOL had_recurse = FALSE;
112 BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
113 recurse_check this_recurse;
114 int branchlength = 0;
115 PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
116
117 /* If this is a "could be empty" group, its minimum length is 0. */
118
119 if (*code >= OP_SBRA && *code <= OP_SCOND) return 0;
120
121 /* Skip over capturing bracket number */
122
123 if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE;
124
125 /* A large and/or complex regex can take too long to process. */
126
127 if ((*countptr)++ > 1000) return -1;
128
129 /* Scan along the opcodes for this branch. If we get to the end of the branch,
130 check the length against that of the other branches. If the accumulated length
131 passes 16-bits, stop. */
132
133 for (;;)
134 {
135 int d, min, recno;
136 PCRE2_UCHAR *cs, *ce;
137 PCRE2_UCHAR op = *cc;
138
139 if (branchlength >= UINT16_MAX) return UINT16_MAX;
140
141 switch (op)
142 {
143 case OP_COND:
144 case OP_SCOND:
145
146 /* If there is only one branch in a condition, the implied branch has zero
147 length, so we don't add anything. This covers the DEFINE "condition"
148 automatically. If there are two branches we can treat it the same as any
149 other non-capturing subpattern. */
150
151 cs = cc + GET(cc, 1);
152 if (*cs != OP_ALT)
153 {
154 cc = cs + 1 + LINK_SIZE;
155 break;
156 }
157 goto PROCESS_NON_CAPTURE;
158
159 case OP_BRA:
160 /* There's a special case of OP_BRA, when it is wrapped round a repeated
161 OP_RECURSE. We'd like to process the latter at this level so that
162 remembering the value works for repeated cases. So we do nothing, but
163 set a fudge value to skip over the OP_KET after the recurse. */
164
165 if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
166 {
167 once_fudge = 1 + LINK_SIZE;
168 cc += 1 + LINK_SIZE;
169 break;
170 }
171 /* Fall through */
172
173 case OP_ONCE:
174 case OP_SCRIPT_RUN:
175 case OP_SBRA:
176 case OP_BRAPOS:
177 case OP_SBRAPOS:
178 PROCESS_NON_CAPTURE:
179 d = find_minlength(re, cc, startcode, utf, recurses, countptr,
180 backref_cache);
181 if (d < 0) return d;
182 branchlength += d;
183 do cc += GET(cc, 1); while (*cc == OP_ALT);
184 cc += 1 + LINK_SIZE;
185 break;
186
187 /* To save time for repeated capturing subpatterns, we remember the
188 length of the previous one. Unfortunately we can't do the same for
189 the unnumbered ones above. Nor can we do this if (?| is present in the
190 pattern because captures with the same number are not then identical. */
191
192 case OP_CBRA:
193 case OP_SCBRA:
194 case OP_CBRAPOS:
195 case OP_SCBRAPOS:
196 recno = (int)GET2(cc, 1+LINK_SIZE);
197 if (dupcapused || recno != prev_cap_recno)
198 {
199 prev_cap_recno = recno;
200 prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr,
201 backref_cache);
202 if (prev_cap_d < 0) return prev_cap_d;
203 }
204 branchlength += prev_cap_d;
205 do cc += GET(cc, 1); while (*cc == OP_ALT);
206 cc += 1 + LINK_SIZE;
207 break;
208
209 /* ACCEPT makes things far too complicated; we have to give up. */
210
211 case OP_ACCEPT:
212 case OP_ASSERT_ACCEPT:
213 return -1;
214
215 /* Reached end of a branch; if it's a ket it is the end of a nested
216 call. If it's ALT it is an alternation in a nested call. If it is END it's
217 the end of the outer call. All can be handled by the same code. If an
218 ACCEPT was previously encountered, use the length that was in force at that
219 time, and pass back the shortest ACCEPT length. */
220
221 case OP_ALT:
222 case OP_KET:
223 case OP_KETRMAX:
224 case OP_KETRMIN:
225 case OP_KETRPOS:
226 case OP_END:
227 if (length < 0 || (!had_recurse && branchlength < length))
228 length = branchlength;
229 if (op != OP_ALT) return length;
230 cc += 1 + LINK_SIZE;
231 branchlength = 0;
232 had_recurse = FALSE;
233 break;
234
235 /* Skip over assertive subpatterns */
236
237 case OP_ASSERT:
238 case OP_ASSERT_NOT:
239 case OP_ASSERTBACK:
240 case OP_ASSERTBACK_NOT:
241 do cc += GET(cc, 1); while (*cc == OP_ALT);
242 /* Fall through */
243
244 /* Skip over things that don't match chars */
245
246 case OP_REVERSE:
247 case OP_CREF:
248 case OP_DNCREF:
249 case OP_RREF:
250 case OP_DNRREF:
251 case OP_FALSE:
252 case OP_TRUE:
253 case OP_CALLOUT:
254 case OP_SOD:
255 case OP_SOM:
256 case OP_EOD:
257 case OP_EODN:
258 case OP_CIRC:
259 case OP_CIRCM:
260 case OP_DOLL:
261 case OP_DOLLM:
262 case OP_NOT_WORD_BOUNDARY:
263 case OP_WORD_BOUNDARY:
264 cc += PRIV(OP_lengths)[*cc];
265 break;
266
267 case OP_CALLOUT_STR:
268 cc += GET(cc, 1 + 2*LINK_SIZE);
269 break;
270
271 /* Skip over a subpattern that has a {0} or {0,x} quantifier */
272
273 case OP_BRAZERO:
274 case OP_BRAMINZERO:
275 case OP_BRAPOSZERO:
276 case OP_SKIPZERO:
277 cc += PRIV(OP_lengths)[*cc];
278 do cc += GET(cc, 1); while (*cc == OP_ALT);
279 cc += 1 + LINK_SIZE;
280 break;
281
282 /* Handle literal characters and + repetitions */
283
284 case OP_CHAR:
285 case OP_CHARI:
286 case OP_NOT:
287 case OP_NOTI:
288 case OP_PLUS:
289 case OP_PLUSI:
290 case OP_MINPLUS:
291 case OP_MINPLUSI:
292 case OP_POSPLUS:
293 case OP_POSPLUSI:
294 case OP_NOTPLUS:
295 case OP_NOTPLUSI:
296 case OP_NOTMINPLUS:
297 case OP_NOTMINPLUSI:
298 case OP_NOTPOSPLUS:
299 case OP_NOTPOSPLUSI:
300 branchlength++;
301 cc += 2;
302 #ifdef SUPPORT_UNICODE
303 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
304 #endif
305 break;
306
307 case OP_TYPEPLUS:
308 case OP_TYPEMINPLUS:
309 case OP_TYPEPOSPLUS:
310 branchlength++;
311 cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
312 break;
313
314 /* Handle exact repetitions. The count is already in characters, but we
315 may need to skip over a multibyte character in UTF mode. */
316
317 case OP_EXACT:
318 case OP_EXACTI:
319 case OP_NOTEXACT:
320 case OP_NOTEXACTI:
321 branchlength += GET2(cc,1);
322 cc += 2 + IMM2_SIZE;
323 #ifdef SUPPORT_UNICODE
324 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
325 #endif
326 break;
327
328 case OP_TYPEEXACT:
329 branchlength += GET2(cc,1);
330 cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
331 || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
332 break;
333
334 /* Handle single-char non-literal matchers */
335
336 case OP_PROP:
337 case OP_NOTPROP:
338 cc += 2;
339 /* Fall through */
340
341 case OP_NOT_DIGIT:
342 case OP_DIGIT:
343 case OP_NOT_WHITESPACE:
344 case OP_WHITESPACE:
345 case OP_NOT_WORDCHAR:
346 case OP_WORDCHAR:
347 case OP_ANY:
348 case OP_ALLANY:
349 case OP_EXTUNI:
350 case OP_HSPACE:
351 case OP_NOT_HSPACE:
352 case OP_VSPACE:
353 case OP_NOT_VSPACE:
354 branchlength++;
355 cc++;
356 break;
357
358 /* "Any newline" might match two characters, but it also might match just
359 one. */
360
361 case OP_ANYNL:
362 branchlength += 1;
363 cc++;
364 break;
365
366 /* The single-byte matcher means we can't proceed in UTF mode. (In
367 non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever
368 appear, but leave the code, just in case.) */
369
370 case OP_ANYBYTE:
371 #ifdef SUPPORT_UNICODE
372 if (utf) return -1;
373 #endif
374 branchlength++;
375 cc++;
376 break;
377
378 /* For repeated character types, we have to test for \p and \P, which have
379 an extra two bytes of parameters. */
380
381 case OP_TYPESTAR:
382 case OP_TYPEMINSTAR:
383 case OP_TYPEQUERY:
384 case OP_TYPEMINQUERY:
385 case OP_TYPEPOSSTAR:
386 case OP_TYPEPOSQUERY:
387 if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
388 cc += PRIV(OP_lengths)[op];
389 break;
390
391 case OP_TYPEUPTO:
392 case OP_TYPEMINUPTO:
393 case OP_TYPEPOSUPTO:
394 if (cc[1 + IMM2_SIZE] == OP_PROP
395 || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
396 cc += PRIV(OP_lengths)[op];
397 break;
398
399 /* Check a class for variable quantification */
400
401 case OP_CLASS:
402 case OP_NCLASS:
403 #ifdef SUPPORT_WIDE_CHARS
404 case OP_XCLASS:
405 /* The original code caused an unsigned overflow in 64 bit systems,
406 so now we use a conditional statement. */
407 if (op == OP_XCLASS)
408 cc += GET(cc, 1);
409 else
410 cc += PRIV(OP_lengths)[OP_CLASS];
411 #else
412 cc += PRIV(OP_lengths)[OP_CLASS];
413 #endif
414
415 switch (*cc)
416 {
417 case OP_CRPLUS:
418 case OP_CRMINPLUS:
419 case OP_CRPOSPLUS:
420 branchlength++;
421 /* Fall through */
422
423 case OP_CRSTAR:
424 case OP_CRMINSTAR:
425 case OP_CRQUERY:
426 case OP_CRMINQUERY:
427 case OP_CRPOSSTAR:
428 case OP_CRPOSQUERY:
429 cc++;
430 break;
431
432 case OP_CRRANGE:
433 case OP_CRMINRANGE:
434 case OP_CRPOSRANGE:
435 branchlength += GET2(cc,1);
436 cc += 1 + 2 * IMM2_SIZE;
437 break;
438
439 default:
440 branchlength++;
441 break;
442 }
443 break;
444
445 /* Backreferences and subroutine calls (OP_RECURSE) are treated in the same
446 way: we find the minimum length for the subpattern. A recursion
447 (backreference or subroutine) causes an a flag to be set that causes the
448 length of this branch to be ignored. The logic is that a recursion can only
449 make sense if there is another alternative that stops the recursing. That
450 will provide the minimum length (when no recursion happens).
451
452 If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
453 matches an empty string (by default it causes a matching failure), so in
454 that case we must set the minimum length to zero. */
455
456 /* Duplicate named pattern back reference. We cannot reliably find a length
457 for this if duplicate numbers are present in the pattern. */
458
459 case OP_DNREF:
460 case OP_DNREFI:
461 if (dupcapused) return -1;
462 if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
463 {
464 int count = GET2(cc, 1+IMM2_SIZE);
465 PCRE2_UCHAR *slot =
466 (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
467 GET2(cc, 1) * re->name_entry_size;
468
469 d = INT_MAX;
470
471 /* Scan all groups with the same name; find the shortest. */
472
473 while (count-- > 0)
474 {
475 int dd, i;
476 recno = GET2(slot, 0);
477
478 if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
479 dd = backref_cache[recno];
480 else
481 {
482 ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
483 if (cs == NULL) return -2;
484 do ce += GET(ce, 1); while (*ce == OP_ALT);
485 if (cc > cs && cc < ce) /* Simple recursion */
486 {
487 dd = 0;
488 had_recurse = TRUE;
489 }
490 else
491 {
492 recurse_check *r = recurses;
493 for (r = recurses; r != NULL; r = r->prev)
494 if (r->group == cs) break;
495 if (r != NULL) /* Mutual recursion */
496 {
497 dd = 0;
498 had_recurse = TRUE;
499 }
500 else
501 {
502 this_recurse.prev = recurses;
503 this_recurse.group = cs;
504 dd = find_minlength(re, cs, startcode, utf, &this_recurse,
505 countptr, backref_cache);
506 if (dd < 0) return dd;
507 }
508 }
509
510 backref_cache[recno] = dd;
511 for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
512 backref_cache[0] = recno;
513 }
514
515 if (dd < d) d = dd;
516 if (d <= 0) break; /* No point looking at any more */
517 slot += re->name_entry_size;
518 }
519 }
520 else d = 0;
521 cc += 1 + 2*IMM2_SIZE;
522 goto REPEAT_BACK_REFERENCE;
523
524 /* Single back reference. We cannot find a length for this if duplicate
525 numbers are present in the pattern. */
526
527 case OP_REF:
528 case OP_REFI:
529 if (dupcapused) return -1;
530 recno = GET2(cc, 1);
531 if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
532 d = backref_cache[recno];
533 else
534 {
535 int i;
536 if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
537 {
538 ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
539 if (cs == NULL) return -2;
540 do ce += GET(ce, 1); while (*ce == OP_ALT);
541 if (cc > cs && cc < ce) /* Simple recursion */
542 {
543 d = 0;
544 had_recurse = TRUE;
545 }
546 else
547 {
548 recurse_check *r = recurses;
549 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
550 if (r != NULL) /* Mutual recursion */
551 {
552 d = 0;
553 had_recurse = TRUE;
554 }
555 else
556 {
557 this_recurse.prev = recurses;
558 this_recurse.group = cs;
559 d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
560 backref_cache);
561 if (d < 0) return d;
562 }
563 }
564 }
565 else d = 0;
566
567 backref_cache[recno] = d;
568 for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
569 backref_cache[0] = recno;
570 }
571
572 cc += 1 + IMM2_SIZE;
573
574 /* Handle repeated back references */
575
576 REPEAT_BACK_REFERENCE:
577 switch (*cc)
578 {
579 case OP_CRSTAR:
580 case OP_CRMINSTAR:
581 case OP_CRQUERY:
582 case OP_CRMINQUERY:
583 case OP_CRPOSSTAR:
584 case OP_CRPOSQUERY:
585 min = 0;
586 cc++;
587 break;
588
589 case OP_CRPLUS:
590 case OP_CRMINPLUS:
591 case OP_CRPOSPLUS:
592 min = 1;
593 cc++;
594 break;
595
596 case OP_CRRANGE:
597 case OP_CRMINRANGE:
598 case OP_CRPOSRANGE:
599 min = GET2(cc, 1);
600 cc += 1 + 2 * IMM2_SIZE;
601 break;
602
603 default:
604 min = 1;
605 break;
606 }
607
608 /* Take care not to overflow: (1) min and d are ints, so check that their
609 product is not greater than INT_MAX. (2) branchlength is limited to
610 UINT16_MAX (checked at the top of the loop). */
611
612 if ((d > 0 && (INT_MAX/d) < min) || UINT16_MAX - branchlength < min*d)
613 branchlength = UINT16_MAX;
614 else branchlength += min * d;
615 break;
616
617 /* Recursion always refers to the first occurrence of a subpattern with a
618 given number. Therefore, we can always make use of caching, even when the
619 pattern contains multiple subpatterns with the same number. */
620
621 case OP_RECURSE:
622 cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
623 recno = GET2(cs, 1+LINK_SIZE);
624 if (recno == prev_recurse_recno)
625 {
626 branchlength += prev_recurse_d;
627 }
628 else
629 {
630 do ce += GET(ce, 1); while (*ce == OP_ALT);
631 if (cc > cs && cc < ce) /* Simple recursion */
632 had_recurse = TRUE;
633 else
634 {
635 recurse_check *r = recurses;
636 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
637 if (r != NULL) /* Mutual recursion */
638 had_recurse = TRUE;
639 else
640 {
641 this_recurse.prev = recurses;
642 this_recurse.group = cs;
643 prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
644 countptr, backref_cache);
645 if (prev_recurse_d < 0) return prev_recurse_d;
646 prev_recurse_recno = recno;
647 branchlength += prev_recurse_d;
648 }
649 }
650 }
651 cc += 1 + LINK_SIZE + once_fudge;
652 once_fudge = 0;
653 break;
654
655 /* Anything else does not or need not match a character. We can get the
656 item's length from the table, but for those that can match zero occurrences
657 of a character, we must take special action for UTF-8 characters. As it
658 happens, the "NOT" versions of these opcodes are used at present only for
659 ASCII characters, so they could be omitted from this list. However, in
660 future that may change, so we include them here so as not to leave a
661 gotcha for a future maintainer. */
662
663 case OP_UPTO:
664 case OP_UPTOI:
665 case OP_NOTUPTO:
666 case OP_NOTUPTOI:
667 case OP_MINUPTO:
668 case OP_MINUPTOI:
669 case OP_NOTMINUPTO:
670 case OP_NOTMINUPTOI:
671 case OP_POSUPTO:
672 case OP_POSUPTOI:
673 case OP_NOTPOSUPTO:
674 case OP_NOTPOSUPTOI:
675
676 case OP_STAR:
677 case OP_STARI:
678 case OP_NOTSTAR:
679 case OP_NOTSTARI:
680 case OP_MINSTAR:
681 case OP_MINSTARI:
682 case OP_NOTMINSTAR:
683 case OP_NOTMINSTARI:
684 case OP_POSSTAR:
685 case OP_POSSTARI:
686 case OP_NOTPOSSTAR:
687 case OP_NOTPOSSTARI:
688
689 case OP_QUERY:
690 case OP_QUERYI:
691 case OP_NOTQUERY:
692 case OP_NOTQUERYI:
693 case OP_MINQUERY:
694 case OP_MINQUERYI:
695 case OP_NOTMINQUERY:
696 case OP_NOTMINQUERYI:
697 case OP_POSQUERY:
698 case OP_POSQUERYI:
699 case OP_NOTPOSQUERY:
700 case OP_NOTPOSQUERYI:
701
702 cc += PRIV(OP_lengths)[op];
703 #ifdef SUPPORT_UNICODE
704 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
705 #endif
706 break;
707
708 /* Skip these, but we need to add in the name length. */
709
710 case OP_MARK:
711 case OP_COMMIT_ARG:
712 case OP_PRUNE_ARG:
713 case OP_SKIP_ARG:
714 case OP_THEN_ARG:
715 cc += PRIV(OP_lengths)[op] + cc[1];
716 break;
717
718 /* The remaining opcodes are just skipped over. */
719
720 case OP_CLOSE:
721 case OP_COMMIT:
722 case OP_FAIL:
723 case OP_PRUNE:
724 case OP_SET_SOM:
725 case OP_SKIP:
726 case OP_THEN:
727 cc += PRIV(OP_lengths)[op];
728 break;
729
730 /* This should not occur: we list all opcodes explicitly so that when
731 new ones get added they are properly considered. */
732
733 default:
734 return -3;
735 }
736 }
737 /* Control never gets here */
738 }
739
740
741
742 /*************************************************
743 * Set a bit and maybe its alternate case *
744 *************************************************/
745
746 /* Given a character, set its first code unit's bit in the table, and also the
747 corresponding bit for the other version of a letter if we are caseless.
748
749 Arguments:
750 re points to the regex block
751 p points to the first code unit of the character
752 caseless TRUE if caseless
753 utf TRUE for UTF mode
754
755 Returns: pointer after the character
756 */
757
758 static PCRE2_SPTR
set_table_bit(pcre2_real_code * re,PCRE2_SPTR p,BOOL caseless,BOOL utf)759 set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf)
760 {
761 uint32_t c = *p++; /* First code unit */
762 (void)utf; /* Stop compiler warning when UTF not supported */
763
764 /* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
765 0xff. */
766
767 #if PCRE2_CODE_UNIT_WIDTH != 8
768 if (c > 0xff) SET_BIT(0xff); else
769 #endif
770
771 SET_BIT(c);
772
773 /* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find
774 the end of the character, even when caseless. */
775
776 #ifdef SUPPORT_UNICODE
777 if (utf)
778 {
779 #if PCRE2_CODE_UNIT_WIDTH == 8
780 if (c >= 0xc0) GETUTF8INC(c, p);
781 #elif PCRE2_CODE_UNIT_WIDTH == 16
782 if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p);
783 #endif
784 }
785 #endif /* SUPPORT_UNICODE */
786
787 /* If caseless, handle the other case of the character. */
788
789 if (caseless)
790 {
791 #ifdef SUPPORT_UNICODE
792 if (utf)
793 {
794 #if PCRE2_CODE_UNIT_WIDTH == 8
795 PCRE2_UCHAR buff[6];
796 c = UCD_OTHERCASE(c);
797 (void)PRIV(ord2utf)(c, buff);
798 SET_BIT(buff[0]);
799 #else /* 16-bit or 32-bit mode */
800 c = UCD_OTHERCASE(c);
801 if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
802 #endif
803 }
804 else
805 #endif /* SUPPORT_UNICODE */
806
807 /* Not UTF */
808
809 if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
810 }
811
812 return p;
813 }
814
815
816
817 /*************************************************
818 * Set bits for a positive character type *
819 *************************************************/
820
821 /* This function sets starting bits for a character type. In UTF-8 mode, we can
822 only do a direct setting for bytes less than 128, as otherwise there can be
823 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
824 environment, the tables will only recognize ASCII characters anyway, but in at
825 least one Windows environment, some higher bytes bits were set in the tables.
826 So we deal with that case by considering the UTF-8 encoding.
827
828 Arguments:
829 re the regex block
830 cbit type the type of character wanted
831 table_limit 32 for non-UTF-8; 16 for UTF-8
832
833 Returns: nothing
834 */
835
836 static void
set_type_bits(pcre2_real_code * re,int cbit_type,unsigned int table_limit)837 set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
838 {
839 uint32_t c;
840 for (c = 0; c < table_limit; c++)
841 re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type];
842 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
843 if (table_limit == 32) return;
844 for (c = 128; c < 256; c++)
845 {
846 if ((re->tables[cbits_offset + c/8] & (1u << (c&7))) != 0)
847 {
848 PCRE2_UCHAR buff[6];
849 (void)PRIV(ord2utf)(c, buff);
850 SET_BIT(buff[0]);
851 }
852 }
853 #endif /* UTF-8 */
854 }
855
856
857 /*************************************************
858 * Set bits for a negative character type *
859 *************************************************/
860
861 /* This function sets starting bits for a negative character type such as \D.
862 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
863 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
864 Unlike in the positive case, where we can set appropriate starting bits for
865 specific high-valued UTF-8 characters, in this case we have to set the bits for
866 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
867 0xc0 (192) for simplicity.
868
869 Arguments:
870 re the regex block
871 cbit type the type of character wanted
872 table_limit 32 for non-UTF-8; 16 for UTF-8
873
874 Returns: nothing
875 */
876
877 static void
set_nottype_bits(pcre2_real_code * re,int cbit_type,unsigned int table_limit)878 set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
879 {
880 uint32_t c;
881 for (c = 0; c < table_limit; c++)
882 re->start_bitmap[c] |= ~(re->tables[c+cbits_offset+cbit_type]);
883 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
884 if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
885 #endif
886 }
887
888
889
890 /*************************************************
891 * Create bitmap of starting bytes *
892 *************************************************/
893
894 /* This function scans a compiled unanchored expression recursively and
895 attempts to build a bitmap of the set of possible starting code units whose
896 values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause
897 the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode
898 we pass a value of 16 rather than 32 as the final argument. (See comments in
899 those functions for the reason.)
900
901 The SSB_CONTINUE return is useful for parenthesized groups in patterns such as
902 (a*)b where the group provides some optional starting code units but scanning
903 must continue at the outer level to find at least one mandatory code unit. At
904 the outermost level, this function fails unless the result is SSB_DONE.
905
906 Arguments:
907 re points to the compiled regex block
908 code points to an expression
909 utf TRUE if in UTF mode
910
911 Returns: SSB_FAIL => Failed to find any starting code units
912 SSB_DONE => Found mandatory starting code units
913 SSB_CONTINUE => Found optional starting code units
914 SSB_UNKNOWN => Hit an unrecognized opcode
915 */
916
917 static int
set_start_bits(pcre2_real_code * re,PCRE2_SPTR code,BOOL utf)918 set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf)
919 {
920 uint32_t c;
921 int yield = SSB_DONE;
922
923 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
924 int table_limit = utf? 16:32;
925 #else
926 int table_limit = 32;
927 #endif
928
929 do
930 {
931 BOOL try_next = TRUE;
932 PCRE2_SPTR tcode = code + 1 + LINK_SIZE;
933
934 if (*code == OP_CBRA || *code == OP_SCBRA ||
935 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
936
937 while (try_next) /* Loop for items in this branch */
938 {
939 int rc;
940 uint8_t *classmap = NULL;
941
942 switch(*tcode)
943 {
944 /* If we reach something we don't understand, it means a new opcode has
945 been created that hasn't been added to this function. Hopefully this
946 problem will be discovered during testing. */
947
948 default:
949 return SSB_UNKNOWN;
950
951 /* Fail for a valid opcode that implies no starting bits. */
952
953 case OP_ACCEPT:
954 case OP_ASSERT_ACCEPT:
955 case OP_ALLANY:
956 case OP_ANY:
957 case OP_ANYBYTE:
958 case OP_CIRCM:
959 case OP_CLOSE:
960 case OP_COMMIT:
961 case OP_COMMIT_ARG:
962 case OP_COND:
963 case OP_CREF:
964 case OP_FALSE:
965 case OP_TRUE:
966 case OP_DNCREF:
967 case OP_DNREF:
968 case OP_DNREFI:
969 case OP_DNRREF:
970 case OP_DOLL:
971 case OP_DOLLM:
972 case OP_END:
973 case OP_EOD:
974 case OP_EODN:
975 case OP_EXTUNI:
976 case OP_FAIL:
977 case OP_MARK:
978 case OP_NOT:
979 case OP_NOTEXACT:
980 case OP_NOTEXACTI:
981 case OP_NOTI:
982 case OP_NOTMINPLUS:
983 case OP_NOTMINPLUSI:
984 case OP_NOTMINQUERY:
985 case OP_NOTMINQUERYI:
986 case OP_NOTMINSTAR:
987 case OP_NOTMINSTARI:
988 case OP_NOTMINUPTO:
989 case OP_NOTMINUPTOI:
990 case OP_NOTPLUS:
991 case OP_NOTPLUSI:
992 case OP_NOTPOSPLUS:
993 case OP_NOTPOSPLUSI:
994 case OP_NOTPOSQUERY:
995 case OP_NOTPOSQUERYI:
996 case OP_NOTPOSSTAR:
997 case OP_NOTPOSSTARI:
998 case OP_NOTPOSUPTO:
999 case OP_NOTPOSUPTOI:
1000 case OP_NOTPROP:
1001 case OP_NOTQUERY:
1002 case OP_NOTQUERYI:
1003 case OP_NOTSTAR:
1004 case OP_NOTSTARI:
1005 case OP_NOTUPTO:
1006 case OP_NOTUPTOI:
1007 case OP_NOT_HSPACE:
1008 case OP_NOT_VSPACE:
1009 case OP_PRUNE:
1010 case OP_PRUNE_ARG:
1011 case OP_RECURSE:
1012 case OP_REF:
1013 case OP_REFI:
1014 case OP_REVERSE:
1015 case OP_RREF:
1016 case OP_SCOND:
1017 case OP_SET_SOM:
1018 case OP_SKIP:
1019 case OP_SKIP_ARG:
1020 case OP_SOD:
1021 case OP_SOM:
1022 case OP_THEN:
1023 case OP_THEN_ARG:
1024 return SSB_FAIL;
1025
1026 /* OP_CIRC happens only at the start of an anchored branch (multiline ^
1027 uses OP_CIRCM). Skip over it. */
1028
1029 case OP_CIRC:
1030 tcode += PRIV(OP_lengths)[OP_CIRC];
1031 break;
1032
1033 /* A "real" property test implies no starting bits, but the fake property
1034 PT_CLIST identifies a list of characters. These lists are short, as they
1035 are used for characters with more than one "other case", so there is no
1036 point in recognizing them for OP_NOTPROP. */
1037
1038 case OP_PROP:
1039 if (tcode[1] != PT_CLIST) return SSB_FAIL;
1040 {
1041 const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2];
1042 while ((c = *p++) < NOTACHAR)
1043 {
1044 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1045 if (utf)
1046 {
1047 PCRE2_UCHAR buff[6];
1048 (void)PRIV(ord2utf)(c, buff);
1049 c = buff[0];
1050 }
1051 #endif
1052 if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
1053 }
1054 }
1055 try_next = FALSE;
1056 break;
1057
1058 /* We can ignore word boundary tests. */
1059
1060 case OP_WORD_BOUNDARY:
1061 case OP_NOT_WORD_BOUNDARY:
1062 tcode++;
1063 break;
1064
1065 /* If we hit a bracket or a positive lookahead assertion, recurse to set
1066 bits from within the subpattern. If it can't find anything, we have to
1067 give up. If it finds some mandatory character(s), we are done for this
1068 branch. Otherwise, carry on scanning after the subpattern. */
1069
1070 case OP_BRA:
1071 case OP_SBRA:
1072 case OP_CBRA:
1073 case OP_SCBRA:
1074 case OP_BRAPOS:
1075 case OP_SBRAPOS:
1076 case OP_CBRAPOS:
1077 case OP_SCBRAPOS:
1078 case OP_ONCE:
1079 case OP_SCRIPT_RUN:
1080 case OP_ASSERT:
1081 rc = set_start_bits(re, tcode, utf);
1082 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
1083 if (rc == SSB_DONE) try_next = FALSE; else
1084 {
1085 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1086 tcode += 1 + LINK_SIZE;
1087 }
1088 break;
1089
1090 /* If we hit ALT or KET, it means we haven't found anything mandatory in
1091 this branch, though we might have found something optional. For ALT, we
1092 continue with the next alternative, but we have to arrange that the final
1093 result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
1094 return SSB_CONTINUE: if this is the top level, that indicates failure,
1095 but after a nested subpattern, it causes scanning to continue. */
1096
1097 case OP_ALT:
1098 yield = SSB_CONTINUE;
1099 try_next = FALSE;
1100 break;
1101
1102 case OP_KET:
1103 case OP_KETRMAX:
1104 case OP_KETRMIN:
1105 case OP_KETRPOS:
1106 return SSB_CONTINUE;
1107
1108 /* Skip over callout */
1109
1110 case OP_CALLOUT:
1111 tcode += PRIV(OP_lengths)[OP_CALLOUT];
1112 break;
1113
1114 case OP_CALLOUT_STR:
1115 tcode += GET(tcode, 1 + 2*LINK_SIZE);
1116 break;
1117
1118 /* Skip over lookbehind and negative lookahead assertions */
1119
1120 case OP_ASSERT_NOT:
1121 case OP_ASSERTBACK:
1122 case OP_ASSERTBACK_NOT:
1123 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1124 tcode += 1 + LINK_SIZE;
1125 break;
1126
1127 /* BRAZERO does the bracket, but carries on. */
1128
1129 case OP_BRAZERO:
1130 case OP_BRAMINZERO:
1131 case OP_BRAPOSZERO:
1132 rc = set_start_bits(re, ++tcode, utf);
1133 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
1134 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1135 tcode += 1 + LINK_SIZE;
1136 break;
1137
1138 /* SKIPZERO skips the bracket. */
1139
1140 case OP_SKIPZERO:
1141 tcode++;
1142 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1143 tcode += 1 + LINK_SIZE;
1144 break;
1145
1146 /* Single-char * or ? sets the bit and tries the next item */
1147
1148 case OP_STAR:
1149 case OP_MINSTAR:
1150 case OP_POSSTAR:
1151 case OP_QUERY:
1152 case OP_MINQUERY:
1153 case OP_POSQUERY:
1154 tcode = set_table_bit(re, tcode + 1, FALSE, utf);
1155 break;
1156
1157 case OP_STARI:
1158 case OP_MINSTARI:
1159 case OP_POSSTARI:
1160 case OP_QUERYI:
1161 case OP_MINQUERYI:
1162 case OP_POSQUERYI:
1163 tcode = set_table_bit(re, tcode + 1, TRUE, utf);
1164 break;
1165
1166 /* Single-char upto sets the bit and tries the next */
1167
1168 case OP_UPTO:
1169 case OP_MINUPTO:
1170 case OP_POSUPTO:
1171 tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf);
1172 break;
1173
1174 case OP_UPTOI:
1175 case OP_MINUPTOI:
1176 case OP_POSUPTOI:
1177 tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf);
1178 break;
1179
1180 /* At least one single char sets the bit and stops */
1181
1182 case OP_EXACT:
1183 tcode += IMM2_SIZE;
1184 /* Fall through */
1185 case OP_CHAR:
1186 case OP_PLUS:
1187 case OP_MINPLUS:
1188 case OP_POSPLUS:
1189 (void)set_table_bit(re, tcode + 1, FALSE, utf);
1190 try_next = FALSE;
1191 break;
1192
1193 case OP_EXACTI:
1194 tcode += IMM2_SIZE;
1195 /* Fall through */
1196 case OP_CHARI:
1197 case OP_PLUSI:
1198 case OP_MINPLUSI:
1199 case OP_POSPLUSI:
1200 (void)set_table_bit(re, tcode + 1, TRUE, utf);
1201 try_next = FALSE;
1202 break;
1203
1204 /* Special spacing and line-terminating items. These recognize specific
1205 lists of characters. The difference between VSPACE and ANYNL is that the
1206 latter can match the two-character CRLF sequence, but that is not
1207 relevant for finding the first character, so their code here is
1208 identical. */
1209
1210 case OP_HSPACE:
1211 SET_BIT(CHAR_HT);
1212 SET_BIT(CHAR_SPACE);
1213
1214 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1215 the bits for 0xA0 and for code units >= 255, independently of UTF. */
1216
1217 #if PCRE2_CODE_UNIT_WIDTH != 8
1218 SET_BIT(0xA0);
1219 SET_BIT(0xFF);
1220 #else
1221 /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1222 units of horizontal space characters. */
1223
1224 #ifdef SUPPORT_UNICODE
1225 if (utf)
1226 {
1227 SET_BIT(0xC2); /* For U+00A0 */
1228 SET_BIT(0xE1); /* For U+1680, U+180E */
1229 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1230 SET_BIT(0xE3); /* For U+3000 */
1231 }
1232 else
1233 #endif
1234 /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1235 the code is EBCDIC. */
1236 {
1237 #ifndef EBCDIC
1238 SET_BIT(0xA0);
1239 #endif /* Not EBCDIC */
1240 }
1241 #endif /* 8-bit support */
1242
1243 try_next = FALSE;
1244 break;
1245
1246 case OP_ANYNL:
1247 case OP_VSPACE:
1248 SET_BIT(CHAR_LF);
1249 SET_BIT(CHAR_VT);
1250 SET_BIT(CHAR_FF);
1251 SET_BIT(CHAR_CR);
1252
1253 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1254 the bits for NEL and for code units >= 255, independently of UTF. */
1255
1256 #if PCRE2_CODE_UNIT_WIDTH != 8
1257 SET_BIT(CHAR_NEL);
1258 SET_BIT(0xFF);
1259 #else
1260 /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1261 units of vertical space characters. */
1262
1263 #ifdef SUPPORT_UNICODE
1264 if (utf)
1265 {
1266 SET_BIT(0xC2); /* For U+0085 (NEL) */
1267 SET_BIT(0xE2); /* For U+2028, U+2029 */
1268 }
1269 else
1270 #endif
1271 /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1272 {
1273 SET_BIT(CHAR_NEL);
1274 }
1275 #endif /* 8-bit support */
1276
1277 try_next = FALSE;
1278 break;
1279
1280 /* Single character types set the bits and stop. Note that if PCRE2_UCP
1281 is set, we do not see these opcodes because \d etc are converted to
1282 properties. Therefore, these apply in the case when only characters less
1283 than 256 are recognized to match the types. */
1284
1285 case OP_NOT_DIGIT:
1286 set_nottype_bits(re, cbit_digit, table_limit);
1287 try_next = FALSE;
1288 break;
1289
1290 case OP_DIGIT:
1291 set_type_bits(re, cbit_digit, table_limit);
1292 try_next = FALSE;
1293 break;
1294
1295 case OP_NOT_WHITESPACE:
1296 set_nottype_bits(re, cbit_space, table_limit);
1297 try_next = FALSE;
1298 break;
1299
1300 case OP_WHITESPACE:
1301 set_type_bits(re, cbit_space, table_limit);
1302 try_next = FALSE;
1303 break;
1304
1305 case OP_NOT_WORDCHAR:
1306 set_nottype_bits(re, cbit_word, table_limit);
1307 try_next = FALSE;
1308 break;
1309
1310 case OP_WORDCHAR:
1311 set_type_bits(re, cbit_word, table_limit);
1312 try_next = FALSE;
1313 break;
1314
1315 /* One or more character type fudges the pointer and restarts, knowing
1316 it will hit a single character type and stop there. */
1317
1318 case OP_TYPEPLUS:
1319 case OP_TYPEMINPLUS:
1320 case OP_TYPEPOSPLUS:
1321 tcode++;
1322 break;
1323
1324 case OP_TYPEEXACT:
1325 tcode += 1 + IMM2_SIZE;
1326 break;
1327
1328 /* Zero or more repeats of character types set the bits and then
1329 try again. */
1330
1331 case OP_TYPEUPTO:
1332 case OP_TYPEMINUPTO:
1333 case OP_TYPEPOSUPTO:
1334 tcode += IMM2_SIZE; /* Fall through */
1335
1336 case OP_TYPESTAR:
1337 case OP_TYPEMINSTAR:
1338 case OP_TYPEPOSSTAR:
1339 case OP_TYPEQUERY:
1340 case OP_TYPEMINQUERY:
1341 case OP_TYPEPOSQUERY:
1342 switch(tcode[1])
1343 {
1344 default:
1345 case OP_ANY:
1346 case OP_ALLANY:
1347 return SSB_FAIL;
1348
1349 case OP_HSPACE:
1350 SET_BIT(CHAR_HT);
1351 SET_BIT(CHAR_SPACE);
1352
1353 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1354 the bits for 0xA0 and for code units >= 255, independently of UTF. */
1355
1356 #if PCRE2_CODE_UNIT_WIDTH != 8
1357 SET_BIT(0xA0);
1358 SET_BIT(0xFF);
1359 #else
1360 /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1361 units of horizontal space characters. */
1362
1363 #ifdef SUPPORT_UNICODE
1364 if (utf)
1365 {
1366 SET_BIT(0xC2); /* For U+00A0 */
1367 SET_BIT(0xE1); /* For U+1680, U+180E */
1368 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1369 SET_BIT(0xE3); /* For U+3000 */
1370 }
1371 else
1372 #endif
1373 /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1374 the code is EBCDIC. */
1375 {
1376 #ifndef EBCDIC
1377 SET_BIT(0xA0);
1378 #endif /* Not EBCDIC */
1379 }
1380 #endif /* 8-bit support */
1381 break;
1382
1383 case OP_ANYNL:
1384 case OP_VSPACE:
1385 SET_BIT(CHAR_LF);
1386 SET_BIT(CHAR_VT);
1387 SET_BIT(CHAR_FF);
1388 SET_BIT(CHAR_CR);
1389
1390 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1391 the bits for NEL and for code units >= 255, independently of UTF. */
1392
1393 #if PCRE2_CODE_UNIT_WIDTH != 8
1394 SET_BIT(CHAR_NEL);
1395 SET_BIT(0xFF);
1396 #else
1397 /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1398 units of vertical space characters. */
1399
1400 #ifdef SUPPORT_UNICODE
1401 if (utf)
1402 {
1403 SET_BIT(0xC2); /* For U+0085 (NEL) */
1404 SET_BIT(0xE2); /* For U+2028, U+2029 */
1405 }
1406 else
1407 #endif
1408 /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1409 {
1410 SET_BIT(CHAR_NEL);
1411 }
1412 #endif /* 8-bit support */
1413 break;
1414
1415 case OP_NOT_DIGIT:
1416 set_nottype_bits(re, cbit_digit, table_limit);
1417 break;
1418
1419 case OP_DIGIT:
1420 set_type_bits(re, cbit_digit, table_limit);
1421 break;
1422
1423 case OP_NOT_WHITESPACE:
1424 set_nottype_bits(re, cbit_space, table_limit);
1425 break;
1426
1427 case OP_WHITESPACE:
1428 set_type_bits(re, cbit_space, table_limit);
1429 break;
1430
1431 case OP_NOT_WORDCHAR:
1432 set_nottype_bits(re, cbit_word, table_limit);
1433 break;
1434
1435 case OP_WORDCHAR:
1436 set_type_bits(re, cbit_word, table_limit);
1437 break;
1438 }
1439
1440 tcode += 2;
1441 break;
1442
1443 /* Extended class: if there are any property checks, or if this is a
1444 negative XCLASS without a map, give up. If there are no property checks,
1445 there must be wide characters on the XCLASS list, because otherwise an
1446 XCLASS would not have been created. This means that code points >= 255
1447 are always potential starters. */
1448
1449 #ifdef SUPPORT_WIDE_CHARS
1450 case OP_XCLASS:
1451 if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0 ||
1452 (tcode[1 + LINK_SIZE] & (XCL_MAP|XCL_NOT)) == XCL_NOT)
1453 return SSB_FAIL;
1454
1455 /* We have a positive XCLASS or a negative one without a map. Set up the
1456 map pointer if there is one, and fall through. */
1457
1458 classmap = ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0)? NULL :
1459 (uint8_t *)(tcode + 1 + LINK_SIZE + 1);
1460 #endif
1461 /* It seems that the fall through comment must be outside the #ifdef if
1462 it is to avoid the gcc compiler warning. */
1463
1464 /* Fall through */
1465
1466 /* Enter here for a negative non-XCLASS. In the 8-bit library, if we are
1467 in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter
1468 because it starts a character with a value > 255. In 8-bit non-UTF mode,
1469 there is no difference between CLASS and NCLASS. In all other wide
1470 character modes, set the 0xFF bit to indicate code units >= 255. */
1471
1472 case OP_NCLASS:
1473 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1474 if (utf)
1475 {
1476 re->start_bitmap[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
1477 memset(re->start_bitmap+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
1478 }
1479 #elif PCRE2_CODE_UNIT_WIDTH != 8
1480 SET_BIT(0xFF); /* For characters >= 255 */
1481 #endif
1482 /* Fall through */
1483
1484 /* Enter here for a positive non-XCLASS. If we have fallen through from
1485 an XCLASS, classmap will already be set; just advance the code pointer.
1486 Otherwise, set up classmap for a a non-XCLASS and advance past it. */
1487
1488 case OP_CLASS:
1489 if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else
1490 {
1491 classmap = (uint8_t *)(++tcode);
1492 tcode += 32 / sizeof(PCRE2_UCHAR);
1493 }
1494
1495 /* When wide characters are supported, classmap may be NULL. In UTF-8
1496 (sic) mode, the bits in a class bit map correspond to character values,
1497 not to byte values. However, the bit map we are constructing is for byte
1498 values. So we have to do a conversion for characters whose code point is
1499 greater than 127. In fact, there are only two possible starting bytes for
1500 characters in the range 128 - 255. */
1501
1502 if (classmap != NULL)
1503 {
1504 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1505 if (utf)
1506 {
1507 for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c];
1508 for (c = 128; c < 256; c++)
1509 {
1510 if ((classmap[c/8] & (1u << (c&7))) != 0)
1511 {
1512 int d = (c >> 6) | 0xc0; /* Set bit for this starter */
1513 re->start_bitmap[d/8] |= (1u << (d&7)); /* and then skip on to the */
1514 c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
1515 }
1516 }
1517 }
1518 else
1519 #endif
1520 /* In all modes except UTF-8, the two bit maps are compatible. */
1521
1522 {
1523 for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c];
1524 }
1525 }
1526
1527 /* Act on what follows the class. For a zero minimum repeat, continue;
1528 otherwise stop processing. */
1529
1530 switch (*tcode)
1531 {
1532 case OP_CRSTAR:
1533 case OP_CRMINSTAR:
1534 case OP_CRQUERY:
1535 case OP_CRMINQUERY:
1536 case OP_CRPOSSTAR:
1537 case OP_CRPOSQUERY:
1538 tcode++;
1539 break;
1540
1541 case OP_CRRANGE:
1542 case OP_CRMINRANGE:
1543 case OP_CRPOSRANGE:
1544 if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1545 else try_next = FALSE;
1546 break;
1547
1548 default:
1549 try_next = FALSE;
1550 break;
1551 }
1552 break; /* End of class handling case */
1553 } /* End of switch for opcodes */
1554 } /* End of try_next loop */
1555
1556 code += GET(code, 1); /* Advance to next branch */
1557 }
1558 while (*code == OP_ALT);
1559
1560 return yield;
1561 }
1562
1563
1564
1565 /*************************************************
1566 * Study a compiled expression *
1567 *************************************************/
1568
1569 /* This function is handed a compiled expression that it must study to produce
1570 information that will speed up the matching.
1571
1572 Argument: points to the compiled expression
1573 Returns: 0 normally; non-zero should never normally occur
1574 1 unknown opcode in set_start_bits
1575 2 missing capturing bracket
1576 3 unknown opcode in find_minlength
1577 */
1578
1579 int
PRIV(study)1580 PRIV(study)(pcre2_real_code *re)
1581 {
1582 int min;
1583 int count = 0;
1584 PCRE2_UCHAR *code;
1585 BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
1586
1587 /* Find start of compiled code */
1588
1589 code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
1590 re->name_entry_size * re->name_count;
1591
1592 /* For a pattern that has a first code unit, or a multiline pattern that
1593 matches only at "line start", there is no point in seeking a list of starting
1594 code units. */
1595
1596 if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
1597 {
1598 int rc = set_start_bits(re, code, utf);
1599 if (rc == SSB_UNKNOWN) return 1;
1600 if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET;
1601 }
1602
1603 /* Find the minimum length of subject string. If the pattern can match an empty
1604 string, the minimum length is already known. If there are more back references
1605 than the size of the vector we are going to cache them in, do nothing. A
1606 pattern that complicated will probably take a long time to analyze and may in
1607 any case turn out to be too complicated. Note that back reference minima are
1608 held as 16-bit numbers. */
1609
1610 if ((re->flags & PCRE2_MATCH_EMPTY) == 0 &&
1611 re->top_backref <= MAX_CACHE_BACKREF)
1612 {
1613 int backref_cache[MAX_CACHE_BACKREF+1];
1614 backref_cache[0] = 0; /* Highest one that is set */
1615 min = find_minlength(re, code, code, utf, NULL, &count, backref_cache);
1616 switch(min)
1617 {
1618 case -1: /* \C in UTF mode or (*ACCEPT) or over-complex regex */
1619 break; /* Leave minlength unchanged (will be zero) */
1620
1621 case -2:
1622 return 2; /* missing capturing bracket */
1623
1624 case -3:
1625 return 3; /* unrecognized opcode */
1626
1627 default:
1628 if (min > UINT16_MAX) min = UINT16_MAX;
1629 re->minlength = min;
1630 break;
1631 }
1632 }
1633
1634 return 0;
1635 }
1636
1637 /* End of pcre2_study.c */
1638