1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2018 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41 /* This module contains functions for scanning a compiled pattern and
42 collecting data (e.g. minimum matching length). */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #include "pcre2_internal.h"
50
51 /* The maximum remembered capturing brackets minimum. */
52
53 #define MAX_CACHE_BACKREF 128
54
55 /* Set a bit in the starting code unit bit map. */
56
57 #define SET_BIT(c) re->start_bitmap[(c)/8] |= (1 << ((c)&7))
58
59 /* Returns from set_start_bits() */
60
61 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
62
63
64 /*************************************************
65 * Find the minimum subject length for a group *
66 *************************************************/
67
68 /* Scan a parenthesized group and compute the minimum length of subject that
69 is needed to match it. This is a lower bound; it does not mean there is a
70 string of that length that matches. In UTF mode, the result is in characters
71 rather than code units. The field in a compiled pattern for storing the minimum
72 length is 16-bits long (on the grounds that anything longer than that is
73 pathological), so we give up when we reach that amount. This also means that
74 integer overflow for really crazy patterns cannot happen.
75
76 Backreference minimum lengths are cached to speed up multiple references. This
77 function is called only when the highest back reference in the pattern is less
78 than or equal to MAX_CACHE_BACKREF, which is one less than the size of the
79 caching vector. The zeroth element contains the number of the highest set
80 value.
81
82 Arguments:
83 re compiled pattern block
84 code pointer to start of group (the bracket)
85 startcode pointer to start of the whole pattern's code
86 utf UTF flag
87 recurses chain of recurse_check to catch mutual recursion
88 countptr pointer to call count (to catch over complexity)
89 backref_cache vector for caching back references.
90
91 Returns: the minimum length
92 -1 \C in UTF-8 mode
93 or (*ACCEPT)
94 or pattern too complicated
95 or back reference to duplicate name/number
96 -2 internal error (missing capturing bracket)
97 -3 internal error (opcode not listed)
98 */
99
100 static int
find_minlength(const pcre2_real_code * re,PCRE2_SPTR code,PCRE2_SPTR startcode,BOOL utf,recurse_check * recurses,int * countptr,int * backref_cache)101 find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
102 PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr,
103 int *backref_cache)
104 {
105 int length = -1;
106 int prev_cap_recno = -1;
107 int prev_cap_d = 0;
108 int prev_recurse_recno = -1;
109 int prev_recurse_d = 0;
110 uint32_t once_fudge = 0;
111 BOOL had_recurse = FALSE;
112 BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
113 recurse_check this_recurse;
114 int branchlength = 0;
115 PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
116
117 /* If this is a "could be empty" group, its minimum length is 0. */
118
119 if (*code >= OP_SBRA && *code <= OP_SCOND) return 0;
120
121 /* Skip over capturing bracket number */
122
123 if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE;
124
125 /* A large and/or complex regex can take too long to process. */
126
127 if ((*countptr)++ > 1000) return -1;
128
129 /* Scan along the opcodes for this branch. If we get to the end of the branch,
130 check the length against that of the other branches. If the accumulated length
131 passes 16-bits, stop. */
132
133 for (;;)
134 {
135 int d, min, recno;
136 PCRE2_UCHAR *cs, *ce;
137 PCRE2_UCHAR op = *cc;
138
139 if (branchlength >= UINT16_MAX) return UINT16_MAX;
140
141 switch (op)
142 {
143 case OP_COND:
144 case OP_SCOND:
145
146 /* If there is only one branch in a condition, the implied branch has zero
147 length, so we don't add anything. This covers the DEFINE "condition"
148 automatically. If there are two branches we can treat it the same as any
149 other non-capturing subpattern. */
150
151 cs = cc + GET(cc, 1);
152 if (*cs != OP_ALT)
153 {
154 cc = cs + 1 + LINK_SIZE;
155 break;
156 }
157 goto PROCESS_NON_CAPTURE;
158
159 case OP_BRA:
160 /* There's a special case of OP_BRA, when it is wrapped round a repeated
161 OP_RECURSE. We'd like to process the latter at this level so that
162 remembering the value works for repeated cases. So we do nothing, but
163 set a fudge value to skip over the OP_KET after the recurse. */
164
165 if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
166 {
167 once_fudge = 1 + LINK_SIZE;
168 cc += 1 + LINK_SIZE;
169 break;
170 }
171 /* Fall through */
172
173 case OP_ONCE:
174 case OP_SBRA:
175 case OP_BRAPOS:
176 case OP_SBRAPOS:
177 PROCESS_NON_CAPTURE:
178 d = find_minlength(re, cc, startcode, utf, recurses, countptr,
179 backref_cache);
180 if (d < 0) return d;
181 branchlength += d;
182 do cc += GET(cc, 1); while (*cc == OP_ALT);
183 cc += 1 + LINK_SIZE;
184 break;
185
186 /* To save time for repeated capturing subpatterns, we remember the
187 length of the previous one. Unfortunately we can't do the same for
188 the unnumbered ones above. Nor can we do this if (?| is present in the
189 pattern because captures with the same number are not then identical. */
190
191 case OP_CBRA:
192 case OP_SCBRA:
193 case OP_CBRAPOS:
194 case OP_SCBRAPOS:
195 recno = (int)GET2(cc, 1+LINK_SIZE);
196 if (dupcapused || recno != prev_cap_recno)
197 {
198 prev_cap_recno = recno;
199 prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr,
200 backref_cache);
201 if (prev_cap_d < 0) return prev_cap_d;
202 }
203 branchlength += prev_cap_d;
204 do cc += GET(cc, 1); while (*cc == OP_ALT);
205 cc += 1 + LINK_SIZE;
206 break;
207
208 /* ACCEPT makes things far too complicated; we have to give up. */
209
210 case OP_ACCEPT:
211 case OP_ASSERT_ACCEPT:
212 return -1;
213
214 /* Reached end of a branch; if it's a ket it is the end of a nested
215 call. If it's ALT it is an alternation in a nested call. If it is END it's
216 the end of the outer call. All can be handled by the same code. If an
217 ACCEPT was previously encountered, use the length that was in force at that
218 time, and pass back the shortest ACCEPT length. */
219
220 case OP_ALT:
221 case OP_KET:
222 case OP_KETRMAX:
223 case OP_KETRMIN:
224 case OP_KETRPOS:
225 case OP_END:
226 if (length < 0 || (!had_recurse && branchlength < length))
227 length = branchlength;
228 if (op != OP_ALT) return length;
229 cc += 1 + LINK_SIZE;
230 branchlength = 0;
231 had_recurse = FALSE;
232 break;
233
234 /* Skip over assertive subpatterns */
235
236 case OP_ASSERT:
237 case OP_ASSERT_NOT:
238 case OP_ASSERTBACK:
239 case OP_ASSERTBACK_NOT:
240 do cc += GET(cc, 1); while (*cc == OP_ALT);
241 /* Fall through */
242
243 /* Skip over things that don't match chars */
244
245 case OP_REVERSE:
246 case OP_CREF:
247 case OP_DNCREF:
248 case OP_RREF:
249 case OP_DNRREF:
250 case OP_FALSE:
251 case OP_TRUE:
252 case OP_CALLOUT:
253 case OP_SOD:
254 case OP_SOM:
255 case OP_EOD:
256 case OP_EODN:
257 case OP_CIRC:
258 case OP_CIRCM:
259 case OP_DOLL:
260 case OP_DOLLM:
261 case OP_NOT_WORD_BOUNDARY:
262 case OP_WORD_BOUNDARY:
263 cc += PRIV(OP_lengths)[*cc];
264 break;
265
266 case OP_CALLOUT_STR:
267 cc += GET(cc, 1 + 2*LINK_SIZE);
268 break;
269
270 /* Skip over a subpattern that has a {0} or {0,x} quantifier */
271
272 case OP_BRAZERO:
273 case OP_BRAMINZERO:
274 case OP_BRAPOSZERO:
275 case OP_SKIPZERO:
276 cc += PRIV(OP_lengths)[*cc];
277 do cc += GET(cc, 1); while (*cc == OP_ALT);
278 cc += 1 + LINK_SIZE;
279 break;
280
281 /* Handle literal characters and + repetitions */
282
283 case OP_CHAR:
284 case OP_CHARI:
285 case OP_NOT:
286 case OP_NOTI:
287 case OP_PLUS:
288 case OP_PLUSI:
289 case OP_MINPLUS:
290 case OP_MINPLUSI:
291 case OP_POSPLUS:
292 case OP_POSPLUSI:
293 case OP_NOTPLUS:
294 case OP_NOTPLUSI:
295 case OP_NOTMINPLUS:
296 case OP_NOTMINPLUSI:
297 case OP_NOTPOSPLUS:
298 case OP_NOTPOSPLUSI:
299 branchlength++;
300 cc += 2;
301 #ifdef SUPPORT_UNICODE
302 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
303 #endif
304 break;
305
306 case OP_TYPEPLUS:
307 case OP_TYPEMINPLUS:
308 case OP_TYPEPOSPLUS:
309 branchlength++;
310 cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
311 break;
312
313 /* Handle exact repetitions. The count is already in characters, but we
314 may need to skip over a multibyte character in UTF mode. */
315
316 case OP_EXACT:
317 case OP_EXACTI:
318 case OP_NOTEXACT:
319 case OP_NOTEXACTI:
320 branchlength += GET2(cc,1);
321 cc += 2 + IMM2_SIZE;
322 #ifdef SUPPORT_UNICODE
323 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
324 #endif
325 break;
326
327 case OP_TYPEEXACT:
328 branchlength += GET2(cc,1);
329 cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
330 || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
331 break;
332
333 /* Handle single-char non-literal matchers */
334
335 case OP_PROP:
336 case OP_NOTPROP:
337 cc += 2;
338 /* Fall through */
339
340 case OP_NOT_DIGIT:
341 case OP_DIGIT:
342 case OP_NOT_WHITESPACE:
343 case OP_WHITESPACE:
344 case OP_NOT_WORDCHAR:
345 case OP_WORDCHAR:
346 case OP_ANY:
347 case OP_ALLANY:
348 case OP_EXTUNI:
349 case OP_HSPACE:
350 case OP_NOT_HSPACE:
351 case OP_VSPACE:
352 case OP_NOT_VSPACE:
353 branchlength++;
354 cc++;
355 break;
356
357 /* "Any newline" might match two characters, but it also might match just
358 one. */
359
360 case OP_ANYNL:
361 branchlength += 1;
362 cc++;
363 break;
364
365 /* The single-byte matcher means we can't proceed in UTF mode. (In
366 non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever
367 appear, but leave the code, just in case.) */
368
369 case OP_ANYBYTE:
370 #ifdef SUPPORT_UNICODE
371 if (utf) return -1;
372 #endif
373 branchlength++;
374 cc++;
375 break;
376
377 /* For repeated character types, we have to test for \p and \P, which have
378 an extra two bytes of parameters. */
379
380 case OP_TYPESTAR:
381 case OP_TYPEMINSTAR:
382 case OP_TYPEQUERY:
383 case OP_TYPEMINQUERY:
384 case OP_TYPEPOSSTAR:
385 case OP_TYPEPOSQUERY:
386 if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
387 cc += PRIV(OP_lengths)[op];
388 break;
389
390 case OP_TYPEUPTO:
391 case OP_TYPEMINUPTO:
392 case OP_TYPEPOSUPTO:
393 if (cc[1 + IMM2_SIZE] == OP_PROP
394 || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
395 cc += PRIV(OP_lengths)[op];
396 break;
397
398 /* Check a class for variable quantification */
399
400 case OP_CLASS:
401 case OP_NCLASS:
402 #ifdef SUPPORT_WIDE_CHARS
403 case OP_XCLASS:
404 /* The original code caused an unsigned overflow in 64 bit systems,
405 so now we use a conditional statement. */
406 if (op == OP_XCLASS)
407 cc += GET(cc, 1);
408 else
409 cc += PRIV(OP_lengths)[OP_CLASS];
410 #else
411 cc += PRIV(OP_lengths)[OP_CLASS];
412 #endif
413
414 switch (*cc)
415 {
416 case OP_CRPLUS:
417 case OP_CRMINPLUS:
418 case OP_CRPOSPLUS:
419 branchlength++;
420 /* Fall through */
421
422 case OP_CRSTAR:
423 case OP_CRMINSTAR:
424 case OP_CRQUERY:
425 case OP_CRMINQUERY:
426 case OP_CRPOSSTAR:
427 case OP_CRPOSQUERY:
428 cc++;
429 break;
430
431 case OP_CRRANGE:
432 case OP_CRMINRANGE:
433 case OP_CRPOSRANGE:
434 branchlength += GET2(cc,1);
435 cc += 1 + 2 * IMM2_SIZE;
436 break;
437
438 default:
439 branchlength++;
440 break;
441 }
442 break;
443
444 /* Backreferences and subroutine calls (OP_RECURSE) are treated in the same
445 way: we find the minimum length for the subpattern. A recursion
446 (backreference or subroutine) causes an a flag to be set that causes the
447 length of this branch to be ignored. The logic is that a recursion can only
448 make sense if there is another alternative that stops the recursing. That
449 will provide the minimum length (when no recursion happens).
450
451 If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
452 matches an empty string (by default it causes a matching failure), so in
453 that case we must set the minimum length to zero. */
454
455 /* Duplicate named pattern back reference. We cannot reliably find a length
456 for this if duplicate numbers are present in the pattern. */
457
458 case OP_DNREF:
459 case OP_DNREFI:
460 if (dupcapused) return -1;
461 if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
462 {
463 int count = GET2(cc, 1+IMM2_SIZE);
464 PCRE2_UCHAR *slot =
465 (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
466 GET2(cc, 1) * re->name_entry_size;
467
468 d = INT_MAX;
469
470 /* Scan all groups with the same name; find the shortest. */
471
472 while (count-- > 0)
473 {
474 int dd, i;
475 recno = GET2(slot, 0);
476
477 if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
478 dd = backref_cache[recno];
479 else
480 {
481 ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
482 if (cs == NULL) return -2;
483 do ce += GET(ce, 1); while (*ce == OP_ALT);
484 if (cc > cs && cc < ce) /* Simple recursion */
485 {
486 dd = 0;
487 had_recurse = TRUE;
488 }
489 else
490 {
491 recurse_check *r = recurses;
492 for (r = recurses; r != NULL; r = r->prev)
493 if (r->group == cs) break;
494 if (r != NULL) /* Mutual recursion */
495 {
496 dd = 0;
497 had_recurse = TRUE;
498 }
499 else
500 {
501 this_recurse.prev = recurses;
502 this_recurse.group = cs;
503 dd = find_minlength(re, cs, startcode, utf, &this_recurse,
504 countptr, backref_cache);
505 if (dd < 0) return dd;
506 }
507 }
508
509 backref_cache[recno] = dd;
510 for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
511 backref_cache[0] = recno;
512 }
513
514 if (dd < d) d = dd;
515 if (d <= 0) break; /* No point looking at any more */
516 slot += re->name_entry_size;
517 }
518 }
519 else d = 0;
520 cc += 1 + 2*IMM2_SIZE;
521 goto REPEAT_BACK_REFERENCE;
522
523 /* Single back reference. We cannot find a length for this if duplicate
524 numbers are present in the pattern. */
525
526 case OP_REF:
527 case OP_REFI:
528 if (dupcapused) return -1;
529 recno = GET2(cc, 1);
530 if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
531 d = backref_cache[recno];
532 else
533 {
534 int i;
535 if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
536 {
537 ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
538 if (cs == NULL) return -2;
539 do ce += GET(ce, 1); while (*ce == OP_ALT);
540 if (cc > cs && cc < ce) /* Simple recursion */
541 {
542 d = 0;
543 had_recurse = TRUE;
544 }
545 else
546 {
547 recurse_check *r = recurses;
548 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
549 if (r != NULL) /* Mutual recursion */
550 {
551 d = 0;
552 had_recurse = TRUE;
553 }
554 else
555 {
556 this_recurse.prev = recurses;
557 this_recurse.group = cs;
558 d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
559 backref_cache);
560 if (d < 0) return d;
561 }
562 }
563 }
564 else d = 0;
565
566 backref_cache[recno] = d;
567 for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
568 backref_cache[0] = recno;
569 }
570
571 cc += 1 + IMM2_SIZE;
572
573 /* Handle repeated back references */
574
575 REPEAT_BACK_REFERENCE:
576 switch (*cc)
577 {
578 case OP_CRSTAR:
579 case OP_CRMINSTAR:
580 case OP_CRQUERY:
581 case OP_CRMINQUERY:
582 case OP_CRPOSSTAR:
583 case OP_CRPOSQUERY:
584 min = 0;
585 cc++;
586 break;
587
588 case OP_CRPLUS:
589 case OP_CRMINPLUS:
590 case OP_CRPOSPLUS:
591 min = 1;
592 cc++;
593 break;
594
595 case OP_CRRANGE:
596 case OP_CRMINRANGE:
597 case OP_CRPOSRANGE:
598 min = GET2(cc, 1);
599 cc += 1 + 2 * IMM2_SIZE;
600 break;
601
602 default:
603 min = 1;
604 break;
605 }
606
607 /* Take care not to overflow: (1) min and d are ints, so check that their
608 product is not greater than INT_MAX. (2) branchlength is limited to
609 UINT16_MAX (checked at the top of the loop). */
610
611 if ((d > 0 && (INT_MAX/d) < min) || UINT16_MAX - branchlength < min*d)
612 branchlength = UINT16_MAX;
613 else branchlength += min * d;
614 break;
615
616 /* Recursion always refers to the first occurrence of a subpattern with a
617 given number. Therefore, we can always make use of caching, even when the
618 pattern contains multiple subpatterns with the same number. */
619
620 case OP_RECURSE:
621 cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
622 recno = GET2(cs, 1+LINK_SIZE);
623 if (recno == prev_recurse_recno)
624 {
625 branchlength += prev_recurse_d;
626 }
627 else
628 {
629 do ce += GET(ce, 1); while (*ce == OP_ALT);
630 if (cc > cs && cc < ce) /* Simple recursion */
631 had_recurse = TRUE;
632 else
633 {
634 recurse_check *r = recurses;
635 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
636 if (r != NULL) /* Mutual recursion */
637 had_recurse = TRUE;
638 else
639 {
640 this_recurse.prev = recurses;
641 this_recurse.group = cs;
642 prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
643 countptr, backref_cache);
644 if (prev_recurse_d < 0) return prev_recurse_d;
645 prev_recurse_recno = recno;
646 branchlength += prev_recurse_d;
647 }
648 }
649 }
650 cc += 1 + LINK_SIZE + once_fudge;
651 once_fudge = 0;
652 break;
653
654 /* Anything else does not or need not match a character. We can get the
655 item's length from the table, but for those that can match zero occurrences
656 of a character, we must take special action for UTF-8 characters. As it
657 happens, the "NOT" versions of these opcodes are used at present only for
658 ASCII characters, so they could be omitted from this list. However, in
659 future that may change, so we include them here so as not to leave a
660 gotcha for a future maintainer. */
661
662 case OP_UPTO:
663 case OP_UPTOI:
664 case OP_NOTUPTO:
665 case OP_NOTUPTOI:
666 case OP_MINUPTO:
667 case OP_MINUPTOI:
668 case OP_NOTMINUPTO:
669 case OP_NOTMINUPTOI:
670 case OP_POSUPTO:
671 case OP_POSUPTOI:
672 case OP_NOTPOSUPTO:
673 case OP_NOTPOSUPTOI:
674
675 case OP_STAR:
676 case OP_STARI:
677 case OP_NOTSTAR:
678 case OP_NOTSTARI:
679 case OP_MINSTAR:
680 case OP_MINSTARI:
681 case OP_NOTMINSTAR:
682 case OP_NOTMINSTARI:
683 case OP_POSSTAR:
684 case OP_POSSTARI:
685 case OP_NOTPOSSTAR:
686 case OP_NOTPOSSTARI:
687
688 case OP_QUERY:
689 case OP_QUERYI:
690 case OP_NOTQUERY:
691 case OP_NOTQUERYI:
692 case OP_MINQUERY:
693 case OP_MINQUERYI:
694 case OP_NOTMINQUERY:
695 case OP_NOTMINQUERYI:
696 case OP_POSQUERY:
697 case OP_POSQUERYI:
698 case OP_NOTPOSQUERY:
699 case OP_NOTPOSQUERYI:
700
701 cc += PRIV(OP_lengths)[op];
702 #ifdef SUPPORT_UNICODE
703 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
704 #endif
705 break;
706
707 /* Skip these, but we need to add in the name length. */
708
709 case OP_MARK:
710 case OP_COMMIT_ARG:
711 case OP_PRUNE_ARG:
712 case OP_SKIP_ARG:
713 case OP_THEN_ARG:
714 cc += PRIV(OP_lengths)[op] + cc[1];
715 break;
716
717 /* The remaining opcodes are just skipped over. */
718
719 case OP_CLOSE:
720 case OP_COMMIT:
721 case OP_FAIL:
722 case OP_PRUNE:
723 case OP_SET_SOM:
724 case OP_SKIP:
725 case OP_THEN:
726 cc += PRIV(OP_lengths)[op];
727 break;
728
729 /* This should not occur: we list all opcodes explicitly so that when
730 new ones get added they are properly considered. */
731
732 default:
733 return -3;
734 }
735 }
736 /* Control never gets here */
737 }
738
739
740
741 /*************************************************
742 * Set a bit and maybe its alternate case *
743 *************************************************/
744
745 /* Given a character, set its first code unit's bit in the table, and also the
746 corresponding bit for the other version of a letter if we are caseless.
747
748 Arguments:
749 re points to the regex block
750 p points to the first code unit of the character
751 caseless TRUE if caseless
752 utf TRUE for UTF mode
753
754 Returns: pointer after the character
755 */
756
757 static PCRE2_SPTR
set_table_bit(pcre2_real_code * re,PCRE2_SPTR p,BOOL caseless,BOOL utf)758 set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf)
759 {
760 uint32_t c = *p++; /* First code unit */
761 (void)utf; /* Stop compiler warning when UTF not supported */
762
763 /* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
764 0xff. */
765
766 #if PCRE2_CODE_UNIT_WIDTH != 8
767 if (c > 0xff) SET_BIT(0xff); else
768 #endif
769
770 SET_BIT(c);
771
772 /* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find
773 the end of the character, even when caseless. */
774
775 #ifdef SUPPORT_UNICODE
776 if (utf)
777 {
778 #if PCRE2_CODE_UNIT_WIDTH == 8
779 if (c >= 0xc0) GETUTF8INC(c, p);
780 #elif PCRE2_CODE_UNIT_WIDTH == 16
781 if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p);
782 #endif
783 }
784 #endif /* SUPPORT_UNICODE */
785
786 /* If caseless, handle the other case of the character. */
787
788 if (caseless)
789 {
790 #ifdef SUPPORT_UNICODE
791 if (utf)
792 {
793 #if PCRE2_CODE_UNIT_WIDTH == 8
794 PCRE2_UCHAR buff[6];
795 c = UCD_OTHERCASE(c);
796 (void)PRIV(ord2utf)(c, buff);
797 SET_BIT(buff[0]);
798 #else /* 16-bit or 32-bit mode */
799 c = UCD_OTHERCASE(c);
800 if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
801 #endif
802 }
803 else
804 #endif /* SUPPORT_UNICODE */
805
806 /* Not UTF */
807
808 if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
809 }
810
811 return p;
812 }
813
814
815
816 /*************************************************
817 * Set bits for a positive character type *
818 *************************************************/
819
820 /* This function sets starting bits for a character type. In UTF-8 mode, we can
821 only do a direct setting for bytes less than 128, as otherwise there can be
822 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
823 environment, the tables will only recognize ASCII characters anyway, but in at
824 least one Windows environment, some higher bytes bits were set in the tables.
825 So we deal with that case by considering the UTF-8 encoding.
826
827 Arguments:
828 re the regex block
829 cbit type the type of character wanted
830 table_limit 32 for non-UTF-8; 16 for UTF-8
831
832 Returns: nothing
833 */
834
835 static void
set_type_bits(pcre2_real_code * re,int cbit_type,unsigned int table_limit)836 set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
837 {
838 uint32_t c;
839 for (c = 0; c < table_limit; c++)
840 re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type];
841 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
842 if (table_limit == 32) return;
843 for (c = 128; c < 256; c++)
844 {
845 if ((re->tables[cbits_offset + c/8] & (1 << (c&7))) != 0)
846 {
847 PCRE2_UCHAR buff[6];
848 (void)PRIV(ord2utf)(c, buff);
849 SET_BIT(buff[0]);
850 }
851 }
852 #endif /* UTF-8 */
853 }
854
855
856 /*************************************************
857 * Set bits for a negative character type *
858 *************************************************/
859
860 /* This function sets starting bits for a negative character type such as \D.
861 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
862 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
863 Unlike in the positive case, where we can set appropriate starting bits for
864 specific high-valued UTF-8 characters, in this case we have to set the bits for
865 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
866 0xc0 (192) for simplicity.
867
868 Arguments:
869 re the regex block
870 cbit type the type of character wanted
871 table_limit 32 for non-UTF-8; 16 for UTF-8
872
873 Returns: nothing
874 */
875
876 static void
set_nottype_bits(pcre2_real_code * re,int cbit_type,unsigned int table_limit)877 set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
878 {
879 uint32_t c;
880 for (c = 0; c < table_limit; c++)
881 re->start_bitmap[c] |= ~(re->tables[c+cbits_offset+cbit_type]);
882 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
883 if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
884 #endif
885 }
886
887
888
889 /*************************************************
890 * Create bitmap of starting bytes *
891 *************************************************/
892
893 /* This function scans a compiled unanchored expression recursively and
894 attempts to build a bitmap of the set of possible starting code units whose
895 values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause
896 the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode
897 we pass a value of 16 rather than 32 as the final argument. (See comments in
898 those functions for the reason.)
899
900 The SSB_CONTINUE return is useful for parenthesized groups in patterns such as
901 (a*)b where the group provides some optional starting code units but scanning
902 must continue at the outer level to find at least one mandatory code unit. At
903 the outermost level, this function fails unless the result is SSB_DONE.
904
905 Arguments:
906 re points to the compiled regex block
907 code points to an expression
908 utf TRUE if in UTF mode
909
910 Returns: SSB_FAIL => Failed to find any starting code units
911 SSB_DONE => Found mandatory starting code units
912 SSB_CONTINUE => Found optional starting code units
913 SSB_UNKNOWN => Hit an unrecognized opcode
914 */
915
916 static int
set_start_bits(pcre2_real_code * re,PCRE2_SPTR code,BOOL utf)917 set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf)
918 {
919 uint32_t c;
920 int yield = SSB_DONE;
921
922 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
923 int table_limit = utf? 16:32;
924 #else
925 int table_limit = 32;
926 #endif
927
928 do
929 {
930 BOOL try_next = TRUE;
931 PCRE2_SPTR tcode = code + 1 + LINK_SIZE;
932
933 if (*code == OP_CBRA || *code == OP_SCBRA ||
934 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
935
936 while (try_next) /* Loop for items in this branch */
937 {
938 int rc;
939 uint8_t *classmap = NULL;
940
941 switch(*tcode)
942 {
943 /* If we reach something we don't understand, it means a new opcode has
944 been created that hasn't been added to this function. Hopefully this
945 problem will be discovered during testing. */
946
947 default:
948 return SSB_UNKNOWN;
949
950 /* Fail for a valid opcode that implies no starting bits. */
951
952 case OP_ACCEPT:
953 case OP_ASSERT_ACCEPT:
954 case OP_ALLANY:
955 case OP_ANY:
956 case OP_ANYBYTE:
957 case OP_CIRCM:
958 case OP_CLOSE:
959 case OP_COMMIT:
960 case OP_COMMIT_ARG:
961 case OP_COND:
962 case OP_CREF:
963 case OP_FALSE:
964 case OP_TRUE:
965 case OP_DNCREF:
966 case OP_DNREF:
967 case OP_DNREFI:
968 case OP_DNRREF:
969 case OP_DOLL:
970 case OP_DOLLM:
971 case OP_END:
972 case OP_EOD:
973 case OP_EODN:
974 case OP_EXTUNI:
975 case OP_FAIL:
976 case OP_MARK:
977 case OP_NOT:
978 case OP_NOTEXACT:
979 case OP_NOTEXACTI:
980 case OP_NOTI:
981 case OP_NOTMINPLUS:
982 case OP_NOTMINPLUSI:
983 case OP_NOTMINQUERY:
984 case OP_NOTMINQUERYI:
985 case OP_NOTMINSTAR:
986 case OP_NOTMINSTARI:
987 case OP_NOTMINUPTO:
988 case OP_NOTMINUPTOI:
989 case OP_NOTPLUS:
990 case OP_NOTPLUSI:
991 case OP_NOTPOSPLUS:
992 case OP_NOTPOSPLUSI:
993 case OP_NOTPOSQUERY:
994 case OP_NOTPOSQUERYI:
995 case OP_NOTPOSSTAR:
996 case OP_NOTPOSSTARI:
997 case OP_NOTPOSUPTO:
998 case OP_NOTPOSUPTOI:
999 case OP_NOTPROP:
1000 case OP_NOTQUERY:
1001 case OP_NOTQUERYI:
1002 case OP_NOTSTAR:
1003 case OP_NOTSTARI:
1004 case OP_NOTUPTO:
1005 case OP_NOTUPTOI:
1006 case OP_NOT_HSPACE:
1007 case OP_NOT_VSPACE:
1008 case OP_PRUNE:
1009 case OP_PRUNE_ARG:
1010 case OP_RECURSE:
1011 case OP_REF:
1012 case OP_REFI:
1013 case OP_REVERSE:
1014 case OP_RREF:
1015 case OP_SCOND:
1016 case OP_SET_SOM:
1017 case OP_SKIP:
1018 case OP_SKIP_ARG:
1019 case OP_SOD:
1020 case OP_SOM:
1021 case OP_THEN:
1022 case OP_THEN_ARG:
1023 return SSB_FAIL;
1024
1025 /* OP_CIRC happens only at the start of an anchored branch (multiline ^
1026 uses OP_CIRCM). Skip over it. */
1027
1028 case OP_CIRC:
1029 tcode += PRIV(OP_lengths)[OP_CIRC];
1030 break;
1031
1032 /* A "real" property test implies no starting bits, but the fake property
1033 PT_CLIST identifies a list of characters. These lists are short, as they
1034 are used for characters with more than one "other case", so there is no
1035 point in recognizing them for OP_NOTPROP. */
1036
1037 case OP_PROP:
1038 if (tcode[1] != PT_CLIST) return SSB_FAIL;
1039 {
1040 const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2];
1041 while ((c = *p++) < NOTACHAR)
1042 {
1043 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1044 if (utf)
1045 {
1046 PCRE2_UCHAR buff[6];
1047 (void)PRIV(ord2utf)(c, buff);
1048 c = buff[0];
1049 }
1050 #endif
1051 if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
1052 }
1053 }
1054 try_next = FALSE;
1055 break;
1056
1057 /* We can ignore word boundary tests. */
1058
1059 case OP_WORD_BOUNDARY:
1060 case OP_NOT_WORD_BOUNDARY:
1061 tcode++;
1062 break;
1063
1064 /* If we hit a bracket or a positive lookahead assertion, recurse to set
1065 bits from within the subpattern. If it can't find anything, we have to
1066 give up. If it finds some mandatory character(s), we are done for this
1067 branch. Otherwise, carry on scanning after the subpattern. */
1068
1069 case OP_BRA:
1070 case OP_SBRA:
1071 case OP_CBRA:
1072 case OP_SCBRA:
1073 case OP_BRAPOS:
1074 case OP_SBRAPOS:
1075 case OP_CBRAPOS:
1076 case OP_SCBRAPOS:
1077 case OP_ONCE:
1078 case OP_ASSERT:
1079 rc = set_start_bits(re, tcode, utf);
1080 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
1081 if (rc == SSB_DONE) try_next = FALSE; else
1082 {
1083 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1084 tcode += 1 + LINK_SIZE;
1085 }
1086 break;
1087
1088 /* If we hit ALT or KET, it means we haven't found anything mandatory in
1089 this branch, though we might have found something optional. For ALT, we
1090 continue with the next alternative, but we have to arrange that the final
1091 result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
1092 return SSB_CONTINUE: if this is the top level, that indicates failure,
1093 but after a nested subpattern, it causes scanning to continue. */
1094
1095 case OP_ALT:
1096 yield = SSB_CONTINUE;
1097 try_next = FALSE;
1098 break;
1099
1100 case OP_KET:
1101 case OP_KETRMAX:
1102 case OP_KETRMIN:
1103 case OP_KETRPOS:
1104 return SSB_CONTINUE;
1105
1106 /* Skip over callout */
1107
1108 case OP_CALLOUT:
1109 tcode += PRIV(OP_lengths)[OP_CALLOUT];
1110 break;
1111
1112 case OP_CALLOUT_STR:
1113 tcode += GET(tcode, 1 + 2*LINK_SIZE);
1114 break;
1115
1116 /* Skip over lookbehind and negative lookahead assertions */
1117
1118 case OP_ASSERT_NOT:
1119 case OP_ASSERTBACK:
1120 case OP_ASSERTBACK_NOT:
1121 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1122 tcode += 1 + LINK_SIZE;
1123 break;
1124
1125 /* BRAZERO does the bracket, but carries on. */
1126
1127 case OP_BRAZERO:
1128 case OP_BRAMINZERO:
1129 case OP_BRAPOSZERO:
1130 rc = set_start_bits(re, ++tcode, utf);
1131 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
1132 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1133 tcode += 1 + LINK_SIZE;
1134 break;
1135
1136 /* SKIPZERO skips the bracket. */
1137
1138 case OP_SKIPZERO:
1139 tcode++;
1140 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1141 tcode += 1 + LINK_SIZE;
1142 break;
1143
1144 /* Single-char * or ? sets the bit and tries the next item */
1145
1146 case OP_STAR:
1147 case OP_MINSTAR:
1148 case OP_POSSTAR:
1149 case OP_QUERY:
1150 case OP_MINQUERY:
1151 case OP_POSQUERY:
1152 tcode = set_table_bit(re, tcode + 1, FALSE, utf);
1153 break;
1154
1155 case OP_STARI:
1156 case OP_MINSTARI:
1157 case OP_POSSTARI:
1158 case OP_QUERYI:
1159 case OP_MINQUERYI:
1160 case OP_POSQUERYI:
1161 tcode = set_table_bit(re, tcode + 1, TRUE, utf);
1162 break;
1163
1164 /* Single-char upto sets the bit and tries the next */
1165
1166 case OP_UPTO:
1167 case OP_MINUPTO:
1168 case OP_POSUPTO:
1169 tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf);
1170 break;
1171
1172 case OP_UPTOI:
1173 case OP_MINUPTOI:
1174 case OP_POSUPTOI:
1175 tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf);
1176 break;
1177
1178 /* At least one single char sets the bit and stops */
1179
1180 case OP_EXACT:
1181 tcode += IMM2_SIZE;
1182 /* Fall through */
1183 case OP_CHAR:
1184 case OP_PLUS:
1185 case OP_MINPLUS:
1186 case OP_POSPLUS:
1187 (void)set_table_bit(re, tcode + 1, FALSE, utf);
1188 try_next = FALSE;
1189 break;
1190
1191 case OP_EXACTI:
1192 tcode += IMM2_SIZE;
1193 /* Fall through */
1194 case OP_CHARI:
1195 case OP_PLUSI:
1196 case OP_MINPLUSI:
1197 case OP_POSPLUSI:
1198 (void)set_table_bit(re, tcode + 1, TRUE, utf);
1199 try_next = FALSE;
1200 break;
1201
1202 /* Special spacing and line-terminating items. These recognize specific
1203 lists of characters. The difference between VSPACE and ANYNL is that the
1204 latter can match the two-character CRLF sequence, but that is not
1205 relevant for finding the first character, so their code here is
1206 identical. */
1207
1208 case OP_HSPACE:
1209 SET_BIT(CHAR_HT);
1210 SET_BIT(CHAR_SPACE);
1211
1212 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1213 the bits for 0xA0 and for code units >= 255, independently of UTF. */
1214
1215 #if PCRE2_CODE_UNIT_WIDTH != 8
1216 SET_BIT(0xA0);
1217 SET_BIT(0xFF);
1218 #else
1219 /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1220 units of horizontal space characters. */
1221
1222 #ifdef SUPPORT_UNICODE
1223 if (utf)
1224 {
1225 SET_BIT(0xC2); /* For U+00A0 */
1226 SET_BIT(0xE1); /* For U+1680, U+180E */
1227 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1228 SET_BIT(0xE3); /* For U+3000 */
1229 }
1230 else
1231 #endif
1232 /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1233 the code is EBCDIC. */
1234 {
1235 #ifndef EBCDIC
1236 SET_BIT(0xA0);
1237 #endif /* Not EBCDIC */
1238 }
1239 #endif /* 8-bit support */
1240
1241 try_next = FALSE;
1242 break;
1243
1244 case OP_ANYNL:
1245 case OP_VSPACE:
1246 SET_BIT(CHAR_LF);
1247 SET_BIT(CHAR_VT);
1248 SET_BIT(CHAR_FF);
1249 SET_BIT(CHAR_CR);
1250
1251 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1252 the bits for NEL and for code units >= 255, independently of UTF. */
1253
1254 #if PCRE2_CODE_UNIT_WIDTH != 8
1255 SET_BIT(CHAR_NEL);
1256 SET_BIT(0xFF);
1257 #else
1258 /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1259 units of vertical space characters. */
1260
1261 #ifdef SUPPORT_UNICODE
1262 if (utf)
1263 {
1264 SET_BIT(0xC2); /* For U+0085 (NEL) */
1265 SET_BIT(0xE2); /* For U+2028, U+2029 */
1266 }
1267 else
1268 #endif
1269 /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1270 {
1271 SET_BIT(CHAR_NEL);
1272 }
1273 #endif /* 8-bit support */
1274
1275 try_next = FALSE;
1276 break;
1277
1278 /* Single character types set the bits and stop. Note that if PCRE2_UCP
1279 is set, we do not see these opcodes because \d etc are converted to
1280 properties. Therefore, these apply in the case when only characters less
1281 than 256 are recognized to match the types. */
1282
1283 case OP_NOT_DIGIT:
1284 set_nottype_bits(re, cbit_digit, table_limit);
1285 try_next = FALSE;
1286 break;
1287
1288 case OP_DIGIT:
1289 set_type_bits(re, cbit_digit, table_limit);
1290 try_next = FALSE;
1291 break;
1292
1293 case OP_NOT_WHITESPACE:
1294 set_nottype_bits(re, cbit_space, table_limit);
1295 try_next = FALSE;
1296 break;
1297
1298 case OP_WHITESPACE:
1299 set_type_bits(re, cbit_space, table_limit);
1300 try_next = FALSE;
1301 break;
1302
1303 case OP_NOT_WORDCHAR:
1304 set_nottype_bits(re, cbit_word, table_limit);
1305 try_next = FALSE;
1306 break;
1307
1308 case OP_WORDCHAR:
1309 set_type_bits(re, cbit_word, table_limit);
1310 try_next = FALSE;
1311 break;
1312
1313 /* One or more character type fudges the pointer and restarts, knowing
1314 it will hit a single character type and stop there. */
1315
1316 case OP_TYPEPLUS:
1317 case OP_TYPEMINPLUS:
1318 case OP_TYPEPOSPLUS:
1319 tcode++;
1320 break;
1321
1322 case OP_TYPEEXACT:
1323 tcode += 1 + IMM2_SIZE;
1324 break;
1325
1326 /* Zero or more repeats of character types set the bits and then
1327 try again. */
1328
1329 case OP_TYPEUPTO:
1330 case OP_TYPEMINUPTO:
1331 case OP_TYPEPOSUPTO:
1332 tcode += IMM2_SIZE; /* Fall through */
1333
1334 case OP_TYPESTAR:
1335 case OP_TYPEMINSTAR:
1336 case OP_TYPEPOSSTAR:
1337 case OP_TYPEQUERY:
1338 case OP_TYPEMINQUERY:
1339 case OP_TYPEPOSQUERY:
1340 switch(tcode[1])
1341 {
1342 default:
1343 case OP_ANY:
1344 case OP_ALLANY:
1345 return SSB_FAIL;
1346
1347 case OP_HSPACE:
1348 SET_BIT(CHAR_HT);
1349 SET_BIT(CHAR_SPACE);
1350
1351 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1352 the bits for 0xA0 and for code units >= 255, independently of UTF. */
1353
1354 #if PCRE2_CODE_UNIT_WIDTH != 8
1355 SET_BIT(0xA0);
1356 SET_BIT(0xFF);
1357 #else
1358 /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1359 units of horizontal space characters. */
1360
1361 #ifdef SUPPORT_UNICODE
1362 if (utf)
1363 {
1364 SET_BIT(0xC2); /* For U+00A0 */
1365 SET_BIT(0xE1); /* For U+1680, U+180E */
1366 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1367 SET_BIT(0xE3); /* For U+3000 */
1368 }
1369 else
1370 #endif
1371 /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1372 the code is EBCDIC. */
1373 {
1374 #ifndef EBCDIC
1375 SET_BIT(0xA0);
1376 #endif /* Not EBCDIC */
1377 }
1378 #endif /* 8-bit support */
1379 break;
1380
1381 case OP_ANYNL:
1382 case OP_VSPACE:
1383 SET_BIT(CHAR_LF);
1384 SET_BIT(CHAR_VT);
1385 SET_BIT(CHAR_FF);
1386 SET_BIT(CHAR_CR);
1387
1388 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1389 the bits for NEL and for code units >= 255, independently of UTF. */
1390
1391 #if PCRE2_CODE_UNIT_WIDTH != 8
1392 SET_BIT(CHAR_NEL);
1393 SET_BIT(0xFF);
1394 #else
1395 /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1396 units of vertical space characters. */
1397
1398 #ifdef SUPPORT_UNICODE
1399 if (utf)
1400 {
1401 SET_BIT(0xC2); /* For U+0085 (NEL) */
1402 SET_BIT(0xE2); /* For U+2028, U+2029 */
1403 }
1404 else
1405 #endif
1406 /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1407 {
1408 SET_BIT(CHAR_NEL);
1409 }
1410 #endif /* 8-bit support */
1411 break;
1412
1413 case OP_NOT_DIGIT:
1414 set_nottype_bits(re, cbit_digit, table_limit);
1415 break;
1416
1417 case OP_DIGIT:
1418 set_type_bits(re, cbit_digit, table_limit);
1419 break;
1420
1421 case OP_NOT_WHITESPACE:
1422 set_nottype_bits(re, cbit_space, table_limit);
1423 break;
1424
1425 case OP_WHITESPACE:
1426 set_type_bits(re, cbit_space, table_limit);
1427 break;
1428
1429 case OP_NOT_WORDCHAR:
1430 set_nottype_bits(re, cbit_word, table_limit);
1431 break;
1432
1433 case OP_WORDCHAR:
1434 set_type_bits(re, cbit_word, table_limit);
1435 break;
1436 }
1437
1438 tcode += 2;
1439 break;
1440
1441 /* Extended class: if there are any property checks, or if this is a
1442 negative XCLASS without a map, give up. If there are no property checks,
1443 there must be wide characters on the XCLASS list, because otherwise an
1444 XCLASS would not have been created. This means that code points >= 255
1445 are always potential starters. */
1446
1447 #ifdef SUPPORT_WIDE_CHARS
1448 case OP_XCLASS:
1449 if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0 ||
1450 (tcode[1 + LINK_SIZE] & (XCL_MAP|XCL_NOT)) == XCL_NOT)
1451 return SSB_FAIL;
1452
1453 /* We have a positive XCLASS or a negative one without a map. Set up the
1454 map pointer if there is one, and fall through. */
1455
1456 classmap = ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0)? NULL :
1457 (uint8_t *)(tcode + 1 + LINK_SIZE + 1);
1458 #endif
1459 /* It seems that the fall through comment must be outside the #ifdef if
1460 it is to avoid the gcc compiler warning. */
1461
1462 /* Fall through */
1463
1464 /* Enter here for a negative non-XCLASS. In the 8-bit library, if we are
1465 in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter
1466 because it starts a character with a value > 255. In 8-bit non-UTF mode,
1467 there is no difference between CLASS and NCLASS. In all other wide
1468 character modes, set the 0xFF bit to indicate code units >= 255. */
1469
1470 case OP_NCLASS:
1471 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1472 if (utf)
1473 {
1474 re->start_bitmap[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
1475 memset(re->start_bitmap+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
1476 }
1477 #elif PCRE2_CODE_UNIT_WIDTH != 8
1478 SET_BIT(0xFF); /* For characters >= 255 */
1479 #endif
1480 /* Fall through */
1481
1482 /* Enter here for a positive non-XCLASS. If we have fallen through from
1483 an XCLASS, classmap will already be set; just advance the code pointer.
1484 Otherwise, set up classmap for a a non-XCLASS and advance past it. */
1485
1486 case OP_CLASS:
1487 if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else
1488 {
1489 classmap = (uint8_t *)(++tcode);
1490 tcode += 32 / sizeof(PCRE2_UCHAR);
1491 }
1492
1493 /* When wide characters are supported, classmap may be NULL. In UTF-8
1494 (sic) mode, the bits in a class bit map correspond to character values,
1495 not to byte values. However, the bit map we are constructing is for byte
1496 values. So we have to do a conversion for characters whose code point is
1497 greater than 127. In fact, there are only two possible starting bytes for
1498 characters in the range 128 - 255. */
1499
1500 if (classmap != NULL)
1501 {
1502 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1503 if (utf)
1504 {
1505 for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c];
1506 for (c = 128; c < 256; c++)
1507 {
1508 if ((classmap[c/8] & (1 << (c&7))) != 0)
1509 {
1510 int d = (c >> 6) | 0xc0; /* Set bit for this starter */
1511 re->start_bitmap[d/8] |= (1 << (d&7)); /* and then skip on to the */
1512 c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
1513 }
1514 }
1515 }
1516 else
1517 #endif
1518 /* In all modes except UTF-8, the two bit maps are compatible. */
1519
1520 {
1521 for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c];
1522 }
1523 }
1524
1525 /* Act on what follows the class. For a zero minimum repeat, continue;
1526 otherwise stop processing. */
1527
1528 switch (*tcode)
1529 {
1530 case OP_CRSTAR:
1531 case OP_CRMINSTAR:
1532 case OP_CRQUERY:
1533 case OP_CRMINQUERY:
1534 case OP_CRPOSSTAR:
1535 case OP_CRPOSQUERY:
1536 tcode++;
1537 break;
1538
1539 case OP_CRRANGE:
1540 case OP_CRMINRANGE:
1541 case OP_CRPOSRANGE:
1542 if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1543 else try_next = FALSE;
1544 break;
1545
1546 default:
1547 try_next = FALSE;
1548 break;
1549 }
1550 break; /* End of class handling case */
1551 } /* End of switch for opcodes */
1552 } /* End of try_next loop */
1553
1554 code += GET(code, 1); /* Advance to next branch */
1555 }
1556 while (*code == OP_ALT);
1557
1558 return yield;
1559 }
1560
1561
1562
1563 /*************************************************
1564 * Study a compiled expression *
1565 *************************************************/
1566
1567 /* This function is handed a compiled expression that it must study to produce
1568 information that will speed up the matching.
1569
1570 Argument: points to the compiled expression
1571 Returns: 0 normally; non-zero should never normally occur
1572 1 unknown opcode in set_start_bits
1573 2 missing capturing bracket
1574 3 unknown opcode in find_minlength
1575 */
1576
1577 int
PRIV(study)1578 PRIV(study)(pcre2_real_code *re)
1579 {
1580 int min;
1581 int count = 0;
1582 PCRE2_UCHAR *code;
1583 BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
1584
1585 /* Find start of compiled code */
1586
1587 code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
1588 re->name_entry_size * re->name_count;
1589
1590 /* For a pattern that has a first code unit, or a multiline pattern that
1591 matches only at "line start", there is no point in seeking a list of starting
1592 code units. */
1593
1594 if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
1595 {
1596 int rc = set_start_bits(re, code, utf);
1597 if (rc == SSB_UNKNOWN) return 1;
1598 if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET;
1599 }
1600
1601 /* Find the minimum length of subject string. If the pattern can match an empty
1602 string, the minimum length is already known. If there are more back references
1603 than the size of the vector we are going to cache them in, do nothing. A
1604 pattern that complicated will probably take a long time to analyze and may in
1605 any case turn out to be too complicated. Note that back reference minima are
1606 held as 16-bit numbers. */
1607
1608 if ((re->flags & PCRE2_MATCH_EMPTY) == 0 &&
1609 re->top_backref <= MAX_CACHE_BACKREF)
1610 {
1611 int backref_cache[MAX_CACHE_BACKREF+1];
1612 backref_cache[0] = 0; /* Highest one that is set */
1613 min = find_minlength(re, code, code, utf, NULL, &count, backref_cache);
1614 switch(min)
1615 {
1616 case -1: /* \C in UTF mode or (*ACCEPT) or over-complex regex */
1617 break; /* Leave minlength unchanged (will be zero) */
1618
1619 case -2:
1620 return 2; /* missing capturing bracket */
1621
1622 case -3:
1623 return 3; /* unrecognized opcode */
1624
1625 default:
1626 if (min > UINT16_MAX) min = UINT16_MAX;
1627 re->minlength = min;
1628 break;
1629 }
1630 }
1631
1632 return 0;
1633 }
1634
1635 /* End of pcre2_study.c */
1636